[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <273fdd58-549c-30d4-39a9-85fe631162ba@linux.ibm.com>
Date: Fri, 2 Sep 2022 13:11:09 -0400
From: Matthew Rosato <mjrosato@...ux.ibm.com>
To: Jason Gunthorpe <jgg@...dia.com>
Cc: Robin Murphy <robin.murphy@....com>, iommu@...ts.linux.dev,
Alex Williamson <alex.williamson@...hat.com>,
linux-s390@...r.kernel.org, schnelle@...ux.ibm.com,
pmorel@...ux.ibm.com, borntraeger@...ux.ibm.com, hca@...ux.ibm.com,
gor@...ux.ibm.com, gerald.schaefer@...ux.ibm.com,
agordeev@...ux.ibm.com, svens@...ux.ibm.com, joro@...tes.org,
will@...nel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops
On 9/1/22 4:37 PM, Jason Gunthorpe wrote:
> On Thu, Sep 01, 2022 at 12:14:24PM -0400, Matthew Rosato wrote:
>> On 9/1/22 6:25 AM, Robin Murphy wrote:
>>> On 2022-08-31 21:12, Matthew Rosato wrote:
>>>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
>>>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU
>>>> domains and the DMA API handling. However, this commit does not
>>>> sufficiently handle the case where the device is released via a call
>>>> to the release_device op as it may occur at the same time as an opposing
>>>> attach_dev or detach_dev since the group mutex is not held over
>>>> release_device. This was observed if the device is deconfigured during a
>>>> small window during vfio-pci initialization and can result in WARNs and
>>>> potential kernel panics.
>>>
>>> Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all?
>>>
>>> Robin.
>>
>> So, I generally have seen the issue manifest as one of the calls
>> into the iommu core from __vfio_group_unset_container
>> (e.g. iommu_deatch_group via vfio_type1_iommu) failing with a WARN.
>> This happens when the vfio group fd is released, which could be
>> coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER.
>> AFAICT there's nothing serializing the notion of calling into the
>> iommu core here against a device that is simultaneously going
>> through release_device (because we don't enter release_device with
>> the group mutex held), resulting in unpredictable behavior between
>> the dueling attach_dev/detach_dev and the release_device for
>> s390-iommu at least.
>
> Oh, this is a vfio bug.
I've been running with your diff applied today on s390 and this indeed fixes the issue by preventing the detach-after-release coming out of vfio.
Can you send as a patch for review?
>
> dev->iommu_group is only a valid pointer as long as a driver is attach
> to the device.
>
> vfio copies the dev->iommu_group into struct vfio_group during probe()
> but then lets vfio_group live independently. Particularly the driver
> can be removed()'d and the vfio_group keeps on going.
>
> Thus it is possible to UAF the iommu_group by referencing it through
> the vfio_group.
>
> We must wait during remove for all the vfio_groups to stop
> referencing iommu_group.
>
> Something like this or so:
>
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index eb714a484662fc..d8f40b22c98ddb 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -65,7 +65,15 @@ struct vfio_container {
> struct vfio_group {
> struct device dev;
> struct cdev cdev;
> + /*
> + * When drivers is non-zero a driver is attached to the struct device
> + * that provided the iommu_group and thus the iommu_group is a valid
> + * pointer. When drivers is 0 the driver is being detached. Once users
> + * reaches 0 then the iommu_group is invalid.
> + */
> + refcount_t drivers;
> refcount_t users;
> + struct completion comp;
> unsigned int container_users;
> struct iommu_group *iommu_group;
> struct vfio_container *container;
> @@ -276,8 +284,6 @@ void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
> }
> EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
>
> -static void vfio_group_get(struct vfio_group *group);
> -
> /*
> * Container objects - containers are created when /dev/vfio/vfio is
> * opened, but their lifecycle extends until the last user is done, so
> @@ -305,16 +311,21 @@ static void vfio_container_put(struct vfio_container *container)
> /*
> * Group objects - create, release, get, put, search
> */
> +
> + /*
> + * This returns a driver reference. It can only be used in the probe function
> + * of a device_driver, eg as part of the internal implementation of
> + * __vfio_register_dev().
> + */
> static struct vfio_group *
> __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
> {
> struct vfio_group *group;
>
> list_for_each_entry(group, &vfio.group_list, vfio_next) {
> - if (group->iommu_group == iommu_group) {
> - vfio_group_get(group);
> + if (group->iommu_group == iommu_group &&
> + refcount_inc_not_zero(&group->drivers))
> return group;
> - }
> }
> return NULL;
> }
> @@ -364,6 +375,8 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
> group->cdev.owner = THIS_MODULE;
>
> refcount_set(&group->users, 1);
> + refcount_set(&group->drivers, 1);
> + init_completion(&group->comp);
> init_rwsem(&group->group_rwsem);
> INIT_LIST_HEAD(&group->device_list);
> mutex_init(&group->device_lock);
> @@ -422,8 +435,28 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
>
> static void vfio_group_put(struct vfio_group *group)
> {
> - if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
> - return;
> + if (refcount_dec_and_test(&group->users))
> + complete(&group->comp);
> +}
> +
> +/*
> + * When the drivers count reaches 0 then the group must be destroyed
> + * immediately. A zero driver group is a zombie awaiting destruction.
> + */
> +static void vfio_group_remove(struct vfio_group *group)
> +{
> + /* Matches the get from vfio_group_alloc() */
> + vfio_group_put(group);
> +
> + cdev_device_del(&group->cdev, &group->dev);
> +
> + /*
> + * Before we allow the last driver in the group to be unplugged the
> + * group must be sanitized so nothing else is or can reference it. This
> + * is because the group->iommu_group pointer is only valid so long as a
> + * VFIO device driver is attached to a device belonging to the group.
> + */
> + wait_for_completion(&group->comp);
>
> /*
> * These data structures all have paired operations that can only be
> @@ -434,19 +467,15 @@ static void vfio_group_put(struct vfio_group *group)
> WARN_ON(!list_empty(&group->device_list));
> WARN_ON(group->container || group->container_users);
> WARN_ON(group->notifier.head);
> + group->iommu_group = NULL;
>
> + mutex_lock(&vfio.group_lock);
> list_del(&group->vfio_next);
> - cdev_device_del(&group->cdev, &group->dev);
> mutex_unlock(&vfio.group_lock);
>
> put_device(&group->dev);
> }
>
> -static void vfio_group_get(struct vfio_group *group)
> -{
> - refcount_inc(&group->users);
> -}
> -
> /*
> * Device objects - create, release, get, put, search
> */
> @@ -462,22 +491,6 @@ static bool vfio_device_try_get(struct vfio_device *device)
> return refcount_inc_not_zero(&device->refcount);
> }
>
> -static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
> - struct device *dev)
> -{
> - struct vfio_device *device;
> -
> - mutex_lock(&group->device_lock);
> - list_for_each_entry(device, &group->device_list, group_next) {
> - if (device->dev == dev && vfio_device_try_get(device)) {
> - mutex_unlock(&group->device_lock);
> - return device;
> - }
> - }
> - mutex_unlock(&group->device_lock);
> - return NULL;
> -}
> -
> /*
> * VFIO driver API
> */
> @@ -576,8 +589,10 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
> static int __vfio_register_dev(struct vfio_device *device,
> struct vfio_group *group)
> {
> - struct vfio_device *existing_device;
> -
> + /*
> + * In all cases group is the output of one of the group allocation functions
> + * and we have group->drivers incremetned for us
> + */
> if (IS_ERR(group))
> return PTR_ERR(group);
>
> @@ -588,18 +603,6 @@ static int __vfio_register_dev(struct vfio_device *device,
> if (!device->dev_set)
> vfio_assign_device_set(device, device);
>
> - existing_device = vfio_group_get_device(group, device->dev);
> - if (existing_device) {
> - dev_WARN(device->dev, "Device already exists on group %d\n",
> - iommu_group_id(group->iommu_group));
> - vfio_device_put(existing_device);
> - if (group->type == VFIO_NO_IOMMU ||
> - group->type == VFIO_EMULATED_IOMMU)
> - iommu_group_remove_device(device->dev);
> - vfio_group_put(group);
> - return -EBUSY;
> - }
> -
> /* Our reference on group is moved to the device */
> device->group = group;
>
> @@ -702,8 +705,9 @@ void vfio_unregister_group_dev(struct vfio_device *device)
> if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
> iommu_group_remove_device(device->dev);
>
> - /* Matches the get in vfio_register_group_dev() */
> - vfio_group_put(group);
> + /* Matches the alloc get in vfio_register_group_dev() */
> + if (refcount_dec_and_test(&group->drivers))
> + vfio_group_remove(group);
> }
> EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
>
Powered by blists - more mailing lists