[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250924141018.80202-4-mngyadam@amazon.de>
Date: Wed, 24 Sep 2025 16:09:54 +0200
From: Mahmoud Adam <mngyadam@...zon.de>
To: <kvm@...r.kernel.org>
CC: <alex.williamson@...hat.com>, <jgg@...pe.ca>, <kbusch@...nel.org>,
<benh@...nel.crashing.org>, David Woodhouse <dwmw@...zon.co.uk>,
<pravkmr@...zon.de>, <nagy@...aternagy.com>, <linux-kernel@...r.kernel.org>
Subject: [RFC PATCH 3/7] vfio/pci: add RCU locking for regions access
Since we could request to add more regions after initialization. We
would need locking to avoid racing with readers and cause UAF. use RCU
for read-write synchronization. And region_lock mutex is used to
synchronize the write section.
Changing the value of num_regions is done under the mutex. Since the
num_regions can only increase, using READ_ONCE and WRITE_ONCE should
be enough to make sure we have a valid value. On the write section,
synchronize_rcu() is run before incrementing num_regions. Doing that
makes sure read sections are passed before increasing num_regions to
avoid causing out-of-bound access.
Signed-off-by: Mahmoud Adam <mngyadam@...zon.de>
---
drivers/vfio/pci/vfio_pci_core.c | 59 +++++++++++++++++++++++---------
drivers/vfio/pci/vfio_pci_igd.c | 16 ++++++---
include/linux/vfio_pci_core.h | 1 +
3 files changed, 55 insertions(+), 21 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 6629490c0e46f..78e18bfd973e5 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -882,7 +882,8 @@ static int msix_mmappable_cap(struct vfio_pci_core_device *vdev,
}
/*
- * Registers a new region to vfio_pci_core_device.
+ * Registers a new region to vfio_pci_core_device. region_lock should
+ * be held when multiple registers could happen.
* Returns region index on success or a negative errno.
*/
int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
@@ -890,15 +891,20 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
const struct vfio_pci_regops *ops,
size_t size, u32 flags, void *data)
{
- int num_regions = vdev->num_regions;
struct vfio_pci_region *region, *old_region;
+ int num_regions;
+
+ mutex_lock(&vdev->region_lock);
+ num_regions = READ_ONCE(vdev->num_regions);
region = kmalloc((num_regions + 1) * sizeof(*region),
GFP_KERNEL_ACCOUNT);
if (!region)
return -ENOMEM;
- old_region = vdev->region;
+ old_region =
+ rcu_dereference_protected(vdev->region,
+ lockdep_is_held(&vdev->region_lock));
if (old_region)
memcpy(region, old_region, num_regions * sizeof(*region));
@@ -909,8 +915,10 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
region[num_regions].flags = flags;
region[num_regions].data = data;
- vdev->region = region;
- vdev->num_regions++;
+ rcu_assign_pointer(vdev->region, region);
+ synchronize_rcu();
+ WRITE_ONCE(vdev->num_regions, READ_ONCE(vdev->num_regions) + 1);
+ mutex_unlock(&vdev->region_lock);
kfree(old_region);
return num_regions;
}
@@ -968,7 +976,7 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;
- info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
+ info.num_regions = VFIO_PCI_NUM_REGIONS + READ_ONCE(vdev->num_regions);
info.num_irqs = VFIO_PCI_NUM_IRQS;
ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
@@ -1094,13 +1102,16 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
.header.version = 1
};
- if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
+ if (info.index >= VFIO_PCI_NUM_REGIONS +
+ READ_ONCE(vdev->num_regions))
return -EINVAL;
- info.index = array_index_nospec(
- info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);
+ info.index = array_index_nospec(info.index,
+ VFIO_PCI_NUM_REGIONS +
+ READ_ONCE(vdev->num_regions));
i = info.index - VFIO_PCI_NUM_REGIONS;
- region = &vdev->region[i];
+ rcu_read_lock();
+ region = &rcu_dereference(vdev->region)[i];
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
info.size = region->size;
@@ -1111,15 +1122,20 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
ret = vfio_info_add_capability(&caps, &cap_type.header,
sizeof(cap_type));
- if (ret)
+ if (ret) {
+ rcu_read_unlock();
return ret;
+ }
if (region->ops->add_capability) {
ret = region->ops->add_capability(
vdev, region, &caps);
- if (ret)
+ if (ret) {
+ rcu_read_unlock();
return ret;
+ }
}
+ rcu_read_unlock();
}
}
@@ -1536,7 +1552,7 @@ static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
int ret;
- if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
+ if (index >= VFIO_PCI_NUM_REGIONS + READ_ONCE(vdev->num_regions))
return -EINVAL;
ret = pm_runtime_resume_and_get(&vdev->pdev->dev);
@@ -1568,8 +1584,11 @@ static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
default:
index -= VFIO_PCI_NUM_REGIONS;
- ret = vdev->region[index].ops->rw(vdev, buf,
- count, ppos, iswrite);
+ rcu_read_lock();
+ ret = rcu_dereference(vdev->region)[index].ops->rw(vdev, buf,
+ count, ppos,
+ iswrite);
+ rcu_read_unlock();
break;
}
@@ -1726,7 +1745,7 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
- if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
+ if (index >= VFIO_PCI_NUM_REGIONS + READ_ONCE(vdev->num_regions))
return -EINVAL;
if (vma->vm_end < vma->vm_start)
return -EINVAL;
@@ -1734,12 +1753,16 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
return -EINVAL;
if (index >= VFIO_PCI_NUM_REGIONS) {
int regnum = index - VFIO_PCI_NUM_REGIONS;
- struct vfio_pci_region *region = vdev->region + regnum;
+ struct vfio_pci_region *region;
+
+ rcu_read_lock();
+ region = rcu_dereference(vdev->region) + regnum;
ret = -EINVAL;
if (region->ops && region->ops->mmap &&
(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
ret = region->ops->mmap(vdev, region, vma);
+ rcu_read_unlock();
return ret;
}
if (index >= VFIO_PCI_ROM_REGION_INDEX)
@@ -2107,6 +2130,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
INIT_LIST_HEAD(&vdev->sriov_pfs_item);
init_rwsem(&vdev->memory_lock);
xa_init(&vdev->ctx);
+ mutex_init(&vdev->region_lock);
return 0;
}
@@ -2119,6 +2143,7 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
mutex_destroy(&vdev->igate);
mutex_destroy(&vdev->ioeventfds_lock);
+ mutex_destroy(&vdev->region_lock);
kfree(vdev->region);
kfree(vdev->pm_save);
}
diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c
index 93ddef48e4e4c..1f7e9e82ac08c 100644
--- a/drivers/vfio/pci/vfio_pci_igd.c
+++ b/drivers/vfio/pci/vfio_pci_igd.c
@@ -71,13 +71,17 @@ static ssize_t vfio_pci_igd_rw(struct vfio_pci_core_device *vdev,
struct vfio_pci_region *region;
struct igd_opregion_vbt *opregionvbt;
- region = &vdev->region[i];
+ rcu_read_lock();
+ region = &rcu_dereference(vdev->region)[i];
opregionvbt = region->data;
- if (pos >= region->size || iswrite)
+ if (pos >= region->size || iswrite) {
+ rcu_read_unlock();
return -EINVAL;
+ }
count = min_t(size_t, count, region->size - pos);
+ rcu_read_unlock();
remaining = count;
/* Copy until OpRegion version */
@@ -293,13 +297,17 @@ static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev,
struct vfio_pci_region *region;
struct pci_dev *pdev;
- region = &vdev->region[i];
+ rcu_read_lock();
+ region = &rcu_dereference(vdev->region)[i];
pdev = region->data;
- if (pos >= region->size || iswrite)
+ if (pos >= region->size || iswrite) {
+ rcu_read_unlock();
return -EINVAL;
+ }
size = count = min(count, (size_t)(region->size - pos));
+ rcu_read_unlock();
if ((pos & 1) && size) {
u8 val;
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index f541044e42a2a..e106e58f297e9 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -63,6 +63,7 @@ struct vfio_pci_core_device {
int irq_type;
int num_regions;
struct vfio_pci_region *region;
+ struct mutex region_lock;
u8 msi_qmax;
u8 msix_bar;
u16 msix_size;
--
2.47.3
Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christian Schlaeger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597
Powered by blists - more mailing lists