[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6c36de14b00a3f06df3a602f18baf6b51fde429f.1760487869.git.nicolinc@nvidia.com>
Date: Tue, 14 Oct 2025 17:29:36 -0700
From: Nicolin Chen <nicolinc@...dia.com>
To: <jgg@...dia.com>, <kevin.tian@...el.com>
CC: <robin.murphy@....com>, <joro@...tes.org>, <will@...nel.org>,
<iommu@...ts.linux.dev>, <linux-kernel@...r.kernel.org>, <shuah@...nel.org>,
<linux-kselftest@...r.kernel.org>, <shyamsaini@...ux.microsoft.com>
Subject: [PATCH v2 4/7] iommufd: Add IOMMU_OPTION_SW_MSI_START/SIZE ioctls
For systems that require MSI pages to be mapped into the IOMMU translation,
the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default
recommended IOVA window to place these mappings. However, there is nothing
special about this address. Also, to support the RMR trick in the VMM for
nested translation, the VMM needs to know which sw_msi window the kernel is
using.
Moreover, there are cases where the default IOMMU_RESV_SW_MSI region cannot
be reserved, as some platforms reserve this address for other purposes:
https://lore.kernel.org/all/20250909154600.910110-1-shyamsaini@linux.microsoft.com/
Provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use
to directly specify its desired sw_msi window, which replaces and disables
the default IOMMU_RESV_SW_MSI from the driver, to avoid having to build an
API to discover the default IOMMU_RESV_SW_MSI.
Since iommufd now has its own sw_msi function, this is easy to implement.
Keep these two options per iommufd_device, so each device can set its own
desired MSI window. The VMM must set the values before attaching the device
to any HWPT/IOAS for them to take effect.
Suggested-by: Jason Gunthorpe <jgg@...dia.com>
Signed-off-by: Nicolin Chen <nicolinc@...dia.com>
---
drivers/iommu/iommufd/iommufd_private.h | 2 +
include/uapi/linux/iommufd.h | 21 ++++-
drivers/iommu/iommufd/io_pagetable.c | 15 +++-
drivers/iommu/iommufd/ioas.c | 113 ++++++++++++++++++++++++
drivers/iommu/iommufd/main.c | 4 +
5 files changed, 151 insertions(+), 4 deletions(-)
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index c458ab16736b6..1defd416813c8 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -346,6 +346,7 @@ int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd);
int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
+int iommufd_option_sw_msi(struct iommufd_ucmd *ucmd);
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx);
@@ -490,6 +491,7 @@ struct iommufd_device {
struct iommufd_vdevice *vdev;
bool destroying;
phys_addr_t sw_msi_start;
+ size_t sw_msi_size;
};
static inline struct iommufd_device *
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index c218c89e0e2eb..5e5277f77a97b 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -296,7 +296,9 @@ struct iommu_ioas_unmap {
/**
* enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
- * ioctl(IOMMU_OPTION_HUGE_PAGES)
+ * ioctl(IOMMU_OPTION_HUGE_PAGES) and
+ * ioctl(IOMMU_OPTION_SW_MSI_START) and
+ * ioctl(IOMMU_OPTION_SW_MSI_SIZE)
* @IOMMU_OPTION_RLIMIT_MODE:
* Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
* to invoke this. Value 0 (default) is user based accounting, 1 uses process
@@ -306,10 +308,27 @@ struct iommu_ioas_unmap {
* iommu mappings. Value 0 disables combining, everything is mapped to
* PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS
* option, the object_id must be the IOAS ID.
+ * @IOMMU_OPTION_SW_MSI_START:
+ * Change the base address of the IOMMU mapping region for MSI doorbell(s).
+ * This option being unset or @IOMMU_OPTION_SW_MSI_SIZE being value 0 tells
+ * the kernel to pick its default MSI doorbell window, ignoring these two
+ * options. Userspace must set this option before attaching the device to an
+ * IOAS/HWPT; otherwise the kernel returns an error (-EBUSY). The address must
+ * be 1MB aligned. This option is per-device; the object_id must be the
+ * device ID.
+ * @IOMMU_OPTION_SW_MSI_SIZE:
+ * Change the size (in MB) of the IOMMU mapping region for MSI doorbell(s).
+ * The minimum value is 1 MB. A value 0 (default) tells the kernel to ignore
+ * the base address value set to @IOMMU_OPTION_SW_MSI_START, and to pick its
+ * default MSI doorbell window. The same requirements apply to this option,
+ * so see @IOMMU_OPTION_SW_MSI_START for details. Userspace must set
+ * IOMMU_OPTION_SW_MSI_START before setting IOMMU_OPTION_SW_MSI_SIZE.
*/
enum iommufd_option {
IOMMU_OPTION_RLIMIT_MODE = 0,
IOMMU_OPTION_HUGE_PAGES = 1,
+ IOMMU_OPTION_SW_MSI_START = 2,
+ IOMMU_OPTION_SW_MSI_SIZE = 3,
};
/**
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index dee0aa3e7cb4a..7a1016d6dcfe0 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -1458,18 +1458,27 @@ int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
iommu_get_resv_regions(dev, &resv_regions);
list_for_each_entry(resv, &resv_regions, list) {
+ unsigned long start = PHYS_ADDR_MAX, last = 0;
+
if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
continue;
if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
num_hw_msi++;
if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
- *sw_msi_start = resv->start;
+ if (idev->sw_msi_size) {
+ start = *sw_msi_start;
+ last = idev->sw_msi_size - 1 + start;
+ }
num_sw_msi++;
}
- rc = iopt_reserve_iova(iopt, resv->start,
- resv->length - 1 + resv->start, dev);
+ if (start == PHYS_ADDR_MAX) {
+ start = resv->start;
+ last = resv->length - 1 + start;
+ }
+
+ rc = iopt_reserve_iova(iopt, start, last, dev);
if (rc)
goto out_reserved;
}
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 1542c5fd10a85..f2a4ab98f1665 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -620,6 +620,119 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
return -EOPNOTSUPP;
}
+static inline int iommufd_option_sw_msi_test(struct iommufd_device *idev,
+ phys_addr_t start, size_t size)
+{
+ const phys_addr_t alignment = SZ_1M - 1;
+ struct iommu_resv_region *resv;
+ LIST_HEAD(resv_regions);
+ phys_addr_t last;
+ int rc = 0;
+
+ if (start & alignment || size & alignment)
+ return -EINVAL;
+
+ size = max_t(size_t, size, SZ_1M);
+
+ if (check_add_overflow(start, size - 1, &last))
+ return -EOVERFLOW;
+
+ /* Test if the new sw_msi range overlaps with other reserved regions */
+ iommu_get_resv_regions(idev->dev, &resv_regions);
+ list_for_each_entry(resv, &resv_regions, list) {
+ phys_addr_t resv_last = resv->length - 1 + resv->start;
+
+ /* start/size replaces the driver-defined IOMMU_RESV_SW_MSI */
+ if (resv->type == IOMMU_RESV_SW_MSI)
+ continue;
+ /* IOMMU_RESV_DIRECT_RELAXABLE does not get enforced to iopt */
+ if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
+ continue;
+
+ if (resv->start <= last && resv_last >= start) {
+ rc = -EADDRINUSE;
+ break;
+ }
+ }
+ iommu_put_resv_regions(idev->dev, &resv_regions);
+ return rc;
+}
+
+int iommufd_option_sw_msi(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_option *cmd = ucmd->cmd;
+ struct iommufd_device *idev;
+ int rc = 0;
+
+ idev = iommufd_get_device(ucmd, cmd->object_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+
+ mutex_lock(&idev->igroup->lock);
+
+ /* Device cannot enforce the sw_msi window if already attached */
+ if (iommufd_device_is_attached(idev, IOMMU_NO_PASID)) {
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+
+ if (cmd->op == IOMMU_OPTION_OP_GET) {
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_SW_MSI_START:
+ cmd->val64 = (u64)idev->sw_msi_start;
+ break;
+ case IOMMU_OPTION_SW_MSI_SIZE:
+ cmd->val64 = (u64)idev->sw_msi_size / SZ_1M;
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ break;
+ }
+ }
+
+ if (cmd->op == IOMMU_OPTION_OP_SET) {
+ phys_addr_t start = idev->sw_msi_start;
+ size_t size = idev->sw_msi_size;
+
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_SW_MSI_START:
+ if (cmd->val64 > PHYS_ADDR_MAX) {
+ rc = -EINVAL;
+ break;
+ }
+ start = (phys_addr_t)cmd->val64;
+ rc = iommufd_option_sw_msi_test(idev, start, size);
+ if (rc)
+ break;
+ idev->sw_msi_start = start;
+ break;
+ case IOMMU_OPTION_SW_MSI_SIZE:
+ /* The input unit is MB */
+ if (cmd->val64 > SIZE_MAX >> 20) {
+ rc = -EINVAL;
+ break;
+ }
+ size = (size_t)cmd->val64 * SZ_1M;
+ if (size) {
+ rc = iommufd_option_sw_msi_test(idev, start,
+ size);
+ if (rc)
+ break;
+ }
+ idev->sw_msi_size = size;
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ break;
+ }
+ }
+
+out_unlock:
+ mutex_unlock(&idev->igroup->lock);
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+ return rc;
+}
+
static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
struct iommufd_ioas *ioas)
{
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index ce775fbbae94e..9a8ab58d694d4 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -398,6 +398,10 @@ static int iommufd_option(struct iommufd_ucmd *ucmd)
case IOMMU_OPTION_RLIMIT_MODE:
rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
break;
+ case IOMMU_OPTION_SW_MSI_START:
+ case IOMMU_OPTION_SW_MSI_SIZE:
+ rc = iommufd_option_sw_msi(ucmd);
+ break;
case IOMMU_OPTION_HUGE_PAGES:
rc = iommufd_ioas_option(ucmd);
break;
--
2.43.0
Powered by blists - more mailing lists