[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <0521187e-c511-4ab1-9ffa-be2be8eacd04@redhat.com>
Date: Wed, 29 Jan 2025 14:44:12 +0100
From: Eric Auger <eric.auger@...hat.com>
To: Nicolin Chen <nicolinc@...dia.com>, will@...nel.org,
robin.murphy@....com, jgg@...dia.com, kevin.tian@...el.com,
tglx@...utronix.de, maz@...nel.org, alex.williamson@...hat.com
Cc: joro@...tes.org, shuah@...nel.org, reinette.chatre@...el.com,
yebin10@...wei.com, apatel@...tanamicro.com,
shivamurthy.shastri@...utronix.de, bhelgaas@...gle.com,
anna-maria@...utronix.de, yury.norov@...il.com, nipun.gupta@....com,
iommu@...ts.linux.dev, linux-kernel@...r.kernel.org,
linux-arm-kernel@...ts.infradead.org, kvm@...r.kernel.org,
linux-kselftest@...r.kernel.org, patches@...ts.linux.dev,
jean-philippe@...aro.org, mdf@...nel.org, mshavit@...gle.com,
shameerali.kolothum.thodi@...wei.com, smostafa@...gle.com, ddutile@...hat.com
Subject: Re: [PATCH RFCv2 09/13] iommufd: Add IOMMU_OPTION_SW_MSI_START/SIZE
ioctls
Hi,
On 1/11/25 4:32 AM, Nicolin Chen wrote:
> For systems that require MSI pages to be mapped into the IOMMU translation
> the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default
> recommended IOVA window to place these mappings. However, there is nothing
> special about this address. And to support the RMR trick in VMM for nested
Well, at least it must not overlap the VMM's RAM, so it was not random either.
> translation, the VMM needs to know what sw_msi window the kernel is using.
> As there is no particular reason to force VMM to adopt the kernel default,
> provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use
> to directly specify the sw_msi window that it wants to use, which replaces
> and disables the default IOMMU_RESV_SW_MSI from the driver to avoid having
> to build an API to discover the default IOMMU_RESV_SW_MSI.
IIUC, the MSI window will then differ between legacy VFIO assignment
and the iommufd backend.
MSI reserved regions are exposed in
/sys/kernel/iommu_groups/<n>/reserved_regions
0x0000000008000000 0x00000000080fffff msi
Is that configurability reflected accordingly?
How do you make sure it does not collide with other reserved regions? I
don't see any such check here.
>
> Since iommufd now has its own sw_msi function, this is easy to implement.
>
> To keep things simple, the parameters are global to the entire iommufd FD,
> and will directly replace the IOMMU_RESV_SW_MSI values. The VMM must set
> the values before creating any hwpt's to have any effect.
>
> Suggested-by: Jason Gunthorpe <jgg@...dia.com>
> Signed-off-by: Nicolin Chen <nicolinc@...dia.com>
> ---
> drivers/iommu/iommufd/iommufd_private.h | 4 +++
> include/uapi/linux/iommufd.h | 18 ++++++++++++-
> drivers/iommu/iommufd/device.c | 4 +++
> drivers/iommu/iommufd/io_pagetable.c | 4 ++-
> drivers/iommu/iommufd/ioas.c | 34 +++++++++++++++++++++++++
> drivers/iommu/iommufd/main.c | 6 +++++
> 6 files changed, 68 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 3e83bbb5912c..9f071609f00b 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -45,6 +45,9 @@ struct iommufd_ctx {
> struct mutex sw_msi_lock;
> struct list_head sw_msi_list;
> unsigned int sw_msi_id;
> + /* User-programmed SW_MSI region, to override igroup->sw_msi_start */
> + phys_addr_t sw_msi_start;
> + size_t sw_msi_size;
>
> u8 account_mode;
> /* Compatibility with VFIO no iommu */
> @@ -281,6 +284,7 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
> int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
> int iommufd_option_rlimit_mode(struct iommu_option *cmd,
> struct iommufd_ctx *ictx);
> +int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx);
>
> int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
> int iommufd_check_iova_range(struct io_pagetable *iopt,
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 34810f6ae2b5..c864a201e502 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -294,7 +294,9 @@ struct iommu_ioas_unmap {
>
> /**
> * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
> - * ioctl(IOMMU_OPTION_HUGE_PAGES)
> + * ioctl(IOMMU_OPTION_HUGE_PAGES) and
> + * ioctl(IOMMU_OPTION_SW_MSI_START) and
> + * ioctl(IOMMU_OPTION_SW_MSI_SIZE)
> * @IOMMU_OPTION_RLIMIT_MODE:
> * Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
> * to invoke this. Value 0 (default) is user based accounting, 1 uses process
> @@ -304,10 +306,24 @@ struct iommu_ioas_unmap {
> * iommu mappings. Value 0 disables combining, everything is mapped to
> * PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS
> * option, the object_id must be the IOAS ID.
> + * @IOMMU_OPTION_SW_MSI_START:
> + * Change the base address of the IOMMU mapping region for MSI doorbell(s).
> * It must be set before attaching a device to an IOAS/HWPT, otherwise
> * this option will not be effective on that IOAS/HWPT. User can choose to
> + * let kernel pick a base address, by simply ignoring this option or setting
> + * a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0
I think we should also document that it cannot be placed at an arbitrary address.
> + * @IOMMU_OPTION_SW_MSI_SIZE:
> * Change the size of the IOMMU mapping region for MSI doorbell(s). It must
> * be set before attaching a device to an IOAS/HWPT, otherwise it won't
> + * be effective on that IOAS/HWPT. The value is in MB, and the minimum value
> + * is 1 MB. A value 0 (default) will invalidate the MSI doorbell base address
> + * value set to IOMMU_OPTION_SW_MSI_START. Global option, object_id must be 0
> */
> enum iommufd_option {
> IOMMU_OPTION_RLIMIT_MODE = 0,
> IOMMU_OPTION_HUGE_PAGES = 1,
> + IOMMU_OPTION_SW_MSI_START = 2,
> + IOMMU_OPTION_SW_MSI_SIZE = 3,
> };
>
> /**
> diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> index f75b3c23cd41..093a3bd798db 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -445,10 +445,14 @@ static int
> iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
> struct iommufd_hwpt_paging *hwpt_paging)
> {
> + struct iommufd_ctx *ictx = idev->ictx;
> int rc;
>
> lockdep_assert_held(&idev->igroup->lock);
>
> + /* Override it with a user-programmed SW_MSI region */
> + if (ictx->sw_msi_size && ictx->sw_msi_start != PHYS_ADDR_MAX)
> + idev->igroup->sw_msi_start = ictx->sw_msi_start;
> rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
> idev->dev,
> &idev->igroup->sw_msi_start);
> diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
> index 8a790e597e12..5d7f5ca1eecf 100644
> --- a/drivers/iommu/iommufd/io_pagetable.c
> +++ b/drivers/iommu/iommufd/io_pagetable.c
> @@ -1446,7 +1446,9 @@ int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
> if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
> num_hw_msi++;
> if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
> - *sw_msi_start = resv->start;
> + /* Bypass the driver-defined SW_MSI region, if preset */
> + if (*sw_msi_start == PHYS_ADDR_MAX)
> + *sw_msi_start = resv->start;
> num_sw_msi++;
> }
>
> diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
> index 1542c5fd10a8..3f4e25b660f9 100644
> --- a/drivers/iommu/iommufd/ioas.c
> +++ b/drivers/iommu/iommufd/ioas.c
> @@ -620,6 +620,40 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
> return -EOPNOTSUPP;
> }
>
> +int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx)
> +{
> + if (cmd->object_id)
> + return -EOPNOTSUPP;
> +
> + if (cmd->op == IOMMU_OPTION_OP_GET) {
> + switch (cmd->option_id) {
> + case IOMMU_OPTION_SW_MSI_START:
> + cmd->val64 = (u64)ictx->sw_msi_start;
> + break;
> + case IOMMU_OPTION_SW_MSI_SIZE:
> + cmd->val64 = (u64)ictx->sw_msi_size;
> + break;
> + default:
> + return -EOPNOTSUPP;
> + }
> + return 0;
> + }
> + if (cmd->op == IOMMU_OPTION_OP_SET) {
> + switch (cmd->option_id) {
> + case IOMMU_OPTION_SW_MSI_START:
> + ictx->sw_msi_start = (phys_addr_t)cmd->val64;
> + break;
> + case IOMMU_OPTION_SW_MSI_SIZE:
> + ictx->sw_msi_size = (size_t)cmd->val64;
> + break;
> + default:
> + return -EOPNOTSUPP;
> + }
> + return 0;
> + }
> + return -EOPNOTSUPP;
> +}
> +
> static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
> struct iommufd_ioas *ioas)
> {
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 7cc9497b7193..026297265c71 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -229,6 +229,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
> init_waitqueue_head(&ictx->destroy_wait);
> mutex_init(&ictx->sw_msi_lock);
> INIT_LIST_HEAD(&ictx->sw_msi_list);
> + ictx->sw_msi_start = PHYS_ADDR_MAX;
> + ictx->sw_msi_size = 0;
> filp->private_data = ictx;
> return 0;
> }
> @@ -287,6 +289,10 @@ static int iommufd_option(struct iommufd_ucmd *ucmd)
> case IOMMU_OPTION_RLIMIT_MODE:
> rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
> break;
> + case IOMMU_OPTION_SW_MSI_START:
> + case IOMMU_OPTION_SW_MSI_SIZE:
> + rc = iommufd_option_sw_msi(cmd, ucmd->ictx);
> + break;
> case IOMMU_OPTION_HUGE_PAGES:
> rc = iommufd_ioas_option(ucmd);
> break;
Eric
Powered by blists - more mailing lists