Message-ID: <1ab8030b-8d2f-4ebe-a280-6d0e4e1d17c7@linux.intel.com>
Date: Mon, 16 Jun 2025 14:12:04 +0800
From: Baolu Lu <baolu.lu@...ux.intel.com>
To: Nicolin Chen <nicolinc@...dia.com>, jgg@...dia.com, kevin.tian@...el.com,
corbet@....net, will@...nel.org
Cc: bagasdotme@...il.com, robin.murphy@....com, joro@...tes.org,
thierry.reding@...il.com, vdumpa@...dia.com, jonathanh@...dia.com,
shuah@...nel.org, jsnitsel@...hat.com, nathan@...nel.org,
peterz@...radead.org, yi.l.liu@...el.com, mshavit@...gle.com,
praan@...gle.com, zhangzekun11@...wei.com, iommu@...ts.linux.dev,
linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-arm-kernel@...ts.infradead.org, linux-tegra@...r.kernel.org,
linux-kselftest@...r.kernel.org, patches@...ts.linux.dev, mochs@...dia.com,
alok.a.tiwari@...cle.com, vasant.hegde@....com, dwmw2@...radead.org
Subject: Re: [PATCH v6 10/25] iommufd/viommu: Add IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl
On 6/14/25 15:14, Nicolin Chen wrote:
> Introduce a new IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl for user space to allocate
> a HW QUEUE object for a vIOMMU specific HW-accelerated queue, e.g.:
> - NVIDIA's Virtual Command Queue
> - AMD vIOMMU's Command Buffer, Event Log Buffers, and PPR Log Buffers
>
> Since this is introduced with NVIDIA's VCMDQs that access the guest memory
> in the physical address space, add an iommufd_hw_queue_alloc_phys() helper
> that will create an access object to the queue memory in the IOAS, to
> prevent the mappings of the guest memory from being unmapped during the
> life cycle of the HW queue object.
>
> Reviewed-by: Pranjal Shrivastava <praan@...gle.com>
> Reviewed-by: Kevin Tian <kevin.tian@...el.com>
> Signed-off-by: Nicolin Chen <nicolinc@...dia.com>
> ---
> drivers/iommu/iommufd/iommufd_private.h | 2 +
> include/linux/iommufd.h | 1 +
> include/uapi/linux/iommufd.h | 33 +++++
> drivers/iommu/iommufd/main.c | 6 +
> drivers/iommu/iommufd/viommu.c | 184 ++++++++++++++++++++++++
> 5 files changed, 226 insertions(+)
>
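
Just to confirm my understanding of the new uAPI: user space is expected to
invoke it roughly like the sketch below, right? (Only the input field names
are taken from this patch; the ioctl macro name, the .size usage and the
returned HW QUEUE object ID are my assumptions based on the existing
iommufd uAPI conventions.)

	struct iommu_hw_queue_alloc cmd = {
		.size = sizeof(cmd),
		.viommu_id = viommu_id,
		/* a driver-specific type; IOMMU_HW_QUEUE_TYPE_DEFAULT is rejected */
		.type = driver_hw_queue_type,
		/* base address of the queue in the nesting parent IOAS */
		.nesting_parent_iova = queue_base_iova,
		.length = queue_size,
	};

	if (ioctl(iommufd_fd, IOMMU_HW_QUEUE_ALLOC, &cmd))
		return -errno;
	/* on success, the ID of the new HW QUEUE object comes back in cmd */
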
[...]
> diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
> index 28ea5d026222..506479ece826 100644
> --- a/drivers/iommu/iommufd/viommu.c
> +++ b/drivers/iommu/iommufd/viommu.c
> @@ -201,3 +201,187 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
> iommufd_put_object(ucmd->ictx, &viommu->obj);
> return rc;
> }
> +
> +static void iommufd_hw_queue_destroy_access(struct iommufd_ctx *ictx,
> + struct iommufd_access *access,
> + u64 base_iova, size_t length)
> +{
> + iommufd_access_unpin_pages(access, base_iova, length);
> + iommufd_access_detach_internal(access);
> + iommufd_access_destroy_internal(ictx, access);
> +}
> +
> +void iommufd_hw_queue_destroy(struct iommufd_object *obj)
> +{
> + struct iommufd_hw_queue *hw_queue =
> + container_of(obj, struct iommufd_hw_queue, obj);
> + struct iommufd_viommu *viommu = hw_queue->viommu;
> +
> + if (hw_queue->destroy)
> + hw_queue->destroy(hw_queue);
> + if (hw_queue->access)
> + iommufd_hw_queue_destroy_access(viommu->ictx, hw_queue->access,
> + hw_queue->base_addr,
> + hw_queue->length);
> + refcount_dec(&viommu->obj.users);
> +}
> +
> +/*
> + * When the HW accesses the guest queue via physical addresses, the underlying
> + * physical pages of the guest queue must be contiguous. Also, for the security
> + * concern that IOMMUFD_CMD_IOAS_UNMAP could potentially remove the mappings of
> + * the guest queue from the nesting parent iopt while the HW is still accessing
> + * the guest queue memory physically, such a HW queue must require an access to
> + * pin the underlying pages and prevent that from happening.
> + */
> +static struct iommufd_access *
> +iommufd_hw_queue_alloc_phys(struct iommu_hw_queue_alloc *cmd,
> + struct iommufd_viommu *viommu, phys_addr_t *base_pa)
> +{
> + struct iommufd_access *access;
> + struct page **pages;
> + int max_npages, i;
> + u64 offset;
> + int rc;
> +
> + offset =
> + cmd->nesting_parent_iova - PAGE_ALIGN(cmd->nesting_parent_iova);
> + max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE);
> +
> + /*
> + * FIXME allocation may fail when sizeof(*pages) * max_npages is
> + * larger than PAGE_SIZE. This might need a new API returning a
> + * bio_vec or something more efficient.
> + */
> + pages = kcalloc(max_npages, sizeof(*pages), GFP_KERNEL);
> + if (!pages)
> + return ERR_PTR(-ENOMEM);
> +
> + access = iommufd_access_create_internal(viommu->ictx);
> + if (IS_ERR(access)) {
> + rc = PTR_ERR(access);
> + goto out_free;
> + }
> +
> + rc = iommufd_access_attach_internal(access, viommu->hwpt->ioas);
> + if (rc)
> + goto out_destroy;
> +
> + rc = iommufd_access_pin_pages(access, cmd->nesting_parent_iova,
> + cmd->length, pages, 0);
> + if (rc)
> + goto out_detach;
> +
> + /* Validate if the underlying physical pages are contiguous */
> + for (i = 1; i < max_npages; i++) {
> + if (page_to_pfn(pages[i]) == page_to_pfn(pages[i - 1]) + 1)
> + continue;
> + rc = -EFAULT;
> + goto out_unpin;
> + }
> +
> + *base_pa = page_to_pfn(pages[0]) << PAGE_SHIFT;
> + kfree(pages);
> + return access;
> +
> +out_unpin:
> + iommufd_access_unpin_pages(access, cmd->nesting_parent_iova,
> + cmd->length);
> +out_detach:
> + iommufd_access_detach_internal(access);
> +out_destroy:
> + iommufd_access_destroy_internal(viommu->ictx, access);
> +out_free:
> + kfree(pages);
> + return ERR_PTR(rc);
> +}
> +
> +int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd)
> +{
> + struct iommu_hw_queue_alloc *cmd = ucmd->cmd;
> + struct iommufd_hw_queue *hw_queue;
> + struct iommufd_viommu *viommu;
> + struct iommufd_access *access;
> + size_t hw_queue_size;
> + phys_addr_t base_pa;
> + u64 last;
> + int rc;
> +
> + if (cmd->flags || cmd->type == IOMMU_HW_QUEUE_TYPE_DEFAULT)
> + return -EOPNOTSUPP;
> + if (!cmd->length)
> + return -EINVAL;
> + if (check_add_overflow(cmd->nesting_parent_iova, cmd->length - 1,
> + &last))
> + return -EOVERFLOW;
> +
> + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
> + if (IS_ERR(viommu))
> + return PTR_ERR(viommu);
> +
> + if (!viommu->ops || !viommu->ops->get_hw_queue_size ||
> + !viommu->ops->hw_queue_init_phys) {
> + rc = -EOPNOTSUPP;
> + goto out_put_viommu;
> + }
> +
> + /*
> + * FIXME once ops->hw_queue_init is introduced, a WARN_ON_ONCE will be
> + * required, if hw_queue_init and hw_queue_init_phys both exist, since
> + * they should be mutually exclusive
> + */
> +
> + hw_queue_size = viommu->ops->get_hw_queue_size(viommu, cmd->type);
> + if (!hw_queue_size) {
> + rc = -EOPNOTSUPP;
> + goto out_put_viommu;
> + }
> +
> + /*
> + * It is a driver bug for providing a hw_queue_size smaller than the
> + * core HW queue structure size
> + */
> + if (WARN_ON_ONCE(hw_queue_size < sizeof(*hw_queue))) {
> + rc = -EOPNOTSUPP;
> + goto out_put_viommu;
> + }
> +
> + /*
> + * FIXME once ops->hw_queue_init is introduced, this should check "if
> + * ops->hw_queue_init_phys". And "access" should be initialized to NULL.
> + */
I just don't follow here. Up until now, only viommu->ops->hw_queue_init_phys
has been added, which means the current code only supports hardware queues
that access guest memory using physical addresses. The access object is not
needed for the other type of hardware queue that uses guest IOVA.

So, why not simply fail the allocation here if ops->hw_queue_init_phys is
not supported by the IOMMU driver, and leave the rest of the logic to the
patches that introduce ops->hw_queue_init? Something like the sketch below.
I guess that would make this patch more readable.
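
(A rough sketch of what I have in mind, reusing the names from this patch;
the ops check earlier in this function stays the only gatekeeper, and the
conditional path plus the NULL-initialized "access" can come together with
ops->hw_queue_init:)

	if (!viommu->ops || !viommu->ops->get_hw_queue_size ||
	    !viommu->ops->hw_queue_init_phys) {
		rc = -EOPNOTSUPP;
		goto out_put_viommu;
	}
	...
	/* hw_queue_init_phys is the only initialization path for now */
	access = iommufd_hw_queue_alloc_phys(cmd, viommu, &base_pa);
	if (IS_ERR(access)) {
		rc = PTR_ERR(access);
		goto out_put_viommu;
	}
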
> + access = iommufd_hw_queue_alloc_phys(cmd, viommu, &base_pa);
> + if (IS_ERR(access)) {
> + rc = PTR_ERR(access);
> + goto out_put_viommu;
> + }
Thanks,
baolu