[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251030143836.66cdf116@shazbot.org>
Date: Thu, 30 Oct 2025 14:38:36 -0600
From: Alex Williamson <alex@...zbot.org>
To: Leon Romanovsky <leon@...nel.org>
Cc: Alex Williamson <alex.williamson@...hat.com>, Leon Romanovsky
 <leonro@...dia.com>, Jason Gunthorpe <jgg@...dia.com>, Andrew Morton
 <akpm@...ux-foundation.org>, Bjorn Helgaas <bhelgaas@...gle.com>, Christian
 König <christian.koenig@....com>,
 dri-devel@...ts.freedesktop.org, iommu@...ts.linux.dev, Jens Axboe
 <axboe@...nel.dk>, Joerg Roedel <joro@...tes.org>, kvm@...r.kernel.org,
 linaro-mm-sig@...ts.linaro.org, linux-block@...r.kernel.org,
 linux-kernel@...r.kernel.org, linux-media@...r.kernel.org,
 linux-mm@...ck.org, linux-pci@...r.kernel.org, Logan Gunthorpe
 <logang@...tatee.com>, Marek Szyprowski <m.szyprowski@...sung.com>, Robin
 Murphy <robin.murphy@....com>, Sumit Semwal <sumit.semwal@...aro.org>,
 Vivek Kasireddy <vivek.kasireddy@...el.com>, Will Deacon <will@...nel.org>
Subject: Re: [PATCH v5 9/9] vfio/pci: Add dma-buf export support for MMIO
 regions
On Mon, 13 Oct 2025 18:26:11 +0300
Leon Romanovsky <leon@...nel.org> wrote:
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index fe247d0e2831..56b1320238a9 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1511,6 +1520,19 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  		return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
>  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>  		return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> +	case VFIO_DEVICE_FEATURE_DMA_BUF:
> +		if (device->ops->ioctl != vfio_pci_core_ioctl)
> +			/*
> +			 * Devices that overwrite general .ioctl() callback
> +			 * usually do it to implement their own
> +			 * VFIO_DEVICE_GET_REGION_INFO handlerm and they present
Typo, "handlerm"
> +			 * different BAR information from the real PCI.
> +			 *
> +			 * DMABUF relies on real PCI information.
> +			 */
> +			return -EOPNOTSUPP;
> +
> +		return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
>  	default:
>  		return -ENOTTY;
>  	}
...
> @@ -2459,6 +2482,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>  			break;
>  		}
>  
> +		vfio_pci_dma_buf_move(vdev, true);
>  		vfio_pci_zap_bars(vdev);
>  	}
>  
> @@ -2482,6 +2506,10 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>  
>  	ret = pci_reset_bus(pdev);
>  
> +	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> +		if (__vfio_pci_memory_enabled(vdev))
> +			vfio_pci_dma_buf_move(vdev, false);
> +
>  	vdev = list_last_entry(&dev_set->device_list,
>  			       struct vfio_pci_core_device, vdev.dev_set_list);
>  
This needs to be placed in the existing undo loop with the up_write(),
otherwise it can be missed in the error case.
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> new file mode 100644
> index 000000000000..eaba010777f3
> --- /dev/null
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> +static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv,
> +				  struct dma_iova_state *state)
> +{
> +	struct phys_vec *phys_vec = priv->phys_vec;
> +	unsigned int nents = 0;
> +	u32 i;
> +
> +	if (!state || !dma_use_iova(state))
> +		for (i = 0; i < priv->nr_ranges; i++)
> +			nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);
> +	else
> +		/*
> +		 * In IOVA case, there is only one SG entry which spans
> +		 * for whole IOVA address space, but we need to make sure
> +		 * that it fits sg->length, maybe we need more.
> +		 */
> +		nents = DIV_ROUND_UP(priv->size, UINT_MAX);
I think we're arguably running afoul of the coding style standard here:
this is not a single simple statement and should use braces.
> +
> +	return nents;
> +}
> +
> +static struct sg_table *
> +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> +		     enum dma_data_direction dir)
> +{
> +	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +	struct dma_iova_state *state = attachment->priv;
> +	struct phys_vec *phys_vec = priv->phys_vec;
> +	unsigned long attrs = DMA_ATTR_MMIO;
> +	unsigned int nents, mapped_len = 0;
> +	struct scatterlist *sgl;
> +	struct sg_table *sgt;
> +	dma_addr_t addr;
> +	int ret;
> +	u32 i;
> +
> +	dma_resv_assert_held(priv->dmabuf->resv);
> +
> +	if (priv->revoked)
> +		return ERR_PTR(-ENODEV);
> +
> +	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> +	if (!sgt)
> +		return ERR_PTR(-ENOMEM);
> +
> +	nents = calc_sg_nents(priv, state);
> +	ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
> +	if (ret)
> +		goto err_kfree_sgt;
> +
> +	sgl = sgt->sgl;
> +
> +	for (i = 0; i < priv->nr_ranges; i++) {
> +		if (!state) {
> +			addr = pci_p2pdma_bus_addr_map(priv->provider,
> +						       phys_vec[i].paddr);
> +		} else if (dma_use_iova(state)) {
> +			ret = dma_iova_link(attachment->dev, state,
> +					    phys_vec[i].paddr, 0,
> +					    phys_vec[i].len, dir, attrs);
> +			if (ret)
> +				goto err_unmap_dma;
> +
> +			mapped_len += phys_vec[i].len;
> +		} else {
> +			addr = dma_map_phys(attachment->dev, phys_vec[i].paddr,
> +					    phys_vec[i].len, dir, attrs);
> +			ret = dma_mapping_error(attachment->dev, addr);
> +			if (ret)
> +				goto err_unmap_dma;
> +		}
> +
> +		if (!state || !dma_use_iova(state))
> +			sgl = fill_sg_entry(sgl, phys_vec[i].len, addr);
> +	}
> +
> +	if (state && dma_use_iova(state)) {
> +		WARN_ON_ONCE(mapped_len != priv->size);
> +		ret = dma_iova_sync(attachment->dev, state, 0, mapped_len);
> +		if (ret)
> +			goto err_unmap_dma;
> +		sgl = fill_sg_entry(sgl, mapped_len, state->addr);
> +	}
> +
> +	/*
> +	 * SGL must be NULL to indicate that SGL is the last one
> +	 * and we allocated correct number of entries in sg_alloc_table()
> +	 */
> +	WARN_ON_ONCE(sgl);
> +	return sgt;
> +
> +err_unmap_dma:
> +	if (!i || !state)
> +		; /* Do nothing */
> +	else if (dma_use_iova(state))
> +		dma_iova_destroy(attachment->dev, state, mapped_len, dir,
> +				 attrs);
> +	else
> +		for_each_sgtable_dma_sg(sgt, sgl, i)
> +			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> +					sg_dma_len(sgl), dir, attrs);
Same, here for braces.
> +	sg_free_table(sgt);
> +err_kfree_sgt:
> +	kfree(sgt);
> +	return ERR_PTR(ret);
> +}
> +
> +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> +				   struct sg_table *sgt,
> +				   enum dma_data_direction dir)
> +{
> +	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +	struct dma_iova_state *state = attachment->priv;
> +	unsigned long attrs = DMA_ATTR_MMIO;
> +	struct scatterlist *sgl;
> +	int i;
> +
> +	if (!state)
> +		; /* Do nothing */
> +	else if (dma_use_iova(state))
> +		dma_iova_destroy(attachment->dev, state, priv->size, dir,
> +				 attrs);
> +	else
> +		for_each_sgtable_dma_sg(sgt, sgl, i)
> +			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> +				       sg_dma_len(sgl), dir, attrs);
> +
Here too.
> +	sg_free_table(sgt);
> +	kfree(sgt);
> +}
...
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 75100bf009ba..63214467c875 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>  
> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> + * regions selected.
> + *
> + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
> + * etc. offset/length specify a slice of the region to create the dmabuf from.
> + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
> + *
Probably worth noting that .flags should be zero; I see we enforce
that.  Thanks,
Alex
> + * Return: The fd number on success, -1 and errno is set on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> +
> +struct vfio_region_dma_range {
> +	__u64 offset;
> +	__u64 length;
> +};
> +
> +struct vfio_device_feature_dma_buf {
> +	__u32	region_index;
> +	__u32	open_flags;
> +	__u32   flags;
> +	__u32   nr_ranges;
> +	struct vfio_region_dma_range dma_ranges[];
> +};
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**
Powered by blists - more mailing lists
 
