[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251030143836.66cdf116@shazbot.org>
Date: Thu, 30 Oct 2025 14:38:36 -0600
From: Alex Williamson <alex@...zbot.org>
To: Leon Romanovsky <leon@...nel.org>
Cc: Alex Williamson <alex.williamson@...hat.com>, Leon Romanovsky
<leonro@...dia.com>, Jason Gunthorpe <jgg@...dia.com>, Andrew Morton
<akpm@...ux-foundation.org>, Bjorn Helgaas <bhelgaas@...gle.com>, Christian
König <christian.koenig@....com>,
dri-devel@...ts.freedesktop.org, iommu@...ts.linux.dev, Jens Axboe
<axboe@...nel.dk>, Joerg Roedel <joro@...tes.org>, kvm@...r.kernel.org,
linaro-mm-sig@...ts.linaro.org, linux-block@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-media@...r.kernel.org,
linux-mm@...ck.org, linux-pci@...r.kernel.org, Logan Gunthorpe
<logang@...tatee.com>, Marek Szyprowski <m.szyprowski@...sung.com>, Robin
Murphy <robin.murphy@....com>, Sumit Semwal <sumit.semwal@...aro.org>,
Vivek Kasireddy <vivek.kasireddy@...el.com>, Will Deacon <will@...nel.org>
Subject: Re: [PATCH v5 9/9] vfio/pci: Add dma-buf export support for MMIO
regions
On Mon, 13 Oct 2025 18:26:11 +0300
Leon Romanovsky <leon@...nel.org> wrote:
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index fe247d0e2831..56b1320238a9 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1511,6 +1520,19 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
> case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
> return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> + case VFIO_DEVICE_FEATURE_DMA_BUF:
> + if (device->ops->ioctl != vfio_pci_core_ioctl)
> + /*
> + * Devices that overwrite general .ioctl() callback
> + * usually do it to implement their own
> + * VFIO_DEVICE_GET_REGION_INFO handlerm and they present
Typo: "handlerm" should be "handler".
> + * different BAR information from the real PCI.
> + *
> + * DMABUF relies on real PCI information.
> + */
> + return -EOPNOTSUPP;
> +
> + return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
> default:
> return -ENOTTY;
> }
...
> @@ -2459,6 +2482,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> break;
> }
>
> + vfio_pci_dma_buf_move(vdev, true);
> vfio_pci_zap_bars(vdev);
> }
>
> @@ -2482,6 +2506,10 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>
> ret = pci_reset_bus(pdev);
>
> + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> + if (__vfio_pci_memory_enabled(vdev))
> + vfio_pci_dma_buf_move(vdev, false);
> +
> vdev = list_last_entry(&dev_set->device_list,
> struct vfio_pci_core_device, vdev.dev_set_list);
>
This needs to be placed in the existing undo loop with the up_write(),
otherwise it can be missed in the error case.
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> new file mode 100644
> index 000000000000..eaba010777f3
> --- /dev/null
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> +static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv,
> + struct dma_iova_state *state)
> +{
> + struct phys_vec *phys_vec = priv->phys_vec;
> + unsigned int nents = 0;
> + u32 i;
> +
> + if (!state || !dma_use_iova(state))
> + for (i = 0; i < priv->nr_ranges; i++)
> + nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);
> + else
> + /*
> + * In IOVA case, there is only one SG entry which spans
> + * for whole IOVA address space, but we need to make sure
> + * that it fits sg->length, maybe we need more.
> + */
> + nents = DIV_ROUND_UP(priv->size, UINT_MAX);
I think we're arguably running afoul of the coding style standard here:
these are not single simple statements and should use braces.
> +
> + return nents;
> +}
> +
> +static struct sg_table *
> +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> + enum dma_data_direction dir)
> +{
> + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> + struct dma_iova_state *state = attachment->priv;
> + struct phys_vec *phys_vec = priv->phys_vec;
> + unsigned long attrs = DMA_ATTR_MMIO;
> + unsigned int nents, mapped_len = 0;
> + struct scatterlist *sgl;
> + struct sg_table *sgt;
> + dma_addr_t addr;
> + int ret;
> + u32 i;
> +
> + dma_resv_assert_held(priv->dmabuf->resv);
> +
> + if (priv->revoked)
> + return ERR_PTR(-ENODEV);
> +
> + sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> + if (!sgt)
> + return ERR_PTR(-ENOMEM);
> +
> + nents = calc_sg_nents(priv, state);
> + ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
> + if (ret)
> + goto err_kfree_sgt;
> +
> + sgl = sgt->sgl;
> +
> + for (i = 0; i < priv->nr_ranges; i++) {
> + if (!state) {
> + addr = pci_p2pdma_bus_addr_map(priv->provider,
> + phys_vec[i].paddr);
> + } else if (dma_use_iova(state)) {
> + ret = dma_iova_link(attachment->dev, state,
> + phys_vec[i].paddr, 0,
> + phys_vec[i].len, dir, attrs);
> + if (ret)
> + goto err_unmap_dma;
> +
> + mapped_len += phys_vec[i].len;
> + } else {
> + addr = dma_map_phys(attachment->dev, phys_vec[i].paddr,
> + phys_vec[i].len, dir, attrs);
> + ret = dma_mapping_error(attachment->dev, addr);
> + if (ret)
> + goto err_unmap_dma;
> + }
> +
> + if (!state || !dma_use_iova(state))
> + sgl = fill_sg_entry(sgl, phys_vec[i].len, addr);
> + }
> +
> + if (state && dma_use_iova(state)) {
> + WARN_ON_ONCE(mapped_len != priv->size);
> + ret = dma_iova_sync(attachment->dev, state, 0, mapped_len);
> + if (ret)
> + goto err_unmap_dma;
> + sgl = fill_sg_entry(sgl, mapped_len, state->addr);
> + }
> +
> + /*
> + * SGL must be NULL to indicate that SGL is the last one
> + * and we allocated correct number of entries in sg_alloc_table()
> + */
> + WARN_ON_ONCE(sgl);
> + return sgt;
> +
> +err_unmap_dma:
> + if (!i || !state)
> + ; /* Do nothing */
> + else if (dma_use_iova(state))
> + dma_iova_destroy(attachment->dev, state, mapped_len, dir,
> + attrs);
> + else
> + for_each_sgtable_dma_sg(sgt, sgl, i)
> + dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> + sg_dma_len(sgl), dir, attrs);
Same here, for braces.
> + sg_free_table(sgt);
> +err_kfree_sgt:
> + kfree(sgt);
> + return ERR_PTR(ret);
> +}
> +
> +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> + struct sg_table *sgt,
> + enum dma_data_direction dir)
> +{
> + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> + struct dma_iova_state *state = attachment->priv;
> + unsigned long attrs = DMA_ATTR_MMIO;
> + struct scatterlist *sgl;
> + int i;
> +
> + if (!state)
> + ; /* Do nothing */
> + else if (dma_use_iova(state))
> + dma_iova_destroy(attachment->dev, state, priv->size, dir,
> + attrs);
> + else
> + for_each_sgtable_dma_sg(sgt, sgl, i)
> + dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> + sg_dma_len(sgl), dir, attrs);
> +
Here too.
> + sg_free_table(sgt);
> + kfree(sgt);
> +}
...
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 75100bf009ba..63214467c875 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master {
> };
> #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>
> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> + * regions selected.
> + *
> + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
> + * etc. offset/length specify a slice of the region to create the dmabuf from.
> + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
> + *
Probably worth noting that .flags should be zero; I see we enforce
that. Thanks,
Alex
> + * Return: The fd number on success, -1 and errno is set on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> +
> +struct vfio_region_dma_range {
> + __u64 offset;
> + __u64 length;
> +};
> +
> +struct vfio_device_feature_dma_buf {
> + __u32 region_index;
> + __u32 open_flags;
> + __u32 flags;
> + __u32 nr_ranges;
> + struct vfio_region_dma_range dma_ranges[];
> +};
> +
> /* -------- API for Type1 VFIO IOMMU -------- */
>
> /**
Powered by blists - more mailing lists