Message-ID: <20250922150032.3e3da410.alex.williamson@redhat.com>
Date: Mon, 22 Sep 2025 15:00:32 -0600
From: Alex Williamson <alex.williamson@...hat.com>
To: Leon Romanovsky <leon@...nel.org>
Cc: Leon Romanovsky <leonro@...dia.com>, Jason Gunthorpe <jgg@...dia.com>,
Andrew Morton <akpm@...ux-foundation.org>, Bjorn Helgaas
<bhelgaas@...gle.com>, Christian König
<christian.koenig@....com>, dri-devel@...ts.freedesktop.org,
iommu@...ts.linux.dev, Jens Axboe <axboe@...nel.dk>, Joerg Roedel
<joro@...tes.org>, kvm@...r.kernel.org, linaro-mm-sig@...ts.linaro.org,
linux-block@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-media@...r.kernel.org, linux-mm@...ck.org, linux-pci@...r.kernel.org,
Logan Gunthorpe <logang@...tatee.com>, Marek Szyprowski
<m.szyprowski@...sung.com>, Robin Murphy <robin.murphy@....com>, Sumit
Semwal <sumit.semwal@...aro.org>, Vivek Kasireddy
<vivek.kasireddy@...el.com>, Will Deacon <will@...nel.org>
Subject: Re: [PATCH v2 03/10] PCI/P2PDMA: Refactor to separate core P2P
functionality from memory allocation
On Thu, 11 Sep 2025 14:33:07 +0300
Leon Romanovsky <leon@...nel.org> wrote:
> From: Leon Romanovsky <leonro@...dia.com>
>
> Refactor the PCI P2PDMA subsystem to separate the core peer-to-peer DMA
> functionality from the optional memory allocation layer. This creates a
> two-tier architecture:
>
> The core layer provides P2P mapping functionality for physical addresses
> based on PCI device MMIO BARs and integrates with the DMA API for
> mapping operations. This layer is required for all P2PDMA users.
>
> The optional upper layer provides memory allocation capabilities,
> including the gen_pool allocator, struct page support, and a sysfs
> interface for user space access.
>
> This separation allows subsystems like VFIO to use only the core P2P
> mapping functionality without the overhead of memory allocation features
> they don't need. The core functionality is now available through the
> new pcim_p2pdma_enable() function, which returns a p2pdma_provider
> structure.
>
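As I read it, the intended core-only usage then looks roughly like the
below (a sketch only; the consumer side of the provider is elided):

	struct p2pdma_provider *provider;

	provider = pcim_p2pdma_enable(pdev, bar);
	if (IS_ERR(provider))
		return PTR_ERR(provider);

	/* hand the provider to the DMA mapping path; no pgmap or pool */
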
> Signed-off-by: Leon Romanovsky <leonro@...dia.com>
> ---
> drivers/pci/p2pdma.c | 129 +++++++++++++++++++++++++++----------
> include/linux/pci-p2pdma.h | 5 ++
> 2 files changed, 100 insertions(+), 34 deletions(-)
>
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 176a99232fdca..c22cbb3a26030 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -25,11 +25,12 @@ struct pci_p2pdma {
> struct gen_pool *pool;
> bool p2pmem_published;
> struct xarray map_types;
> + struct p2pdma_provider mem[PCI_STD_NUM_BARS];
> };
>
> struct pci_p2pdma_pagemap {
> struct dev_pagemap pgmap;
> - struct p2pdma_provider mem;
> + struct p2pdma_provider *mem;
> };
>
> static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap)
> @@ -204,7 +205,7 @@ static void p2pdma_page_free(struct page *page)
> struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page));
> /* safe to dereference while a reference is held to the percpu ref */
> struct pci_p2pdma *p2pdma = rcu_dereference_protected(
> - to_pci_dev(pgmap->mem.owner)->p2pdma, 1);
> + to_pci_dev(pgmap->mem->owner)->p2pdma, 1);
> struct percpu_ref *ref;
>
> gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page),
> @@ -227,44 +228,93 @@ static void pci_p2pdma_release(void *data)
>
> /* Flush and disable pci_alloc_p2p_mem() */
> pdev->p2pdma = NULL;
> - synchronize_rcu();
> + if (p2pdma->pool)
> + synchronize_rcu();
> + xa_destroy(&p2pdma->map_types);
> +
> + if (!p2pdma->pool)
> + return;
>
> gen_pool_destroy(p2pdma->pool);
> sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
> - xa_destroy(&p2pdma->map_types);
> }
>
> -static int pci_p2pdma_setup(struct pci_dev *pdev)
> +/**
> + * pcim_p2pdma_enable - Enable peer-to-peer DMA support for a PCI device
> + * @pdev: The PCI device to enable P2PDMA for
> + * @bar: BAR index to get provider
> + *
> + * This function initializes the peer-to-peer DMA infrastructure for a PCI
> + * device. It allocates and sets up the necessary data structures to support
> + * P2PDMA operations, including mapping type tracking.
> + */
> +struct p2pdma_provider *pcim_p2pdma_enable(struct pci_dev *pdev, int bar)
> {
> - int error = -ENOMEM;
> struct pci_p2pdma *p2p;
> + int i, ret;
> +
> + p2p = rcu_dereference_protected(pdev->p2pdma, 1);
> + if (p2p)
> + /* PCI device was "rebound" to the driver */
> + return &p2p->mem[bar];
>
This seems like two separate functions rolled into one: an 'initialize
providers' and a 'get provider for BAR'. The comment above even makes
it sound like only a driver re-probing a device would encounter this
branch, but the use case later in vfio-pci shows that iterating the
BARs of an already-initialized device is the common case.
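
If the one-time setup really is distinct from the lookup, a split along
these lines (names made up, just a sketch) might read better:

	/* one-time: record providers for all MMIO BARs */
	int pcim_p2pdma_init(struct pci_dev *pdev);

	/* per-BAR lookup; errors out for non-MMIO BARs */
	struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev,
						     int bar);

That would confine the "rebound" case to the init path and let callers
iterate BARs with the lookup alone.
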
But then, later in patch 8/ and again in 10/, why exactly do we cache
the provider on the vfio_pci_core_device rather than ask the p2pdma
core for it on demand?
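
I.e. rather than stashing the provider at open time, couldn't the
dmabuf path just ask each time, something like (reusing the
hypothetical lookup above):

	provider = pcim_p2pdma_provider(vdev->pdev, bar);
	if (IS_ERR(provider))
		return PTR_ERR(provider);
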
It also seems like the coordination of a valid provider is ad-hoc
between p2pdma and vfio-pci. For example, this only fills providers
for MMIO BARs, and vfio-pci separately validates that dmabuf operations
are for MMIO BARs; it would be more consistent if vfio-pci relied on
p2pdma to hand it a valid provider for a given BAR. Thanks,

Alex
> p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL);
> if (!p2p)
> - return -ENOMEM;
> + return ERR_PTR(-ENOMEM);
>
> xa_init(&p2p->map_types);
> + /*
> + * Iterate over all standard PCI BARs and record only those that
> + * correspond to MMIO regions. Skip non-memory resources (e.g. I/O
> + * port BARs) since they cannot be used for peer-to-peer (P2P)
> + * transactions.
> + */
> + for (i = 0; i < PCI_STD_NUM_BARS; i++) {
> + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
> + continue;
>
> - p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
> - if (!p2p->pool)
> - goto out;
> + p2p->mem[i].owner = &pdev->dev;
> + p2p->mem[i].bus_offset =
> + pci_bus_address(pdev, i) - pci_resource_start(pdev, i);
> + }
>
> - error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
> - if (error)
> - goto out_pool_destroy;
> + ret = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
> + if (ret)
> + goto out_p2p;
>
> - error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
> - if (error)
> + rcu_assign_pointer(pdev->p2pdma, p2p);
> + return &p2p->mem[bar];
> +
> +out_p2p:
> + devm_kfree(&pdev->dev, p2p);
> + return ERR_PTR(ret);
> +}
> +EXPORT_SYMBOL_GPL(pcim_p2pdma_enable);
> +
> +static int pci_p2pdma_setup_pool(struct pci_dev *pdev)
> +{
> + struct pci_p2pdma *p2pdma;
> + int ret;
> +
> + p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
> + if (p2pdma->pool)
> + /* We already set up the pool, do nothing */
> + return 0;
> +
> + p2pdma->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
> + if (!p2pdma->pool)
> + return -ENOMEM;
> +
> + ret = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
> + if (ret)
> goto out_pool_destroy;
>
> - rcu_assign_pointer(pdev->p2pdma, p2p);
> return 0;
>
> out_pool_destroy:
> - gen_pool_destroy(p2p->pool);
> -out:
> - devm_kfree(&pdev->dev, p2p);
> - return error;
> + gen_pool_destroy(p2pdma->pool);
> + p2pdma->pool = NULL;
> + return ret;
> }
>
> static void pci_p2pdma_unmap_mappings(void *data)
> @@ -276,7 +326,7 @@ static void pci_p2pdma_unmap_mappings(void *data)
> * unmap_mapping_range() on the inode, teardown any existing userspace
> * mappings and prevent new ones from being created.
> */
> - sysfs_remove_file_from_group(&p2p_pgmap->mem.owner->kobj,
> + sysfs_remove_file_from_group(&p2p_pgmap->mem->owner->kobj,
> &p2pmem_alloc_attr.attr,
> p2pmem_group.name);
> }
> @@ -295,6 +345,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
> u64 offset)
> {
> struct pci_p2pdma_pagemap *p2p_pgmap;
> + struct p2pdma_provider *mem;
> struct dev_pagemap *pgmap;
> struct pci_p2pdma *p2pdma;
> void *addr;
> @@ -312,15 +363,25 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
> if (size + offset > pci_resource_len(pdev, bar))
> return -EINVAL;
>
> - if (!pdev->p2pdma) {
> - error = pci_p2pdma_setup(pdev);
> + p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
> + if (!p2pdma) {
> + mem = pcim_p2pdma_enable(pdev, bar);
> + if (IS_ERR(mem))
> + return PTR_ERR(mem);
> +
> + error = pci_p2pdma_setup_pool(pdev);
> if (error)
> return error;
> - }
> +
> + p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
> + } else
> + mem = &p2pdma->mem[bar];
>
> p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL);
> - if (!p2p_pgmap)
> - return -ENOMEM;
> + if (!p2p_pgmap) {
> + error = -ENOMEM;
> + goto free_pool;
> + }
>
> pgmap = &p2p_pgmap->pgmap;
> pgmap->range.start = pci_resource_start(pdev, bar) + offset;
> @@ -328,9 +389,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
> pgmap->nr_range = 1;
> pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
> pgmap->ops = &p2pdma_pgmap_ops;
> - p2p_pgmap->mem.owner = &pdev->dev;
> - p2p_pgmap->mem.bus_offset =
> - pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar);
> + p2p_pgmap->mem = mem;
>
> addr = devm_memremap_pages(&pdev->dev, pgmap);
> if (IS_ERR(addr)) {
> @@ -343,7 +402,6 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
> if (error)
> goto pages_free;
>
> - p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
> error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
> pci_bus_address(pdev, bar) + offset,
> range_len(&pgmap->range), dev_to_node(&pdev->dev),
> @@ -359,7 +417,10 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
> pages_free:
> devm_memunmap_pages(&pdev->dev, pgmap);
> pgmap_free:
> - devm_kfree(&pdev->dev, pgmap);
> + devm_kfree(&pdev->dev, p2p_pgmap);
> +free_pool:
> + sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
> + gen_pool_destroy(p2pdma->pool);
> return error;
> }
> EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource);
> @@ -1008,11 +1069,11 @@ void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state,
> {
> struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page));
>
> - if (state->mem == &p2p_pgmap->mem)
> + if (state->mem == p2p_pgmap->mem)
> return;
>
> - state->mem = &p2p_pgmap->mem;
> - state->map = pci_p2pdma_map_type(&p2p_pgmap->mem, dev);
> + state->mem = p2p_pgmap->mem;
> + state->map = pci_p2pdma_map_type(p2p_pgmap->mem, dev);
> }
>
> /**
> diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
> index eef96636c67e6..888ad7b0c54cf 100644
> --- a/include/linux/pci-p2pdma.h
> +++ b/include/linux/pci-p2pdma.h
> @@ -27,6 +27,7 @@ struct p2pdma_provider {
> };
>
> #ifdef CONFIG_PCI_P2PDMA
> +struct p2pdma_provider *pcim_p2pdma_enable(struct pci_dev *pdev, int bar);
> int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
> u64 offset);
> int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients,
> @@ -45,6 +46,10 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
> ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
> bool use_p2pdma);
> #else /* CONFIG_PCI_P2PDMA */
> +static inline struct p2pdma_provider *pcim_p2pdma_enable(struct pci_dev *pdev, int bar)
> +{
> + return ERR_PTR(-EOPNOTSUPP);
> +}
> static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
> size_t size, u64 offset)
> {