Message-ID: <ru35ev2clily7277fh2uwxuiellerlocfexhjkqim7stixuact@7fp5h7fdmz5h>
Date: Thu, 8 Jan 2026 16:20:58 +1100
From: Alistair Popple <apopple@...dia.com>
To: Hou Tao <houtao@...weicloud.com>
Cc: linux-kernel@...r.kernel.org, linux-pci@...r.kernel.org,
linux-mm@...ck.org, linux-nvme@...ts.infradead.org,
Bjorn Helgaas <bhelgaas@...gle.com>, Logan Gunthorpe <logang@...tatee.com>,
Leon Romanovsky <leonro@...dia.com>, Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
Tejun Heo <tj@...nel.org>, "Rafael J . Wysocki" <rafael@...nel.org>,
Danilo Krummrich <dakr@...nel.org>, Andrew Morton <akpm@...ux-foundation.org>,
David Hildenbrand <david@...nel.org>, Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
Keith Busch <kbusch@...nel.org>, Jens Axboe <axboe@...nel.dk>, Christoph Hellwig <hch@....de>,
Sagi Grimberg <sagi@...mberg.me>, houtao1@...wei.com
Subject: Re: [PATCH 10/13] PCI/P2PDMA: support compound page in
p2pmem_alloc_mmap()
On 2025-12-20 at 15:04 +1100, Hou Tao <houtao@...weicloud.com> wrote...
> From: Hou Tao <houtao1@...wei.com>
>
> P2PDMA memory already supports compound pages and the helpers for
> inserting compound pages into a vma are also ready, so add support for
> compound pages in p2pmem_alloc_mmap() as well. This greatly reduces the
> overhead of mmap() and get_user_pages() when compound pages are enabled
> for p2pdma memory.
>
> The use of vm_private_data to save the alignment of the p2pdma memory
> needs explanation. The normal way to get the alignment is through the
> pci_dev, which could be reached either by invoking kernfs_of() and
> sysfs_file_kobj() or by defining a new struct kernfs_vm_ops that passes
> the kobject to the may_split() and ->pagesize() callbacks. The former
> approach depends too much on kernfs implementation details, and the
> latter would cause excessive churn. Therefore, choose the simpler way
> of saving the alignment in vm_private_data instead.
>
> Signed-off-by: Hou Tao <houtao1@...wei.com>
> ---
> drivers/pci/p2pdma.c | 48 ++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 44 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index e97f5da73458..4a133219ac43 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -128,6 +128,25 @@ static unsigned long p2pmem_get_unmapped_area(struct file *filp, struct kobject
> return mm_get_unmapped_area(filp, uaddr, len, pgoff, flags);
> }
>
> +static int p2pmem_may_split(struct vm_area_struct *vma, unsigned long addr)
> +{
> + size_t align = (uintptr_t)vma->vm_private_data;
> +
> + if (!IS_ALIGNED(addr, align))
> + return -EINVAL;
> + return 0;
> +}
> +
> +static unsigned long p2pmem_pagesize(struct vm_area_struct *vma)
> +{
> + return (uintptr_t)vma->vm_private_data;
> +}
> +
> +static const struct vm_operations_struct p2pmem_vm_ops = {
> + .may_split = p2pmem_may_split,
> + .pagesize = p2pmem_pagesize,
> +};
> +
> static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
> const struct bin_attribute *attr, struct vm_area_struct *vma)
> {
> @@ -136,6 +155,7 @@ static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
> struct pci_p2pdma *p2pdma;
> struct percpu_ref *ref;
> unsigned long vaddr;
> + size_t align;
> void *kaddr;
> int ret;
>
> @@ -161,6 +181,16 @@ static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
> goto out;
> }
>
> + align = p2pdma->align;
> + if (vma->vm_start & (align - 1) || vma->vm_end & (align - 1)) {
> + pci_info_ratelimited(pdev,
> + "%s: unaligned vma (%#lx~%#lx, %#lx)\n",
> + current->comm, vma->vm_start, vma->vm_end,
> + align);
> + ret = -EINVAL;
> + goto out;
> + }
> +
> kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref);
> if (!kaddr) {
> ret = -ENOMEM;
> @@ -178,7 +208,7 @@ static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
> }
> rcu_read_unlock();
>
> - for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
> + for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += align) {
> struct page *page = virt_to_page(kaddr);
>
> /*
> @@ -188,7 +218,12 @@ static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
> */
> VM_WARN_ON_ONCE_PAGE(page_ref_count(page), page);
> set_page_count(page, 1);
> - ret = vm_insert_page(vma, vaddr, page);
> + if (align == PUD_SIZE)
> + ret = vm_insert_folio_pud(vma, vaddr, page_folio(page));
> + else if (align == PMD_SIZE)
> + ret = vm_insert_folio_pmd(vma, vaddr, page_folio(page));

This doesn't look quite right to me - where do you initialise the folio
metadata? I'd expect a call to prep_compound_page() or some equivalent somewhere
- for example calling something like zone_device_page_init() to set the correct
folio order, etc.
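
Just to make that concrete, here is a rough sketch of the kind of thing
I had in mind between the gen_pool allocation and the huge folio
inserts. p2pmem_prep_folio() is a name I've made up for illustration,
it isn't part of this patch, and prep_compound_page() is currently
mm-internal, so the real patch would need an exported equivalent (or a
zone_device_page_init() style helper that takes an order):

#include <linux/mm.h>

/* Hypothetical helper, for illustration only. */
static void p2pmem_prep_folio(struct page *page, size_t align)
{
        unsigned int order = get_order(align);

        if (order)
                /*
                 * Set the folio order and head/tail metadata so that
                 * page_folio(page) really describes 'align' bytes
                 * before it is mapped with vm_insert_folio_{pmd,pud}().
                 */
                prep_compound_page(page, order);
}

Something like p2pmem_prep_folio(virt_to_page(kaddr), align) on each
head page in the insert loop would at least make the folio order match
the mapping size.
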
- Alistair

> + else
> + ret = vm_insert_page(vma, vaddr, page);
> if (ret) {
> gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
> percpu_ref_put(ref);
> @@ -196,10 +231,15 @@ static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
> }
> percpu_ref_get(ref);
> put_page(page);
> - kaddr += PAGE_SIZE;
> - len -= PAGE_SIZE;
> + kaddr += align;
> + len -= align;
> }
>
> + /* Disable unaligned splitting due to vma merge */
> + vm_flags_set(vma, VM_DONTEXPAND);
> + vma->vm_ops = &p2pmem_vm_ops;
> + vma->vm_private_data = (void *)(uintptr_t)align;
> +
> percpu_ref_put(ref);
>
> return 0;
> --
> 2.29.2
>