lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aFHWbX_LTjcRveVm@x1.local>
Date: Tue, 17 Jun 2025 16:56:13 -0400
From: Peter Xu <peterx@...hat.com>
To: Jason Gunthorpe <jgg@...dia.com>,
	"Liam R. Howlett" <Liam.Howlett@...cle.com>,
	Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org, kvm@...r.kernel.org,
	Andrew Morton <akpm@...ux-foundation.org>,
	Alex Williamson <alex.williamson@...hat.com>,
	Zi Yan <ziy@...dia.com>, Alex Mastro <amastro@...com>,
	David Hildenbrand <david@...hat.com>,
	Nico Pache <npache@...hat.com>
Subject: Re: [PATCH 5/5] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED
 mappings

On Mon, Jun 16, 2025 at 08:00:11PM -0300, Jason Gunthorpe wrote:
> On Mon, Jun 16, 2025 at 06:06:23PM -0400, Peter Xu wrote:
> 
> > Can I understand it as a suggestion to pass in a bitmask into the core mm
> > API (e.g. keep the name of mm_get_unmapped_area_aligned()), instead of a
> > constant "align", so that core mm would try to allocate from the largest
> > size to smaller until it finds some working VA to use?
> 
> I don't think you need a bitmask.
> 
> Split the concerns, the caller knows what is inside it's FD. It only
> needs to provide the highest pgoff aligned folio/pfn within the FD.

Ultimately I even dropped this hint.  I found that it's not really
get_unmapped_area()'s job to detect over-sized pgoffs.  It's mmap()'s job.
So I decided to avoid this parameter as of now.

> 
> The mm knows what leaf page tables options exist. It should try to
> align to the closest leaf page table size that is <= the FD's max
> aligned folio.

So again IMHO this is also not per-FD information, but needs to be passed
over from the driver for each call.

Likely the "order" parameter appeared in other discussions to imply a
maximum supported size from the driver side (or, for a folio, but that is
definitely another user after this series can land).

So far I didn't yet add the "order", because currently VFIO definitely
supports all max orders the system supports.  Maybe we can add the order
when there's a real need, but maybe it won't happen in the near future?

In summary, I came up with below changes, would below look reasonable?

I also added Liam and Lorenzo in this reply.

================8<================

>From 7f1b7aada21ab036849edc49635fb0656e0457c4 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@...hat.com>
Date: Fri, 30 May 2025 12:45:55 -0400
Subject: [PATCH 1/4] mm: Rename __thp_get_unmapped_area to
 mm_get_unmapped_area_aligned

This function is handy to locate an unmapped region which is best aligned
to the specified alignment, taking whatever form of pgoff address space
into considerations.

Rename the function and make it more general for even non-THP use in follow
up patches.  Dropping "THP" in the name because it doesn't have much to do
with THP internally.  The suffix "_aligned" imply it is a helper to
generate aligned virtual address based on what is specified (which can be
not PMD_SIZE).

When at it, using check_add_overflow() helpers to verify the inputs to make
sure no overflow will happen.

Cc: Baolin Wang <baolin.wang@...ux.alibaba.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
Cc: "Liam R. Howlett" <Liam.Howlett@...cle.com>
Cc: Ryan Roberts <ryan.roberts@....com>
Cc: Dev Jain <dev.jain@....com>
Cc: Barry Song <baohua@...nel.org>
Signed-off-by: Peter Xu <peterx@...hat.com>
---
 mm/huge_memory.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4734de1dc0ae..885b5845dbba 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1088,23 +1088,28 @@ static inline bool is_transparent_hugepage(const struct folio *folio)
 		folio_test_large_rmappable(folio);
 }
 
-static unsigned long __thp_get_unmapped_area(struct file *filp,
+static unsigned long mm_get_unmapped_area_aligned(struct file *filp,
 		unsigned long addr, unsigned long len,
-		loff_t off, unsigned long flags, unsigned long size,
+		loff_t off, unsigned long flags, unsigned long align,
 		vm_flags_t vm_flags)
 {
-	loff_t off_end = off + len;
-	loff_t off_align = round_up(off, size);
+	loff_t off_end;
+	loff_t off_align = round_up(off, align);
 	unsigned long len_pad, ret, off_sub;
 
 	if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
 		return 0;
 
-	if (off_end <= off_align || (off_end - off_align) < size)
+	/* Can't use the overflow API, do manual check for now */
+	if (off_align < off)
 		return 0;
-
-	len_pad = len + size;
-	if (len_pad < len || (off + len_pad) < off)
+	if (check_add_overflow(off, len, &off_end))
+		return 0;
+	if (off_end <= off_align || (off_end - off_align) < align)
+		return 0;
+	if (check_add_overflow(len, align, &len_pad))
+		return 0;
+	if ((off + len_pad) < off)
 		return 0;
 
 	ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
@@ -1118,16 +1123,16 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
 		return 0;
 
 	/*
-	 * Do not try to align to THP boundary if allocation at the address
+	 * Do not try to provide alignment if allocation at the address
 	 * hint succeeds.
 	 */
 	if (ret == addr)
 		return addr;
 
-	off_sub = (off - ret) & (size - 1);
+	off_sub = (off - ret) & (align - 1);
 
 	if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
-		return ret + size;
+		return ret + align;
 
 	ret += off_sub;
 	return ret;
@@ -1140,7 +1145,8 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
 	unsigned long ret;
 	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
 
-	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
+	ret = mm_get_unmapped_area_aligned(filp, addr, len, off, flags,
+					   PMD_SIZE, vm_flags);
 	if (ret)
 		return ret;
 
-- 
2.49.0


>From 709379a39f4a59a6d3bda7a39ca55f08fdaf9e1a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@...hat.com>
Date: Tue, 17 Jun 2025 15:27:07 -0400
Subject: [PATCH 2/4] mm: huge_mapping_get_va_aligned() helper

Add this helper to allocate a VA that would be best to map huge mappings
that the system would support. It can be used in file's get_unmapped_area()
functions as long as proper max_pgoff will be provided so that core mm will
know the available range of pgoff to map in the future.

Signed-off-by: Peter Xu <peterx@...hat.com>
---
 include/linux/huge_mm.h | 10 ++++++++-
 mm/huge_memory.c        | 46 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2f190c90192d..59fdafb1034b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -339,7 +339,8 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags,
 		vm_flags_t vm_flags);
-
+unsigned long huge_mapping_get_va_aligned(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags);
 bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		unsigned int new_order);
@@ -543,6 +544,13 @@ thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
 	return 0;
 }
 
+static inline unsigned long
+huge_mapping_get_va_aligned(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
+}
+
 static inline bool
 can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 885b5845dbba..bc016b656dc7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1161,6 +1161,52 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 
+/**
+ * huge_mapping_get_va_aligned: best-effort VA allocation for huge mappings
+ *
+ * @filp: file target of the mmap() request
+ * @addr: hint address from mmap() request
+ * @len: len of the mmap() request
+ * @pgoff: file offset of the mmap() request
+ * @flags: flags of the mmap() request
+ *
+ * This function should normally be used by a driver's specific
+ * get_unmapped_area() handler to provide a huge-mapping friendly virtual
+ * address for a specific mmap() request.  The caller should pass in most
+ * of the parameters from the get_unmapped_area() request.
+ *
+ * Normally it means the caller's mmap() needs to also support any possible
+ * huge mappings the system supports.
+ *
+ * Return: a best-effort virtual address that will satisfy the most huge
+ * mappings for the result VMA to be mapped.
+ */
+unsigned long huge_mapping_get_va_aligned(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
+	unsigned long ret;
+
+	/* TODO: support continuous ptes/pmds */
+	if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+	    len >= PUD_SIZE) {
+		ret = mm_get_unmapped_area_aligned(filp, addr, len, off, flags,
+						   PUD_SIZE, 0);
+		if (ret)
+			return ret;
+	}
+
+	if (len >= PMD_SIZE) {
+		ret = mm_get_unmapped_area_aligned(filp, addr, len, off, flags,
+						   PMD_SIZE, 0);
+		if (ret)
+			return ret;
+	}
+
+	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
+}
+EXPORT_SYMBOL_GPL(huge_mapping_get_va_aligned);
+
 static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
 		unsigned long addr)
 {
-- 
2.49.0


>From ff90dbba05ea54e5c6690fbedf330c837f8f0ea1 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@...hat.com>
Date: Wed, 4 Jun 2025 17:54:40 -0400
Subject: [PATCH 3/4] vfio: Introduce vfio_device_ops.get_unmapped_area hook

Add a hook to vfio_device_ops to allow sub-modules provide virtual
addresses for an mmap() request.

Note that the fallback will be mm_get_unmapped_area(), which should
maintain the old behavior of generic VA allocation (__get_unmapped_area).
It's a bit unfortunate that is needed, as the current get_unmapped_area()
file ops cannot support a retval which fallbacks to the default.  So that
is needed both here and whenever sub-module will opt-in with its own.

Signed-off-by: Peter Xu <peterx@...hat.com>
---
 drivers/vfio/vfio_main.c | 25 +++++++++++++++++++++++++
 include/linux/vfio.h     |  8 ++++++++
 2 files changed, 33 insertions(+)

diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 1fd261efc582..480cc2398810 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1354,6 +1354,28 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
 	return device->ops->mmap(device, vma);
 }
 
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+static unsigned long vfio_device_get_unmapped_area(struct file *file,
+						   unsigned long addr,
+						   unsigned long len,
+						   unsigned long pgoff,
+						   unsigned long flags)
+{
+	struct vfio_device_file *df = file->private_data;
+	struct vfio_device *device = df->device;
+	unsigned long ret;
+
+	if (device->ops->get_unmapped_area) {
+		ret = device->ops->get_unmapped_area(device, file, addr,
+						      len, pgoff, flags);
+		if (ret)
+			return ret;
+	}
+
+	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
+}
+#endif
+
 const struct file_operations vfio_device_fops = {
 	.owner		= THIS_MODULE,
 	.open		= vfio_device_fops_cdev_open,
@@ -1363,6 +1385,9 @@ const struct file_operations vfio_device_fops = {
 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
 	.mmap		= vfio_device_fops_mmap,
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+	.get_unmapped_area = vfio_device_get_unmapped_area,
+#endif
 };
 
 static struct vfio_device *vfio_device_from_file(struct file *file)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 707b00772ce1..d900541e2716 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -108,6 +108,8 @@ struct vfio_device {
  * @dma_unmap: Called when userspace unmaps IOVA from the container
  *             this device is attached to.
  * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl
+ * @get_unmapped_area: Optional, provide virtual address hint for mmap().
+ *                     If zero is returned, fallback to the default allocator.
  */
 struct vfio_device_ops {
 	char	*name;
@@ -135,6 +137,12 @@ struct vfio_device_ops {
 	void	(*dma_unmap)(struct vfio_device *vdev, u64 iova, u64 length);
 	int	(*device_feature)(struct vfio_device *device, u32 flags,
 				  void __user *arg, size_t argsz);
+	unsigned long (*get_unmapped_area)(struct vfio_device *device,
+					   struct file *file,
+					   unsigned long addr,
+					   unsigned long len,
+					   unsigned long pgoff,
+					   unsigned long flags);
 };
 
 #if IS_ENABLED(CONFIG_IOMMUFD)
-- 
2.49.0

>From 38539aafac83ae204d3e03f441f7e33841db6b07 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@...hat.com>
Date: Fri, 30 May 2025 13:21:20 -0400
Subject: [PATCH 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED
 mappings

This patch enables best-effort mmap() for vfio-pci bars even without
MAP_FIXED, so as to utilize huge pfnmaps as much as possible.  It should
also avoid userspace changes (switching to MAP_FIXED with pre-aligned VA
addresses) to start enabling huge pfnmaps on VFIO bars.

Here the trick is making sure the MMIO PFNs will be aligned with the VAs
allocated from mmap() when !MAP_FIXED, so that whatever returned from
mmap(!MAP_FIXED) of vfio-pci MMIO regions will be automatically suitable
for huge pfnmaps as much as possible.

To achieve that, a custom vfio_device's get_unmapped_area() for vfio-pci
devices is needed.

Note, MMIO physical addresses should normally be guaranteed to be always
bar-size aligned, hence the bar offset can logically be directly used to do
the calculation.  However to make it strict and clear (rather than relying
on spec details), we still try to fetch the bar's physical addresses from
pci_dev.resource[].

[1] https://lore.kernel.org/linux-pci/20250529214414.1508155-1-amastro@fb.com/

Reported-by: Alex Mastro <amastro@...com>
Signed-off-by: Peter Xu <peterx@...hat.com>
---
 drivers/vfio/pci/vfio_pci.c      |  1 +
 drivers/vfio/pci/vfio_pci_core.c | 34 ++++++++++++++++++++++++++++++++
 include/linux/vfio_pci_core.h    |  3 +++
 3 files changed, 38 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 5ba39f7623bb..32b570f17d0f 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -144,6 +144,7 @@ static const struct vfio_device_ops vfio_pci_ops = {
 	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
 	.pasid_attach_ioas	= vfio_iommufd_physical_pasid_attach_ioas,
 	.pasid_detach_ioas	= vfio_iommufd_physical_pasid_detach_ioas,
+	.get_unmapped_area	= vfio_pci_core_get_unmapped_area,
 };
 
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 6328c3a05bcd..5392bec4929a 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1641,6 +1641,40 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma)
 	return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
 }
 
+/*
+ * Hint function to provide mmap() virtual address candidate so as to be
+ * able to map huge pfnmaps as much as possible.  It is done by aligning
+ * the VA to the PFN to be mapped in the specific bar.
+ *
+ * Note that this function does the minimum check on mmap() parameters to
+ * make the PFN calculation valid only. The majority of mmap() sanity check
+ * will be done later in mmap().
+ */
+unsigned long vfio_pci_core_get_unmapped_area(struct vfio_device *device,
+		struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(device, struct vfio_pci_core_device, vdev);
+	struct pci_dev *pdev = vdev->pdev;
+	unsigned int index = pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+	unsigned long req_start;
+
+	/* Currently, only bars 0-5 supports huge pfnmap */
+	if (index >= VFIO_PCI_ROM_REGION_INDEX)
+		return 0;
+
+	/* Calculate the start of physical address to be mapped */
+	req_start = (pgoff << PAGE_SHIFT) & ((1UL << VFIO_PCI_OFFSET_SHIFT) - 1);
+	if (check_add_overflow(req_start, pci_resource_start(pdev, index),
+			       &req_start))
+		return 0;
+
+	return huge_mapping_get_va_aligned(file, addr, len, req_start >> PAGE_SHIFT,
+					   flags);
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_get_unmapped_area);
+
 static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
 					   unsigned int order)
 {
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index fbb472dd99b3..d97c920b4dbf 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -119,6 +119,9 @@ ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
 		size_t count, loff_t *ppos);
 ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
 		size_t count, loff_t *ppos);
+unsigned long vfio_pci_core_get_unmapped_area(struct vfio_device *device,
+		struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags);
 int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma);
 void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count);
 int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
-- 
2.49.0


-- 
Peter Xu


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ