lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aTGmkHsRSsnneW0G@x1.local>
Date: Thu, 4 Dec 2025 10:19:44 -0500
From: Peter Xu <peterx@...hat.com>
To: kvm@...r.kernel.org, linux-mm@...ck.org, linux-kernel@...r.kernel.org
Cc: Jason Gunthorpe <jgg@...dia.com>, Nico Pache <npache@...hat.com>,
	Zi Yan <ziy@...dia.com>, Alex Mastro <amastro@...com>,
	David Hildenbrand <david@...hat.com>,
	Alex Williamson <alex@...zbot.org>, Zhi Wang <zhiw@...dia.com>,
	David Laight <david.laight.linux@...il.com>,
	Yi Liu <yi.l.liu@...el.com>, Ankit Agrawal <ankita@...dia.com>,
	Kevin Tian <kevin.tian@...el.com>,
	Andrew Morton <akpm@...ux-foundation.org>,
	David Hildenbrand <david@...nel.org>,
	Matthew Wilcox <willy@...radead.org>,
	Jonathan Corbet <corbet@....net>, linux-fsdevel@...r.kernel.org,
	linux-doc@...r.kernel.org,
	"Liam R. Howlett" <Liam.Howlett@...cle.com>,
	Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
	Vlastimil Babka <vbabka@...e.cz>, Jann Horn <jannh@...gle.com>,
	Pedro Falcato <pfalcato@...e.de>,
	Alexander Viro <viro@...iv.linux.org.uk>,
	Christian Brauner <brauner@...nel.org>, Jan Kara <jack@...e.cz>,
	Mike Rapoport <rppt@...nel.org>,
	Suren Baghdasaryan <surenb@...gle.com>,
	Michal Hocko <mhocko@...nel.org>
Subject: Re: [PATCH v2 2/4] mm: Add file_operations.get_mapping_order()

I forgot to copy mm/fs maintainers for the 1st/2nd patches in this series,
my apologies.  Whole series can be found here:

https://lore.kernel.org/r/20251204151003.171039-1-peterx@redhat.com

I'll update the Cc list when I repost.

Thanks,

On Thu, Dec 04, 2025 at 10:10:01AM -0500, Peter Xu wrote:
> Add one new file operation, get_mapping_order().  It can be used by file
> backends to report mapping order hints.
> 
> By default, Linux assumes we will map in PAGE_SIZE chunks.  With this hint,
> the driver can report the possibility of mapping chunks that are larger
> than PAGE_SIZE.  Then, the VA allocator will try to use that as alignment
> when allocating the VA ranges.
> 
> This is useful because when chunks to be mapped are larger than PAGE_SIZE,
> VA alignment matters and it needs to be aligned with the size of the chunk
> to be mapped.
> 
> That said, no matter what alignment is used for the VA allocation, the
> driver can still decide what size to use when mapping the chunks.  It is
> also not an issue if it keeps mapping in PAGE_SIZE.
> 
> get_mapping_order() is defined to take three parameters.  The 1st
> parameter is the file object pointer; the 2nd and 3rd parameters are the
> pgoff and size of the mmap() request.  Its retval is defined as the
> order, which must be non-negative to enable the alignment.  When zero is
> returned, it should behave as if the hint were not provided, IOW,
> alignment will still be PAGE_SIZE.
> 
> When the order is too big, the hint is ignored.  Normally drivers are
> trusted, so this is more of an extra safety measure.
> 
> Suggested-by: Jason Gunthorpe <jgg@...dia.com>
> Signed-off-by: Peter Xu <peterx@...hat.com>
> ---
>  Documentation/filesystems/vfs.rst |  4 +++
>  include/linux/fs.h                |  1 +
>  mm/mmap.c                         | 59 +++++++++++++++++++++++++++----
>  3 files changed, 57 insertions(+), 7 deletions(-)
> 
> diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
> index 4f13b01e42eb5..b707ddbebbf52 100644
> --- a/Documentation/filesystems/vfs.rst
> +++ b/Documentation/filesystems/vfs.rst
> @@ -1069,6 +1069,7 @@ This describes how the VFS can manipulate an open file.  As of kernel
>  		int (*fasync) (int, struct file *, int);
>  		int (*lock) (struct file *, int, struct file_lock *);
>  		unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
> +		int (*get_mapping_order)(struct file *, unsigned long, size_t);
>  		int (*check_flags)(int);
>  		int (*flock) (struct file *, int, struct file_lock *);
>  		ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
> @@ -1165,6 +1166,9 @@ otherwise noted.
>  ``get_unmapped_area``
>  	called by the mmap(2) system call
>  
> +``get_mapping_order``
> +	called by the mmap(2) system call to get mapping order hint
> +
>  ``check_flags``
>  	called by the fcntl(2) system call for F_SETFL command
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index dd3b57cfadeeb..5ba373576bfe5 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2287,6 +2287,7 @@ struct file_operations {
>  	int (*fasync) (int, struct file *, int);
>  	int (*lock) (struct file *, int, struct file_lock *);
>  	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
> +	int (*get_mapping_order)(struct file *file, unsigned long pgoff, size_t len);
>  	int (*check_flags)(int);
>  	int (*flock) (struct file *, int, struct file_lock *);
>  	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 8fa397a18252e..be3dd0623f00c 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -808,6 +808,33 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi
>  	return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
>  }
>  
> +static inline bool file_has_mmap_order_hint(struct file *file)
> +{
> +	return file && file->f_op && file->f_op->get_mapping_order;
> +}
> +
> +static inline bool
> +mmap_should_align(struct file *file, unsigned long addr, unsigned long len)
> +{
> +	/* When THP not enabled at all, skip */
> +	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> +		return false;
> +
> +	/* Never try any alignment if the mmap() address hint is provided */
> +	if (addr)
> +		return false;
> +
> +	/* Anonymous THP could use some better alignment when len aligned */
> +	if (!file)
> +		return IS_ALIGNED(len, PMD_SIZE);
> +
> +	/*
> +	 * It's a file mapping, no address hint provided by caller, try any
> +	 * alignment if the file backend would provide a hint
> +	 */
> +	return file_has_mmap_order_hint(file);
> +}
> +
>  unsigned long
>  __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
>  		unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
> @@ -815,8 +842,9 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
>  	unsigned long (*get_area)(struct file *, unsigned long,
>  				  unsigned long, unsigned long, unsigned long)
>  				  = NULL;
> -
>  	unsigned long error = arch_mmap_check(addr, len, flags);
> +	unsigned long align;
> +
>  	if (error)
>  		return error;
>  
> @@ -841,13 +869,30 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
>  
>  	if (get_area) {
>  		addr = get_area(file, addr, len, pgoff, flags);
> -	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file
> -		   && !addr /* no hint */
> -		   && IS_ALIGNED(len, PMD_SIZE)) {
> -		/* Ensures that larger anonymous mappings are THP aligned. */
> +	} else if (mmap_should_align(file, addr, len)) {
> +		if (file_has_mmap_order_hint(file)) {
> +			int order;
> +			/*
> +			 * Allow driver to opt-in on the order hint.
> +			 *
> +			 * Sanity check on the order returned. Treating
> +			 * either negative or too big order to be invalid,
> +			 * where alignment will be skipped.
> +			 */
> +			order = file->f_op->get_mapping_order(file, pgoff, len);
> +			if (order < 0)
> +				order = 0;
> +			if (check_shl_overflow(PAGE_SIZE, order, &align))
> +				/* No alignment applied */
> +				align = PAGE_SIZE;
> +		} else {
> +			/* Default alignment for anonymous THPs */
> +			align = PMD_SIZE;
> +		}
> +
>  		addr = thp_get_unmapped_area_vmflags(file, addr, len,
> -						     pgoff, flags, PMD_SIZE,
> -						     vm_flags);
> +						     pgoff, flags,
> +						     align, vm_flags);
>  	} else {
>  		addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
>  						    pgoff, flags, vm_flags);
> -- 
> 2.50.1
> 

-- 
Peter Xu


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ