lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aGD_fyBSy6hwIHHk@kernel.org>
Date: Sun, 29 Jun 2025 11:55:27 +0300
From: Mike Rapoport <rppt@...nel.org>
To: Peter Xu <peterx@...hat.com>
Cc: linux-mm@...ck.org, linux-kernel@...r.kernel.org,
	Vlastimil Babka <vbabka@...e.cz>,
	Suren Baghdasaryan <surenb@...gle.com>,
	Muchun Song <muchun.song@...ux.dev>,
	Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
	Hugh Dickins <hughd@...gle.com>,
	Andrew Morton <akpm@...ux-foundation.org>,
	James Houghton <jthoughton@...gle.com>,
	"Liam R . Howlett" <Liam.Howlett@...cle.com>,
	Nikita Kalyazin <kalyazin@...zon.com>,
	Michal Hocko <mhocko@...e.com>,
	David Hildenbrand <david@...hat.com>,
	Andrea Arcangeli <aarcange@...hat.com>,
	Oscar Salvador <osalvador@...e.de>,
	Axel Rasmussen <axelrasmussen@...gle.com>,
	Ujwal Kundur <ujwal.kundur@...il.com>
Subject: Re: [PATCH v2 4/4] mm: Apply vm_uffd_ops API to core mm

On Fri, Jun 27, 2025 at 11:46:55AM -0400, Peter Xu wrote:
> This patch completely moves the old userfaultfd core to use the new
> vm_uffd_ops API.  After this change, existing file systems will start to
> use the new API for userfault operations.

Maybe:

  Move userfaultfd core to use new vm_uffd_ops API. After this change file
  systems that implement vm_operations_struct can start using new API for
  userfaultfd operations.
 
> When at it, moving vma_can_userfault() into mm/userfaultfd.c instead,
> because it's getting too big.  It's only used in slow paths so it shouldn't
> be an issue.  Move the pte marker check before wp_async, which might be
> more intuitive because wp_async depends on pte markers.  That shouldn't
> cause any functional change though because only one check would take effect
> depending on whether pte marker was selected in config.
> 
> This will also remove quite some hard-coded checks for either shmem or
> hugetlbfs.  Now all the old checks should still work but with vm_uffd_ops.
> 
> Note that anonymous memory will still need to be processed separately
> because it doesn't have vm_ops at all.
> 
> Reviewed-by: James Houghton <jthoughton@...gle.com>
> Signed-off-by: Peter Xu <peterx@...hat.com>

Acked-by: Mike Rapoport (Microsoft) <rppt@...nel.org>

> ---
>  include/linux/shmem_fs.h      |  14 -----
>  include/linux/userfaultfd_k.h |  46 ++++----------
>  mm/shmem.c                    |   2 +-
>  mm/userfaultfd.c              | 115 +++++++++++++++++++++++++---------
>  4 files changed, 101 insertions(+), 76 deletions(-)
> 
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 6d0f9c599ff7..2f5b7b295cf6 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -195,20 +195,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
>  extern bool shmem_charge(struct inode *inode, long pages);
>  extern void shmem_uncharge(struct inode *inode, long pages);
>  
> -#ifdef CONFIG_USERFAULTFD
> -#ifdef CONFIG_SHMEM
> -extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
> -				  struct vm_area_struct *dst_vma,
> -				  unsigned long dst_addr,
> -				  unsigned long src_addr,
> -				  uffd_flags_t flags,
> -				  struct folio **foliop);
> -#else /* !CONFIG_SHMEM */
> -#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
> -			       src_addr, flags, foliop) ({ BUG(); 0; })
> -#endif /* CONFIG_SHMEM */
> -#endif /* CONFIG_USERFAULTFD */
> -
>  /*
>   * Used space is stored as unsigned 64-bit value in bytes but
>   * quota core supports only signed 64-bit values so use that
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index c9a093c4502b..1aa9b246fb84 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -149,9 +149,14 @@ typedef struct vm_uffd_ops vm_uffd_ops;
>  #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
>  #define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1))
>  
> +static inline enum mfill_atomic_mode uffd_flags_get_mode(uffd_flags_t flags)
> +{
> +	return (__force enum mfill_atomic_mode)(flags & MFILL_ATOMIC_MODE_MASK);
> +}
> +
>  static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)
>  {
> -	return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected);
> +	return uffd_flags_get_mode(flags) == expected;
>  }
>  
>  static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode)
> @@ -260,41 +265,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
>  	return vma->vm_flags & __VM_UFFD_FLAGS;
>  }
>  
> -static inline bool vma_can_userfault(struct vm_area_struct *vma,
> -				     vm_flags_t vm_flags,
> -				     bool wp_async)
> +static inline const vm_uffd_ops *vma_get_uffd_ops(struct vm_area_struct *vma)
>  {
> -	vm_flags &= __VM_UFFD_FLAGS;
> -
> -	if (vma->vm_flags & VM_DROPPABLE)
> -		return false;
> -
> -	if ((vm_flags & VM_UFFD_MINOR) &&
> -	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
> -		return false;
> -
> -	/*
> -	 * If wp async enabled, and WP is the only mode enabled, allow any
> -	 * memory type.
> -	 */
> -	if (wp_async && (vm_flags == VM_UFFD_WP))
> -		return true;
> -
> -#ifndef CONFIG_PTE_MARKER_UFFD_WP
> -	/*
> -	 * If user requested uffd-wp but not enabled pte markers for
> -	 * uffd-wp, then shmem & hugetlbfs are not supported but only
> -	 * anonymous.
> -	 */
> -	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
> -		return false;
> -#endif
> -
> -	/* By default, allow any of anon|shmem|hugetlb */
> -	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
> -	    vma_is_shmem(vma);
> +	if (vma->vm_ops && vma->vm_ops->userfaultfd_ops)
> +		return vma->vm_ops->userfaultfd_ops;
> +	return NULL;
>  }
>  
> +bool vma_can_userfault(struct vm_area_struct *vma,
> +		       unsigned long vm_flags, bool wp_async);
> +
>  static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
>  {
>  	struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9a8b8dd4709b..fc85002dd8af 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -3158,7 +3158,7 @@ static int shmem_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
>  	return shmem_get_folio(inode, pgoff, 0, folio, SGP_NOALLOC);
>  }
>  
> -int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
> +static int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
>  			   struct vm_area_struct *dst_vma,
>  			   unsigned long dst_addr,
>  			   unsigned long src_addr,
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index cbed91b09640..52d7d5f144b8 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -14,12 +14,48 @@
>  #include <linux/userfaultfd_k.h>
>  #include <linux/mmu_notifier.h>
>  #include <linux/hugetlb.h>
> -#include <linux/shmem_fs.h>
>  #include <asm/tlbflush.h>
>  #include <asm/tlb.h>
>  #include "internal.h"
>  #include "swap.h"
>  
> +bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
> +		       bool wp_async)
> +{
> +	unsigned long supported;
> +
> +	if (vma->vm_flags & VM_DROPPABLE)
> +		return false;
> +
> +	vm_flags &= __VM_UFFD_FLAGS;
> +
> +#ifndef CONFIG_PTE_MARKER_UFFD_WP
> +	/*
> +	 * If user requested uffd-wp but not enabled pte markers for
> +	 * uffd-wp, then any file system (like shmem or hugetlbfs) are not
> +	 * supported but only anonymous.
> +	 */
> +	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
> +		return false;
> +#endif
> +	/*
> +	 * If wp async enabled, and WP is the only mode enabled, allow any
> +	 * memory type.
> +	 */
> +	if (wp_async && (vm_flags == VM_UFFD_WP))
> +		return true;
> +
> +	if (vma_is_anonymous(vma))
> +		/* Anonymous has no page cache, MINOR not supported */
> +		supported = VM_UFFD_MISSING | VM_UFFD_WP;
> +	else if (vma_get_uffd_ops(vma))
> +		supported = vma_get_uffd_ops(vma)->uffd_features;
> +	else
> +		return false;
> +
> +	return !(vm_flags & (~supported));
> +}
> +
>  static __always_inline
>  bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
>  {
> @@ -384,11 +420,15 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
>  {
>  	struct inode *inode = file_inode(dst_vma->vm_file);
>  	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
> +	const vm_uffd_ops *uffd_ops = vma_get_uffd_ops(dst_vma);
>  	struct folio *folio;
>  	struct page *page;
>  	int ret;
>  
> -	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
> +	if (WARN_ON_ONCE(!uffd_ops || !uffd_ops->uffd_get_folio))
> +		return -EINVAL;
> +
> +	ret = uffd_ops->uffd_get_folio(inode, pgoff, &folio);
>  	/* Our caller expects us to return -EFAULT if we failed to find folio */
>  	if (ret == -ENOENT)
>  		ret = -EFAULT;
> @@ -504,18 +544,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	u32 hash;
>  	struct address_space *mapping;
>  
> -	/*
> -	 * There is no default zero huge page for all huge page sizes as
> -	 * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
> -	 * by THP.  Since we can not reliably insert a zero page, this
> -	 * feature is not supported.
> -	 */
> -	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> -		up_read(&ctx->map_changing_lock);
> -		uffd_mfill_unlock(dst_vma);
> -		return -EINVAL;
> -	}
> -
>  	src_addr = src_start;
>  	dst_addr = dst_start;
>  	copied = 0;
> @@ -686,14 +714,55 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
>  			err = mfill_atomic_pte_zeropage(dst_pmd,
>  						 dst_vma, dst_addr);
>  	} else {
> -		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
> -					     dst_addr, src_addr,
> -					     flags, foliop);
> +		const vm_uffd_ops *uffd_ops = vma_get_uffd_ops(dst_vma);
> +
> +		if (WARN_ON_ONCE(!uffd_ops || !uffd_ops->uffd_copy)) {
> +			err = -EINVAL;
> +		} else {
> +			err = uffd_ops->uffd_copy(dst_pmd, dst_vma,
> +						  dst_addr, src_addr,
> +						  flags, foliop);
> +		}
>  	}
>  
>  	return err;
>  }
>  
> +static inline bool
> +vma_uffd_ops_supported(struct vm_area_struct *vma, uffd_flags_t flags)
> +{
> +	enum mfill_atomic_mode mode = uffd_flags_get_mode(flags);
> +	const vm_uffd_ops *uffd_ops;
> +	unsigned long uffd_ioctls;
> +
> +	if ((flags & MFILL_ATOMIC_WP) && !(vma->vm_flags & VM_UFFD_WP))
> +		return false;
> +
> +	/* Anonymous supports everything except CONTINUE */
> +	if (vma_is_anonymous(vma))
> +		return mode != MFILL_ATOMIC_CONTINUE;
> +
> +	uffd_ops = vma_get_uffd_ops(vma);
> +	if (!uffd_ops)
> +		return false;
> +
> +	uffd_ioctls = uffd_ops->uffd_ioctls;
> +	switch (mode) {
> +	case MFILL_ATOMIC_COPY:
> +		return uffd_ioctls & BIT(_UFFDIO_COPY);
> +	case MFILL_ATOMIC_ZEROPAGE:
> +		return uffd_ioctls & BIT(_UFFDIO_ZEROPAGE);
> +	case MFILL_ATOMIC_CONTINUE:
> +		if (!(vma->vm_flags & VM_SHARED))
> +			return false;
> +		return uffd_ioctls & BIT(_UFFDIO_CONTINUE);
> +	case MFILL_ATOMIC_POISON:
> +		return uffd_ioctls & BIT(_UFFDIO_POISON);
> +	default:
> +		return false;
> +	}
> +}
> +
>  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  					    unsigned long dst_start,
>  					    unsigned long src_start,
> @@ -752,11 +821,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	    dst_vma->vm_flags & VM_SHARED))
>  		goto out_unlock;
>  
> -	/*
> -	 * validate 'mode' now that we know the dst_vma: don't allow
> -	 * a wrprotect copy if the userfaultfd didn't register as WP.
> -	 */
> -	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
> +	if (!vma_uffd_ops_supported(dst_vma, flags))
>  		goto out_unlock;
>  
>  	/*
> @@ -766,12 +831,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
>  					     src_start, len, flags);
>  
> -	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> -		goto out_unlock;
> -	if (!vma_is_shmem(dst_vma) &&
> -	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> -		goto out_unlock;
> -
>  	while (src_addr < src_start + len) {
>  		pmd_t dst_pmdval;
>  
> -- 
> 2.49.0
> 

-- 
Sincerely yours,
Mike.

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ