Message-ID: <2x7whirgbvwzpm6iy2pnh7pqhnjgpvoxpo526nxifryizs7bsn@wpxu2d77co4n>
Date: Wed, 16 Apr 2025 09:07:21 -0400
From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
To: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
        Suren Baghdasaryan <surenb@...gle.com>,
        Vlastimil Babka <vbabka@...e.cz>, Matthew Wilcox <willy@...radead.org>,
        "Paul E . McKenney" <paulmck@...nel.org>,
        SeongJae Park <sj@...nel.org>, Shakeel Butt <shakeel.butt@...ux.dev>,
        David Hildenbrand <david@...hat.com>, linux-kernel@...r.kernel.org,
        linux-mm@...ck.org
Subject: Re: [PATCH v2 1/2] mm: move mmap/vma locking logic into specific
 files

* Lorenzo Stoakes <lorenzo.stoakes@...cle.com> [250416 06:38]:
> Currently the VMA and mmap locking logic is entangled in two of the most
> overwrought files in mm - include/linux/mm.h and mm/memory.c. Separate this
> logic out so we can more easily make changes and create an appropriate
> MAINTAINERS entry that spans only the logic relating to locking.
> 
> This should have no functional change. Care is taken to avoid dependency
> loops; as a result we must regrettably keep release_fault_lock() and
> assert_fault_locked() in mm.h due to their dependence on the vm_fault
> type.
> 
> Additionally we must declare rcuwait_wake_up() manually to avoid a
> dependency cycle on linux/rcuwait.h.
> 
> Also move the nommu implementation of lock_mm_and_find_vma() to
> mmap_lock.c so everything lock-related is in one place.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
> Reviewed-by: Suren Baghdasaryan <surenb@...gle.com>

Reviewed-by: Liam R. Howlett <Liam.Howlett@...cle.com>

> ---
>  include/linux/mm.h        | 231 +-------------------------------
>  include/linux/mmap_lock.h | 227 +++++++++++++++++++++++++++++++
>  mm/memory.c               | 252 -----------------------------------
>  mm/mmap_lock.c            | 273 ++++++++++++++++++++++++++++++++++++++
>  mm/nommu.c                |  16 ---
>  5 files changed, 504 insertions(+), 495 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 5eb0d77c4438..9b701cfbef22 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -671,204 +671,11 @@ static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
>  static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
>  #endif /* CONFIG_NUMA_BALANCING */
>  
> -#ifdef CONFIG_PER_VMA_LOCK
> -static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
> -{
> -#ifdef CONFIG_DEBUG_LOCK_ALLOC
> -	static struct lock_class_key lockdep_key;
> -
> -	lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
> -#endif
> -	if (reset_refcnt)
> -		refcount_set(&vma->vm_refcnt, 0);
> -	vma->vm_lock_seq = UINT_MAX;
> -}
> -
> -static inline bool is_vma_writer_only(int refcnt)
> -{
> -	/*
> -	 * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
> -	 * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
> -	 * a detached vma happens only in vma_mark_detached() and is a rare
> -	 * case, therefore most of the time there will be no unnecessary wakeup.
> -	 */
> -	return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
> -}
> -
> -static inline void vma_refcount_put(struct vm_area_struct *vma)
> -{
> -	/* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
> -	struct mm_struct *mm = vma->vm_mm;
> -	int oldcnt;
> -
> -	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> -	if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
> -
> -		if (is_vma_writer_only(oldcnt - 1))
> -			rcuwait_wake_up(&mm->vma_writer_wait);
> -	}
> -}
> -
> -/*
> - * Try to read-lock a vma. The function is allowed to occasionally yield false
> - * locked result to avoid performance overhead, in which case we fall back to
> - * using mmap_lock. The function should never yield false unlocked result.
> - * False locked result is possible if mm_lock_seq overflows or if vma gets
> - * reused and attached to a different mm before we lock it.
> - * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
> - * detached.
> - */
> -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
> -						    struct vm_area_struct *vma)
> -{
> -	int oldcnt;
> -
> -	/*
> -	 * Check before locking. A race might cause false locked result.
> -	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
> -	 * ACQUIRE semantics, because this is just a lockless check whose result
> -	 * we don't rely on for anything - the mm_lock_seq read against which we
> -	 * need ordering is below.
> -	 */
> -	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
> -		return NULL;
> -
> -	/*
> -	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
> -	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
> -	 * Acquire fence is required here to avoid reordering against later
> -	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
> -	 */
> -	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
> -							      VMA_REF_LIMIT))) {
> -		/* return EAGAIN if vma got detached from under us */
> -		return oldcnt ? NULL : ERR_PTR(-EAGAIN);
> -	}
> -
> -	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
> -	/*
> -	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
> -	 * False unlocked result is impossible because we modify and check
> -	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
> -	 * modification invalidates all existing locks.
> -	 *
> -	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
> -	 * racing with vma_end_write_all(), we only start reading from the VMA
> -	 * after it has been unlocked.
> -	 * This pairs with RELEASE semantics in vma_end_write_all().
> -	 */
> -	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
> -		vma_refcount_put(vma);
> -		return NULL;
> -	}
> -
> -	return vma;
> -}
> -
> -/*
> - * Use only while holding mmap read lock which guarantees that locking will not
> - * fail (nobody can concurrently write-lock the vma). vma_start_read() should
> - * not be used in such cases because it might fail due to mm_lock_seq overflow.
> - * This functionality is used to obtain vma read lock and drop the mmap read lock.
> - */
> -static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
> -{
> -	int oldcnt;
> -
> -	mmap_assert_locked(vma->vm_mm);
> -	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
> -							      VMA_REF_LIMIT)))
> -		return false;
> -
> -	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
> -	return true;
> -}
> -
> -/*
> - * Use only while holding mmap read lock which guarantees that locking will not
> - * fail (nobody can concurrently write-lock the vma). vma_start_read() should
> - * not be used in such cases because it might fail due to mm_lock_seq overflow.
> - * This functionality is used to obtain vma read lock and drop the mmap read lock.
> - */
> -static inline bool vma_start_read_locked(struct vm_area_struct *vma)
> -{
> -	return vma_start_read_locked_nested(vma, 0);
> -}
> -
> -static inline void vma_end_read(struct vm_area_struct *vma)
> -{
> -	vma_refcount_put(vma);
> -}
> -
> -/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
> -static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
> -{
> -	mmap_assert_write_locked(vma->vm_mm);
> -
> -	/*
> -	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
> -	 * mm->mm_lock_seq can't be concurrently modified.
> -	 */
> -	*mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
> -	return (vma->vm_lock_seq == *mm_lock_seq);
> -}
> -
> -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
> -
> -/*
> - * Begin writing to a VMA.
> - * Exclude concurrent readers under the per-VMA lock until the currently
> - * write-locked mmap_lock is dropped or downgraded.
> - */
> -static inline void vma_start_write(struct vm_area_struct *vma)
> -{
> -	unsigned int mm_lock_seq;
> -
> -	if (__is_vma_write_locked(vma, &mm_lock_seq))
> -		return;
> -
> -	__vma_start_write(vma, mm_lock_seq);
> -}
> -
> -static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> -{
> -	unsigned int mm_lock_seq;
> -
> -	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
> -}
> -
> -static inline void vma_assert_locked(struct vm_area_struct *vma)
> -{
> -	unsigned int mm_lock_seq;
> -
> -	VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
> -		      !__is_vma_write_locked(vma, &mm_lock_seq), vma);
> -}
> -
>  /*
> - * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
> - * assertions should be made either under mmap_write_lock or when the object
> - * has been isolated under mmap_write_lock, ensuring no competing writers.
> + * These must be here rather than mmap_lock.h as dependent on vm_fault type,
> + * declared in this header.
>   */
> -static inline void vma_assert_attached(struct vm_area_struct *vma)
> -{
> -	WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
> -}
> -
> -static inline void vma_assert_detached(struct vm_area_struct *vma)
> -{
> -	WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
> -}
> -
> -static inline void vma_mark_attached(struct vm_area_struct *vma)
> -{
> -	vma_assert_write_locked(vma);
> -	vma_assert_detached(vma);
> -	refcount_set_release(&vma->vm_refcnt, 1);
> -}
> -
> -void vma_mark_detached(struct vm_area_struct *vma);
> -
> +#ifdef CONFIG_PER_VMA_LOCK
>  static inline void release_fault_lock(struct vm_fault *vmf)
>  {
>  	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
> @@ -884,36 +691,7 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
>  	else
>  		mmap_assert_locked(vmf->vma->vm_mm);
>  }
> -
> -struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> -					  unsigned long address);
> -
> -#else /* CONFIG_PER_VMA_LOCK */
> -
> -static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
> -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
> -						    struct vm_area_struct *vma)
> -		{ return NULL; }
> -static inline void vma_end_read(struct vm_area_struct *vma) {}
> -static inline void vma_start_write(struct vm_area_struct *vma) {}
> -static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> -		{ mmap_assert_write_locked(vma->vm_mm); }
> -static inline void vma_assert_attached(struct vm_area_struct *vma) {}
> -static inline void vma_assert_detached(struct vm_area_struct *vma) {}
> -static inline void vma_mark_attached(struct vm_area_struct *vma) {}
> -static inline void vma_mark_detached(struct vm_area_struct *vma) {}
> -
> -static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> -		unsigned long address)
> -{
> -	return NULL;
> -}
> -
> -static inline void vma_assert_locked(struct vm_area_struct *vma)
> -{
> -	mmap_assert_locked(vma->vm_mm);
> -}
> -
> +#else
>  static inline void release_fault_lock(struct vm_fault *vmf)
>  {
>  	mmap_read_unlock(vmf->vma->vm_mm);
> @@ -923,7 +701,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
>  {
>  	mmap_assert_locked(vmf->vma->vm_mm);
>  }
> -
>  #endif /* CONFIG_PER_VMA_LOCK */
>  
>  extern const struct vm_operations_struct vma_dummy_vm_ops;
> diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> index 4706c6769902..7983b2efe9bf 100644
> --- a/include/linux/mmap_lock.h
> +++ b/include/linux/mmap_lock.h
> @@ -1,6 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
>  #ifndef _LINUX_MMAP_LOCK_H
>  #define _LINUX_MMAP_LOCK_H
>  
> +/* Avoid a dependency loop by declaring here. */
> +extern int rcuwait_wake_up(struct rcuwait *w);
> +
>  #include <linux/lockdep.h>
>  #include <linux/mm_types.h>
>  #include <linux/mmdebug.h>
> @@ -104,6 +108,206 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int
>  	return read_seqcount_retry(&mm->mm_lock_seq, seq);
>  }
>  
> +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
> +{
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	static struct lock_class_key lockdep_key;
> +
> +	lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
> +#endif
> +	if (reset_refcnt)
> +		refcount_set(&vma->vm_refcnt, 0);
> +	vma->vm_lock_seq = UINT_MAX;
> +}
> +
> +static inline bool is_vma_writer_only(int refcnt)
> +{
> +	/*
> +	 * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
> +	 * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
> +	 * a detached vma happens only in vma_mark_detached() and is a rare
> +	 * case, therefore most of the time there will be no unnecessary wakeup.
> +	 */
> +	return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
> +}
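
As an aside for anyone following along, my reading of the vm_refcnt
encoding that the moved code relies on is roughly the following (not text
from the patch, just a summary of the states the comments describe):

	/*
	 * vm_refcnt states (as I read the code being moved):
	 *   0                        detached, no users
	 *   1                        attached, no readers
	 *   1 + N                    attached, N readers
	 *   VMA_LOCK_OFFSET + N      writer draining N readers while detaching
	 *   VMA_LOCK_OFFSET + 1 + N  writer holding the write lock on an
	 *                            attached vma, N readers still draining
	 */
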
> +
> +static inline void vma_refcount_put(struct vm_area_struct *vma)
> +{
> +	/* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
> +	struct mm_struct *mm = vma->vm_mm;
> +	int oldcnt;
> +
> +	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> +	if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
> +
> +		if (is_vma_writer_only(oldcnt - 1))
> +			rcuwait_wake_up(&mm->vma_writer_wait);
> +	}
> +}
> +
> +/*
> + * Try to read-lock a vma. The function is allowed to occasionally yield false
> + * locked result to avoid performance overhead, in which case we fall back to
> + * using mmap_lock. The function should never yield false unlocked result.
> + * False locked result is possible if mm_lock_seq overflows or if vma gets
> + * reused and attached to a different mm before we lock it.
> + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
> + * detached.
> + */
> +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
> +						    struct vm_area_struct *vma)
> +{
> +	int oldcnt;
> +
> +	/*
> +	 * Check before locking. A race might cause false locked result.
> +	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
> +	 * ACQUIRE semantics, because this is just a lockless check whose result
> +	 * we don't rely on for anything - the mm_lock_seq read against which we
> +	 * need ordering is below.
> +	 */
> +	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
> +		return NULL;
> +
> +	/*
> +	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
> +	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
> +	 * Acquire fence is required here to avoid reordering against later
> +	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
> +	 */
> +	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
> +							      VMA_REF_LIMIT))) {
> +		/* return EAGAIN if vma got detached from under us */
> +		return oldcnt ? NULL : ERR_PTR(-EAGAIN);
> +	}
> +
> +	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
> +	/*
> +	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
> +	 * False unlocked result is impossible because we modify and check
> +	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
> +	 * modification invalidates all existing locks.
> +	 *
> +	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
> +	 * racing with vma_end_write_all(), we only start reading from the VMA
> +	 * after it has been unlocked.
> +	 * This pairs with RELEASE semantics in vma_end_write_all().
> +	 */
> +	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
> +		vma_refcount_put(vma);
> +		return NULL;
> +	}
> +
> +	return vma;
> +}
> +
> +/*
> + * Use only while holding mmap read lock which guarantees that locking will not
> + * fail (nobody can concurrently write-lock the vma). vma_start_read() should
> + * not be used in such cases because it might fail due to mm_lock_seq overflow.
> + * This functionality is used to obtain vma read lock and drop the mmap read lock.
> + */
> +static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
> +{
> +	int oldcnt;
> +
> +	mmap_assert_locked(vma->vm_mm);
> +	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
> +							      VMA_REF_LIMIT)))
> +		return false;
> +
> +	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
> +	return true;
> +}
> +
> +/*
> + * Use only while holding mmap read lock which guarantees that locking will not
> + * fail (nobody can concurrently write-lock the vma). vma_start_read() should
> + * not be used in such cases because it might fail due to mm_lock_seq overflow.
> + * This functionality is used to obtain vma read lock and drop the mmap read lock.
> + */
> +static inline bool vma_start_read_locked(struct vm_area_struct *vma)
> +{
> +	return vma_start_read_locked_nested(vma, 0);
> +}
> +
> +static inline void vma_end_read(struct vm_area_struct *vma)
> +{
> +	vma_refcount_put(vma);
> +}
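
Side note: given the comment above vma_start_read_locked() about taking
the per-VMA read lock while the mmap read lock is held and then dropping
the latter, the intended hand-off looks roughly like this (a sketch for
illustration only, not part of this patch; the helper name is made up):

	/* Sketch: hand off from the mmap read lock to a per-VMA read lock. */
	static struct vm_area_struct *grab_vma_read_locked(struct mm_struct *mm,
							   unsigned long addr)
	{
		struct vm_area_struct *vma;

		mmap_read_lock(mm);
		vma = find_vma(mm, addr);	/* lookup done under mmap_lock */
		if (vma && !vma_start_read_locked(vma))
			vma = NULL;		/* e.g. VMA_REF_LIMIT was hit */
		mmap_read_unlock(mm);		/* per-VMA lock, if taken, is kept */
		return vma;			/* caller releases with vma_end_read() */
	}
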
> +
> +/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
> +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
> +{
> +	mmap_assert_write_locked(vma->vm_mm);
> +
> +	/*
> +	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
> +	 * mm->mm_lock_seq can't be concurrently modified.
> +	 */
> +	*mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
> +	return (vma->vm_lock_seq == *mm_lock_seq);
> +}
> +
> +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
> +
> +/*
> + * Begin writing to a VMA.
> + * Exclude concurrent readers under the per-VMA lock until the currently
> + * write-locked mmap_lock is dropped or downgraded.
> + */
> +static inline void vma_start_write(struct vm_area_struct *vma)
> +{
> +	unsigned int mm_lock_seq;
> +
> +	if (__is_vma_write_locked(vma, &mm_lock_seq))
> +		return;
> +
> +	__vma_start_write(vma, mm_lock_seq);
> +}
> +
> +static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> +{
> +	unsigned int mm_lock_seq;
> +
> +	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
> +}
> +
> +static inline void vma_assert_locked(struct vm_area_struct *vma)
> +{
> +	unsigned int mm_lock_seq;
> +
> +	VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
> +		      !__is_vma_write_locked(vma, &mm_lock_seq), vma);
> +}
> +
> +/*
> + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
> + * assertions should be made either under mmap_write_lock or when the object
> + * has been isolated under mmap_write_lock, ensuring no competing writers.
> + */
> +static inline void vma_assert_attached(struct vm_area_struct *vma)
> +{
> +	WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
> +}
> +
> +static inline void vma_assert_detached(struct vm_area_struct *vma)
> +{
> +	WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
> +}
> +
> +static inline void vma_mark_attached(struct vm_area_struct *vma)
> +{
> +	vma_assert_write_locked(vma);
> +	vma_assert_detached(vma);
> +	refcount_set_release(&vma->vm_refcnt, 1);
> +}
> +
> +void vma_mark_detached(struct vm_area_struct *vma);
> +
> +struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> +					  unsigned long address);
> +
>  #else /* CONFIG_PER_VMA_LOCK */
>  
>  static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
> @@ -119,6 +323,29 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int
>  {
>  	return true;
>  }
> +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
> +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
> +						    struct vm_area_struct *vma)
> +		{ return NULL; }
> +static inline void vma_end_read(struct vm_area_struct *vma) {}
> +static inline void vma_start_write(struct vm_area_struct *vma) {}
> +static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> +		{ mmap_assert_write_locked(vma->vm_mm); }
> +static inline void vma_assert_attached(struct vm_area_struct *vma) {}
> +static inline void vma_assert_detached(struct vm_area_struct *vma) {}
> +static inline void vma_mark_attached(struct vm_area_struct *vma) {}
> +static inline void vma_mark_detached(struct vm_area_struct *vma) {}
> +
> +static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> +		unsigned long address)
> +{
> +	return NULL;
> +}
> +
> +static inline void vma_assert_locked(struct vm_area_struct *vma)
> +{
> +	mmap_assert_locked(vma->vm_mm);
> +}
>  
>  #endif /* CONFIG_PER_VMA_LOCK */
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 688f3612e16d..35cfc91d5a2e 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -6378,258 +6378,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>  }
>  EXPORT_SYMBOL_GPL(handle_mm_fault);
>  
> -#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
> -#include <linux/extable.h>
> -
> -static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
> -{
> -	if (likely(mmap_read_trylock(mm)))
> -		return true;
> -
> -	if (regs && !user_mode(regs)) {
> -		unsigned long ip = exception_ip(regs);
> -		if (!search_exception_tables(ip))
> -			return false;
> -	}
> -
> -	return !mmap_read_lock_killable(mm);
> -}
> -
> -static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
> -{
> -	/*
> -	 * We don't have this operation yet.
> -	 *
> -	 * It should be easy enough to do: it's basically a
> -	 *    atomic_long_try_cmpxchg_acquire()
> -	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
> -	 * it also needs the proper lockdep magic etc.
> -	 */
> -	return false;
> -}
> -
> -static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
> -{
> -	mmap_read_unlock(mm);
> -	if (regs && !user_mode(regs)) {
> -		unsigned long ip = exception_ip(regs);
> -		if (!search_exception_tables(ip))
> -			return false;
> -	}
> -	return !mmap_write_lock_killable(mm);
> -}
> -
> -/*
> - * Helper for page fault handling.
> - *
> - * This is kind of equivalent to "mmap_read_lock()" followed
> - * by "find_extend_vma()", except it's a lot more careful about
> - * the locking (and will drop the lock on failure).
> - *
> - * For example, if we have a kernel bug that causes a page
> - * fault, we don't want to just use mmap_read_lock() to get
> - * the mm lock, because that would deadlock if the bug were
> - * to happen while we're holding the mm lock for writing.
> - *
> - * So this checks the exception tables on kernel faults in
> - * order to only do this all for instructions that are actually
> - * expected to fault.
> - *
> - * We can also actually take the mm lock for writing if we
> - * need to extend the vma, which helps the VM layer a lot.
> - */
> -struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
> -			unsigned long addr, struct pt_regs *regs)
> -{
> -	struct vm_area_struct *vma;
> -
> -	if (!get_mmap_lock_carefully(mm, regs))
> -		return NULL;
> -
> -	vma = find_vma(mm, addr);
> -	if (likely(vma && (vma->vm_start <= addr)))
> -		return vma;
> -
> -	/*
> -	 * Well, dang. We might still be successful, but only
> -	 * if we can extend a vma to do so.
> -	 */
> -	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
> -		mmap_read_unlock(mm);
> -		return NULL;
> -	}
> -
> -	/*
> -	 * We can try to upgrade the mmap lock atomically,
> -	 * in which case we can continue to use the vma
> -	 * we already looked up.
> -	 *
> -	 * Otherwise we'll have to drop the mmap lock and
> -	 * re-take it, and also look up the vma again,
> -	 * re-checking it.
> -	 */
> -	if (!mmap_upgrade_trylock(mm)) {
> -		if (!upgrade_mmap_lock_carefully(mm, regs))
> -			return NULL;
> -
> -		vma = find_vma(mm, addr);
> -		if (!vma)
> -			goto fail;
> -		if (vma->vm_start <= addr)
> -			goto success;
> -		if (!(vma->vm_flags & VM_GROWSDOWN))
> -			goto fail;
> -	}
> -
> -	if (expand_stack_locked(vma, addr))
> -		goto fail;
> -
> -success:
> -	mmap_write_downgrade(mm);
> -	return vma;
> -
> -fail:
> -	mmap_write_unlock(mm);
> -	return NULL;
> -}
> -#endif
> -
> -#ifdef CONFIG_PER_VMA_LOCK
> -static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
> -{
> -	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
> -
> -	/* Additional refcnt if the vma is attached. */
> -	if (!detaching)
> -		tgt_refcnt++;
> -
> -	/*
> -	 * If vma is detached then only vma_mark_attached() can raise the
> -	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
> -	 */
> -	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
> -		return false;
> -
> -	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> -	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
> -		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
> -		   TASK_UNINTERRUPTIBLE);
> -	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
> -
> -	return true;
> -}
> -
> -static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
> -{
> -	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
> -	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> -}
> -
> -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
> -{
> -	bool locked;
> -
> -	/*
> -	 * __vma_enter_locked() returns false immediately if the vma is not
> -	 * attached, otherwise it waits until refcnt is indicating that vma
> -	 * is attached with no readers.
> -	 */
> -	locked = __vma_enter_locked(vma, false);
> -
> -	/*
> -	 * We should use WRITE_ONCE() here because we can have concurrent reads
> -	 * from the early lockless pessimistic check in vma_start_read().
> -	 * We don't really care about the correctness of that early check, but
> -	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
> -	 */
> -	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> -
> -	if (locked) {
> -		bool detached;
> -
> -		__vma_exit_locked(vma, &detached);
> -		WARN_ON_ONCE(detached); /* vma should remain attached */
> -	}
> -}
> -EXPORT_SYMBOL_GPL(__vma_start_write);
> -
> -void vma_mark_detached(struct vm_area_struct *vma)
> -{
> -	vma_assert_write_locked(vma);
> -	vma_assert_attached(vma);
> -
> -	/*
> -	 * We are the only writer, so no need to use vma_refcount_put().
> -	 * The condition below is unlikely because the vma has been already
> -	 * write-locked and readers can increment vm_refcnt only temporarily
> -	 * before they check vm_lock_seq, realize the vma is locked and drop
> -	 * back the vm_refcnt. That is a narrow window for observing a raised
> -	 * vm_refcnt.
> -	 */
> -	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
> -		/* Wait until vma is detached with no readers. */
> -		if (__vma_enter_locked(vma, true)) {
> -			bool detached;
> -
> -			__vma_exit_locked(vma, &detached);
> -			WARN_ON_ONCE(!detached);
> -		}
> -	}
> -}
> -
> -/*
> - * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
> - * stable and not isolated. If the VMA is not found or is being modified the
> - * function returns NULL.
> - */
> -struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> -					  unsigned long address)
> -{
> -	MA_STATE(mas, &mm->mm_mt, address, address);
> -	struct vm_area_struct *vma;
> -
> -	rcu_read_lock();
> -retry:
> -	vma = mas_walk(&mas);
> -	if (!vma)
> -		goto inval;
> -
> -	vma = vma_start_read(mm, vma);
> -	if (IS_ERR_OR_NULL(vma)) {
> -		/* Check if the VMA got isolated after we found it */
> -		if (PTR_ERR(vma) == -EAGAIN) {
> -			count_vm_vma_lock_event(VMA_LOCK_MISS);
> -			/* The area was replaced with another one */
> -			goto retry;
> -		}
> -
> -		/* Failed to lock the VMA */
> -		goto inval;
> -	}
> -	/*
> -	 * At this point, we have a stable reference to a VMA: The VMA is
> -	 * locked and we know it hasn't already been isolated.
> -	 * From here on, we can access the VMA without worrying about which
> -	 * fields are accessible for RCU readers.
> -	 */
> -
> -	/* Check if the vma we locked is the right one. */
> -	if (unlikely(vma->vm_mm != mm ||
> -		     address < vma->vm_start || address >= vma->vm_end))
> -		goto inval_end_read;
> -
> -	rcu_read_unlock();
> -	return vma;
> -
> -inval_end_read:
> -	vma_end_read(vma);
> -inval:
> -	rcu_read_unlock();
> -	count_vm_vma_lock_event(VMA_LOCK_ABORT);
> -	return NULL;
> -}
> -#endif /* CONFIG_PER_VMA_LOCK */
> -
>  #ifndef __PAGETABLE_P4D_FOLDED
>  /*
>   * Allocate p4d page table.
> diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
> index e7dbaf96aa17..5f725cc67334 100644
> --- a/mm/mmap_lock.c
> +++ b/mm/mmap_lock.c
> @@ -42,3 +42,276 @@ void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
>  }
>  EXPORT_SYMBOL(__mmap_lock_do_trace_released);
>  #endif /* CONFIG_TRACING */
> +
> +#ifdef CONFIG_MMU
> +#ifdef CONFIG_PER_VMA_LOCK
> +static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
> +{
> +	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
> +
> +	/* Additional refcnt if the vma is attached. */
> +	if (!detaching)
> +		tgt_refcnt++;
> +
> +	/*
> +	 * If vma is detached then only vma_mark_attached() can raise the
> +	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
> +	 */
> +	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
> +		return false;
> +
> +	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> +	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
> +		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
> +		   TASK_UNINTERRUPTIBLE);
> +	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
> +
> +	return true;
> +}
> +
> +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
> +{
> +	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
> +	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> +}
> +
> +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
> +{
> +	bool locked;
> +
> +	/*
> +	 * __vma_enter_locked() returns false immediately if the vma is not
> +	 * attached, otherwise it waits until refcnt is indicating that vma
> +	 * is attached with no readers.
> +	 */
> +	locked = __vma_enter_locked(vma, false);
> +
> +	/*
> +	 * We should use WRITE_ONCE() here because we can have concurrent reads
> +	 * from the early lockless pessimistic check in vma_start_read().
> +	 * We don't really care about the correctness of that early check, but
> +	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
> +	 */
> +	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> +
> +	if (locked) {
> +		bool detached;
> +
> +		__vma_exit_locked(vma, &detached);
> +		WARN_ON_ONCE(detached); /* vma should remain attached */
> +	}
> +}
> +EXPORT_SYMBOL_GPL(__vma_start_write);
> +
> +void vma_mark_detached(struct vm_area_struct *vma)
> +{
> +	vma_assert_write_locked(vma);
> +	vma_assert_attached(vma);
> +
> +	/*
> +	 * We are the only writer, so no need to use vma_refcount_put().
> +	 * The condition below is unlikely because the vma has been already
> +	 * write-locked and readers can increment vm_refcnt only temporarily
> +	 * before they check vm_lock_seq, realize the vma is locked and drop
> +	 * back the vm_refcnt. That is a narrow window for observing a raised
> +	 * vm_refcnt.
> +	 */
> +	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
> +		/* Wait until vma is detached with no readers. */
> +		if (__vma_enter_locked(vma, true)) {
> +			bool detached;
> +
> +			__vma_exit_locked(vma, &detached);
> +			WARN_ON_ONCE(!detached);
> +		}
> +	}
> +}
> +
> +/*
> + * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
> + * stable and not isolated. If the VMA is not found or is being modified the
> + * function returns NULL.
> + */
> +struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> +					  unsigned long address)
> +{
> +	MA_STATE(mas, &mm->mm_mt, address, address);
> +	struct vm_area_struct *vma;
> +
> +	rcu_read_lock();
> +retry:
> +	vma = mas_walk(&mas);
> +	if (!vma)
> +		goto inval;
> +
> +	vma = vma_start_read(mm, vma);
> +	if (IS_ERR_OR_NULL(vma)) {
> +		/* Check if the VMA got isolated after we found it */
> +		if (PTR_ERR(vma) == -EAGAIN) {
> +			count_vm_vma_lock_event(VMA_LOCK_MISS);
> +			/* The area was replaced with another one */
> +			goto retry;
> +		}
> +
> +		/* Failed to lock the VMA */
> +		goto inval;
> +	}
> +	/*
> +	 * At this point, we have a stable reference to a VMA: The VMA is
> +	 * locked and we know it hasn't already been isolated.
> +	 * From here on, we can access the VMA without worrying about which
> +	 * fields are accessible for RCU readers.
> +	 */
> +
> +	/* Check if the vma we locked is the right one. */
> +	if (unlikely(vma->vm_mm != mm ||
> +		     address < vma->vm_start || address >= vma->vm_end))
> +		goto inval_end_read;
> +
> +	rcu_read_unlock();
> +	return vma;
> +
> +inval_end_read:
> +	vma_end_read(vma);
> +inval:
> +	rcu_read_unlock();
> +	count_vm_vma_lock_event(VMA_LOCK_ABORT);
> +	return NULL;
> +}
> +#endif /* CONFIG_PER_VMA_LOCK */
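
For reference, a typical caller of lock_vma_under_rcu() is the per-VMA
fast path in an arch fault handler, which ends up looking roughly like
the following (an illustrative sketch, not part of this patch; the
lock_mmap label stands in for whatever fallback path the arch uses):

	/* Sketch of a fault-handler fast path using the moved helpers. */
	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;		/* fall back to the mmap_lock path */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);
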
> +
> +#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
> +#include <linux/extable.h>
> +
> +static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
> +{
> +	if (likely(mmap_read_trylock(mm)))
> +		return true;
> +
> +	if (regs && !user_mode(regs)) {
> +		unsigned long ip = exception_ip(regs);
> +		if (!search_exception_tables(ip))
> +			return false;
> +	}
> +
> +	return !mmap_read_lock_killable(mm);
> +}
> +
> +static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
> +{
> +	/*
> +	 * We don't have this operation yet.
> +	 *
> +	 * It should be easy enough to do: it's basically a
> +	 *    atomic_long_try_cmpxchg_acquire()
> +	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
> +	 * it also needs the proper lockdep magic etc.
> +	 */
> +	return false;
> +}
> +
> +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
> +{
> +	mmap_read_unlock(mm);
> +	if (regs && !user_mode(regs)) {
> +		unsigned long ip = exception_ip(regs);
> +		if (!search_exception_tables(ip))
> +			return false;
> +	}
> +	return !mmap_write_lock_killable(mm);
> +}
> +
> +/*
> + * Helper for page fault handling.
> + *
> + * This is kind of equivalent to "mmap_read_lock()" followed
> + * by "find_extend_vma()", except it's a lot more careful about
> + * the locking (and will drop the lock on failure).
> + *
> + * For example, if we have a kernel bug that causes a page
> + * fault, we don't want to just use mmap_read_lock() to get
> + * the mm lock, because that would deadlock if the bug were
> + * to happen while we're holding the mm lock for writing.
> + *
> + * So this checks the exception tables on kernel faults in
> + * order to only do this all for instructions that are actually
> + * expected to fault.
> + *
> + * We can also actually take the mm lock for writing if we
> + * need to extend the vma, which helps the VM layer a lot.
> + */
> +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
> +			unsigned long addr, struct pt_regs *regs)
> +{
> +	struct vm_area_struct *vma;
> +
> +	if (!get_mmap_lock_carefully(mm, regs))
> +		return NULL;
> +
> +	vma = find_vma(mm, addr);
> +	if (likely(vma && (vma->vm_start <= addr)))
> +		return vma;
> +
> +	/*
> +	 * Well, dang. We might still be successful, but only
> +	 * if we can extend a vma to do so.
> +	 */
> +	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
> +		mmap_read_unlock(mm);
> +		return NULL;
> +	}
> +
> +	/*
> +	 * We can try to upgrade the mmap lock atomically,
> +	 * in which case we can continue to use the vma
> +	 * we already looked up.
> +	 *
> +	 * Otherwise we'll have to drop the mmap lock and
> +	 * re-take it, and also look up the vma again,
> +	 * re-checking it.
> +	 */
> +	if (!mmap_upgrade_trylock(mm)) {
> +		if (!upgrade_mmap_lock_carefully(mm, regs))
> +			return NULL;
> +
> +		vma = find_vma(mm, addr);
> +		if (!vma)
> +			goto fail;
> +		if (vma->vm_start <= addr)
> +			goto success;
> +		if (!(vma->vm_flags & VM_GROWSDOWN))
> +			goto fail;
> +	}
> +
> +	if (expand_stack_locked(vma, addr))
> +		goto fail;
> +
> +success:
> +	mmap_write_downgrade(mm);
> +	return vma;
> +
> +fail:
> +	mmap_write_unlock(mm);
> +	return NULL;
> +}
> +#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
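
And for completeness, the arch-side usage of lock_mm_and_find_vma() that
the big comment above describes is roughly as follows (sketch only, not
from this patch; the bad-area handling is a placeholder):

	/* Sketch: fault handler slow path using lock_mm_and_find_vma(). */
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (unlikely(!vma)) {
		/* lock already dropped; report the bad access (placeholder) */
		return;
	}
	fault = handle_mm_fault(vma, address, flags, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		mmap_read_unlock(mm);
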
> +
> +#else /* CONFIG_MMU */
> +
> +/*
> + * At least xtensa ends up having protection faults even with no
> + * MMU.. No stack expansion, at least.
> + */
> +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
> +			unsigned long addr, struct pt_regs *regs)
> +{
> +	struct vm_area_struct *vma;
> +
> +	mmap_read_lock(mm);
> +	vma = vma_lookup(mm, addr);
> +	if (!vma)
> +		mmap_read_unlock(mm);
> +	return vma;
> +}
> +
> +#endif /* CONFIG_MMU */
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 617e7ba8022f..2b4d304c6445 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -626,22 +626,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
>  }
>  EXPORT_SYMBOL(find_vma);
>  
> -/*
> - * At least xtensa ends up having protection faults even with no
> - * MMU.. No stack expansion, at least.
> - */
> -struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
> -			unsigned long addr, struct pt_regs *regs)
> -{
> -	struct vm_area_struct *vma;
> -
> -	mmap_read_lock(mm);
> -	vma = vma_lookup(mm, addr);
> -	if (!vma)
> -		mmap_read_unlock(mm);
> -	return vma;
> -}
> -
>  /*
>   * expand a stack to a given address
>   * - not supported under NOMMU conditions
> -- 
> 2.49.0
> 
