Message-ID: <qvidvc6rxwacauspyvrkpfjmv4ear7g54zvsmh7prrbzeli4vk@wwjhvzsus77i>
Date: Fri, 22 Nov 2024 10:14:24 -0500
From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
To: Suren Baghdasaryan <surenb@...gle.com>
Cc: akpm@...ux-foundation.org, peterz@...radead.org, andrii@...nel.org,
        jannh@...gle.com, lorenzo.stoakes@...cle.com, vbabka@...e.cz,
        mhocko@...nel.org, shakeel.butt@...ux.dev, hannes@...xchg.org,
        david@...hat.com, willy@...radead.org, brauner@...nel.org,
        oleg@...hat.com, arnd@...db.de, richard.weiyang@...il.com,
        zhangpeng.00@...edance.com, linmiaohe@...wei.com,
        viro@...iv.linux.org.uk, hca@...ux.ibm.com, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 2/3] mm: convert mm_lock_seq to a proper seqcount

* Suren Baghdasaryan <surenb@...gle.com> [241121 11:28]:
> Convert mm_lock_seq to be seqcount_t and change all mmap_write_lock
> variants to increment it, in line with the usual seqcount usage pattern.
> This lets us check whether the mmap_lock is write-locked by checking the
> mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be
> used when implementing mmap_lock speculation functions.
> As a result, vm_lock_seq is also changed to be unsigned to match the
> type of mm_lock_seq.sequence.
> 
> Suggested-by: Peter Zijlstra <peterz@...radead.org>
> Signed-off-by: Suren Baghdasaryan <surenb@...gle.com>

Reviewed-by: Liam R. Howlett <Liam.Howlett@...cle.com>
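
A side note for anyone following along: with this conversion, "is the
mmap_lock write-locked?" reduces to testing the low bit of the sequence.
A minimal sketch of that check (hypothetical helper, not part of this
patch; the name is made up):

	/* Illustration only -- not introduced by this series. */
	static inline bool mm_seq_is_write_locked(struct mm_struct *mm)
	{
		/*
		 * raw_read_seqcount() returns the current sequence
		 * including the odd bit: odd while mm_lock_seqcount_begin()
		 * has run and mm_lock_seqcount_end() has not, even otherwise.
		 */
		return raw_read_seqcount(&mm->mm_lock_seq) & 1;
	}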

> ---
> Changes since v1 [1]
> - Added ASSERT_EXCLUSIVE_WRITER() instead of a comment in
> vma_end_write_all, per Peter Zijlstra
> 
> [1] https://lore.kernel.org/all/20241024205231.1944747-1-surenb@google.com/
> 
>  include/linux/mm.h               | 12 +++----
>  include/linux/mm_types.h         |  7 ++--
>  include/linux/mmap_lock.h        | 55 +++++++++++++++++++++-----------
>  kernel/fork.c                    |  5 +--
>  mm/init-mm.c                     |  2 +-
>  tools/testing/vma/vma.c          |  4 +--
>  tools/testing/vma/vma_internal.h |  4 +--
>  7 files changed, 53 insertions(+), 36 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index feb5c8021bef..e6de22738ee1 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -710,7 +710,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	 * we don't rely on for anything - the mm_lock_seq read against which we
>  	 * need ordering is below.
>  	 */
> -	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
> +	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
>  		return false;
>  
>  	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> @@ -727,7 +727,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	 * after it has been unlocked.
>  	 * This pairs with RELEASE semantics in vma_end_write_all().
>  	 */
> -	if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
> +	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
>  		up_read(&vma->vm_lock->lock);
>  		return false;
>  	}
> @@ -742,7 +742,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
>  }
>  
>  /* WARNING! Can only be used if mmap_lock is expected to be write-locked */
> -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
> +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
>  {
>  	mmap_assert_write_locked(vma->vm_mm);
>  
> @@ -750,7 +750,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
>  	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
>  	 * mm->mm_lock_seq can't be concurrently modified.
>  	 */
> -	*mm_lock_seq = vma->vm_mm->mm_lock_seq;
> +	*mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
>  	return (vma->vm_lock_seq == *mm_lock_seq);
>  }
>  
> @@ -761,7 +761,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
>   */
>  static inline void vma_start_write(struct vm_area_struct *vma)
>  {
> -	int mm_lock_seq;
> +	unsigned int mm_lock_seq;
>  
>  	if (__is_vma_write_locked(vma, &mm_lock_seq))
>  		return;
> @@ -779,7 +779,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>  
>  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
>  {
> -	int mm_lock_seq;
> +	unsigned int mm_lock_seq;
>  
>  	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
>  }
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 381d22eba088..ac72888a54b8 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -715,7 +715,7 @@ struct vm_area_struct {
>  	 * counter reuse can only lead to occasional unnecessary use of the
>  	 * slowpath.
>  	 */
> -	int vm_lock_seq;
> +	unsigned int vm_lock_seq;
>  	/* Unstable RCU readers are allowed to read this. */
>  	struct vma_lock *vm_lock;
>  #endif
> @@ -909,6 +909,9 @@ struct mm_struct {
>  		 * Roughly speaking, incrementing the sequence number is
>  		 * equivalent to releasing locks on VMAs; reading the sequence
>  		 * number can be part of taking a read lock on a VMA.
> +		 * Incremented every time mmap_lock is write-locked/unlocked.
> +		 * Initialized to 0, therefore odd values indicate mmap_lock
> +		 * is write-locked and even values that it's released.
>  		 *
>  		 * Can be modified under write mmap_lock using RELEASE
>  		 * semantics.
> @@ -917,7 +920,7 @@ struct mm_struct {
>  		 * Can be read with ACQUIRE semantics if not holding write
>  		 * mmap_lock.
>  		 */
> -		int mm_lock_seq;
> +		seqcount_t mm_lock_seq;
>  #endif
>  
>  
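The odd/even wording added above maps directly onto the write-side
transitions; roughly (values illustrative, using the seqlock.h
primitives that the next hunk wraps):

	seqcount_t mm_lock_seq = SEQCNT_ZERO(mm_lock_seq); /* 0: unlocked */

	do_raw_write_seqcount_begin(&mm_lock_seq); /* 0 -> 1, odd: write-locked */
	/* ... mmap_lock held for write, VMAs may change ... */
	do_raw_write_seqcount_end(&mm_lock_seq);   /* 1 -> 2, even: released */
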
> diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> index de9dc20b01ba..083b7fa2588e 100644
> --- a/include/linux/mmap_lock.h
> +++ b/include/linux/mmap_lock.h
> @@ -71,39 +71,38 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm)
>  }
>  
>  #ifdef CONFIG_PER_VMA_LOCK
> -/*
> - * Drop all currently-held per-VMA locks.
> - * This is called from the mmap_lock implementation directly before releasing
> - * a write-locked mmap_lock (or downgrading it to read-locked).
> - * This should normally NOT be called manually from other places.
> - * If you want to call this manually anyway, keep in mind that this will release
> - * *all* VMA write locks, including ones from further up the stack.
> - */
> -static inline void vma_end_write_all(struct mm_struct *mm)
> +static inline void mm_lock_seqcount_init(struct mm_struct *mm)
>  {
> -	mmap_assert_write_locked(mm);
> -	/*
> -	 * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
> -	 * mmap_lock being held.
> -	 * We need RELEASE semantics here to ensure that preceding stores into
> -	 * the VMA take effect before we unlock it with this store.
> -	 * Pairs with ACQUIRE semantics in vma_start_read().
> -	 */
> -	smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
> +	seqcount_init(&mm->mm_lock_seq);
> +}
> +
> +static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
> +{
> +	do_raw_write_seqcount_begin(&mm->mm_lock_seq);
> +}
> +
> +static inline void mm_lock_seqcount_end(struct mm_struct *mm)
> +{
> +	do_raw_write_seqcount_end(&mm->mm_lock_seq);
>  }
> +
>  #else
> -static inline void vma_end_write_all(struct mm_struct *mm) {}
> +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
> +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
> +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}
>  #endif
>  
>  static inline void mmap_init_lock(struct mm_struct *mm)
>  {
>  	init_rwsem(&mm->mmap_lock);
> +	mm_lock_seqcount_init(mm);
>  }
>  
>  static inline void mmap_write_lock(struct mm_struct *mm)
>  {
>  	__mmap_lock_trace_start_locking(mm, true);
>  	down_write(&mm->mmap_lock);
> +	mm_lock_seqcount_begin(mm);
>  	__mmap_lock_trace_acquire_returned(mm, true, true);
>  }
>  
> @@ -111,6 +110,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
>  {
>  	__mmap_lock_trace_start_locking(mm, true);
>  	down_write_nested(&mm->mmap_lock, subclass);
> +	mm_lock_seqcount_begin(mm);
>  	__mmap_lock_trace_acquire_returned(mm, true, true);
>  }
>  
> @@ -120,10 +120,27 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm)
>  
>  	__mmap_lock_trace_start_locking(mm, true);
>  	ret = down_write_killable(&mm->mmap_lock);
> +	if (!ret)
> +		mm_lock_seqcount_begin(mm);
>  	__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
>  	return ret;
>  }
>  
> +/*
> + * Drop all currently-held per-VMA locks.
> + * This is called from the mmap_lock implementation directly before releasing
> + * a write-locked mmap_lock (or downgrading it to read-locked).
> + * This should normally NOT be called manually from other places.
> + * If you want to call this manually anyway, keep in mind that this will release
> + * *all* VMA write locks, including ones from further up the stack.
> + */
> +static inline void vma_end_write_all(struct mm_struct *mm)
> +{
> +	mmap_assert_write_locked(mm);
> +	ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq);
> +	mm_lock_seqcount_end(mm);
> +}
> +
>  static inline void mmap_write_unlock(struct mm_struct *mm)
>  {
>  	__mmap_lock_trace_released(mm, true);
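For completeness, the "mmap_lock speculation functions" the changelog
mentions (added in the next patch) would follow the usual seqcount retry
shape; a rough sketch only, not the actual interface:

	/* Sketch of the speculative read pattern this enables. */
	static inline bool mmap_lock_speculate_sketch(struct mm_struct *mm)
	{
		unsigned int seq = raw_read_seqcount(&mm->mm_lock_seq);

		if (seq & 1)	/* odd: writer active, don't speculate */
			return false;

		/* ... speculatively read mm/VMA state here ... */

		/* valid only if no write lock/unlock happened meanwhile */
		return !read_seqcount_retry(&mm->mm_lock_seq, seq);
	}
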
> diff --git a/kernel/fork.c b/kernel/fork.c
> index e58d27c05788..8cd36645b9fc 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -449,7 +449,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma)
>  		return false;
>  
>  	init_rwsem(&vma->vm_lock->lock);
> -	vma->vm_lock_seq = -1;
> +	vma->vm_lock_seq = UINT_MAX;
>  
>  	return true;
>  }
> @@ -1262,9 +1262,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	seqcount_init(&mm->write_protect_seq);
>  	mmap_init_lock(mm);
>  	INIT_LIST_HEAD(&mm->mmlist);
> -#ifdef CONFIG_PER_VMA_LOCK
> -	mm->mm_lock_seq = 0;
> -#endif
>  	mm_pgtables_bytes_init(mm);
>  	mm->map_count = 0;
>  	mm->locked_vm = 0;
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index 24c809379274..6af3ad675930 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -40,7 +40,7 @@ struct mm_struct init_mm = {
>  	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
>  #ifdef CONFIG_PER_VMA_LOCK
> -	.mm_lock_seq	= 0,
> +	.mm_lock_seq	= SEQCNT_ZERO(init_mm.mm_lock_seq),
>  #endif
>  	.user_ns	= &init_user_ns,
>  	.cpu_bitmap	= CPU_BITS_NONE,
> diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> index b33b47342d41..9074aaced9c5 100644
> --- a/tools/testing/vma/vma.c
> +++ b/tools/testing/vma/vma.c
> @@ -87,7 +87,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
>  	 * begun. Linking to the tree will have caused this to be incremented,
>  	 * which means we will get a false positive otherwise.
>  	 */
> -	vma->vm_lock_seq = -1;
> +	vma->vm_lock_seq = UINT_MAX;
>  
>  	return vma;
>  }
> @@ -212,7 +212,7 @@ static bool vma_write_started(struct vm_area_struct *vma)
>  	int seq = vma->vm_lock_seq;
>  
>  	/* We reset after each check. */
> -	vma->vm_lock_seq = -1;
> +	vma->vm_lock_seq = UINT_MAX;
>  
>  	/* The vma_start_write() stub simply increments this value. */
>  	return seq > -1;
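One subtlety here: the test stub still loads the counter into a signed
int and checks seq > -1. That keeps working because UINT_MAX converts
back to -1 in that assignment (implementation-defined in C, but this is
what every kernel target does) and the stub's increment wraps UINT_MAX
to 0. A standalone userspace check of that reasoning, under those
assumptions:

	#include <assert.h>
	#include <limits.h>

	int main(void)
	{
		unsigned int vm_lock_seq = UINT_MAX; /* sentinel, formerly -1 */
		int seq = vm_lock_seq;               /* converts to -1 here */

		assert(seq == -1);                   /* write not started */
		vm_lock_seq++;                       /* stub write: wraps to 0 */
		seq = vm_lock_seq;
		assert(seq > -1);                    /* write started */
		return 0;
	}
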
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index c5b9da034511..4007ec580f85 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -231,7 +231,7 @@ struct vm_area_struct {
>  	 * counter reuse can only lead to occasional unnecessary use of the
>  	 * slowpath.
>  	 */
> -	int vm_lock_seq;
> +	unsigned int vm_lock_seq;
>  	struct vma_lock *vm_lock;
>  #endif
>  
> @@ -406,7 +406,7 @@ static inline bool vma_lock_alloc(struct vm_area_struct *vma)
>  		return false;
>  
>  	init_rwsem(&vma->vm_lock->lock);
> -	vma->vm_lock_seq = -1;
> +	vma->vm_lock_seq = UINT_MAX;
>  
>  	return true;
>  }
> -- 
> 2.47.0.338.g60cca15819-goog
> 
