linux-kernel - Re: [PATCH v7 02/17] mm: move per-vma lock into vm_area

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <jcdhaaavbzawxikgntsfxhjmtbxv4trgigfgjg4vvxljz6ruv7@3cu5uh7n5jc3>
Date: Wed, 8 Jan 2025 09:59:54 -0500
From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
To: Suren Baghdasaryan <surenb@...gle.com>
Cc: akpm@...ux-foundation.org, peterz@...radead.org, willy@...radead.org,
        lorenzo.stoakes@...cle.com, mhocko@...e.com, vbabka@...e.cz,
        hannes@...xchg.org, mjguzik@...il.com, oliver.sang@...el.com,
        mgorman@...hsingularity.net, david@...hat.com, peterx@...hat.com,
        oleg@...hat.com, dave@...olabs.net, paulmck@...nel.org,
        brauner@...nel.org, dhowells@...hat.com, hdanton@...a.com,
        hughd@...gle.com, lokeshgidra@...gle.com, minchan@...gle.com,
        jannh@...gle.com, shakeel.butt@...ux.dev, souravpanda@...gle.com,
        pasha.tatashin@...een.com, klarasmodin@...il.com, corbet@....net,
        linux-doc@...r.kernel.org, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org, kernel-team@...roid.com
Subject: Re: [PATCH v7 02/17] mm: move per-vma lock into vm_area_struct

* Suren Baghdasaryan <surenb@...gle.com> [241226 12:07]:
> Back when per-vma locks were introduces, vm_lock was moved out of
> vm_area_struct in [1] because of the performance regression caused by
> false cacheline sharing.  Recent investigation [2] revealed that the
> regressions is limited to a rather old Broadwell microarchitecture and
> even there it can be mitigated by disabling adjacent cacheline
> prefetching, see [3].
> 
> Splitting single logical structure into multiple ones leads to more
> complicated management, extra pointer dereferences and overall less
> maintainable code.  When that split-away part is a lock, it complicates
> things even further.  With no performance benefits, there are no reasons
> for this split.  Merging the vm_lock back into vm_area_struct also allows
> vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset.  Move
> vm_lock back into vm_area_struct, aligning it at the cacheline boundary
> and changing the cache to be cacheline-aligned as well.  With kernel
> compiled using defconfig, this causes VMA memory consumption to grow from
> 160 (vm_area_struct) + 40 (vm_lock) bytes to 256 bytes:
> 
>     slabinfo before:
>      <name>           ... <objsize> <objperslab> <pagesperslab> : ...
>      vma_lock         ...     40  102    1 : ...
>      vm_area_struct   ...    160   51    2 : ...
> 
>     slabinfo after moving vm_lock:
>      <name>           ... <objsize> <objperslab> <pagesperslab> : ...
>      vm_area_struct   ...    256   32    2 : ...
> 
> Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages,
> which is 5.5MB per 100000 VMAs.  Note that the size of this structure is
> dependent on the kernel configuration and typically the original size is
> higher than 160 bytes.  Therefore these calculations are close to the
> worst case scenario.  A more realistic vm_area_struct usage before this
> change is:
> 
>      <name>           ... <objsize> <objperslab> <pagesperslab> : ...
>      vma_lock         ...     40  102    1 : ...
>      vm_area_struct   ...    176   46    2 : ...
> 
> Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages,
> which is 3.9MB per 100000 VMAs.  This memory consumption growth can be
> addressed later by optimizing the vm_lock.
> 
> [1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/
> [2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/
> [3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@mail.gmail.com/
> 
> Signed-off-by: Suren Baghdasaryan <surenb@...gle.com>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
> Reviewed-by: Shakeel Butt <shakeel.butt@...ux.dev>
> Reviewed-by: Vlastimil Babka <vbabka@...e.cz>

Reviewed-by: Liam R. Howlett <Liam.Howlett@...cle.com>

> ---
>  include/linux/mm.h               | 28 ++++++++++--------
>  include/linux/mm_types.h         |  6 ++--
>  kernel/fork.c                    | 49 ++++----------------------------
>  tools/testing/vma/vma_internal.h | 33 +++++----------------
>  4 files changed, 32 insertions(+), 84 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a48e207d25f2..f3f92ba8f5fe 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -697,6 +697,12 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
>  #endif /* CONFIG_NUMA_BALANCING */
>  
>  #ifdef CONFIG_PER_VMA_LOCK
> +static inline void vma_lock_init(struct vm_area_struct *vma)
> +{
> +	init_rwsem(&vma->vm_lock.lock);
> +	vma->vm_lock_seq = UINT_MAX;
> +}
> +
>  /*
>   * Try to read-lock a vma. The function is allowed to occasionally yield false
>   * locked result to avoid performance overhead, in which case we fall back to
> @@ -714,7 +720,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
>  		return false;
>  
> -	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> +	if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
>  		return false;
>  
>  	/*
> @@ -729,7 +735,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	 * This pairs with RELEASE semantics in vma_end_write_all().
>  	 */
>  	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> -		up_read(&vma->vm_lock->lock);
> +		up_read(&vma->vm_lock.lock);
>  		return false;
>  	}
>  	return true;
> @@ -744,7 +750,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
>  {
>  	mmap_assert_locked(vma->vm_mm);
> -	down_read_nested(&vma->vm_lock->lock, subclass);
> +	down_read_nested(&vma->vm_lock.lock, subclass);
>  }
>  
>  /*
> @@ -756,13 +762,13 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int
>  static inline void vma_start_read_locked(struct vm_area_struct *vma)
>  {
>  	mmap_assert_locked(vma->vm_mm);
> -	down_read(&vma->vm_lock->lock);
> +	down_read(&vma->vm_lock.lock);
>  }
>  
>  static inline void vma_end_read(struct vm_area_struct *vma)
>  {
>  	rcu_read_lock(); /* keeps vma alive till the end of up_read */
> -	up_read(&vma->vm_lock->lock);
> +	up_read(&vma->vm_lock.lock);
>  	rcu_read_unlock();
>  }
>  
> @@ -791,7 +797,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>  	if (__is_vma_write_locked(vma, &mm_lock_seq))
>  		return;
>  
> -	down_write(&vma->vm_lock->lock);
> +	down_write(&vma->vm_lock.lock);
>  	/*
>  	 * We should use WRITE_ONCE() here because we can have concurrent reads
>  	 * from the early lockless pessimistic check in vma_start_read().
> @@ -799,7 +805,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>  	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
>  	 */
>  	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> -	up_write(&vma->vm_lock->lock);
> +	up_write(&vma->vm_lock.lock);
>  }
>  
>  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> @@ -811,7 +817,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
>  
>  static inline void vma_assert_locked(struct vm_area_struct *vma)
>  {
> -	if (!rwsem_is_locked(&vma->vm_lock->lock))
> +	if (!rwsem_is_locked(&vma->vm_lock.lock))
>  		vma_assert_write_locked(vma);
>  }
>  
> @@ -844,6 +850,7 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
>  
>  #else /* CONFIG_PER_VMA_LOCK */
>  
> +static inline void vma_lock_init(struct vm_area_struct *vma) {}
>  static inline bool vma_start_read(struct vm_area_struct *vma)
>  		{ return false; }
>  static inline void vma_end_read(struct vm_area_struct *vma) {}
> @@ -878,10 +885,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
>  
>  extern const struct vm_operations_struct vma_dummy_vm_ops;
>  
> -/*
> - * WARNING: vma_init does not initialize vma->vm_lock.
> - * Use vm_area_alloc()/vm_area_free() if vma needs locking.
> - */
>  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>  {
>  	memset(vma, 0, sizeof(*vma));
> @@ -890,6 +893,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>  	INIT_LIST_HEAD(&vma->anon_vma_chain);
>  	vma_mark_detached(vma, false);
>  	vma_numab_state_init(vma);
> +	vma_lock_init(vma);
>  }
>  
>  /* Use when VMA is not part of the VMA tree and needs no locking */
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 5f1b2dc788e2..6573d95f1d1e 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -730,8 +730,6 @@ struct vm_area_struct {
>  	 * slowpath.
>  	 */
>  	unsigned int vm_lock_seq;
> -	/* Unstable RCU readers are allowed to read this. */
> -	struct vma_lock *vm_lock;
>  #endif
>  
>  	/*
> @@ -784,6 +782,10 @@ struct vm_area_struct {
>  	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
>  #endif
>  	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> +#ifdef CONFIG_PER_VMA_LOCK
> +	/* Unstable RCU readers are allowed to read this. */
> +	struct vma_lock vm_lock ____cacheline_aligned_in_smp;
> +#endif
>  } __randomize_layout;
>  
>  #ifdef CONFIG_NUMA
> diff --git a/kernel/fork.c b/kernel/fork.c
> index ded49f18cd95..40a8e615499f 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep;
>  /* SLAB cache for mm_struct structures (tsk->mm) */
>  static struct kmem_cache *mm_cachep;
>  
> -#ifdef CONFIG_PER_VMA_LOCK
> -
> -/* SLAB cache for vm_area_struct.lock */
> -static struct kmem_cache *vma_lock_cachep;
> -
> -static bool vma_lock_alloc(struct vm_area_struct *vma)
> -{
> -	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
> -	if (!vma->vm_lock)
> -		return false;
> -
> -	init_rwsem(&vma->vm_lock->lock);
> -	vma->vm_lock_seq = UINT_MAX;
> -
> -	return true;
> -}
> -
> -static inline void vma_lock_free(struct vm_area_struct *vma)
> -{
> -	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
> -}
> -
> -#else /* CONFIG_PER_VMA_LOCK */
> -
> -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
> -static inline void vma_lock_free(struct vm_area_struct *vma) {}
> -
> -#endif /* CONFIG_PER_VMA_LOCK */
> -
>  struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
>  {
>  	struct vm_area_struct *vma;
> @@ -474,10 +445,6 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
>  		return NULL;
>  
>  	vma_init(vma, mm);
> -	if (!vma_lock_alloc(vma)) {
> -		kmem_cache_free(vm_area_cachep, vma);
> -		return NULL;
> -	}
>  
>  	return vma;
>  }
> @@ -496,10 +463,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>  	 * will be reinitialized.
>  	 */
>  	data_race(memcpy(new, orig, sizeof(*new)));
> -	if (!vma_lock_alloc(new)) {
> -		kmem_cache_free(vm_area_cachep, new);
> -		return NULL;
> -	}
> +	vma_lock_init(new);
>  	INIT_LIST_HEAD(&new->anon_vma_chain);
>  	vma_numab_state_init(new);
>  	dup_anon_vma_name(orig, new);
> @@ -511,7 +475,6 @@ void __vm_area_free(struct vm_area_struct *vma)
>  {
>  	vma_numab_state_free(vma);
>  	free_anon_vma_name(vma);
> -	vma_lock_free(vma);
>  	kmem_cache_free(vm_area_cachep, vma);
>  }
>  
> @@ -522,7 +485,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
>  						  vm_rcu);
>  
>  	/* The vma should not be locked while being destroyed. */
> -	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
> +	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
>  	__vm_area_free(vma);
>  }
>  #endif
> @@ -3188,11 +3151,9 @@ void __init proc_caches_init(void)
>  			sizeof(struct fs_struct), 0,
>  			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
>  			NULL);
> -
> -	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
> -#ifdef CONFIG_PER_VMA_LOCK
> -	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
> -#endif
> +	vm_area_cachep = KMEM_CACHE(vm_area_struct,
> +			SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC|
> +			SLAB_ACCOUNT);
>  	mmap_init();
>  	nsproxy_cache_init();
>  }
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index ae635eecbfa8..d19ce6fcab83 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -270,10 +270,10 @@ struct vm_area_struct {
>  	/*
>  	 * Can only be written (using WRITE_ONCE()) while holding both:
>  	 *  - mmap_lock (in write mode)
> -	 *  - vm_lock->lock (in write mode)
> +	 *  - vm_lock.lock (in write mode)
>  	 * Can be read reliably while holding one of:
>  	 *  - mmap_lock (in read or write mode)
> -	 *  - vm_lock->lock (in read or write mode)
> +	 *  - vm_lock.lock (in read or write mode)
>  	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
>  	 * while holding nothing (except RCU to keep the VMA struct allocated).
>  	 *
> @@ -282,7 +282,7 @@ struct vm_area_struct {
>  	 * slowpath.
>  	 */
>  	unsigned int vm_lock_seq;
> -	struct vma_lock *vm_lock;
> +	struct vma_lock vm_lock;
>  #endif
>  
>  	/*
> @@ -459,17 +459,10 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
>  	return mas_find(&vmi->mas, ULONG_MAX);
>  }
>  
> -static inline bool vma_lock_alloc(struct vm_area_struct *vma)
> +static inline void vma_lock_init(struct vm_area_struct *vma)
>  {
> -	vma->vm_lock = calloc(1, sizeof(struct vma_lock));
> -
> -	if (!vma->vm_lock)
> -		return false;
> -
> -	init_rwsem(&vma->vm_lock->lock);
> +	init_rwsem(&vma->vm_lock.lock);
>  	vma->vm_lock_seq = UINT_MAX;
> -
> -	return true;
>  }
>  
>  static inline void vma_assert_write_locked(struct vm_area_struct *);
> @@ -492,6 +485,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>  	vma->vm_ops = &vma_dummy_vm_ops;
>  	INIT_LIST_HEAD(&vma->anon_vma_chain);
>  	vma_mark_detached(vma, false);
> +	vma_lock_init(vma);
>  }
>  
>  static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> @@ -502,10 +496,6 @@ static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
>  		return NULL;
>  
>  	vma_init(vma, mm);
> -	if (!vma_lock_alloc(vma)) {
> -		free(vma);
> -		return NULL;
> -	}
>  
>  	return vma;
>  }
> @@ -518,10 +508,7 @@ static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>  		return NULL;
>  
>  	memcpy(new, orig, sizeof(*new));
> -	if (!vma_lock_alloc(new)) {
> -		free(new);
> -		return NULL;
> -	}
> +	vma_lock_init(new);
>  	INIT_LIST_HEAD(&new->anon_vma_chain);
>  
>  	return new;
> @@ -691,14 +678,8 @@ static inline void mpol_put(struct mempolicy *)
>  {
>  }
>  
> -static inline void vma_lock_free(struct vm_area_struct *vma)
> -{
> -	free(vma->vm_lock);
> -}
> -
>  static inline void __vm_area_free(struct vm_area_struct *vma)
>  {
> -	vma_lock_free(vma);
>  	free(vma);
>  }
>  
> -- 
> 2.47.1.613.gc27f4b7a9f-goog
>