[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6ccb903e-b743-4b12-b30b-5b7f55c0bc30@lucifer.local>
Date: Mon, 18 Nov 2024 13:24:36 +0000
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: Suren Baghdasaryan <surenb@...gle.com>
Cc: akpm@...ux-foundation.org, willy@...radead.org, liam.howlett@...cle.com,
mhocko@...e.com, vbabka@...e.cz, hannes@...xchg.org, mjguzik@...il.com,
oliver.sang@...el.com, mgorman@...hsingularity.net, david@...hat.com,
peterx@...hat.com, oleg@...hat.com, dave@...olabs.net,
paulmck@...nel.org, brauner@...nel.org, dhowells@...hat.com,
hdanton@...a.com, hughd@...gle.com, minchan@...gle.com,
jannh@...gle.com, shakeel.butt@...ux.dev, souravpanda@...gle.com,
pasha.tatashin@...een.com, corbet@....net, linux-doc@...r.kernel.org,
linux-mm@...ck.org, linux-kernel@...r.kernel.org,
kernel-team@...roid.com
Subject: Re: [PATCH v3 2/5] mm: move per-vma lock into vm_area_struct
On Sun, Nov 17, 2024 at 12:09:28AM -0800, Suren Baghdasaryan wrote:
> Back when per-vma locks were introduced, vm_lock was moved out of
> vm_area_struct in [1] because of the performance regression caused by
> false cacheline sharing. Recent investigation [2] revealed that the
> regression is limited to a rather old Broadwell microarchitecture and
> even there it can be mitigated by disabling adjacent cacheline
> prefetching, see [3].
> Splitting a single logical structure into multiple ones leads to more
> complicated management, extra pointer dereferences and overall less
> maintainable code. When that split-away part is a lock, it complicates
> things even further. With no performance benefits, there are no reasons
> for this split. Merging the vm_lock back into vm_area_struct also allows
> vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset.
> Move vm_lock back into vm_area_struct, aligning it at the cacheline
> boundary and changing the cache to be cacheline-aligned as well.
Thanks!
> With kernel compiled using defconfig, this causes VMA memory consumption
> to grow from 160 (vm_area_struct) + 40 (vm_lock) bytes to 256 bytes:
>
> slabinfo before:
> <name> ... <objsize> <objperslab> <pagesperslab> : ...
> vma_lock ... 40 102 1 : ...
> vm_area_struct ... 160 51 2 : ...
>
> slabinfo after moving vm_lock:
> <name> ... <objsize> <objperslab> <pagesperslab> : ...
> vm_area_struct ... 256 32 2 : ...
>
> Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages,
> which is 5.5MB per 100000 VMAs. Note that the size of this structure is
> dependent on the kernel configuration and typically the original size is
> higher than 160 bytes. Therefore these calculations are close to the
> worst case scenario. A more realistic vm_area_struct usage before this
> change is:
>
> <name> ... <objsize> <objperslab> <pagesperslab> : ...
> vma_lock ... 40 102 1 : ...
> vm_area_struct ... 176 46 2 : ...
>
> Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages,
> which is 3.9MB per 100000 VMAs.
> This memory consumption growth can be addressed later by optimizing the
> vm_lock.
>
> [1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/
> [2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/
> [3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@mail.gmail.com/
>
> Signed-off-by: Suren Baghdasaryan <surenb@...gle.com>
LGTM, and briefly tested in VM and looking good so:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
> ---
> include/linux/mm.h | 28 ++++++++++--------
> include/linux/mm_types.h | 6 ++--
> kernel/fork.c | 49 ++++----------------------------
> tools/testing/vma/vma_internal.h | 33 +++++----------------
> 4 files changed, 32 insertions(+), 84 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1ba2e480ae63..737c003b0a1e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -684,6 +684,12 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
> #endif /* CONFIG_NUMA_BALANCING */
>
> #ifdef CONFIG_PER_VMA_LOCK
> +static inline void vma_lock_init(struct vm_area_struct *vma)
> +{
> + init_rwsem(&vma->vm_lock.lock);
> + vma->vm_lock_seq = UINT_MAX;
> +}
> +
> /*
> * Try to read-lock a vma. The function is allowed to occasionally yield false
> * locked result to avoid performance overhead, in which case we fall back to
> @@ -701,7 +707,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
> return false;
>
> - if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> + if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
> return false;
>
> /*
> @@ -716,7 +722,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> * This pairs with RELEASE semantics in vma_end_write_all().
> */
> if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> - up_read(&vma->vm_lock->lock);
> + up_read(&vma->vm_lock.lock);
> return false;
> }
> return true;
> @@ -731,7 +737,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
> {
> mmap_assert_locked(vma->vm_mm);
> - down_read_nested(&vma->vm_lock->lock, subclass);
> + down_read_nested(&vma->vm_lock.lock, subclass);
> }
>
> /*
> @@ -743,13 +749,13 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int
> static inline void vma_start_read_locked(struct vm_area_struct *vma)
> {
> mmap_assert_locked(vma->vm_mm);
> - down_read(&vma->vm_lock->lock);
> + down_read(&vma->vm_lock.lock);
> }
>
> static inline void vma_end_read(struct vm_area_struct *vma)
> {
> rcu_read_lock(); /* keeps vma alive till the end of up_read */
> - up_read(&vma->vm_lock->lock);
> + up_read(&vma->vm_lock.lock);
> rcu_read_unlock();
> }
>
> @@ -778,7 +784,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> if (__is_vma_write_locked(vma, &mm_lock_seq))
> return;
>
> - down_write(&vma->vm_lock->lock);
> + down_write(&vma->vm_lock.lock);
> /*
> * We should use WRITE_ONCE() here because we can have concurrent reads
> * from the early lockless pessimistic check in vma_start_read().
> @@ -786,7 +792,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
> */
> WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> - up_write(&vma->vm_lock->lock);
> + up_write(&vma->vm_lock.lock);
> }
>
> static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> @@ -798,7 +804,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
>
> static inline void vma_assert_locked(struct vm_area_struct *vma)
> {
> - if (!rwsem_is_locked(&vma->vm_lock->lock))
> + if (!rwsem_is_locked(&vma->vm_lock.lock))
> vma_assert_write_locked(vma);
> }
>
> @@ -831,6 +837,7 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
>
> #else /* CONFIG_PER_VMA_LOCK */
>
> +static inline void vma_lock_init(struct vm_area_struct *vma) {}
> static inline bool vma_start_read(struct vm_area_struct *vma)
> { return false; }
> static inline void vma_end_read(struct vm_area_struct *vma) {}
> @@ -865,10 +872,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
>
> extern const struct vm_operations_struct vma_dummy_vm_ops;
>
> -/*
> - * WARNING: vma_init does not initialize vma->vm_lock.
> - * Use vm_area_alloc()/vm_area_free() if vma needs locking.
> - */
> static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> {
> memset(vma, 0, sizeof(*vma));
> @@ -877,6 +880,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> INIT_LIST_HEAD(&vma->anon_vma_chain);
> vma_mark_detached(vma, false);
> vma_numab_state_init(vma);
> + vma_lock_init(vma);
> }
>
> /* Use when VMA is not part of the VMA tree and needs no locking */
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 80fef38d9d64..5c4bfdcfac72 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -716,8 +716,6 @@ struct vm_area_struct {
> * slowpath.
> */
> unsigned int vm_lock_seq;
> - /* Unstable RCU readers are allowed to read this. */
> - struct vma_lock *vm_lock;
> #endif
>
> /*
> @@ -770,6 +768,10 @@ struct vm_area_struct {
> struct vma_numab_state *numab_state; /* NUMA Balancing state */
> #endif
> struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> +#ifdef CONFIG_PER_VMA_LOCK
> + /* Unstable RCU readers are allowed to read this. */
> + struct vma_lock vm_lock ____cacheline_aligned_in_smp;
> +#endif
> } __randomize_layout;
>
> #ifdef CONFIG_NUMA
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 0061cf2450ef..7823797e31d2 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep;
> /* SLAB cache for mm_struct structures (tsk->mm) */
> static struct kmem_cache *mm_cachep;
>
> -#ifdef CONFIG_PER_VMA_LOCK
> -
> -/* SLAB cache for vm_area_struct.lock */
> -static struct kmem_cache *vma_lock_cachep;
> -
> -static bool vma_lock_alloc(struct vm_area_struct *vma)
> -{
> - vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
> - if (!vma->vm_lock)
> - return false;
> -
> - init_rwsem(&vma->vm_lock->lock);
> - vma->vm_lock_seq = UINT_MAX;
> -
> - return true;
> -}
> -
> -static inline void vma_lock_free(struct vm_area_struct *vma)
> -{
> - kmem_cache_free(vma_lock_cachep, vma->vm_lock);
> -}
> -
> -#else /* CONFIG_PER_VMA_LOCK */
> -
> -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
> -static inline void vma_lock_free(struct vm_area_struct *vma) {}
> -
> -#endif /* CONFIG_PER_VMA_LOCK */
> -
> struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> {
> struct vm_area_struct *vma;
> @@ -474,10 +445,6 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> return NULL;
>
> vma_init(vma, mm);
> - if (!vma_lock_alloc(vma)) {
> - kmem_cache_free(vm_area_cachep, vma);
> - return NULL;
> - }
>
> return vma;
> }
> @@ -496,10 +463,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> * will be reinitialized.
> */
> data_race(memcpy(new, orig, sizeof(*new)));
> - if (!vma_lock_alloc(new)) {
> - kmem_cache_free(vm_area_cachep, new);
> - return NULL;
> - }
> + vma_lock_init(new);
> INIT_LIST_HEAD(&new->anon_vma_chain);
> vma_numab_state_init(new);
> dup_anon_vma_name(orig, new);
> @@ -511,7 +475,6 @@ void __vm_area_free(struct vm_area_struct *vma)
> {
> vma_numab_state_free(vma);
> free_anon_vma_name(vma);
> - vma_lock_free(vma);
> kmem_cache_free(vm_area_cachep, vma);
> }
>
> @@ -522,7 +485,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
> vm_rcu);
>
> /* The vma should not be locked while being destroyed. */
> - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
> + VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
> __vm_area_free(vma);
> }
> #endif
> @@ -3168,11 +3131,9 @@ void __init proc_caches_init(void)
> sizeof(struct fs_struct), 0,
> SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> NULL);
> -
> - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
> -#ifdef CONFIG_PER_VMA_LOCK
> - vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
> -#endif
> + vm_area_cachep = KMEM_CACHE(vm_area_struct,
> + SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC|
> + SLAB_ACCOUNT);
> mmap_init();
> nsproxy_cache_init();
> }
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 1d9fc97b8e80..11c2c38ca4e8 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -230,10 +230,10 @@ struct vm_area_struct {
> /*
> * Can only be written (using WRITE_ONCE()) while holding both:
> * - mmap_lock (in write mode)
> - * - vm_lock->lock (in write mode)
> + * - vm_lock.lock (in write mode)
> * Can be read reliably while holding one of:
> * - mmap_lock (in read or write mode)
> - * - vm_lock->lock (in read or write mode)
> + * - vm_lock.lock (in read or write mode)
> * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
> * while holding nothing (except RCU to keep the VMA struct allocated).
> *
> @@ -242,7 +242,7 @@ struct vm_area_struct {
> * slowpath.
> */
> unsigned int vm_lock_seq;
> - struct vma_lock *vm_lock;
> + struct vma_lock vm_lock;
> #endif
>
> /*
> @@ -408,17 +408,10 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
> return mas_find(&vmi->mas, ULONG_MAX);
> }
>
> -static inline bool vma_lock_alloc(struct vm_area_struct *vma)
> +static inline void vma_lock_init(struct vm_area_struct *vma)
> {
> - vma->vm_lock = calloc(1, sizeof(struct vma_lock));
> -
> - if (!vma->vm_lock)
> - return false;
> -
> - init_rwsem(&vma->vm_lock->lock);
> + init_rwsem(&vma->vm_lock.lock);
> vma->vm_lock_seq = UINT_MAX;
> -
> - return true;
> }
>
> static inline void vma_assert_write_locked(struct vm_area_struct *);
> @@ -439,6 +432,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> vma->vm_ops = &vma_dummy_vm_ops;
> INIT_LIST_HEAD(&vma->anon_vma_chain);
> vma_mark_detached(vma, false);
> + vma_lock_init(vma);
> }
>
> static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> @@ -449,10 +443,6 @@ static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> return NULL;
>
> vma_init(vma, mm);
> - if (!vma_lock_alloc(vma)) {
> - free(vma);
> - return NULL;
> - }
>
> return vma;
> }
> @@ -465,10 +455,7 @@ static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> return NULL;
>
> memcpy(new, orig, sizeof(*new));
> - if (!vma_lock_alloc(new)) {
> - free(new);
> - return NULL;
> - }
> + vma_lock_init(new);
> INIT_LIST_HEAD(&new->anon_vma_chain);
>
> return new;
> @@ -638,14 +625,8 @@ static inline void mpol_put(struct mempolicy *)
> {
> }
>
> -static inline void vma_lock_free(struct vm_area_struct *vma)
> -{
> - free(vma->vm_lock);
> -}
> -
> static inline void __vm_area_free(struct vm_area_struct *vma)
> {
> - vma_lock_free(vma);
> free(vma);
> }
>
> --
> 2.47.0.338.g60cca15819-goog
>
Powered by blists - more mailing lists