Message-ID: <CAJuCfpHmDxgFj+Jh9+qK1nwnnGWGBvPYEFrLSdooCUy5ODRboA@mail.gmail.com>
Date: Thu, 25 Sep 2025 09:14:50 -0700
From: Suren Baghdasaryan <surenb@...gle.com>
To: Vlastimil Babka <vbabka@...e.cz>
Cc: "Liam R. Howlett" <Liam.Howlett@...cle.com>, Christoph Lameter <cl@...two.org>,
David Rientjes <rientjes@...gle.com>, Roman Gushchin <roman.gushchin@...ux.dev>,
Harry Yoo <harry.yoo@...cle.com>, Uladzislau Rezki <urezki@...il.com>,
Sidhartha Kumar <sidhartha.kumar@...cle.com>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org, rcu@...r.kernel.org,
maple-tree@...ts.infradead.org
Subject: Re: [PATCH v8 07/23] slab: skip percpu sheaves for remote object freeing
On Wed, Sep 10, 2025 at 1:01 AM Vlastimil Babka <vbabka@...e.cz> wrote:
>
> Since we don't control the NUMA locality of objects in percpu sheaves,
> allocations with node restrictions bypass them. Allocations without
> restrictions may however still expect to get local objects with high
> probability, and the introduction of sheaves can decrease it due to
> freed object from a remote node ending up in percpu sheaves.
>
> The fraction of such remote frees seems low (5% on an 8-node machine)
> but it can be expected that some cache or workload specific corner cases
> exist. We can either conclude that this is not a problem due to the low
> fraction, or we can make remote frees bypass percpu sheaves and go
> directly to their slabs. This will make the remote frees more expensive,
> but if if's only a small fraction, most frees will still benefit from
s/if's/it's
> the lower overhead of percpu sheaves.
>
> This patch thus makes remote object freeing bypass percpu sheaves,
> including bulk freeing, and kfree_rcu() via the rcu_free sheaf. However
> it's not intended to be 100% guarantee that percpu sheaves will only
> contain local objects. The refill from slabs does not provide that
> guarantee in the first place, and there might be cpu migrations
> happening when we need to unlock the local_lock. Avoiding all that could
> be possible but complicated so we can leave it for later investigation
> whether it would be worth it. It can be expected that the more selective
> freeing will itself prevent accumulation of remote objects in percpu
> sheaves so any such violations would have only short-term effects.
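The resulting fast-path decision reads nicely; condensing the slab_free()
hunk below for my own understanding (not a suggestion, just my restatement
of what the patch does):

        if (s->cpu_sheaves &&
            (!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) {
                /* local object (or !NUMA build): cheap percpu sheaf free */
                if (free_to_pcs(s, object))
                        return;
        }
        /* remote object, cache without sheaves, or sheaf path refused it */
        do_slab_free(s, slab, object, object, 1, addr);

so the common local free keeps the sheaf fast path and only remote objects
pay the direct-to-slab cost.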
>
> Reviewed-by: Harry Yoo <harry.yoo@...cle.com>
> Signed-off-by: Vlastimil Babka <vbabka@...e.cz>
Reviewed-by: Suren Baghdasaryan <surenb@...gle.com>
> ---
> mm/slab_common.c | 7 +++++--
> mm/slub.c | 42 ++++++++++++++++++++++++++++++++++++------
> 2 files changed, 41 insertions(+), 8 deletions(-)
>
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 005a4319c06a01d2b616a75396fcc43766a62ddb..b6601e0fe598e24bd8d456dce4fc82c65b342bfd 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1623,8 +1623,11 @@ static bool kfree_rcu_sheaf(void *obj)
>
> slab = folio_slab(folio);
> s = slab->slab_cache;
> - if (s->cpu_sheaves)
> - return __kfree_rcu_sheaf(s, obj);
> + if (s->cpu_sheaves) {
> + if (likely(!IS_ENABLED(CONFIG_NUMA) ||
> + slab_nid(slab) == numa_mem_id()))
> + return __kfree_rcu_sheaf(s, obj);
> + }
>
> return false;
> }
> diff --git a/mm/slub.c b/mm/slub.c
> index 35274ce4e709c9da7ac8f9006c824f28709e923d..9699d048b2cd08ee75c4cc3d1e460868704520b1 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -472,6 +472,7 @@ struct slab_sheaf {
> };
> struct kmem_cache *cache;
> unsigned int size;
> + int node; /* only used for rcu_sheaf */
> void *objects[];
> };
>
> @@ -5828,7 +5829,7 @@ static void rcu_free_sheaf(struct rcu_head *head)
> */
> __rcu_free_sheaf_prepare(s, sheaf);
>
> - barn = get_node(s, numa_mem_id())->barn;
> + barn = get_node(s, sheaf->node)->barn;
>
> /* due to slab_free_hook() */
> if (unlikely(sheaf->size == 0))
> @@ -5914,10 +5915,12 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
>
> rcu_sheaf->objects[rcu_sheaf->size++] = obj;
>
> - if (likely(rcu_sheaf->size < s->sheaf_capacity))
> + if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
> rcu_sheaf = NULL;
> - else
> + } else {
> pcs->rcu_free = NULL;
> + rcu_sheaf->node = numa_mem_id();
> + }
>
> local_unlock(&s->cpu_sheaves->lock);
>
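As I understand it, recording the node here, at the point the sheaf fills
and is handed off to RCU, rather than looking it up in the callback is what
keeps the barn choice consistent. A rough sketch of the case being avoided,
assuming the callback can run on a CPU on a different node:

        /* CPU on node 0: rcu sheaf fills with node-0-local objects */
        rcu_sheaf->node = numa_mem_id();        /* == 0 */
        /* sheaf handed off to RCU with rcu_free_sheaf() as the callback */

        /* after a grace period, the callback may run on a node-1 CPU: */
        barn = get_node(s, sheaf->node)->barn;  /* node 0's barn, not
                                                   numa_mem_id() == 1 */

so the sheaf is returned to the barn of the node its objects actually
belong to.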
> @@ -5944,7 +5947,11 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
> bool init = slab_want_init_on_free(s);
> unsigned int batch, i = 0;
> struct node_barn *barn;
> + void *remote_objects[PCS_BATCH_MAX];
> + unsigned int remote_nr = 0;
> + int node = numa_mem_id();
>
> +next_remote_batch:
> while (i < size) {
> struct slab *slab = virt_to_slab(p[i]);
>
> @@ -5954,7 +5961,15 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
> if (unlikely(!slab_free_hook(s, p[i], init, false))) {
> p[i] = p[--size];
> if (!size)
> - return;
> + goto flush_remote;
> + continue;
> + }
> +
> + if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) {
> + remote_objects[remote_nr] = p[i];
> + p[i] = p[--size];
> + if (++remote_nr >= PCS_BATCH_MAX)
> + goto flush_remote;
> continue;
> }
>
> @@ -6024,6 +6039,15 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
> */
> fallback:
> __kmem_cache_free_bulk(s, size, p);
> +
> +flush_remote:
> + if (remote_nr) {
> + __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
> + if (i < size) {
> + remote_nr = 0;
> + goto next_remote_batch;
> + }
> + }
> }
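The bulk path also looks good to me. Condensing the two hunks above for my
own understanding: remote objects are compacted out of p[] into a bounded
on-stack batch and flushed separately, and the fallback: path falls through
into flush_remote:, so remote objects still get freed even when the sheaf
handling bails out:

        if (slab_nid(slab) != node) {
                remote_objects[remote_nr] = p[i];
                p[i] = p[--size];               /* compact p[] in place */
                if (++remote_nr >= PCS_BATCH_MAX)
                        goto flush_remote;      /* flush, then resume the scan */
                continue;
        }
        ...
flush_remote:
        if (remote_nr) {
                __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
                if (i < size) {                 /* more objects left to scan */
                        remote_nr = 0;
                        goto next_remote_batch;
                }
        }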
>
> #ifndef CONFIG_SLUB_TINY
> @@ -6115,8 +6139,14 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
> if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
> return;
>
> - if (!s->cpu_sheaves || !free_to_pcs(s, object))
> - do_slab_free(s, slab, object, object, 1, addr);
> + if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
> + slab_nid(slab) == numa_mem_id())) {
> + if (likely(free_to_pcs(s, object))) {
nit: no need for curly braces here.
> + return;
> + }
> + }
> +
> + do_slab_free(s, slab, object, object, 1, addr);
> }
>
> #ifdef CONFIG_MEMCG
>
> --
> 2.51.0
>