Message-ID: <be0ba18e-f752-4df3-a224-fc978431236c@suse.cz>
Date: Mon, 8 Sep 2025 14:26:11 +0200
From: Vlastimil Babka <vbabka@...e.cz>
To: Harry Yoo <harry.yoo@...cle.com>
Cc: Suren Baghdasaryan <surenb@...gle.com>,
 "Liam R. Howlett" <Liam.Howlett@...cle.com>,
 Christoph Lameter <cl@...two.org>, David Rientjes <rientjes@...gle.com>,
 Roman Gushchin <roman.gushchin@...ux.dev>,
 Uladzislau Rezki <urezki@...il.com>,
 Sidhartha Kumar <sidhartha.kumar@...cle.com>, linux-mm@...ck.org,
 linux-kernel@...r.kernel.org, rcu@...r.kernel.org,
 maple-tree@...ts.infradead.org, Venkat Rao Bagalkote <venkat88@...ux.ibm.com>
Subject: Re: [PATCH v7 03/21] slab: add opt-in caching layer of percpu sheaves

On 9/8/25 13:19, Harry Yoo wrote:
> On Wed, Sep 03, 2025 at 02:59:45PM +0200, Vlastimil Babka wrote:
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index bfe7c40eeee1a01c175766935c1e3c0304434a53..e2b197e47866c30acdbd1fee4159f262a751c5a7 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -163,6 +163,9 @@ int slab_unmergeable(struct kmem_cache *s)
>>  		return 1;
>>  #endif
>>  
>> +	if (s->cpu_sheaves)
>> +		return 1;
>> +
>>  	/*
>>  	 * We may have set a slab to be unmergeable during bootstrap.
>>  	 */
>> @@ -321,7 +324,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
>>  		    object_size - args->usersize < args->useroffset))
>>  		args->usersize = args->useroffset = 0;
>>  
>> -	if (!args->usersize)
>> +	if (!args->usersize && !args->sheaf_capacity)
>>  		s = __kmem_cache_alias(name, object_size, args->align, flags,
>>  				       args->ctor);
> 
> Can we merge caches that use sheaves in the future if the capacity
> is the same, or are there any restrictions for merging that I overlooked?

I think we will be able to merge. It will make more sense once we get to
enabling sheaves for everything and setting the capacity automatically, with
args->sheaf_capacity serving only as a lower bound. For now I think we can
leave it as is.
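
Very roughly, I'd imagine the merging check could then look something like
this (just a sketch; the helper name is made up and it assumes the cache
keeps its capacity around in s->sheaf_capacity):

/*
 * Sketch only: treat the requested capacity as a lower bound, so an
 * existing sheaf-enabled cache can be reused when its capacity is
 * already large enough. Assumes a s->sheaf_capacity field exists;
 * sheaf_capacity_compatible() is a hypothetical helper.
 */
static bool sheaf_capacity_compatible(struct kmem_cache *s,
				      unsigned int requested)
{
	/* a cache without sheaves can only satisfy a request for none */
	if (!s->cpu_sheaves)
		return !requested;

	return s->sheaf_capacity >= requested;
}
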
>> +static bool has_pcs_used(int cpu, struct kmem_cache *s)
>> +{
>> +	struct slub_percpu_sheaves *pcs;
>> +
>> +	if (!s->cpu_sheaves)
>> +		return false;
>> +
>> +	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
>> +
>> +	return (pcs->spare || pcs->main->size);
>> +}
>> +
>> +static void pcs_flush_all(struct kmem_cache *s);
> 
> nit: we don't need these functions to flush sheaves if SLUB_TINY=y
> as we don't create sheaves for SLUB_TINY anymore?

I'll check, but if that were the case I'd also expect a clang warning report
already. Maybe they are still called but don't actually need to be?
TBH we should get rid of SLUB_TINY at this point; maybe have only a boot
option that disables the percpu caching and keep the code compiled.
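
Something along these lines instead of the config option (rough sketch, the
parameter name and flag are made up); cache creation would then just ignore
sheaf capacities when it's set, while all the code stays compiled in:

/* hypothetical boot parameter to disable percpu caching at runtime */
static bool slub_nosheaves __ro_after_init;

static int __init setup_slub_nosheaves(char *str)
{
	slub_nosheaves = true;
	return 1;
}
__setup("slab_nosheaves", setup_slub_nosheaves);
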
>>  /*
>>   * Flush cpu slab.
>> @@ -3358,30 +3793,18 @@ struct slub_flush_work {
>>  static void flush_cpu_slab(struct work_struct *w)
>>  {
>>  	struct kmem_cache *s;
>> -	struct kmem_cache_cpu *c;
>>  	struct slub_flush_work *sfw;
>>  
>>  	sfw = container_of(w, struct slub_flush_work, work);
>>  
>>  	s = sfw->s;
>> -	c = this_cpu_ptr(s->cpu_slab);
>> -
>> -	if (c->slab)
>> -		flush_slab(s, c);
>> -
>> -	put_partials(s);
>> -}
>>  
>> -static bool has_cpu_slab(int cpu, struct kmem_cache *s)
>> -{
>> -	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
>> +	if (s->cpu_sheaves)
>> +		pcs_flush_all(s);
>>  
>> -	return c->slab || slub_percpu_partial(c);
>> +	flush_this_cpu_slab(s);
>>  } 
>> -#else /* CONFIG_SLUB_TINY */
>> -static inline void flush_all_cpus_locked(struct kmem_cache *s) { }
>> -static inline void flush_all(struct kmem_cache *s) { }
>> -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
>> -static inline int slub_cpu_dead(unsigned int cpu) { return 0; }
>> -#endif /* CONFIG_SLUB_TINY */
>> -
>>  /*
>>   * Check if the objects in a per cpu structure fit numa
>>   * locality expectations.
>> @@ -4191,30 +4610,240 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
>>  }
>>  
>>  /*
>> - * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
>> - * have the fastpath folded into their functions. So no function call
>> - * overhead for requests that can be satisfied on the fastpath.
>> - *
>> - * The fastpath works by first checking if the lockless freelist can be used.
>> - * If not then __slab_alloc is called for slow processing.
>> + * Replace the empty main sheaf with a (at least partially) full sheaf.
>>   *
>> - * Otherwise we can simply pick the next object from the lockless free list.
>> + * Must be called with the cpu_sheaves local lock locked. If successful, returns
>> + * the pcs pointer and the local lock locked (possibly on a different cpu than
>> + * initially called). If not successful, returns NULL and the local lock
>> + * unlocked.
>>   */
>> -static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
>> -		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
>> +static struct slub_percpu_sheaves *
>> +__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp)
>>  {
>> -	void *object;
>> -	bool init = false;
>> +	struct slab_sheaf *empty = NULL;
>> +	struct slab_sheaf *full;
>> +	struct node_barn *barn;
>> +	bool can_alloc;
>>  
>> -	s = slab_pre_alloc_hook(s, gfpflags);
>> -	if (unlikely(!s))
>> +	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
>> +
>> +	if (pcs->spare && pcs->spare->size > 0) {
>> +		swap(pcs->main, pcs->spare);
>> +		return pcs;
>> +	}
>> +
>> +	barn = get_barn(s);
>> +
>> +	full = barn_replace_empty_sheaf(barn, pcs->main);
>> +
>> +	if (full) {
>> +		stat(s, BARN_GET);
>> +		pcs->main = full;
>> +		return pcs;
>> +	}
>> +
>> +	stat(s, BARN_GET_FAIL);
>> +
>> +	can_alloc = gfpflags_allow_blocking(gfp);
>> +
>> +	if (can_alloc) {
>> +		if (pcs->spare) {
>> +			empty = pcs->spare;
>> +			pcs->spare = NULL;
>> +		} else {
>> +			empty = barn_get_empty_sheaf(barn);
>> +		}
>> +	}
>> +
>> +	local_unlock(&s->cpu_sheaves->lock);
>> +
>> +	if (!can_alloc)
>> +		return NULL;
>> +
>> +	if (empty) {
>> +		if (!refill_sheaf(s, empty, gfp)) {
>> +			full = empty;
>> +		} else {
>> +			/*
>> +			 * we must be very low on memory so don't bother
>> +			 * with the barn
>> +			 */
>> +			free_empty_sheaf(s, empty);
>> +		}
>> +	} else {
>> +		full = alloc_full_sheaf(s, gfp);
>> +	}
>> +
>> +	if (!full)
>> +		return NULL;
>> +
>> +	/*
>> +	 * we can reach here only when gfpflags_allow_blocking
>> +	 * so this must not be an irq
>> +	 */
>> +	local_lock(&s->cpu_sheaves->lock);
>> +	pcs = this_cpu_ptr(s->cpu_sheaves);
>> +
>> +	/*
>> +	 * If we are returning empty sheaf, we either got it from the
>> +	 * barn or had to allocate one. If we are returning a full
>> +	 * sheaf, it's due to racing or being migrated to a different
>> +	 * cpu. Breaching the barn's sheaf limits should be thus rare
>> +	 * enough so just ignore them to simplify the recovery.
>> +	 */
>> +
>> +	if (pcs->main->size == 0) {
>> +		barn_put_empty_sheaf(barn, pcs->main);
> 
> It should be very rare, but shouldn't it do barn = get_barn(s); again
> after taking s->cpu_sheaves->lock?

Yeah, but it shouldn't really matter which node's barn we use for returning
the empty sheaf. And in the case of the full sheaf below, it's actually better
for preserving NUMA locality to use the original barn.
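
To spell that out (hypothetical helper for illustration only, not something I
intend to add to the patch):

/*
 * Illustration only: which barn a sheaf should go back to after the
 * local lock was dropped and re-taken (possibly on another cpu).
 */
static struct node_barn *sheaf_return_barn(struct kmem_cache *s,
					   struct node_barn *orig_barn,
					   bool sheaf_is_empty)
{
	/* empty sheaf: it has no objects, so any node's barn will do */
	if (sheaf_is_empty)
		return get_barn(s);

	/*
	 * full sheaf: its objects were most likely allocated on the
	 * original node, so the original barn preserves NUMA locality
	 */
	return orig_barn;
}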
