linux-kernel - Re: [PATCH RFC 14/19] slab: simplify kmalloc_nolock()

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4ukrk3ziayvxrcfxm2izwrwt3qrmr4fcsefl4n7oodc4t2hxgt@ijk63r4f3rkr>
Date: Tue, 16 Dec 2025 10:35:33 +0800
From: Hao Li <hao.li@...ux.dev>
To: Vlastimil Babka <vbabka@...e.cz>
Cc: Andrew Morton <akpm@...ux-foundation.org>, 
	Christoph Lameter <cl@...two.org>, David Rientjes <rientjes@...gle.com>, 
	Roman Gushchin <roman.gushchin@...ux.dev>, Harry Yoo <harry.yoo@...cle.com>, 
	Uladzislau Rezki <urezki@...il.com>, "Liam R. Howlett" <Liam.Howlett@...cle.com>, 
	Suren Baghdasaryan <surenb@...gle.com>, Sebastian Andrzej Siewior <bigeasy@...utronix.de>, 
	Alexei Starovoitov <ast@...nel.org>, linux-mm@...ck.org, linux-kernel@...r.kernel.org, 
	linux-rt-devel@...ts.linux.dev, bpf@...r.kernel.org, kasan-dev@...glegroups.com
Subject: Re: [PATCH RFC 14/19] slab: simplify kmalloc_nolock()

On Thu, Oct 23, 2025 at 03:52:36PM +0200, Vlastimil Babka wrote:
> The kmalloc_nolock() implementation has several complications and
> restrictions due to SLUB's cpu slab locking, lockless fastpath and
> PREEMPT_RT differences. With cpu slab usage removed, we can simplify
> things:
> 
> - the local_lock_cpu_slab() macros became unused, remove them
> 
> - we no longer need to set up lockdep classes on PREEMPT_RT
> 
> - we no longer need to annotate ___slab_alloc as NOKPROBE_SYMBOL
>   since there's no lockless cpu freelist manipulation anymore
> 
> - __slab_alloc_node() can be called from kmalloc_nolock_noprof()
>   unconditionally
> 
> Note that we still need __CMPXCHG_DOUBLE, because while we don't use
> cmpxchg16b on the cpu freelist anymore, we still use it on the slab
> freelist, and the alternative is slab_lock() which can be interrupted
> by an NMI. Clarify the comment to mention it specifically.
> 
> Signed-off-by: Vlastimil Babka <vbabka@...e.cz>
> ---
>  mm/slab.h |   1 -
>  mm/slub.c | 100 ++++----------------------------------------------------------
>  2 files changed, 6 insertions(+), 95 deletions(-)
> 
> diff --git a/mm/slab.h b/mm/slab.h
> index b2663cc594f3..7dde0b56a7b0 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -208,7 +208,6 @@ struct kmem_cache_order_objects {
>   */
>  struct kmem_cache {
>  	struct kmem_cache_cpu __percpu *cpu_slab;
> -	struct lock_class_key lock_key;
>  	struct slub_percpu_sheaves __percpu *cpu_sheaves;
>  	/* Used for retrieving partial slabs, etc. */
>  	slab_flags_t flags;
> diff --git a/mm/slub.c b/mm/slub.c
> index 6f5ca26bbb00..6dd7fd153391 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3679,29 +3679,12 @@ static inline unsigned int init_tid(int cpu)
>  
>  static void init_kmem_cache_cpus(struct kmem_cache *s)
>  {
> -#ifdef CONFIG_PREEMPT_RT
> -	/*
> -	 * Register lockdep key for non-boot kmem caches to avoid
> -	 * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key()
> -	 */
> -	bool finegrain_lockdep = !init_section_contains(s, 1);
> -#else
> -	/*
> -	 * Don't bother with different lockdep classes for each
> -	 * kmem_cache, since we only use local_trylock_irqsave().
> -	 */
> -	bool finegrain_lockdep = false;
> -#endif
>  	int cpu;
>  	struct kmem_cache_cpu *c;
>  
> -	if (finegrain_lockdep)
> -		lockdep_register_key(&s->lock_key);
>  	for_each_possible_cpu(cpu) {
>  		c = per_cpu_ptr(s->cpu_slab, cpu);
>  		local_trylock_init(&c->lock);
> -		if (finegrain_lockdep)
> -			lockdep_set_class(&c->lock, &s->lock_key);
>  		c->tid = init_tid(cpu);
>  	}
>  }
> @@ -3792,47 +3775,6 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
>  	}
>  }
>  
> -/*
> - * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
> - * can be acquired without a deadlock before invoking the function.
> - *
> - * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
> - * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
> - * and kmalloc() is not used in an unsupported context.
> - *
> - * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
> - * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
> - * lockdep_assert() will catch a bug in case:
> - * #1
> - * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
> - * or
> - * #2
> - * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
> - *
> - * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
> - * disabled context. The lock will always be acquired and if needed it
> - * block and sleep until the lock is available.
> - * #1 is possible in !PREEMPT_RT only.
> - * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
> - * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
> - *    tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
> - *
> - * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
> - */
> -#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
> -#define local_lock_cpu_slab(s, flags)	\
> -	local_lock_irqsave(&(s)->cpu_slab->lock, flags)
> -#else
> -#define local_lock_cpu_slab(s, flags)					       \
> -	do {								       \
> -		bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
> -		lockdep_assert(__l);					       \
> -	} while (0)
> -#endif
> -
> -#define local_unlock_cpu_slab(s, flags)	\
> -	local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
> -
>  static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
>  {
>  	unsigned long flags;
> @@ -4320,19 +4262,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  
>  	return freelist;
>  }
> -/*
> - * We disallow kprobes in ___slab_alloc() to prevent reentrance
> - *
> - * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
> - * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
> - * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
> - * manipulating c->freelist without lock.
> - *
> - * This does not prevent kprobe in functions called from ___slab_alloc() such as
> - * local_lock_irqsave() itself, and that is fine, we only need to protect the
> - * c->freelist manipulation in ___slab_alloc() itself.
> - */
> -NOKPROBE_SYMBOL(___slab_alloc);
>  
>  static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
>  		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
> @@ -5201,10 +5130,11 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
>  	if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
>  		/*
>  		 * kmalloc_nolock() is not supported on architectures that
> -		 * don't implement cmpxchg16b, but debug caches don't use
> -		 * per-cpu slab and per-cpu partial slabs. They rely on
> -		 * kmem_cache_node->list_lock, so kmalloc_nolock() can
> -		 * attempt to allocate from debug caches by
> +		 * don't implement cmpxchg16b and thus need slab_lock()
> +		 * which could be preempted by a nmi.
> +		 * But debug caches don't use that and only rely on
> +		 * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt
> +		 * to allocate from debug caches by
>  		 * spin_trylock_irqsave(&n->list_lock, ...)
>  		 */
>  		return NULL;
> @@ -5214,27 +5144,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
>  	if (ret)
>  		goto success;
>  
> -	ret = ERR_PTR(-EBUSY);
> -
>  	/*
>  	 * Do not call slab_alloc_node(), since trylock mode isn't
>  	 * compatible with slab_pre_alloc_hook/should_failslab and
>  	 * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
>  	 * and slab_post_alloc_hook() directly.
> -	 *
> -	 * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
> -	 * in irq saved region. It assumes that the same cpu will not
> -	 * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
> -	 * Therefore use in_nmi() to check whether particular bucket is in
> -	 * irq protected section.
> -	 *
> -	 * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
> -	 * this cpu was interrupted somewhere inside ___slab_alloc() after
> -	 * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
> -	 * In this case fast path with __update_cpu_freelist_fast() is not safe.
>  	 */
> -	if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
> -		ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
> +	ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
>  
>  	if (PTR_ERR(ret) == -EBUSY) {

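After Patch 10 is applied, the logic that returns `-EBUSY` has been
removed along with the `s->cpu_slab` logic. As a result, it appears that
`__slab_alloc_node()` will no longer return `ERR_PTR(-EBUSY)`, so the
`PTR_ERR(ret) == -EBUSY` check above (and the retry under it) looks like
it can never be taken.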

>  		if (can_retry) {
> @@ -7250,10 +7166,6 @@ void __kmem_cache_release(struct kmem_cache *s)
>  {
>  	cache_random_seq_destroy(s);
>  	pcs_destroy(s);
> -#ifdef CONFIG_PREEMPT_RT
> -	if (s->cpu_slab)
> -		lockdep_unregister_key(&s->lock_key);
> -#endif
>  	free_percpu(s->cpu_slab);
>  	free_kmem_cache_nodes(s);
>  }
> 
> -- 
> 2.51.1
>