linux-kernel - [PATCH RFC 5/6] mm, slub: cheaper locking for percpu sheaves

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20241112-slub-percpu-caches-v1-5-ddc0bdc27e05@suse.cz>
Date: Tue, 12 Nov 2024 17:38:49 +0100
From: Vlastimil Babka <vbabka@...e.cz>
To: Suren Baghdasaryan <surenb@...gle.com>, 
 "Liam R. Howlett" <Liam.Howlett@...cle.com>, 
 Christoph Lameter <cl@...ux.com>, David Rientjes <rientjes@...gle.com>, 
 Pekka Enberg <penberg@...nel.org>, Joonsoo Kim <iamjoonsoo.kim@....com>
Cc: Roman Gushchin <roman.gushchin@...ux.dev>, 
 Hyeonggon Yoo <42.hyeyoo@...il.com>, 
 "Paul E. McKenney" <paulmck@...nel.org>, 
 Lorenzo Stoakes <lorenzo.stoakes@...cle.com>, 
 Matthew Wilcox <willy@...radead.org>, Boqun Feng <boqun.feng@...il.com>, 
 Uladzislau Rezki <urezki@...il.com>, linux-mm@...ck.org, 
 linux-kernel@...r.kernel.org, rcu@...r.kernel.org, 
 maple-tree@...ts.infradead.org, Mateusz Guzik <mjguzik@...il.com>, 
 Jann Horn <jannh@...gle.com>, Vlastimil Babka <vbabka@...e.cz>
Subject: [PATCH RFC 5/6] mm, slub: cheaper locking for percpu sheaves

Instead of local_lock_irqsave(), use just get_cpu_ptr() (which only
disables preemption) and then set an active flag. If potential callers
include irq handler, the operation must use a trylock variant that bails
out if the flag is already set to active because we interrupted another
operation in progress.

Changing the flag doesn't need to be atomic as the irq is one the same
cpu. This should make using percpu sheaves cheaper, with the downside of
some unlucky operations in irq handlers have to fallback to non-sheave
variants. That should be rare so there should be a net benefit.

On PREEMPT_RT we can use simply local_lock() as that does the right
thing without the need to disable irqs.

Thanks to Mateusz Guzik and Jann Horn for suggesting this kind of
locking scheme in online conversations. Initially attempted to fully
copy the page allocator's pcplist locking, but its reliance on
spin_trylock() made it much more costly.

Cc: Mateusz Guzik <mjguzik@...il.com>
Cc: Jann Horn <jannh@...gle.com>
Signed-off-by: Vlastimil Babka <vbabka@...e.cz>
---
 mm/slub.c | 230 +++++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 174 insertions(+), 56 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 6811d766c0470cd7066c2574ad86e00405c916bb..1900afa6153ca6d88f9df7db3ce84d98629489e7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -450,14 +450,111 @@ struct slab_sheaf {
 	void *objects[];
 };
 
+struct local_tryirq_lock {
+#ifndef CONFIG_PREEMPT_RT
+	int active;
+#else
+	local_lock_t llock;
+#endif
+};
+
 struct slub_percpu_sheaves {
-	local_lock_t lock;
+	struct local_tryirq_lock lock;
 	struct slab_sheaf *main; /* never NULL when unlocked */
 	struct slab_sheaf *spare; /* empty or full, may be NULL */
 	struct slab_sheaf *rcu_free;
 	struct node_barn *barn;
 };
 
+/*
+ * Generic helper to lookup a per-cpu variable with a lock that allows only
+ * trylock from irq handler context to avoid expensive irq disable or atomic
+ * operations and memory barriers - only compiler barriers are needed.
+ *
+ * On !PREEMPT_RT this is done by get_cpu_ptr(), which disables preemption, and
+ * checking that a variable is not already set to 1. If it is, it means we are
+ * in irq handler that has interrupted the locked operation, and must give up.
+ * Otherwise we set the variable to 1.
+ *
+ * On PREEMPT_RT we can simply use local_lock() as that does the right thing
+ * without actually disabling irqs. Thus the trylock can't actually fail.
+ *
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+#define pcpu_local_tryirq_lock(type, member, ptr)                       \
+({                                                                      \
+	type *_ret;                                                     \
+	lockdep_assert(!irq_count());					\
+	_ret = get_cpu_ptr(ptr);                                        \
+	lockdep_assert(_ret->member.active == 0);			\
+	WRITE_ONCE(_ret->member.active, 1);				\
+	barrier();							\
+	_ret;                                                           \
+})
+
+#define pcpu_local_tryirq_trylock(type, member, ptr)                    \
+({                                                                      \
+	type *_ret;                                                     \
+	_ret = get_cpu_ptr(ptr);                                        \
+	if (unlikely(READ_ONCE(_ret->member.active) == 1)) {		\
+		put_cpu_ptr(ptr);					\
+		_ret = NULL;						\
+	} else {                                                        \
+		WRITE_ONCE(_ret->member.active, 1);			\
+		barrier();						\
+	}								\
+	_ret;                                                           \
+})
+
+#define pcpu_local_tryirq_unlock(member, ptr)                           \
+({                                                                      \
+	lockdep_assert(this_cpu_ptr(ptr)->member.active == 1);		\
+	barrier();							\
+	WRITE_ONCE(this_cpu_ptr(ptr)->member.active, 0);		\
+	put_cpu_ptr(ptr);						\
+})
+
+#define local_tryirq_lock_init(lock)					\
+({									\
+	(lock)->active = 0;						\
+})
+
+#else
+
+#define pcpu_local_tryirq_lock(type, member, ptr)                       \
+({                                                                      \
+	type *_ret;                                                     \
+	local_lock(&ptr->member.llock);					\
+	_ret = this_cpu_ptr(ptr);                                       \
+	_ret;                                                           \
+})
+
+#define pcpu_local_tryirq_trylock(type, member, ptr)                    \
+	pcpu_local_tryirq_lock(type, member, ptr)
+
+#define pcpu_local_tryirq_unlock(member, ptr)                           \
+({                                                                      \
+	local_unlock(&ptr->member.llock);				\
+})
+
+#define local_tryirq_lock_init(lock)					\
+({									\
+	local_lock_init(&(lock)->llock);				\
+})
+
+#endif
+
+/* struct slub_percpu_sheaves specific helpers. */
+#define cpu_sheaves_lock(ptr)                                           \
+	pcpu_local_tryirq_lock(struct slub_percpu_sheaves, lock, ptr)
+
+#define cpu_sheaves_trylock(ptr)                                        \
+	pcpu_local_tryirq_trylock(struct slub_percpu_sheaves, lock, ptr)
+
+#define cpu_sheaves_unlock(ptr)                                         \
+	pcpu_local_tryirq_unlock(lock, ptr)
+
 /*
  * The slab lists for all objects.
  */
@@ -2517,17 +2614,20 @@ static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
 
 static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
 
-static void sheaf_flush_main(struct kmem_cache *s)
+/* returns true if at least partially flushed */
+static bool sheaf_flush_main(struct kmem_cache *s)
 {
 	struct slub_percpu_sheaves *pcs;
 	unsigned int batch, remaining;
 	void *objects[PCS_BATCH_MAX];
 	struct slab_sheaf *sheaf;
-	unsigned long flags;
+	bool ret = false;
 
 next_batch:
-	local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-	pcs = this_cpu_ptr(s->cpu_sheaves);
+	pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+	if (!pcs)
+		return ret;
+
 	sheaf = pcs->main;
 
 	batch = min(PCS_BATCH_MAX, sheaf->size);
@@ -2537,14 +2637,18 @@ static void sheaf_flush_main(struct kmem_cache *s)
 
 	remaining = sheaf->size;
 
-	local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+	cpu_sheaves_unlock(s->cpu_sheaves);
 
 	__kmem_cache_free_bulk(s, batch, &objects[0]);
 
 	stat_add(s, SHEAF_FLUSH_MAIN, batch);
 
+	ret = true;
+
 	if (remaining)
 		goto next_batch;
+
+	return ret;
 }
 
 static void sheaf_flush(struct kmem_cache *s, struct slab_sheaf *sheaf)
@@ -2581,6 +2685,8 @@ static void rcu_free_sheaf_nobarn(struct rcu_head *head)
  * Caller needs to make sure migration is disabled in order to fully flush
  * single cpu's sheaves
  *
+ * must not be called from an irq
+ *
  * flushing operations are rare so let's keep it simple and flush to slabs
  * directly, skipping the barn
  */
@@ -2588,10 +2694,8 @@ static void pcs_flush_all(struct kmem_cache *s)
 {
 	struct slub_percpu_sheaves *pcs;
 	struct slab_sheaf *spare, *rcu_free;
-	unsigned long flags;
 
-	local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-	pcs = this_cpu_ptr(s->cpu_sheaves);
+	pcs = cpu_sheaves_lock(s->cpu_sheaves);
 
 	spare = pcs->spare;
 	pcs->spare = NULL;
@@ -2599,7 +2703,7 @@ static void pcs_flush_all(struct kmem_cache *s)
 	rcu_free = pcs->rcu_free;
 	pcs->rcu_free = NULL;
 
-	local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+	cpu_sheaves_unlock(s->cpu_sheaves);
 
 	if (spare) {
 		sheaf_flush(s, spare);
@@ -4523,11 +4627,11 @@ static __fastpath_inline
 void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
 {
 	struct slub_percpu_sheaves *pcs;
-	unsigned long flags;
 	void *object;
 
-	local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-	pcs = this_cpu_ptr(s->cpu_sheaves);
+	pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+	if (!pcs)
+		return NULL;
 
 	if (unlikely(pcs->main->size == 0)) {
 
@@ -4559,7 +4663,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
 			}
 		}
 
-		local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+		cpu_sheaves_unlock(s->cpu_sheaves);
 
 		if (!can_alloc)
 			return NULL;
@@ -4581,8 +4685,11 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
 		if (!full)
 			return NULL;
 
-		local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-		pcs = this_cpu_ptr(s->cpu_sheaves);
+		/*
+		 * we can reach here only when gfpflags_allow_blocking
+		 * so this must not be an irq
+		 */
+		pcs = cpu_sheaves_lock(s->cpu_sheaves);
 
 		/*
 		 * If we are returning empty sheaf, we either got it from the
@@ -4615,7 +4722,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
 do_alloc:
 	object = pcs->main->objects[--pcs->main->size];
 
-	local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+	cpu_sheaves_unlock(s->cpu_sheaves);
 
 	stat(s, ALLOC_PCS);
 
@@ -4627,13 +4734,13 @@ unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 {
 	struct slub_percpu_sheaves *pcs;
 	struct slab_sheaf *main;
-	unsigned long flags;
 	unsigned int allocated = 0;
 	unsigned int batch;
 
 next_batch:
-	local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-	pcs = this_cpu_ptr(s->cpu_sheaves);
+	pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+	if (!pcs)
+		return allocated;
 
 	if (unlikely(pcs->main->size == 0)) {
 
@@ -4652,7 +4759,7 @@ unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 			goto do_alloc;
 		}
 
-		local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+		cpu_sheaves_unlock(s->cpu_sheaves);
 
 		/*
 		 * Once full sheaves in barn are depleted, let the bulk
@@ -4670,7 +4777,7 @@ unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 	main->size -= batch;
 	memcpy(p, main->objects + main->size, batch * sizeof(void *));
 
-	local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+	cpu_sheaves_unlock(s->cpu_sheaves);
 
 	stat_add(s, ALLOC_PCS, batch);
 
@@ -5090,14 +5197,14 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
  * The object is expected to have passed slab_free_hook() already.
  */
 static __fastpath_inline
-void free_to_pcs(struct kmem_cache *s, void *object)
+bool free_to_pcs(struct kmem_cache *s, void *object)
 {
 	struct slub_percpu_sheaves *pcs;
-	unsigned long flags;
 
 restart:
-	local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-	pcs = this_cpu_ptr(s->cpu_sheaves);
+	pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+	if (!pcs)
+		return false;
 
 	if (unlikely(pcs->main->size == s->sheaf_capacity)) {
 
@@ -5131,7 +5238,7 @@ void free_to_pcs(struct kmem_cache *s, void *object)
 			struct slab_sheaf *to_flush = pcs->spare;
 
 			pcs->spare = NULL;
-			local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+			cpu_sheaves_unlock(s->cpu_sheaves);
 
 			sheaf_flush(s, to_flush);
 			empty = to_flush;
@@ -5139,18 +5246,27 @@ void free_to_pcs(struct kmem_cache *s, void *object)
 		}
 
 alloc_empty:
-		local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+		cpu_sheaves_unlock(s->cpu_sheaves);
 
 		empty = alloc_empty_sheaf(s, GFP_NOWAIT);
 
 		if (!empty) {
-			sheaf_flush_main(s);
-			goto restart;
+			if (sheaf_flush_main(s))
+				goto restart;
+			else
+				return false;
 		}
 
 got_empty:
-		local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-		pcs = this_cpu_ptr(s->cpu_sheaves);
+		pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+		if (!pcs) {
+			struct node_barn *barn;
+
+			barn = get_node(s, numa_mem_id())->barn;
+
+			barn_put_empty_sheaf(barn, empty, true);
+			return false;
+		}
 
 		/*
 		 * if we put any sheaf to barn here, it's because we raced or
@@ -5178,9 +5294,11 @@ void free_to_pcs(struct kmem_cache *s, void *object)
 do_free:
 	pcs->main->objects[pcs->main->size++] = object;
 
-	local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+	cpu_sheaves_unlock(s->cpu_sheaves);
 
 	stat(s, FREE_PCS);
+
+	return true;
 }
 
 static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
@@ -5242,10 +5360,10 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
 {
 	struct slub_percpu_sheaves *pcs;
 	struct slab_sheaf *rcu_sheaf;
-	unsigned long flags;
 
-	local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-	pcs = this_cpu_ptr(s->cpu_sheaves);
+	pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+	if (!pcs)
+		goto fail;
 
 	if (unlikely(!pcs->rcu_free)) {
 
@@ -5258,17 +5376,16 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
 			goto do_free;
 		}
 
-		local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+		cpu_sheaves_unlock(s->cpu_sheaves);
 
 		empty = alloc_empty_sheaf(s, GFP_NOWAIT);
 
-		if (!empty) {
-			stat(s, FREE_RCU_SHEAF_FAIL);
-			return false;
-		}
+		if (!empty)
+			goto fail;
 
-		local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-		pcs = this_cpu_ptr(s->cpu_sheaves);
+		pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+		if (!pcs)
+			goto fail;
 
 		if (unlikely(pcs->rcu_free))
 			barn_put_empty_sheaf(pcs->barn, empty, true);
@@ -5283,19 +5400,22 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
 	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
 
 	if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
-		local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+		cpu_sheaves_unlock(s->cpu_sheaves);
 		stat(s, FREE_RCU_SHEAF);
 		return true;
 	}
 
 	pcs->rcu_free = NULL;
-	local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+	cpu_sheaves_unlock(s->cpu_sheaves);
 
 	call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
 
 	stat(s, FREE_RCU_SHEAF);
-
 	return true;
+
+fail:
+	stat(s, FREE_RCU_SHEAF_FAIL);
+	return false;
 }
 
 /*
@@ -5307,7 +5427,6 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 {
 	struct slub_percpu_sheaves *pcs;
 	struct slab_sheaf *main;
-	unsigned long flags;
 	unsigned int batch, i = 0;
 	bool init;
 
@@ -5330,8 +5449,9 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 	}
 
 next_batch:
-	local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-	pcs = this_cpu_ptr(s->cpu_sheaves);
+	pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+	if (!pcs)
+		goto fallback;
 
 	if (unlikely(pcs->main->size == s->sheaf_capacity)) {
 
@@ -5361,13 +5481,13 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 		}
 
 no_empty:
-		local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+		cpu_sheaves_unlock(s->cpu_sheaves);
 
 		/*
 		 * if we depleted all empty sheaves in the barn or there are too
 		 * many full sheaves, free the rest to slab pages
 		 */
-
+fallback:
 		__kmem_cache_free_bulk(s, size, p);
 		return;
 	}
@@ -5379,7 +5499,7 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 	memcpy(main->objects + main->size, p, batch * sizeof(void *));
 	main->size += batch;
 
-	local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+	cpu_sheaves_unlock(s->cpu_sheaves);
 
 	stat_add(s, FREE_PCS, batch);
 
@@ -5479,9 +5599,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
 	if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
 		return;
 
-	if (s->cpu_sheaves)
-		free_to_pcs(s, object);
-	else
+	if (!s->cpu_sheaves || !free_to_pcs(s, object))
 		do_slab_free(s, slab, object, object, 1, addr);
 }
 
@@ -6121,7 +6239,7 @@ static int init_percpu_sheaves(struct kmem_cache *s)
 
 		pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 
-		local_lock_init(&pcs->lock);
+		local_tryirq_lock_init(&pcs->lock);
 
 		nid = cpu_to_mem(cpu);
 

-- 
2.47.0