Message-ID: <6711935d-20b5-41c1-8864-db3fc7d7823d@suse.cz>
Date: Mon, 17 Jun 2024 19:23:36 +0200
From: Vlastimil Babka <vbabka@...e.cz>
To: paulmck@...nel.org
Cc: "Jason A. Donenfeld" <Jason@...c4.com>,
 "Uladzislau Rezki (Sony)" <urezki@...il.com>,
 Jakub Kicinski <kuba@...nel.org>, Julia Lawall <Julia.Lawall@...ia.fr>,
 linux-block@...r.kernel.org, kernel-janitors@...r.kernel.org,
 bridge@...ts.linux.dev, linux-trace-kernel@...r.kernel.org,
 Mathieu Desnoyers <mathieu.desnoyers@...icios.com>, kvm@...r.kernel.org,
 linuxppc-dev@...ts.ozlabs.org, "Naveen N. Rao" <naveen.n.rao@...ux.ibm.com>,
 Christophe Leroy <christophe.leroy@...roup.eu>,
 Nicholas Piggin <npiggin@...il.com>, netdev@...r.kernel.org,
 wireguard@...ts.zx2c4.com, linux-kernel@...r.kernel.org,
 ecryptfs@...r.kernel.org, Neil Brown <neilb@...e.de>,
 Olga Kornievskaia <kolga@...app.com>, Dai Ngo <Dai.Ngo@...cle.com>,
 Tom Talpey <tom@...pey.com>, linux-nfs@...r.kernel.org,
 linux-can@...r.kernel.org, Lai Jiangshan <jiangshanlai@...il.com>,
 netfilter-devel@...r.kernel.org, coreteam@...filter.org,
 kasan-dev <kasan-dev@...glegroups.com>
Subject: Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple
 kmem_cache_free callback

On 6/17/24 6:12 PM, Paul E. McKenney wrote:
> On Mon, Jun 17, 2024 at 05:10:50PM +0200, Vlastimil Babka wrote:
>> On 6/13/24 2:22 PM, Jason A. Donenfeld wrote:
>> > On Wed, Jun 12, 2024 at 08:38:02PM -0700, Paul E. McKenney wrote:
>> >> o	Make the current kmem_cache_destroy() asynchronously wait for
>> >> 	all memory to be returned, then complete the destruction.
>> >> 	(This gets rid of a valuable debugging technique because
>> >> 	in normal use, it is a bug to attempt to destroy a kmem_cache
>> >> 	that has objects still allocated.)
>> 
>> This seems like the best option to me. As Jason already said, the debugging
>> technique is not affected significantly if the warning just occurs
>> asynchronously later. The module can already be unloaded at that point; as
>> the leak is never checked programmatically to control further execution,
>> it's just a splat in dmesg.
> 
> Works for me!

Great. So this is what a prototype could look like, hopefully? The kunit test
does generate the splat for me, presumably because the rcu_barrier() in the
implementation (marked to be replaced with the real thing) is really
insufficient. Note that the test itself still passes, as this kind of error
isn't wired up properly.

Another thing to resolve is the marked comment about kasan_cache_shutdown()
with potential kfree_rcu()'s in flight.
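To spell out the concern as a timeline (my reconstruction of the race, hedged):

	destroying context			kfree_rcu machinery
	------------------			-------------------
	kmem_cache_destroy(s)
	  shutdown_cache(s, false)
	    kasan_cache_shutdown(s)
	    __kmem_cache_shutdown() fails	object still sits in a
	  schedule_work(&s->async_destroy_work)	kfree_rcu batch
	workfn:
	  rcu_barrier()				grace period ends, object
	  shutdown_cache(s, true)		is actually freed
	    kasan_cache_shutdown(s)	<- second call; can a free that
					   completes in parallel still
					   interact with the quarantine?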

Also, you need CONFIG_SLUB_DEBUG enabled; otherwise node_nr_slabs() is a no-op
and the shutdown check might fail to notice the pending slabs. This will need
to change.
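For context, this is roughly what mm/slub.c does today (paraphrased):

#ifdef CONFIG_SLUB_DEBUG
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->nr_slabs);
}
#else
/*
 * Without CONFIG_SLUB_DEBUG the per-node slab count is not maintained,
 * so the check in __kmem_cache_shutdown() compares against a constant 0.
 */
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{
	return 0;
}
#endif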

----8<----
diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c
index e6667a28c014..e3e4d0ca40b7 100644
--- a/lib/slub_kunit.c
+++ b/lib/slub_kunit.c
@@ -5,6 +5,7 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/rcupdate.h>
 #include "../mm/slab.h"
 
 static struct kunit_resource resource;
@@ -157,6 +158,26 @@ static void test_kmalloc_redzone_access(struct kunit *test)
 	kmem_cache_destroy(s);
 }
 
+struct test_kfree_rcu_struct {
+	struct rcu_head rcu;
+};
+
+static void test_kfree_rcu(struct kunit *test)
+{
+	struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu",
+				sizeof(struct test_kfree_rcu_struct),
+				SLAB_NO_MERGE);
+	struct test_kfree_rcu_struct *p = kmem_cache_alloc(s, GFP_KERNEL);
+
+	kasan_disable_current();
+
+	KUNIT_EXPECT_EQ(test, 0, slab_errors);
+
+	kasan_enable_current();
+	kfree_rcu(p, rcu);
+	kmem_cache_destroy(s);
+}
+
 static int test_init(struct kunit *test)
 {
 	slab_errors = 0;
@@ -177,6 +198,7 @@ static struct kunit_case test_cases[] = {
 
 	KUNIT_CASE(test_clobber_redzone_free),
 	KUNIT_CASE(test_kmalloc_redzone_access),
+	KUNIT_CASE(test_kfree_rcu),
 	{}
 };
 
diff --git a/mm/slab.h b/mm/slab.h
index b16e63191578..a0295600af92 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -277,6 +277,8 @@ struct kmem_cache {
 	unsigned int red_left_pad;	/* Left redzone padding size */
 	const char *name;		/* Name (only for display!) */
 	struct list_head list;		/* List of slab caches */
+	struct work_struct async_destroy_work;
+
 #ifdef CONFIG_SYSFS
 	struct kobject kobj;		/* For sysfs */
 #endif
@@ -474,7 +476,7 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
 			      SLAB_NO_USER_FLAGS)
 
 bool __kmem_cache_empty(struct kmem_cache *);
-int __kmem_cache_shutdown(struct kmem_cache *);
+int __kmem_cache_shutdown(struct kmem_cache *, bool);
 void __kmem_cache_release(struct kmem_cache *);
 int __kmem_cache_shrink(struct kmem_cache *);
 void slab_kmem_cache_release(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5b1f996bed06..c5c356d0235d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -44,6 +44,8 @@ static LIST_HEAD(slab_caches_to_rcu_destroy);
 static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
 static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
 		    slab_caches_to_rcu_destroy_workfn);
+static void kmem_cache_kfree_rcu_destroy_workfn(struct work_struct *work);
+
 
 /*
  * Set of flags that will prevent slab merging
@@ -234,6 +236,7 @@ static struct kmem_cache *create_cache(const char *name,
 
 	s->refcount = 1;
 	list_add(&s->list, &slab_caches);
+	INIT_WORK(&s->async_destroy_work, kmem_cache_kfree_rcu_destroy_workfn);
 	return s;
 
 out_free_cache:
@@ -449,12 +452,16 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
 	}
 }
 
-static int shutdown_cache(struct kmem_cache *s)
+static int shutdown_cache(struct kmem_cache *s, bool warn_inuse)
 {
 	/* free asan quarantined objects */
+	/*
+	 * XXX: is it ok to call this multiple times? and what happens with a
+	 * kfree_rcu() in flight that finishes after or in parallel with this?
+	 */
 	kasan_cache_shutdown(s);
 
-	if (__kmem_cache_shutdown(s) != 0)
+	if (__kmem_cache_shutdown(s, warn_inuse) != 0)
 		return -EBUSY;
 
 	list_del(&s->list);
@@ -477,6 +484,32 @@ void slab_kmem_cache_release(struct kmem_cache *s)
 	kmem_cache_free(kmem_cache, s);
 }
 
+static void kmem_cache_kfree_rcu_destroy_workfn(struct work_struct *work)
+{
+	struct kmem_cache *s;
+	int err = -EBUSY;
+	bool rcu_set;
+
+	s = container_of(work, struct kmem_cache, async_destroy_work);
+
+	// XXX use the real kmem_cache_free_barrier() or similar thing here
+	rcu_barrier();
+
+	cpus_read_lock();
+	mutex_lock(&slab_mutex);
+
+	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
+
+	err = shutdown_cache(s, true);
+	WARN(err, "kmem_cache_destroy %s: Slab cache still has objects",
+	     s->name);
+
+	mutex_unlock(&slab_mutex);
+	cpus_read_unlock();
+	if (!err && !rcu_set)
+		kmem_cache_release(s);
+}
+
 void kmem_cache_destroy(struct kmem_cache *s)
 {
 	int err = -EBUSY;
@@ -494,9 +527,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	if (s->refcount)
 		goto out_unlock;
 
-	err = shutdown_cache(s);
-	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
-	     __func__, s->name, (void *)_RET_IP_);
+	err = shutdown_cache(s, false);
+	if (err)
+		schedule_work(&s->async_destroy_work);
 out_unlock:
 	mutex_unlock(&slab_mutex);
 	cpus_read_unlock();
diff --git a/mm/slub.c b/mm/slub.c
index 1617d8014ecd..4d435b3d2b5f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5342,7 +5342,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
  * This is called from __kmem_cache_shutdown(). We must take list_lock
  * because sysfs file might still access partial list after the shutdowning.
  */
-static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
+static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n,
+			 bool warn_inuse)
 {
 	LIST_HEAD(discard);
 	struct slab *slab, *h;
@@ -5353,7 +5354,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 		if (!slab->inuse) {
 			remove_partial(n, slab);
 			list_add(&slab->slab_list, &discard);
-		} else {
+		} else if (warn_inuse) {
 			list_slab_objects(s, slab,
 			  "Objects remaining in %s on __kmem_cache_shutdown()");
 		}
@@ -5378,7 +5379,7 @@ bool __kmem_cache_empty(struct kmem_cache *s)
 /*
  * Release all resources used by a slab cache.
  */
-int __kmem_cache_shutdown(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s, bool warn_inuse)
 {
 	int node;
 	struct kmem_cache_node *n;
@@ -5386,7 +5387,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	flush_all_cpus_locked(s);
 	/* Attempt to free all objects */
 	for_each_kmem_cache_node(s, node, n) {
-		free_partial(s, n);
+		free_partial(s, n, warn_inuse);
 		if (n->nr_partial || node_nr_slabs(n))
 			return 1;
 	}

