Message-ID: <20250424080755.272925-5-harry.yoo@oracle.com>
Date: Thu, 24 Apr 2025 17:07:52 +0900
From: Harry Yoo <harry.yoo@...cle.com>
To: Vlastimil Babka <vbabka@...e.cz>, Christoph Lameter <cl@...two.org>,
        David Rientjes <rientjes@...gle.com>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Dennis Zhou <dennis@...nel.org>, Tejun Heo <tj@...nel.org>,
        Mateusz Guzik <mjguzik@...il.com>
Cc: Jamal Hadi Salim <jhs@...atatu.com>, Cong Wang <xiyou.wangcong@...il.com>,
        Jiri Pirko <jiri@...nulli.us>, Vlad Buslov <vladbu@...dia.com>,
        Yevgeny Kliteynik <kliteyn@...dia.com>, Jan Kara <jack@...e.cz>,
        Byungchul Park <byungchul@...com>, linux-mm@...ck.org,
        netdev@...r.kernel.org, linux-kernel@...r.kernel.org,
        Harry Yoo <harry.yoo@...cle.com>
Subject: [RFC PATCH 4/7] net/sched/act_api: use slab ctor/dtor to reduce contention on pcpu alloc

A tc_action object allocates three percpu memory regions and stores their
pointers within the object. Over each object's lifetime, this requires
acquiring and releasing the global lock, pcpu_alloc_mutex, three times.
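
For reference, the three regions are the percpu pointers declared in
struct tc_action (include/net/act_api.h), the same fields touched by the
diff below:

	struct gnet_stats_basic_sync __percpu *cpu_bstats;
	struct gnet_stats_basic_sync __percpu *cpu_bstats_hw;
	struct gnet_stats_queue __percpu *cpu_qstats;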

In workloads that frequently create and destroy TC filters, this leads
to severe contention on that global lock.

By using the slab constructor/destructor pair, the contention on
pcpu_alloc_mutex is shifted to the creation and destruction of slabs
(each of which contains multiple objects), which occur far less
frequently than the allocation and freeing of individual tc_action
objects.
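
The general shape of the pattern, as a minimal sketch (this assumes the
int-returning ctor and the dtor support introduced earlier in this
series; mainline ctors return void and have no dtor counterpart):

	static struct kmem_cache *cache;

	struct obj {
		u64 __percpu *stats;
	};

	/* Runs once per object when a new slab is populated, not on
	 * every kmem_cache_alloc(), so pcpu_alloc_mutex is only taken
	 * when slabs are created.
	 */
	static int obj_ctor(void *object)
	{
		struct obj *o = object;

		o->stats = alloc_percpu(u64);
		return o->stats ? 0 : -ENOMEM;
	}

	/* Runs when the backing slab itself is torn down. */
	static void obj_dtor(void *object)
	{
		struct obj *o = object;

		free_percpu(o->stats);
	}

	static struct kmem_cache_args args = {
		.ctor = obj_ctor,
		.dtor = obj_dtor,
	};

	static int __init obj_cache_init(void)
	{
		cache = kmem_cache_create("obj", sizeof(struct obj),
					  &args, 0);
		return cache ? 0 : -ENOMEM;
	}

Objects handed out by kmem_cache_alloc() then arrive with their percpu
region already allocated, and keep it across free/alloc cycles until
the backing slab is destroyed.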

When tested with the following commands, a 26% reduction in system time
was observed:

$ cd tools/testing/selftests/tc-testing
$ sudo python3 tdc.py -f ./tc-tests/filters/flower.json -d <NIC name>

Lock contention as measured with `perf lock record/report`:
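
For reference, an invocation along these lines (the exact options used
are not shown in this message):

$ sudo perf lock record -- python3 tdc.py -f ./tc-tests/filters/flower.json -d <NIC name>
$ sudo perf lock report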

Before:
                Name   acquired  contended     avg wait   total wait     max wait     min wait

                       15042346   15042346      3.82 us     57.40 s       4.00 ms       316 ns
    pcpu_alloc_mutex   10959650   10959650      7.10 us      1.30 m       3.76 ms       313 ns

After:
                Name   acquired  contended     avg wait   total wait     max wait     min wait

                       15488031   15488031      5.16 us      1.33 m  5124095.50 h        316 ns
    pcpu_alloc_mutex    7695276    7695276      3.39 us     26.07 s       4.03 ms       284 ns

The contention has moved from pcpu_alloc_mutex to other locks, which are
not symbolized and therefore appear blank in the output above.

Signed-off-by: Harry Yoo <harry.yoo@...cle.com>
---
 net/sched/act_api.c | 84 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 58 insertions(+), 26 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 839790043256..60cde766135a 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -112,6 +112,8 @@ struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
 }
 EXPORT_SYMBOL(tcf_action_set_ctrlact);
 
+static struct kmem_cache *tc_action_cache;
+
 /* XXX: For standalone actions, we don't need a RCU grace period either, because
  * actions are always connected to filters and filters are already destroyed in
  * RCU callbacks, so after a RCU grace period actions are already disconnected
@@ -121,15 +123,15 @@ static void free_tcf(struct tc_action *p)
 {
 	struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1);
 
-	free_percpu(p->cpu_bstats);
-	free_percpu(p->cpu_bstats_hw);
-	free_percpu(p->cpu_qstats);
 
 	tcf_set_action_cookie(&p->user_cookie, NULL);
 	if (chain)
 		tcf_chain_put_by_act(chain);
 
-	kfree(p);
+	if (p->cpu_bstats)
+		kmem_cache_free(tc_action_cache, p);
+	else
+		kfree(p);
 }
 
 static void offload_action_hw_count_set(struct tc_action *act,
@@ -778,27 +780,20 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 		   struct tc_action **a, const struct tc_action_ops *ops,
 		   int bind, bool cpustats, u32 flags)
 {
-	struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
+	struct tc_action *p;
 	struct tcf_idrinfo *idrinfo = tn->idrinfo;
 	int err = -ENOMEM;
 
+	if (cpustats)
+		p = kmem_cache_alloc(tc_action_cache, GFP_KERNEL);
+	else
+		p = kzalloc(ops->size, GFP_KERNEL);
+
 	if (unlikely(!p))
 		return -ENOMEM;
 	refcount_set(&p->tcfa_refcnt, 1);
 	if (bind)
 		atomic_set(&p->tcfa_bindcnt, 1);
-
-	if (cpustats) {
-		p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
-		if (!p->cpu_bstats)
-			goto err1;
-		p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
-		if (!p->cpu_bstats_hw)
-			goto err2;
-		p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
-		if (!p->cpu_qstats)
-			goto err3;
-	}
 	gnet_stats_basic_sync_init(&p->tcfa_bstats);
 	gnet_stats_basic_sync_init(&p->tcfa_bstats_hw);
 	spin_lock_init(&p->tcfa_lock);
@@ -812,7 +807,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 					&p->tcfa_rate_est,
 					&p->tcfa_lock, false, est);
 		if (err)
-			goto err4;
+			goto err;
 	}
 
 	p->idrinfo = idrinfo;
@@ -820,14 +815,11 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 	p->ops = ops;
 	*a = p;
 	return 0;
-err4:
-	free_percpu(p->cpu_qstats);
-err3:
-	free_percpu(p->cpu_bstats_hw);
-err2:
-	free_percpu(p->cpu_bstats);
-err1:
-	kfree(p);
+err:
+	if (cpustats)
+		kmem_cache_free(tc_action_cache, p);
+	else
+		kfree(p);
 	return err;
 }
 EXPORT_SYMBOL(tcf_idr_create);
@@ -2270,8 +2262,48 @@ static const struct rtnl_msg_handler tc_action_rtnl_msg_handlers[] __initconst =
 	 .dumpit = tc_dump_action},
 };
 
+static int tcf_action_ctor(void *object)
+{
+	struct tc_action *p = object;
+
+	p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
+	if (!p->cpu_bstats)
+		goto err1;
+	p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
+	if (!p->cpu_bstats_hw)
+		goto err2;
+	p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+	if (!p->cpu_qstats)
+		goto err3;
+	return 0;
+
+err3:
+	free_percpu(p->cpu_bstats_hw);
+err2:
+	free_percpu(p->cpu_bstats);
+err1:
+	return -ENOMEM;
+}
+
+static void tcf_action_dtor(void *object)
+{
+	struct tc_action *p = object;
+
+	free_percpu(p->cpu_bstats);
+	free_percpu(p->cpu_bstats_hw);
+	free_percpu(p->cpu_qstats);
+}
+
 static int __init tc_action_init(void)
 {
+	struct kmem_cache_args kmem_args = {
+		.ctor = tcf_action_ctor,
+		.dtor = tcf_action_dtor,
+	};
+
+	tc_action_cache = kmem_cache_create("tc_action",
+					    sizeof(struct tc_action),
+					    &kmem_args, SLAB_PANIC);
 	rtnl_register_many(tc_action_rtnl_msg_handlers);
 	return 0;
 }
-- 
2.43.0

