netdev - [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240326230319.190117-1-jhs@mojatatu.com>
Date: Tue, 26 Mar 2024 19:03:19 -0400
From: Jamal Hadi Salim <jhs@...atatu.com>
To: davem@...emloft.net,
	kuba@...nel.org,
	edumazet@...gle.com,
	pabeni@...hat.com
Cc: jiri@...nulli.us,
	xiyou.wangcong@...il.com,
	netdev@...r.kernel.org,
	renmingshuai@...wei.com,
	Jamal Hadi Salim <jhs@...atatu.com>,
	Victor Nogueira <victor@...atatu.com>
Subject: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion

When the mirred action is used on a classful egress qdisc and a packet is
mirrored or redirected to itself, we hit a qdisc lock deadlock.
See trace below.

[..... other info removed for brevity....]
[   82.890906]
[   82.890906] ============================================
[   82.890906] WARNING: possible recursive locking detected
[   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
[   82.890906] --------------------------------------------
[   82.890906] ping/418 is trying to acquire lock:
[   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
__dev_queue_xmit+0x1778/0x3550
[   82.890906]
[   82.890906] but task is already holding lock:
[   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
__dev_queue_xmit+0x1778/0x3550
[   82.890906]
[   82.890906] other info that might help us debug this:
[   82.890906]  Possible unsafe locking scenario:
[   82.890906]
[   82.890906]        CPU0
[   82.890906]        ----
[   82.890906]   lock(&sch->q.lock);
[   82.890906]   lock(&sch->q.lock);
[   82.890906]
[   82.890906]  *** DEADLOCK ***
[   82.890906]
[..... other info removed for brevity....]

Example setup (eth0->eth0) to recreate
tc qdisc add dev eth0 root handle 1: htb default 30
tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
     action mirred egress redirect dev eth0

Another example (eth0->eth1->eth0) to recreate:
tc qdisc add dev eth0 root handle 1: htb default 30
tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
     action mirred egress redirect dev eth1

tc qdisc add dev eth1 root handle 1: htb default 30
tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
     action mirred egress redirect dev eth0

We fix this by adding a per-cpu, per-qdisc recursion counter which is
incremented when a root qdisc is entered and decremented when it is left.
On a second attempt to enter the same root qdisc from the top while the
counter is non-zero, the packet is dropped to break the loop.

Reported-by: renmingshuai@...wei.com
Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
Co-developed-by: Victor Nogueira <victor@...atatu.com>
Signed-off-by: Victor Nogueira <victor@...atatu.com>
Signed-off-by: Jamal Hadi Salim <jhs@...atatu.com>
---
 include/net/sch_generic.h |  2 ++
 net/core/dev.c            |  9 +++++++++
 net/sched/sch_api.c       | 12 ++++++++++++
 net/sched/sch_generic.c   |  2 ++
 4 files changed, 25 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index cefe0c4bdae3..f9f99df037ed 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -125,6 +125,8 @@ struct Qdisc {
 	spinlock_t		busylock ____cacheline_aligned_in_smp;
 	spinlock_t		seqlock;
 
+	u16 __percpu            *xmit_recursion;
+
 	struct rcu_head		rcu;
 	netdevice_tracker	dev_tracker;
 	/* private data */
diff --git a/net/core/dev.c b/net/core/dev.c
index 9a67003e49db..2b712388c06f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	if (unlikely(contended))
 		spin_lock(&q->busylock);
 
+	if (__this_cpu_read(*q->xmit_recursion) > 0) {
+		__qdisc_drop(skb, &to_free);
+		rc = NET_XMIT_DROP;
+		goto free_skb_list;
+	}
+
+	__this_cpu_inc(*q->xmit_recursion);
 	spin_lock(root_lock);
 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 		__qdisc_drop(skb, &to_free);
@@ -3825,7 +3832,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		}
 	}
 	spin_unlock(root_lock);
+	__this_cpu_dec(*q->xmit_recursion);
 	if (unlikely(to_free))
+free_skb_list:
 		kfree_skb_list_reason(to_free,
 				      tcf_get_drop_reason(to_free));
 	if (unlikely(contended))
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 65e05b0c98e4..6c3bc1aff89a 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1260,6 +1260,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 	struct Qdisc *sch;
 	struct Qdisc_ops *ops;
 	struct qdisc_size_table *stab;
+	int cpu;
 
 	ops = qdisc_lookup_ops(kind);
 #ifdef CONFIG_MODULES
@@ -1376,11 +1377,22 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 		}
 	}
 
+	sch->xmit_recursion = alloc_percpu(u16);
+	if (!sch->xmit_recursion) {
+		err = -ENOMEM;
+		goto err_out5;
+	}
+	for_each_possible_cpu(cpu)
+		(*per_cpu_ptr(sch->xmit_recursion, cpu)) = 0;
+
 	qdisc_hash_add(sch, false);
 	trace_qdisc_create(ops, dev, parent);
 
 	return sch;
 
+err_out5:
+	if (tca[TCA_RATE])
+		gen_kill_estimator(&sch->rate_est);
 err_out4:
 	/* Even if ops->init() failed, we call ops->destroy()
 	 * like qdisc_create_dflt().
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ff5336493777..afbbd2e885a4 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1070,6 +1070,8 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
 	module_put(ops->owner);
 	netdev_put(dev, &qdisc->dev_tracker);
 
+	free_percpu(qdisc->xmit_recursion);
+
 	trace_qdisc_destroy(qdisc);
 
 	call_rcu(&qdisc->rcu, qdisc_free_cb);
-- 
2.34.1