Message-ID: <20160817193738.27032.25592.stgit@john-Precision-Tower-5810>
Date:	Wed, 17 Aug 2016 12:37:38 -0700
From:	John Fastabend <john.fastabend@...il.com>
To:	xiyou.wangcong@...il.com, jhs@...atatu.com,
	alexei.starovoitov@...il.com, eric.dumazet@...il.com,
	brouer@...hat.com
Cc:	john.r.fastabend@...el.com, netdev@...r.kernel.org,
	john.fastabend@...il.com, davem@...emloft.net
Subject: [RFC PATCH 10/13] net: sched: lockless support for netif_schedule

netif_schedule uses the QDISC_STATE_SCHED bit to tell the qdisc layer
whether a run of the qdisc has been scheduled. This is important when
tearing down qdisc instances. We can rcu_free an instance, for example,
when it is possible that there are still outstanding references to it.
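For reference, the existing single-bit scheme boils down to roughly the
following (condensed sketch of the pre-patch __netif_schedule(), not the
full function):

	/* The first caller to set __QDISC_STATE_SCHED queues the qdisc for a
	 * softirq run; concurrent callers see the bit already set and return.
	 */
	void __netif_schedule(struct Qdisc *q)
	{
		if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
			__netif_reschedule(q);
	}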

Perhaps more importantly, in the per-CPU lockless case we need to
schedule a run of the qdisc on every qdisc that is enqueueing packets
and hitting the gso_skb requeue logic; otherwise an skb may get stuck
on the gso_skb queue with nothing left to finish the xmit.
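In other words, the requeue path has to kick a future run, roughly as in
the sketch below (illustrative only; the helper name is a stand-in, not
one of the functions touched by this series):

	/* Illustrative sketch: whenever a lockless qdisc requeues an skb it
	 * could not transmit, it must also schedule a future run, otherwise
	 * no CPU is obliged to come back and finish the xmit for that skb.
	 */
	static void qdisc_requeue_kick_sketch(struct Qdisc *q, struct sk_buff *skb)
	{
		qdisc_qstats_cpu_backlog_inc(q, skb);	/* account the requeued skb */
		qdisc_qstats_cpu_qlen_inc(q);
		__netif_schedule(q);			/* guarantee a later run */
	}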

This patch uses per-CPU state, rather than a single shared bit, to
account for the multiple CPUs.
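Condensed, the per-CPU variant implemented below amounts to the
following (sketch only; allocation error handling and the non-lockless
path are omitted):

	/* One state word per CPU, allocated when the qdisc is created. */
	sch->cpu_state = alloc_percpu(unsigned long);

	/* Scheduling: test_and_set on this CPU's word instead of q->state. */
	unsigned long *s = this_cpu_ptr(q->cpu_state);

	if (!test_and_set_bit(__QDISC_STATE_SCHED, s))
		__netif_reschedule(q);

	/* Teardown: the qdisc is still busy if any CPU has a run scheduled. */
	for_each_possible_cpu(i)
		if (test_bit(__QDISC_STATE_SCHED, per_cpu_ptr(q->cpu_state, i)))
			busy = true;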
---
 include/net/sch_generic.h |    1 +
 net/core/dev.c            |   32 +++++++++++++++++++++++---------
 net/sched/sch_api.c       |    5 +++++
 net/sched/sch_generic.c   |   16 +++++++++++++++-
 4 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index cc28af0..2e0e5b0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -94,6 +94,7 @@ struct Qdisc {
 	seqcount_t		running;
 	struct gnet_stats_queue	qstats;
 	unsigned long		state;
+	unsigned long __percpu	*cpu_state;
 	struct Qdisc            *next_sched;
 	struct sk_buff		*skb_bad_txq;
 	struct rcu_head		rcu_head;
diff --git a/net/core/dev.c b/net/core/dev.c
index 5db395d..f491845 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2272,8 +2272,14 @@ static void __netif_reschedule(struct Qdisc *q)
 
 void __netif_schedule(struct Qdisc *q)
 {
-	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
+	if (q->flags & TCQ_F_NOLOCK) {
+		unsigned long *s = this_cpu_ptr(q->cpu_state);
+
+		if (!test_and_set_bit(__QDISC_STATE_SCHED, s))
+			__netif_reschedule(q);
+	} else if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) {
 		__netif_reschedule(q);
+	}
 }
 EXPORT_SYMBOL(__netif_schedule);
 
@@ -3925,15 +3931,23 @@ static void net_tx_action(struct softirq_action *h)
 			if (!(q->flags & TCQ_F_NOLOCK)) {
 				root_lock = qdisc_lock(q);
 				spin_lock(root_lock);
-			}
-			/* We need to make sure head->next_sched is read
-			 * before clearing __QDISC_STATE_SCHED
-			 */
-			smp_mb__before_atomic();
-			clear_bit(__QDISC_STATE_SCHED, &q->state);
-			qdisc_run(q);
-			if (!(q->flags & TCQ_F_NOLOCK))
+
+				/* We need to make sure head->next_sched is read
+				 * before clearing __QDISC_STATE_SCHED
+				 */
+				smp_mb__before_atomic();
+				clear_bit(__QDISC_STATE_SCHED, &q->state);
+
+				qdisc_run(q);
+
 				spin_unlock(root_lock);
+			} else {
+				unsigned long *s = this_cpu_ptr(q->cpu_state);
+
+				smp_mb__before_atomic();
+				clear_bit(__QDISC_STATE_SCHED, s);
+				__qdisc_run(q);
+			}
 		}
 	}
 }
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 6c5bf13..89989a6 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -975,6 +975,10 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 				alloc_percpu(struct bad_txq_cell);
 			if (!sch->skb_bad_txq_cpu)
 				goto err_out4;
+
+			sch->cpu_state = alloc_percpu(unsigned long);
+			if (!sch->cpu_state)
+				goto err_out4;
 		}
 
 		if (tca[TCA_STAB]) {
@@ -1027,6 +1031,7 @@ err_out4:
 	free_percpu(sch->cpu_qstats);
 	free_percpu(sch->gso_cpu_skb);
 	free_percpu(sch->skb_bad_txq_cpu);
+	free_percpu(sch->cpu_state);
 	/*
 	 * Any broken qdiscs that would require a ops->reset() here?
 	 * The qdisc was never in action so it shouldn't be necessary.
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index d10b762..f5b7254 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -171,6 +171,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
 			if (qdisc_is_percpu_stats(q)) {
 				qdisc_qstats_cpu_backlog_inc(q, nskb);
 				qdisc_qstats_cpu_qlen_inc(q);
+				__netif_schedule(q);
 			} else {
 				qdisc_qstats_backlog_inc(q, nskb);
 				q->q.qlen++;
@@ -768,6 +769,10 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 		sch->skb_bad_txq_cpu = alloc_percpu(struct bad_txq_cell);
 		if (!sch->skb_bad_txq_cpu)
 			goto errout;
+
+		sch->cpu_state = alloc_percpu(unsigned long);
+		if (!sch->cpu_state)
+			goto errout;
 	}
 
 	return sch;
@@ -1037,7 +1042,16 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 		q = dev_queue->qdisc_sleeping;
 
 		if (q->flags & TCQ_F_NOLOCK) {
-			val = test_bit(__QDISC_STATE_SCHED, &q->state);
+			int i;
+
+			for_each_possible_cpu(i) {
+				unsigned long *s;
+
+				s = per_cpu_ptr(q->cpu_state, i);
+				val = test_bit(__QDISC_STATE_SCHED, s);
+				if (val)
+					break;
+			}
 		} else {
 			root_lock = qdisc_lock(q);
 			spin_lock_bh(root_lock);
