netdev - [PATCH net-next 3/4] net_sched: sch_fq: add fast path for mostly idle qdisc

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230920125418.3675569-4-edumazet@google.com>
Date: Wed, 20 Sep 2023 12:54:17 +0000
From: Eric Dumazet <edumazet@...gle.com>
To: "David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, 
	Paolo Abeni <pabeni@...hat.com>
Cc: Willem de Bruijn <willemb@...gle.com>, Jamal Hadi Salim <jhs@...atatu.com>, 
	Cong Wang <xiyou.wangcong@...il.com>, Jiri Pirko <jiri@...nulli.us>, netdev@...r.kernel.org, 
	eric.dumazet@...il.com, Eric Dumazet <edumazet@...gle.com>
Subject: [PATCH net-next 3/4] net_sched: sch_fq: add fast path for mostly idle qdisc

TCQ_F_CAN_BYPASS can be used by few qdiscs.

Idea is that if we queue a packet to an empty qdisc,
following dequeue() would pick it immediately.

FQ can not use the generic TCQ_F_CAN_BYPASS code,
because some additional checks need to be performed.

This patch adds a similar fast path to FQ.

Most of the time, qdisc is not throttled,
and many packets can avoid bringing/touching
at least four cache lines, and consuming 128bytes
of memory to store the state of a flow.

After this patch, netperf can send UDP packets about 13 % faster,
and pktgen goes 30 % faster (when FQ is in the way), on a fast NIC.

TCP traffic is also improved, thanks to a reduction of cache line misses.

tc -s -d qd sh dev eth1
...
qdisc fq 8004: parent 1:2 limit 10000p flow_limit 100p buckets 1024
   orphan_mask 1023 quantum 3028b initial_quantum 15140b low_rate_threshold 550Kbit
   refill_delay 40ms timer_slack 10us horizon 10s horizon_drop
 Sent 5646784384 bytes 1985161 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
  flows 122 (inactive 122 throttled 0)
  gc 0 highprio 0 fastpath 659990 throttled 27762 latency 8.57us

Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
 include/uapi/linux/pkt_sched.h |   1 +
 net/sched/sch_fq.c             | 107 +++++++++++++++++++++++++--------
 2 files changed, 82 insertions(+), 26 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 3f85ae5780563cdfb42fdb3a107ca2489d0830a4..579f641846b87da05e5d4b09c1072c90220ca601 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -962,6 +962,7 @@ struct tc_fq_qd_stats {
 	__u64	ce_mark;		/* packets above ce_threshold */
 	__u64	horizon_drops;
 	__u64	horizon_caps;
+	__u64	fastpath_packets;
 };
 
 /* Heavy-Hitter Filter */
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4af43a401dbb4111d5cfaddb4b83fc5c7b63b83d..ea510003bc385148397a690c5afd9387fba9796c 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -2,7 +2,7 @@
 /*
  * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
  *
- *  Copyright (C) 2013-2015 Eric Dumazet <edumazet@...gle.com>
+ *  Copyright (C) 2013-2023 Eric Dumazet <edumazet@...gle.com>
  *
  *  Meant to be mostly used for locally generated traffic :
  *  Fast classification depends on skb->sk being set before reaching us.
@@ -73,7 +73,13 @@ struct fq_flow {
 		struct sk_buff *tail;	/* last skb in the list */
 		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
 	};
-	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
+	union {
+		struct rb_node	fq_node;	/* anchor in fq_root[] trees */
+		/* Following field is only used for q->internal,
+		 * because q->internal is not hashed in fq_root[]
+		 */
+		u64		stat_fastpath_packets;
+	};
 	struct sock	*sk;
 	u32		socket_hash;	/* sk_hash */
 	int		qlen;		/* number of packets in flow queue */
@@ -266,8 +272,56 @@ static void fq_gc(struct fq_sched_data *q,
 	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
 }
 
-static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+/* Fast path can be used if :
+ * 1) Packet tstamp is in the past.
+ * 2) FQ qlen == 0   OR
+ *   (no flow is currently elligible for transmit,
+ *    AND fast path queue has less than 8 packets)
+ * 3) No SO_MAX_PACING_RATE on the socket (if any).
+ * 4) No @maxrate attribute on this qdisc,
+ *
+ * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
+ */
+static bool fq_fastpath_check(struct Qdisc *sch, struct sk_buff *skb)
 {
+	struct fq_sched_data *q = qdisc_priv(sch);
+	const struct sock *sk;
+
+	if (fq_skb_cb(skb)->time_to_send > q->ktime_cache)
+		return false;
+
+	if (sch->q.qlen != 0) {
+		/* Even if some packets are stored in this qdisc,
+		 * we can still enable fast path if all of them are
+		 * scheduled in the future (ie no flows are elligible)
+		 * or in the fast path queue.
+		 */
+		if (q->flows != q->inactive_flows + q->throttled_flows)
+			return false;
+
+		/* Do not allow fast path queue to explode, we want Fair Queue mode
+		 * under pressure.
+		 */
+		if (q->internal.qlen >= 8)
+			return false;
+	}
+
+	sk = skb->sk;
+	if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
+	    sk->sk_max_pacing_rate != ~0UL)
+		return false;
+
+	if (q->flow_max_rate != ~0UL)
+		return false;
+
+	q->internal.stat_fastpath_packets++;
+
+	return true;
+}
+
+static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
 	struct rb_node **p, *parent;
 	struct sock *sk = skb->sk;
 	struct rb_root *root;
@@ -307,6 +361,9 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 		sk = (struct sock *)((hash << 1) | 1UL);
 	}
 
+	if (fq_fastpath_check(sch, skb))
+		return &q->internal;
+
 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
 
 	if (q->flows >= (2U << q->fq_trees_log) &&
@@ -402,12 +459,8 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
 static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
 			   struct sk_buff *skb)
 {
-	struct fq_sched_data *q = qdisc_priv(sch);
-
 	fq_erase_head(sch, flow, skb);
 	skb_mark_not_on_list(skb);
-	if (--flow->qlen == 0)
-		q->inactive_flows++;
 	qdisc_qstats_backlog_dec(sch, skb);
 	sch->q.qlen--;
 }
@@ -450,6 +503,7 @@ static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
 	return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
 }
 
+
 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		      struct sk_buff **to_free)
 {
@@ -459,49 +513,46 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (unlikely(sch->q.qlen >= sch->limit))
 		return qdisc_drop(skb, sch, to_free);
 
+	q->ktime_cache = ktime_get_ns();
 	if (!skb->tstamp) {
-		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
+		fq_skb_cb(skb)->time_to_send = q->ktime_cache;
 	} else {
-		/* Check if packet timestamp is too far in the future.
-		 * Try first if our cached value, to avoid ktime_get_ns()
-		 * cost in most cases.
-		 */
+		/* Check if packet timestamp is too far in the future. */
 		if (fq_packet_beyond_horizon(skb, q)) {
-			/* Refresh our cache and check another time */
-			q->ktime_cache = ktime_get_ns();
-			if (fq_packet_beyond_horizon(skb, q)) {
-				if (q->horizon_drop) {
+			if (q->horizon_drop) {
 					q->stat_horizon_drops++;
 					return qdisc_drop(skb, sch, to_free);
-				}
-				q->stat_horizon_caps++;
-				skb->tstamp = q->ktime_cache + q->horizon;
 			}
+			q->stat_horizon_caps++;
+			skb->tstamp = q->ktime_cache + q->horizon;
 		}
 		fq_skb_cb(skb)->time_to_send = skb->tstamp;
 	}
 
-	f = fq_classify(skb, q);
+	f = fq_classify(sch, skb);
 	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
 		q->stat_flows_plimit++;
 		return qdisc_drop(skb, sch, to_free);
 	}
 
-	if (f->qlen++ == 0)
-		q->inactive_flows--;
-	qdisc_qstats_backlog_inc(sch, skb);
 	if (fq_flow_is_detached(f)) {
 		fq_flow_add_tail(&q->new_flows, f);
 		if (time_after(jiffies, f->age + q->flow_refill_delay))
 			f->credit = max_t(u32, f->credit, q->quantum);
 	}
 
-	/* Note: this overwrites f->age */
-	flow_queue_add(f, skb);
-
 	if (unlikely(f == &q->internal)) {
 		q->stat_internal_packets++;
+	} else {
+		if (f->qlen == 0)
+			q->inactive_flows--;
 	}
+queue:
+	f->qlen++;
+	/* Note: this overwrites f->age */
+	flow_queue_add(f, skb);
+
+	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
@@ -549,6 +600,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 
 	skb = fq_peek(&q->internal);
 	if (unlikely(skb)) {
+		q->internal.qlen--;
 		fq_dequeue_skb(sch, &q->internal, skb);
 		goto out;
 	}
@@ -592,6 +644,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 			INET_ECN_set_ce(skb);
 			q->stat_ce_mark++;
 		}
+		if (--f->qlen == 0)
+			q->inactive_flows++;
 		fq_dequeue_skb(sch, f, skb);
 	} else {
 		head->first = f->next;
@@ -1024,6 +1078,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 
 	st.gc_flows		  = q->stat_gc_flows;
 	st.highprio_packets	  = q->stat_internal_packets;
+	st.fastpath_packets	  = q->internal.stat_fastpath_packets;
 	st.tcp_retrans		  = 0;
 	st.throttled		  = q->stat_throttled;
 	st.flows_plimit		  = q->stat_flows_plimit;
-- 
2.42.0.459.ge4e396fd5e-goog