lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-Id: <20230126182302.197763-1-vladimir.oltean@nxp.com>
Date:   Thu, 26 Jan 2023 20:23:02 +0200
From:   Vladimir Oltean <vladimir.oltean@....com>
To:     netdev@...r.kernel.org
Cc:     "David S. Miller" <davem@...emloft.net>,
        Eric Dumazet <edumazet@...gle.com>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>,
        Claudiu Manoil <claudiu.manoil@....com>,
        Vinicius Costa Gomes <vinicius.gomes@...el.com>,
        Kurt Kanzenbach <kurt@...utronix.de>,
        Jacob Keller <jacob.e.keller@...el.com>,
        Jamal Hadi Salim <jhs@...atatu.com>,
        Cong Wang <xiyou.wangcong@...il.com>,
        Jiri Pirko <jiri@...nulli.us>
Subject: [RFC PATCH net-next] net/sched: taprio: give higher priority to higher TCs in software dequeue mode

Currently taprio iterates over child qdiscs in increasing order of TXQ
index, therefore giving higher xmit priority to TXQ 0 and lower to TXQ N.

However, to the best of my understanding, we should prioritize based on
the traffic class, so we should really dequeue starting with the highest
traffic class and going down from there. We get to the TXQ using the
tc_to_txq[] netdev property.

TXQs within the same TC have the same (strict) priority, so we should
pick from them as fairly as we can. Implement something very similar to
q->curband from multiq_dequeue()/multiq_peek().

Something tells me Vinicius won't like the way in which this patch
interacts with TXTIME_ASSIST_IS_ENABLED(q->flags) and NICs where TXQ 0
really has higher priority than TXQ 1...

Signed-off-by: Vladimir Oltean <vladimir.oltean@....com>
---
Superficially (and quantitatively) tested using the following:

#!/bin/bash
# Test script for the taprio "higher priority to higher TCs" change:
# builds a veth pair with two TX queues, maps skb priority 1 to TC 1,
# runs two competing iperf3 TCP flows (one steered to TC 1 via clsact),
# and prints per-flow throughput plus egress filter stats.
# NOTE(review): requires root and the ip/tc/iperf3 tools; assumes no
# pre-existing "ns0" namespace — confirm before running on a shared host.

# Create a namespace and a 2-queue veth pair, moving one end into ns0.
ip netns add ns0
ip link add veth0 numtxqueues 2 numrxqueues 2 type veth peer name veth1
ip link set veth1 netns ns0
ip addr add 192.168.100.1/24 dev veth0 && ip link set veth0 up
ip -n ns0 addr add 192.168.100.2/24 dev veth1 && ip -n ns0 link set veth1 up

# Attach taprio in pure software mode (flags 0x0) with two traffic
# classes, one TXQ each, and a single always-open gate entry — so any
# throughput difference comes only from the dequeue order under test.
tc qdisc replace dev veth0 parent root taprio \
	num_tc 2 \
	map 0 1 \
	queues 1@0 1@1 \
	base-time 0 \
	sched-entry S 0xff 4000000000 \
	clockid CLOCK_TAI \
	flags 0x0
# Steer TCP traffic to port 5201 into skb priority 1 (maps to TC 1).
tc qdisc add dev veth0 clsact
tc filter add dev veth0 egress protocol ipv4 flower \
	ip_proto tcp dst_port 5201 action skbedit priority 1

# Two servers inside the namespace, two CPU-pinned clients on the host
# side; the flows run concurrently so the two TCs compete for bandwidth.
ip netns exec ns0 iperf3 -s --port 5201 &
ip netns exec ns0 iperf3 -s --port 5202 &
sleep 1
taskset -c 0 iperf3 -c 192.168.100.2 -p 5201 -t 30 --logfile iperf5201.log &
taskset -c 1 iperf3 -c 192.168.100.2 -p 5202 -t 30 --logfile iperf5202.log &
sleep 20
# Stop early (~20s into a 30s run) and collect results.
killall iperf3

echo "iperf3 log to port 5201:"
cat iperf5201.log
echo "iperf3 log to port 5202:"
cat iperf5202.log
rm -f iperf5201.log iperf5202.log
# Show filter hit counts to confirm the skbedit classification worked,
# then tear everything down.
tc -d -s filter show dev veth0 egress
tc qdisc del dev veth0 clsact
tc qdisc del dev veth0 root
ip netns del ns0

Before:

5201:
[ ID] Interval           Transfer     Bitrate         Retr
[  6]   0.00-20.12  sec   150 MBytes  62.4 Mbits/sec  457             sender
5202:
[ ID] Interval           Transfer     Bitrate         Retr
[  6]   0.00-20.00  sec   219 MBytes  91.9 Mbits/sec  635             sender

After:

5201:
[ ID] Interval           Transfer     Bitrate         Retr
[  6]   0.00-20.19  sec   198 MBytes  82.2 Mbits/sec  485             sender
5202:
[ ID] Interval           Transfer     Bitrate         Retr
[  6]   0.00-20.16  sec   162 MBytes  67.4 Mbits/sec  437             sender

Posting as RFC because I realize this is a breaking change, and because
I don't really care about software taprio all that much, but I still
don't think that the current code is something to follow.

 net/sched/sch_taprio.c | 51 ++++++++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 9a11a499ea2d..392d5f0592a6 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -78,6 +78,7 @@ struct taprio_sched {
 	struct sched_gate_list __rcu *admin_sched;
 	struct hrtimer advance_timer;
 	struct list_head taprio_list;
+	int cur_txq[TC_MAX_QUEUE];
 	u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
 	u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */
 	u32 txtime_delay;
@@ -497,6 +498,16 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	return taprio_enqueue_one(skb, sch, child, to_free);
 }
 
+static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq)
+{
+	int offset = dev->tc_to_txq[tc].offset;
+	int count = dev->tc_to_txq[tc].count;
+
+	(*txq)++;
+	if (*txq == offset + count)
+		*txq = offset;
+}
+
 /* Will not be called in the full offload case, since the TX queues are
  * attached to the Qdisc created using qdisc_create_dflt()
  */
@@ -504,10 +515,11 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
+	int num_tc = netdev_get_num_tc(dev);
 	struct sched_entry *entry;
 	struct sk_buff *skb;
 	u32 gate_mask;
-	int i;
+	int tc;
 
 	rcu_read_lock();
 	entry = rcu_dereference(q->current_entry);
@@ -517,11 +529,16 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
 	if (!gate_mask)
 		return NULL;
 
-	for (i = 0; i < dev->num_tx_queues; i++) {
-		struct Qdisc *child = q->qdiscs[i];
-		int prio;
-		u8 tc;
+	/* Give higher priority to higher traffic classes */
+	for (tc = num_tc - 1; tc >= 0; tc--) {
+		int cur_txq = q->cur_txq[tc];
+		struct Qdisc *child;
 
+		/* Select among TXQs belonging to the same TC
+		 * using round robin
+		 */
+		child = q->qdiscs[cur_txq];
+		taprio_next_tc_txq(dev, tc, &cur_txq);
 		if (unlikely(!child))
 			continue;
 
@@ -532,9 +549,6 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
 		if (TXTIME_ASSIST_IS_ENABLED(q->flags))
 			return skb;
 
-		prio = skb->priority;
-		tc = netdev_get_prio_tc_map(dev, prio);
-
 		if (!(gate_mask & BIT(tc)))
 			continue;
 
@@ -558,10 +572,11 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
+	int num_tc = netdev_get_num_tc(dev);
 	struct sk_buff *skb = NULL;
 	struct sched_entry *entry;
 	u32 gate_mask;
-	int i;
+	int tc;
 
 	rcu_read_lock();
 	entry = rcu_dereference(q->current_entry);
@@ -575,13 +590,16 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
 	if (!gate_mask)
 		goto done;
 
-	for (i = 0; i < dev->num_tx_queues; i++) {
-		struct Qdisc *child = q->qdiscs[i];
+	/* Give higher priority to higher traffic classes */
+	for (tc = num_tc - 1; tc >= 0; tc--) {
+		struct Qdisc *child = q->qdiscs[q->cur_txq[tc]];
 		ktime_t guard;
-		int prio;
 		int len;
-		u8 tc;
 
+		/* As opposed to taprio_peek(), now we also advance the
+		 * current TXQ.
+		 */
+		taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]);
 		if (unlikely(!child))
 			continue;
 
@@ -596,9 +614,6 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
 		if (!skb)
 			continue;
 
-		prio = skb->priority;
-		tc = netdev_get_prio_tc_map(dev, prio);
-
 		if (!(gate_mask & BIT(tc))) {
 			skb = NULL;
 			continue;
@@ -1605,10 +1620,12 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 		err = netdev_set_num_tc(dev, mqprio->num_tc);
 		if (err)
 			goto free_sched;
-		for (i = 0; i < mqprio->num_tc; i++)
+		for (i = 0; i < mqprio->num_tc; i++) {
 			netdev_set_tc_queue(dev, i,
 					    mqprio->count[i],
 					    mqprio->offset[i]);
+			q->cur_txq[i] = mqprio->offset[i];
+		}
 
 		/* Always use supplied priority mappings */
 		for (i = 0; i <= TC_BITMASK; i++)
-- 
2.34.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ