Message-Id: <20230126182302.197763-1-vladimir.oltean@nxp.com>
Date: Thu, 26 Jan 2023 20:23:02 +0200
From: Vladimir Oltean <vladimir.oltean@....com>
To: netdev@...r.kernel.org
Cc: "David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>,
Claudiu Manoil <claudiu.manoil@....com>,
Vinicius Costa Gomes <vinicius.gomes@...el.com>,
Kurt Kanzenbach <kurt@...utronix.de>,
Jacob Keller <jacob.e.keller@...el.com>,
Jamal Hadi Salim <jhs@...atatu.com>,
Cong Wang <xiyou.wangcong@...il.com>,
Jiri Pirko <jiri@...nulli.us>
Subject: [RFC PATCH net-next] net/sched: taprio: give higher priority to higher TCs in software dequeue mode
Currently taprio iterates over child qdiscs in increasing order of TXQ
index, thereby giving higher xmit priority to TXQ 0 and lower to TXQ N.
However, to the best of my understanding, we should prioritize based on
the traffic class, i.e. dequeue starting with the highest traffic class
and work our way down from there. We get from a traffic class to its
TXQs using the tc_to_txq[] netdev property.
TXQs within the same TC have the same (strict) priority, so we should
pick from them as fairly as we can. Implement something very similar to
q->curband from multiq_dequeue()/multiq_peek().
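To make the intended order easier to picture, here is a minimal
stand-alone user-space sketch (not the qdisc code itself: gate checks
are omitted, the per-TXQ backlogs are just counters, and the 2@0/2@2
TXQ layout is only an example):

#include <stdio.h>

#define NUM_TC	2
#define NUM_TXQ	4

/* Example layout ("queues 2@0 2@2"): TC 0 owns TXQs 0-1, TC 1 owns TXQs 2-3 */
static const struct { int offset, count; } tc_to_txq[NUM_TC] = {
	{ .offset = 0, .count = 2 },
	{ .offset = 2, .count = 2 },
};

static int cur_txq[NUM_TC] = { 0, 2 };		/* per-TC round robin cursor */
static int backlog[NUM_TXQ] = { 3, 3, 3, 3 };	/* fake per-TXQ packet counts */

/* Same idea as taprio_next_tc_txq(): advance and wrap within the TC's range */
static void next_tc_txq(int tc, int *txq)
{
	if (++(*txq) == tc_to_txq[tc].offset + tc_to_txq[tc].count)
		*txq = tc_to_txq[tc].offset;
}

/* Higher TCs are strictly preferred; TXQs of one TC are visited round robin */
static int dequeue_one(void)
{
	int tc, txq;

	for (tc = NUM_TC - 1; tc >= 0; tc--) {
		txq = cur_txq[tc];
		next_tc_txq(tc, &cur_txq[tc]);

		if (backlog[txq] > 0) {
			backlog[txq]--;
			return txq;
		}
	}

	return -1;	/* nothing left to send */
}

int main(void)
{
	int txq;

	while ((txq = dequeue_one()) >= 0)
		printf("dequeued from TXQ %d\n", txq);

	return 0;
}

With all backlogs non-empty, this prints TXQ 2, 3, 2, 3, ... until TC 1
runs dry, and only then falls back to TC 0's queues.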
Something tells me Vinicius won't like the way in which this patch
interacts with TXTIME_ASSIST_IS_ENABLED(q->flags) and NICs where TXQ 0
really has higher priority than TXQ 1....
Signed-off-by: Vladimir Oltean <vladimir.oltean@....com>
---
Superficially (and quantitatively) tested using the following:
#!/bin/bash
ip netns add ns0
ip link add veth0 numtxqueues 2 numrxqueues 2 type veth peer name veth1
ip link set veth1 netns ns0
ip addr add 192.168.100.1/24 dev veth0 && ip link set veth0 up
ip -n ns0 addr add 192.168.100.2/24 dev veth1 && ip -n ns0 link set veth1 up
tc qdisc replace dev veth0 parent root taprio \
	num_tc 2 \
	map 0 1 \
	queues 1@0 1@1 \
	base-time 0 \
	sched-entry S 0xff 4000000000 \
	clockid CLOCK_TAI \
	flags 0x0
tc qdisc add dev veth0 clsact
tc filter add dev veth0 egress protocol ipv4 flower \
	ip_proto tcp dst_port 5201 action skbedit priority 1
ip netns exec ns0 iperf3 -s --port 5201 &
ip netns exec ns0 iperf3 -s --port 5202 &
sleep 1
taskset -c 0 iperf3 -c 192.168.100.2 -p 5201 -t 30 --logfile iperf5201.log &
taskset -c 1 iperf3 -c 192.168.100.2 -p 5202 -t 30 --logfile iperf5202.log &
sleep 20
killall iperf3
echo "iperf3 log to port 5201:"
cat iperf5201.log
echo "iperf3 log to port 5202:"
cat iperf5202.log
rm -f iperf5201.log iperf5202.log
tc -d -s filter show dev veth0 egress
tc qdisc del dev veth0 clsact
tc qdisc del dev veth0 root
ip netns del ns0
Before:
5201:
[ ID] Interval         Transfer     Bitrate         Retr
[ 6]  0.00-20.12 sec   150 MBytes   62.4 Mbits/sec  457   sender
5202:
[ ID] Interval         Transfer     Bitrate         Retr
[ 6]  0.00-20.00 sec   219 MBytes   91.9 Mbits/sec  635   sender

After:
5201:
[ ID] Interval         Transfer     Bitrate         Retr
[ 6]  0.00-20.19 sec   198 MBytes   82.2 Mbits/sec  485   sender
5202:
[ ID] Interval         Transfer     Bitrate         Retr
[ 6]  0.00-20.16 sec   162 MBytes   67.4 Mbits/sec  437   sender
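To read these numbers: with "map 0 1", "queues 1@0 1@1" and the skbedit
filter above, the port 5201 flow is steered to priority 1 -> TC 1 ->
TXQ 1, while the port 5202 flow stays at priority 0 -> TC 0 -> TXQ 0.
Before the patch, TXQ 0 is always tried first, so the lower traffic
class (port 5202) gets the bigger share; after the patch, TC 1 is
dequeued first, so the higher traffic class (port 5201) does.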
Posting as RFC because I realize this is a breaking change, and because
I don't really care about software taprio all that much, but I still
don't think that the current code is something to follow.
net/sched/sch_taprio.c | 51 ++++++++++++++++++++++++++++--------------
1 file changed, 34 insertions(+), 17 deletions(-)
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 9a11a499ea2d..392d5f0592a6 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -78,6 +78,7 @@ struct taprio_sched {
struct sched_gate_list __rcu *admin_sched;
struct hrtimer advance_timer;
struct list_head taprio_list;
+ int cur_txq[TC_MAX_QUEUE];
u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */
u32 txtime_delay;
@@ -497,6 +498,16 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return taprio_enqueue_one(skb, sch, child, to_free);
}
+static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq)
+{
+ int offset = dev->tc_to_txq[tc].offset;
+ int count = dev->tc_to_txq[tc].count;
+
+ (*txq)++;
+ if (*txq == offset + count)
+ *txq = offset;
+}
+
/* Will not be called in the full offload case, since the TX queues are
* attached to the Qdisc created using qdisc_create_dflt()
*/
@@ -504,10 +515,11 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ int num_tc = netdev_get_num_tc(dev);
struct sched_entry *entry;
struct sk_buff *skb;
u32 gate_mask;
- int i;
+ int tc;
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
@@ -517,11 +529,16 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
if (!gate_mask)
return NULL;
- for (i = 0; i < dev->num_tx_queues; i++) {
- struct Qdisc *child = q->qdiscs[i];
- int prio;
- u8 tc;
+ /* Give higher priority to higher traffic classes */
+ for (tc = num_tc - 1; tc >= 0; tc--) {
+ int cur_txq = q->cur_txq[tc];
+ struct Qdisc *child;
+ /* Select among TXQs belonging to the same TC
+ * using round robin
+ */
+ child = q->qdiscs[cur_txq];
+ taprio_next_tc_txq(dev, tc, &cur_txq);
if (unlikely(!child))
continue;
@@ -532,9 +549,6 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
if (TXTIME_ASSIST_IS_ENABLED(q->flags))
return skb;
- prio = skb->priority;
- tc = netdev_get_prio_tc_map(dev, prio);
-
if (!(gate_mask & BIT(tc)))
continue;
@@ -558,10 +572,11 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ int num_tc = netdev_get_num_tc(dev);
struct sk_buff *skb = NULL;
struct sched_entry *entry;
u32 gate_mask;
- int i;
+ int tc;
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
@@ -575,13 +590,16 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
if (!gate_mask)
goto done;
- for (i = 0; i < dev->num_tx_queues; i++) {
- struct Qdisc *child = q->qdiscs[i];
+ /* Give higher priority to higher traffic classes */
+ for (tc = num_tc - 1; tc >= 0; tc--) {
+ struct Qdisc *child = q->qdiscs[q->cur_txq[tc]];
ktime_t guard;
- int prio;
int len;
- u8 tc;
+ /* As opposed to taprio_peek(), now we also advance the
+ * current TXQ.
+ */
+ taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]);
if (unlikely(!child))
continue;
@@ -596,9 +614,6 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
if (!skb)
continue;
- prio = skb->priority;
- tc = netdev_get_prio_tc_map(dev, prio);
-
if (!(gate_mask & BIT(tc))) {
skb = NULL;
continue;
@@ -1605,10 +1620,12 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
err = netdev_set_num_tc(dev, mqprio->num_tc);
if (err)
goto free_sched;
- for (i = 0; i < mqprio->num_tc; i++)
+ for (i = 0; i < mqprio->num_tc; i++) {
netdev_set_tc_queue(dev, i,
mqprio->count[i],
mqprio->offset[i]);
+ q->cur_txq[i] = mqprio->offset[i];
+ }
/* Always use supplied priority mappings */
for (i = 0; i <= TC_BITMASK; i++)
--
2.34.1