Message-ID: <20251008104612.1824200-5-edumazet@google.com>
Date: Wed,  8 Oct 2025 10:46:11 +0000
From: Eric Dumazet <edumazet@...gle.com>
To: "David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, 
	Paolo Abeni <pabeni@...hat.com>
Cc: Simon Horman <horms@...nel.org>, Kuniyuki Iwashima <kuniyu@...gle.com>, 
	Willem de Bruijn <willemb@...gle.com>, netdev@...r.kernel.org, eric.dumazet@...il.com, 
	Eric Dumazet <edumazet@...gle.com>
Subject: [PATCH RFC net-next 4/4] net: allow busy connected flows to switch tx queues

This is a follow-up to commit 726e9e8b94b9 ("tcp: refine
skb->ooo_okay setting") and to the prior commit in this series
("net: control skb->ooo_okay from skb_set_owner_w()").

skb->ooo_okay might never be set for bulk flows that always
have at least one skb in a qdisc queue or NIC queue,
especially if TX completion is delayed because of a stressed cpu.
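
For context, the long-standing heuristic is roughly of the following
shape (a simplified sketch, not the exact code touched by the commits
referenced above; sk_mark_ooo_okay is an illustrative name):

static inline void sk_mark_ooo_okay(struct sock *sk, struct sk_buff *skb)
{
	/* sk_wmem_alloc accounts for skbs not yet released by TX
	 * completion; if anything is still in flight, switching TX
	 * queues could reorder packets, so keep ooo_okay clear.
	 */
	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
}

A bulk flow that always keeps at least one skb in flight therefore
never passes this test.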

These so-called "strange attractors" have caused many performance
issues; we need to do better.

We have tried very hard to avoid reorders because TCP did not
deal with them nicely a decade ago.

Use the new net.core.txq_reselection_ms sysctl to let
flows follow XPS and select a more efficient queue.
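
Condensed from the diff below: sk_tx_queue_set() now timestamps the
cached mapping, and sk_tx_queue_get() treats it as stale once that
timestamp is older than the reselection period (stored in jiffies,
presumably converted from the ms sysctl value at write time). A stale
mapping makes sk_tx_queue_get() return -1, so netdev_pick_tx() falls
back to XPS (or skb_tx_hash()). The staleness test boils down to
(sk_txq_mapping_is_stale is an illustrative name, not part of the
patch):

static inline bool sk_txq_mapping_is_stale(const struct sock *sk,
					   unsigned long reselection)
{
	/* Stale once the last (re)mapping is older than the period. */
	return time_is_before_jiffies(
		READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + reselection);
}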

After this patch, we no longer have to make sure threads
are pinned to cpus: they can now be migrated without
adding too much spinlock/qdisc/TX completion pressure.

The TX completion part was problematic because it added false sharing
on various socket fields, and also false sharing and spinlock
contention in mm layers. Calling skb_orphan() from ndo_start_xmit()
is unfortunately not an option.

Note for later: move sk->sk_tx_queue_mapping closer
to sk_tx_queue_mapping_jiffies for better cache locality.

Tested:

Used a host with 32 TX queues, shared by groups of 8 cores.
XPS setup:

echo ff >/sys/class/net/eth1/queues/tx-0/xps_cpus
echo ff00 >/sys/class/net/eth1/queues/tx-1/xps_cpus
echo ff0000 >/sys/class/net/eth1/queues/tx-2/xps_cpus
echo ff000000 >/sys/class/net/eth1/queues/tx-3/xps_cpus
echo ff,00000000 >/sys/class/net/eth1/queues/tx-4/xps_cpus
echo ff00,00000000 >/sys/class/net/eth1/queues/tx-5/xps_cpus
echo ff0000,00000000 >/sys/class/net/eth1/queues/tx-6/xps_cpus
echo ff000000,00000000 >/sys/class/net/eth1/queues/tx-7/xps_cpus
...

Launched tcp_stream with 15 threads and 1000 flows, initially affined to cores 0-15:

taskset -c 0-15 tcp_stream -T15 -F1000 -l1000 -c -H target_host

Checked that only queues 0 and 1 are used, as instructed by XPS:
tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
 backlog 123489410b 1890p
 backlog 69809026b 1064p
 backlog 52401054b 805p

Then forced each thread to run on cpus 1,9,17,25,33,41,49,57,65,73,81,89,97,105,113,121:

C=1;PID=`pidof tcp_stream`;for P in `ls /proc/$PID/task`; do taskset -pc $C $P; C=$(($C + 8));done

Set txq_reselection_ms to 1000:
echo 1000 > /proc/sys/net/core/txq_reselection_ms

Checked that the flows migrated nicely:

tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
 backlog 130508314b 1916p
 backlog 8584380b 126p
 backlog 8584380b 126p
 backlog 8379990b 123p
 backlog 8584380b 126p
 backlog 8487484b 125p
 backlog 8584380b 126p
 backlog 8448120b 124p
 backlog 8584380b 126p
 backlog 8720640b 128p
 backlog 8856900b 130p
 backlog 8584380b 126p
 backlog 8652510b 127p
 backlog 8448120b 124p
 backlog 8516250b 125p
 backlog 7834950b 115p

Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
 include/net/sock.h | 40 +++++++++++++++++++---------------------
 net/core/dev.c     | 27 +++++++++++++++++++++++++--
 2 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 2794bc5c565424491a064049d3d76c3fb7ba1ed8..61f92bb03e00d7167cccfe70da16174f2b40f6de 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -485,6 +485,7 @@ struct sock {
 	unsigned long		sk_pacing_rate; /* bytes per second */
 	atomic_t		sk_zckey;
 	atomic_t		sk_tskey;
+	unsigned long		sk_tx_queue_mapping_jiffies;
 	__cacheline_group_end(sock_write_tx);
 
 	__cacheline_group_begin(sock_read_tx);
@@ -1984,6 +1985,14 @@ static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 	return __sk_receive_skb(sk, skb, nested, 1, true);
 }
 
+/* This helper checks if a socket is a full socket,
+ * ie _not_ a timewait or request socket.
+ */
+static inline bool sk_fullsock(const struct sock *sk)
+{
+	return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
+}
+
 static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
 {
 	/* sk_tx_queue_mapping accept only upto a 16-bit value */
@@ -1992,7 +2001,15 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
 	/* Paired with READ_ONCE() in sk_tx_queue_get() and
 	 * other WRITE_ONCE() because socket lock might be not held.
 	 */
-	WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+	if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) {
+		WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+		WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
+		return;
+	}
+
+	/* Refresh sk_tx_queue_mapping_jiffies if too old. */
+	if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ))
+		WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
 }
 
 #define NO_QUEUE_MAPPING	USHRT_MAX
@@ -2005,19 +2022,7 @@ static inline void sk_tx_queue_clear(struct sock *sk)
 	WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
 }
 
-static inline int sk_tx_queue_get(const struct sock *sk)
-{
-	if (sk) {
-		/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
-		 * and sk_tx_queue_set().
-		 */
-		int val = READ_ONCE(sk->sk_tx_queue_mapping);
-
-		if (val != NO_QUEUE_MAPPING)
-			return val;
-	}
-	return -1;
-}
+int sk_tx_queue_get(const struct sock *sk);
 
 static inline void __sk_rx_queue_set(struct sock *sk,
 				     const struct sk_buff *skb,
@@ -2945,13 +2950,6 @@ skb_sk_is_prefetched(struct sk_buff *skb)
 #endif /* CONFIG_INET */
 }
 
-/* This helper checks if a socket is a full socket,
- * ie _not_ a timewait or request socket.
- */
-static inline bool sk_fullsock(const struct sock *sk)
-{
-	return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
-}
 
 static inline bool
 sk_is_refcounted(struct sock *sk)
diff --git a/net/core/dev.c b/net/core/dev.c
index a64cef2c537e98ee87776e6f8d3ca3d98f0711b3..c302fd5bb57894c6e5651b7adc8d033ac719070a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4591,6 +4591,30 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(dev_pick_tx_zero);
 
+int sk_tx_queue_get(const struct sock *sk)
+{
+	int val;
+
+	if (!sk)
+		return -1;
+	/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
+	 * and sk_tx_queue_set().
+	 */
+	val = READ_ONCE(sk->sk_tx_queue_mapping);
+
+	if (val == NO_QUEUE_MAPPING)
+		return -1;
+
+	if (sk_fullsock(sk) &&
+	    time_is_before_jiffies(
+			READ_ONCE(sk->sk_tx_queue_mapping_jiffies) +
+			READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection)))
+		return -1;
+
+	return val;
+}
+EXPORT_SYMBOL(sk_tx_queue_get);
+
 u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 		     struct net_device *sb_dev)
 {
@@ -4606,8 +4630,7 @@ u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 		if (new_index < 0)
 			new_index = skb_tx_hash(dev, sb_dev, skb);
 
-		if (queue_index != new_index && sk &&
-		    sk_fullsock(sk) &&
+		if (sk && sk_fullsock(sk) &&
 		    rcu_access_pointer(sk->sk_dst_cache))
 			sk_tx_queue_set(sk, new_index);
 
-- 
2.51.0.710.ga91ca5db03-goog

