Message-ID: <20250801090949.129941-1-dongml2@chinatelecom.cn>
Date: Fri,  1 Aug 2025 17:09:49 +0800
From: Menglong Dong <menglong8.dong@...il.com>
To: edumazet@...gle.com,
	kuniyu@...gle.com,
	kraig@...gle.com
Cc: ncardwell@...gle.com,
	davem@...emloft.net,
	dsahern@...nel.org,
	kuba@...nel.org,
	pabeni@...hat.com,
	horms@...nel.org,
	netdev@...r.kernel.org,
	linux-kernel@...r.kernel.org
Subject: [PATCH net v2] net: ip: order the reuseport socket in __inet_hash

Currently, the socket lookup in inet_lhash2_lookup() terminates as soon
as it finds a reuseport socket, which means the selected socket may not
be the best match.

For example, socket1 and socket2 both listen on "0.0.0.0:1234", but
socket1 is bound to "eth0". If socket1 is created first and socket2
second, all connections will go to socket2, which is not expected, as
socket1 has higher priority.
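
For reference, a minimal userspace sketch of the setup above
(make_listener() is a hypothetical helper; both sockets set
SO_REUSEPORT and error handling is omitted):

  #include <string.h>
  #include <unistd.h>
  #include <sys/socket.h>
  #include <netinet/in.h>

  static int make_listener(const char *dev)
  {
          struct sockaddr_in addr = {
                  .sin_family = AF_INET,
                  .sin_port = htons(1234),       /* 0.0.0.0:1234 */
          };
          int one = 1;
          int fd = socket(AF_INET, SOCK_STREAM, 0);

          setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
          if (dev)       /* bind to a device; needs CAP_NET_RAW */
                  setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, dev,
                             strlen(dev));
          bind(fd, (struct sockaddr *)&addr, sizeof(addr));
          listen(fd, 16);
          return fd;
  }

  int main(void)
  {
          int sk1 = make_listener("eth0"); /* created first, bound */
          int sk2 = make_listener(NULL);   /* created second, unbound */

          /* Without this patch, connections arriving on eth0 land
           * on sk2, although sk1 is the more specific match.
           */
          (void)sk1;
          (void)sk2;
          pause();
          return 0;
  }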

This can cause unexpected behavior if TCP MD5 keys are used, as
described in Documentation/networking/vrf.rst -> Applications.
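
For illustration, installing a TCP MD5 key on such a listener could
look like this (hypothetical sketch; the peer address 192.0.2.1 and
the key are made up, and fd is a listening socket as above):

  #include <string.h>
  #include <sys/socket.h>
  #include <arpa/inet.h>
  #include <netinet/tcp.h>

  static void install_md5_key(int fd)
  {
          struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
          struct sockaddr_in *peer =
                  (struct sockaddr_in *)&md5.tcpm_addr;

          peer->sin_family = AF_INET;
          peer->sin_addr.s_addr = inet_addr("192.0.2.1");
          memcpy(md5.tcpm_key, "secret", 6);

          /* If the lookup returns the wrong listener, this key is
           * never consulted for segments that should match it.
           */
          setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
  }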

Therefore, compute a score for each reuseport socket and insert it into
the list in order in __inet_hash(): sockets with a higher score are
placed closer to the head.
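
With the scoring introduced below, socket1 from the example above is
AF_INET (+10) and bound to a device (+2), scoring 12, while socket2
scores only 10, so socket1 is inserted ahead of socket2 and is found
first by the lookup.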

Link: https://lore.kernel.org/netdev/20250731123309.184496-1-dongml2@chinatelecom.cn/
Signed-off-by: Menglong Dong <dongml2@...natelecom.cn>
---
v2:
- As Kuniyuki advised, sort the reuseport sockets in __inet_hash() to
  keep the reuseport lookup O(1)
---
 include/linux/rculist_nulls.h | 34 ++++++++++++++++++++++++
 include/net/sock.h            |  5 ++++
 net/ipv4/inet_hashtables.c    | 49 ++++++++++++++++++++++++++++++++---
 3 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 89186c499dd4..da500f4ae142 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -52,6 +52,13 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
 #define hlist_nulls_next_rcu(node) \
 	(*((struct hlist_nulls_node __rcu __force **)&(node)->next))
 
+/**
+ * hlist_nulls_pprev_rcu - returns the ->next pointer of the node before @node.
+ * @node: element of the list.
+ */
+#define hlist_nulls_pprev_rcu(node) \
+	(*((struct hlist_nulls_node __rcu __force **)(node)->pprev))
+
 /**
  * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
  * @n: the element to delete from the hash list.
@@ -145,6 +152,33 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
 	}
 }
 
+/**
+ * hlist_nulls_add_before_rcu
+ * @n: the new element to add to the hash list.
+ * @next: the existing element to add the new element before.
+ *
+ * Description:
+ * Adds the specified element to the specified hlist
+ * before the specified node while permitting racing traversals.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
+ * or hlist_nulls_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs.
+ */
+static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
+					      struct hlist_nulls_node *next)
+{
+	WRITE_ONCE(n->pprev, next->pprev);
+	n->next = next;
+	rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
+	WRITE_ONCE(next->pprev, &n->next);
+}
+
 /* after that hlist_nulls_del will work */
 static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
 {
diff --git a/include/net/sock.h b/include/net/sock.h
index c8a4b283df6f..42aa1919eeee 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -885,6 +885,11 @@ static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nu
 	hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
 }
 
+static inline void __sk_nulls_add_node_before_rcu(struct sock *sk, struct sock *next)
+{
+	hlist_nulls_add_before_rcu(&sk->sk_nulls_node, &next->sk_nulls_node);
+}
+
 static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
 {
 	sock_hold(sk);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ceeeec9b7290..80d8bec41a58 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -334,6 +334,26 @@ static inline int compute_score(struct sock *sk, const struct net *net,
 	return score;
 }
 
+static inline int compute_reuseport_score(struct sock *sk)
+{
+	int score = 0;
+
+	if (sk->sk_bound_dev_if)
+		score += 2;
+
+	if (sk->sk_family == PF_INET)
+		score += 10;
+
+	/* sk_incoming_cpu must rank below sk_bound_dev_if, as it is only
+	 * optionally scored in compute_score(). It is also the only
+	 * condition here that can change after the socket is hashed.
+	 */
+	if (READ_ONCE(sk->sk_incoming_cpu))
+		score++;
+
+	return score;
+}
+
 /**
  * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
  * @net: network namespace.
@@ -739,6 +759,27 @@ static int inet_reuseport_add_sock(struct sock *sk,
 	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
 }
 
+static void inet_hash_reuseport(struct sock *sk, struct hlist_nulls_head *head)
+{
+	const struct hlist_nulls_node *node;
+	int score, curscore;
+	struct sock *sk2;
+
+	curscore = compute_reuseport_score(sk);
+	/* find the first reuseport socket whose score is <= ours */
+	sk_nulls_for_each_rcu(sk2, node, head) {
+		if (!sk2->sk_reuseport)
+			continue;
+		score = compute_reuseport_score(sk2);
+		if (score <= curscore) {
+			__sk_nulls_add_node_before_rcu(sk, sk2);
+			return;
+		}
+	}
+
+	__sk_nulls_add_node_tail_rcu(sk, head);
+}
+
 int __inet_hash(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
@@ -761,11 +802,11 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 			goto unlock;
 	}
 	sock_set_flag(sk, SOCK_RCU_FREE);
-	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
-		sk->sk_family == AF_INET6)
-		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
-	else
+	if (!sk->sk_reuseport)
 		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
+	else
+		inet_hash_reuseport(sk, &ilb2->nulls_head);
+
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
 	spin_unlock(&ilb2->lock);
-- 
2.50.1

