[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250801040757.1599996-1-kuniyu@google.com>
Date: Fri, 1 Aug 2025 04:06:37 +0000
From: Kuniyuki Iwashima <kuniyu@...gle.com>
To: menglong8.dong@...il.com
Cc: davem@...emloft.net, dsahern@...nel.org, edumazet@...gle.com,
horms@...nel.org, kafai@...com, kraig@...gle.com, kuba@...nel.org,
kuniyu@...gle.com, linux-kernel@...r.kernel.org, ncardwell@...gle.com,
netdev@...r.kernel.org, pabeni@...hat.com
Subject: Re: [PATCH net-next] net: ip: lookup the best matched listen socket
From: Menglong Dong <menglong8.dong@...il.com>
Date: Fri, 1 Aug 2025 09:31:43 +0800
> On Fri, Aug 1, 2025 at 1:52 AM Kuniyuki Iwashima <kuniyu@...gle.com> wrote:
> >
> > On Thu, Jul 31, 2025 at 6:01 AM Eric Dumazet <edumazet@...gle.com> wrote:
> > >
> > > On Thu, Jul 31, 2025 at 5:33 AM Menglong Dong <menglong8.dong@...il.com> wrote:
> > > >
> > > > For now, the socket lookup will terminate if the socket is reuse port in
> > > > inet_lhash2_lookup(), which makes the socket is not the best match.
> > > >
> > > > For example, we have socket1 and socket2 both listen on "0.0.0.0:1234",
> > > > but socket1 bind on "eth0". We create socket1 first, and then socket2.
> > > > Then, all connections will goto socket2, which is not expected, as socket1
> > > > has higher priority.
> > > >
> > > > This can cause unexpected behavior if TCP MD5 keys is used, as described
> > > > in Documentation/networking/vrf.rst -> Applications.
> > > >
> > > > Therefor, we lookup the best matched socket first, and then do the reuse
> > > > port logic. This can increase some overhead if there are many reuse port
> > > > socket :/
> >
> > This kills O(1) lookup for reuseport...
> >
> > Another option would be to try hard in __inet_hash() to sort
> > reuseport groups.
>
> Good idea. For the reuse port case, we can compute a score
> for the reuseport sockets and insert the high score to front of
> the list. I'll have a try this way.
I remember you reported the same issue in Feburary and
I hacked up a patch below based on a draft diff in my stash.
This fixes the issue, but we still have corner cases where
SO_REUSEPORT/SO_BINDTODEVICE are changed after listen(),
which should not be allowed given TCP does not have ->rehash()
and confuses/breaks the reuseport logic/rule.
---8<---
diff --git a/include/net/sock.h b/include/net/sock.h
index c8a4b283df6f..8436e352732f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -885,6 +885,18 @@ static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nu
hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
}
+static inline void __sk_nulls_insert_after_node_rcu(struct sock *sk,
+ struct hlist_nulls_node *prev)
+{
+ struct hlist_nulls_node *n = &sk->sk_nulls_node;
+
+ n->next = prev->next;
+ n->pprev = &prev->next;
+ if (!is_a_nulls(n->next))
+ WRITE_ONCE(n->next->pprev, &n->next);
+ rcu_assign_pointer(hlist_nulls_next_rcu(prev), n);
+}
+
static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
sock_hold(sk);
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 6e4faf3ee76f..4a3e9d6887a6 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -23,12 +23,14 @@ struct sock_reuseport {
unsigned int synq_overflow_ts;
/* ID stays the same even after the size of socks[] grows. */
unsigned int reuseport_id;
- unsigned int bind_inany:1;
- unsigned int has_conns:1;
+ unsigned short bind_inany:1;
+ unsigned short has_conns:1;
+ unsigned short score;
struct bpf_prog __rcu *prog; /* optional BPF sock selector */
struct sock *socks[] __counted_by(max_socks);
};
+unsigned short reuseport_compute_score(struct sock *sk, bool bind_inany);
extern int reuseport_alloc(struct sock *sk, bool bind_inany);
extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
bool bind_inany);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 4211710393a8..df3d1a6f3178 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -6,6 +6,7 @@
* selecting the socket index from the array of available sockets.
*/
+#include <net/addrconf.h>
#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
@@ -185,6 +186,25 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
return reuse;
}
+unsigned short reuseport_compute_score(struct sock *sk, bool bind_inany)
+{
+ unsigned short score = 0;
+
+ if (sk->sk_family == AF_INET)
+ score += 10;
+
+ if (ipv6_only_sock(sk))
+ score++;
+
+ if (!bind_inany)
+ score++;
+
+ if (sk->sk_bound_dev_if)
+ score++;
+
+ return score;
+}
+
int reuseport_alloc(struct sock *sk, bool bind_inany)
{
struct sock_reuseport *reuse;
@@ -233,6 +253,7 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse->bind_inany = bind_inany;
reuse->socks[0] = sk;
reuse->num_socks = 1;
+ reuse->score = reuseport_compute_score(sk, bind_inany);
reuseport_get_incoming_cpu(sk, reuse);
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
@@ -278,6 +299,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
more_reuse->bind_inany = reuse->bind_inany;
more_reuse->has_conns = reuse->has_conns;
more_reuse->incoming_cpu = reuse->incoming_cpu;
+ more_reuse->score = reuse->score;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ceeeec9b7290..042a65372d00 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -739,6 +739,44 @@ static int inet_reuseport_add_sock(struct sock *sk,
return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}
+static void inet_reuseport_hash_sock(struct sock *sk,
+ struct inet_listen_hashbucket *ilb2)
+{
+ struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
+ const struct hlist_nulls_node *node;
+ struct sock *sk2, *sk_anchor = NULL;
+ unsigned short score, hiscore;
+ struct sock_reuseport *reuse;
+
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb, 1);
+ score = reuse->score;
+
+ sk_nulls_for_each_rcu(sk2, node, &ilb2->nulls_head) {
+ if (!sk2->sk_reuseport)
+ continue;
+
+ if (inet_csk(sk2)->icsk_bind_hash != tb)
+ continue;
+
+ reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, 1);
+ if (likely(reuse))
+ hiscore = reuse->score;
+ else
+ hiscore = reuseport_compute_score(sk2,
+ inet_rcv_saddr_any(sk2));
+
+ if (hiscore <= score)
+ break;
+
+ sk_anchor = sk2;
+ }
+
+ if (sk_anchor)
+ __sk_nulls_insert_after_node_rcu(sk, &sk_anchor->sk_nulls_node);
+ else
+ __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
+}
+
int __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
@@ -759,13 +797,14 @@ int __inet_hash(struct sock *sk, struct sock *osk)
err = inet_reuseport_add_sock(sk, ilb2);
if (err)
goto unlock;
- }
- sock_set_flag(sk, SOCK_RCU_FREE);
- if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
- sk->sk_family == AF_INET6)
- __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
- else
+
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ inet_reuseport_hash_sock(sk, ilb2);
+ } else {
+ sock_set_flag(sk, SOCK_RCU_FREE);
__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
+ }
+
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
spin_unlock(&ilb2->lock);
---8<---
Tested:
---8<---
[root@...ora ~]# cat a.py
from socket import *
s1 = socket()
s1.setsockopt(SOL_SOCKET, SO_REUSEPORT, 1)
s1.setsockopt(SOL_SOCKET, SO_BINDTODEVICE, b'lo')
s1.listen()
s1.setblocking(False)
s2 = socket()
s2.setsockopt(SOL_SOCKET, SO_REUSEPORT, 1)
s2.bind(s1.getsockname())
s2.listen()
s2.setblocking(False)
for i in range(3):
c = socket()
c.connect(s1.getsockname())
try:
print("assigned properly:", s1.accept())
except:
print("wrong assignment")
[root@...ora ~]# python3 a.py
assigned properly: (<socket.socket fd=6, family=2, type=1, proto=0, laddr=('127.0.0.1', 36733), raddr=('127.0.0.1', 39478)>, ('127.0.0.1', 39478))
assigned properly: (<socket.socket fd=5, family=2, type=1, proto=0, laddr=('127.0.0.1', 36733), raddr=('127.0.0.1', 39490)>, ('127.0.0.1', 39490))
assigned properly: (<socket.socket fd=6, family=2, type=1, proto=0, laddr=('127.0.0.1', 36733), raddr=('127.0.0.1', 39506)>, ('127.0.0.1', 39506))
---8<---
Powered by blists - more mailing lists