Message-ID: <CADxym3bCWf3EHF+drDPntjcXAiU3XKOtCDuwp-hX+jj6LjD6wg@mail.gmail.com>
Date: Fri, 1 Aug 2025 15:33:11 +0800
From: Menglong Dong <menglong8.dong@...il.com>
To: Kuniyuki Iwashima <kuniyu@...gle.com>
Cc: davem@...emloft.net, dsahern@...nel.org, edumazet@...gle.com,
horms@...nel.org, kafai@...com, kraig@...gle.com, kuba@...nel.org,
linux-kernel@...r.kernel.org, ncardwell@...gle.com, netdev@...r.kernel.org,
pabeni@...hat.com
Subject: Re: [PATCH net-next] net: ip: lookup the best matched listen socket
On Fri, Aug 1, 2025 at 12:07 PM Kuniyuki Iwashima <kuniyu@...gle.com> wrote:
>
> From: Menglong Dong <menglong8.dong@...il.com>
> Date: Fri, 1 Aug 2025 09:31:43 +0800
> > On Fri, Aug 1, 2025 at 1:52 AM Kuniyuki Iwashima <kuniyu@...gle.com> wrote:
> > >
> > > On Thu, Jul 31, 2025 at 6:01 AM Eric Dumazet <edumazet@...gle.com> wrote:
> > > >
> > > > On Thu, Jul 31, 2025 at 5:33 AM Menglong Dong <menglong8.dong@...il.com> wrote:
> > > > >
> > > > > Currently, the socket lookup in inet_lhash2_lookup() terminates as soon
> > > > > as it finds a reuseport socket, so the returned socket may not be the
> > > > > best match.
> > > > >
> > > > > For example, socket1 and socket2 both listen on "0.0.0.0:1234", but
> > > > > socket1 is bound to "eth0". We create socket1 first, and then socket2.
> > > > > Then, all connections will go to socket2, which is not expected, as
> > > > > socket1 has higher priority.
> > > > >
> > > > > This can cause unexpected behavior if TCP MD5 keys are used, as
> > > > > described in Documentation/networking/vrf.rst -> Applications.
> > > > >
> > > > > Therefore, we look up the best-matched socket first, and then do the
> > > > > reuseport logic. This can add some overhead if there are many
> > > > > reuseport sockets :/
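
For reference, a minimal (untested) userspace sketch of this scenario; the
interface name "eth0" and port 1234 are illustrative, and both sockets need
SO_REUSEPORT to share the port:

from socket import *

# socket1: created first, reuseport enabled and bound to a device
s1 = socket()
s1.setsockopt(SOL_SOCKET, SO_REUSEPORT, 1)
s1.setsockopt(SOL_SOCKET, SO_BINDTODEVICE, b'eth0')
s1.bind(('0.0.0.0', 1234))
s1.listen()

# socket2: created second, same port, no device bind
s2 = socket()
s2.setsockopt(SOL_SOCKET, SO_REUSEPORT, 1)
s2.bind(('0.0.0.0', 1234))
s2.listen()

# A connection arriving on eth0 should prefer s1 (the more specific match),
# but the current lookup stops at the first reuseport listener it finds and
# steers everything to s2.
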
> > >
> > > This kills O(1) lookup for reuseport...
> > >
> > > Another option would be to try hard in __inet_hash() to sort
> > > reuseport groups.
> >
> > Good idea. For the reuseport case, we can compute a score
> > for the reuseport sockets and insert the higher-scoring ones at
> > the front of the list. I'll give it a try this way.
>
> I remember you reported the same issue in February and
> I hacked up a patch below based on a draft diff in my stash.
Yeah, I reported it before in this link:
https://lore.kernel.org/netdev/20250227123137.138778-1-dongml2@chinatelecom.cn/
The commit log in that patch was wrong, which is why the patch
you hacked up didn't solve the problem :/
>
> This fixes the issue, but we still have corner cases where
> SO_REUSEPORT/SO_BINDTODEVICE are changed after listen(), which
> should not be allowed given that TCP does not have ->rehash();
> it confuses/breaks the reuseport logic/rule.
Do we need to save the score? That would make the logic
more complex :/
I just wrote a similar patch, which should work too:
---------------------------------------patch begin---------------------------------
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 89186c499dd4..da500f4ae142 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -52,6 +52,13 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
#define hlist_nulls_next_rcu(node) \
(*((struct hlist_nulls_node __rcu __force **)&(node)->next))
+/**
+ * hlist_nulls_pprev_rcu - returns the element of the list after @node.
+ * @node: element of the list.
+ */
+#define hlist_nulls_pprev_rcu(node) \
+ (*((struct hlist_nulls_node __rcu __force **)&(node)->pprev))
+
/**
* hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
* @n: the element to delete from the hash list.
@@ -145,6 +152,33 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
}
}
+/**
+ * hlist_nulls_add_before_rcu
+ * @n: the new element to add to the hash list.
+ * @next: the existing element to add the new element before.
+ *
+ * Description:
+ * Adds the specified element to the specified hlist
+ * before the specified node while permitting racing traversals.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
+ * or hlist_nulls_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs.
+ */
+static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
+ struct hlist_nulls_node *next)
+{
+ WRITE_ONCE(n->pprev, next->pprev);
+ n->next = next;
+ rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
+ WRITE_ONCE(next->pprev, &n->next);
+}
+
/* after that hlist_nulls_del will work */
static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
{
diff --git a/include/net/sock.h b/include/net/sock.h
index c8a4b283df6f..42aa1919eeee 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -885,6 +885,11 @@ static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nu
hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
}
+static inline void __sk_nulls_add_node_before_rcu(struct sock *sk, struct sock *next)
+{
+ hlist_nulls_add_before_rcu(&sk->sk_nulls_node, &next->sk_nulls_node);
+}
+
static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
sock_hold(sk);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ceeeec9b7290..53a72a8b6094 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -334,6 +334,21 @@ static inline int compute_score(struct sock *sk, const struct net *net,
return score;
}
+static inline int compute_reuseport_score(struct sock *sk)
+{
+ int score = 0;
+
+ if (sk->sk_bound_dev_if)
+ score++;
+
+ if (sk->sk_family == PF_INET)
+ score += 5;
+ if (READ_ONCE(sk->sk_incoming_cpu))
+ score++;
+
+ return score;
+}
+
/**
 * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
* @net: network namespace.
@@ -739,6 +754,27 @@ static int inet_reuseport_add_sock(struct sock *sk,
return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}
+static void inet_hash_reuseport(struct sock *sk, struct hlist_nulls_head *head)
+{
+ const struct hlist_nulls_node *node;
+ int score, curscore;
+ struct sock *sk2;
+
+ curscore = compute_reuseport_score(sk);
+ /* lookup the socket to insert before */
+ sk_nulls_for_each_rcu(sk2, node, head) {
+ if (!sk2->sk_reuseport)
+ continue;
+ score = compute_reuseport_score(sk2);
+ if (score <= curscore) {
+ __sk_nulls_add_node_before_rcu(sk, sk2);
+ return;
+ }
+ }
+
+ __sk_nulls_add_node_tail_rcu(sk, head);
+}
+
int __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
@@ -761,11 +797,11 @@ int __inet_hash(struct sock *sk, struct sock *osk)
goto unlock;
}
sock_set_flag(sk, SOCK_RCU_FREE);
- if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
- sk->sk_family == AF_INET6)
- __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
- else
+ if (!sk->sk_reuseport)
__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
+ else
+ inet_hash_reuseport(sk, &ilb2->nulls_head);
+
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
spin_unlock(&ilb2->lock);
------------------------------patch end--------------------------------------
>
> ---8<---
> diff --git a/include/net/sock.h b/include/net/sock.h
> index c8a4b283df6f..8436e352732f 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -885,6 +885,18 @@ static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nu
> hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
> }
>
> +static inline void __sk_nulls_insert_after_node_rcu(struct sock *sk,
> + struct hlist_nulls_node *prev)
> +{
> + struct hlist_nulls_node *n = &sk->sk_nulls_node;
> +
> + n->next = prev->next;
> + n->pprev = &prev->next;
> + if (!is_a_nulls(n->next))
> + WRITE_ONCE(n->next->pprev, &n->next);
> + rcu_assign_pointer(hlist_nulls_next_rcu(prev), n);
> +}
> +
> static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
> {
> sock_hold(sk);
> diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
> index 6e4faf3ee76f..4a3e9d6887a6 100644
> --- a/include/net/sock_reuseport.h
> +++ b/include/net/sock_reuseport.h
> @@ -23,12 +23,14 @@ struct sock_reuseport {
> unsigned int synq_overflow_ts;
> /* ID stays the same even after the size of socks[] grows. */
> unsigned int reuseport_id;
> - unsigned int bind_inany:1;
> - unsigned int has_conns:1;
> + unsigned short bind_inany:1;
> + unsigned short has_conns:1;
> + unsigned short score;
> struct bpf_prog __rcu *prog; /* optional BPF sock selector */
> struct sock *socks[] __counted_by(max_socks);
> };
>
> +unsigned short reuseport_compute_score(struct sock *sk, bool bind_inany);
> extern int reuseport_alloc(struct sock *sk, bool bind_inany);
> extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
> bool bind_inany);
> diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
> index 4211710393a8..df3d1a6f3178 100644
> --- a/net/core/sock_reuseport.c
> +++ b/net/core/sock_reuseport.c
> @@ -6,6 +6,7 @@
> * selecting the socket index from the array of available sockets.
> */
>
> +#include <net/addrconf.h>
> #include <net/ip.h>
> #include <net/sock_reuseport.h>
> #include <linux/bpf.h>
> @@ -185,6 +186,25 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
> return reuse;
> }
>
> +unsigned short reuseport_compute_score(struct sock *sk, bool bind_inany)
> +{
> + unsigned short score = 0;
> +
> + if (sk->sk_family == AF_INET)
> + score += 10;
> +
> + if (ipv6_only_sock(sk))
> + score++;
> +
> + if (!bind_inany)
> + score++;
> +
> + if (sk->sk_bound_dev_if)
> + score++;
> +
> + return score;
> +}
> +
> int reuseport_alloc(struct sock *sk, bool bind_inany)
> {
> struct sock_reuseport *reuse;
> @@ -233,6 +253,7 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
> reuse->bind_inany = bind_inany;
> reuse->socks[0] = sk;
> reuse->num_socks = 1;
> + reuse->score = reuseport_compute_score(sk, bind_inany);
> reuseport_get_incoming_cpu(sk, reuse);
> rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
>
> @@ -278,6 +299,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
> more_reuse->bind_inany = reuse->bind_inany;
> more_reuse->has_conns = reuse->has_conns;
> more_reuse->incoming_cpu = reuse->incoming_cpu;
> + more_reuse->score = reuse->score;
>
> memcpy(more_reuse->socks, reuse->socks,
> reuse->num_socks * sizeof(struct sock *));
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index ceeeec9b7290..042a65372d00 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -739,6 +739,44 @@ static int inet_reuseport_add_sock(struct sock *sk,
> return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
> }
>
> +static void inet_reuseport_hash_sock(struct sock *sk,
> + struct inet_listen_hashbucket *ilb2)
> +{
> + struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
> + const struct hlist_nulls_node *node;
> + struct sock *sk2, *sk_anchor = NULL;
> + unsigned short score, hiscore;
> + struct sock_reuseport *reuse;
> +
> + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, 1);
> + score = reuse->score;
> +
> + sk_nulls_for_each_rcu(sk2, node, &ilb2->nulls_head) {
> + if (!sk2->sk_reuseport)
> + continue;
> +
> + if (inet_csk(sk2)->icsk_bind_hash != tb)
> + continue;
> +
> + reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, 1);
> + if (likely(reuse))
> + hiscore = reuse->score;
> + else
> + hiscore = reuseport_compute_score(sk2,
> + inet_rcv_saddr_any(sk2));
> +
> + if (hiscore <= score)
> + break;
> +
> + sk_anchor = sk2;
> + }
> +
> + if (sk_anchor)
> + __sk_nulls_insert_after_node_rcu(sk, &sk_anchor->sk_nulls_node);
> + else
> + __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
> +}
> +
> int __inet_hash(struct sock *sk, struct sock *osk)
> {
> struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
> @@ -759,13 +797,14 @@ int __inet_hash(struct sock *sk, struct sock *osk)
> err = inet_reuseport_add_sock(sk, ilb2);
> if (err)
> goto unlock;
> - }
> - sock_set_flag(sk, SOCK_RCU_FREE);
> - if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
> - sk->sk_family == AF_INET6)
> - __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
> - else
> +
> + sock_set_flag(sk, SOCK_RCU_FREE);
> + inet_reuseport_hash_sock(sk, ilb2);
> + } else {
> + sock_set_flag(sk, SOCK_RCU_FREE);
> __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
> + }
> +
> sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
> unlock:
> spin_unlock(&ilb2->lock);
> ---8<---
>
>
> Tested:
>
> ---8<---
> [root@...ora ~]# cat a.py
> from socket import *
>
> s1 = socket()
> s1.setsockopt(SOL_SOCKET, SO_REUSEPORT, 1)
> s1.setsockopt(SOL_SOCKET, SO_BINDTODEVICE, b'lo')
> s1.listen()
> s1.setblocking(False)
>
> s2 = socket()
> s2.setsockopt(SOL_SOCKET, SO_REUSEPORT, 1)
> s2.bind(s1.getsockname())
> s2.listen()
> s2.setblocking(False)
>
> for i in range(3):
>     c = socket()
>     c.connect(s1.getsockname())
>     try:
>         print("assigned properly:", s1.accept())
>     except:
>         print("wrong assignment")
> [root@...ora ~]# python3 a.py
> assigned properly: (<socket.socket fd=6, family=2, type=1, proto=0, laddr=('127.0.0.1', 36733), raddr=('127.0.0.1', 39478)>, ('127.0.0.1', 39478))
> assigned properly: (<socket.socket fd=5, family=2, type=1, proto=0, laddr=('127.0.0.1', 36733), raddr=('127.0.0.1', 39490)>, ('127.0.0.1', 39490))
> assigned properly: (<socket.socket fd=6, family=2, type=1, proto=0, laddr=('127.0.0.1', 36733), raddr=('127.0.0.1', 39506)>, ('127.0.0.1', 39506))
> ---8<---