Message-ID: <CANn89i+1-geie7HrSmZeU-OvT-aDJabbtcwrZaHr-1S16yuRZw@mail.gmail.com>
Date: Fri, 1 Aug 2025 02:46:27 -0700
From: Eric Dumazet <edumazet@...gle.com>
To: Menglong Dong <menglong8.dong@...il.com>
Cc: kuniyu@...gle.com, kraig@...gle.com, ncardwell@...gle.com,
davem@...emloft.net, dsahern@...nel.org, kuba@...nel.org, pabeni@...hat.com,
horms@...nel.org, netdev@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
On Fri, Aug 1, 2025 at 2:09 AM Menglong Dong <menglong8.dong@...il.com> wrote:
>
> Currently, the socket lookup in inet_lhash2_lookup() terminates as soon as
> it finds a reuseport socket, so the returned socket may not be the best match.
>
> For example, socket1 and socket2 both listen on "0.0.0.0:1234", but
> socket1 is bound to "eth0". We create socket1 first and then socket2.
> All incoming connections then go to socket2, which is not expected, as
> socket1 has the higher priority.
>
> This can cause unexpected behavior if TCP MD5 keys are used, as described
> in Documentation/networking/vrf.rst -> Applications.
>
> Therefore, compute a score for each reuseport socket and insert it into
> the list in order in __inet_hash(). Sockets with a higher score are added
> closer to the head.
>
> Link: https://lore.kernel.org/netdev/20250731123309.184496-1-dongml2@chinatelecom.cn/
> Signed-off-by: Menglong Dong <dongml2@...natelecom.cn>
You forgot a Fixes: tag, and a selftest.
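Something like the sketch below could be turned into that selftest (the
interface name and port come from your commit message, error handling is
omitted, and SO_BINDTODEVICE needs CAP_NET_RAW/CAP_NET_ADMIN): two
SO_REUSEPORT listeners on the same port, the first one also bound to a
device, then check which listener a connection arriving through that
device is queued on.

#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

/* One SO_REUSEPORT listener on 0.0.0.0:1234, optionally bound to @dev. */
static int reuseport_listener(const char *dev)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(1234),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	if (dev)
		setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, dev, strlen(dev));
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	listen(fd, 8);
	return fd;
}

int main(void)
{
	/* socket1 is created first and bound to "eth0", socket2 later. */
	int sk1 = reuseport_listener("eth0");
	int sk2 = reuseport_listener(NULL);
	struct pollfd pfd[2] = { { .fd = sk1, .events = POLLIN },
				 { .fd = sk2, .events = POLLIN } };

	/* Connect from a peer reachable through eth0, then see which
	 * listener the connection is queued on.  Before the fix it is
	 * sk2, although sk1 is the better match.
	 */
	poll(pfd, 2, -1);
	printf("connection delivered to %s\n",
	       (pfd[0].revents & POLLIN) ? "sk1 (expected)" : "sk2 (bug)");
	close(sk1);
	close(sk2);
	return 0;
}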
> ---
> v2:
> - As Kuniyuki advised, sort the reuseport sockets in __inet_hash() to keep
> the reuseport lookup O(1)
Keeping the list sorted is difficult: we would have to intercept
SO_BINDTODEVICE, SO_BINDTOIFINDEX and SO_INCOMING_CPU, since any of
them can change a socket's score after it has been hashed.
This also makes the patch risky to backport to stable versions,
because it is complex and possibly buggy.
Therefore I prefer your first approach.
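To illustrate (a userspace sketch, not from the patch; the ifindex, cpu
and port values are made up, and SO_BINDTOIFINDEX needs a reasonably
recent kernel/libc): the position in the bucket is chosen at listen()
time, while each of the later calls below changes a field the score was
computed from, and nothing would re-sort the bucket afterwards.

#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(1234),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1, ifindex = 2, cpu = 3;

	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	listen(fd, 8);	/* hashed here, position based on the current score */

	/* Each of these changes what the score was computed from, but
	 * nothing moves the socket inside the lhash2 bucket:
	 */
	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", sizeof("eth0"));
	setsockopt(fd, SOL_SOCKET, SO_BINDTOIFINDEX, &ifindex, sizeof(ifindex));
	setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, sizeof(cpu));
	close(fd);
	return 0;
}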
> ---
> include/linux/rculist_nulls.h | 34 ++++++++++++++++++++++++
> include/net/sock.h | 5 ++++
> net/ipv4/inet_hashtables.c | 49 ++++++++++++++++++++++++++++++++---
> 3 files changed, 84 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
> index 89186c499dd4..da500f4ae142 100644
> --- a/include/linux/rculist_nulls.h
> +++ b/include/linux/rculist_nulls.h
> @@ -52,6 +52,13 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
> #define hlist_nulls_next_rcu(node) \
> (*((struct hlist_nulls_node __rcu __force **)&(node)->next))
>
> +/**
> + * hlist_nulls_pprev_rcu - returns the pprev pointer of @node.
> + * @node: element of the list.
> + */
> +#define hlist_nulls_pprev_rcu(node) \
> + (*((struct hlist_nulls_node __rcu __force **)&(node)->pprev))
> +
> /**
> * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
> * @n: the element to delete from the hash list.
> @@ -145,6 +152,33 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
> }
> }
>
> +/**
> + * hlist_nulls_add_before_rcu - add a new entry before the specified node
> + * @n: the new element to add to the hash list.
> + * @next: the existing element to add the new element before.
> + *
> + * Description:
> + * Adds the specified element to the specified hlist
> + * before the specified node while permitting racing traversals.
> + *
> + * The caller must take whatever precautions are necessary
> + * (such as holding appropriate locks) to avoid racing
> + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
> + * or hlist_nulls_del_rcu(), running on this same list.
> + * However, it is perfectly legal to run concurrently with
> + * the _rcu list-traversal primitives, such as
> + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
> + * problems on Alpha CPUs.
> + */
> +static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
> + struct hlist_nulls_node *next)
> +{
> + WRITE_ONCE(n->pprev, next->pprev);
I do not think WRITE_ONCE() is necessary here: @n is private to this cpu,
and the following rcu_assign_pointer() has the needed barrier.
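i.e. something like this (sketch), letting the barrier in
rcu_assign_pointer() publish the fully initialised @n:

static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
					      struct hlist_nulls_node *next)
{
	/* @n is not yet visible to readers, plain writes are enough */
	n->pprev = next->pprev;
	n->next = next;
	rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
	WRITE_ONCE(next->pprev, &n->next);
}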
> + n->next = next;
> + rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
> + WRITE_ONCE(next->pprev, &n->next);
> +}
> +
> /* after that hlist_nulls_del will work */
> static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
> {
> diff --git a/include/net/sock.h b/include/net/sock.h
> index c8a4b283df6f..42aa1919eeee 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -885,6 +885,11 @@ static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nu
> hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
> }
>
> +static inline void __sk_nulls_add_node_before_rcu(struct sock *sk, struct sock *next)
> +{
> + hlist_nulls_add_before_rcu(&sk->sk_nulls_node, &next->sk_nulls_node);
> +}
> +
> static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
> {
> sock_hold(sk);
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index ceeeec9b7290..80d8bec41a58 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -334,6 +334,26 @@ static inline int compute_score(struct sock *sk, const struct net *net,
> return score;
> }
>
> +static inline int compute_reuseport_score(struct sock *sk)
> +{
> + int score = 0;
> +
> + if (sk->sk_bound_dev_if)
> + score += 2;
> +
> + if (sk->sk_family == PF_INET)
> + score += 10;
> +
> + /* the priority of sk_incoming_cpu should be lower than sk_bound_dev_if,
> + * as it's optional in compute_score(). Thank God, this is the only
Please do not bring God here.
> + * variable condition, which we can't judge now.
> + */
> + if (READ_ONCE(sk->sk_incoming_cpu))
> + score++;
> +
> + return score;
> +}
> +
> /**
> * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
> * @net: network namespace.
> @@ -739,6 +759,27 @@ static int inet_reuseport_add_sock(struct sock *sk,
> return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
> }
>
> +static void inet_hash_reuseport(struct sock *sk, struct hlist_nulls_head *head)
> +{
> + const struct hlist_nulls_node *node;
> + int score, curscore;
> + struct sock *sk2;
> +
> + curscore = compute_reuseport_score(sk);
> + /* lookup the socket to insert before */
> + sk_nulls_for_each_rcu(sk2, node, head) {
> + if (!sk2->sk_reuseport)
> + continue;
> + score = compute_reuseport_score(sk2);
> + if (score <= curscore) {
> + __sk_nulls_add_node_before_rcu(sk, sk2);
> + return;
> + }
> + }
> +
> + __sk_nulls_add_node_tail_rcu(sk, head);
> +}
> +
> int __inet_hash(struct sock *sk, struct sock *osk)
> {
> struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
> @@ -761,11 +802,11 @@ int __inet_hash(struct sock *sk, struct sock *osk)
> goto unlock;
> }
> sock_set_flag(sk, SOCK_RCU_FREE);
> - if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
> - sk->sk_family == AF_INET6)
> - __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
> - else
> + if (!sk->sk_reuseport)
> __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
> + else
> + inet_hash_reuseport(sk, &ilb2->nulls_head);
> +
> sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
> unlock:
> spin_unlock(&ilb2->lock);
> --
> 2.50.1
>