[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CADxym3a2mQnxbJgFCEoAb3U8s_xLAZmU9+bYN-a7mpHNLgjKVw@mail.gmail.com>
Date: Fri, 28 Feb 2025 17:36:31 +0800
From: Menglong Dong <menglong8.dong@...il.com>
To: Kuniyuki Iwashima <kuniyu@...zon.com>
Cc: davem@...emloft.net, dongml2@...natelecom.cn, dsahern@...nel.org,
edumazet@...gle.com, horms@...nel.org, kerneljasonxing@...il.com,
kuba@...nel.org, linux-kernel@...r.kernel.org, ncardwell@...gle.com,
netdev@...r.kernel.org, pabeni@...hat.com, petrm@...dia.com, yyd@...gle.com
Subject: Re: [RFC PATCH net-next] net: ip: add sysctl_ip_reuse_exact_match
On Fri, Feb 28, 2025 at 8:30 AM Kuniyuki Iwashima <kuniyu@...zon.com> wrote:
>
> From: Menglong Dong <menglong8.dong@...il.com>
> Date: Thu, 27 Feb 2025 20:31:37 +0800
> > For now, the socket lookup will terminate if the socket is reuse port in
> > inet_lhash2_lookup(), which makes the socket is not the best match.
> >
> > For example, we have socket1 listening on "0.0.0.0:1234" and socket2
> > listening on "192.168.1.1:1234", and both of them enabled reuse port. The
> > socket1 will always be matched when a connection with the peer ip
> > "192.168.1.xx" comes if the socket1 is created later than socket2. This
> > is not expected, as socket2 has higher priority.
> >
> > This can cause unexpected behavior if TCP MD5 keys is used, as described
> > in Documentation/networking/vrf.rst -> Applications.
>
> Could you provide a minimal repro setup ?
> I somehow fail to reproduce the issue.
>
Thanks for your replying :/
Sorry that I described it wrong in the commit log. I problem is that:
socket1 and socket2 both listen on "0.0.0.0:1234", but socket1 bind
on "eth0". We create socket1 first, and then socket2. Then, all connections
will goto socket2, which is not expected, as socket1 has higher priority.
You can reproduce it with following code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>
#include <linux/ip.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
int main(int argc, char** argv)
{
int listenfd, connfd;
struct sockaddr_in servaddr, client;
socklen_t clen;
char buff[4096];
int one = 1;
int n;
if ((listenfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
printf("create socket error: %s(errno: %d)\n", strerror(errno), errno);
exit(0);
}
memset(&servaddr, 0, sizeof(servaddr));
servaddr.sin_family = AF_INET;
servaddr.sin_addr.s_addr = htonl(INADDR_ANY);
servaddr.sin_port = htons(6666);
setsockopt(listenfd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
setsockopt(listenfd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
if (bind(listenfd, (struct sockaddr*)&servaddr, sizeof(servaddr)) == -1) {
printf("bind socket error: %s(errno: %d)\n", strerror(errno), errno);
exit(0);
}
if (argc > 1) {
setsockopt(listenfd, SOL_SOCKET, SO_BINDTODEVICE, argv[1],
strlen(argv[1]) + 1);
}
if (listen(listenfd, 10) == -1) {
printf("listen socket error: %s(errno: %d)\n", strerror(errno), errno);
exit(0);
}
printf("======waiting for client's request======\n");
while (1) {
clen = sizeof(struct sockaddr);
if ((connfd = accept(listenfd, (struct sockaddr*)&client,
&clen)) == -1) {
printf("accept socket error: %s(errno: %d)",
strerror(errno), errno);
continue;
}
printf("recv msg from client: %x\n", client.sin_addr.s_addr);
}
close(listenfd);
return 0;
}
>
> > Introduce the sysctl_ip_reuse_exact_match to make it find a best matched
> > socket when reuse port is used.
>
> I think we should not introduce a new sysctl knob and an extra lookup,
> rather we can solve that in __inet_hash() taking d894ba18d4e4
> ("soreuseport: fix ordering for mixed v4/v6 sockets") further.
>
> Could you test this patch ?
Sorry for my incorrect commit log, and this patch can't solve
the problem that I had. Maybe we can compare the score of
the socket in the list when we insert the socket to the listening
hash table to place it in a proper place. However, it will make
the inserting complex :/
Thanks!
Menglong Dong
>
> ---8<---
> $ git show
> commit 4dbc44a153afe51a2b2698a55213e625a02e23c8
> Author: Kuniyuki Iwashima <kuniyu@...zon.com>
> Date: Thu Feb 27 19:53:43 2025 +0000
>
> tcp: Place non-wildcard sockets before wildcard ones in lhash2.
>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@...zon.com>
>
> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index 5eea47f135a4..115248bfe463 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -136,6 +136,9 @@ struct inet_bind_hashbucket {
> struct inet_listen_hashbucket {
> spinlock_t lock;
> struct hlist_nulls_head nulls_head;
> +#if IS_ENABLED(CONFIG_IPV6)
> + struct hlist_nulls_node *wildcard_node;
> +#endif
> };
>
> /* This is for listening sockets, thus all sockets which possess wildcards. */
> diff --git a/include/net/sock.h b/include/net/sock.h
> index efc031163c33..4e8e10d2067b 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -863,6 +863,16 @@ static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nu
> hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
> }
>
> +static inline void __sk_nulls_add_node_before_rcu(struct sock *sk, struct hlist_nulls_node *next)
> +{
> + struct hlist_nulls_node *n = &sk->sk_nulls_node;
> +
> + WRITE_ONCE(n->pprev, next->pprev);
> + WRITE_ONCE(n->next, next);
> + WRITE_ONCE(next->pprev, &n->next);
> + WRITE_ONCE(*(n->pprev), n);
> +}
> +
> static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
> {
> sock_hold(sk);
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index ecda4c65788c..acfb693bb1d4 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -729,6 +729,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
> {
> struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
> struct inet_listen_hashbucket *ilb2;
> + bool add_tail = false;
> int err = 0;
>
> if (sk->sk_state != TCP_LISTEN) {
> @@ -737,21 +738,47 @@ int __inet_hash(struct sock *sk, struct sock *osk)
> local_bh_enable();
> return 0;
> }
> +
> WARN_ON(!sk_unhashed(sk));
> ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
>
> spin_lock(&ilb2->lock);
> +
> if (sk->sk_reuseport) {
> err = inet_reuseport_add_sock(sk, ilb2);
> if (err)
> goto unlock;
> +
> + if (inet_rcv_saddr_any(sk))
> + add_tail = true;
> }
> +
> sock_set_flag(sk, SOCK_RCU_FREE);
> - if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
> - sk->sk_family == AF_INET6)
> - __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
> - else
> - __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> + if (sk->sk_family == AF_INET6) {
> + if (add_tail || !ilb2->wildcard_node)
> + __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
> + else
> + __sk_nulls_add_node_before_rcu(sk, ilb2->wildcard_node);
> + } else
> +#endif
> + {
> + if (!add_tail)
> + __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
> +#if IS_ENABLED(CONFIG_IPV6)
> + else if (ilb2->wildcard_node)
> + __sk_nulls_add_node_before_rcu(sk, ilb2->wildcard_node);
> +#endif
> + else
> + __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
> + }
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> + if (add_tail && (sk->sk_family == AF_INET || !ilb2->wildcard_node))
> + ilb2->wildcard_node = &sk->sk_nulls_node;
> +#endif
> +
> sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
> unlock:
> spin_unlock(&ilb2->lock);
> @@ -794,6 +821,15 @@ void inet_unhash(struct sock *sk)
> if (rcu_access_pointer(sk->sk_reuseport_cb))
> reuseport_stop_listen_sock(sk);
>
> +#if IS_ENABLED(CONFIG_IPV6)
> + if (&sk->sk_nulls_node == ilb2->wildcard_node) {
> + if (is_a_nulls(sk->sk_nulls_node.next))
> + ilb2->wildcard_node = NULL;
> + else
> + ilb2->wildcard_node = sk->sk_nulls_node.next;
> + }
> +#endif
> +
> __sk_nulls_del_node_init_rcu(sk);
> sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
> spin_unlock(&ilb2->lock);
> @@ -1183,6 +1219,9 @@ static void init_hashinfo_lhash2(struct inet_hashinfo *h)
> spin_lock_init(&h->lhash2[i].lock);
> INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
> i + LISTENING_NULLS_BASE);
> +#if IS_ENABLED(CONFIG_IPV6)
> + h->lhash2[i].wildcard_node = NULL;
> +#endif
> }
> }
>
> ---8<---
Powered by blists - more mailing lists