[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9eae0fc2-ea4f-20fe-7a0a-cae3a4bb9354@kernel.org>
Date: Wed, 21 Dec 2022 17:37:26 +0100
From: Jiri Slaby <jirislaby@...nel.org>
To: Kuniyuki Iwashima <kuniyu@...zon.com>,
"David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>
Cc: Joanne Koong <joannelkoong@...il.com>,
Kuniyuki Iwashima <kuni1840@...il.com>, netdev@...r.kernel.org
Subject: Re: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
On 21. 12. 22, 16:12, Kuniyuki Iwashima wrote:
> Jiri Slaby reported regression of bind() with a simple repro. [0]
>
> The repro creates a TIME_WAIT socket and tries to bind() a new socket
> with the same local address and port. Before commit 28044fc1d495 ("net:
> Add a bhash2 table hashed by port and address"), the bind() failed with
> -EADDRINUSE, but now it succeeds.
>
> The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
> inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
> requests if the address is not a wildcard one.
>
> [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
>
> Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
> Reported-by: Jiri Slaby <jirislaby@...nel.org>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@...zon.com>
Tested-by: Jiri Slaby <jirislaby@...nel.org>
> ---
> include/net/inet_timewait_sock.h | 2 ++
> include/net/sock.h | 5 +++--
> net/ipv4/inet_hashtables.c | 5 +++--
> net/ipv4/inet_timewait_sock.c | 31 +++++++++++++++++++++++++++++--
> 4 files changed, 37 insertions(+), 6 deletions(-)
>
> diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> index 5b47545f22d3..c46ed239ad9a 100644
> --- a/include/net/inet_timewait_sock.h
> +++ b/include/net/inet_timewait_sock.h
> @@ -44,6 +44,7 @@ struct inet_timewait_sock {
> #define tw_bound_dev_if __tw_common.skc_bound_dev_if
> #define tw_node __tw_common.skc_nulls_node
> #define tw_bind_node __tw_common.skc_bind_node
> +#define tw_bind2_node __tw_common.skc_bind2_node
> #define tw_refcnt __tw_common.skc_refcnt
> #define tw_hash __tw_common.skc_hash
> #define tw_prot __tw_common.skc_prot
> @@ -73,6 +74,7 @@ struct inet_timewait_sock {
> u32 tw_priority;
> struct timer_list tw_timer;
> struct inet_bind_bucket *tw_tb;
> + struct inet_bind2_bucket *tw_tb2;
> };
> #define tw_tclass tw_tos
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dcd72e6285b2..aaec985c1b5b 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
> * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
> * [union with @skc_incoming_cpu]
> * @skc_refcnt: reference count
> + * @skc_bind2_node: bind node in the bhash2 table
> *
> * This is the minimal network layer representation of sockets, the header
> * for struct sock and struct inet_timewait_sock.
> @@ -241,6 +242,7 @@ struct sock_common {
> u32 skc_window_clamp;
> u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */
> };
> + struct hlist_node skc_bind2_node;
> /* public: */
> };
>
> @@ -351,7 +353,6 @@ struct sk_filter;
> * @sk_txtime_report_errors: set report errors mode for SO_TXTIME
> * @sk_txtime_unused: unused txtime flags
> * @ns_tracker: tracker for netns reference
> - * @sk_bind2_node: bind node in the bhash2 table
> */
> struct sock {
> /*
> @@ -384,6 +385,7 @@ struct sock {
> #define sk_net_refcnt __sk_common.skc_net_refcnt
> #define sk_bound_dev_if __sk_common.skc_bound_dev_if
> #define sk_bind_node __sk_common.skc_bind_node
> +#define sk_bind2_node __sk_common.skc_bind2_node
> #define sk_prot __sk_common.skc_prot
> #define sk_net __sk_common.skc_net
> #define sk_v6_daddr __sk_common.skc_v6_daddr
> @@ -542,7 +544,6 @@ struct sock {
> #endif
> struct rcu_head sk_rcu;
> netns_tracker ns_tracker;
> - struct hlist_node sk_bind2_node;
> };
>
> enum sk_pacing {
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index d039b4e732a3..1e81dc7c6de4 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -1103,15 +1103,16 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> /* Head lock still held and bh's disabled */
> inet_bind_hash(sk, tb, tb2, port);
>
> - spin_unlock(&head2->lock);
> -
> if (sk_unhashed(sk)) {
> inet_sk(sk)->inet_sport = htons(port);
> inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
> }
> if (tw)
> inet_twsk_bind_unhash(tw, hinfo);
> +
> + spin_unlock(&head2->lock);
> spin_unlock(&head->lock);
> +
> if (tw)
> inet_twsk_deschedule_put(tw);
> local_bh_enable();
> diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
> index 66fc940f9521..bec037d9ab8e 100644
> --- a/net/ipv4/inet_timewait_sock.c
> +++ b/net/ipv4/inet_timewait_sock.c
> @@ -29,6 +29,7 @@
> void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
> struct inet_hashinfo *hashinfo)
> {
> + struct inet_bind2_bucket *tb2 = tw->tw_tb2;
> struct inet_bind_bucket *tb = tw->tw_tb;
>
> if (!tb)
> @@ -37,6 +38,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
> __hlist_del(&tw->tw_bind_node);
> tw->tw_tb = NULL;
> inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
> +
> + __hlist_del(&tw->tw_bind2_node);
> + tw->tw_tb2 = NULL;
> + inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
> +
> __sock_put((struct sock *)tw);
> }
>
> @@ -45,7 +51,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
> {
> struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
> spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
> - struct inet_bind_hashbucket *bhead;
> + struct inet_bind_hashbucket *bhead, *bhead2;
>
> spin_lock(lock);
> sk_nulls_del_node_init_rcu((struct sock *)tw);
> @@ -54,9 +60,13 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
> /* Disassociate with bind bucket. */
> bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
> hashinfo->bhash_size)];
> + bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
> + twsk_net(tw), tw->tw_num);
>
> spin_lock(&bhead->lock);
> + spin_lock(&bhead2->lock);
> inet_twsk_bind_unhash(tw, hashinfo);
> + spin_unlock(&bhead2->lock);
> spin_unlock(&bhead->lock);
>
> refcount_dec(&tw->tw_dr->tw_refcount);
> @@ -93,6 +103,12 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
> hlist_add_head(&tw->tw_bind_node, list);
> }
>
> +static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw,
> + struct hlist_head *list)
> +{
> + hlist_add_head(&tw->tw_bind2_node, list);
> +}
> +
> /*
> * Enter the time wait state. This is called with locally disabled BH.
> * Essentially we whip up a timewait bucket, copy the relevant info into it
> @@ -105,17 +121,28 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
> const struct inet_connection_sock *icsk = inet_csk(sk);
> struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
> spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
> - struct inet_bind_hashbucket *bhead;
> + struct inet_bind_hashbucket *bhead, *bhead2;
> +
> /* Step 1: Put TW into bind hash. Original socket stays there too.
> Note, that any socket with inet->num != 0 MUST be bound in
> binding cache, even if it is closed.
> */
> bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
> hashinfo->bhash_size)];
> + bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
> +
> spin_lock(&bhead->lock);
> + spin_lock(&bhead2->lock);
> +
> tw->tw_tb = icsk->icsk_bind_hash;
> WARN_ON(!icsk->icsk_bind_hash);
> inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
> +
> + tw->tw_tb2 = icsk->icsk_bind2_hash;
> + WARN_ON(!icsk->icsk_bind2_hash);
> + inet_twsk_add_bind2_node(tw, &tw->tw_tb2->owners);
> +
> + spin_unlock(&bhead2->lock);
> spin_unlock(&bhead->lock);
>
> spin_lock(lock);
--
js
suse labs
Powered by blists - more mailing lists