Message-ID: <20221221151258.25748-2-kuniyu@amazon.com>
Date: Thu, 22 Dec 2022 00:12:57 +0900
From: Kuniyuki Iwashima <kuniyu@...zon.com>
To: "David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>
CC: Jiri Slaby <jirislaby@...nel.org>,
Joanne Koong <joannelkoong@...il.com>,
Kuniyuki Iwashima <kuniyu@...zon.com>,
Kuniyuki Iwashima <kuni1840@...il.com>,
<netdev@...r.kernel.org>
Subject: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.

Jiri Slaby reported a regression of bind() with a simple repro. [0]
The repro creates a TIME_WAIT socket and tries to bind() a new socket
with the same local address and port. Before commit 28044fc1d495 ("net:
Add a bhash2 table hashed by port and address"), the bind() failed with
-EADDRINUSE, but now it succeeds.
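
For reference, a minimal userspace sketch of this kind of repro (not
Jiri's exact reproducer; the loopback address, ports 50000/50001, and
the one-second sleep are arbitrary assumptions, and error checking is
omitted for brevity):

/*
 * Hypothetical repro sketch: put a socket bound to 127.0.0.1:50001 into
 * TIME_WAIT, then bind() a fresh socket to the same address and port.
 * Expected result: bind() fails with EADDRINUSE.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static struct sockaddr_in addr(unsigned short port)
{
	struct sockaddr_in sa = {
		.sin_family = AF_INET,
		.sin_port = htons(port),
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
	};

	return sa;
}

int main(void)
{
	struct sockaddr_in srv = addr(50000), cli = addr(50001);
	int lsk, csk, ask, nsk;

	lsk = socket(AF_INET, SOCK_STREAM, 0);
	bind(lsk, (struct sockaddr *)&srv, sizeof(srv));
	listen(lsk, 1);

	/* The client explicitly binds to 127.0.0.1:50001 before connecting. */
	csk = socket(AF_INET, SOCK_STREAM, 0);
	bind(csk, (struct sockaddr *)&cli, sizeof(cli));
	connect(csk, (struct sockaddr *)&srv, sizeof(srv));
	ask = accept(lsk, NULL, NULL);

	/* The client closes first (active close), the server follows, so
	 * the client side ends up in TIME_WAIT on 127.0.0.1:50001.
	 */
	close(csk);
	close(ask);
	sleep(1);	/* let the FIN/ACK exchange finish on loopback */

	/* New socket, same address and port: should get EADDRINUSE while
	 * the TIME_WAIT socket exists.
	 */
	nsk = socket(AF_INET, SOCK_STREAM, 0);
	if (bind(nsk, (struct sockaddr *)&cli, sizeof(cli)) < 0)
		printf("bind: %s (expected EADDRINUSE)\n", strerror(errno));
	else
		printf("bind unexpectedly succeeded\n");

	close(nsk);
	close(lsk);
	return 0;
}
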
The cited commit should have added TIME_WAIT sockets to bhash2 as well;
without them, inet_bhash2_conflict() misses TIME_WAIT sockets when it
validates a bind() request for a non-wildcard address.

[0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
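
To illustrate the kind of check that goes wrong, here is a simplified
and purely hypothetical sketch of a (port, address)-keyed bind-conflict
walk; the types and names below are invented and are not the kernel's
bhash2 code. The point is only that such a check sees just the sockets
linked into the matching bucket's owners list, so a TIME_WAIT socket
that was never added there cannot trigger a conflict:

/* Hypothetical sketch of a (port, address)-keyed bind-conflict check;
 * the types and names are made up and do not match the kernel's code.
 */
#include <stdio.h>

struct fake_sock {
	unsigned int addr;
	unsigned short port;
	struct fake_sock *next;		/* link in the bucket's owners list */
};

struct fake_bind2_bucket {
	unsigned int addr;		/* bucket key: (address, port) */
	unsigned short port;
	struct fake_sock *owners;	/* sockets linked into this bucket */
};

/* The check walks only the owners list of the matching bucket, so a
 * TIME_WAIT socket that was never linked in is invisible to it.
 */
static int fake_bind2_conflict(const struct fake_bind2_bucket *tb2,
			       unsigned int addr, unsigned short port)
{
	const struct fake_sock *sk;

	if (tb2->addr != addr || tb2->port != port)
		return 0;

	for (sk = tb2->owners; sk; sk = sk->next)
		return 1;	/* some owner already holds (addr, port) */

	return 0;
}

int main(void)
{
	/* A TIME_WAIT socket still holds 10.0.0.1:8080, but it was never
	 * linked into the bucket's owners list.
	 */
	struct fake_bind2_bucket tb2 = {
		.addr = 0x0a000001, .port = 8080, .owners = NULL,
	};

	if (!fake_bind2_conflict(&tb2, 0x0a000001, 8080))
		printf("no conflict found: bind() would wrongly succeed\n");
	return 0;
}
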
Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
Reported-by: Jiri Slaby <jirislaby@...nel.org>
Signed-off-by: Kuniyuki Iwashima <kuniyu@...zon.com>
---
include/net/inet_timewait_sock.h | 2 ++
include/net/sock.h | 5 +++--
net/ipv4/inet_hashtables.c | 5 +++--
net/ipv4/inet_timewait_sock.c | 31 +++++++++++++++++++++++++++++--
4 files changed, 37 insertions(+), 6 deletions(-)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 5b47545f22d3..c46ed239ad9a 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -44,6 +44,7 @@ struct inet_timewait_sock {
#define tw_bound_dev_if __tw_common.skc_bound_dev_if
#define tw_node __tw_common.skc_nulls_node
#define tw_bind_node __tw_common.skc_bind_node
+#define tw_bind2_node __tw_common.skc_bind2_node
#define tw_refcnt __tw_common.skc_refcnt
#define tw_hash __tw_common.skc_hash
#define tw_prot __tw_common.skc_prot
@@ -73,6 +74,7 @@ struct inet_timewait_sock {
u32 tw_priority;
struct timer_list tw_timer;
struct inet_bind_bucket *tw_tb;
+ struct inet_bind2_bucket *tw_tb2;
};
#define tw_tclass tw_tos
diff --git a/include/net/sock.h b/include/net/sock.h
index dcd72e6285b2..aaec985c1b5b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
* @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
* [union with @skc_incoming_cpu]
* @skc_refcnt: reference count
+ * @skc_bind2_node: bind node in the bhash2 table
*
* This is the minimal network layer representation of sockets, the header
* for struct sock and struct inet_timewait_sock.
@@ -241,6 +242,7 @@ struct sock_common {
u32 skc_window_clamp;
u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */
};
+ struct hlist_node skc_bind2_node;
/* public: */
};
@@ -351,7 +353,6 @@ struct sk_filter;
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
* @ns_tracker: tracker for netns reference
- * @sk_bind2_node: bind node in the bhash2 table
*/
struct sock {
/*
@@ -384,6 +385,7 @@ struct sock {
#define sk_net_refcnt __sk_common.skc_net_refcnt
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_bind_node __sk_common.skc_bind_node
+#define sk_bind2_node __sk_common.skc_bind2_node
#define sk_prot __sk_common.skc_prot
#define sk_net __sk_common.skc_net
#define sk_v6_daddr __sk_common.skc_v6_daddr
@@ -542,7 +544,6 @@ struct sock {
#endif
struct rcu_head sk_rcu;
netns_tracker ns_tracker;
- struct hlist_node sk_bind2_node;
};
enum sk_pacing {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index d039b4e732a3..1e81dc7c6de4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -1103,15 +1103,16 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port);
- spin_unlock(&head2->lock);
-
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
}
if (tw)
inet_twsk_bind_unhash(tw, hinfo);
+
+ spin_unlock(&head2->lock);
spin_unlock(&head->lock);
+
if (tw)
inet_twsk_deschedule_put(tw);
local_bh_enable();
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 66fc940f9521..bec037d9ab8e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -29,6 +29,7 @@
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
{
+ struct inet_bind2_bucket *tb2 = tw->tw_tb2;
struct inet_bind_bucket *tb = tw->tw_tb;
if (!tb)
@@ -37,6 +38,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
__hlist_del(&tw->tw_bind_node);
tw->tw_tb = NULL;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+
+ __hlist_del(&tw->tw_bind2_node);
+ tw->tw_tb2 = NULL;
+ inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
+
__sock_put((struct sock *)tw);
}
@@ -45,7 +51,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
- struct inet_bind_hashbucket *bhead;
+ struct inet_bind_hashbucket *bhead, *bhead2;
spin_lock(lock);
sk_nulls_del_node_init_rcu((struct sock *)tw);
@@ -54,9 +60,13 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
/* Disassociate with bind bucket. */
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
hashinfo->bhash_size)];
+ bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
+ twsk_net(tw), tw->tw_num);
spin_lock(&bhead->lock);
+ spin_lock(&bhead2->lock);
inet_twsk_bind_unhash(tw, hashinfo);
+ spin_unlock(&bhead2->lock);
spin_unlock(&bhead->lock);
refcount_dec(&tw->tw_dr->tw_refcount);
@@ -93,6 +103,12 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
hlist_add_head(&tw->tw_bind_node, list);
}
+static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw,
+ struct hlist_head *list)
+{
+ hlist_add_head(&tw->tw_bind2_node, list);
+}
+
/*
* Enter the time wait state. This is called with locally disabled BH.
* Essentially we whip up a timewait bucket, copy the relevant info into it
@@ -105,17 +121,28 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
- struct inet_bind_hashbucket *bhead;
+ struct inet_bind_hashbucket *bhead, *bhead2;
+
/* Step 1: Put TW into bind hash. Original socket stays there too.
Note, that any socket with inet->num != 0 MUST be bound in
binding cache, even if it is closed.
*/
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
hashinfo->bhash_size)];
+ bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
+
spin_lock(&bhead->lock);
+ spin_lock(&bhead2->lock);
+
tw->tw_tb = icsk->icsk_bind_hash;
WARN_ON(!icsk->icsk_bind_hash);
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+
+ tw->tw_tb2 = icsk->icsk_bind2_hash;
+ WARN_ON(!icsk->icsk_bind2_hash);
+ inet_twsk_add_bind2_node(tw, &tw->tw_tb2->owners);
+
+ spin_unlock(&bhead2->lock);
spin_unlock(&bhead->lock);
spin_lock(lock);
--
2.30.2