Message-ID: <CANn89iKoxTRYpNgyA4g41bmY2k908g--_KYr5VtE3_HPObMtuw@mail.gmail.com>
Date: Tue, 10 May 2022 11:45:40 -0700
From: Eric Dumazet <edumazet@...gle.com>
To: Joanne Koong <joannelkoong@...il.com>
Cc: netdev <netdev@...r.kernel.org>, Martin KaFai Lau <kafai@...com>,
Jakub Kicinski <kuba@...nel.org>,
David Miller <davem@...emloft.net>
Subject: Re: [PATCH net-next v2 1/2] net: Add a second bind table hashed by
port and address
On Mon, May 9, 2022 at 5:54 PM Joanne Koong <joannelkoong@...il.com> wrote:
>
> We currently have one tcp bind table (bhash) which hashes by port
> number only. In the socket bind path, we check for bind conflicts by
> traversing the specified port's inet_bind_bucket while holding the
> bucket's spinlock (see inet_csk_get_port() and inet_csk_bind_conflict()).
>
> In instances where there are tons of sockets hashed to the same port
> at different addresses, checking for a bind conflict is time-intensive
> and can cause softirq cpu lockups and stall new tcp connections, since
> __inet_inherit_port() also contends for the spinlock.
>
> This patch proposes adding a second bind table, bhash2, that hashes by
> port and ip address. Searching the bhash2 table leads to significantly
> faster conflict resolution and less time holding the spinlock.
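
For readers skimming the patch, the intended fast path looks roughly
like this (my paraphrase, untested; example_conflict and the simplified
conflict rule are mine, the other names come from the patch):

	/* Caller holds the bhash bucket spinlock. */
	static bool example_conflict(struct inet_hashinfo *hinfo,
				     struct net *net, unsigned short port,
				     int l3mdev, struct sock *sk)
	{
		struct inet_bind2_hashbucket *head2;
		struct inet_bind2_bucket *tb2;
		struct sock *sk2;

		/* One lookup replaces walking every socket on the port. */
		tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk,
					     &head2);
		if (!tb2)
			return false;	/* nothing bound to (port, address) */
		/* Walk only the sockets sharing this (port, address). */
		sk_for_each_bound_bhash2(sk2, &tb2->owners) {
			if (sk2 != sk && !sk2->sk_reuse)
				return true;	/* simplified conflict rule */
		}
		return false;
	}
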
>
> Signed-off-by: Joanne Koong <joannelkoong@...il.com>
> ---
> include/net/inet_connection_sock.h | 3 +
> include/net/inet_hashtables.h | 56 ++++++-
> include/net/sock.h | 14 ++
> net/dccp/proto.c | 14 +-
> net/ipv4/inet_connection_sock.c | 227 +++++++++++++++++++++--------
> net/ipv4/inet_hashtables.c | 188 ++++++++++++++++++++++--
> net/ipv4/tcp.c | 14 +-
> 7 files changed, 438 insertions(+), 78 deletions(-)
>
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index 3908296d103f..d89a78d10294 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -25,6 +25,7 @@
> #undef INET_CSK_CLEAR_TIMERS
>
> struct inet_bind_bucket;
> +struct inet_bind2_bucket;
> struct tcp_congestion_ops;
>
> /*
> @@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops {
> *
> * @icsk_accept_queue: FIFO of established children
> * @icsk_bind_hash: Bind node
> + * @icsk_bind2_hash: Bind node in the bhash2 table
> * @icsk_timeout: Timeout
> * @icsk_retransmit_timer: Resend (no ack)
> * @icsk_rto: Retransmit timeout
> @@ -84,6 +86,7 @@ struct inet_connection_sock {
> struct inet_sock icsk_inet;
> struct request_sock_queue icsk_accept_queue;
> struct inet_bind_bucket *icsk_bind_hash;
> + struct inet_bind2_bucket *icsk_bind2_hash;
> unsigned long icsk_timeout;
> struct timer_list icsk_retransmit_timer;
> struct timer_list icsk_delack_timer;
> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index f72ec113ae56..143a33d815c2 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -90,11 +90,30 @@ struct inet_bind_bucket {
> struct hlist_head owners;
> };
>
> +struct inet_bind2_bucket {
> + possible_net_t ib_net;
> + int l3mdev;
> + unsigned short port;
> + union {
> +#if IS_ENABLED(CONFIG_IPV6)
> + struct in6_addr v6_rcv_saddr;
> +#endif
> + __be32 rcv_saddr;
> + };
> + struct hlist_node node; /* Node in the inet_bind2_hashbucket chain */
> + struct hlist_head owners; /* List of sockets hashed to this bucket */
> +};
> +
> static inline struct net *ib_net(struct inet_bind_bucket *ib)
> {
> return read_pnet(&ib->ib_net);
> }
>
> +static inline struct net *ib2_net(struct inet_bind2_bucket *ib)
> +{
> + return read_pnet(&ib->ib_net);
> +}
> +
> #define inet_bind_bucket_for_each(tb, head) \
> hlist_for_each_entry(tb, head, node)
>
> @@ -103,6 +122,15 @@ struct inet_bind_hashbucket {
> struct hlist_head chain;
> };
>
> +/* This is synchronized using the inet_bind_hashbucket's spinlock.
> + * Instead of having separate spinlocks, the inet_bind2_hashbucket can share
> + * the inet_bind_hashbucket's lock, given that in every case where the bhash2
> + * table is useful, a lookup in the bhash table also occurs.
> + */
> +struct inet_bind2_hashbucket {
> + struct hlist_head chain;
> +};
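
To make the shared-lock contract concrete, a minimal sketch (mine, not
from the patch), assuming bhash2 is sized like bhash and indexed with
the same inet_bhashfn():

	static void example_bhash2_walk(struct inet_hashinfo *hinfo,
					struct net *net, unsigned short port)
	{
		u32 i = inet_bhashfn(net, port, hinfo->bhash_size);
		struct inet_bind_hashbucket *head = &hinfo->bhash[i];
		struct inet_bind2_hashbucket *head2 = &hinfo->bhash2[i];
		struct inet_bind2_bucket *tb2;

		spin_lock_bh(&head->lock);	/* one lock, both chains */
		inet_bind_bucket_for_each(tb2, &head2->chain)
			;			/* examine tb2 under the lock */
		spin_unlock_bh(&head->lock);
	}
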
> +
> /* Sockets can be hashed in established or listening table.
> * We must use different 'nulls' end-of-chain value for all hash buckets :
> * A socket might transition from ESTABLISH to LISTEN state without
> @@ -138,6 +166,11 @@ struct inet_hashinfo {
> */
> struct kmem_cache *bind_bucket_cachep;
> struct inet_bind_hashbucket *bhash;
> + /* The 2nd binding table hashed by port and address.
> + * This is used primarily for expediting the resolution of bind conflicts.
> + */
> + struct kmem_cache *bind2_bucket_cachep;
> + struct inet_bind2_hashbucket *bhash2;
> unsigned int bhash_size;
>
> /* The 2nd listener table hashed by local port and address */
> @@ -221,6 +254,27 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
> void inet_bind_bucket_destroy(struct kmem_cache *cachep,
> struct inet_bind_bucket *tb);
>
> +static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb, struct net *net,
> + const unsigned short port, int l3mdev)
> +{
> + return net_eq(ib_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev;
> +}
> +
> +struct inet_bind2_bucket *
> +inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
> + struct inet_bind2_hashbucket *head, const unsigned short port,
> + int l3mdev, const struct sock *sk);
> +
> +void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb);
> +
> +struct inet_bind2_bucket *
> +inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net, const unsigned short port,
> + int l3mdev, struct sock *sk, struct inet_bind2_hashbucket **head);
> +
> +bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb, struct net *net,
> + const unsigned short port, int l3mdev,
> + const struct sock *sk);
> +
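
The (port, address) matching rule these helpers implement presumably
looks something like the following sketch (example_bind2_match is a
hypothetical name; the real helpers in the .c file may differ):

	static bool example_bind2_match(struct inet_bind2_bucket *tb,
					struct net *net, unsigned short port,
					int l3mdev, const struct sock *sk)
	{
		if (!net_eq(ib2_net(tb), net) || tb->port != port ||
		    tb->l3mdev != l3mdev)
			return false;
	#if IS_ENABLED(CONFIG_IPV6)
		if (sk->sk_family == AF_INET6)
			return ipv6_addr_equal(&tb->v6_rcv_saddr,
					       &sk->sk_v6_rcv_saddr);
	#endif
		return tb->rcv_saddr == sk->sk_rcv_saddr;
	}
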
> static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
> const u32 bhash_size)
> {
> @@ -228,7 +282,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
> }
>
> void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
> - const unsigned short snum);
> + struct inet_bind2_bucket *tb2, const unsigned short snum);
>
> /* These can have wildcards, don't try too hard. */
> static inline u32 inet_lhashfn(const struct net *net, const unsigned short num)
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 663041b92c21..1070c6cb36fb 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -351,6 +351,7 @@ struct sk_filter;
> * @sk_txtime_report_errors: set report errors mode for SO_TXTIME
> * @sk_txtime_unused: unused txtime flags
> * @ns_tracker: tracker for netns reference
> + * @sk_bind2_node: bind node in the bhash2 table
> */
> struct sock {
> /*
> @@ -540,6 +541,7 @@ struct sock {
> #endif
> struct rcu_head sk_rcu;
> netns_tracker ns_tracker;
> + struct hlist_node sk_bind2_node;
> };
>
> enum sk_pacing {
> @@ -820,6 +822,16 @@ static inline void sk_add_bind_node(struct sock *sk,
> hlist_add_head(&sk->sk_bind_node, list);
> }
>
> +static inline void __sk_del_bind2_node(struct sock *sk)
> +{
> + __hlist_del(&sk->sk_bind2_node);
> +}
> +
> +static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
> +{
> + hlist_add_head(&sk->sk_bind2_node, list);
> +}
> +
> #define sk_for_each(__sk, list) \
> hlist_for_each_entry(__sk, list, sk_node)
> #define sk_for_each_rcu(__sk, list) \
> @@ -837,6 +849,8 @@ static inline void sk_add_bind_node(struct sock *sk,
> hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
> #define sk_for_each_bound(__sk, list) \
> hlist_for_each_entry(__sk, list, sk_bind_node)
> +#define sk_for_each_bound_bhash2(__sk, list) \
> + hlist_for_each_entry(__sk, list, sk_bind2_node)
>
> /**
> * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
> diff --git a/net/dccp/proto.c b/net/dccp/proto.c
> index 58421f94427e..01498a821f3c 100644
> --- a/net/dccp/proto.c
> +++ b/net/dccp/proto.c
> @@ -1121,6 +1121,12 @@ static int __init dccp_init(void)
> SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
> if (!dccp_hashinfo.bind_bucket_cachep)
> goto out_free_hashinfo2;
> + dccp_hashinfo.bind2_bucket_cachep =
> + kmem_cache_create("dccp_bind2_bucket",
> + sizeof(struct inet_bind2_bucket), 0,
> + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
> + if (!dccp_hashinfo.bind2_bucket_cachep)
> + goto out_free_bind_bucket_cachep;
>
> /*
> * Size and allocate the main established and bind bucket
> @@ -1151,7 +1157,7 @@ static int __init dccp_init(void)
>
> if (!dccp_hashinfo.ehash) {
> DCCP_CRIT("Failed to allocate DCCP established hash table");
> - goto out_free_bind_bucket_cachep;
> + goto out_free_bind2_bucket_cachep;
> }
>
> for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
> @@ -1170,6 +1176,8 @@ static int __init dccp_init(void)
> continue;
> dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
> __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
> + dccp_hashinfo.bhash2 = (struct inet_bind2_hashbucket *)
> + __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
You have to take care of allocation errors, and make sure not to leak memory.
An alternative would be to allow bhash2 to be NULL, so that you do not
have to change dccp at all (only TCP would require bhash2 to be non-NULL).
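For the first option, something like this (completely untested) would
avoid the leak when only the second allocation fails:

	dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
		__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
	if (!dccp_hashinfo.bhash)
		continue;	/* retry at a smaller order */
	dccp_hashinfo.bhash2 = (struct inet_bind2_hashbucket *)
		__get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
	if (!dccp_hashinfo.bhash2) {
		/* retry at a smaller order without leaking bhash */
		free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
		dccp_hashinfo.bhash = NULL;
	}

(continue in a do/while jumps to the condition, so both failure paths
retry at a smaller order.)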
> } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
>
Can you rebase your patch and resend once this dccp issue is resolved?
It does not apply cleanly on net-next.
Thanks.