Message-ID: <c075e7c4dcd787cca604f1cb1ddd9c6fc077a3c4.camel@redhat.com>
Date:   Tue, 28 Jun 2022 12:32:25 +0200
From:   Paolo Abeni <pabeni@...hat.com>
To:     Joanne Koong <joannelkoong@...il.com>, netdev@...r.kernel.org
Cc:     edumazet@...gle.com, kafai@...com, kuba@...nel.org,
        davem@...emloft.net
Subject: Re: [PATCH net-next v1 1/3] net: Add a bhash2 table hashed by port
 + address

On Thu, 2022-06-23 at 16:42 -0700, Joanne Koong wrote:
> The current bind hashtable (bhash) is hashed by port only.
> In the socket bind path, we have to check for bind conflicts by
> traversing the specified port's inet_bind_bucket while holding the
> hashbucket's spinlock (see inet_csk_get_port() and
> inet_csk_bind_conflict()). In instances where there are tons of
> sockets hashed to the same port at different addresses, the bind
> conflict check is time-intensive and can cause softirq cpu lockups,
> as well as stall new tcp connections, since __inet_inherit_port()
> also contends for the spinlock.
> 
> This patch adds a second bind table, bhash2, that hashes by
> port and sk->sk_rcv_saddr (ipv4) and sk->sk_v6_rcv_saddr (ipv6).
> Searching the bhash2 table leads to significantly faster conflict
> resolution and less time holding the hashbucket spinlock.
> 
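Just to double check I'm reading this right: the difference boils down
to how the bucket is selected, roughly (a sketch using the helpers from
the header hunk below):

        /* bhash: bucket keyed by the local port only */
        head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];

        /* bhash2: bucket keyed by (port, rcv_saddr) via the new helper */
        head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

so the conflict scan can walk a (usually much shorter) per-address
chain instead of every socket bound to the port.
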
> Please note a few things:
> * There can be the case where a socket's address changes after it
> has been bound. There are two cases where this happens:
> 
>   1) The case where there is a bind() call on INADDR_ANY (ipv4) or
>   IPV6_ADDR_ANY (ipv6) and then a connect() call. The kernel will
>   assign the socket an address when it handles the connect().
> 
>   2) In inet_sk_reselect_saddr(), which is called when rebuilding the
>   sk header and a few pre-conditions are met (e.g. rerouting fails).
> 
> In these two cases, we need to update the bhash2 table by removing the
> entry for the old address and adding a new entry that reflects the
> updated address.
> 
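To restate the contract here, as I understand it from the dccp/tcp
hunks below, the caller is expected to do roughly the following (just a
sketch; the prev_* names mirror the dccp_v4_connect() hunk, new_saddr
is only a placeholder):

        prev_addr_hashbucket = inet_bhashfn_portaddr(hinfo, sk, sock_net(sk),
                                                     inet->inet_num);
        prev_sk_rcv_saddr = sk->sk_rcv_saddr;

        /* update the address first ... */
        sk_rcv_saddr_set(sk, new_saddr);

        /* ... then move the socket to its new bhash2 bucket */
        err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
        if (err)
                /* roll back the address change on failure */
                sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);

i.e. the old bucket must be looked up before the address is changed.
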
> * The bhash2 table must have its own lock, even though concurrent
> accesses on the same port are protected by the bhash lock: sockets on
> different ports can hash to different bhash hashbuckets but to the
> same bhash2 hashbucket, so the bhash lock alone is not enough.
> 
> This brings up a few stipulations:
>   1) When acquiring both the bhash and the bhash2 lock, the bhash2 lock
>   will always be acquired after the bhash lock and released before the
>   bhash lock is released.
> 
>   2) There are no nested bhash2 hashbucket locks. A bhash2 lock is always
>   acquired+released before another bhash2 lock is acquired+released.
> 
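If I read the locking rules correctly, the expected pattern is
something like this (just a sketch, the bucket names are illustrative):

        spin_lock_bh(&head->lock);      /* bhash bucket, always taken first */
        spin_lock(&head2->lock);        /* bhash2 bucket, nested inside */

        /* ... scan/update both tables ... */

        spin_unlock(&head2->lock);      /* bhash2 lock released first */
        spin_unlock_bh(&head->lock);

with no two bhash2 bucket locks ever held at the same time.
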
> * The bhash table cannot be superseded by the bhash2 table because for
> bind requests on INADDR_ANY (ipv4) or IPV6_ADDR_ANY (ipv6), every socket
> bound to that port must be checked for a potential conflict. The bhash
> table is the only source of port->socket associations.
> 
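A sketch of why the port-only table is still needed: a wildcard bind,
say bind(0.0.0.0:8080), must be checked against every socket bound to
port 8080 regardless of its address, and only the bhash chain covers
all of them (the loop body below is pseudo-code):

        sk_for_each_bound(sk2, &tb->owners) {
                /* address-independent conflict checks against sk2 ... */
        }
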
> Signed-off-by: Joanne Koong <joannelkoong@...il.com>
> ---
>  include/net/inet_connection_sock.h |   3 +
>  include/net/inet_hashtables.h      |  80 ++++++++-
>  include/net/sock.h                 |  17 +-
>  net/dccp/ipv4.c                    |  24 ++-
>  net/dccp/ipv6.c                    |  12 ++
>  net/dccp/proto.c                   |  34 +++-
>  net/ipv4/af_inet.c                 |  31 +++-
>  net/ipv4/inet_connection_sock.c    | 279 ++++++++++++++++++++++-------
>  net/ipv4/inet_hashtables.c         | 277 ++++++++++++++++++++++++++--
>  net/ipv4/tcp.c                     |  11 +-
>  net/ipv4/tcp_ipv4.c                |  21 ++-
>  net/ipv6/tcp_ipv6.c                |  12 ++
>  12 files changed, 696 insertions(+), 105 deletions(-)
> 
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index 85cd695e7fd1..077cd730ce2f 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -25,6 +25,7 @@
>  #undef INET_CSK_CLEAR_TIMERS
>  
>  struct inet_bind_bucket;
> +struct inet_bind2_bucket;
>  struct tcp_congestion_ops;
>  
>  /*
> @@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops {
>   *
>   * @icsk_accept_queue:	   FIFO of established children
>   * @icsk_bind_hash:	   Bind node
> + * @icsk_bind2_hash:	   Bind node in the bhash2 table
>   * @icsk_timeout:	   Timeout
>   * @icsk_retransmit_timer: Resend (no ack)
>   * @icsk_rto:		   Retransmit timeout
> @@ -83,6 +85,7 @@ struct inet_connection_sock {
>  	struct inet_sock	  icsk_inet;
>  	struct request_sock_queue icsk_accept_queue;
>  	struct inet_bind_bucket	  *icsk_bind_hash;
> +	struct inet_bind2_bucket  *icsk_bind2_hash;
>  	unsigned long		  icsk_timeout;
>   	struct timer_list	  icsk_retransmit_timer;
>   	struct timer_list	  icsk_delack_timer;
> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index ebfa3df6f8dc..1e8a6ca5a988 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -23,6 +23,7 @@
>  
>  #include <net/inet_connection_sock.h>
>  #include <net/inet_sock.h>
> +#include <net/ip.h>
>  #include <net/sock.h>
>  #include <net/route.h>
>  #include <net/tcp_states.h>
> @@ -90,7 +91,28 @@ struct inet_bind_bucket {
>  	struct hlist_head	owners;
>  };
>  
> -static inline struct net *ib_net(struct inet_bind_bucket *ib)
> +struct inet_bind2_bucket {
> +	possible_net_t		ib_net;
> +	int			l3mdev;
> +	unsigned short		port;
> +	union {
> +#if IS_ENABLED(CONFIG_IPV6)
> +		struct in6_addr		v6_rcv_saddr;
> +#endif
> +		__be32			rcv_saddr;
> +	};
> +	/* Node in the bhash2 inet_bind_hashbucket chain */
> +	struct hlist_node	node;
> +	/* List of sockets hashed to this bucket */
> +	struct hlist_head	owners;
> +};
> +
> +static inline struct net *ib_net(const struct inet_bind_bucket *ib)
> +{
> +	return read_pnet(&ib->ib_net);
> +}
> +
> +static inline struct net *ib2_net(const struct inet_bind2_bucket *ib)
>  {
>  	return read_pnet(&ib->ib_net);
>  }
> @@ -133,7 +155,14 @@ struct inet_hashinfo {
>  	 * TCP hash as well as the others for fast bind/connect.
>  	 */
>  	struct kmem_cache		*bind_bucket_cachep;
> +	/* This bind table is hashed by local port */
>  	struct inet_bind_hashbucket	*bhash;
> +	struct kmem_cache		*bind2_bucket_cachep;
> +	/* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4)
> +	 * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used
> +	 * primarily for expediting bind conflict resolution.
> +	 */
> +	struct inet_bind_hashbucket	*bhash2;
>  	unsigned int			bhash_size;
>  
>  	/* The 2nd listener table hashed by local port and address */
> @@ -193,14 +222,61 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
>  void inet_bind_bucket_destroy(struct kmem_cache *cachep,
>  			      struct inet_bind_bucket *tb);
>  
> +bool inet_bind_bucket_match(const struct inet_bind_bucket *tb,
> +			    const struct net *net, unsigned short port,
> +			    int l3mdev);
> +
> +struct inet_bind2_bucket *
> +inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
> +			 struct inet_bind_hashbucket *head,
> +			 unsigned short port, int l3mdev,
> +			 const struct sock *sk);
> +
> +void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
> +			       struct inet_bind2_bucket *tb);
> +
> +struct inet_bind2_bucket *
> +inet_bind2_bucket_find(const struct inet_bind_hashbucket *head,
> +		       const struct net *net,
> +		       unsigned short port, int l3mdev,
> +		       const struct sock *sk);
> +
> +bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb,
> +				      const struct net *net, unsigned short port,
> +				      int l3mdev, const struct sock *sk);
> +
>  static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
>  			       const u32 bhash_size)
>  {
>  	return (lport + net_hash_mix(net)) & (bhash_size - 1);
>  }
>  
> +static inline struct inet_bind_hashbucket *
> +inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk,
> +		      const struct net *net, unsigned short port)
> +{
> +	u32 hash;
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +	if (sk->sk_family == AF_INET6)
> +		hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port);
> +	else
> +#endif
> +		hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port);
> +	return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
> +}
> +
> +struct inet_bind_hashbucket *
> +inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port);
> +
> +/* This should be called whenever a socket's sk_rcv_saddr (ipv4) or
> + * sk_v6_rcv_saddr (ipv6) changes after it has been bound. The socket's
> + * rcv_saddr field should already have been updated when this is called.
> + */
> +int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk);
> +
>  void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
> -		    const unsigned short snum);
> +		    struct inet_bind2_bucket *tb2, unsigned short port);
>  
>  /* Caller must disable local BH processing. */
>  int __inet_inherit_port(const struct sock *sk, struct sock *child);
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 5bed1ea7a722..3932e3e96281 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -348,6 +348,7 @@ struct sk_filter;
>    *	@sk_txtime_report_errors: set report errors mode for SO_TXTIME
>    *	@sk_txtime_unused: unused txtime flags
>    *	@ns_tracker: tracker for netns reference
> +  *	@sk_bind2_node: bind node in the bhash2 table
>    */
>  struct sock {
>  	/*
> @@ -537,6 +538,7 @@ struct sock {
>  #endif
>  	struct rcu_head		sk_rcu;
>  	netns_tracker		ns_tracker;
> +	struct hlist_node	sk_bind2_node;
>  };
>  
>  enum sk_pacing {
> @@ -811,12 +813,21 @@ static inline void __sk_del_bind_node(struct sock *sk)
>  	__hlist_del(&sk->sk_bind_node);
>  }
>  
> -static inline void sk_add_bind_node(struct sock *sk,
> -					struct hlist_head *list)
> +static inline void sk_add_bind_node(struct sock *sk, struct hlist_head *list)
>  {
>  	hlist_add_head(&sk->sk_bind_node, list);
>  }

This is an unneeded chunk that increases the size of an already big
patch. I would have dropped it.
 
> +static inline void __sk_del_bind2_node(struct sock *sk)
> +{
> +	__hlist_del(&sk->sk_bind2_node);
> +}
> +
> +static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
> +{
> +	hlist_add_head(&sk->sk_bind2_node, list);
> +}
> +
>  #define sk_for_each(__sk, list) \
>  	hlist_for_each_entry(__sk, list, sk_node)
>  #define sk_for_each_rcu(__sk, list) \
> @@ -834,6 +845,8 @@ static inline void sk_add_bind_node(struct sock *sk,
>  	hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
>  #define sk_for_each_bound(__sk, list) \
>  	hlist_for_each_entry(__sk, list, sk_bind_node)
> +#define sk_for_each_bound_bhash2(__sk, list) \
> +	hlist_for_each_entry(__sk, list, sk_bind2_node)
>  
>  /**
>   * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
> diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
> index da6e3b20cd75..7958f5d355f3 100644
> --- a/net/dccp/ipv4.c
> +++ b/net/dccp/ipv4.c
> @@ -45,14 +45,15 @@ static unsigned int dccp_v4_pernet_id __read_mostly;
>  int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
>  {
>  	const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
> +	struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
> +	__be32 daddr, nexthop, prev_sk_rcv_saddr;
>  	struct inet_sock *inet = inet_sk(sk);
>  	struct dccp_sock *dp = dccp_sk(sk);
> +	struct ip_options_rcu *inet_opt;
>  	__be16 orig_sport, orig_dport;
> -	__be32 daddr, nexthop;
>  	struct flowi4 *fl4;
>  	struct rtable *rt;
>  	int err;
> -	struct ip_options_rcu *inet_opt;
>  
>  	dp->dccps_role = DCCP_ROLE_CLIENT;
>  
> @@ -89,9 +90,26 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
>  	if (inet_opt == NULL || !inet_opt->opt.srr)
>  		daddr = fl4->daddr;
>  
> -	if (inet->inet_saddr == 0)
> +	if (inet->inet_saddr == 0) {
> +		prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo,
> +							     sk, sock_net(sk),
> +							     inet->inet_num);
> +		prev_sk_rcv_saddr = sk->sk_rcv_saddr;
>  		inet->inet_saddr = fl4->saddr;
> +	}
> +
>  	sk_rcv_saddr_set(sk, inet->inet_saddr);
> +
> +	if (prev_addr_hashbucket) {
> +		err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
> +		if (err) {
> +			inet->inet_saddr = 0;
> +			sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
> +			ip_rt_put(rt);
> +			return err;
> +		}
> +	}
> +
>  	inet->inet_dport = usin->sin_port;
>  	sk_daddr_set(sk, daddr);
>  
> diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
> index fd44638ec16b..83843aea173c 100644
> --- a/net/dccp/ipv6.c
> +++ b/net/dccp/ipv6.c
> @@ -934,8 +934,20 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
>  	}
>  
>  	if (saddr == NULL) {
> +		struct in6_addr prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
> +		struct inet_bind_hashbucket *prev_addr_hashbucket;
> +
> +		prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo,
> +							     sk, sock_net(sk),
> +							     inet->inet_num);
>  		saddr = &fl6.saddr;
>  		sk->sk_v6_rcv_saddr = *saddr;
> +
> +		err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
> +		if (err) {
> +			sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr;
> +			goto failure;
> +		}
>  	}
>  
>  	/* set the source address */
> diff --git a/net/dccp/proto.c b/net/dccp/proto.c
> index eb8e128e43e8..f4f2ad5f9c08 100644
> --- a/net/dccp/proto.c
> +++ b/net/dccp/proto.c
> @@ -1120,6 +1120,12 @@ static int __init dccp_init(void)
>  				  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
>  	if (!dccp_hashinfo.bind_bucket_cachep)
>  		goto out_free_hashinfo2;
> +	dccp_hashinfo.bind2_bucket_cachep =
> +		kmem_cache_create("dccp_bind2_bucket",
> +				  sizeof(struct inet_bind2_bucket), 0,
> +				  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
> +	if (!dccp_hashinfo.bind2_bucket_cachep)
> +		goto out_free_bind_bucket_cachep;
>  
>  	/*
>  	 * Size and allocate the main established and bind bucket
> @@ -1150,7 +1156,7 @@ static int __init dccp_init(void)
>  
>  	if (!dccp_hashinfo.ehash) {
>  		DCCP_CRIT("Failed to allocate DCCP established hash table");
> -		goto out_free_bind_bucket_cachep;
> +		goto out_free_bind2_bucket_cachep;
>  	}
>  
>  	for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
> @@ -1176,14 +1182,24 @@ static int __init dccp_init(void)
>  		goto out_free_dccp_locks;
>  	}
>  
> +	dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *)
> +		__get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
> +
> +	if (!dccp_hashinfo.bhash2) {
> +		DCCP_CRIT("Failed to allocate DCCP bind2 hash table");
> +		goto out_free_dccp_bhash;
> +	}
> +
>  	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
>  		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
>  		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
> +		spin_lock_init(&dccp_hashinfo.bhash2[i].lock);
> +		INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
>  	}
>  
>  	rc = dccp_mib_init();
>  	if (rc)
> -		goto out_free_dccp_bhash;
> +		goto out_free_dccp_bhash2;
>  
>  	rc = dccp_ackvec_init();
>  	if (rc)
> @@ -1207,30 +1223,38 @@ static int __init dccp_init(void)
>  	dccp_ackvec_exit();
>  out_free_dccp_mib:
>  	dccp_mib_exit();
> +out_free_dccp_bhash2:
> +	free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
>  out_free_dccp_bhash:
>  	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
>  out_free_dccp_locks:
>  	inet_ehash_locks_free(&dccp_hashinfo);
>  out_free_dccp_ehash:
>  	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
> +out_free_bind2_bucket_cachep:
> +	kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
>  out_free_bind_bucket_cachep:
>  	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
>  out_free_hashinfo2:
>  	inet_hashinfo2_free_mod(&dccp_hashinfo);
>  out_fail:
>  	dccp_hashinfo.bhash = NULL;
> +	dccp_hashinfo.bhash2 = NULL;
>  	dccp_hashinfo.ehash = NULL;
>  	dccp_hashinfo.bind_bucket_cachep = NULL;
> +	dccp_hashinfo.bind2_bucket_cachep = NULL;
>  	return rc;
>  }
>  
>  static void __exit dccp_fini(void)
>  {
> +	int bhash_order = get_order(dccp_hashinfo.bhash_size *
> +				    sizeof(struct inet_bind_hashbucket));
> +
>  	ccid_cleanup_builtins();
>  	dccp_mib_exit();
> -	free_pages((unsigned long)dccp_hashinfo.bhash,
> -		   get_order(dccp_hashinfo.bhash_size *
> -			     sizeof(struct inet_bind_hashbucket)));
> +	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
> +	free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
>  	free_pages((unsigned long)dccp_hashinfo.ehash,
>  		   get_order((dccp_hashinfo.ehash_mask + 1) *
>  			     sizeof(struct inet_ehash_bucket)));
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index da81f56fdd1c..47b5fa4f8c24 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1218,13 +1218,15 @@ EXPORT_SYMBOL(inet_unregister_protosw);
>  
>  static int inet_sk_reselect_saddr(struct sock *sk)
>  {
> +	struct inet_bind_hashbucket *prev_addr_hashbucket;
>  	struct inet_sock *inet = inet_sk(sk);
>  	__be32 old_saddr = inet->inet_saddr;
>  	__be32 daddr = inet->inet_daddr;
> +	struct ip_options_rcu *inet_opt;
>  	struct flowi4 *fl4;
>  	struct rtable *rt;
>  	__be32 new_saddr;
> -	struct ip_options_rcu *inet_opt;
> +	int err;
>  
>  	inet_opt = rcu_dereference_protected(inet->inet_opt,
>  					     lockdep_sock_is_held(sk));
> @@ -1239,20 +1241,33 @@ static int inet_sk_reselect_saddr(struct sock *sk)
>  	if (IS_ERR(rt))
>  		return PTR_ERR(rt);
>  
> -	sk_setup_caps(sk, &rt->dst);
> -

I don't see why 'sk_setup_caps()' is moved. Additionally, it looks like
it's no longer called on the error path. This looks like an unrelated
"optimization"; I suggest dropping it.

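IOW I'd expect the pre-existing flow to be preserved, roughly (just a
sketch based on the removed lines above; the rest of this hunk is not
visible here):

        /* route lookup unchanged ... */
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        sk_setup_caps(sk, &rt->dst);

with sk_setup_caps() still called unconditionally right after the route
lookup, so the error paths added later in the function can't skip it.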

Thanks!

Paolo
