lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240711170023.63094-1-kuniyu@amazon.com>
Date: Thu, 11 Jul 2024 10:00:23 -0700
From: Kuniyuki Iwashima <kuniyu@...zon.com>
To: <heze0908@...il.com>
CC: <davem@...emloft.net>, <dsahern@...nel.org>, <edumazet@...gle.com>,
	<kernelxing@...cent.com>, <kuba@...nel.org>, <netdev@...r.kernel.org>,
	<pabeni@...hat.com>, <kuniyu@...zon.com>
Subject: Re: [PATCH net-next] inet: reduce the execution time of getsockname()

From: heze0908 <heze0908@...il.com>
Date: Thu, 11 Jul 2024 15:10:17 +0800
> From: Ze He <zanehe@...cent.com>
> 
> Recently, we received feedback regarding an increase
> in the time consumption of getsockname() in production.
> Therefore, we conducted tests based on the
> "getsockname" test item in libmicro. The test results
> indicate that compared to the kernel 5.4, the latest
> kernel indeed has an increased time consumption
> in getsockname().
> The test results are as follows:
> 
> case_name	kernel 5.4	latest kernel	  diff
> ----------	-----------	-------------	--------
> getsockname	  0.12278 	  0.18246	+48.61%
> 
> It was discovered that the introduction of lock_sock() in
> commit 9dfc685e0262 ("inet: remove races in inet{6}_getname()")
> to solve the data race problem between __inet_hash_connect()
> and inet_getname() has led to the increased time consumption.
> This patch attempts to propose a lockless solution to replace
> the spinlock solution.
> 
> We have to solve the race issue without heavy spin lock:
> one reader is reading some members in struct inet_sock
> while the other writer is trying to modify them. Those
> members are "inet_sport" "inet_saddr" "inet_dport"
> "inet_rcv_saddr". Therefore, in the path of getname, we
> use READ_ONCE to read these data, and correspondingly,
> in the path of tcp connect, we use WRITE_ONCE to write
> these data.
> 
> Using this patch, we conducted the getsockname test again,
> and the results are as follows:
> 
> case_name       latest kernel   latest kernel(patched)
> ----------      -----------     ---------------------
> getsockname       0.18246             0.14423
> 
> Signed-off-by: Jason Xing <kernelxing@...cent.com>
> Signed-off-by: Ze He <zanehe@...cent.com>
> ---
>  include/net/ip.h            |  3 ++-
>  net/ipv4/af_inet.c          | 27 +++++++++++++--------------
>  net/ipv4/inet_hashtables.c  |  8 ++++----
>  net/ipv4/tcp_ipv4.c         |  4 ++--
>  net/ipv6/af_inet6.c         | 22 ++++++++++------------
>  net/ipv6/inet6_hashtables.c |  2 +-
>  net/ipv6/tcp_ipv6.c         |  6 +++---
>  7 files changed, 35 insertions(+), 37 deletions(-)
> 
> diff --git a/include/net/ip.h b/include/net/ip.h
> index c5606cadb1a5..cec1919cfdd0 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -663,7 +663,8 @@ static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast,
>  
>  static __inline__ void inet_reset_saddr(struct sock *sk)
>  {
> -	inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0;
> +	WRITE_ONCE(inet_sk(sk)->inet_rcv_saddr, 0);
> +	WRITE_ONCE(inet_sk(sk)->inet_saddr, 0);
>  #if IS_ENABLED(CONFIG_IPV6)
>  	if (sk->sk_family == PF_INET6) {
>  		struct ipv6_pinfo *np = inet6_sk(sk);
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index b24d74616637..e8c035f23078 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -803,28 +803,27 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
>  	int sin_addr_len = sizeof(*sin);
>  
>  	sin->sin_family = AF_INET;
> -	lock_sock(sk);
>  	if (peer) {
> -		if (!inet->inet_dport ||
> +		__be16 dport = READ_ONCE(inet->inet_dport);
> +
> +		if (!dport ||
>  		    (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&

sk->sk_state will need annotation.


> -		     peer == 1)) {
> -			release_sock(sk);
> +		     peer == 1))
>  			return -ENOTCONN;
> -		}
> -		sin->sin_port = inet->inet_dport;
> +		sin->sin_port = dport;
>  		sin->sin_addr.s_addr = inet->inet_daddr;

READ_ONCE() is needed here, and WRITE_ONCE() in sk_daddr_set().


> -		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
> -				       CGROUP_INET4_GETPEERNAME);
> +		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len,
> +					    CGROUP_INET4_GETPEERNAME, NULL);
>  	} else {
> -		__be32 addr = inet->inet_rcv_saddr;
> +		__be32 addr = READ_ONCE(inet->inet_rcv_saddr);
> +
>  		if (!addr)
> -			addr = inet->inet_saddr;
> -		sin->sin_port = inet->inet_sport;
> +			addr = READ_ONCE(inet->inet_saddr);
> +		sin->sin_port = READ_ONCE(inet->inet_sport);
>  		sin->sin_addr.s_addr = addr;
> -		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
> -				       CGROUP_INET4_GETSOCKNAME);
> +		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len,
> +					    CGROUP_INET4_GETSOCKNAME, NULL);
>  	}
> -	release_sock(sk);
>  	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
>  	return sin_addr_len;
>  }
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index 48d0d494185b..9398dbf625b4 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -577,7 +577,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
>  	 * in hash table socket with a funny identity.
>  	 */
>  	inet->inet_num = lport;
> -	inet->inet_sport = htons(lport);
> +	WRITE_ONCE(inet->inet_sport, htons(lport));
>  	sk->sk_hash = hash;
>  	WARN_ON(!sk_unhashed(sk));
>  	__sk_nulls_add_node_rcu(sk, &head->chain);
> @@ -877,7 +877,7 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in
>  static void inet_update_saddr(struct sock *sk, void *saddr, int family)
>  {
>  	if (family == AF_INET) {
> -		inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
> +		WRITE_ONCE(inet_sk(sk)->inet_saddr, *(__be32 *)saddr);
>  		sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
>  	}
>  #if IS_ENABLED(CONFIG_IPV6)
> @@ -1115,7 +1115,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>  	inet_bind_hash(sk, tb, tb2, port);
>  
>  	if (sk_unhashed(sk)) {
> -		inet_sk(sk)->inet_sport = htons(port);
> +		WRITE_ONCE(inet_sk(sk)->inet_sport, htons(port));
>  		inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
>  	}
>  	if (tw)
> @@ -1140,7 +1140,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>  		spin_unlock(lock);
>  
>  		sk->sk_hash = 0;
> -		inet_sk(sk)->inet_sport = 0;
> +		WRITE_ONCE(inet_sk(sk)->inet_sport, 0);
>  		inet_sk(sk)->inet_num = 0;
>  
>  		if (tw)
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index fd17f25ff288..041a29d8a0fb 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -279,7 +279,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
>  			WRITE_ONCE(tp->write_seq, 0);
>  	}
>  
> -	inet->inet_dport = usin->sin_port;
> +	WRITE_ONCE(inet->inet_dport, usin->sin_port);
>  	sk_daddr_set(sk, daddr);
>  
>  	inet_csk(sk)->icsk_ext_hdr_len = 0;
> @@ -348,7 +348,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
>  	inet_bhash2_reset_saddr(sk);
>  	ip_rt_put(rt);
>  	sk->sk_route_caps = 0;
> -	inet->inet_dport = 0;
> +	WRITE_ONCE(inet->inet_dport, 0);
>  	return err;
>  }
>  EXPORT_SYMBOL(tcp_v4_connect);
> diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
> index e03fb9a1dbeb..241bc6d2d0a2 100644
> --- a/net/ipv6/af_inet6.c
> +++ b/net/ipv6/af_inet6.c
> @@ -532,32 +532,30 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
>  	sin->sin6_family = AF_INET6;
>  	sin->sin6_flowinfo = 0;
>  	sin->sin6_scope_id = 0;
> -	lock_sock(sk);
>  	if (peer) {
> -		if (!inet->inet_dport ||
> +		__be16 dport = READ_ONCE(inet->inet_dport);
> +
> +		if (!dport ||
>  		    (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
> -		    peer == 1)) {
> -			release_sock(sk);
> +		    peer == 1))
>  			return -ENOTCONN;
> -		}
> -		sin->sin6_port = inet->inet_dport;
> +		sin->sin6_port = dport;
>  		sin->sin6_addr = sk->sk_v6_daddr;

This access is also racy,


>  		if (inet6_test_bit(SNDFLOW, sk))
>  			sin->sin6_flowinfo = np->flow_label;
> -		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
> -				       CGROUP_INET6_GETPEERNAME);
> +		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len,
> +					    CGROUP_INET6_GETPEERNAME, NULL);
>  	} else {
>  		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
>  			sin->sin6_addr = np->saddr;
>  		else
>  			sin->sin6_addr = sk->sk_v6_rcv_saddr;

and same here.


> -		sin->sin6_port = inet->inet_sport;
> -		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
> -				       CGROUP_INET6_GETSOCKNAME);
> +		sin->sin6_port = READ_ONCE(inet->inet_sport);
> +		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len,
> +					    CGROUP_INET6_GETSOCKNAME, NULL);
>  	}
>  	sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
>  						 sk->sk_bound_dev_if);
> -	release_sock(sk);
>  	return sin_addr_len;
>  }
>  EXPORT_SYMBOL(inet6_getname);
> diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
> index 6db71bb1cd30..d5b191db9dfe 100644
> --- a/net/ipv6/inet6_hashtables.c
> +++ b/net/ipv6/inet6_hashtables.c
> @@ -302,7 +302,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
>  	 * in hash table socket with a funny identity.
>  	 */
>  	inet->inet_num = lport;
> -	inet->inet_sport = htons(lport);
> +	WRITE_ONCE(inet->inet_sport, htons(lport));
>  	sk->sk_hash = hash;
>  	WARN_ON(!sk_unhashed(sk));
>  	__sk_nulls_add_node_rcu(sk, &head->chain);
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 200fea92f12f..f78ab704378a 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -293,7 +293,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
>  
>  	/* set the source address */
>  	np->saddr = *saddr;
> -	inet->inet_rcv_saddr = LOOPBACK4_IPV6;
> +	WRITE_ONCE(inet->inet_rcv_saddr, LOOPBACK4_IPV6);
>  
>  	sk->sk_gso_type = SKB_GSO_TCPV6;
>  	ip6_dst_store(sk, dst, NULL, NULL);
> @@ -305,7 +305,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
>  
>  	tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
>  
> -	inet->inet_dport = usin->sin6_port;
> +	WRITE_ONCE(inet->inet_dport, usin->sin6_port);
>  
>  	tcp_set_state(sk, TCP_SYN_SENT);
>  	err = inet6_hash_connect(tcp_death_row, sk);
> @@ -340,7 +340,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
>  	tcp_set_state(sk, TCP_CLOSE);
>  	inet_bhash2_reset_saddr(sk);
>  failure:
> -	inet->inet_dport = 0;
> +	WRITE_ONCE(inet->inet_dport, 0);
>  	sk->sk_route_caps = 0;
>  	return err;
>  }
> -- 
> 2.43.5

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ