[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240711170023.63094-1-kuniyu@amazon.com>
Date: Thu, 11 Jul 2024 10:00:23 -0700
From: Kuniyuki Iwashima <kuniyu@...zon.com>
To: <heze0908@...il.com>
CC: <davem@...emloft.net>, <dsahern@...nel.org>, <edumazet@...gle.com>,
<kernelxing@...cent.com>, <kuba@...nel.org>, <netdev@...r.kernel.org>,
<pabeni@...hat.com>, <kuniyu@...zon.com>
Subject: Re: [PATCH net-next] inet: reduce the execution time of getsockname()
From: heze0908 <heze0908@...il.com>
Date: Thu, 11 Jul 2024 15:10:17 +0800
> From: Ze He <zanehe@...cent.com>
>
> Recently, we received feedback regarding an increase
> in the time consumption of getsockname() in production.
> Therefore, we conducted tests based on the
> "getsockname" test item in libmicro. The test results
> indicate that compared to the kernel 5.4, the latest
> kernel indeed has an increased time consumption
> in getsockname().
> The test results are as follows:
>
> case_name kernel 5.4 latest kernel diff
> ---------- ----------- ------------- --------
> getsockname 0.12278 0.18246 +48.61%
>
> It was discovered that the introduction of lock_sock() in
> commit 9dfc685e0262 ("inet: remove races in inet{6}_getname()")
> to solve the data race problem between __inet_hash_connect()
> and inet_getname() has led to the increased time consumption.
> This patch attempts to propose a lockless solution to replace
> the spinlock solution.
>
> We have to solve the race issue without heavy spin lock:
> one reader is reading some members in struct inet_sock
> while the other writer is trying to modify them. Those
> members are "inet_sport" "inet_saddr" "inet_dport"
> "inet_rcv_saddr". Therefore, in the path of getname, we
> use READ_ONCE to read these data, and correspondingly,
> in the path of tcp connect, we use WRITE_ONCE to write
> these data.
>
> Using this patch, we conducted the getsockname test again,
> and the results are as follows:
>
> case_name latest kernel latest kernel(patched)
> ---------- ----------- ---------------------
> getsockname 0.18246 0.14423
>
> Signed-off-by: Jason Xing <kernelxing@...cent.com>
> Signed-off-by: Ze He <zanehe@...cent.com>
> ---
> include/net/ip.h | 3 ++-
> net/ipv4/af_inet.c | 27 +++++++++++++--------------
> net/ipv4/inet_hashtables.c | 8 ++++----
> net/ipv4/tcp_ipv4.c | 4 ++--
> net/ipv6/af_inet6.c | 22 ++++++++++------------
> net/ipv6/inet6_hashtables.c | 2 +-
> net/ipv6/tcp_ipv6.c | 6 +++---
> 7 files changed, 35 insertions(+), 37 deletions(-)
>
> diff --git a/include/net/ip.h b/include/net/ip.h
> index c5606cadb1a5..cec1919cfdd0 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -663,7 +663,8 @@ static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast,
>
> static __inline__ void inet_reset_saddr(struct sock *sk)
> {
> - inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0;
> + WRITE_ONCE(inet_sk(sk)->inet_rcv_saddr, 0);
> + WRITE_ONCE(inet_sk(sk)->inet_saddr, 0);
> #if IS_ENABLED(CONFIG_IPV6)
> if (sk->sk_family == PF_INET6) {
> struct ipv6_pinfo *np = inet6_sk(sk);
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index b24d74616637..e8c035f23078 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -803,28 +803,27 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
> int sin_addr_len = sizeof(*sin);
>
> sin->sin_family = AF_INET;
> - lock_sock(sk);
> if (peer) {
> - if (!inet->inet_dport ||
> + __be16 dport = READ_ONCE(inet->inet_dport);
> +
> + if (!dport ||
> (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
sk->sk_state will need annotation.
> - peer == 1)) {
> - release_sock(sk);
> + peer == 1))
> return -ENOTCONN;
> - }
> - sin->sin_port = inet->inet_dport;
> + sin->sin_port = dport;
> sin->sin_addr.s_addr = inet->inet_daddr;
READ_ONCE() is needed here, and WRITE_ONCE() in sk_daddr_set().
> - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
> - CGROUP_INET4_GETPEERNAME);
> + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len,
> + CGROUP_INET4_GETPEERNAME, NULL);
> } else {
> - __be32 addr = inet->inet_rcv_saddr;
> + __be32 addr = READ_ONCE(inet->inet_rcv_saddr);
> +
> if (!addr)
> - addr = inet->inet_saddr;
> - sin->sin_port = inet->inet_sport;
> + addr = READ_ONCE(inet->inet_saddr);
> + sin->sin_port = READ_ONCE(inet->inet_sport);
> sin->sin_addr.s_addr = addr;
> - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
> - CGROUP_INET4_GETSOCKNAME);
> + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len,
> + CGROUP_INET4_GETSOCKNAME, NULL);
> }
> - release_sock(sk);
> memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
> return sin_addr_len;
> }
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index 48d0d494185b..9398dbf625b4 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -577,7 +577,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
> * in hash table socket with a funny identity.
> */
> inet->inet_num = lport;
> - inet->inet_sport = htons(lport);
> + WRITE_ONCE(inet->inet_sport, htons(lport));
> sk->sk_hash = hash;
> WARN_ON(!sk_unhashed(sk));
> __sk_nulls_add_node_rcu(sk, &head->chain);
> @@ -877,7 +877,7 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in
> static void inet_update_saddr(struct sock *sk, void *saddr, int family)
> {
> if (family == AF_INET) {
> - inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
> + WRITE_ONCE(inet_sk(sk)->inet_saddr, *(__be32 *)saddr);
> sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
> }
> #if IS_ENABLED(CONFIG_IPV6)
> @@ -1115,7 +1115,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> inet_bind_hash(sk, tb, tb2, port);
>
> if (sk_unhashed(sk)) {
> - inet_sk(sk)->inet_sport = htons(port);
> + WRITE_ONCE(inet_sk(sk)->inet_sport, htons(port));
> inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
> }
> if (tw)
> @@ -1140,7 +1140,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> spin_unlock(lock);
>
> sk->sk_hash = 0;
> - inet_sk(sk)->inet_sport = 0;
> + WRITE_ONCE(inet_sk(sk)->inet_sport, 0);
> inet_sk(sk)->inet_num = 0;
>
> if (tw)
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index fd17f25ff288..041a29d8a0fb 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -279,7 +279,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
> WRITE_ONCE(tp->write_seq, 0);
> }
>
> - inet->inet_dport = usin->sin_port;
> + WRITE_ONCE(inet->inet_dport, usin->sin_port);
> sk_daddr_set(sk, daddr);
>
> inet_csk(sk)->icsk_ext_hdr_len = 0;
> @@ -348,7 +348,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
> inet_bhash2_reset_saddr(sk);
> ip_rt_put(rt);
> sk->sk_route_caps = 0;
> - inet->inet_dport = 0;
> + WRITE_ONCE(inet->inet_dport, 0);
> return err;
> }
> EXPORT_SYMBOL(tcp_v4_connect);
> diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
> index e03fb9a1dbeb..241bc6d2d0a2 100644
> --- a/net/ipv6/af_inet6.c
> +++ b/net/ipv6/af_inet6.c
> @@ -532,32 +532,30 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
> sin->sin6_family = AF_INET6;
> sin->sin6_flowinfo = 0;
> sin->sin6_scope_id = 0;
> - lock_sock(sk);
> if (peer) {
> - if (!inet->inet_dport ||
> + __be16 dport = READ_ONCE(inet->inet_dport);
> +
> + if (!dport ||
> (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
> - peer == 1)) {
> - release_sock(sk);
> + peer == 1))
> return -ENOTCONN;
> - }
> - sin->sin6_port = inet->inet_dport;
> + sin->sin6_port = dport;
> sin->sin6_addr = sk->sk_v6_daddr;
This access is also racy,
> if (inet6_test_bit(SNDFLOW, sk))
> sin->sin6_flowinfo = np->flow_label;
> - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
> - CGROUP_INET6_GETPEERNAME);
> + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len,
> + CGROUP_INET6_GETPEERNAME, NULL);
> } else {
> if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
> sin->sin6_addr = np->saddr;
> else
> sin->sin6_addr = sk->sk_v6_rcv_saddr;
and same here.
> - sin->sin6_port = inet->inet_sport;
> - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
> - CGROUP_INET6_GETSOCKNAME);
> + sin->sin6_port = READ_ONCE(inet->inet_sport);
> + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len,
> + CGROUP_INET6_GETSOCKNAME, NULL);
> }
> sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
> sk->sk_bound_dev_if);
> - release_sock(sk);
> return sin_addr_len;
> }
> EXPORT_SYMBOL(inet6_getname);
> diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
> index 6db71bb1cd30..d5b191db9dfe 100644
> --- a/net/ipv6/inet6_hashtables.c
> +++ b/net/ipv6/inet6_hashtables.c
> @@ -302,7 +302,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
> * in hash table socket with a funny identity.
> */
> inet->inet_num = lport;
> - inet->inet_sport = htons(lport);
> + WRITE_ONCE(inet->inet_sport, htons(lport));
> sk->sk_hash = hash;
> WARN_ON(!sk_unhashed(sk));
> __sk_nulls_add_node_rcu(sk, &head->chain);
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 200fea92f12f..f78ab704378a 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -293,7 +293,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
>
> /* set the source address */
> np->saddr = *saddr;
> - inet->inet_rcv_saddr = LOOPBACK4_IPV6;
> + WRITE_ONCE(inet->inet_rcv_saddr, LOOPBACK4_IPV6);
>
> sk->sk_gso_type = SKB_GSO_TCPV6;
> ip6_dst_store(sk, dst, NULL, NULL);
> @@ -305,7 +305,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
>
> tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
>
> - inet->inet_dport = usin->sin6_port;
> + WRITE_ONCE(inet->inet_dport, usin->sin6_port);
>
> tcp_set_state(sk, TCP_SYN_SENT);
> err = inet6_hash_connect(tcp_death_row, sk);
> @@ -340,7 +340,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
> tcp_set_state(sk, TCP_CLOSE);
> inet_bhash2_reset_saddr(sk);
> failure:
> - inet->inet_dport = 0;
> + WRITE_ONCE(inet->inet_dport, 0);
> sk->sk_route_caps = 0;
> return err;
> }
> --
> 2.43.5
Powered by blists - more mailing lists