lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250302124237.3913746-5-edumazet@google.com>
Date: Sun,  2 Mar 2025 12:42:37 +0000
From: Eric Dumazet <edumazet@...gle.com>
To: "David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, 
	Paolo Abeni <pabeni@...hat.com>, Neal Cardwell <ncardwell@...gle.com>
Cc: Kuniyuki Iwashima <kuniyu@...zon.com>, Jason Xing <kerneljasonxing@...il.com>, 
	Simon Horman <horms@...nel.org>, netdev@...r.kernel.org, eric.dumazet@...il.com, 
	Eric Dumazet <edumazet@...gle.com>
Subject: [PATCH net-next 4/4] tcp: use RCU lookup in __inet_hash_connect()

When __inet_hash_connect() has to try many 4-tuples before
finding an available one, we see a high spinlock cost from
the many spin_lock_bh(&head->lock) performed in its loop.

This patch adds an RCU lookup to avoid the spinlock cost.

check_established() gets a new @rcu_lookup argument.
First reason is to not make any changes while head->lock
is not held.
Second reason is to not make this RCU lookup a second time
after the spinlock has been acquired.

Tested:

Server:

ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog

Client:

ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server

Before series:

  utime_start=0.288582
  utime_end=1.548707
  stime_start=20.637138
  stime_end=2002.489845
  num_transactions=484453
  latency_min=0.156279245
  latency_max=20.922042756
  latency_mean=1.546521274
  latency_stddev=3.936005194
  num_samples=312537
  throughput=47426.00

perf top on the client:

 49.54%  [kernel]       [k] _raw_spin_lock
 25.87%  [kernel]       [k] _raw_spin_lock_bh
  5.97%  [kernel]       [k] queued_spin_lock_slowpath
  5.67%  [kernel]       [k] __inet_hash_connect
  3.53%  [kernel]       [k] __inet6_check_established
  3.48%  [kernel]       [k] inet6_ehashfn
  0.64%  [kernel]       [k] rcu_all_qs

After this series:

  utime_start=0.271607
  utime_end=3.847111
  stime_start=18.407684
  stime_end=1997.485557
  num_transactions=1350742
  latency_min=0.014131929
  latency_max=17.895073144
  latency_mean=0.505675853  # Nice reduction of latency metrics
  latency_stddev=2.125164772
  num_samples=307884
  throughput=139866.80      # 190 % increase

perf top on client:

 56.86%  [kernel]       [k] __inet6_check_established
 17.96%  [kernel]       [k] __inet_hash_connect
 13.88%  [kernel]       [k] inet6_ehashfn
  2.52%  [kernel]       [k] rcu_all_qs
  2.01%  [kernel]       [k] __cond_resched
  0.41%  [kernel]       [k] _raw_spin_lock

Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
 include/net/inet_hashtables.h |  3 +-
 net/ipv4/inet_hashtables.c    | 52 +++++++++++++++++++++++------------
 net/ipv6/inet6_hashtables.c   | 24 ++++++++--------
 3 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 73c0e4087fd1a6d0d2a40ab0394165e07b08ed6d..b12797f13c9a3d66fab99c877d059f9c29c30d11 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -529,7 +529,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			struct sock *sk, u64 port_offset,
 			int (*check_established)(struct inet_timewait_death_row *,
 						 struct sock *, __u16,
-						 struct inet_timewait_sock **));
+						 struct inet_timewait_sock **,
+						 bool rcu_lookup));
 
 int inet_hash_connect(struct inet_timewait_death_row *death_row,
 		      struct sock *sk);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index b737e13f8459c53428980221355344327c4bc8dd..d1b5f45ee718410fdf3e78c113c7ebd4a1ddba40 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -537,7 +537,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established);
 /* called with local bh disabled */
 static int __inet_check_established(struct inet_timewait_death_row *death_row,
 				    struct sock *sk, __u16 lport,
-				    struct inet_timewait_sock **twp)
+				    struct inet_timewait_sock **twp,
+				    bool rcu_lookup)
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	struct inet_sock *inet = inet_sk(sk);
@@ -556,17 +557,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	struct sock *sk2;
 	spinlock_t *lock;
 
-	rcu_read_lock();
-	sk_nulls_for_each(sk2, node, &head->chain) {
-		if (sk2->sk_hash != hash ||
-		    !inet_match(net, sk2, acookie, ports, dif, sdif))
-			continue;
-		if (sk2->sk_state == TCP_TIME_WAIT)
-			break;
-		rcu_read_unlock();
-		return -EADDRNOTAVAIL;
+	if (rcu_lookup) {
+		sk_nulls_for_each(sk2, node, &head->chain) {
+			if (sk2->sk_hash != hash ||
+			    !inet_match(net, sk2, acookie, ports, dif, sdif))
+				continue;
+			if (sk2->sk_state == TCP_TIME_WAIT)
+				break;
+			return -EADDRNOTAVAIL;
+		}
+		return 0;
 	}
-	rcu_read_unlock();
 
 	lock = inet_ehash_lockp(hinfo, hash);
 	spin_lock(lock);
@@ -1007,7 +1008,8 @@ static u32 *table_perturb;
 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		struct sock *sk, u64 port_offset,
 		int (*check_established)(struct inet_timewait_death_row *,
-			struct sock *, __u16, struct inet_timewait_sock **))
+			struct sock *, __u16, struct inet_timewait_sock **,
+			bool rcu_lookup))
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	struct inet_bind_hashbucket *head, *head2;
@@ -1025,7 +1027,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 
 	if (port) {
 		local_bh_disable();
-		ret = check_established(death_row, sk, port, NULL);
+		ret = check_established(death_row, sk, port, NULL, false);
 		local_bh_enable();
 		return ret;
 	}
@@ -1061,6 +1063,21 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			continue;
 		head = &hinfo->bhash[inet_bhashfn(net, port,
 						  hinfo->bhash_size)];
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(tb, &head->chain, node) {
+			if (!inet_bind_bucket_match(tb, net, port, l3mdev))
+				continue;
+			if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
+				rcu_read_unlock();
+				goto next_port;
+			}
+			if (!check_established(death_row, sk, port, &tw, true))
+				break;
+			rcu_read_unlock();
+			goto next_port;
+		}
+		rcu_read_unlock();
+
 		spin_lock_bh(&head->lock);
 
 		/* Does not bother with rcv_saddr checks, because
@@ -1070,12 +1087,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
 				if (tb->fastreuse >= 0 ||
 				    tb->fastreuseport >= 0)
-					goto next_port;
+					goto next_port_unlock;
 				WARN_ON(hlist_empty(&tb->bhash2));
 				if (!check_established(death_row, sk,
-						       port, &tw))
+						       port, &tw, false))
 					goto ok;
-				goto next_port;
+				goto next_port_unlock;
 			}
 		}
 
@@ -1089,8 +1106,9 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		tb->fastreuse = -1;
 		tb->fastreuseport = -1;
 		goto ok;
-next_port:
+next_port_unlock:
 		spin_unlock_bh(&head->lock);
+next_port:
 		cond_resched();
 	}
 
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 3604a5cae5d29a25d24f9513308334ff8e64b083..9be315496459fcb391123a07ac887e2f59d27360 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -263,7 +263,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup);
 
 static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 				     struct sock *sk, const __u16 lport,
-				     struct inet_timewait_sock **twp)
+				     struct inet_timewait_sock **twp,
+				     bool rcu_lookup)
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	struct inet_sock *inet = inet_sk(sk);
@@ -281,17 +282,18 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	struct sock *sk2;
 	spinlock_t *lock;
 
-	rcu_read_lock();
-	sk_nulls_for_each(sk2, node, &head->chain) {
-		if (sk2->sk_hash != hash ||
-		    !inet6_match(net, sk2, saddr, daddr, ports, dif, sdif))
-			continue;
-		if (sk2->sk_state == TCP_TIME_WAIT)
-			break;
-		rcu_read_unlock();
-		return -EADDRNOTAVAIL;
+	if (rcu_lookup) {
+		sk_nulls_for_each(sk2, node, &head->chain) {
+			if (sk2->sk_hash != hash ||
+			    !inet6_match(net, sk2, saddr, daddr,
+					 ports, dif, sdif))
+				continue;
+			if (sk2->sk_state == TCP_TIME_WAIT)
+				break;
+			return -EADDRNOTAVAIL;
+		}
+		return 0;
 	}
-	rcu_read_unlock();
 
 	lock = inet_ehash_lockp(hinfo, hash);
 	spin_lock(lock);
-- 
2.48.1.711.g2feabab25a-goog


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ