lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240729095554.28296-1-xiaolinkui@126.com>
Date: Mon, 29 Jul 2024 17:55:54 +0800
From: xiaolinkui@....com
To: edumazet@...gle.com,
	davem@...emloft.net,
	dsahern@...nel.org,
	kuba@...nel.org,
	pabeni@...hat.com
Cc: netdev@...r.kernel.org,
	Linkui Xiao <xiaolinkui@...inos.cn>
Subject: [PATCH] tcp/dccp: Add another way to allocate local ports in connect()

From: Linkui Xiao <xiaolinkui@...inos.cn>

Commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()") allocates even ports for connect() first while leaving
odd ports for bind() and this works well in busy servers.

But this strategy causes severe performance degradation in busy clients.
when a client has used more than half of the local ports setted in
proc/sys/net/ipv4/ip_local_port_range, if this client try to connect
to a server again, the connect time increases rapidly since it will
traverse all the even ports though they are exhausted.

So this path provides another strategy by introducing a system option:
local_port_allocation. If it is a busy client, users should set it to 1
to use sequential allocation while it should be set to 0 in other
situations. Its default value is 0.

In commit 207184853dbd ("tcp/dccp: change source port selection at
connect() time"), tell users that they can access all odd and even ports
by using IP_LOCAL_PORT_RANGE. But this requires users to modify the
socket application. When even numbered ports are not sufficient, use the
sysctl parameter to achieve the same effect:
	sysctl -w net.ipv4.local_port_allocation=1

Signed-off-by: Linkui Xiao <xiaolinkui@...inos.cn>
---
 include/net/tcp.h          |  1 +
 net/ipv4/inet_hashtables.c | 12 ++++++++----
 net/ipv4/sysctl_net_ipv4.c |  8 ++++++++
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2aac11e7e1cc..99969b8e5183 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -269,6 +269,7 @@ DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
 
 extern struct percpu_counter tcp_sockets_allocated;
 extern unsigned long tcp_memory_pressure;
+extern bool sysctl_local_port_allocation;
 
 /* optimized version of sk_under_memory_pressure() for TCP sockets */
 static inline bool tcp_under_memory_pressure(const struct sock *sk)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 48d0d494185b..e572f8b21b95 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -1020,11 +1020,15 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 	l3mdev = inet_sk_bound_l3mdev(sk);
 
 	local_ports = inet_sk_get_local_port_range(sk, &low, &high);
-	step = local_ports ? 1 : 2;
+	/* local_port_allocation 0 means even and odd port allocation strategy
+	 * will be applied, so step is 2; otherwise sequential allocation will
+	 * be used and step is 1. Default value is 0.
+	 */
+	step = sysctl_local_port_allocation ? 1 : 2;
 
 	high++; /* [32768, 60999] -> [32768, 61000[ */
 	remaining = high - low;
-	if (!local_ports && remaining > 1)
+	if (!sysctl_local_port_allocation && remaining > 1)
 		remaining &= ~1U;
 
 	get_random_sleepable_once(table_perturb,
@@ -1037,7 +1041,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 	/* In first pass we try ports of @low parity.
 	 * inet_csk_get_port() does the opposite choice.
 	 */
-	if (!local_ports)
+	if (!sysctl_local_port_allocation)
 		offset &= ~1U;
 other_parity_scan:
 	port = low + offset;
@@ -1081,7 +1085,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		cond_resched();
 	}
 
-	if (!local_ports) {
+	if (!sysctl_local_port_allocation) {
 		offset++;
 		if ((offset & 1) && remaining > 1)
 			goto other_parity_scan;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 9140d20eb2d4..1f6bf3a73516 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -45,6 +45,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
 static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
 static int tcp_plb_max_rounds = 31;
 static int tcp_plb_max_cong_thresh = 256;
+bool sysctl_local_port_allocation;
 
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
@@ -632,6 +633,13 @@ static struct ctl_table ipv4_table[] = {
 		.extra1		= &sysctl_fib_sync_mem_min,
 		.extra2		= &sysctl_fib_sync_mem_max,
 	},
+	{
+		.procname	= "local_port_allocation",
+		.data		= &sysctl_local_port_allocation,
+		.maxlen		= sizeof(sysctl_local_port_allocation),
+		.mode		= 0644,
+		.proc_handler	= proc_dobool,
+	},
 };
 
 static struct ctl_table ipv4_net_table[] = {
-- 
2.17.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ