lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Sat, 06 Jun 2015 08:53:05 -0700
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	David Miller <davem@...emloft.net>
Cc:	netdev <netdev@...r.kernel.org>,
	Michael Kerrisk <mtk.manpages@...il.com>,
	Maciej Żenczykowski <maze@...gle.com>
Subject: [PATCH net-next] inet: add IP_BIND_ADDRESS_NO_PORT to overcome
 bind(0) limitations

From: Eric Dumazet <edumazet@...gle.com>

When an application needs to force a source IP on an active TCP socket
it has to use bind(IP, port=x).

As most applications do not want to deal with already used ports, x is
often set to 0, meaning the kernel is in charge to find an available
port.
But kernel does not know yet if this socket is going to be a listener or
be connected.
It has very limited choices (no full knowledge of final 4-tuple for a
connect())

With limited ephemeral port range (about 32K ports), it is very easy to
fill the space.

This patch adds a new SOL_IP socket option, asking kernel to ignore
the 0 port provided by application in bind(IP, port=0) and only
remember the given IP address.

The port will be automatically chosen at connect() time, in a way
that allows sharing a source port as long as the 4-tuples are unique.

This new feature is available for both IPv4 and IPv6.

Tested:

Wrote a test program and checked its behavior.

strace(1) shows sequences of bind(IP=127.0.0.2, port=0) followed by
connect().
Also getsockname() show that the port is still 0 right after bind()
but properly allocated after connect().

socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
setsockopt(5, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0
connect(5, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.3")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(38050), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0

socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 6
setsockopt(6, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(6, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0
getsockname(6, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0
connect(6, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.4")}, 16) = 0
getsockname(6, {sa_family=AF_INET, sin_port=htons(35032), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0

I was able to bind()/connect() a million concurrent sockets, instead of
~32000 before patch.

lpaa23:~# ulimit -n 1000010
lpaa23:~# ./bind --connect --num-flows=1000000 &
1000000 sockets

lpaa23:~# grep TCP /proc/net/sockstat
TCP: inuse 2000063 orphan 0 tw 47 alloc 2000157 mem 66

Check that a given source port is indeed used by many different
connections :

lpaa23:~# ss -t src :40000 | head -10
State      Recv-Q Send-Q   Local Address:Port          Peer Address:Port
ESTAB      0      0           127.0.0.2:40000         127.0.202.33:44983
ESTAB      0      0           127.0.0.2:40000         127.2.27.240:44983
ESTAB      0      0           127.0.0.2:40000           127.2.98.5:44983
ESTAB      0      0           127.0.0.2:40000        127.0.124.196:44983
ESTAB      0      0           127.0.0.2:40000         127.2.139.38:44983
ESTAB      0      0           127.0.0.2:40000          127.1.59.80:44983
ESTAB      0      0           127.0.0.2:40000          127.3.6.228:44983
ESTAB      0      0           127.0.0.2:40000          127.0.38.53:44983
ESTAB      0      0           127.0.0.2:40000         127.1.197.10:44983

Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
 include/net/inet_sock.h |    1 +
 include/uapi/linux/in.h |    1 +
 net/ipv4/af_inet.c      |    3 ++-
 net/ipv4/ip_sockglue.c  |    7 +++++++
 4 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index b6c3737da4e9..47eb67b08abd 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -187,6 +187,7 @@ struct inet_sock {
 				transparent:1,
 				mc_all:1,
 				nodefrag:1;
+	__u8			bind_address_no_port:1;
 	__u8			rcv_tos;
 	__u8			convert_csum;
 	int			uc_index;
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index 641338bef651..83d6236a2f08 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -112,6 +112,7 @@ struct in_addr {
 #define IP_MINTTL       21
 #define IP_NODEFRAG     22
 #define IP_CHECKSUM	23
+#define IP_BIND_ADDRESS_NO_PORT	24
 
 /* IP_MTU_DISCOVER values */
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6ad0f7a711c9..cc858ef44451 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -488,7 +488,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		inet->inet_saddr = 0;  /* Use device */
 
 	/* Make sure we are allowed to bind here. */
-	if (sk->sk_prot->get_port(sk, snum)) {
+	if ((snum || !inet->bind_address_no_port) &&
+	    sk->sk_prot->get_port(sk, snum)) {
 		inet->inet_saddr = inet->inet_rcv_saddr = 0;
 		err = -EADDRINUSE;
 		goto out_release_sock;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 7cfb0893f263..04ae2992a5cd 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -582,6 +582,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	case IP_TRANSPARENT:
 	case IP_MINTTL:
 	case IP_NODEFRAG:
+	case IP_BIND_ADDRESS_NO_PORT:
 	case IP_UNICAST_IF:
 	case IP_MULTICAST_TTL:
 	case IP_MULTICAST_ALL:
@@ -732,6 +733,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		}
 		inet->nodefrag = val ? 1 : 0;
 		break;
+	case IP_BIND_ADDRESS_NO_PORT:
+		inet->bind_address_no_port = val ? 1 : 0;
+		break;
 	case IP_MTU_DISCOVER:
 		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
 			goto e_inval;
@@ -1324,6 +1328,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_NODEFRAG:
 		val = inet->nodefrag;
 		break;
+	case IP_BIND_ADDRESS_NO_PORT:
+		val = inet->bind_address_no_port;
+		break;
 	case IP_MTU_DISCOVER:
 		val = inet->pmtudisc;
 		break;


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ