Message-Id: <20220913095855.99323-1-jakub@cloudflare.com>
Date: Tue, 13 Sep 2022 11:58:55 +0200
From: Jakub Sitnicki <jakub@...udflare.com>
To: netdev@...r.kernel.org
Cc: kernel-team@...udflare.com
Subject: [RFC net-next] udp: Auto-bind connected sockets to unique 4-tuple with port sharing
This is an RFC patch accompanying an LPC 2022 talk [1].
Users of connected UDP sockets who want to delegate the free source port
search to the kernel, by leaving the port unspecified at bind() time, face
a limitation today.
Even if the delayed auto-bind flag, IP_BIND_ADDRESS_NO_PORT, is set on the
socket, the source (IP, port) will not actually be shared between two
connected UDP sockets:
from os import system
from socket import *

# define in case the socket module does not export it
IP_BIND_ADDRESS_NO_PORT = 24

# if there is just one ephemeral port
system("sysctl -w net.ipv4.ip_local_port_range='60000 60000'")
s1 = socket(AF_INET, SOCK_DGRAM)
s1.setsockopt(SOL_IP, IP_BIND_ADDRESS_NO_PORT, 1)
s1.bind(("192.0.2.1", 0))
s1.connect(("1.1.1.1", 53))
s2 = socket(AF_INET, SOCK_DGRAM)
s2.setsockopt(SOL_IP, IP_BIND_ADDRESS_NO_PORT, 1)
s2.bind(("192.0.2.1", 0))
s2.connect(("1.0.0.1", 53))  # -> EAGAIN
This limits the number of connected UDP sockets on a given local IP to the
number of ephemeral ports.
If the user would like to share the source port when the 4-tuple is unique,
they have to resort to a user-space free-port search with 4-tuple conflict
detection, which is non-trivial [2].
To address this limitation, implement a new protocol operation for finding
a free port while avoiding 4-tuple conflicts. The new operation is similar
to ->get_port but applies stricter criteria for determining if a port is
busy. The destination IP and port of existing sockets are checked against
the address the user passed to connect(), in addition to what ->get_port
checks today (netns, src addr, device).
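Roughly speaking (a simplified sketch in the spirit of the example above,
not the actual kernel logic; the socket objects and their fields are made
up for illustration), the stricter busy-port test looks like:

def port_is_busy(port, existing_socks, src_ip, dst_ip, dst_port):
    # An existing socket bound to the candidate port makes it busy if it
    # did not opt into address sharing, or if it is connected to the same
    # destination, i.e. the resulting 4-tuple would not be unique.
    for s in existing_socks:
        if s.src_port != port or s.src_ip != src_ip:
            continue
        if not s.reuseaddr:
            return True
        if (s.dst_ip, s.dst_port) == (dst_ip, dst_port):
            return True
    return False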
There already happens to be a proto operation whose signature matches our
needs here, that is, one taking a socket reference and a destination
address as arguments: ->bind_add(). It is currently used only by SCTP code,
so we can re-purpose it.
To remain backward compatible, we call into ->bind_add at connect() time to
find a free port only if the user:
1. has specified the local source IP but left port unspecified, and
2. enabled IP_BIND_ADDRESS_NO_PORT, and
3. enabled port sharing with SO_REUSEADDR.
If the above conditions are met, we will try to find a local port that can
be shared with other existing sockets as long as the 4-tuple is unique, or
fail with EAGAIN if we are out of local ports.
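For example, with this patch applied, the scenario from the beginning is
expected to work once both sockets also opt in with SO_REUSEADDR (a sketch
of the intended behavior, not verified output):

s1 = socket(AF_INET, SOCK_DGRAM)
s1.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
s1.setsockopt(SOL_IP, IP_BIND_ADDRESS_NO_PORT, 1)
s1.bind(("192.0.2.1", 0))
s1.connect(("1.1.1.1", 53))
s2 = socket(AF_INET, SOCK_DGRAM)
s2.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
s2.setsockopt(SOL_IP, IP_BIND_ADDRESS_NO_PORT, 1)
s2.bind(("192.0.2.1", 0))
s2.connect(("1.0.0.1", 53))            # no EAGAIN; 4-tuple is unique
s1.getsockname() == s2.getsockname()   # both share ("192.0.2.1", 60000)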
The rationale here is that today, when source address sharing with REUSEADDR is
enabled for a UDP socket, setting BIND_ADDRESS_NO_PORT has no effect on
port selection and conflict detection. It merely delays the auto-bind from
bind() to connect()/sendmsg() time.
At the same time, users who for some reason already set both REUSEADDR and
BIND_ADDRESS_NO_PORT on their connected UDP sockets are unlikely to run
into EAGAIN errors from connect() calling into ->bind_add(). For that to
happen, we would have to hit a 4-tuple conflict with another existing
connected UDP socket and completely run out of ephemeral ports.
As this is an RFC submission, there are still a few things left to do:
- get rid of duplicated code between ->get_port and ->bind_add
- add UDP-Lite and UDPv6 (IPv6) support
- split code into patches
- add selftests/net
- add man page docs
[1] https://lpc.events/event/16/contributions/1349/
[2] https://github.com/cloudflare/cloudflare-blog/blob/232b432c1d57/2022-02-connectx/connectx.py#L116
Signed-off-by: Jakub Sitnicki <jakub@...udflare.com>
---
net/ipv4/af_inet.c | 39 +++++++++++++++++++-
net/ipv4/datagram.c | 11 +++++-
net/ipv4/udp.c | 89 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 136 insertions(+), 3 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d3ab1ae32ef5..de2918c9e9e2 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -187,6 +187,42 @@ static int inet_autobind(struct sock *sk)
return 0;
}
+static int inet_autobind_reuse(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ const struct proto *prot = READ_ONCE(sk->sk_prot);
+ struct inet_sock *inet = inet_sk(sk);
+ int err = -EAGAIN;
+
+ if (addr_len < sizeof(*usin))
+ return -EINVAL;
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ lock_sock(sk);
+ if (inet->inet_num)
+ goto ok;
+
+ if (sk->sk_reuse && !sk->sk_reuseport && prot->bind_add &&
+ inet->inet_rcv_saddr && inet->inet_saddr) {
+ if (prot->bind_add(sk, uaddr, addr_len))
+ goto fail;
+ inet->inet_sport = htons(inet->inet_num);
+ inet->inet_daddr = usin->sin_addr.s_addr;
+ inet->inet_dport = usin->sin_port;
+ sk->sk_state = TCP_ESTABLISHED;
+ } else {
+ if (prot->get_port(sk, 0))
+ goto fail;
+ inet->inet_sport = htons(inet->inet_num);
+ }
+ok:
+ err = 0;
+fail:
+ release_sock(sk);
+ return err;
+}
+
/*
* Move a socket into listening state.
*/
@@ -571,8 +607,9 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
return err;
}
- if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk))
+ if (data_race(!inet_sk(sk)->inet_num) && inet_autobind_reuse(sk, uaddr, addr_len))
return -EAGAIN;
+
return sk->sk_prot->connect(sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_dgram_connect);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 405a8c2aea64..18ba2403fc0e 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -68,8 +68,10 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
- inet->inet_daddr = fl4->daddr;
- inet->inet_dport = usin->sin_port;
+ if (!inet->inet_daddr)
+ inet->inet_daddr = fl4->daddr;
+ if (!inet->inet_dport)
+ inet->inet_dport = usin->sin_port;
reuseport_has_conns(sk, true);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
@@ -78,6 +80,11 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
sk_dst_set(sk, &rt->dst);
err = 0;
out:
+ if (err) {
+ /* Dissolve any destination association auto-bind might have created */
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
+ }
return err;
}
EXPORT_SYMBOL(__ip4_datagram_connect);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cd72158e953a..38b73b7df30f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -163,6 +163,28 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
return 0;
}
+static void udp_v4_lport_reuse_inuse(const struct net *net,
+ const struct udp_hslot *hslot,
+ unsigned long *bitmap,
+ struct sock *sk, unsigned int log,
+ __be32 daddr, __be16 dport)
+{
+ struct sock *sk2;
+
+ sk_for_each(sk2, &hslot->head) {
+ if (net_eq(sock_net(sk2), net) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ ((!sk2->sk_reuse &&
+ inet_rcv_saddr_equal(sk, sk2, true)) ||
+ (sk2->sk_reuse &&
+ inet_rcv_saddr_equal(sk, sk2, false) &&
+ inet_sk(sk2)->inet_daddr == daddr &&
+ inet_sk(sk2)->inet_dport == dport)))
+ __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
+ }
+}
+
/*
* Note: we still hold spinlock of primary hash chain, so no other writer
* can insert/delete a socket with local_port == num
@@ -356,6 +378,72 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, hash2_nulladdr);
}
+static int udp_v4_bind_add(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ const struct proto *prot = READ_ONCE(sk->sk_prot);
+ struct udp_table *udptable = prot->h.udp_table;
+ DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
+ struct udp_hslot *hslot, *hslot2;
+ struct net *net = sock_net(sk);
+ int low, high, remaining;
+ u16 first, last, snum;
+ u32 rand;
+
+ inet_sk_get_local_port_range(sk, &low, &high);
+ remaining = (high - low) + 1;
+
+ rand = prandom_u32();
+ first = reciprocal_scale(rand, remaining) + low;
+ last = first + udptable->mask + 1;
+ /* force rand to be an odd multiple of UDP_HTABLE_SIZE */
+ rand = (rand | 1) * (udptable->mask + 1);
+
+ do {
+ bitmap_zero(bitmap, PORTS_PER_CHAIN);
+
+ hslot = udp_hashslot(udptable, net, first);
+ spin_lock(&hslot->lock);
+
+ udp_v4_lport_reuse_inuse(net, hslot, bitmap, sk, udptable->log,
+ usin->sin_addr.s_addr, usin->sin_port);
+
+ snum = first;
+ do {
+ if (low <= snum && snum <= high &&
+ !test_bit(snum >> udptable->log, bitmap) &&
+ !inet_is_local_reserved_port(net, snum))
+ goto found;
+ snum += rand;
+ } while (snum != first);
+
+ spin_unlock(&hslot->lock);
+ cond_resched();
+ } while (++first != last);
+
+ return 1;
+found:
+ inet_sk(sk)->inet_num = snum;
+ udp_sk(sk)->udp_port_hash = snum;
+ udp_sk(sk)->udp_portaddr_hash =
+ ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, snum);
+
+ sk_add_node_rcu(sk, &hslot->head);
+ hslot->count++;
+ sock_prot_inuse_add(net, prot, 1);
+
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ spin_lock(&hslot2->lock);
+ hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head);
+ hslot2->count++;
+ spin_unlock(&hslot2->lock);
+
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ spin_unlock(&hslot->lock);
+
+ return 0;
+}
+
static int compute_score(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned short hnum,
@@ -2939,6 +3027,7 @@ struct proto udp_prot = {
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
+ .bind_add = udp_v4_bind_add,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
--
2.37.2