Date:   Tue, 10 Sep 2019 11:52:35 -0400
From:   Willem de Bruijn <willemdebruijn.kernel@...il.com>
To:     Willem de Bruijn <willemdebruijn.kernel@...il.com>
Cc:     Steve Zabele <zabele@...cast.net>,
        Eric Dumazet <eric.dumazet@...il.com>,
        Mark KEATON <mark.keaton@...theon.com>,
        Network Development <netdev@...r.kernel.org>,
        "shum@...ndrew.org" <shum@...ndrew.org>,
        "vladimir116@...il.com" <vladimir116@...il.com>,
        "saifi.khan@...ikr.in" <saifi.khan@...ikr.in>,
        Daniel Borkmann <daniel@...earbox.net>,
        "on2k16nm@...il.com" <on2k16nm@...il.com>,
        Stephen Hemminger <stephen@...workplumber.org>,
        Craig Gallek <kraig@...gle.com>
Subject: Re: Is bug 200755 in anyone's queue??

On Wed, Sep 4, 2019 at 11:46 AM Willem de Bruijn
<willemdebruijn.kernel@...il.com> wrote:
>
> On Wed, Sep 4, 2019 at 10:51 AM Steve Zabele <zabele@...cast.net> wrote:
> >
> > I think a dual table approach makes a lot of sense here, especially
> > if we look at the different use cases. For the DNS server example,
> > almost certainly there will not be any connected sockets using the
> > server port, so a test of whether the connected table is empty
> > (maybe a boolean stored with the unconnected table?) should get to
> > the existing code very quickly and not require accessing the memory
> > holding the connected table. For our use case, the connected sockets
> > persist for long periods (at network timescales at least) and so any
> > rehashing should be infrequent and have limited impact on
> > performance overall.
> >
> > So does a dual table approach seem workable to other folks that know the internals?
>
> Let me take a stab and compare. A dual table does bring it more in
> line with how the TCP code is structured.

On closer look, I think two tables would be too much code churn and
risk for a stable fix. They would require lookup changes across ipv4
and ipv6 unicast, multicast, early demux, and so on. Though I'm happy
to be proven wrong, of course.

One variant that clearly modifies behavior only for reuseport groups
with connections is to mark those groups as such:

"
@@ -21,7 +21,8 @@ struct sock_reuseport {
        unsigned int            synq_overflow_ts;
        /* ID stays the same even after the size of socks[] grows. */
        unsigned int            reuseport_id;
-       bool                    bind_inany;
+       unsigned int            bind_inany:1;
+       unsigned int            has_conns:1;
        struct bpf_prog __rcu   *prog;          /* optional BPF sock selector */
        struct sock             *socks[0];      /* array of sock pointers */
 };
@@ -37,6 +38,23 @@ extern struct sock *reuseport_select_sock(struct sock *sk,
 extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
 extern int reuseport_detach_prog(struct sock *sk);

+static inline bool reuseport_has_conns(struct sock *sk, bool set)
+{
+       struct sock_reuseport *reuse;
+       bool ret = false;
+
+       rcu_read_lock();
+       reuse = rcu_dereference(sk->sk_reuseport_cb);
+       if (reuse) {
+               if (set)
+                       reuse->has_conns = 1;
+               ret = reuse->has_conns;
+       }
+       rcu_read_unlock();
+
+       return ret;
+}

@@ -67,6 +68,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
                if (sk->sk_prot->rehash)
                        sk->sk_prot->rehash(sk);
        }
+       reuseport_has_conns(sk, true);
        inet->inet_daddr = fl4->daddr;
        inet->inet_dport = usin->sin_port;
        sk->sk_state = TCP_ESTABLISHED;
"

Then at lookup, treat connected reuseport sockets as regular sockets
and do not return early on a reuseport match if there may be
higher-scoring connections:

"
@@ -423,13 +423,15 @@ static struct sock *udp4_lib_lookup2(struct net *net,
                score = compute_score(sk, net, saddr, sport,
                                      daddr, hnum, dif, sdif);
                if (score > badness) {
-                       if (sk->sk_reuseport) {
+                       if (sk->sk_reuseport &&
+                           sk->sk_state != TCP_ESTABLISHED) {
                                hash = udp_ehashfn(net, daddr, hnum,
                                                   saddr, sport);
                                result = reuseport_select_sock(sk, hash, skb,
                                                        sizeof(struct udphdr));
-                               if (result)
+                               if (result && !reuseport_has_conns(sk, false))
                                        return result;
+                               sk = result ? : sk;
                        }
                        badness = score;
                        result = sk;
"

and finally, for reuseport matches, only return unconnected sockets in the group:

"
@@ -295,8 +295,19 @@ struct sock *reuseport_select_sock(struct sock *sk,

 select_by_hash:
                /* no bpf or invalid bpf result: fall back to hash usage */
-               if (!sk2)
-                       sk2 = reuse->socks[reciprocal_scale(hash, socks)];
+               if (!sk2) {
+                       int i, j;
+
+                       i = j = reciprocal_scale(hash, socks);
+                       while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
+                               i++;
+                               if (i == reuse->num_socks)
+                                       i = 0;
+                               if (i == j)
+                                       goto out;
+                       }
+                       sk2 = reuse->socks[i];
+               }
        }
"

This is hardly a short patch, but the behavioral change is contained.
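
The fallback scan in that last hunk is easy to check in isolation.
Here is a standalone sketch with simplified stand-in types (not kernel
code): start at the hashed slot, walk forward past connected sockets,
wrap around, and give up after one full pass.

"
#include <stdbool.h>
#include <stddef.h>

/* Illustrative stand-in for the reuseport group array. */
struct fake_sock {
        bool connected;         /* sk->sk_state == TCP_ESTABLISHED */
};

/* Mirrors the modified fallback in reuseport_select_sock(): take the
 * hashed slot, but skip connected sockets; return NULL when every
 * socket in the group is connected. */
static struct fake_sock *select_unconnected(struct fake_sock **socks,
                                            unsigned int num_socks,
                                            unsigned int hash)
{
        unsigned int i, j;

        i = j = hash % num_socks;       /* reciprocal_scale() stand-in */
        while (socks[i]->connected) {
                i++;
                if (i == num_socks)
                        i = 0;
                if (i == j)
                        return NULL;    /* wrapped: whole group connected */
        }
        return socks[i];
}
"

The comparison against the starting index j is what bounds the walk to
a single pass over socks[].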

I also coded up the alternative that relies on order in the list:
entries are inserted at the head and the list is traversed from the
head. To keep all connected sockets in a group ahead of all the
unconnected sockets in that group, (1) rehash on connect and (2) do a
more complex insert-behind-the-connected-reuseport-sockets for new
reuseport sockets:

"
@@ -67,12 +68,16 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
                if (sk->sk_prot->rehash)
                        sk->sk_prot->rehash(sk);
        }
+
        inet->inet_daddr = fl4->daddr;
        inet->inet_dport = usin->sin_port;
        sk->sk_state = TCP_ESTABLISHED;
        sk_set_txhash(sk);
        inet->inet_id = jiffies;

+       if (rcu_access_pointer(sk->sk_reuseport_cb) && sk->sk_prot->rehash)
+               sk->sk_prot->rehash(sk);
+
        sk_dst_set(sk, &rt->dst);
        err = 0;

@@ -323,7 +323,21 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
                    sk->sk_family == AF_INET6)
                        hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
                                           &hslot2->head);
-               else
+               else if (sk->sk_reuseport) {
+                       struct sock *cur, *last_conn = NULL;
+
+                       udp_portaddr_for_each_entry_rcu(cur, &hslot2->head) {
+                               if (cur->sk_state == TCP_ESTABLISHED &&
+                                   rcu_access_pointer(cur->sk_reuseport_cb))
+                                       last_conn = cur;
+                       }
+                       if (last_conn)
+                               hlist_add_behind_rcu(&udp_sk(sk)->udp_portaddr_node,
+                                                    &udp_sk(last_conn)->udp_portaddr_node);
+                       else
+                               hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+                                                  &hslot2->head);
+               } else
                        hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
                                           &hslot2->head);

@@ -423,7 +437,8 @@ static struct sock *udp4_lib_lookup2(struct net *net,
                score = compute_score(sk, net, saddr, sport,
                                      daddr, hnum, dif, sdif);
                if (score > badness) {
-                       if (sk->sk_reuseport) {
+                       if (sk->sk_reuseport &&
+                           sk->sk_state != TCP_ESTABLISHED) {
                                hash = udp_ehashfn(net, daddr, hnum,
                                                   saddr, sport);
                                result = reuseport_select_sock(sk, hash, skb,
@@ -1891,10 +1906,12 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
                                             udp_sk(sk)->udp_port_hash);
                        /* we must lock primary chain too */
                        spin_lock_bh(&hslot->lock);
+#if 0
                        /* TODO: differentiate inet_rcv_saddr change from regular connect */
                        if (rcu_access_pointer(sk->sk_reuseport_cb))
                                reuseport_detach_sock(sk);
+#endif

-                       if (hslot2 != nhslot2) {
+                       if (1) {
                                spin_lock(&hslot2->lock);
                                hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
                                hslot2->count--;
"

That second variant clearly has some loose ends and is no shorter or
simpler. So unless anyone has comments or a different solution, I'll
finish up the first variant.
