Message-ID: <CACSApvZY9FrC481k76HydWbhYYVpky-Ys51zR0pbt6BXtvZk0Q@mail.gmail.com>
Date:   Tue, 1 Nov 2022 00:18:17 -0400
From:   Soheil Hassas Yeganeh <soheil@...gle.com>
To:     Eric Dumazet <edumazet@...gle.com>
Cc:     "David S . Miller" <davem@...emloft.net>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>,
        Neal Cardwell <ncardwell@...gle.com>, netdev@...r.kernel.org,
        eric.dumazet@...il.com
Subject: Re: [PATCH net-next] tcp: refine tcp_prune_ofo_queue() logic

On Mon, Oct 31, 2022 at 11:52 PM Eric Dumazet <edumazet@...gle.com> wrote:
>
> After commits 36a6503fedda ("tcp: refine tcp_prune_ofo_queue()
> to not drop all packets") and 72cd43ba64fc1
> ("tcp: free batches of packets in tcp_prune_ofo_queue()"),
> tcp_prune_ofo_queue() drops a fraction of the ooo queue
> to make room for the incoming packet.
>
> However, it makes no sense to drop packets that come
> before the incoming packet in sequence space.
>
> To recover from packet losses faster, it makes more sense
> to drop only ooo packets that come after the incoming packet.
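>
> In other words, the pruning loop now walks the ooo rb-tree from
> tp->ooo_last_skb (the highest sequence) toward lower sequences,
> and stops at the first skb whose sequence precedes the incoming
> one. A minimal sketch of just that stop condition (the full loop
> is in the diff below):
>
>     /* skb is before in_skb in sequence space: stop pruning.
>      * Dropping it (and everything below it) would only force
>      * retransmits of data we already hold.
>      */
>     if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
>             break;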
>
> Tested:
> packetdrill test:
>    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
>    +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
>    +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [3800], 4) = 0
>    +0 bind(3, ..., ...) = 0
>    +0 listen(3, 1) = 0
>
>    +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
>    +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 0>
>   +.1 < . 1:1(0) ack 1 win 1024
>    +0 accept(3, ..., ...) = 4
>
>  +.01 < . 200:300(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 200:300>
>
>  +.01 < . 400:500(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 400:500 200:300>
>
>  +.01 < . 600:700(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 600:700 400:500 200:300>
>
>  +.01 < . 800:900(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 800:900 600:700 400:500 200:300>
>
>  +.01 < . 1000:1100(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 1000:1100 800:900 600:700 400:500>
>
>  +.01 < . 1200:1300(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
>
> // this packet is dropped because we have no room left.
>  +.01 < . 1400:1500(100) ack 1 win 1024
>
>  +.01 < . 1:200(199) ack 1 win 1024
> // Make sure kernel did not drop 200:300 sequence
>    +0 > . 1:1(0) ack 300 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
> // Make room, since our RCVBUF is very small
>    +0 read(4, ..., 299) = 299
>
>  +.01 < . 300:400(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 500 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
>
>  +.01 < . 500:600(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 700 <nop,nop, sack 1200:1300 1000:1100 800:900>
>
>    +0 read(4, ..., 400) = 400
>
>  +.01 < . 700:800(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 900 <nop,nop, sack 1200:1300 1000:1100>
>
>  +.01 < . 900:1000(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1100 <nop,nop, sack 1200:1300>
>
>  +.01 < . 1100:1200(100) ack 1 win 1024
> // This checks that 1200:1300 has not been removed from ooo queue
>    +0 > . 1:1(0) ack 1300
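>
> (The script above runs under the packetdrill tool, invoked as
> "packetdrill <script-file>"; the file name is arbitrary.)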
>
> Suggested-by: Jakub Kicinski <kuba@...nel.org>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>

Acked-by: Soheil Hassas Yeganeh <soheil@...gle.com>

Very nice idea! It should lower tail latency on highly congested
links. Thank you, Eric!
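
For anyone reading along: before this patch, pruning started at
tp->ooo_last_skb and kept dropping toward lower sequences until the
goal was met, so already-queued data sitting below the incoming packet
could be thrown away and would later have to be retransmitted. The
after() helper the patch relies on is the usual wraparound-safe
sequence comparison from include/net/tcp.h; a minimal standalone
sketch of its semantics (the names here are illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    /* Mirrors the before()/after() helpers in include/net/tcp.h:
     * signed subtraction makes the comparison robust across 32-bit
     * sequence number wraparound.
     */
    static inline bool seq_before(uint32_t seq1, uint32_t seq2)
    {
            return (int32_t)(seq1 - seq2) < 0;
    }

    static inline bool seq_after(uint32_t seq1, uint32_t seq2)
    {
            return seq_before(seq2, seq1);
    }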

> ---
>  net/ipv4/tcp_input.c | 51 +++++++++++++++++++++++++++-----------------
>  1 file changed, 31 insertions(+), 20 deletions(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 0640453fce54b6daae0861d948f3db075830daf6..d764b5854dfcc865207b5eb749c29013ef18bdbc 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4764,8 +4764,8 @@ static void tcp_ofo_queue(struct sock *sk)
>         }
>  }
>
> -static bool tcp_prune_ofo_queue(struct sock *sk);
> -static int tcp_prune_queue(struct sock *sk);
> +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
> +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
>
>  static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
>                                  unsigned int size)
> @@ -4773,11 +4773,11 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
>         if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
>             !sk_rmem_schedule(sk, skb, size)) {
>
> -               if (tcp_prune_queue(sk) < 0)
> +               if (tcp_prune_queue(sk, skb) < 0)
>                         return -1;
>
>                 while (!sk_rmem_schedule(sk, skb, size)) {
> -                       if (!tcp_prune_ofo_queue(sk))
> +                       if (!tcp_prune_ofo_queue(sk, skb))
>                                 return -1;
>                 }
>         }
> @@ -5329,6 +5329,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
>   * Clean the out-of-order queue to make room.
>   * We drop high sequences packets to :
>   * 1) Let a chance for holes to be filled.
> + *    This means we do not drop packets from ooo queue if their sequence
> + *    is before incoming packet sequence.
>   * 2) not add too big latencies if thousands of packets sit there.
>   *    (But if application shrinks SO_RCVBUF, we could still end up
>   *     freeing whole queue here)
> @@ -5336,24 +5338,31 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
>   *
>   * Return true if queue has shrunk.
>   */
> -static bool tcp_prune_ofo_queue(struct sock *sk)
> +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
>  {
>         struct tcp_sock *tp = tcp_sk(sk);
>         struct rb_node *node, *prev;
> +       bool pruned = false;
>         int goal;
>
>         if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
>                 return false;
>
> -       NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
>         goal = sk->sk_rcvbuf >> 3;
>         node = &tp->ooo_last_skb->rbnode;
> +
>         do {
> +               struct sk_buff *skb = rb_to_skb(node);
> +
> +               /* If incoming skb would land last in ofo queue, stop pruning. */
> +               if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
> +                       break;
> +               pruned = true;
>                 prev = rb_prev(node);
>                 rb_erase(node, &tp->out_of_order_queue);
> -               goal -= rb_to_skb(node)->truesize;
> -               tcp_drop_reason(sk, rb_to_skb(node),
> -                               SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
> +               goal -= skb->truesize;
> +               tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
> +               tp->ooo_last_skb = rb_to_skb(prev);
>                 if (!prev || goal <= 0) {
>                         if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
>                             !tcp_under_memory_pressure(sk))
> @@ -5362,16 +5371,18 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
>                 }
>                 node = prev;
>         } while (node);
> -       tp->ooo_last_skb = rb_to_skb(prev);
>
> -       /* Reset SACK state.  A conforming SACK implementation will
> -        * do the same at a timeout based retransmit.  When a connection
> -        * is in a sad state like this, we care only about integrity
> -        * of the connection not performance.
> -        */
> -       if (tp->rx_opt.sack_ok)
> -               tcp_sack_reset(&tp->rx_opt);
> -       return true;
> +       if (pruned) {
> +               NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> +               /* Reset SACK state.  A conforming SACK implementation will
> +                * do the same at a timeout based retransmit.  When a connection
> +                * is in a sad state like this, we care only about integrity
> +                * of the connection not performance.
> +                */
> +               if (tp->rx_opt.sack_ok)
> +                       tcp_sack_reset(&tp->rx_opt);
> +       }
> +       return pruned;
>  }
>
>  /* Reduce allocated memory if we can, trying to get
> @@ -5381,7 +5392,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
>   * until the socket owning process reads some of the data
>   * to stabilize the situation.
>   */
> -static int tcp_prune_queue(struct sock *sk)
> +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
>  {
>         struct tcp_sock *tp = tcp_sk(sk);
>
> @@ -5408,7 +5419,7 @@ static int tcp_prune_queue(struct sock *sk)
>         /* Collapsing did not help, destructive actions follow.
>          * This must not ever occur. */
>
> -       tcp_prune_ofo_queue(sk);
> +       tcp_prune_ofo_queue(sk, in_skb);
>
>         if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
>                 return 0;
> --
> 2.38.1.273.g43a17bfeac-goog
>
