[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20221101165412.25234-1-kuniyu@amazon.com>
Date: Tue, 1 Nov 2022 09:54:12 -0700
From: Kuniyuki Iwashima <kuniyu@...zon.com>
To: <edumazet@...gle.com>
CC: <davem@...emloft.net>, <eric.dumazet@...il.com>, <kuba@...nel.org>,
<ncardwell@...gle.com>, <netdev@...r.kernel.org>,
<pabeni@...hat.com>, <soheil@...gle.com>, <kuniyu@...zon.com>
Subject: Re: [PATCH net-next] tcp: refine tcp_prune_ofo_queue() logic
From: Eric Dumazet <edumazet@...gle.com>
Date: Tue, 1 Nov 2022 03:52:34 +0000
> After commits 36a6503fedda ("tcp: refine tcp_prune_ofo_queue()
> to not drop all packets") and 72cd43ba64fc1
> ("tcp: free batches of packets in tcp_prune_ofo_queue()")
> tcp_prune_ofo_queue() drops a fraction of ooo queue,
> to make room for incoming packet.
>
> However it makes no sense to drop packets that are
> before the incoming packet, in sequence space.
>
> In order to recover from packet losses faster,
> it makes more sense to only drop ooo packets
> which are after the incoming packet.
>
> Tested:
> packetdrill test:
> 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
> +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
> +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [3800], 4) = 0
> +0 bind(3, ..., ...) = 0
> +0 listen(3, 1) = 0
>
> +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
> +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 0>
> +.1 < . 1:1(0) ack 1 win 1024
> +0 accept(3, ..., ...) = 4
>
> +.01 < . 200:300(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 200:300>
>
> +.01 < . 400:500(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 400:500 200:300>
>
> +.01 < . 600:700(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 600:700 400:500 200:300>
>
> +.01 < . 800:900(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 800:900 600:700 400:500 200:300>
>
> +.01 < . 1000:1100(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 1000:1100 800:900 600:700 400:500>
>
> +.01 < . 1200:1300(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
>
> // this packet is dropped because we have no room left.
> +.01 < . 1400:1500(100) ack 1 win 1024
>
> +.01 < . 1:200(199) ack 1 win 1024
> // Make sure kernel did not drop 200:300 sequence
> +0 > . 1:1(0) ack 300 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
> // Make room, since our RCVBUF is very small
> +0 read(4, ..., 299) = 299
>
> +.01 < . 300:400(100) ack 1 win 1024
> +0 > . 1:1(0) ack 500 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
>
> +.01 < . 500:600(100) ack 1 win 1024
> +0 > . 1:1(0) ack 700 <nop,nop, sack 1200:1300 1000:1100 800:900>
>
> +0 read(4, ..., 400) = 400
>
> +.01 < . 700:800(100) ack 1 win 1024
> +0 > . 1:1(0) ack 900 <nop,nop, sack 1200:1300 1000:1100>
>
> +.01 < . 900:1000(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1100 <nop,nop, sack 1200:1300>
>
> +.01 < . 1100:1200(100) ack 1 win 1024
> // This checks that 1200:1300 has not been removed from ooo queue
> +0 > . 1:1(0) ack 1300
>
> Suggested-by: Jakub Kicinski <kuba@...nel.org>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
Acked-by: Kuniyuki Iwashima <kuniyu@...zon.com>
Sounds good!
Thank you.
> ---
> net/ipv4/tcp_input.c | 51 +++++++++++++++++++++++++++-----------------
> 1 file changed, 31 insertions(+), 20 deletions(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 0640453fce54b6daae0861d948f3db075830daf6..d764b5854dfcc865207b5eb749c29013ef18bdbc 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4764,8 +4764,8 @@ static void tcp_ofo_queue(struct sock *sk)
> }
> }
>
> -static bool tcp_prune_ofo_queue(struct sock *sk);
> -static int tcp_prune_queue(struct sock *sk);
> +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
> +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
>
> static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
> unsigned int size)
> @@ -4773,11 +4773,11 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
> if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
> !sk_rmem_schedule(sk, skb, size)) {
>
> - if (tcp_prune_queue(sk) < 0)
> + if (tcp_prune_queue(sk, skb) < 0)
> return -1;
>
> while (!sk_rmem_schedule(sk, skb, size)) {
> - if (!tcp_prune_ofo_queue(sk))
> + if (!tcp_prune_ofo_queue(sk, skb))
> return -1;
> }
> }
> @@ -5329,6 +5329,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
> * Clean the out-of-order queue to make room.
> * We drop high sequences packets to :
> * 1) Let a chance for holes to be filled.
> + * This means we do not drop packets from ooo queue if their sequence
> + * is before incoming packet sequence.
> * 2) not add too big latencies if thousands of packets sit there.
> * (But if application shrinks SO_RCVBUF, we could still end up
> * freeing whole queue here)
> @@ -5336,24 +5338,31 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
> *
> * Return true if queue has shrunk.
> */
> -static bool tcp_prune_ofo_queue(struct sock *sk)
> +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
> {
> struct tcp_sock *tp = tcp_sk(sk);
> struct rb_node *node, *prev;
> + bool pruned = false;
> int goal;
>
> if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
> return false;
>
> - NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> goal = sk->sk_rcvbuf >> 3;
> node = &tp->ooo_last_skb->rbnode;
> +
> do {
> + struct sk_buff *skb = rb_to_skb(node);
> +
> + /* If incoming skb would land last in ofo queue, stop pruning. */
> + if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
> + break;
> + pruned = true;
> prev = rb_prev(node);
> rb_erase(node, &tp->out_of_order_queue);
> - goal -= rb_to_skb(node)->truesize;
> - tcp_drop_reason(sk, rb_to_skb(node),
> - SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
> + goal -= skb->truesize;
> + tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
> + tp->ooo_last_skb = rb_to_skb(prev);
> if (!prev || goal <= 0) {
> if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
> !tcp_under_memory_pressure(sk))
> @@ -5362,16 +5371,18 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
> }
> node = prev;
> } while (node);
> - tp->ooo_last_skb = rb_to_skb(prev);
>
> - /* Reset SACK state. A conforming SACK implementation will
> - * do the same at a timeout based retransmit. When a connection
> - * is in a sad state like this, we care only about integrity
> - * of the connection not performance.
> - */
> - if (tp->rx_opt.sack_ok)
> - tcp_sack_reset(&tp->rx_opt);
> - return true;
> + if (pruned) {
> + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> + /* Reset SACK state. A conforming SACK implementation will
> + * do the same at a timeout based retransmit. When a connection
> + * is in a sad state like this, we care only about integrity
> + * of the connection not performance.
> + */
> + if (tp->rx_opt.sack_ok)
> + tcp_sack_reset(&tp->rx_opt);
> + }
> + return pruned;
> }
>
> /* Reduce allocated memory if we can, trying to get
> @@ -5381,7 +5392,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
> * until the socket owning process reads some of the data
> * to stabilize the situation.
> */
> -static int tcp_prune_queue(struct sock *sk)
> +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
> {
> struct tcp_sock *tp = tcp_sk(sk);
>
> @@ -5408,7 +5419,7 @@ static int tcp_prune_queue(struct sock *sk)
> /* Collapsing did not help, destructive actions follow.
> * This must not ever occur. */
>
> - tcp_prune_ofo_queue(sk);
> + tcp_prune_ofo_queue(sk, in_skb);
>
> if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
> return 0;
> --
> 2.38.1.273.g43a17bfeac-goog
Powered by blists - more mailing lists