netdev - Re: [PATCH net-next] tcp: refine tcp_prune_ofo

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20221101165412.25234-1-kuniyu@amazon.com>
Date:   Tue, 1 Nov 2022 09:54:12 -0700
From:   Kuniyuki Iwashima <kuniyu@...zon.com>
To:     <edumazet@...gle.com>
CC:     <davem@...emloft.net>, <eric.dumazet@...il.com>, <kuba@...nel.org>,
        <ncardwell@...gle.com>, <netdev@...r.kernel.org>,
        <pabeni@...hat.com>, <soheil@...gle.com>, <kuniyu@...zon.com>
Subject: Re: [PATCH net-next] tcp: refine tcp_prune_ofo_queue() logic

From:   Eric Dumazet <edumazet@...gle.com>
Date:   Tue,  1 Nov 2022 03:52:34 +0000
> After commits 36a6503fedda ("tcp: refine tcp_prune_ofo_queue()
> to not drop all packets") and 72cd43ba64fc1
> ("tcp: free batches of packets in tcp_prune_ofo_queue()")
> tcp_prune_ofo_queue() drops a fraction of ooo queue,
> to make room for incoming packet.
> 
> However it makes no sense to drop packets that are
> before the incoming packet, in sequence space.
> 
> In order to recover from packet losses faster,
> it makes more sense to only drop ooo packets
> which are after the incoming packet.
> 
> Tested:
> packetdrill test:
>    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
>    +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
>    +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [3800], 4) = 0
>    +0 bind(3, ..., ...) = 0
>    +0 listen(3, 1) = 0
> 
>    +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
>    +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 0>
>   +.1 < . 1:1(0) ack 1 win 1024
>    +0 accept(3, ..., ...) = 4
> 
>  +.01 < . 200:300(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 200:300>
> 
>  +.01 < . 400:500(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 400:500 200:300>
> 
>  +.01 < . 600:700(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 600:700 400:500 200:300>
> 
>  +.01 < . 800:900(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 800:900 600:700 400:500 200:300>
> 
>  +.01 < . 1000:1100(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 1000:1100 800:900 600:700 400:500>
> 
>  +.01 < . 1200:1300(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
> 
> // this packet is dropped because we have no room left.
>  +.01 < . 1400:1500(100) ack 1 win 1024
> 
>  +.01 < . 1:200(199) ack 1 win 1024
> // Make sure kernel did not drop 200:300 sequence
>    +0 > . 1:1(0) ack 300 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
> // Make room, since our RCVBUF is very small
>    +0 read(4, ..., 299) = 299
> 
>  +.01 < . 300:400(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 500 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
> 
>  +.01 < . 500:600(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 700 <nop,nop, sack 1200:1300 1000:1100 800:900>
> 
>    +0 read(4, ..., 400) = 400
> 
>  +.01 < . 700:800(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 900 <nop,nop, sack 1200:1300 1000:1100>
> 
>  +.01 < . 900:1000(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1100 <nop,nop, sack 1200:1300>
> 
>  +.01 < . 1100:1200(100) ack 1 win 1024
> // This checks that 1200:1300 has not been removed from ooo queue
>    +0 > . 1:1(0) ack 1300
> 
> Suggested-by: Jakub Kicinski <kuba@...nel.org>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>

Acked-by: Kuniyuki Iwashima <kuniyu@...zon.com>

Sounds good!

Thank you.


> ---
>  net/ipv4/tcp_input.c | 51 +++++++++++++++++++++++++++-----------------
>  1 file changed, 31 insertions(+), 20 deletions(-)
> 
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 0640453fce54b6daae0861d948f3db075830daf6..d764b5854dfcc865207b5eb749c29013ef18bdbc 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4764,8 +4764,8 @@ static void tcp_ofo_queue(struct sock *sk)
>  	}
>  }
>  
> -static bool tcp_prune_ofo_queue(struct sock *sk);
> -static int tcp_prune_queue(struct sock *sk);
> +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
> +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
>  
>  static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
>  				 unsigned int size)
> @@ -4773,11 +4773,11 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
>  	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
>  	    !sk_rmem_schedule(sk, skb, size)) {
>  
> -		if (tcp_prune_queue(sk) < 0)
> +		if (tcp_prune_queue(sk, skb) < 0)
>  			return -1;
>  
>  		while (!sk_rmem_schedule(sk, skb, size)) {
> -			if (!tcp_prune_ofo_queue(sk))
> +			if (!tcp_prune_ofo_queue(sk, skb))
>  				return -1;
>  		}
>  	}
> @@ -5329,6 +5329,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
>   * Clean the out-of-order queue to make room.
>   * We drop high sequences packets to :
>   * 1) Let a chance for holes to be filled.
> + *    This means we do not drop packets from ooo queue if their sequence
> + *    is before incoming packet sequence.
>   * 2) not add too big latencies if thousands of packets sit there.
>   *    (But if application shrinks SO_RCVBUF, we could still end up
>   *     freeing whole queue here)
> @@ -5336,24 +5338,31 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
>   *
>   * Return true if queue has shrunk.
>   */
> -static bool tcp_prune_ofo_queue(struct sock *sk)
> +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
>  {
>  	struct tcp_sock *tp = tcp_sk(sk);
>  	struct rb_node *node, *prev;
> +	bool pruned = false;
>  	int goal;
>  
>  	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
>  		return false;
>  
> -	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
>  	goal = sk->sk_rcvbuf >> 3;
>  	node = &tp->ooo_last_skb->rbnode;
> +
>  	do {
> +		struct sk_buff *skb = rb_to_skb(node);
> +
> +		/* If incoming skb would land last in ofo queue, stop pruning. */
> +		if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
> +			break;
> +		pruned = true;
>  		prev = rb_prev(node);
>  		rb_erase(node, &tp->out_of_order_queue);
> -		goal -= rb_to_skb(node)->truesize;
> -		tcp_drop_reason(sk, rb_to_skb(node),
> -				SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
> +		goal -= skb->truesize;
> +		tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
> +		tp->ooo_last_skb = rb_to_skb(prev);
>  		if (!prev || goal <= 0) {
>  			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
>  			    !tcp_under_memory_pressure(sk))
> @@ -5362,16 +5371,18 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
>  		}
>  		node = prev;
>  	} while (node);
> -	tp->ooo_last_skb = rb_to_skb(prev);
>  
> -	/* Reset SACK state.  A conforming SACK implementation will
> -	 * do the same at a timeout based retransmit.  When a connection
> -	 * is in a sad state like this, we care only about integrity
> -	 * of the connection not performance.
> -	 */
> -	if (tp->rx_opt.sack_ok)
> -		tcp_sack_reset(&tp->rx_opt);
> -	return true;
> +	if (pruned) {
> +		NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> +		/* Reset SACK state.  A conforming SACK implementation will
> +		 * do the same at a timeout based retransmit.  When a connection
> +		 * is in a sad state like this, we care only about integrity
> +		 * of the connection not performance.
> +		 */
> +		if (tp->rx_opt.sack_ok)
> +			tcp_sack_reset(&tp->rx_opt);
> +	}
> +	return pruned;
>  }
>  
>  /* Reduce allocated memory if we can, trying to get
> @@ -5381,7 +5392,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
>   * until the socket owning process reads some of the data
>   * to stabilize the situation.
>   */
> -static int tcp_prune_queue(struct sock *sk)
> +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
>  {
>  	struct tcp_sock *tp = tcp_sk(sk);
>  
> @@ -5408,7 +5419,7 @@ static int tcp_prune_queue(struct sock *sk)
>  	/* Collapsing did not help, destructive actions follow.
>  	 * This must not ever occur. */
>  
> -	tcp_prune_ofo_queue(sk);
> +	tcp_prune_ofo_queue(sk, in_skb);
>  
>  	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
>  		return 0;
> -- 
> 2.38.1.273.g43a17bfeac-goog