Message-ID: <CAK6E8=fzjZc33bnLU1_f3MdKzKj_ZzmhoFQotnbMaY37KfmeCQ@mail.gmail.com>
Date: Wed, 23 Nov 2016 12:05:27 -0800
From: Yuchung Cheng <ycheng@...gle.com>
To: Eric Dumazet <eric.dumazet@...il.com>
Cc: David Miller <davem@...emloft.net>,
netdev <netdev@...r.kernel.org>,
Neal Cardwell <ncardwell@...gle.com>
Subject: Re: [PATCH net-next] tcp: enhance tcp_collapse_retrans() with skb_shift()
On Tue, Nov 15, 2016 at 12:51 PM, Eric Dumazet <eric.dumazet@...il.com> wrote:
>
> From: Eric Dumazet <edumazet@...gle.com>
>
> In commit 2331ccc5b323 ("tcp: enhance tcp collapsing"),
> we made a first step, allowing the right skb's payload to be copied into
> the left skb's head.
>
> Since all skbs in the socket write queue are headless (except possibly the
> very first one), this strategy often does not work.
>
> This patch extends tcp_collapse_retrans() to perform frag shifting,
> thanks to the skb_shift() helper.
>
> This helper must no longer BUG on non-headless skbs, as callers are
> fine with that.
>
> Tested:
>
> The following packetdrill test now passes:
>
> 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
> +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
> +0 bind(3, ..., ...) = 0
> +0 listen(3, 1) = 0
>
> +0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 8>
> +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
> +.100 < . 1:1(0) ack 1 win 257
> +0 accept(3, ..., ...) = 4
>
> +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0
> +0 write(4, ..., 200) = 200
> +0 > P. 1:201(200) ack 1
> +.001 write(4, ..., 200) = 200
> +0 > P. 201:401(200) ack 1
> +.001 write(4, ..., 200) = 200
> +0 > P. 401:601(200) ack 1
> +.001 write(4, ..., 200) = 200
> +0 > P. 601:801(200) ack 1
> +.001 write(4, ..., 200) = 200
> +0 > P. 801:1001(200) ack 1
> +.001 write(4, ..., 100) = 100
> +0 > P. 1001:1101(100) ack 1
> +.001 write(4, ..., 100) = 100
> +0 > P. 1101:1201(100) ack 1
> +.001 write(4, ..., 100) = 100
> +0 > P. 1201:1301(100) ack 1
> +.001 write(4, ..., 100) = 100
> +0 > P. 1301:1401(100) ack 1
>
> +.099 < . 1:1(0) ack 201 win 257
> +.001 < . 1:1(0) ack 201 win 257 <nop,nop,sack 1001:1401>
> +0 > P. 201:1001(800) ack 1
>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> Cc: Neal Cardwell <ncardwell@...gle.com>
> Cc: Yuchung Cheng <ycheng@...gle.com>
Acked-by: Yuchung Cheng <ycheng@...gle.com>
Nice follow-up patch. This also works well with RACK loss detection,
since RACK only cares about time (skb_mstamp), not sequence, so
collapsing sequences is not a problem.
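
For anyone skimming the diff below, the caller-side contract after this
patch is roughly the following (a simplified sketch of the logic, not the
exact kernel code): skb_shift() now returns 0 instead of hitting BUG_ON()
when the source skb still carries linear head data, so
tcp_collapse_retrans() can simply give up on the merge:

    /* Sketch: first try to copy next_skb into skb's tail room;
     * otherwise try to shift its page frags over; if neither works,
     * return false so the caller stops collapsing at this point.
     */
    if (next_skb_size) {
            if (next_skb_size <= skb_availroom(skb))
                    skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
                                  next_skb_size);
            else if (!skb_shift(skb, next_skb, next_skb_size))
                    return false;   /* e.g. next_skb still has head data */
    }
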
> ---
> net/core/skbuff.c | 4 +++-
> net/ipv4/tcp_output.c | 22 +++++++++++-----------
> 2 files changed, 14 insertions(+), 12 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 0b2a6e94af2de73ed638634c47a0fb71e2cbc1cb..a9cb81a10c4ba895587727aa4cf098e9a38424ea 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -2656,7 +2656,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
> struct skb_frag_struct *fragfrom, *fragto;
>
> BUG_ON(shiftlen > skb->len);
> - BUG_ON(skb_headlen(skb)); /* Would corrupt stream */
> +
> + if (skb_headlen(skb))
> + return 0;
>
> todo = shiftlen;
> from = 0;
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index f57b5aa51b59cf0a58975fe34a7dcdb886ea8c50..19105b46a30436ebb85fe97ee43089e77aa028bb 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -2514,7 +2514,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
> }
>
> /* Collapses two adjacent SKB's during retransmission. */
> -static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
> +static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
> {
> struct tcp_sock *tp = tcp_sk(sk);
> struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
> @@ -2525,14 +2525,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
>
> BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
>
> + if (next_skb_size) {
> + if (next_skb_size <= skb_availroom(skb))
> + skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
> + next_skb_size);
> + else if (!skb_shift(skb, next_skb, next_skb_size))
> + return false;
> + }
> tcp_highest_sack_combine(sk, next_skb, skb);
>
> tcp_unlink_write_queue(next_skb, sk);
>
> - if (next_skb_size)
> - skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
> - next_skb_size);
> -
> if (next_skb->ip_summed == CHECKSUM_PARTIAL)
> skb->ip_summed = CHECKSUM_PARTIAL;
>
> @@ -2561,6 +2564,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
> tcp_skb_collapse_tstamp(skb, next_skb);
>
> sk_wmem_free_skb(sk, next_skb);
> + return true;
> }
>
> /* Check if coalescing SKBs is legal. */
> @@ -2610,16 +2614,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
>
> if (space < 0)
> break;
> - /* Punt if not enough space exists in the first SKB for
> - * the data in the second
> - */
> - if (skb->len > skb_availroom(to))
> - break;
>
> if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
> break;
>
> - tcp_collapse_retrans(sk, to);
> + if (!tcp_collapse_retrans(sk, to))
> + break;
> }
> }
>
>
>