Message-ID: <CAK6E8=en_sPbKnb60YnPYrVBYKY1doG_V-FFgzrbAv4gnQwy5w@mail.gmail.com>
Date:   Wed, 21 Nov 2018 14:31:57 -0800
From:   Yuchung Cheng <ycheng@...gle.com>
To:     Eric Dumazet <edumazet@...gle.com>
Cc:     "David S . Miller" <davem@...emloft.net>,
        netdev <netdev@...r.kernel.org>,
        Jean-Louis Dupond <jean-louis@...ond.be>,
        Neal Cardwell <ncardwell@...gle.com>,
        Eric Dumazet <eric.dumazet@...il.com>
Subject: Re: [PATCH net-next 2/3] tcp: implement coalescing on backlog queue

On Wed, Nov 21, 2018 at 9:52 AM, Eric Dumazet <edumazet@...gle.com> wrote:
>
> In case GRO is not as efficient as it should be, or is disabled,
> we might have a user thread trapped in __release_sock() while the
> softirq handler floods packets up to the point where we have to drop them.
>
> This patch balances the work done from the user thread and from
> softirq, giving __release_sock() a better chance to complete its work.
>
> This also helps if we receive many ACK packets, since GRO
> does not aggregate them.
>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> Tested-by: Jean-Louis Dupond <jean-louis@...ond.be>
> Cc: Neal Cardwell <ncardwell@...gle.com>
> Cc: Yuchung Cheng <ycheng@...gle.com>
> ---
>  include/uapi/linux/snmp.h |  1 +
>  net/ipv4/proc.c           |  1 +
>  net/ipv4/tcp_ipv4.c       | 75 +++++++++++++++++++++++++++++++++++----
>  3 files changed, 71 insertions(+), 6 deletions(-)
>
> diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
> index f80135e5feaa886000009db6dff75b2bc2d637b2..86dc24a96c90ab047d5173d625450facd6c6dd79 100644
> --- a/include/uapi/linux/snmp.h
> +++ b/include/uapi/linux/snmp.h
> @@ -243,6 +243,7 @@ enum
>         LINUX_MIB_TCPREQQFULLDROP,              /* TCPReqQFullDrop */
>         LINUX_MIB_TCPRETRANSFAIL,               /* TCPRetransFail */
>         LINUX_MIB_TCPRCVCOALESCE,               /* TCPRcvCoalesce */
> +       LINUX_MIB_TCPBACKLOGCOALESCE,           /* TCPBacklogCoalesce */
>         LINUX_MIB_TCPOFOQUEUE,                  /* TCPOFOQueue */
>         LINUX_MIB_TCPOFODROP,                   /* TCPOFODrop */
>         LINUX_MIB_TCPOFOMERGE,                  /* TCPOFOMerge */
> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
> index 70289682a6701438aed99a00a9705c39fa4394d3..c3610b37bb4ce665b1976d8cc907b6dd0de42ab9 100644
> --- a/net/ipv4/proc.c
> +++ b/net/ipv4/proc.c
> @@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
>         SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
>         SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
>         SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
> +       SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
>         SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
>         SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
>         SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 795605a2327504b8a025405826e7e0ca8dc8501d..401e1d1cb904a4c7963d8baa419cfbf178593344 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1619,12 +1619,10 @@ int tcp_v4_early_demux(struct sk_buff *skb)
>  bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
>  {
>         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
> -
> -       /* Only socket owner can try to collapse/prune rx queues
> -        * to reduce memory overhead, so add a little headroom here.
> -        * Few sockets backlog are possibly concurrently non empty.
> -        */
> -       limit += 64*1024;
> +       struct skb_shared_info *shinfo;
> +       const struct tcphdr *th;
> +       struct sk_buff *tail;
> +       unsigned int hdrlen;
>
>         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
>          * we can fix skb->truesize to its real value to avoid future drops.
> @@ -1636,6 +1634,71 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
>
>         skb_dst_drop(skb);
>
> +       if (unlikely(tcp_checksum_complete(skb))) {
> +               bh_unlock_sock(sk);
> +               __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
> +               __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
> +               return true;
> +       }
> +
> +       /* Attempt coalescing to last skb in backlog, even if we are
> +        * above the limits.
> +        * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
> +        */
> +       th = (const struct tcphdr *)skb->data;
> +       hdrlen = th->doff * 4;
> +       shinfo = skb_shinfo(skb);
> +
> +       if (!shinfo->gso_size)
> +               shinfo->gso_size = skb->len - hdrlen;
> +
> +       if (!shinfo->gso_segs)
> +               shinfo->gso_segs = 1;
> +
> +       tail = sk->sk_backlog.tail;
> +       if (tail &&
> +           TCP_SKB_CB(tail)->end_seq == TCP_SKB_CB(skb)->seq &&
> +#ifdef CONFIG_TLS_DEVICE
> +           tail->decrypted == skb->decrypted &&
> +#endif
> +           !memcmp(tail->data + sizeof(*th), skb->data + sizeof(*th),
> +                   hdrlen - sizeof(*th))) {
> +               bool fragstolen;
> +               int delta;
> +
> +               __skb_pull(skb, hdrlen);
> +               if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
> +                       TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
> +                       TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
> +                       TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
> +
> +                       if (TCP_SKB_CB(skb)->has_rxtstamp) {
> +                               TCP_SKB_CB(tail)->has_rxtstamp = true;
> +                               tail->tstamp = skb->tstamp;
> +                               skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
> +                       }
> +
Really nice! Would it make sense to re-use (some of) the similar
tcp_try_coalesce()?
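
Purely to illustrate that idea (the helper name below is made up for
this sketch, nothing with that name exists in the tree): the
TCP_SKB_CB() bookkeeping done here and in tcp_try_coalesce() after a
successful skb_try_coalesce() looks like it could be shared, e.g.:

	/* Hypothetical shared helper: copy coalescing metadata from @from
	 * to @to once skb_try_coalesce() has succeeded. Both
	 * tcp_try_coalesce() and the new backlog path do these same
	 * updates today.
	 */
	static void tcp_cb_coalesce_metadata(struct sk_buff *to,
					     struct sk_buff *from)
	{
		TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
		TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
		TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;

		if (TCP_SKB_CB(from)->has_rxtstamp) {
			TCP_SKB_CB(to)->has_rxtstamp = true;
			to->tstamp = from->tstamp;
			skb_hwtstamps(to)->hwtstamp =
				skb_hwtstamps(from)->hwtstamp;
		}
	}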

> +                       /* Not as strict as GRO. We only need to carry mss max value */
> +                       skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
> +                                                        skb_shinfo(tail)->gso_size);
> +
> +                       skb_shinfo(tail)->gso_segs += shinfo->gso_segs;
> +
> +                       sk->sk_backlog.len += delta;
> +                       __NET_INC_STATS(sock_net(sk),
> +                                       LINUX_MIB_TCPBACKLOGCOALESCE);
> +                       kfree_skb_partial(skb, fragstolen);
> +                       return false;
> +               }
> +               __skb_push(skb, hdrlen);
> +       }
> +
> +       /* Only socket owner can try to collapse/prune rx queues
> +        * to reduce memory overhead, so add a little headroom here.
> +        * Few sockets backlog are possibly concurrently non empty.
> +        */
> +       limit += 64*1024;
> +
>         if (unlikely(sk_add_backlog(sk, skb, limit))) {
>                 bh_unlock_sock(sk);
>                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
> --
> 2.19.1.1215.g8438c0b245-goog
>

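One more note for anyone testing this: the new counter lands in the
TcpExt section of /proc/net/netstat (nstat should report it as
TcpExtTCPBacklogCoalesce). Below is a throwaway userspace reader, just
to save a grep; nothing in it is part of the patch:

/* Read the TCPBacklogCoalesce counter from /proc/net/netstat.
 * The file carries pairs of lines per group:
 *   TcpExt: Name1 Name2 ...
 *   TcpExt: Val1  Val2  ...
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char names[8192], values[8192];
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(names, sizeof(names), f) &&
	       fgets(values, sizeof(values), f)) {
		char *n, *v, *nsave, *vsave;

		if (strncmp(names, "TcpExt:", 7))
			continue;
		n = strtok_r(names + 7, " \n", &nsave);
		v = strtok_r(values + 7, " \n", &vsave);
		while (n && v) {
			if (!strcmp(n, "TCPBacklogCoalesce")) {
				printf("TCPBacklogCoalesce: %s\n", v);
				fclose(f);
				return 0;
			}
			n = strtok_r(NULL, " \n", &nsave);
			v = strtok_r(NULL, " \n", &vsave);
		}
	}
	fclose(f);
	fprintf(stderr, "TCPBacklogCoalesce not found (kernel without this patch?)\n");
	return 1;
}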