lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Wed, 26 May 2021 16:35:00 +0200
From:   Eric Dumazet <edumazet@...gle.com>
To:     Leonard Crestez <cdleonard@...il.com>
Cc:     Neal Cardwell <ncardwell@...gle.com>,
        Matt Mathis <mattmathis@...gle.com>,
        "David S. Miller" <davem@...emloft.net>,
        Willem de Bruijn <willemb@...gle.com>,
        Jakub Kicinski <kuba@...nel.org>,
        Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
        David Ahern <dsahern@...nel.org>,
        John Heffner <johnwheffner@...il.com>,
        Leonard Crestez <lcrestez@...venets.com>,
        Soheil Hassas Yeganeh <soheil@...gle.com>,
        Roopa Prabhu <roopa@...ulusnetworks.com>,
        netdev <netdev@...r.kernel.org>,
        LKML <linux-kernel@...r.kernel.org>
Subject: Re: [RFCv2 3/3] tcp: Wait for sufficient data in tcp_mtu_probe

On Wed, May 26, 2021 at 12:38 PM Leonard Crestez <cdleonard@...il.com> wrote:
>
> According to RFC4821 Section 7.4 "Protocols MAY delay sending non-probes
> in order to accumulate enough data" but linux almost never does that.
>
> Implement this by returning 0 from tcp_mtu_probe if not enough data is
> queued locally but some packets are still in flight. This makes mtu
> probing more likely to happen for applications that do small writes.
>
> Only doing this if packets are in flight should ensure that writing will
> be attempted again later. This is similar to how tcp_mtu_probe already
> returns zero if the probe doesn't fit inside the receiver window or the
> congestion window.
>
> Control this with a sysctl because this implies a latency tradeoff but
> only up to one RTT.
>
> Signed-off-by: Leonard Crestez <cdleonard@...il.com>
> ---
>  Documentation/networking/ip-sysctl.rst |  5 +++++
>  include/net/netns/ipv4.h               |  1 +
>  net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
>  net/ipv4/tcp_ipv4.c                    |  1 +
>  net/ipv4/tcp_output.c                  | 18 ++++++++++++++----
>  5 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 7ab52a105a5d..967b7fac35b1 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -349,10 +349,15 @@ tcp_mtu_probe_floor - INTEGER
>         If MTU probing is enabled this caps the minimum MSS used for search_low
>         for the connection.
>
>         Default : 48
>
> +tcp_mtu_probe_waitdata - BOOLEAN
> +       Wait for enough data for an mtu probe to accumulate on the sender.
> +
> +       Default: 1
> +
>  tcp_mtu_probe_rack - BOOLEAN
>         Try to use shorter probes if RACK is also enabled
>
>         Default: 1
>
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index b4ff12f25a7f..366e7b325778 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -112,10 +112,11 @@ struct netns_ipv4 {
>  #ifdef CONFIG_NET_L3_MASTER_DEV
>         u8 sysctl_tcp_l3mdev_accept;
>  #endif
>         u8 sysctl_tcp_mtu_probing;
>         int sysctl_tcp_mtu_probe_floor;
> +       int sysctl_tcp_mtu_probe_waitdata;

If this is a boolean, you should use u8, and place this field to avoid
adding a hole.

>         int sysctl_tcp_mtu_probe_rack;
>         int sysctl_tcp_base_mss;
>         int sysctl_tcp_min_snd_mss;
>         int sysctl_tcp_probe_threshold;
>         u32 sysctl_tcp_probe_interval;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 275c91fb9cf8..53868b812958 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -847,10 +847,17 @@ static struct ctl_table ipv4_net_table[] = {
>                 .mode           = 0644,
>                 .proc_handler   = proc_dointvec_minmax,
>                 .extra1         = &tcp_min_snd_mss_min,
>                 .extra2         = &tcp_min_snd_mss_max,
>         },
> +       {
> +               .procname       = "tcp_mtu_probe_waitdata",
> +               .data           = &init_net.ipv4.sysctl_tcp_mtu_probe_waitdata,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = proc_dointvec,

If this is a boolean, please use proc_dou8vec_minmax, and SYSCTL_ZERO/SYSCTL_ONE

> +       },
>         {
>                 .procname       = "tcp_mtu_probe_rack",
>                 .data           = &init_net.ipv4.sysctl_tcp_mtu_probe_rack,
>                 .maxlen         = sizeof(int),
>                 .mode           = 0644,
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index ed8af4a7325b..940df2ae4636 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -2892,10 +2892,11 @@ static int __net_init tcp_sk_init(struct net *net)
>         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
>         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
>         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
>         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
>         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
> +       net->ipv4.sysctl_tcp_mtu_probe_waitdata = 1;
>         net->ipv4.sysctl_tcp_mtu_probe_rack = 1;
>
>         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
>         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
>         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 362f97cfb09e..268e1bac001f 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -2394,14 +2394,10 @@ static int tcp_mtu_probe(struct sock *sk)
>                  */
>                 tcp_mtu_check_reprobe(sk);
>                 return -1;
>         }
>
> -       /* Have enough data in the send queue to probe? */
> -       if (tp->write_seq - tp->snd_nxt < size_needed)
> -               return -1;
> -
>         /* Can probe fit inside congestion window? */
>         if (packets_needed > tp->snd_cwnd)
>                 return -1;
>
>         /* Can probe fit inside receiver window? If not then skip probing.
> @@ -2411,10 +2407,24 @@ static int tcp_mtu_probe(struct sock *sk)
>          * clear below.
>          */
>         if (tp->snd_wnd < size_needed)
>                 return -1;
>
> +       /* Have enough data in the send queue to probe? */
> +       if (tp->write_seq - tp->snd_nxt < size_needed) {
> +               /* If packets are already in flight it's safe to wait for more data to
> +                * accumulate on the sender because writing will be triggered as ACKs
> +                * arrive.
> +                * If no packets are in flight returning zero can stall.
> +                */
> +               if (net->ipv4.sysctl_tcp_mtu_probe_waitdata &&

I have serious doubts about RPC traffic.
Adding one RTT latency is going to make this unlikely to be used.

> +                   tcp_packets_in_flight(tp))
> +                       return 0;
> +               else
> +                       return -1;
> +       }
> +
>         /* Do we need for more acks to clear the receive window? */
>         if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
>                 return 0;
>
>         /* Do we need the congestion window to clear? */
> --
> 2.25.1
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ