lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANn89i+4x+YLVmPNSSiOEB4isQYussWSLqFb5x+0hQ5hyS4j_A@mail.gmail.com>
Date:   Tue, 11 May 2021 15:04:28 +0200
From:   Eric Dumazet <edumazet@...gle.com>
To:     Leonard Crestez <cdleonard@...il.com>
Cc:     Neal Cardwell <ncardwell@...gle.com>,
        Matt Mathis <mattmathis@...gle.com>,
        "David S. Miller" <davem@...emloft.net>,
        Willem de Bruijn <willemb@...gle.com>,
        Jakub Kicinski <kuba@...nel.org>,
        Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
        David Ahern <dsahern@...nel.org>,
        John Heffner <johnwheffner@...il.com>,
        Leonard Crestez <lcrestez@...venets.com>,
        Soheil Hassas Yeganeh <soheil@...gle.com>,
        Roopa Prabhu <roopa@...ulusnetworks.com>,
        netdev <netdev@...r.kernel.org>,
        LKML <linux-kernel@...r.kernel.org>
Subject: Re: [RFC 1/3] tcp: Consider mtu probing for tcp_xmit_size_goal

On Tue, May 11, 2021 at 2:04 PM Leonard Crestez <cdleonard@...il.com> wrote:
>
> According to RFC4821 Section 7.4 "Protocols MAY delay sending non-probes
> in order to accumulate enough data" but linux almost never does that.
>
> Linux checks for (probe_size + (1 + reorder) * mss_cache) bytes to be
> available in the send buffer and if that condition is not met it will
> send anyway using the current MSS. The feature can be made to work by
> sending very large chunks of data from userspace (for example 128k) but
> for small writes on fast links tcp mtu probes almost never happen.

Why should they happen ?

I am not sure the kernel should perform extra checks just because
applications are not properly written.

>
> This patch tries to take mtu probe into account in tcp_xmit_size_goal, a
> function which otherwise attempts to accumulate a packet suitable for
> TSO. No delays are introduced beyond existing autocork heuristics.


MTU probing should not be attempted for every write().
This belongs to some kind of slow path, once in a while.

>
> Suggested-by: Matt Mathis <mattmathis@...gle.com>
> Signed-off-by: Leonard Crestez <cdleonard@...il.com>
> ---
>  Documentation/networking/ip-sysctl.rst |  5 ++++
>  include/net/inet_connection_sock.h     |  4 ++-
>  include/net/netns/ipv4.h               |  1 +
>  include/net/tcp.h                      |  1 +
>  net/ipv4/sysctl_net_ipv4.c             |  7 ++++++
>  net/ipv4/tcp.c                         | 11 +++++++-
>  net/ipv4/tcp_ipv4.c                    |  1 +
>  net/ipv4/tcp_output.c                  | 35 +++++++++++++++++++++++---
>  8 files changed, 59 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index c2ecc9894fd0..108a5ee227d3 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -320,10 +320,15 @@ tcp_mtu_probe_floor - INTEGER
>         If MTU probing is enabled this caps the minimum MSS used for search_low
>         for the connection.
>
>         Default : 48
>
> +tcp_mtu_probe_autocork - BOOLEAN
> +       Take into account mtu probe size when accumulating data via autocorking.
> +
> +       Default: 1
> +
>  tcp_min_snd_mss - INTEGER
>         TCP SYN and SYNACK messages usually advertise an ADVMSS option,
>         as described in RFC 1122 and RFC 6691.
>
>         If this ADVMSS option is smaller than tcp_min_snd_mss,
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index 3c8c59471bc1..9a53d698c2e6 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -123,11 +123,13 @@ struct inet_connection_sock {
>                 /* Range of MTUs to search */
>                 int               search_high;
>                 int               search_low;
>
>                 /* Information on the current probe. */
> -               u32               probe_size:31,
> +               u32               probe_size:30,
> +               /* Are we actively accumulating data for an mtu probe? */
> +                                 wait_data:1,
>                 /* Is the MTUP feature enabled for this connection? */
>                                   enabled:1;
>
>                 u32               probe_timestamp;
>         } icsk_mtup;
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index f6af8d96d3c6..3a2d8bf2b20a 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -112,10 +112,11 @@ struct netns_ipv4 {
>  #ifdef CONFIG_NET_L3_MASTER_DEV
>         u8 sysctl_tcp_l3mdev_accept;
>  #endif
>         u8 sysctl_tcp_mtu_probing;
>         int sysctl_tcp_mtu_probe_floor;
> +       int sysctl_tcp_mtu_probe_autocork;
>         int sysctl_tcp_base_mss;
>         int sysctl_tcp_min_snd_mss;
>         int sysctl_tcp_probe_threshold;
>         u32 sysctl_tcp_probe_interval;
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index d05193cb0d99..fb656490c901 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -666,10 +666,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
>  void tcp_initialize_rcv_mss(struct sock *sk);
>
>  int tcp_mtu_to_mss(struct sock *sk, int pmtu);
>  int tcp_mss_to_mtu(struct sock *sk, int mss);
>  void tcp_mtup_init(struct sock *sk);
> +int tcp_mtu_probe_size_needed(struct sock *sk, int *probe_size);
>
>  static inline void tcp_bound_rto(const struct sock *sk)
>  {
>         if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
>                 inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index a62934b9f15a..e19176c17973 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -827,10 +827,17 @@ static struct ctl_table ipv4_net_table[] = {
>                 .mode           = 0644,
>                 .proc_handler   = proc_dointvec_minmax,
>                 .extra1         = &tcp_min_snd_mss_min,
>                 .extra2         = &tcp_min_snd_mss_max,
>         },
> +       {
> +               .procname       = "tcp_mtu_probe_autocork",
> +               .data           = &init_net.ipv4.sysctl_tcp_mtu_probe_autocork,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = proc_dointvec,
> +       },
>         {
>                 .procname       = "tcp_probe_threshold",
>                 .data           = &init_net.ipv4.sysctl_tcp_probe_threshold,
>                 .maxlen         = sizeof(int),
>                 .mode           = 0644,
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index f1c1f9e3de72..23cfb2db28b4 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -913,10 +913,11 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
>  }
>
>  static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
>                                        int large_allowed)
>  {
> +       struct inet_connection_sock *icsk = inet_csk(sk);
>         struct tcp_sock *tp = tcp_sk(sk);
>         u32 new_size_goal, size_goal;
>
>         if (!large_allowed)
>                 return mss_now;
> @@ -932,11 +933,19 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
>                 tp->gso_segs = min_t(u16, new_size_goal / mss_now,
>                                      sk->sk_gso_max_segs);
>                 size_goal = tp->gso_segs * mss_now;
>         }
>
> -       return max(size_goal, mss_now);
> +       size_goal = max(size_goal, mss_now);
> +
> +       if (unlikely(icsk->icsk_mtup.wait_data)) {
> +               int mtu_probe_size_needed = tcp_mtu_probe_size_needed(sk, NULL);
> +               if (mtu_probe_size_needed > 0)
> +                       size_goal = max(size_goal, (u32)mtu_probe_size_needed);
> +       }


I think you are mistaken.

This function usually returns 64KB depending on MSS.
 Have you really tested this part ?

Also adding in TCP fast path some tweaks for MTU probing while
applications are performing small writes
is not acceptable.


> +
> +       return size_goal;
>  }
>
>  int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
>  {
>         int mss_now;
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 312184cead57..7e75423c08c9 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -2889,10 +2889,11 @@ static int __net_init tcp_sk_init(struct net *net)
>         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
>         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
>         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
>         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
>         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
> +       net->ipv4.sysctl_tcp_mtu_probe_autocork = 1;
>
>         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
>         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
>         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
>
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index bde781f46b41..5a320d792ec4 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -2311,10 +2311,31 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
>         }
>
>         return true;
>  }
>
> +/* Calculate the size of an MTU probe
> + * Probing the MTU requires one packets which is larger that current MSS as well
> + * as enough following mtu-sized packets to ensure that a probe loss can be
> + * detected without a full Retransmit Time out.
> + */
> +int tcp_mtu_probe_size_needed(struct sock *sk, int *probe_size)
> +{
> +       struct inet_connection_sock *icsk = inet_csk(sk);
> +       struct tcp_sock *tp = tcp_sk(sk);
> +       int probe_size_val;
> +       int size_needed;
> +
> +       /* This might be a little slow: */
> +       probe_size_val = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high + icsk->icsk_mtup.search_low) >> 1);
> +       if (probe_size)
> +               *probe_size = probe_size_val;
> +       size_needed = probe_size_val + (tp->reordering + 1) * tp->mss_cache;
> +
> +       return size_needed;
> +}
> +
>  /* Create a new MTU probe if we are ready.
>   * MTU probe is regularly attempting to increase the path MTU by
>   * deliberately sending larger packets.  This discovers routing
>   * changes resulting in larger path MTUs.
>   *
> @@ -2349,13 +2370,12 @@ static int tcp_mtu_probe(struct sock *sk)
>         /* Use binary search for probe_size between tcp_mss_base,
>          * and current mss_clamp. if (search_high - search_low)
>          * smaller than a threshold, backoff from probing.
>          */
>         mss_now = tcp_current_mss(sk);
> -       probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
> -                                   icsk->icsk_mtup.search_low) >> 1);
> -       size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
> +       size_needed = tcp_mtu_probe_size_needed(sk, &probe_size);
> +
>         interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
>         /* When misfortune happens, we are reprobing actively,
>          * and then reprobe timer has expired. We stick with current
>          * probing process by not resetting search range to its orignal.
>          */
> @@ -2368,11 +2388,11 @@ static int tcp_mtu_probe(struct sock *sk)
>                 return -1;
>         }
>
>         /* Have enough data in the send queue to probe? */
>         if (tp->write_seq - tp->snd_nxt < size_needed)
> -               return -1;
> +               return net->ipv4.sysctl_tcp_mtu_probe_autocork ? 0 : -1;
>
>         if (tp->snd_wnd < size_needed)
>                 return -1;
>         if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
>                 return 0;
> @@ -2596,11 +2616,13 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
>   * but cannot send anything now because of SWS or another problem.
>   */
>  static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
>                            int push_one, gfp_t gfp)
>  {
> +       struct inet_connection_sock *icsk = inet_csk(sk);
>         struct tcp_sock *tp = tcp_sk(sk);
> +       struct net *net = sock_net(sk);
>         struct sk_buff *skb;
>         unsigned int tso_segs, sent_pkts;
>         int cwnd_quota;
>         int result;
>         bool is_cwnd_limited = false, is_rwnd_limited = false;
> @@ -2611,13 +2633,18 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
>         tcp_mstamp_refresh(tp);
>         if (!push_one) {
>                 /* Do MTU probing. */
>                 result = tcp_mtu_probe(sk);
>                 if (!result) {
> +                       if (net->ipv4.sysctl_tcp_mtu_probe_autocork)
> +                               icsk->icsk_mtup.wait_data = true;
>                         return false;
>                 } else if (result > 0) {
> +                       icsk->icsk_mtup.wait_data = false;
>                         sent_pkts = 1;
> +               } else {
> +                       icsk->icsk_mtup.wait_data = false;
>                 }
>         }
>
>         max_segs = tcp_tso_segs(sk, mss_now);
>         while ((skb = tcp_send_head(sk))) {
> --
> 2.25.1
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ