lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CANn89iL0hCwDKGquYyGvriPEc+GXSzf+UuZG4vb0Ah-NaVL7cA@mail.gmail.com>
Date:   Mon, 29 Aug 2022 10:15:55 -0700
From:   Eric Dumazet <edumazet@...gle.com>
To:     Richard Gobert <richardbgobert@...il.com>
Cc:     David Miller <davem@...emloft.net>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>,
        Jonathan Corbet <corbet@....net>,
        Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
        David Ahern <dsahern@...nel.org>,
        Alexander Aring <alex.aring@...il.com>,
        Stefan Schmidt <stefan@...enfreihafen.org>,
        Pablo Neira Ayuso <pablo@...filter.org>,
        Jozsef Kadlecsik <kadlec@...filter.org>,
        Florian Westphal <fw@...len.de>,
        Martin KaFai Lau <kafai@...com>,
        netdev <netdev@...r.kernel.org>,
        "open list:DOCUMENTATION" <linux-doc@...r.kernel.org>,
        LKML <linux-kernel@...r.kernel.org>, linux-wpan@...r.kernel.org,
        netfilter-devel@...r.kernel.org, coreteam@...filter.org
Subject: Re: [PATCH 4/4] net-next: frags: dynamic timeout under load

On Mon, Aug 29, 2022 at 4:49 AM Richard Gobert <richardbgobert@...il.com> wrote:
>
> Calculate a dynamic fragment reassembly timeout, taking into
> consideration the current fqdir load and the load introduced by
> the peer. Reintroduce low_thresh, which now acts as a knob for
> adjusting per-peer memory limits.
>
> Signed-off-by: Richard Gobert <richardbgobert@...il.com>
> ---
>  Documentation/networking/ip-sysctl.rst |  3 +++
>  include/net/inet_frag.h                |  1 +
>  net/ipv4/inet_fragment.c               | 30 +++++++++++++++++++++++++-
>  net/ipv4/ip_fragment.c                 |  2 +-
>  4 files changed, 34 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 56cd4ea059b2..fb25aa6e22a2 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -247,6 +247,9 @@ ipfrag_low_thresh - LONG INTEGER
>         begins to remove incomplete fragment queues to free up resources.
>         The kernel still accepts new fragments for defragmentation.
>
> +       (Since linux-6.1)
> +       Maximum memory used to reassemble IP fragments sent by a single peer.
> +
>  ipfrag_time - INTEGER
>         Time in seconds to keep an IP fragment in memory.
>
> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
> index 077a0ec78a58..595a6db57a0e 100644
> --- a/include/net/inet_frag.h
> +++ b/include/net/inet_frag.h
> @@ -99,6 +99,7 @@ struct inet_frag_queue {
>         u16                     max_size;
>         struct fqdir            *fqdir;
>         struct inet_peer        *peer;
> +       u64                     timeout;

Why u64?

This is not what the timer interface uses (look at mod_timer(), it
uses "unsigned long").

>         struct rcu_head         rcu;
>  };
>
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index 8b8d77d548d4..34c5ebba4951 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -314,6 +314,30 @@ void inet_frag_free(struct inet_frag_queue *q)
>         call_rcu(&q->rcu, inet_frag_destroy_rcu);
>  }
>
> +static int inet_frag_update_timeout(struct inet_frag_queue *q)
> +{
> +       u64 peer_timeout, inet_timeout;
> +       long peer_mem, inet_mem;
> +       long high_thresh = READ_ONCE(q->fqdir->high_thresh);
> +       long low_thresh  = READ_ONCE(q->fqdir->low_thresh);
> +       u64 base_timeout = READ_ONCE(q->fqdir->timeout);
> +
> +       peer_mem = low_thresh - peer_mem_limit(q);
> +       inet_mem = high_thresh - frag_mem_limit(q->fqdir);
> +
> +       if (peer_mem <= 0 || inet_mem <= 0)
> +               return -ENOMEM;
> +
> +       /* Timeout changes linearly with respect to the amount of free memory.
> +        * Choose the more permissive of the two timeouts, to avoid limiting
> +        * the system while there is still enough memory.
> +        */
> +       peer_timeout = div64_long(base_timeout * peer_mem, low_thresh);
> +       inet_timeout = div64_long(base_timeout * inet_mem, high_thresh);
> +       q->timeout = max_t(u64, peer_timeout, inet_timeout);

If/when under load the timeout is close to zero, we would fire many
timers (increased system load) and make it impossible for datagrams
to complete.

In contrast, a reasonable timer and probabilistic drops of new datagrams
when the queue is full let some datagrams complete.

Make sure to test your change under a real DDoS, not only non-malicious netperf.

> +       return 0;
> +}
> +
>  void inet_frag_destroy(struct inet_frag_queue *q)
>  {
>         struct fqdir *fqdir;
> @@ -346,6 +370,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
>
>         q->fqdir = fqdir;
>         f->constructor(q, arg);
> +       if (inet_frag_update_timeout(q)) {
> +               inet_frag_free(q);
> +               return NULL;
> +       }
>         add_frag_mem_limit(q, f->qsize);
>
>         timer_setup(&q->timer, f->frag_expire, 0);
> @@ -367,7 +395,7 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
>                 *prev = ERR_PTR(-ENOMEM);
>                 return NULL;
>         }
> -       mod_timer(&q->timer, jiffies + fqdir->timeout);
> +       mod_timer(&q->timer, jiffies + q->timeout);
>
>         *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
>                                                  &q->node, f->rhash_params);
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index e35061f6aadb..88a99242d721 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -236,7 +236,7 @@ static int ip_frag_reinit(struct ipq *qp)
>  {
>         unsigned int sum_truesize = 0;
>
> -       if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
> +       if (!mod_timer(&qp->q.timer, jiffies + qp->q.timeout)) {
>                 refcount_inc(&qp->q.refcnt);
>                 return -ETIMEDOUT;
>         }
> --
> 2.36.1
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ