lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Tue, 6 Dec 2022 20:13:50 +0100
From:   Eric Dumazet <edumazet@...gle.com>
To:     Johannes Weiner <hannes@...xchg.org>
Cc:     Ivan Babrou <ivan@...udflare.com>, Linux MM <linux-mm@...ck.org>,
        Linux Kernel Network Developers <netdev@...r.kernel.org>,
        linux-kernel <linux-kernel@...r.kernel.org>,
        Michal Hocko <mhocko@...nel.org>,
        Roman Gushchin <roman.gushchin@...ux.dev>,
        Shakeel Butt <shakeelb@...gle.com>,
        Muchun Song <songmuchun@...edance.com>,
        Andrew Morton <akpm@...ux-foundation.org>,
        "David S. Miller" <davem@...emloft.net>,
        Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
        David Ahern <dsahern@...nel.org>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>, cgroups@...r.kernel.org,
        kernel-team <kernel-team@...udflare.com>
Subject: Re: Low TCP throughput due to vmpressure with swap enabled

On Tue, Dec 6, 2022 at 8:00 PM Johannes Weiner <hannes@...xchg.org> wrote:
>
> On Mon, Dec 05, 2022 at 04:50:46PM -0800, Ivan Babrou wrote:
> > And now I can see plenty of this:
> >
> > [  108.156707][ T5175] socket pressure[2]: 4294673429
> > [  108.157050][ T5175] socket pressure[2]: 4294673429
> > [  108.157301][ T5175] socket pressure[2]: 4294673429
> > [  108.157581][ T5175] socket pressure[2]: 4294673429
> > [  108.157874][ T5175] socket pressure[2]: 4294673429
> > [  108.158254][ T5175] socket pressure[2]: 4294673429
> >
> > I think the first result below is to blame:
> >
> > $ rg '.->socket_pressure' mm
> > mm/memcontrol.c
> > 5280: memcg->socket_pressure = jiffies;
> > 7198: memcg->socket_pressure = 0;
> > 7201: memcg->socket_pressure = 1;
> > 7211: memcg->socket_pressure = 0;
> > 7215: memcg->socket_pressure = 1;
>
> Hoo boy, that's a silly mistake indeed. Thanks for tracking it down.
>
> > While we set socket_pressure to either zero or one in
> > mem_cgroup_charge_skmem, it is still initialized to jiffies on memcg
> > creation. Zero seems like a more appropriate starting point. With that
> > change I see it working as expected with no TCP speed bumps. My
> > ebpf_exporter program also looks happy and reports zero clamps in my
> > brief testing.
>
> Excellent, now this behavior makes sense.
>
> > I also think we should downgrade socket_pressure from "unsigned long"
> > to "bool", as it only holds zero and one now.
>
> Sounds good to me!
>
> Attaching the updated patch below. If nobody has any objections, I'll
> add a proper changelog, reported-bys, sign-off etc and send it out.
>
> ---
>  include/linux/memcontrol.h |  8 +++---
>  include/linux/vmpressure.h |  7 ++---
>  mm/memcontrol.c            | 20 +++++++++----
>  mm/vmpressure.c            | 58 ++++++--------------------------------
>  mm/vmscan.c                | 15 +---------
>  5 files changed, 30 insertions(+), 78 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index e1644a24009c..ef1c388be5b3 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -283,11 +283,11 @@ struct mem_cgroup {
>         atomic_long_t           memory_events[MEMCG_NR_MEMORY_EVENTS];
>         atomic_long_t           memory_events_local[MEMCG_NR_MEMORY_EVENTS];
>
> -       unsigned long           socket_pressure;
> +       /* Socket memory allocations have failed */
> +       bool                    socket_pressure;
>
>         /* Legacy tcp memory accounting */
>         bool                    tcpmem_active;
> -       int                     tcpmem_pressure;
>
>  #ifdef CONFIG_MEMCG_KMEM
>         int kmemcg_id;
> @@ -1701,10 +1701,10 @@ void mem_cgroup_sk_alloc(struct sock *sk);
>  void mem_cgroup_sk_free(struct sock *sk);
>  static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
>  {
> -       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
> +       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->socket_pressure)

&& READ_ONCE(memcg->socket_pressure))

>                 return true;
>         do {
> -               if (time_before(jiffies, READ_ONCE(memcg->socket_pressure)))
> +               if (memcg->socket_pressure)

if (READ_ONCE(...))

>                         return true;
>         } while ((memcg = parent_mem_cgroup(memcg)));
>         return false;
> diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
> index 6a2f51ebbfd3..20d93de37a17 100644
> --- a/include/linux/vmpressure.h
> +++ b/include/linux/vmpressure.h
> @@ -11,9 +11,6 @@
>  #include <linux/eventfd.h>
>
>  struct vmpressure {
> -       unsigned long scanned;
> -       unsigned long reclaimed;
> -
>         unsigned long tree_scanned;
>         unsigned long tree_reclaimed;
>         /* The lock is used to keep the scanned/reclaimed above in sync. */
> @@ -30,7 +27,7 @@ struct vmpressure {
>  struct mem_cgroup;
>
>  #ifdef CONFIG_MEMCG
> -extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
> +extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
>                        unsigned long scanned, unsigned long reclaimed);
>  extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
>
> @@ -44,7 +41,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
>  extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
>                                         struct eventfd_ctx *eventfd);
>  #else
> -static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
> +static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
>                               unsigned long scanned, unsigned long reclaimed) {}
>  static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
>                                    int prio) {}
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2d8549ae1b30..0d4b9dbe775a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5277,7 +5277,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
>         vmpressure_init(&memcg->vmpressure);
>         INIT_LIST_HEAD(&memcg->event_list);
>         spin_lock_init(&memcg->event_list_lock);
> -       memcg->socket_pressure = jiffies;
>  #ifdef CONFIG_MEMCG_KMEM
>         memcg->kmemcg_id = -1;
>         INIT_LIST_HEAD(&memcg->objcg_list);
> @@ -7195,10 +7194,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
>                 struct page_counter *fail;
>
>                 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
> -                       memcg->tcpmem_pressure = 0;

Orthogonal to your patch, but:

Maybe avoid touching this cache line too often and use READ/WRITE_ONCE()?

    if (READ_ONCE(memcg->socket_pressure))
      WRITE_ONCE(memcg->socket_pressure, false);


> +                       memcg->socket_pressure = false;
>                         return true;
>                 }
> -               memcg->tcpmem_pressure = 1;
> +               memcg->socket_pressure = true;

Same remark.

>                 if (gfp_mask & __GFP_NOFAIL) {
>                         page_counter_charge(&memcg->tcpmem, nr_pages);
>                         return true;
> @@ -7206,12 +7205,21 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
>                 return false;
>         }
>
> -       if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
> -               mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
> -               return true;
> +       if (try_charge(memcg, gfp_mask & ~__GFP_NOFAIL, nr_pages) == 0) {
> +               memcg->socket_pressure = false;

Same remark.

> +               goto success;
> +       }
> +       memcg->socket_pressure = true;

Same remark.

> +       if (gfp_mask & __GFP_NOFAIL) {
> +               try_charge(memcg, gfp_mask, nr_pages);
> +               goto success;
>         }
>
>         return false;
> +
> +success:
> +       mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
> +       return true;
>  }
>
>  /**

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ