Message-ID: <ZTdqpcDFVHhFwWMc@xsang-OptiPlex-9020>
Date: Tue, 24 Oct 2023 14:56:37 +0800
From: Oliver Sang <oliver.sang@...el.com>
To: Yosry Ahmed <yosryahmed@...gle.com>
CC: Feng Tang <feng.tang@...el.com>,
Shakeel Butt <shakeelb@...gle.com>,
"oe-lkp@...ts.linux.dev" <oe-lkp@...ts.linux.dev>,
lkp <lkp@...el.com>,
"cgroups@...r.kernel.org" <cgroups@...r.kernel.org>,
"linux-mm@...ck.org" <linux-mm@...ck.org>,
"Huang, Ying" <ying.huang@...el.com>,
"Yin, Fengwei" <fengwei.yin@...el.com>,
Andrew Morton <akpm@...ux-foundation.org>,
"Johannes Weiner" <hannes@...xchg.org>,
Michal Hocko <mhocko@...nel.org>,
Roman Gushchin <roman.gushchin@...ux.dev>,
Muchun Song <muchun.song@...ux.dev>,
Ivan Babrou <ivan@...udflare.com>, Tejun Heo <tj@...nel.org>,
Michal Koutný <mkoutny@...e.com>,
Waiman Long <longman@...hat.com>,
"kernel-team@...udflare.com" <kernel-team@...udflare.com>,
Wei Xu <weixugc@...gle.com>, Greg Thelen <gthelen@...gle.com>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
<oliver.sang@...el.com>
Subject: Re: [PATCH v2 3/5] mm: memcg: make stats flushing threshold per-memcg
Hi Yosry Ahmed,
On Mon, Oct 23, 2023 at 07:13:50PM -0700, Yosry Ahmed wrote:
...
>
> I still could not run the benchmark, but I used a version of
> fallocate1.c that does 1 million iterations. I ran 100 in parallel.
> This showed a ~13% regression with the patch, so it is not the same as
> the will-it-scale version, but it could be an indicator.
>
> With that, I did not see any improvement with the fixlet above or
> ___cacheline_aligned_in_smp. So you can scratch that.
>
> I did, however, see some improvement with reducing the indirection
> layers by moving stats_updates directly into struct mem_cgroup. The
> regression in my manual testing went down to 9%. Still not great, but
> I am wondering how this reflects on the benchmark. If you're able to
> test it, that would be great; the diff is below. Meanwhile, I am still
> looking for other improvements that can be made.
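For context, the fallocate1.c testcase referenced above boils down to a
tight fallocate()/ftruncate() loop on an unlinked temporary file. A
minimal standalone sketch of that loop (the file path, buffer size, and
loop structure here are assumptions, not the exact will-it-scale
source):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdlib.h>
	#include <unistd.h>

	#define BUFLEN 4096		/* assumed allocation size */
	#define ITERS  1000000UL	/* the "1 million iterations" above */

	int main(void)
	{
		char tmpfile[] = "/tmp/fallocate1.XXXXXX";
		int fd = mkstemp(tmpfile);
		unsigned long i;

		if (fd < 0)
			return 1;
		unlink(tmpfile);	/* file persists only via the open fd */

		/* allocate, then truncate back to empty, once per iteration */
		for (i = 0; i < ITERS; i++) {
			if (fallocate(fd, 0, 0, BUFLEN) || ftruncate(fd, 0))
				return 1;
		}
		return 0;
	}

Running ~100 instances of such a loop in parallel is what produced the
~13% number quoted above.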
We applied the previous patch set as below:
c5f50d8b23c79 (linux-review/Yosry-Ahmed/mm-memcg-change-flush_next_time-to-flush_last_time/20231010-112257) mm: memcg: restore subtree stats flushing
ac8a48ba9e1ca mm: workingset: move the stats flush into workingset_test_recent()
51d74c18a9c61 mm: memcg: make stats flushing threshold per-memcg
130617edc1cd1 mm: memcg: move vmstats structs definition above flushing code
26d0ee342efc6 mm: memcg: change flush_next_time to flush_last_time
25478183883e6 Merge branch 'mm-nonmm-unstable' into mm-everything <---- the base our tool picked for the patch set
I tried to apply the patch below on top of either 51d74c18a9c61 or
c5f50d8b23c79, but it failed. Could you advise on how to apply this patch?
Thanks
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index f64ac140083e..b4dfcd8b9cc1 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -270,6 +270,9 @@ struct mem_cgroup {
>  
>  	CACHELINE_PADDING(_pad1_);
>  
> +	/* Stats updates since the last flush */
> +	atomic64_t stats_updates;
> +
>  	/* memory.stat */
>  	struct memcg_vmstats	*vmstats;
>  
> @@ -309,6 +312,7 @@ struct mem_cgroup {
>  	atomic_t		moving_account;
>  	struct task_struct	*move_lock_task;
>  
> +	unsigned int __percpu *stats_updates_percpu;
>  	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
>  
>  #ifdef CONFIG_CGROUP_WRITEBACK
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7cbc7d94eb65..e5d2f3d4d874 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -627,9 +627,6 @@ struct memcg_vmstats_percpu {
>  	/* Cgroup1: threshold notifications & softlimit tree updates */
>  	unsigned long		nr_page_events;
>  	unsigned long		targets[MEM_CGROUP_NTARGETS];
> -
> -	/* Stats updates since the last flush */
> -	unsigned int		stats_updates;
>  };
>  
>  struct memcg_vmstats {
> @@ -644,9 +641,6 @@ struct memcg_vmstats {
>  	/* Pending child counts during tree propagation */
>  	long			state_pending[MEMCG_NR_STAT];
>  	unsigned long		events_pending[NR_MEMCG_EVENTS];
> -
> -	/* Stats updates since the last flush */
> -	atomic64_t		stats_updates;
>  };
>  
>  /*
> @@ -695,14 +689,14 @@ static void memcg_stats_unlock(void)
>  
>  static bool memcg_should_flush_stats(struct mem_cgroup *memcg)
>  {
> -	return atomic64_read(&memcg->vmstats->stats_updates) >
> +	return atomic64_read(&memcg->stats_updates) >
>  		MEMCG_CHARGE_BATCH * num_online_cpus();
>  }
>  
>  static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
>  {
>  	int cpu = smp_processor_id();
> -	unsigned int x;
> +	unsigned int *stats_updates_percpu;
>  
>  	if (!val)
>  		return;
> @@ -710,10 +704,10 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
>  	cgroup_rstat_updated(memcg->css.cgroup, cpu);
>  
>  	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
> -		x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates,
> -					  abs(val));
> +		stats_updates_percpu = this_cpu_ptr(memcg->stats_updates_percpu);
>  
> -		if (x < MEMCG_CHARGE_BATCH)
> +		*stats_updates_percpu += abs(val);
> +		if (*stats_updates_percpu < MEMCG_CHARGE_BATCH)
>  			continue;
>  
>  		/*
> @@ -721,8 +715,8 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
>  		 * redundant. Avoid the overhead of the atomic update.
>  		 */
>  		if (!memcg_should_flush_stats(memcg))
> -			atomic64_add(x, &memcg->vmstats->stats_updates);
> -		__this_cpu_write(memcg->vmstats_percpu->stats_updates, 0);
> +			atomic64_add(*stats_updates_percpu, &memcg->stats_updates);
> +		*stats_updates_percpu = 0;
>  	}
>  }
>  
> @@ -5467,6 +5461,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
>  		free_mem_cgroup_per_node_info(memcg, node);
>  	kfree(memcg->vmstats);
>  	free_percpu(memcg->vmstats_percpu);
> +	free_percpu(memcg->stats_updates_percpu);
>  	kfree(memcg);
>  }
>  
> @@ -5504,6 +5499,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
>  	if (!memcg->vmstats_percpu)
>  		goto fail;
>  
> +	memcg->stats_updates_percpu = alloc_percpu_gfp(unsigned int,
> +						       GFP_KERNEL_ACCOUNT);
> +	if (!memcg->stats_updates_percpu)
> +		goto fail;
> +
>  	for_each_node(node)
>  		if (alloc_mem_cgroup_per_node_info(memcg, node))
>  			goto fail;
> @@ -5735,10 +5735,12 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>  	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
>  	struct memcg_vmstats_percpu *statc;
> +	int *stats_updates_percpu;
>  	long delta, delta_cpu, v;
>  	int i, nid;
>  
>  	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
> +	stats_updates_percpu = per_cpu_ptr(memcg->stats_updates_percpu, cpu);
>  
>  	for (i = 0; i < MEMCG_NR_STAT; i++) {
>  		/*
> @@ -5826,10 +5828,10 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  			}
>  		}
>  	}
> -	statc->stats_updates = 0;
> +	*stats_updates_percpu = 0;
>  	/* We are in a per-cpu loop here, only do the atomic write once */
> -	if (atomic64_read(&memcg->vmstats->stats_updates))
> -		atomic64_set(&memcg->vmstats->stats_updates, 0);
> +	if (atomic64_read(&memcg->stats_updates))
> +		atomic64_set(&memcg->stats_updates, 0);
>  }
>  
>  #ifdef CONFIG_MMU
> 
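For readers following along: the scheme the diff implements is a cheap
per-CPU counter that is folded into a shared atomic only after it
crosses MEMCG_CHARGE_BATCH, and the fold is skipped entirely once a
flush is already due. A userspace mock of that scheme, under stated
assumptions (a _Thread_local counter standing in for per-CPU data,
BATCH standing in for MEMCG_CHARGE_BATCH, nthreads standing in for
num_online_cpus()), not the kernel code itself:

	#include <stdatomic.h>
	#include <stdlib.h>

	#define BATCH 64	/* stand-in for MEMCG_CHARGE_BATCH */

	struct group {
		atomic_long stats_updates;	/* shared "since last flush" total */
	};

	/* thread-local stand-in for the per-CPU stats_updates counter */
	static _Thread_local unsigned int stats_updates_local;

	/* a flush is due once the shared total exceeds BATCH per thread */
	static int should_flush(struct group *g, int nthreads)
	{
		return atomic_load(&g->stats_updates) > (long)BATCH * nthreads;
	}

	static void stat_updated(struct group *g, int val, int nthreads)
	{
		stats_updates_local += abs(val);
		if (stats_updates_local < BATCH)
			return;	/* fast path: no shared-cacheline traffic */

		/*
		 * Fold the local batch into the shared atomic at most once
		 * per BATCH updates; if a flush is already due, the add is
		 * redundant, so skip even that.
		 */
		if (!should_flush(g, nthreads))
			atomic_fetch_add(&g->stats_updates, stats_updates_local);
		stats_updates_local = 0;
	}

Each worker thread would call stat_updated(&g, 1, nthreads) per stat
change; a flusher checks should_flush() and then zeroes both levels,
which mirrors what mem_cgroup_css_rstat_flush() does at the end of the
diff. The other half of the change is purely data layout:
memcg->vmstats->stats_updates costs two dependent pointer loads per
check, while memcg->stats_updates costs one, which appears to be where
the improvement from ~13% to 9% above comes from.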