Date: Tue, 30 Apr 2024 10:30:51 -0700
From: "T.J. Mercier" <tjmercier@...gle.com>
To: Shakeel Butt <shakeel.butt@...ux.dev>
Cc: Andrew Morton <akpm@...ux-foundation.org>, Johannes Weiner <hannes@...xchg.org>, 
	Michal Hocko <mhocko@...nel.org>, Roman Gushchin <roman.gushchin@...ux.dev>, 
	Muchun Song <muchun.song@...ux.dev>, Yosry Ahmed <yosryahmed@...gle.com>, kernel-team@...a.com, 
	linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3 4/8] memcg: reduce memory for the lruvec and memcg stats

On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@...ux.dev> wrote:
>
> At the moment, the amount of memory allocated for stats related structs
> in the mem_cgroup corresponds to the size of enum node_stat_item.
> However not all fields in enum node_stat_item has corresponding memcg

typo: "have corresponding"

> stats. So, let's use indirection mechanism similar to the one used for
> memcg vmstats management.
>
> For a given x86_64 config, the size of stats with and without patch is:
>
> structs size in bytes         w/o     with
>
> struct lruvec_stats           1128     648
> struct lruvec_stats_percpu     752     432
> struct memcg_vmstats          1832    1352
> struct memcg_vmstats_percpu   1280     960
>
> The memory savings is further compounded by the fact that these structs
> are allocated for each cpu and for each node. To be precise, for each
> memcg the memory saved would be:
>
> Memory saved = ((21 * 3 * NR_NODES) + (21 * 2 * NR_NODS * NR_CPUS) +

typo: "NR_NODES"

>                (21 * 3) + (21 * 2 * NR_CPUS)) * sizeof(long)
>
> Where 21 is the number of fields eliminated.
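For concreteness (illustrative numbers, not from the patch): with NR_NODES = 2
and NR_CPUS = 64 on x86_64 (sizeof(long) == 8), that works out to
((21 * 3 * 2) + (21 * 2 * 2 * 64) + (21 * 3) + (21 * 2 * 64)) * 8 =
8253 * 8 = 66024 bytes, roughly 64 KiB saved per memcg.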
>
> Signed-off-by: Shakeel Butt <shakeel.butt@...ux.dev>
> ---
>
> Changes since v2:
> - N/A
>
>  mm/memcontrol.c | 138 ++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 115 insertions(+), 23 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 434cff91b65e..f424c5b2ba9b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -576,35 +576,105 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
>         return mz;
>  }
>
> +/* Subset of node_stat_item for memcg stats */
> +static const unsigned int memcg_node_stat_items[] = {
> +       NR_INACTIVE_ANON,
> +       NR_ACTIVE_ANON,
> +       NR_INACTIVE_FILE,
> +       NR_ACTIVE_FILE,
> +       NR_UNEVICTABLE,
> +       NR_SLAB_RECLAIMABLE_B,
> +       NR_SLAB_UNRECLAIMABLE_B,
> +       WORKINGSET_REFAULT_ANON,
> +       WORKINGSET_REFAULT_FILE,
> +       WORKINGSET_ACTIVATE_ANON,
> +       WORKINGSET_ACTIVATE_FILE,
> +       WORKINGSET_RESTORE_ANON,
> +       WORKINGSET_RESTORE_FILE,
> +       WORKINGSET_NODERECLAIM,
> +       NR_ANON_MAPPED,
> +       NR_FILE_MAPPED,
> +       NR_FILE_PAGES,
> +       NR_FILE_DIRTY,
> +       NR_WRITEBACK,
> +       NR_SHMEM,
> +       NR_SHMEM_THPS,
> +       NR_FILE_THPS,
> +       NR_ANON_THPS,
> +       NR_KERNEL_STACK_KB,
> +       NR_PAGETABLE,
> +       NR_SECONDARY_PAGETABLE,
> +#ifdef CONFIG_SWAP
> +       NR_SWAPCACHE,
> +#endif
> +};
> +
> +static const unsigned int memcg_stat_items[] = {
> +       MEMCG_SWAP,
> +       MEMCG_SOCK,
> +       MEMCG_PERCPU_B,
> +       MEMCG_VMALLOC,
> +       MEMCG_KMEM,
> +       MEMCG_ZSWAP_B,
> +       MEMCG_ZSWAPPED,
> +};
> +
> +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
> +#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
> +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
> +
> +static void init_memcg_stats(void)
> +{
> +       int8_t i, j = 0;
> +
> +       /* Switch to short once this failure occurs. */
> +       BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);
> +
> +       for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
> +               mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j;
> +
> +       for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
> +               mem_cgroup_stats_index[memcg_stat_items[i]] = ++j;
> +}
> +
> +static inline int memcg_stats_index(int idx)
> +{
> +       return mem_cgroup_stats_index[idx] - 1;

Could this just be: return mem_cgroup_stats_index[idx];
with a postfix increment of j in init_memcg_stats instead of the prefix
increment?
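
(For reference, a minimal userspace sketch of the index table as I read it;
the identifiers are made up and this is not kernel code. The table is
zero-initialized, the stored values are 1..N, and the "- 1" in the getter
maps the untouched zero entries to -1 for items that have no memcg counter.)

#include <stdio.h>

#define NR_ITEMS 8
static signed char lookup[NR_ITEMS];                 /* zero-initialized */
static const unsigned int tracked[] = { 2, 5, 7 };   /* made-up subset */

static void init_lookup(void)
{
	int j = 0;

	for (unsigned int i = 0; i < sizeof(tracked) / sizeof(tracked[0]); i++)
		lookup[tracked[i]] = ++j;            /* stored values: 1..N */
}

static int stats_index(int idx)
{
	return lookup[idx] - 1;                      /* -1 == not tracked */
}

int main(void)
{
	init_lookup();
	printf("%d %d\n", stats_index(5), stats_index(3));   /* prints "1 -1" */
	return 0;
}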


> +}
> +
>  struct lruvec_stats_percpu {
>         /* Local (CPU and cgroup) state */
> -       long state[NR_VM_NODE_STAT_ITEMS];
> +       long state[NR_MEMCG_NODE_STAT_ITEMS];
>
>         /* Delta calculation for lockless upward propagation */
> -       long state_prev[NR_VM_NODE_STAT_ITEMS];
> +       long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
>  };
>
>  struct lruvec_stats {
>         /* Aggregated (CPU and subtree) state */
> -       long state[NR_VM_NODE_STAT_ITEMS];
> +       long state[NR_MEMCG_NODE_STAT_ITEMS];
>
>         /* Non-hierarchical (CPU aggregated) state */
> -       long state_local[NR_VM_NODE_STAT_ITEMS];
> +       long state_local[NR_MEMCG_NODE_STAT_ITEMS];
>
>         /* Pending child counts during tree propagation */
> -       long state_pending[NR_VM_NODE_STAT_ITEMS];
> +       long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
>  };
>
>  unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
>  {
>         struct mem_cgroup_per_node *pn;
> -       long x;
> +       long x = 0;
> +       int i;
>
>         if (mem_cgroup_disabled())
>                 return node_page_state(lruvec_pgdat(lruvec), idx);
>
> -       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -       x = READ_ONCE(pn->lruvec_stats->state[idx]);
> +       i = memcg_stats_index(idx);
> +       if (i >= 0) {
> +               pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +               x = READ_ONCE(pn->lruvec_stats->state[i]);
> +       }
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -617,12 +687,16 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>  {
>         struct mem_cgroup_per_node *pn;
>         long x = 0;
> +       int i;
>
>         if (mem_cgroup_disabled())
>                 return node_page_state(lruvec_pgdat(lruvec), idx);
>
> -       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -       x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
> +       i = memcg_stats_index(idx);
> +       if (i >= 0) {
> +               pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +               x = READ_ONCE(pn->lruvec_stats->state_local[i]);
> +       }
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -689,11 +763,11 @@ struct memcg_vmstats_percpu {
>         /* The above should fit a single cacheline for memcg_rstat_updated() */
>
>         /* Local (CPU and cgroup) page state & events */
> -       long                    state[MEMCG_NR_STAT];
> +       long                    state[NR_MEMCG_STATS];
>         unsigned long           events[NR_MEMCG_EVENTS];
>
>         /* Delta calculation for lockless upward propagation */
> -       long                    state_prev[MEMCG_NR_STAT];
> +       long                    state_prev[NR_MEMCG_STATS];
>         unsigned long           events_prev[NR_MEMCG_EVENTS];
>
>         /* Cgroup1: threshold notifications & softlimit tree updates */
> @@ -703,15 +777,15 @@ struct memcg_vmstats_percpu {
>
>  struct memcg_vmstats {
>         /* Aggregated (CPU and subtree) page state & events */
> -       long                    state[MEMCG_NR_STAT];
> +       long                    state[NR_MEMCG_STATS];
>         unsigned long           events[NR_MEMCG_EVENTS];
>
>         /* Non-hierarchical (CPU aggregated) page state & events */
> -       long                    state_local[MEMCG_NR_STAT];
> +       long                    state_local[NR_MEMCG_STATS];
>         unsigned long           events_local[NR_MEMCG_EVENTS];
>
>         /* Pending child counts during tree propagation */
> -       long                    state_pending[MEMCG_NR_STAT];
> +       long                    state_pending[NR_MEMCG_STATS];
>         unsigned long           events_pending[NR_MEMCG_EVENTS];
>
>         /* Stats updates since the last flush */
> @@ -844,7 +918,13 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
>
>  unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
>  {
> -       long x = READ_ONCE(memcg->vmstats->state[idx]);
> +       long x;
> +       int i = memcg_stats_index(idx);
> +
> +       if (i < 0)
> +               return 0;
> +
> +       x = READ_ONCE(memcg->vmstats->state[i]);
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -876,18 +956,25 @@ static int memcg_state_val_in_pages(int idx, int val)
>   */
>  void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
>  {
> -       if (mem_cgroup_disabled())
> +       int i = memcg_stats_index(idx);
> +
> +       if (mem_cgroup_disabled() || i < 0)
>                 return;
>
> -       __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
> +       __this_cpu_add(memcg->vmstats_percpu->state[i], val);
>         memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
>  }
>
>  /* idx can be of type enum memcg_stat_item or node_stat_item. */
>  static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
>  {
> -       long x = READ_ONCE(memcg->vmstats->state_local[idx]);
> +       long x;
> +       int i = memcg_stats_index(idx);
> +
> +       if (i < 0)
> +               return 0;
>
> +       x = READ_ONCE(memcg->vmstats->state_local[i]);
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -901,6 +988,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
>  {
>         struct mem_cgroup_per_node *pn;
>         struct mem_cgroup *memcg;
> +       int i = memcg_stats_index(idx);
> +
> +       if (i < 0)
> +               return;
>
>         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
>         memcg = pn->memcg;
> @@ -930,10 +1021,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
>         }
>
>         /* Update memcg */
> -       __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
> +       __this_cpu_add(memcg->vmstats_percpu->state[i], val);
>
>         /* Update lruvec */
> -       __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
> +       __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
>
>         memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
>         memcg_stats_unlock();
> @@ -5702,6 +5793,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>                 page_counter_init(&memcg->kmem, &parent->kmem);
>                 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
>         } else {
> +               init_memcg_stats();
>                 init_memcg_events();
>                 page_counter_init(&memcg->memory, NULL);
>                 page_counter_init(&memcg->swap, NULL);
> @@ -5873,7 +5965,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>
>         statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
>
> -       for (i = 0; i < MEMCG_NR_STAT; i++) {
> +       for (i = 0; i < NR_MEMCG_STATS; i++) {
>                 /*
>                  * Collect the aggregated propagation counts of groups
>                  * below us. We're in a per-cpu loop here and this is
> @@ -5937,7 +6029,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>
>                 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
>
> -               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
> +               for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) {
>                         delta = lstats->state_pending[i];
>                         if (delta)
>                                 lstats->state_pending[i] = 0;
> --
> 2.43.0
>
