lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Zi_BsznrtoC1N_lq@P9FQF9L96D>
Date: Mon, 29 Apr 2024 08:50:11 -0700
From: Roman Gushchin <roman.gushchin@...ux.dev>
To: Shakeel Butt <shakeel.butt@...ux.dev>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
	Johannes Weiner <hannes@...xchg.org>,
	Michal Hocko <mhocko@...nel.org>,
	Muchun Song <muchun.song@...ux.dev>, linux-mm@...ck.org,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 2/7] memcg: dynamically allocate lruvec_stats

On Fri, Apr 26, 2024 at 05:37:28PM -0700, Shakeel Butt wrote:
> To decouple the dependency of lruvec_stats on NR_VM_NODE_STAT_ITEMS, we
> need to dynamically allocate lruvec_stats in the mem_cgroup_per_node
> structure. Also move the definition of lruvec_stats_percpu and
> lruvec_stats and related functions to the memcontrol.c to facilitate
> later patches. No functional changes in the patch.
> 
> Signed-off-by: Shakeel Butt <shakeel.butt@...ux.dev>
> ---
>  include/linux/memcontrol.h | 62 +++------------------------
>  mm/memcontrol.c            | 87 ++++++++++++++++++++++++++++++++------
>  2 files changed, 81 insertions(+), 68 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 9aba0d0462ca..ab8a6e884375 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -83,6 +83,8 @@ enum mem_cgroup_events_target {
>  
>  struct memcg_vmstats_percpu;
>  struct memcg_vmstats;
> +struct lruvec_stats_percpu;
> +struct lruvec_stats;
>  
>  struct mem_cgroup_reclaim_iter {
>  	struct mem_cgroup *position;
> @@ -90,25 +92,6 @@ struct mem_cgroup_reclaim_iter {
>  	unsigned int generation;
>  };
>  
> -struct lruvec_stats_percpu {
> -	/* Local (CPU and cgroup) state */
> -	long state[NR_VM_NODE_STAT_ITEMS];
> -
> -	/* Delta calculation for lockless upward propagation */
> -	long state_prev[NR_VM_NODE_STAT_ITEMS];
> -};
> -
> -struct lruvec_stats {
> -	/* Aggregated (CPU and subtree) state */
> -	long state[NR_VM_NODE_STAT_ITEMS];
> -
> -	/* Non-hierarchical (CPU aggregated) state */
> -	long state_local[NR_VM_NODE_STAT_ITEMS];
> -
> -	/* Pending child counts during tree propagation */
> -	long state_pending[NR_VM_NODE_STAT_ITEMS];
> -};
> -
>  /*
>   * per-node information in memory controller.
>   */
> @@ -116,7 +99,7 @@ struct mem_cgroup_per_node {
>  	struct lruvec		lruvec;
>  
>  	struct lruvec_stats_percpu __percpu	*lruvec_stats_percpu;
> -	struct lruvec_stats			lruvec_stats;
> +	struct lruvec_stats			*lruvec_stats;
>  
>  	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
>  
> @@ -1037,42 +1020,9 @@ static inline void mod_memcg_page_state(struct page *page,
>  }
>  
>  unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
> -
> -static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
> -					      enum node_stat_item idx)
> -{
> -	struct mem_cgroup_per_node *pn;
> -	long x;
> -
> -	if (mem_cgroup_disabled())
> -		return node_page_state(lruvec_pgdat(lruvec), idx);
> -
> -	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -	x = READ_ONCE(pn->lruvec_stats.state[idx]);
> -#ifdef CONFIG_SMP
> -	if (x < 0)
> -		x = 0;
> -#endif
> -	return x;
> -}
> -
> -static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> -						    enum node_stat_item idx)
> -{
> -	struct mem_cgroup_per_node *pn;
> -	long x = 0;
> -
> -	if (mem_cgroup_disabled())
> -		return node_page_state(lruvec_pgdat(lruvec), idx);
> -
> -	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -	x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
> -#ifdef CONFIG_SMP
> -	if (x < 0)
> -		x = 0;
> -#endif
> -	return x;
> -}
> +unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
> +unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> +				      enum node_stat_item idx);
>  
>  void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
>  void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 53769d06053f..5e337ed6c6bf 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -576,6 +576,60 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
>  	return mz;
>  }
>  
> +struct lruvec_stats_percpu {
> +	/* Local (CPU and cgroup) state */
> +	long state[NR_VM_NODE_STAT_ITEMS];
> +
> +	/* Delta calculation for lockless upward propagation */
> +	long state_prev[NR_VM_NODE_STAT_ITEMS];
> +};
> +
> +struct lruvec_stats {
> +	/* Aggregated (CPU and subtree) state */
> +	long state[NR_VM_NODE_STAT_ITEMS];
> +
> +	/* Non-hierarchical (CPU aggregated) state */
> +	long state_local[NR_VM_NODE_STAT_ITEMS];
> +
> +	/* Pending child counts during tree propagation */
> +	long state_pending[NR_VM_NODE_STAT_ITEMS];
> +};
> +
> +unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
> +{
> +	struct mem_cgroup_per_node *pn;
> +	long x;
> +
> +	if (mem_cgroup_disabled())
> +		return node_page_state(lruvec_pgdat(lruvec), idx);
> +
> +	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +	x = READ_ONCE(pn->lruvec_stats->state[idx]);
> +#ifdef CONFIG_SMP
> +	if (x < 0)
> +		x = 0;
> +#endif
> +	return x;
> +}
> +
> +unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> +				      enum node_stat_item idx)
> +{
> +	struct mem_cgroup_per_node *pn;
> +	long x = 0;
> +
> +	if (mem_cgroup_disabled())
> +		return node_page_state(lruvec_pgdat(lruvec), idx);
> +
> +	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +	x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
> +#ifdef CONFIG_SMP
> +	if (x < 0)
> +		x = 0;
> +#endif

Not directly related to your change, but do we still need it? And if yes,
do we really care about !CONFIG_SMP case enough to justify these #ifdefs?

> +	return x;
> +}
> +
>  /* Subset of vm_event_item to report for memcg event stats */
>  static const unsigned int memcg_vm_event_stat[] = {
>  	PGPGIN,
> @@ -5492,18 +5546,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
>  	if (!pn)
>  		return 1;
>  
> +	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
> +					node);

Why not GFP_KERNEL_ACCOUNT?

> +	if (!pn->lruvec_stats)
> +		goto fail;
> +
>  	pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
>  						   GFP_KERNEL_ACCOUNT);
> -	if (!pn->lruvec_stats_percpu) {
> -		kfree(pn);
> -		return 1;
> -	}
> +	if (!pn->lruvec_stats_percpu)
> +		goto fail;
>  
>  	lruvec_init(&pn->lruvec);
>  	pn->memcg = memcg;
>  
>  	memcg->nodeinfo[node] = pn;
>  	return 0;
> +fail:
> +	kfree(pn->lruvec_stats);
> +	kfree(pn);
> +	return 1;
>  }
>  
>  static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> @@ -5514,6 +5575,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
>  		return;
>  
>  	free_percpu(pn->lruvec_stats_percpu);
> +	kfree(pn->lruvec_stats);
>  	kfree(pn);
>  }
>  
> @@ -5866,18 +5928,19 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  
>  	for_each_node_state(nid, N_MEMORY) {
>  		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
> -		struct mem_cgroup_per_node *ppn = NULL;
> +		struct lruvec_stats *lstats = pn->lruvec_stats;
> +		struct lruvec_stats *plstats = NULL;
>  		struct lruvec_stats_percpu *lstatc;
>  
>  		if (parent)
> -			ppn = parent->nodeinfo[nid];
> +			plstats = parent->nodeinfo[nid]->lruvec_stats;
>  
>  		lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
>  
>  		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
> -			delta = pn->lruvec_stats.state_pending[i];
> +			delta = lstats->state_pending[i];
>  			if (delta)
> -				pn->lruvec_stats.state_pending[i] = 0;
> +				lstats->state_pending[i] = 0;
>  
>  			delta_cpu = 0;
>  			v = READ_ONCE(lstatc->state[i]);
> @@ -5888,12 +5951,12 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  			}
>  
>  			if (delta_cpu)
> -				pn->lruvec_stats.state_local[i] += delta_cpu;
> +				lstats->state_local[i] += delta_cpu;
>  
>  			if (delta) {
> -				pn->lruvec_stats.state[i] += delta;
> -				if (ppn)
> -					ppn->lruvec_stats.state_pending[i] += delta;
> +				lstats->state[i] += delta;
> +				if (plstats)
> +					plstats->state_pending[i] += delta;
>  			}
>  		}
>  	}
> -- 
> 2.43.0
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ