linux-kernel - Re: [PATCH RFC 1/2] memcg: use percpu

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <5412481C.2020101@jp.fujitsu.com>
Date:	Fri, 12 Sep 2014 10:10:52 +0900
From:	Kamezawa Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To:	Vladimir Davydov <vdavydov@...allels.com>,
	<linux-kernel@...r.kernel.org>
CC:	Johannes Weiner <hannes@...xchg.org>,
	Michal Hocko <mhocko@...e.cz>,
	Greg Thelen <gthelen@...gle.com>,
	Hugh Dickins <hughd@...gle.com>,
	Motohiro Kosaki <Motohiro.Kosaki@...fujitsu.com>,
	Glauber Costa <glommer@...il.com>, Tejun Heo <tj@...nel.org>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Pavel Emelianov <xemul@...allels.com>,
	Konstantin Khorenko <khorenko@...allels.com>,
	<linux-mm@...ck.org>, <cgroups@...r.kernel.org>
Subject: Re: [PATCH RFC 1/2] memcg: use percpu_counter for statistics

(2014/09/12 0:41), Vladimir Davydov wrote:
> In the next patch I need a quick way to get a value of
> MEM_CGROUP_STAT_RSS. The current procedure (mem_cgroup_read_stat) is
> slow (iterates over all cpus) and may sleep (uses get/put_online_cpus),
> so it's a no-go.
> 
> This patch converts memory cgroup statistics to use percpu_counter so
> that percpu_counter_read will do the trick.
> 
> Signed-off-by: Vladimir Davydov <vdavydov@...allels.com>


I have no strong objections but you need performance comparison to go with this.

I thought percpu counter was messy to be used for "array".
I can't understand why you started from fixing future performance problem before
merging new feature.

Thanks,
-Kame


> ---
>   mm/memcontrol.c |  217 ++++++++++++++++++-------------------------------------
>   1 file changed, 69 insertions(+), 148 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 085dc6d2f876..7e8d65e0608a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -136,9 +136,7 @@ enum mem_cgroup_events_target {
>   #define SOFTLIMIT_EVENTS_TARGET 1024
>   #define NUMAINFO_EVENTS_TARGET	1024
>   
> -struct mem_cgroup_stat_cpu {
> -	long count[MEM_CGROUP_STAT_NSTATS];
> -	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
> +struct mem_cgroup_ratelimit_state {
>   	unsigned long nr_page_events;
>   	unsigned long targets[MEM_CGROUP_NTARGETS];
>   };
> @@ -341,16 +339,10 @@ struct mem_cgroup {
>   	atomic_t	moving_account;
>   	/* taken only while moving_account > 0 */
>   	spinlock_t	move_lock;
> -	/*
> -	 * percpu counter.
> -	 */
> -	struct mem_cgroup_stat_cpu __percpu *stat;
> -	/*
> -	 * used when a cpu is offlined or other synchronizations
> -	 * See mem_cgroup_read_stat().
> -	 */
> -	struct mem_cgroup_stat_cpu nocpu_base;
> -	spinlock_t pcp_counter_lock;
> +
> +	struct percpu_counter stat[MEM_CGROUP_STAT_NSTATS];
> +	struct percpu_counter events[MEM_CGROUP_EVENTS_NSTATS];
> +	struct mem_cgroup_ratelimit_state __percpu *ratelimit;
>   
>   	atomic_t	dead_count;
>   #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
> @@ -849,59 +841,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
>   	return mz;
>   }
>   
> -/*
> - * Implementation Note: reading percpu statistics for memcg.
> - *
> - * Both of vmstat[] and percpu_counter has threshold and do periodic
> - * synchronization to implement "quick" read. There are trade-off between
> - * reading cost and precision of value. Then, we may have a chance to implement
> - * a periodic synchronizion of counter in memcg's counter.
> - *
> - * But this _read() function is used for user interface now. The user accounts
> - * memory usage by memory cgroup and he _always_ requires exact value because
> - * he accounts memory. Even if we provide quick-and-fuzzy read, we always
> - * have to visit all online cpus and make sum. So, for now, unnecessary
> - * synchronization is not implemented. (just implemented for cpu hotplug)
> - *
> - * If there are kernel internal actions which can make use of some not-exact
> - * value, and reading all cpu value can be performance bottleneck in some
> - * common workload, threashold and synchonization as vmstat[] should be
> - * implemented.
> - */
>   static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
>   				 enum mem_cgroup_stat_index idx)
>   {
> -	long val = 0;
> -	int cpu;
> -
> -	get_online_cpus();
> -	for_each_online_cpu(cpu)
> -		val += per_cpu(memcg->stat->count[idx], cpu);
> -#ifdef CONFIG_HOTPLUG_CPU
> -	spin_lock(&memcg->pcp_counter_lock);
> -	val += memcg->nocpu_base.count[idx];
> -	spin_unlock(&memcg->pcp_counter_lock);
> -#endif
> -	put_online_cpus();
> -	return val;
> +	return percpu_counter_read(&memcg->stat[idx]);
>   }
>   
>   static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
>   					    enum mem_cgroup_events_index idx)
>   {
> -	unsigned long val = 0;
> -	int cpu;
> -
> -	get_online_cpus();
> -	for_each_online_cpu(cpu)
> -		val += per_cpu(memcg->stat->events[idx], cpu);
> -#ifdef CONFIG_HOTPLUG_CPU
> -	spin_lock(&memcg->pcp_counter_lock);
> -	val += memcg->nocpu_base.events[idx];
> -	spin_unlock(&memcg->pcp_counter_lock);
> -#endif
> -	put_online_cpus();
> -	return val;
> +	return percpu_counter_read(&memcg->events[idx]);
>   }
>   
>   static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
> @@ -913,25 +862,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
>   	 * counted as CACHE even if it's on ANON LRU.
>   	 */
>   	if (PageAnon(page))
> -		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
> +		percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS],
>   				nr_pages);
>   	else
> -		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
> +		percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_CACHE],
>   				nr_pages);
>   
>   	if (PageTransHuge(page))
> -		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
> +		percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE],
>   				nr_pages);
>   
>   	/* pagein of a big page is an event. So, ignore page size */
>   	if (nr_pages > 0)
> -		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
> -	else {
> -		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
> -		nr_pages = -nr_pages; /* for event */
> -	}
> -
> -	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
> +		percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGIN]);
> +	else
> +		percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT]);
>   }
>   
>   unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
> @@ -981,8 +926,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
>   {
>   	unsigned long val, next;
>   
> -	val = __this_cpu_read(memcg->stat->nr_page_events);
> -	next = __this_cpu_read(memcg->stat->targets[target]);
> +	val = __this_cpu_read(memcg->ratelimit->nr_page_events);
> +	next = __this_cpu_read(memcg->ratelimit->targets[target]);
>   	/* from time_after() in jiffies.h */
>   	if ((long)next - (long)val < 0) {
>   		switch (target) {
> @@ -998,7 +943,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
>   		default:
>   			break;
>   		}
> -		__this_cpu_write(memcg->stat->targets[target], next);
> +		__this_cpu_write(memcg->ratelimit->targets[target], next);
>   		return true;
>   	}
>   	return false;
> @@ -1006,10 +951,15 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
>   
>   /*
>    * Check events in order.
> - *
>    */
> -static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
> +static void memcg_check_events(struct mem_cgroup *memcg, struct page *page,
> +			       unsigned long nr_pages)
>   {
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	__this_cpu_add(memcg->ratelimit->nr_page_events, nr_pages);
> +
>   	/* threshold event is triggered in finer grain than soft limit */
>   	if (unlikely(mem_cgroup_event_ratelimit(memcg,
>   						MEM_CGROUP_TARGET_THRESH))) {
> @@ -1030,6 +980,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
>   			atomic_inc(&memcg->numainfo_events);
>   #endif
>   	}
> +	local_irq_restore(flags);
>   }
>   
>   struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
> @@ -1294,10 +1245,10 @@ void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
>   
>   	switch (idx) {
>   	case PGFAULT:
> -		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
> +		percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGFAULT]);
>   		break;
>   	case PGMAJFAULT:
> -		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
> +		percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
>   		break;
>   	default:
>   		BUG();
> @@ -2306,7 +2257,7 @@ void mem_cgroup_update_page_stat(struct page *page,
>   	if (unlikely(!memcg || !PageCgroupUsed(pc)))
>   		return;
>   
> -	this_cpu_add(memcg->stat->count[idx], val);
> +	percpu_counter_add(&memcg->stat[idx], val);
>   }
>   
>   /*
> @@ -2476,37 +2427,12 @@ static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
>   	mutex_unlock(&percpu_charge_mutex);
>   }
>   
> -/*
> - * This function drains percpu counter value from DEAD cpu and
> - * move it to local cpu. Note that this function can be preempted.
> - */
> -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
> -{
> -	int i;
> -
> -	spin_lock(&memcg->pcp_counter_lock);
> -	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
> -		long x = per_cpu(memcg->stat->count[i], cpu);
> -
> -		per_cpu(memcg->stat->count[i], cpu) = 0;
> -		memcg->nocpu_base.count[i] += x;
> -	}
> -	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
> -		unsigned long x = per_cpu(memcg->stat->events[i], cpu);
> -
> -		per_cpu(memcg->stat->events[i], cpu) = 0;
> -		memcg->nocpu_base.events[i] += x;
> -	}
> -	spin_unlock(&memcg->pcp_counter_lock);
> -}
> -
>   static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
>   					unsigned long action,
>   					void *hcpu)
>   {
>   	int cpu = (unsigned long)hcpu;
>   	struct memcg_stock_pcp *stock;
> -	struct mem_cgroup *iter;
>   
>   	if (action == CPU_ONLINE)
>   		return NOTIFY_OK;
> @@ -2514,9 +2440,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
>   	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
>   		return NOTIFY_OK;
>   
> -	for_each_mem_cgroup(iter)
> -		mem_cgroup_drain_pcp_counter(iter, cpu);
> -
>   	stock = &per_cpu(memcg_stock, cpu);
>   	drain_stock(stock);
>   	return NOTIFY_OK;
> @@ -3419,8 +3342,8 @@ void mem_cgroup_split_huge_fixup(struct page *head)
>   		pc->mem_cgroup = memcg;
>   		pc->flags = head_pc->flags;
>   	}
> -	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
> -		       HPAGE_PMD_NR);
> +	percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE],
> +			   HPAGE_PMD_NR);
>   }
>   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>   
> @@ -3475,17 +3398,17 @@ static int mem_cgroup_move_account(struct page *page,
>   	move_lock_mem_cgroup(from, &flags);
>   
>   	if (!PageAnon(page) && page_mapped(page)) {
> -		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
> -			       nr_pages);
> -		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
> -			       nr_pages);
> +		percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_FILE_MAPPED],
> +				   nr_pages);
> +		percpu_counter_add(&to->stat[MEM_CGROUP_STAT_FILE_MAPPED],
> +				   nr_pages);
>   	}
>   
>   	if (PageWriteback(page)) {
> -		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
> -			       nr_pages);
> -		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
> -			       nr_pages);
> +		percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_WRITEBACK],
> +				   nr_pages);
> +		percpu_counter_add(&to->stat[MEM_CGROUP_STAT_WRITEBACK],
> +				   nr_pages);
>   	}
>   
>   	/*
> @@ -3499,12 +3422,10 @@ static int mem_cgroup_move_account(struct page *page,
>   	move_unlock_mem_cgroup(from, &flags);
>   	ret = 0;
>   
> -	local_irq_disable();
>   	mem_cgroup_charge_statistics(to, page, nr_pages);
> -	memcg_check_events(to, page);
> +	memcg_check_events(to, page, nr_pages);
>   	mem_cgroup_charge_statistics(from, page, -nr_pages);
> -	memcg_check_events(from, page);
> -	local_irq_enable();
> +	memcg_check_events(from, page, nr_pages);
>   out_unlock:
>   	unlock_page(page);
>   out:
> @@ -3582,7 +3503,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
>   					 bool charge)
>   {
>   	int val = (charge) ? 1 : -1;
> -	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
> +	percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_SWAP], val);
>   }
>   
>   /**
> @@ -5413,25 +5334,11 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
>   
>   static struct mem_cgroup *mem_cgroup_alloc(void)
>   {
> -	struct mem_cgroup *memcg;
>   	size_t size;
>   
>   	size = sizeof(struct mem_cgroup);
>   	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
> -
> -	memcg = kzalloc(size, GFP_KERNEL);
> -	if (!memcg)
> -		return NULL;
> -
> -	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
> -	if (!memcg->stat)
> -		goto out_free;
> -	spin_lock_init(&memcg->pcp_counter_lock);
> -	return memcg;
> -
> -out_free:
> -	kfree(memcg);
> -	return NULL;
> +	return kzalloc(size, GFP_KERNEL);
>   }
>   
>   /*
> @@ -5448,13 +5355,20 @@ out_free:
>   static void __mem_cgroup_free(struct mem_cgroup *memcg)
>   {
>   	int node;
> +	int i;
>   
>   	mem_cgroup_remove_from_trees(memcg);
>   
>   	for_each_node(node)
>   		free_mem_cgroup_per_zone_info(memcg, node);
>   
> -	free_percpu(memcg->stat);
> +	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
> +		percpu_counter_destroy(&memcg->stat[i]);
> +
> +	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
> +		percpu_counter_destroy(&memcg->events[i]);
> +
> +	free_percpu(memcg->ratelimit);
>   
>   	/*
>   	 * We need to make sure that (at least for now), the jump label
> @@ -5511,11 +5425,24 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>   	struct mem_cgroup *memcg;
>   	long error = -ENOMEM;
>   	int node;
> +	int i;
>   
>   	memcg = mem_cgroup_alloc();
>   	if (!memcg)
>   		return ERR_PTR(error);
>   
> +	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
> +		if (percpu_counter_init(&memcg->stat[i], 0, GFP_KERNEL))
> +			goto free_out;
> +
> +	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
> +		if (percpu_counter_init(&memcg->events[i], 0, GFP_KERNEL))
> +			goto free_out;
> +
> +	memcg->ratelimit = alloc_percpu(struct mem_cgroup_ratelimit_state);
> +	if (!memcg->ratelimit)
> +		goto free_out;
> +
>   	for_each_node(node)
>   		if (alloc_mem_cgroup_per_zone_info(memcg, node))
>   			goto free_out;
> @@ -6507,10 +6434,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
>   		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
>   	}
>   
> -	local_irq_disable();
>   	mem_cgroup_charge_statistics(memcg, page, nr_pages);
> -	memcg_check_events(memcg, page);
> -	local_irq_enable();
> +	memcg_check_events(memcg, page, nr_pages);
>   
>   	if (do_swap_account && PageSwapCache(page)) {
>   		swp_entry_t entry = { .val = page_private(page) };
> @@ -6557,8 +6482,6 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>   			   unsigned long nr_anon, unsigned long nr_file,
>   			   unsigned long nr_huge, struct page *dummy_page)
>   {
> -	unsigned long flags;
> -
>   	if (!mem_cgroup_is_root(memcg)) {
>   		if (nr_mem)
>   			res_counter_uncharge(&memcg->res,
> @@ -6569,14 +6492,12 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>   		memcg_oom_recover(memcg);
>   	}
>   
> -	local_irq_save(flags);
> -	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
> -	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
> -	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
> -	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
> -	__this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
> -	memcg_check_events(memcg, dummy_page);
> -	local_irq_restore(flags);
> +	percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS], nr_anon);
> +	percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_CACHE], nr_file);
> +	percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
> +	percpu_counter_add(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
> +
> +	memcg_check_events(memcg, dummy_page, nr_anon + nr_file);
>   }
>   
>   static void uncharge_list(struct list_head *page_list)
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/