Message-Id: <20090306185124.51a52519.kamezawa.hiroyu@jp.fujitsu.com>
Date:	Fri, 6 Mar 2009 18:51:24 +0900
From:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To:	Balbir Singh <balbir@...ux.vnet.ibm.com>
Cc:	linux-mm@...ck.org, Sudhir Kumar <skumar@...ux.vnet.ibm.com>,
	YAMAMOTO Takashi <yamamoto@...inux.co.jp>,
	Bharata B Rao <bharata@...ibm.com>,
	Paul Menage <menage@...gle.com>, lizf@...fujitsu.com,
	linux-kernel@...r.kernel.org,
	KOSAKI Motohiro <kosaki.motohiro@...fujitsu.com>,
	David Rientjes <rientjes@...gle.com>,
	Pavel Emelianov <xemul@...nvz.org>,
	Dhaval Giani <dhaval@...ux.vnet.ibm.com>,
	Rik van Riel <riel@...hat.com>,
	Andrew Morton <akpm@...ux-foundation.org>
Subject: Re: [PATCH 4/4] Memory controller soft limit reclaim on contention
 (v4)

On Fri, 06 Mar 2009 14:53:53 +0530
Balbir Singh <balbir@...ux.vnet.ibm.com> wrote:

> ---
> 
>  include/linux/memcontrol.h |    9 ++
>  include/linux/swap.h       |    5 +
>  mm/memcontrol.c            |  223 +++++++++++++++++++++++++++++++++++++++++---
>  mm/vmscan.c                |   26 +++++
>  4 files changed, 245 insertions(+), 18 deletions(-)
> 
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 18146c9..16343d0 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -116,6 +116,9 @@ static inline bool mem_cgroup_disabled(void)
>  }
>  
>  extern bool mem_cgroup_oom_called(struct task_struct *task);
> +unsigned long
> +mem_cgroup_soft_limit_reclaim(int priority, struct zone *zone, int nid,
> +				gfp_t gfp_mask);
>  
>  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>  struct mem_cgroup;
> @@ -264,6 +267,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
>  {
>  }
>  
> +static inline unsigned long
> +mem_cgroup_soft_limit_reclaim(int priority, struct zone *zone, int nid,
> +				gfp_t gfp_mask)
> +{
> +	return 0;
> +}
>  #endif /* CONFIG_CGROUP_MEM_CONT */
>  
>  #endif /* _LINUX_MEMCONTROL_H */
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 989eb53..37bc2a9 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -217,6 +217,11 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
>  						  gfp_t gfp_mask, bool noswap,
>  						  unsigned int swappiness);
> +extern unsigned long mem_cgroup_shrink_zone(struct mem_cgroup *mem,
> +						struct zone *zone,
> +						gfp_t gfp_mask,
> +						unsigned int swappiness,
> +						int priority);
>  extern int __isolate_lru_page(struct page *page, int mode, int file);
>  extern unsigned long shrink_all_memory(unsigned long nr_pages);
>  extern int vm_swappiness;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d548dd2..3be1f27 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -20,6 +20,7 @@
>  #include <linux/res_counter.h>
>  #include <linux/memcontrol.h>
>  #include <linux/cgroup.h>
> +#include <linux/completion.h>
>  #include <linux/mm.h>
>  #include <linux/pagemap.h>
>  #include <linux/smp.h>
> @@ -191,6 +192,14 @@ struct mem_cgroup {
>  	unsigned long last_tree_update;		/* Last time the tree was */
>  						/* updated in jiffies     */
>  
> +	bool on_tree;				/* Is the node on tree? */
> +	struct completion wait_on_soft_reclaim;
> +	/*
> +	 * Set to > 0 when reclaim is initiated due to
> +	 * the soft limit being exceeded. It adds an additional atomic
> +	 * operation to page fault path.
> +	 */
> +	int soft_limit_reclaim_count;
>  	/*
>  	 * statistics. This must be placed at the end of memcg.
>  	 */
> @@ -227,18 +236,29 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
>  #define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
>  #define MEMFILE_ATTR(val)	((val) & 0xffff)
>  
> +/*
> + * Bits used for hierarchical reclaim
> + */
> +#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
> +#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> +#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
> +#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> +#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
> +#define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
> +
>  static void mem_cgroup_get(struct mem_cgroup *mem);
>  static void mem_cgroup_put(struct mem_cgroup *mem);
>  static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
>  
> -static void mem_cgroup_insert_exceeded(struct mem_cgroup *mem)
> +static void __mem_cgroup_insert_exceeded(struct mem_cgroup *mem)
>  {
>  	struct rb_node **p = &mem_cgroup_soft_limit_tree.rb_node;
>  	struct rb_node *parent = NULL;
>  	struct mem_cgroup *mem_node;
> -	unsigned long flags;
>  
> -	spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
> +	if (mem->on_tree)
> +		return;
> +
>  	while (*p) {
>  		parent = *p;
>  		mem_node = rb_entry(parent, struct mem_cgroup, mem_cgroup_node);
> @@ -255,6 +275,23 @@ static void mem_cgroup_insert_exceeded(struct mem_cgroup *mem)
>  	rb_insert_color(&mem->mem_cgroup_node,
>  			&mem_cgroup_soft_limit_tree);
>  	mem->last_tree_update = jiffies;
> +	mem->on_tree = true;
> +}
> +
> +static void __mem_cgroup_remove_exceeded(struct mem_cgroup *mem)
> +{
> +	if (!mem->on_tree)
> +		return;
> +	rb_erase(&mem->mem_cgroup_node, &mem_cgroup_soft_limit_tree);
> +	mem->on_tree = false;
> +}
> +
> +static void mem_cgroup_insert_exceeded(struct mem_cgroup *mem)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
> +	__mem_cgroup_insert_exceeded(mem);
>  	spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
>  }
>  
> @@ -262,8 +299,34 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup *mem)
>  {
>  	unsigned long flags;
>  	spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
> -	rb_erase(&mem->mem_cgroup_node, &mem_cgroup_soft_limit_tree);
> +	__mem_cgroup_remove_exceeded(mem);
> +	spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
> +}
> +
> +static struct mem_cgroup *mem_cgroup_get_largest_soft_limit_exceeding_node(void)
> +{
> +	struct rb_node *rightmost = NULL;
> +	struct mem_cgroup *mem = NULL;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
> +retry:
> +	rightmost = rb_last(&mem_cgroup_soft_limit_tree);
> +	if (!rightmost)
> +		goto done;		/* Nothing to reclaim from */
> +
> +	mem = rb_entry(rightmost, struct mem_cgroup, mem_cgroup_node);
> +	/*
> +	 * Remove the node now, but someone else can add it back;
> +	 * we will add it back at the end of reclaim to its correct
> +	 * position in the tree.
> +	 */
> +	__mem_cgroup_remove_exceeded(mem);
> +	if (!css_tryget(&mem->css) || !res_counter_soft_limit_excess(&mem->res))
> +		goto retry;
> +done:
>  	spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
> +	return mem;
>  }
>  
>  static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
> @@ -324,6 +387,27 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
>  	return total;
>  }
>  
> +static unsigned long long
> +mem_cgroup_get_node_zone_usage(struct mem_cgroup *mem, struct zone *zone,
> +				int nid)
> +{
> +	int l;
> +	unsigned long long total = 0;
> +	struct mem_cgroup_per_zone *mz;
> +	unsigned long flags;
> +
> +	/*
> +	 * Is holding the zone LRU lock here overly protective?
> +	 * This routine is not invoked from the hot path anyway.
> +	 */
> +	spin_lock_irqsave(&zone->lru_lock, flags);
> +	mz = mem_cgroup_zoneinfo(mem, nid, zone_idx(zone));
> +	for_each_evictable_lru(l)
> +		total += MEM_CGROUP_ZSTAT(mz, l);
> +	spin_unlock_irqrestore(&zone->lru_lock, flags);
> +	return total * PAGE_SIZE;
> +}
> +
>  static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
>  {
>  	return container_of(cgroup_subsys_state(cont,
> @@ -888,14 +972,30 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
>   * If shrink==true, to avoid freeing too much, this returns immediately.
>   */
>  static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
> -				   gfp_t gfp_mask, bool noswap, bool shrink)
> +						struct zone *zone,
> +						gfp_t gfp_mask,
> +						unsigned long flags,
> +						int priority)
>  {
>  	struct mem_cgroup *victim;
>  	int ret, total = 0;
>  	int loop = 0;
> +	bool noswap = flags & MEM_CGROUP_RECLAIM_NOSWAP;
> +	bool shrink = flags & MEM_CGROUP_RECLAIM_SHRINK;
> +	bool check_soft = flags & MEM_CGROUP_RECLAIM_SOFT;
>  
>  	while (loop < 2) {
>  		victim = mem_cgroup_select_victim(root_mem);
> +		/*
> +		 * In the first loop, don't reclaim from victims below
> +		 * their soft limit
> +		 */
> +		if (!loop && res_counter_check_under_soft_limit(&victim->res)) {
> +			if (victim == root_mem)
> +				loop++;
> +			css_put(&victim->css);
> +			continue;
> +		}
>  		if (victim == root_mem)
>  			loop++;
>  		if (!mem_cgroup_local_usage(&victim->stat)) {
> @@ -904,8 +1004,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
>  			continue;
>  		}
>  		/* we use swappiness of local cgroup */
> -		ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
> -						   get_swappiness(victim));
> +		if (!check_soft)
> +			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
> +							noswap,
> +							get_swappiness(victim));
> +		else
> +			ret = mem_cgroup_shrink_zone(victim, zone, gfp_mask,
> +							get_swappiness(victim),
> +							priority);
>  		css_put(&victim->css);
>  		/*
>  		 * At shrinking usage, we can't check we should stop here or
> @@ -915,7 +1021,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
>  		if (shrink)
>  			return ret;
>  		total += ret;
> -		if (mem_cgroup_check_under_limit(root_mem))
> +		if (check_soft) {
> +			if (res_counter_check_under_soft_limit(&root_mem->res))
> +				return total;
> +		} else if (mem_cgroup_check_under_limit(root_mem))
>  			return 1 + total;
>  	}
>  	return total;
> @@ -1025,7 +1134,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
>  
>  	while (1) {
>  		int ret;
> -		bool noswap = false;
> +		unsigned long flags = 0;
>  
>  		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
>  						&soft_fail_res);
> @@ -1038,7 +1147,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
>  				break;
>  			/* mem+swap counter fails */
>  			res_counter_uncharge(&mem->res, PAGE_SIZE);
> -			noswap = true;
> +			flags = MEM_CGROUP_RECLAIM_NOSWAP;
>  			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
>  									memsw);
>  		} else
> @@ -1049,8 +1158,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
>  		if (!(gfp_mask & __GFP_WAIT))
>  			goto nomem;
>  
> -		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
> -							noswap, false);
> +		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
> +							gfp_mask, flags, 0);
>  		if (ret)
>  			continue;
>  
> @@ -1082,9 +1191,29 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
>  	 * soft limit
>  	 */
>  	if (soft_fail_res) {
> +		/*
> +		 * Throttle the task here, if it is undergoing soft limit
> +		 * reclaim and failing soft limits
> +		 */
> +		unsigned long flags;
> +		bool wait = false;
> +
> +		spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
> +		if (mem->soft_limit_reclaim_count) {
> +			INIT_COMPLETION(mem->wait_on_soft_reclaim);
> +			wait = true;
> +		}
> +		spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
>  		mem_over_soft_limit =
>  			mem_cgroup_from_res_counter(soft_fail_res, res);
>  		mem_cgroup_check_and_update_tree(mem_over_soft_limit, true);
> +		/*
> +		 * We hold the mmap_sem and throttle; I don't think there
> +		 * should be corner cases, but this part could use more
> +		 * review.
> +		 */
> +		if (wait)
> +			wait_for_completion(&mem->wait_on_soft_reclaim);
>  	}
What ???? Why do we have to wait here...holding mmap_sem...This is too bad.
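
To spell out the worry (the call chain below is my sketch; do_page_fault()
and handle_mm_fault() are from the current fault path, and I'm assuming
complete() is only issued from the soft limit reclaim side of this series):

	/*
	 * do_page_fault()
	 *   down_read(&mm->mmap_sem)
	 *   handle_mm_fault()
	 *     ... charge path ...
	 *       __mem_cgroup_try_charge()
	 *         wait_for_completion(&mem->wait_on_soft_reclaim)
	 *            <- sleeps with mmap_sem held for read
	 *
	 * Every down_write(&mm->mmap_sem) (mmap, munmap, brk, ...) in this
	 * mm now blocks behind the sleeping reader, and readers arriving
	 * after the writer queue up behind it as well. One throttled page
	 * fault can stall the whole process until reclaim signals the
	 * completion, and if reclaim makes no progress the stall is
	 * unbounded.
	 */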



>  	return 0;
>  nomem:
> @@ -1695,8 +1824,8 @@ int mem_cgroup_shrink_usage(struct page *page,
>  		return 0;
>  
>  	do {
> -		progress = mem_cgroup_hierarchical_reclaim(mem,
> -					gfp_mask, true, false);
> +		progress = mem_cgroup_hierarchical_reclaim(mem, NULL,
> +					gfp_mask, MEM_CGROUP_RECLAIM_NOSWAP, 0);
>  		progress += mem_cgroup_check_under_limit(mem);
>  	} while (!progress && --retry);
>  
> @@ -1750,8 +1879,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>  		if (!ret)
>  			break;
>  
> -		progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
> -						   false, true);
> +		progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
> +						GFP_KERNEL,
> +						MEM_CGROUP_RECLAIM_SHRINK, 0);
>  		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
>  		/* Usage is reduced ? */
>  		if (curusage >= oldusage)
> @@ -1799,7 +1929,9 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
>  		if (!ret)
>  			break;
>  
> -		mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
> +		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
> +						MEM_CGROUP_RECLAIM_NOSWAP |
> +						MEM_CGROUP_RECLAIM_SHRINK, 0);
>  		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
>  		/* Usage is reduced ? */
>  		if (curusage >= oldusage)
> @@ -1810,6 +1942,59 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
>  	return ret;
>  }
>  
> +unsigned long
> +mem_cgroup_soft_limit_reclaim(int priority, struct zone *zone, int nid,
> +				gfp_t gfp_mask)
> +{
> +	unsigned long nr_reclaimed = 0;
> +	struct mem_cgroup *mem;
> +	unsigned long flags;
> +	unsigned long long usage;
> +
> +	/*
> +	 * This loop can run for a while, especially if mem_cgroups
> +	 * continuously keep exceeding their soft limit and putting the
> +	 * system under pressure.
> +	 */
> +	do {
> +		mem = mem_cgroup_get_largest_soft_limit_exceeding_node();
> +		if (!mem)
> +			break;
> +		usage = mem_cgroup_get_node_zone_usage(mem, zone, nid);
> +		if (!usage)
> +			goto skip_reclaim;

Why does this work well? If "mem" is the largest, it will be inserted
again as the largest. Am I missing anything?
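
To make the concern concrete, here is my reading of the rest of the loop
(the skip_reclaim label is from your patch; the re-insert at the bottom is
not quoted above, but your comment says the node is added back at the end
of reclaim, so I'm inferring the flow):

	/*
	 * do {
	 *     mem = mem_cgroup_get_largest_soft_limit_exceeding_node();
	 *              <- rb_last(): rightmost node == largest excess;
	 *                 the node is removed from the tree here
	 *     usage = mem_cgroup_get_node_zone_usage(mem, zone, nid);
	 *     if (!usage)
	 *         goto skip_reclaim;  <- nothing reclaimed, excess unchanged
	 *     ...reclaim...
	 * skip_reclaim:
	 *     re-insert mem          <- goes back as the rightmost node
	 * } while (...);
	 *
	 * If the memcg with the largest excess has no pages on this
	 * node/zone, every iteration pops that same memcg, skips it and
	 * re-inserts it at the same position, so the loop never gets to
	 * the smaller memcgs that do have reclaimable pages here.
	 */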

Thanks,
-Kame

