linux-kernel - Re: [PATCH] memcg: rework inactive

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20120216103842.0c3e9258.kamezawa.hiroyu@jp.fujitsu.com>
Date:	Thu, 16 Feb 2012 10:38:42 +0900
From:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To:	Konstantin Khlebnikov <khlebnikov@...nvz.org>
Cc:	linux-mm@...ck.org, Andrew Morton <akpm@...ux-foundation.org>,
	Johannes Weiner <hannes@...xchg.org>, cgroups@...r.kernel.org,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH] memcg: rework inactive_ratio logic

On Wed, 15 Feb 2012 20:24:42 +0400
Konstantin Khlebnikov <khlebnikov@...nvz.org> wrote:

> This patch adds mem_cgroup->inactive_ratio calculated from hierarchical memory limit.
> It updated at each limit change before shrinking cgroup to this new limit.
> Ratios for all child cgroups are updated too, because parent limit can affect them.
> Update precedure can be greatly optimized if its performance becomes the problem.
> Inactive ratio for unlimited or huge limit does not matter, because we'll never hit it.
> 
> At global reclaim always use global ratio from zone->inactive_ratio.
> At mem-cgroup reclaim use inactive_ratio from target memory cgroup,
> this is cgroup which hit its limit and cause this reclaimer invocation.
> 
> Thus, global memory reclaimer will try to keep ratio for all lru lists in zone
> above one mark, this guarantee that total ratio in this zone will be above too.
> Meanwhile mem-cgroup will do the same thing for its lru lists in all zones, and
> for all lru lists in all sub-cgroups in hierarchy.
> 
> Also this patch removes some redundant code.
> 
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@...nvz.org>

Hmm, the main purpose of this patch is to remove calculation per get_scan_ratio() ?




> ---
>  include/linux/memcontrol.h |   16 ++------
>  mm/memcontrol.c            |   85 ++++++++++++++++++++++++--------------------
>  mm/vmscan.c                |   82 +++++++++++++++++++++++-------------------
>  3 files changed, 93 insertions(+), 90 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 4d34356..453a3dd 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -113,10 +113,7 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
>  /*
>   * For memory reclaim.
>   */
> -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg,
> -				    struct zone *zone);
> -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg,
> -				    struct zone *zone);
> +unsigned int mem_cgroup_inactive_ratio(struct mem_cgroup *memcg);
>  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
>  unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
>  					int nid, int zid, unsigned int lrumask);
> @@ -319,16 +316,9 @@ static inline bool mem_cgroup_disabled(void)
>  	return true;
>  }
>  
> -static inline int
> -mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
> -{
> -	return 1;
> -}
> -
> -static inline int
> -mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
> +static inline unsigned int mem_cgroup_inactive_ratio(struct mem_cgroup *memcg)
>  {
> -	return 1;
> +	return 0;
>  }
>  
>  static inline unsigned long
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 6728a7a..343324a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -212,6 +212,8 @@ struct mem_cgroup_eventfd_list {
>  
>  static void mem_cgroup_threshold(struct mem_cgroup *memcg);
>  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
> +static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
> +		unsigned long long *mem_limit, unsigned long long *memsw_limit);
>  
>  /*
>   * The memory controller data structure. The memory controller controls both
> @@ -256,6 +258,10 @@ struct mem_cgroup {
>  	atomic_t	refcnt;
>  
>  	int	swappiness;
> +
> +	/* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages */
> +	unsigned int inactive_ratio;
> +
>  	/* OOM-Killer disable */
>  	int		oom_kill_disable;
>  
> @@ -1155,44 +1161,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
>  	return ret;
>  }
>  
> -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
> -{
> -	unsigned long inactive_ratio;
> -	int nid = zone_to_nid(zone);
> -	int zid = zone_idx(zone);
> -	unsigned long inactive;
> -	unsigned long active;
> -	unsigned long gb;
> -
> -	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
> -						BIT(LRU_INACTIVE_ANON));
> -	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
> -					      BIT(LRU_ACTIVE_ANON));
> -
> -	gb = (inactive + active) >> (30 - PAGE_SHIFT);
> -	if (gb)
> -		inactive_ratio = int_sqrt(10 * gb);
> -	else
> -		inactive_ratio = 1;
> -
> -	return inactive * inactive_ratio < active;
> -}
> -
> -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
> -{
> -	unsigned long active;
> -	unsigned long inactive;
> -	int zid = zone_idx(zone);
> -	int nid = zone_to_nid(zone);
> -
> -	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
> -						BIT(LRU_INACTIVE_FILE));
> -	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
> -					      BIT(LRU_ACTIVE_FILE));
> -
> -	return (active > inactive);
> -}
> -
>  struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
>  						      struct zone *zone)
>  {
> @@ -3373,6 +3341,32 @@ void mem_cgroup_print_bad_page(struct page *page)
>  
>  static DEFINE_MUTEX(set_limit_mutex);
>  
> +/*
> + * Update inactive_ratio accoring to new memory limit
> + */
> +static void mem_cgroup_update_inactive_ratio(struct mem_cgroup *memcg,
> +					     unsigned long long target)
> +{
> +	unsigned long long mem_limit, memsw_limit, gb;
> +	struct mem_cgroup *iter;
> +
> +	for_each_mem_cgroup_tree(iter, memcg) {
> +		memcg_get_hierarchical_limit(iter, &mem_limit, &memsw_limit);
> +		mem_limit = min(mem_limit, target);
> +
> +		gb = mem_limit >> 30;
> +		if (gb && 10 * gb < INT_MAX)
> +			iter->inactive_ratio = int_sqrt(10 * gb);
> +		else
> +			iter->inactive_ratio = 1;
> +	}
> +}
> +
> +unsigned int mem_cgroup_inactive_ratio(struct mem_cgroup *memcg)
> +{
> +	return memcg->inactive_ratio;
> +}
> +
>  static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>  				unsigned long long val)
>  {
> @@ -3422,6 +3416,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>  			else
>  				memcg->memsw_is_minimum = false;
>  		}
> +		mem_cgroup_update_inactive_ratio(memcg, val);
>  		mutex_unlock(&set_limit_mutex);
>  
>  		if (!ret)
> @@ -3439,6 +3434,12 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>  	if (!ret && enlarge)
>  		memcg_oom_recover(memcg);
>  
> +	if (ret) {
> +		mutex_lock(&set_limit_mutex);
> +		mem_cgroup_update_inactive_ratio(memcg, RESOURCE_MAX);
> +		mutex_unlock(&set_limit_mutex);
> +	}

Why RESOUECE_MAX ?

Thanks,
-Kame

> +
>  	return ret;
>  }
>  
> @@ -4155,6 +4156,8 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
>  	}
>  
>  #ifdef CONFIG_DEBUG_VM
> +	cb->fill(cb, "inactive_ratio", mem_cont->inactive_ratio);
> +
>  	{
>  		int nid, zid;
>  		struct mem_cgroup_per_zone *mz;
> @@ -4936,8 +4939,12 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
>  	memcg->last_scanned_node = MAX_NUMNODES;
>  	INIT_LIST_HEAD(&memcg->oom_notify);
>  
> -	if (parent)
> +	if (parent) {
>  		memcg->swappiness = mem_cgroup_swappiness(parent);
> +		memcg->inactive_ratio = parent->inactive_ratio;
> +	} else
> +		memcg->inactive_ratio = 1;
> +
>  	atomic_set(&memcg->refcnt, 1);
>  	memcg->move_charge_at_immigrate = 0;
>  	mutex_init(&memcg->thresholds_lock);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 4061e91..531abcc 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1788,19 +1788,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
>  }
>  
>  #ifdef CONFIG_SWAP
> -static int inactive_anon_is_low_global(struct zone *zone)
> -{
> -	unsigned long active, inactive;
> -
> -	active = zone_page_state(zone, NR_ACTIVE_ANON);
> -	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
> -
> -	if (inactive * zone->inactive_ratio < active)
> -		return 1;
> -
> -	return 0;
> -}
> -
>  /**
>   * inactive_anon_is_low - check if anonymous pages need to be deactivated
>   * @zone: zone to check
> @@ -1809,8 +1796,12 @@ static int inactive_anon_is_low_global(struct zone *zone)
>   * Returns true if the zone does not have enough inactive anon pages,
>   * meaning some active anon pages need to be deactivated.
>   */
> -static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
> +static int inactive_anon_is_low(struct mem_cgroup_zone *mz,
> +				struct scan_control *sc)
>  {
> +	unsigned long active, inactive;
> +	unsigned int ratio;
> +
>  	/*
>  	 * If we don't have swap space, anonymous page deactivation
>  	 * is pointless.
> @@ -1818,29 +1809,33 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
>  	if (!total_swap_pages)
>  		return 0;
>  
> -	if (!scanning_global_lru(mz))
> -		return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
> -						       mz->zone);
> +	if (global_reclaim(sc))
> +		ratio = mz->zone->inactive_ratio;
> +	else
> +		ratio = mem_cgroup_inactive_ratio(sc->target_mem_cgroup);
>  
> -	return inactive_anon_is_low_global(mz->zone);
> +	if (scanning_global_lru(mz)) {
> +		active = zone_page_state(mz->zone, NR_ACTIVE_ANON);
> +		inactive = zone_page_state(mz->zone, NR_INACTIVE_ANON);
> +	} else {
> +		active = mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
> +				zone_to_nid(mz->zone), zone_idx(mz->zone),
> +				BIT(LRU_ACTIVE_ANON));
> +		inactive = mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
> +				zone_to_nid(mz->zone), zone_idx(mz->zone),
> +				BIT(LRU_INACTIVE_ANON));
> +	}
> +
> +	return inactive * ratio < active;
>  }
>  #else
> -static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
> +static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz,
> +				       struct scan_control *sc)
>  {
>  	return 0;
>  }
>  #endif
>  
> -static int inactive_file_is_low_global(struct zone *zone)
> -{
> -	unsigned long active, inactive;
> -
> -	active = zone_page_state(zone, NR_ACTIVE_FILE);
> -	inactive = zone_page_state(zone, NR_INACTIVE_FILE);
> -
> -	return (active > inactive);
> -}
> -
>  /**
>   * inactive_file_is_low - check if file pages need to be deactivated
>   * @mz: memory cgroup and zone to check
> @@ -1857,19 +1852,30 @@ static int inactive_file_is_low_global(struct zone *zone)
>   */
>  static int inactive_file_is_low(struct mem_cgroup_zone *mz)
>  {
> -	if (!scanning_global_lru(mz))
> -		return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
> -						       mz->zone);
> +	unsigned long active, inactive;
> +
> +	if (scanning_global_lru(mz)) {
> +		active = zone_page_state(mz->zone, NR_ACTIVE_FILE);
> +		inactive = zone_page_state(mz->zone, NR_INACTIVE_FILE);
> +	} else {
> +		active = mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
> +				zone_to_nid(mz->zone), zone_idx(mz->zone),
> +				BIT(LRU_ACTIVE_FILE));
> +		inactive = mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
> +				zone_to_nid(mz->zone), zone_idx(mz->zone),
> +				BIT(LRU_INACTIVE_FILE));
> +	}
>  
> -	return inactive_file_is_low_global(mz->zone);
> +	return inactive < active;
>  }
>  
> -static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
> +static int inactive_list_is_low(struct mem_cgroup_zone *mz,
> +				struct scan_control *sc, int file)
>  {
>  	if (file)
>  		return inactive_file_is_low(mz);
>  	else
> -		return inactive_anon_is_low(mz);
> +		return inactive_anon_is_low(mz, sc);
>  }
>  
>  static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
> @@ -1879,7 +1885,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
>  	int file = is_file_lru(lru);
>  
>  	if (is_active_lru(lru)) {
> -		if (inactive_list_is_low(mz, file))
> +		if (inactive_list_is_low(mz, sc, file))
>  			shrink_active_list(nr_to_scan, mz, sc, priority, file);
>  		return 0;
>  	}
> @@ -2129,7 +2135,7 @@ restart:
>  	 * Even if we did not try to evict anon pages at all, we want to
>  	 * rebalance the anon lru active/inactive ratio.
>  	 */
> -	if (inactive_anon_is_low(mz))
> +	if (inactive_anon_is_low(mz, sc))
>  		shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
>  
>  	/* reclaim/compaction might need reclaim to continue */
> @@ -2558,7 +2564,7 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc,
>  			.zone = zone,
>  		};
>  
> -		if (inactive_anon_is_low(&mz))
> +		if (inactive_anon_is_low(&mz, sc))
>  			shrink_active_list(SWAP_CLUSTER_MAX, &mz,
>  					   sc, priority, 0);
>  
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@...ck.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
> Don't email: <a href=mailto:"dont@...ck.org"> email@...ck.org </a>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/