Message-ID: <f4131456fc4b1dd4f5b8d060e0cbef80.squirrel@webmail-b.css.fujitsu.com>
Date:	Wed, 19 Aug 2009 23:18:01 +0900 (JST)
From:	"KAMEZAWA Hiroyuki" <kamezawa.hiroyu@...fujitsu.com>
To:	"Wu Fengguang" <fengguang.wu@...el.com>
Cc:	"KOSAKI Motohiro" <kosaki.motohiro@...fujitsu.com>,
	"Balbir Singh" <balbir@...ux.vnet.ibm.com>,
	"KAMEZAWA Hiroyuki" <kamezawa.hiroyu@...fujitsu.com>,
	"Rik van Riel" <riel@...hat.com>,
	"Johannes Weiner" <hannes@...xchg.org>,
	"Avi Kivity" <avi@...hat.com>,
	"Andrea Arcangeli" <aarcange@...hat.com>,
	"Dike, Jeffrey G" <jeffrey.g.dike@...el.com>,
	"Yu, Wilfred" <wilfred.yu@...el.com>,
	"Kleen, Andi" <andi.kleen@...el.com>,
	"Hugh Dickins" <hugh.dickins@...cali.co.uk>,
	"Andrew Morton" <akpm@...ux-foundation.org>,
	"Christoph Lameter" <cl@...ux-foundation.org>,
	"Mel Gorman" <mel@....ul.ie>,
	"LKML" <linux-kernel@...r.kernel.org>,
	"linux-mm" <linux-mm@...ck.org>,
	"nishimura@....nes.nec.co.jp" <nishimura@....nes.nec.co.jp>,
	"lizf@...fujitsu.com" <lizf@...fujitsu.com>,
	"menage@...gle.com" <menage@...gle.com>
Subject: Re: [RFC] memcg: move definitions to .h and inline some functions

Wu Fengguang wrote:
> On Tue, Aug 18, 2009 at 11:57:52PM +0800, KOSAKI Motohiro wrote:
>>
>> > > This is one of the reasons why we unconditionally deactivate
>> > > the active anon pages, and do background scanning of the
>> > > active anon list when reclaiming page cache pages.
>> > >
>> > > We want to always move some pages to the inactive anon
>> > > list, so it does not get too small.
>> >
>> > Right, the current code tries to pull the inactive list out of its
>> > smallish state as long as there is vmscan activity.
>> >
>> > However, there is a possible (and tricky) hole: mem cgroups
>> > don't do batched vmscan. shrink_zone() may call shrink_list()
>> > with nr_to_scan=1, in which case shrink_list() _still_ calls
>> > isolate_pages() with the much larger SWAP_CLUSTER_MAX.
>> >
>> > It effectively scales up the inactive list scan rate by 10 times while
>> > the list is still small, and may thus prevent it from ever growing.
>> >
>> > In that case, LRU becomes FIFO.
>> >
>> > Jeff, can you confirm if the mem cgroup's inactive list is small?
>> > If so, this patch should help.
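
A minimal sketch of the batching mismatch described above, assuming the
2.6.31-era call path (a hypothetical simplification, not the actual
mm/vmscan.c; the function name is made up for illustration):

	#include <stdio.h>

	#define SWAP_CLUSTER_MAX 32	/* isolate_pages() batch size */

	/*
	 * Sketch of shrink_list(): pages are isolated in batches of
	 * SWAP_CLUSTER_MAX no matter how small nr_to_scan is.
	 */
	static unsigned long scanned_by_shrink_list(unsigned long nr_to_scan)
	{
		unsigned long scanned = 0;

		while (nr_to_scan > 0) {
			/* even nr_to_scan == 1 isolates a full batch */
			scanned += SWAP_CLUSTER_MAX;
			nr_to_scan -= (nr_to_scan > SWAP_CLUSTER_MAX) ?
					SWAP_CLUSTER_MAX : nr_to_scan;
		}
		return scanned;
	}

	int main(void)
	{
		/*
		 * Without batched vmscan, a memcg may ask to scan just one
		 * page of a small inactive list, yet a whole batch gets
		 * isolated -- inflating the effective scan rate and keeping
		 * the list from ever growing.
		 */
		printf("asked to scan 1, actually scanned %lu\n",
		       scanned_by_shrink_list(1));
		return 0;
	}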
>>
>> This patch does the right thing.
>> However, let me explain why I and the memcg folks didn't do this in
>> the past.
>>
>> Strangely, some memcg struct declarations are hidden in *.c. Thus we
>> can't make inline functions, and we hesitated to introduce a lot of
>> function-call overhead.
>>
>> So, can we move some of the memcg structure declarations to *.h and
>> make mem_cgroup_get_saved_scan() an inline function?
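
For context, the obstacle is that a forward-declared struct hides its
layout from other translation units, so every accessor must be an
out-of-line call; a hypothetical illustration (not the memcg code itself):

	/* foo.h: forward declaration only -- the layout is hidden */
	struct foo;
	unsigned long foo_count(const struct foo *f);

	/* foo.c: the full definition is private to this file */
	struct foo {
		unsigned long count;
	};

	unsigned long foo_count(const struct foo *f)
	{
		return f->count;	/* one function call per access */
	}

Moving the definition of struct foo into foo.h would instead allow a
"static inline unsigned long foo_count(...)" in the header, eliminating
the call overhead -- which is exactly the trade-off discussed here.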
>
> OK, here it is. I had to move big chunks to make it compile, and it
> did reduce the code by a dozen lines :)
>
> Is this big copy&paste acceptable? (memcg developers CCed).
>
> Thanks,
> Fengguang

I don't like this. Please add hooks to the necessary places at this stage.
This will be too big for an inline function anyway.
Please move the definitions only after you find the call overhead is too big.

Thanks,
-Kame
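
A sketch of the alternative suggested here, assuming "hooks" means
exporting one small out-of-line helper per call site while struct
mem_cgroup stays private to mm/memcontrol.c. The helper's body is
illustrative only; mem_cgroup_get_saved_scan()'s real semantics are
whatever the vmscan caller needs:

	/* include/linux/memcontrol.h: the struct stays opaque */
	struct mem_cgroup;
	unsigned long mem_cgroup_get_saved_scan(struct mem_cgroup *memcg,
						struct zone *zone,
						enum lru_list lru);

	/* mm/memcontrol.c: only this file sees the full definition */
	unsigned long mem_cgroup_get_saved_scan(struct mem_cgroup *memcg,
						struct zone *zone,
						enum lru_list lru)
	{
		struct mem_cgroup_per_zone *mz =
			mem_cgroup_zoneinfo(memcg, zone->zone_pgdat->node_id,
					    zone_idx(zone));

		return MEM_CGROUP_ZSTAT(mz, lru);
	}

This keeps all the definitions in the .c file at the cost of one exported
call, which is the overhead Kame considers acceptable until measurements
show otherwise.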


> ---
>
> memcg: move definitions to .h and inline some functions
>
> Move the definitions into the header so that some accessors can be made
> inline functions.
>
> Signed-off-by: Wu Fengguang <fengguang.wu@...el.com>
> ---
>  include/linux/memcontrol.h |  154 ++++++++++++++++++++++++++++++-----
>  mm/memcontrol.c            |  131 -----------------------------
>  2 files changed, 134 insertions(+), 151 deletions(-)
>
> --- linux.orig/include/linux/memcontrol.h	2009-08-19 20:18:55.000000000 +0800
> +++ linux/include/linux/memcontrol.h	2009-08-19 20:51:06.000000000 +0800
> @@ -20,11 +20,144 @@
>  #ifndef _LINUX_MEMCONTROL_H
>  #define _LINUX_MEMCONTROL_H
>  #include <linux/cgroup.h>
> -struct mem_cgroup;
> +#include <linux/res_counter.h>
>  struct page_cgroup;
>  struct page;
>  struct mm_struct;
>
> +/*
> + * Statistics for memory cgroup.
> + */
> +enum mem_cgroup_stat_index {
> +	/*
> +	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
> +	 */
> +	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
> +	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
> +	MEM_CGROUP_STAT_MAPPED_FILE,	/* # of pages charged as file rss */
> +	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
> +	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
> +
> +	MEM_CGROUP_STAT_NSTATS,
> +};
> +
> +struct mem_cgroup_stat_cpu {
> +	s64 count[MEM_CGROUP_STAT_NSTATS];
> +} ____cacheline_aligned_in_smp;
> +
> +struct mem_cgroup_stat {
> +	struct mem_cgroup_stat_cpu cpustat[0];
> +};
> +
> +/*
> + * per-zone information in memory controller.
> + */
> +struct mem_cgroup_per_zone {
> +	/*
> +	 * spin_lock to protect the per cgroup LRU
> +	 */
> +	struct list_head	lists[NR_LRU_LISTS];
> +	unsigned long		count[NR_LRU_LISTS];
> +
> +	struct zone_reclaim_stat reclaim_stat;
> +};
> +/* Macro for accessing counter */
> +#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
> +
> +struct mem_cgroup_per_node {
> +	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
> +};
> +
> +struct mem_cgroup_lru_info {
> +	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
> +};
> +
> +/*
> + * The memory controller data structure. The memory controller controls both
> + * page cache and RSS per cgroup. We would eventually like to provide
> + * statistics based on the statistics developed by Rik Van Riel for clock-pro,
> + * to help the administrator determine what knobs to tune.
> + *
> + * TODO: Add a water mark for the memory controller. Reclaim will begin when
> + * we hit the water mark. Maybe even add a low water mark, such that
> + * no reclaim occurs from a cgroup at its low water mark; this is
> + * a feature that will be implemented much later in the future.
> + */
> +struct mem_cgroup {
> +	struct cgroup_subsys_state css;
> +	/*
> +	 * the counter to account for memory usage
> +	 */
> +	struct res_counter res;
> +	/*
> +	 * the counter to account for mem+swap usage.
> +	 */
> +	struct res_counter memsw;
> +	/*
> +	 * Per cgroup active and inactive list, similar to the
> +	 * per zone LRU lists.
> +	 */
> +	struct mem_cgroup_lru_info info;
> +
> +	/*
> +	 * protects reclaim-related members.
> +	 */
> +	spinlock_t reclaim_param_lock;
> +
> +	int	prev_priority;	/* for recording reclaim priority */
> +
> +	/*
> +	 * While reclaiming in a hierarchy, we cache the last child we
> +	 * reclaimed from.
> +	 */
> +	int last_scanned_child;
> +	/*
> +	 * Should the accounting and control be hierarchical, per subtree?
> +	 */
> +	bool use_hierarchy;
> +	unsigned long	last_oom_jiffies;
> +	atomic_t	refcnt;
> +
> +	unsigned int	swappiness;
> +
> +	/* set when res.limit == memsw.limit */
> +	bool		memsw_is_minimum;
> +
> +	/*
> +	 * statistics. This must be placed at the end of memcg.
> +	 */
> +	struct mem_cgroup_stat stat;
> +};
> +
> +static inline struct mem_cgroup_per_zone *
> +mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
> +{
> +	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
> +}
> +
> +static inline unsigned long
> +mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
> +			 struct zone *zone,
> +			 enum lru_list lru)
> +{
> +	int nid = zone->zone_pgdat->node_id;
> +	int zid = zone_idx(zone);
> +	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
> +
> +	return MEM_CGROUP_ZSTAT(mz, lru);
> +}
> +
> +static inline struct zone_reclaim_stat *
> +mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone)
> +{
> +	int nid = zone->zone_pgdat->node_id;
> +	int zid = zone_idx(zone);
> +	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
> +
> +	return &mz->reclaim_stat;
> +}
> +
> +
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
>  /*
>   * All "charge" functions with gfp_mask should use GFP_KERNEL or
> @@ -95,11 +228,6 @@ extern void mem_cgroup_record_reclaim_pr
>  							int priority);
>  int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
>  int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> -unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
> -				       struct zone *zone,
> -				       enum lru_list lru);
> -struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
> -						      struct zone *zone);
>  struct zone_reclaim_stat*
>  mem_cgroup_get_reclaim_stat_from_page(struct page *page);
>  extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
> @@ -246,20 +374,6 @@ mem_cgroup_inactive_file_is_low(struct m
>  	return 1;
>  }
>
> -static inline unsigned long
> -mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone,
> -			 enum lru_list lru)
> -{
> -	return 0;
> -}
> -
> -
> -static inline struct zone_reclaim_stat*
> -mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone)
> -{
> -	return NULL;
> -}
> -
>  static inline struct zone_reclaim_stat*
>  mem_cgroup_get_reclaim_stat_from_page(struct page *page)
>  {
> --- linux.orig/mm/memcontrol.c	2009-08-19 20:14:56.000000000 +0800
> +++ linux/mm/memcontrol.c	2009-08-19 20:46:50.000000000 +0800
> @@ -55,30 +55,6 @@ static int really_do_swap_account __init
>  static DEFINE_MUTEX(memcg_tasklist);	/* can be hold under cgroup_mutex */
>
>  /*
> - * Statistics for memory cgroup.
> - */
> -enum mem_cgroup_stat_index {
> -	/*
> -	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
> -	 */
> -	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
> -	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
> -	MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
> -	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
> -	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
> -
> -	MEM_CGROUP_STAT_NSTATS,
> -};
> -
> -struct mem_cgroup_stat_cpu {
> -	s64 count[MEM_CGROUP_STAT_NSTATS];
> -} ____cacheline_aligned_in_smp;
> -
> -struct mem_cgroup_stat {
> -	struct mem_cgroup_stat_cpu cpustat[0];
> -};
> -
> -/*
>   * For accounting under irq disable, no need for increment preempt count.
>   */
>  static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
> @@ -106,86 +82,6 @@ static s64 mem_cgroup_local_usage(struct
>  	return ret;
>  }
>
> -/*
> - * per-zone information in memory controller.
> - */
> -struct mem_cgroup_per_zone {
> -	/*
> -	 * spin_lock to protect the per cgroup LRU
> -	 */
> -	struct list_head	lists[NR_LRU_LISTS];
> -	unsigned long		count[NR_LRU_LISTS];
> -
> -	struct zone_reclaim_stat reclaim_stat;
> -};
> -/* Macro for accessing counter */
> -#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
> -
> -struct mem_cgroup_per_node {
> -	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
> -};
> -
> -struct mem_cgroup_lru_info {
> -	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
> -};
> -
> -/*
> - * The memory controller data structure. The memory controller controls both
> - * page cache and RSS per cgroup. We would eventually like to provide
> - * statistics based on the statistics developed by Rik Van Riel for clock-pro,
> - * to help the administrator determine what knobs to tune.
> - *
> - * TODO: Add a water mark for the memory controller. Reclaim will begin when
> - * we hit the water mark. Maybe even add a low water mark, such that
> - * no reclaim occurs from a cgroup at its low water mark; this is
> - * a feature that will be implemented much later in the future.
> - */
> -struct mem_cgroup {
> -	struct cgroup_subsys_state css;
> -	/*
> -	 * the counter to account for memory usage
> -	 */
> -	struct res_counter res;
> -	/*
> -	 * the counter to account for mem+swap usage.
> -	 */
> -	struct res_counter memsw;
> -	/*
> -	 * Per cgroup active and inactive list, similar to the
> -	 * per zone LRU lists.
> -	 */
> -	struct mem_cgroup_lru_info info;
> -
> -	/*
> -	 * protects reclaim-related members.
> -	 */
> -	spinlock_t reclaim_param_lock;
> -
> -	int	prev_priority;	/* for recording reclaim priority */
> -
> -	/*
> -	 * While reclaiming in a hierarchy, we cache the last child we
> -	 * reclaimed from.
> -	 */
> -	int last_scanned_child;
> -	/*
> -	 * Should the accounting and control be hierarchical, per subtree?
> -	 */
> -	bool use_hierarchy;
> -	unsigned long	last_oom_jiffies;
> -	atomic_t	refcnt;
> -
> -	unsigned int	swappiness;
> -
> -	/* set when res.limit == memsw.limit */
> -	bool		memsw_is_minimum;
> -
> -	/*
> -	 * statistics. This must be placed at the end of memcg.
> -	 */
> -	struct mem_cgroup_stat stat;
> -};
> -
>  enum charge_type {
>  	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
>  	MEM_CGROUP_CHARGE_TYPE_MAPPED,
> @@ -244,12 +140,6 @@ static void mem_cgroup_charge_statistics
>  }
>
>  static struct mem_cgroup_per_zone *
> -mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
> -{
> -	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
> -}
> -
> -static struct mem_cgroup_per_zone *
>  page_cgroup_zoneinfo(struct page_cgroup *pc)
>  {
>  	struct mem_cgroup *mem = pc->mem_cgroup;
> @@ -586,27 +476,6 @@ int mem_cgroup_inactive_file_is_low(stru
>  	return (active > inactive);
>  }
>
> -unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
> -				       struct zone *zone,
> -				       enum lru_list lru)
> -{
> -	int nid = zone->zone_pgdat->node_id;
> -	int zid = zone_idx(zone);
> -	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
> -
> -	return MEM_CGROUP_ZSTAT(mz, lru);
> -}
> -
> -struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
> -						      struct zone *zone)
> -{
> -	int nid = zone->zone_pgdat->node_id;
> -	int zid = zone_idx(zone);
> -	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
> -
> -	return &mz->reclaim_stat;
> -}
> -
>  struct zone_reclaim_stat *
>  mem_cgroup_get_reclaim_stat_from_page(struct page *page)
>  {


