linux-kernel - Re: [PATCH 4/7] bio-cgroup: Split the cgroup memory subsystem into two parts

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20080818103938.8b4e8bfa.kamezawa.hiroyu@jp.fujitsu.com>
Date:	Mon, 18 Aug 2008 10:39:38 +0900
From:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To:	Ryo Tsuruta <ryov@...inux.co.jp>
Cc:	linux-kernel@...r.kernel.org, dm-devel@...hat.com,
	containers@...ts.linux-foundation.org,
	virtualization@...ts.linux-foundation.org,
	xen-devel@...ts.xensource.com, agk@...rceware.org,
	balbir@...ux.vnet.ibm.com, xemul@...nvz.org, fernando@....ntt.co.jp
Subject: Re: [PATCH 4/7] bio-cgroup: Split the cgroup memory subsystem into
 two parts

On Tue, 12 Aug 2008 21:35:33 +0900 (JST)
Ryo Tsuruta <ryov@...inux.co.jp> wrote:

> This patch splits the cgroup memory subsystem into two parts.
> One is for tracking pages to find out the owners. The other is
> for controlling how much memory should be assigned to
> each cgroup.
> 
> With this patch, you can use the page tracking mechanism even if
> the memory subsystem is off.
> 

I'm now writing remove-lock-page-cgroup patches, and they are working well.
Please wait for a while...

Thanks,
-Kame


> Based on 2.6.27-rc1-mm1
> Signed-off-by: Ryo Tsuruta <ryov@...inux.co.jp>
> Signed-off-by: Hirokazu Takahashi <taka@...inux.co.jp>
> 
> diff -Ndupr linux-2.6.27-rc1-mm1.ioband/include/linux/memcontrol.h linux-2.6.27-rc1-mm1.cg0/include/linux/memcontrol.h
> --- linux-2.6.27-rc1-mm1.ioband/include/linux/memcontrol.h	2008-08-12 14:30:19.000000000 +0900
> +++ linux-2.6.27-rc1-mm1.cg0/include/linux/memcontrol.h	2008-08-12 14:47:11.000000000 +0900
> @@ -20,12 +20,62 @@
>  #ifndef _LINUX_MEMCONTROL_H
>  #define _LINUX_MEMCONTROL_H
>  
> +#include <linux/rcupdate.h>
> +#include <linux/mm.h>
> +#include <linux/smp.h>
> +#include <linux/bit_spinlock.h>
> +
>  struct mem_cgroup;
>  struct page_cgroup;
>  struct page;
>  struct mm_struct;
>  
> +#ifdef CONFIG_CGROUP_PAGE
> +/*
> + * We use the lower bit of the page->page_cgroup pointer as a bit spin
> + * lock.  We need to ensure that page->page_cgroup is at least two
> + * byte aligned (based on comments from Nick Piggin).  But since
> + * bit_spin_lock doesn't actually set that lock bit in a non-debug
> + * uniprocessor kernel, we should avoid setting it here too.
> + */
> +#define PAGE_CGROUP_LOCK_BIT    0x0
> +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> +#define PAGE_CGROUP_LOCK        (1 << PAGE_CGROUP_LOCK_BIT)
> +#else
> +#define PAGE_CGROUP_LOCK        0x0
> +#endif
> +
> +/*
> + * A page_cgroup page is associated with every page descriptor. The
> + * page_cgroup helps us identify information about the cgroup
> + */
> +struct page_cgroup {
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +	struct list_head lru;		/* per cgroup LRU list */
> +	struct mem_cgroup *mem_cgroup;
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +	struct page *page;
> +	int flags;
> +};
> +#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
> +#define PAGE_CGROUP_FLAG_ACTIVE	(0x2)	/* page is active in this cgroup */
> +#define PAGE_CGROUP_FLAG_FILE	(0x4)	/* page is file system backed */
> +#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictable */
> +
> +static inline void lock_page_cgroup(struct page *page)
> +{
> +	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +}
> +
> +static inline int try_lock_page_cgroup(struct page *page)
> +{
> +	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +}
> +
> +static inline void unlock_page_cgroup(struct page *page)
> +{
> +	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +}
>  
>  #define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
>  
> @@ -34,45 +84,15 @@ extern int mem_cgroup_charge(struct page
>  				gfp_t gfp_mask);
>  extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>  					gfp_t gfp_mask);
> -extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
>  extern void mem_cgroup_uncharge_page(struct page *page);
>  extern void mem_cgroup_uncharge_cache_page(struct page *page);
> -extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask);
> -
> -extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
> -					struct list_head *dst,
> -					unsigned long *scanned, int order,
> -					int mode, struct zone *z,
> -					struct mem_cgroup *mem_cont,
> -					int active, int file);
> -extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
> -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
> -
> -extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
> -
> -#define mm_match_cgroup(mm, cgroup)	\
> -	((cgroup) == mem_cgroup_from_task((mm)->owner))
>  
>  extern int
>  mem_cgroup_prepare_migration(struct page *page, struct page *newpage);
>  extern void mem_cgroup_end_migration(struct page *page);
> +extern void page_cgroup_init(void);
>  
> -/*
> - * For memory reclaim.
> - */
> -extern int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem);
> -extern long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem);
> -
> -extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem);
> -extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
> -							int priority);
> -extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
> -							int priority);
> -
> -extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
> -					int priority, enum lru_list lru);
> -
> -#else /* CONFIG_CGROUP_MEM_RES_CTLR */
> +#else /* CONFIG_CGROUP_PAGE */
>  static inline void page_reset_bad_cgroup(struct page *page)
>  {
>  }
> @@ -102,6 +122,53 @@ static inline void mem_cgroup_uncharge_c
>  {
>  }
>  
> +static inline int
> +mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
> +{
> +	return 0;
> +}
> +
> +static inline void mem_cgroup_end_migration(struct page *page)
> +{
> +}
> +#endif /* CONFIG_CGROUP_PAGE */
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +
> +extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
> +extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask);
> +
> +extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
> +					struct list_head *dst,
> +					unsigned long *scanned, int order,
> +					int mode, struct zone *z,
> +					struct mem_cgroup *mem_cont,
> +					int active, int file);
> +extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
> +int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
> +
> +extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
> +
> +#define mm_match_cgroup(mm, cgroup)	\
> +	((cgroup) == mem_cgroup_from_task((mm)->owner))
> +
> +/*
> + * For memory reclaim.
> + */
> +extern int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem);
> +extern long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem);
> +
> +extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem);
> +extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
> +							int priority);
> +extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
> +							int priority);
> +
> +extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
> +					int priority, enum lru_list lru);
> +
> +#else /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
>  static inline int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
>  {
>  	return 0;
> @@ -122,16 +189,6 @@ static inline int task_in_mem_cgroup(str
>  	return 1;
>  }
>  
> -static inline int
> -mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
> -{
> -	return 0;
> -}
> -
> -static inline void mem_cgroup_end_migration(struct page *page)
> -{
> -}
> -
>  static inline int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
>  {
>  	return 0;
> @@ -163,7 +220,8 @@ static inline long mem_cgroup_calc_recla
>  {
>  	return 0;
>  }
> -#endif /* CONFIG_CGROUP_MEM_CONT */
> +
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
>  
>  #endif /* _LINUX_MEMCONTROL_H */
>  
> diff -Ndupr linux-2.6.27-rc1-mm1.ioband/include/linux/mm_types.h linux-2.6.27-rc1-mm1.cg0/include/linux/mm_types.h
> --- linux-2.6.27-rc1-mm1.ioband/include/linux/mm_types.h	2008-08-12 14:30:19.000000000 +0900
> +++ linux-2.6.27-rc1-mm1.cg0/include/linux/mm_types.h	2008-08-12 14:47:11.000000000 +0900
> @@ -92,7 +92,7 @@ struct page {
>  	void *virtual;			/* Kernel virtual address (NULL if
>  					   not kmapped, ie. highmem) */
>  #endif /* WANT_PAGE_VIRTUAL */
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_CGROUP_PAGE
>  	unsigned long page_cgroup;
>  #endif
>  
> diff -Ndupr linux-2.6.27-rc1-mm1.ioband/init/Kconfig linux-2.6.27-rc1-mm1.cg0/init/Kconfig
> --- linux-2.6.27-rc1-mm1.ioband/init/Kconfig	2008-08-12 14:30:19.000000000 +0900
> +++ linux-2.6.27-rc1-mm1.cg0/init/Kconfig	2008-08-12 14:47:11.000000000 +0900
> @@ -418,6 +418,10 @@ config CGROUP_MEMRLIMIT_CTLR
>  	  memory RSS and Page Cache control. Virtual address space control
>  	  is provided by this controller.
>  
> +config CGROUP_PAGE
> +	def_bool y
> +	depends on CGROUP_MEM_RES_CTLR
> +
>  config SYSFS_DEPRECATED
>  	bool
>  
> diff -Ndupr linux-2.6.27-rc1-mm1.ioband/mm/Makefile linux-2.6.27-rc1-mm1.cg0/mm/Makefile
> --- linux-2.6.27-rc1-mm1.ioband/mm/Makefile	2008-08-12 14:30:19.000000000 +0900
> +++ linux-2.6.27-rc1-mm1.cg0/mm/Makefile	2008-08-12 14:47:11.000000000 +0900
> @@ -34,5 +34,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
>  obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_SMP) += allocpercpu.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
> -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
> +obj-$(CONFIG_CGROUP_PAGE) += memcontrol.o
>  obj-$(CONFIG_CGROUP_MEMRLIMIT_CTLR) += memrlimitcgroup.o
> diff -Ndupr linux-2.6.27-rc1-mm1.ioband/mm/memcontrol.c linux-2.6.27-rc1-mm1.cg0/mm/memcontrol.c
> --- linux-2.6.27-rc1-mm1.ioband/mm/memcontrol.c	2008-08-12 14:30:19.000000000 +0900
> +++ linux-2.6.27-rc1-mm1.cg0/mm/memcontrol.c	2008-08-12 14:47:11.000000000 +0900
> @@ -36,10 +36,25 @@
>  
>  #include <asm/uaccess.h>
>  
> -struct cgroup_subsys mem_cgroup_subsys __read_mostly;
> +enum charge_type {
> +	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
> +	MEM_CGROUP_CHARGE_TYPE_MAPPED,
> +	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
> +};
> +
> +static void __mem_cgroup_uncharge_common(struct page *, enum charge_type);
> +
>  static struct kmem_cache *page_cgroup_cache __read_mostly;
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +struct cgroup_subsys mem_cgroup_subsys __read_mostly;
>  #define MEM_CGROUP_RECLAIM_RETRIES	5
>  
> +static inline int mem_cgroup_disabled(void)
> +{
> +	return mem_cgroup_subsys.disabled;
> +}
> +
>  /*
>   * Statistics for memory cgroup.
>   */
> @@ -136,35 +151,6 @@ struct mem_cgroup {
>  };
>  static struct mem_cgroup init_mem_cgroup;
>  
> -/*
> - * We use the lower bit of the page->page_cgroup pointer as a bit spin
> - * lock.  We need to ensure that page->page_cgroup is at least two
> - * byte aligned (based on comments from Nick Piggin).  But since
> - * bit_spin_lock doesn't actually set that lock bit in a non-debug
> - * uniprocessor kernel, we should avoid setting it here too.
> - */
> -#define PAGE_CGROUP_LOCK_BIT 	0x0
> -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> -#define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
> -#else
> -#define PAGE_CGROUP_LOCK	0x0
> -#endif
> -
> -/*
> - * A page_cgroup page is associated with every page descriptor. The
> - * page_cgroup helps us identify information about the cgroup
> - */
> -struct page_cgroup {
> -	struct list_head lru;		/* per cgroup LRU list */
> -	struct page *page;
> -	struct mem_cgroup *mem_cgroup;
> -	int flags;
> -};
> -#define PAGE_CGROUP_FLAG_CACHE	   (0x1)	/* charged as cache */
> -#define PAGE_CGROUP_FLAG_ACTIVE    (0x2)	/* page is active in this cgroup */
> -#define PAGE_CGROUP_FLAG_FILE	   (0x4)	/* page is file system backed */
> -#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictableable */
> -
>  static int page_cgroup_nid(struct page_cgroup *pc)
>  {
>  	return page_to_nid(pc->page);
> @@ -175,12 +161,6 @@ static enum zone_type page_cgroup_zid(st
>  	return page_zonenum(pc->page);
>  }
>  
> -enum charge_type {
> -	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
> -	MEM_CGROUP_CHARGE_TYPE_MAPPED,
> -	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
> -};
> -
>  /*
>   * Always modified under lru lock. Then, not necessary to preempt_disable()
>   */
> @@ -248,37 +228,6 @@ struct mem_cgroup *mem_cgroup_from_task(
>  				struct mem_cgroup, css);
>  }
>  
> -static inline int page_cgroup_locked(struct page *page)
> -{
> -	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
> -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
> -{
> -	VM_BUG_ON(!page_cgroup_locked(page));
> -	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
> -}
> -
> -struct page_cgroup *page_get_page_cgroup(struct page *page)
> -{
> -	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
> -}
> -
> -static void lock_page_cgroup(struct page *page)
> -{
> -	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
> -static int try_lock_page_cgroup(struct page *page)
> -{
> -	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
> -static void unlock_page_cgroup(struct page *page)
> -{
> -	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
>  static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
>  			struct page_cgroup *pc)
>  {
> @@ -367,7 +316,7 @@ void mem_cgroup_move_lists(struct page *
>  	struct mem_cgroup_per_zone *mz;
>  	unsigned long flags;
>  
> -	if (mem_cgroup_subsys.disabled)
> +	if (mem_cgroup_disabled())
>  		return;
>  
>  	/*
> @@ -506,273 +455,6 @@ unsigned long mem_cgroup_isolate_pages(u
>  }
>  
>  /*
> - * Charge the memory controller for page usage.
> - * Return
> - * 0 if the charge was successful
> - * < 0 if the cgroup is over its limit
> - */
> -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
> -				gfp_t gfp_mask, enum charge_type ctype,
> -				struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup *mem;
> -	struct page_cgroup *pc;
> -	unsigned long flags;
> -	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> -	struct mem_cgroup_per_zone *mz;
> -
> -	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
> -	if (unlikely(pc == NULL))
> -		goto err;
> -
> -	/*
> -	 * We always charge the cgroup the mm_struct belongs to.
> -	 * The mm_struct's mem_cgroup changes on task migration if the
> -	 * thread group leader migrates. It's possible that mm is not
> -	 * set, if so charge the init_mm (happens for pagecache usage).
> -	 */
> -	if (likely(!memcg)) {
> -		rcu_read_lock();
> -		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
> -		/*
> -		 * For every charge from the cgroup, increment reference count
> -		 */
> -		css_get(&mem->css);
> -		rcu_read_unlock();
> -	} else {
> -		mem = memcg;
> -		css_get(&memcg->css);
> -	}
> -
> -	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
> -		if (!(gfp_mask & __GFP_WAIT))
> -			goto out;
> -
> -		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
> -			continue;
> -
> -		/*
> -		 * try_to_free_mem_cgroup_pages() might not give us a full
> -		 * picture of reclaim. Some pages are reclaimed and might be
> -		 * moved to swap cache or just unmapped from the cgroup.
> -		 * Check the limit again to see if the reclaim reduced the
> -		 * current usage of the cgroup before giving up
> -		 */
> -		if (res_counter_check_under_limit(&mem->res))
> -			continue;
> -
> -		if (!nr_retries--) {
> -			mem_cgroup_out_of_memory(mem, gfp_mask);
> -			goto out;
> -		}
> -	}
> -
> -	pc->mem_cgroup = mem;
> -	pc->page = page;
> -	/*
> -	 * If a page is accounted as a page cache, insert to inactive list.
> -	 * If anon, insert to active list.
> -	 */
> -	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
> -		pc->flags = PAGE_CGROUP_FLAG_CACHE;
> -		if (page_is_file_cache(page))
> -			pc->flags |= PAGE_CGROUP_FLAG_FILE;
> -		else
> -			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
> -	} else
> -		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
> -
> -	lock_page_cgroup(page);
> -	if (unlikely(page_get_page_cgroup(page))) {
> -		unlock_page_cgroup(page);
> -		res_counter_uncharge(&mem->res, PAGE_SIZE);
> -		css_put(&mem->css);
> -		kmem_cache_free(page_cgroup_cache, pc);
> -		goto done;
> -	}
> -	page_assign_page_cgroup(page, pc);
> -
> -	mz = page_cgroup_zoneinfo(pc);
> -	spin_lock_irqsave(&mz->lru_lock, flags);
> -	__mem_cgroup_add_list(mz, pc);
> -	spin_unlock_irqrestore(&mz->lru_lock, flags);
> -
> -	unlock_page_cgroup(page);
> -done:
> -	return 0;
> -out:
> -	css_put(&mem->css);
> -	kmem_cache_free(page_cgroup_cache, pc);
> -err:
> -	return -ENOMEM;
> -}
> -
> -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
> -{
> -	if (mem_cgroup_subsys.disabled)
> -		return 0;
> -
> -	/*
> -	 * If already mapped, we don't have to account.
> -	 * If page cache, page->mapping has address_space.
> -	 * But page->mapping may have out-of-use anon_vma pointer,
> -	 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
> -	 * is NULL.
> -  	 */
> -	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
> -		return 0;
> -	if (unlikely(!mm))
> -		mm = &init_mm;
> -	return mem_cgroup_charge_common(page, mm, gfp_mask,
> -				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
> -}
> -
> -int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
> -				gfp_t gfp_mask)
> -{
> -	if (mem_cgroup_subsys.disabled)
> -		return 0;
> -
> -	/*
> -	 * Corner case handling. This is called from add_to_page_cache()
> -	 * in usual. But some FS (shmem) precharges this page before calling it
> -	 * and call add_to_page_cache() with GFP_NOWAIT.
> -	 *
> -	 * For GFP_NOWAIT case, the page may be pre-charged before calling
> -	 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
> -	 * charge twice. (It works but has to pay a bit larger cost.)
> -	 */
> -	if (!(gfp_mask & __GFP_WAIT)) {
> -		struct page_cgroup *pc;
> -
> -		lock_page_cgroup(page);
> -		pc = page_get_page_cgroup(page);
> -		if (pc) {
> -			VM_BUG_ON(pc->page != page);
> -			VM_BUG_ON(!pc->mem_cgroup);
> -			unlock_page_cgroup(page);
> -			return 0;
> -		}
> -		unlock_page_cgroup(page);
> -	}
> -
> -	if (unlikely(!mm))
> -		mm = &init_mm;
> -
> -	return mem_cgroup_charge_common(page, mm, gfp_mask,
> -				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
> -}
> -
> -/*
> - * uncharge if !page_mapped(page)
> - */
> -static void
> -__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
> -{
> -	struct page_cgroup *pc;
> -	struct mem_cgroup *mem;
> -	struct mem_cgroup_per_zone *mz;
> -	unsigned long flags;
> -
> -	if (mem_cgroup_subsys.disabled)
> -		return;
> -
> -	/*
> -	 * Check if our page_cgroup is valid
> -	 */
> -	lock_page_cgroup(page);
> -	pc = page_get_page_cgroup(page);
> -	if (unlikely(!pc))
> -		goto unlock;
> -
> -	VM_BUG_ON(pc->page != page);
> -
> -	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
> -	    && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
> -		|| page_mapped(page)))
> -		goto unlock;
> -
> -	mz = page_cgroup_zoneinfo(pc);
> -	spin_lock_irqsave(&mz->lru_lock, flags);
> -	__mem_cgroup_remove_list(mz, pc);
> -	spin_unlock_irqrestore(&mz->lru_lock, flags);
> -
> -	page_assign_page_cgroup(page, NULL);
> -	unlock_page_cgroup(page);
> -
> -	mem = pc->mem_cgroup;
> -	res_counter_uncharge(&mem->res, PAGE_SIZE);
> -	css_put(&mem->css);
> -
> -	kmem_cache_free(page_cgroup_cache, pc);
> -	return;
> -unlock:
> -	unlock_page_cgroup(page);
> -}
> -
> -void mem_cgroup_uncharge_page(struct page *page)
> -{
> -	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
> -}
> -
> -void mem_cgroup_uncharge_cache_page(struct page *page)
> -{
> -	VM_BUG_ON(page_mapped(page));
> -	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
> -}
> -
> -/*
> - * Before starting migration, account against new page.
> - */
> -int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
> -{
> -	struct page_cgroup *pc;
> -	struct mem_cgroup *mem = NULL;
> -	enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
> -	int ret = 0;
> -
> -	if (mem_cgroup_subsys.disabled)
> -		return 0;
> -
> -	lock_page_cgroup(page);
> -	pc = page_get_page_cgroup(page);
> -	if (pc) {
> -		mem = pc->mem_cgroup;
> -		css_get(&mem->css);
> -		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
> -			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
> -	}
> -	unlock_page_cgroup(page);
> -	if (mem) {
> -		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
> -			ctype, mem);
> -		css_put(&mem->css);
> -	}
> -	return ret;
> -}
> -
> -/* remove redundant charge if migration failed*/
> -void mem_cgroup_end_migration(struct page *newpage)
> -{
> -	/*
> -	 * At success, page->mapping is not NULL.
> -	 * special rollback care is necessary when
> -	 * 1. at migration failure. (newpage->mapping is cleared in this case)
> -	 * 2. the newpage was moved but not remapped again because the task
> -	 *    exits and the newpage is obsolete. In this case, the new page
> -	 *    may be a swapcache. So, we just call mem_cgroup_uncharge_page()
> -	 *    always for avoiding mess. The  page_cgroup will be removed if
> -	 *    unnecessary. File cache pages is still on radix-tree. Don't
> -	 *    care it.
> -	 */
> -	if (!newpage->mapping)
> -		__mem_cgroup_uncharge_common(newpage,
> -					 MEM_CGROUP_CHARGE_TYPE_FORCE);
> -	else if (PageAnon(newpage))
> -		mem_cgroup_uncharge_page(newpage);
> -}
> -
> -/*
>   * A call to try to shrink memory usage under specified resource controller.
>   * This is typically used for page reclaiming for shmem for reducing side
>   * effect of page allocation from shmem, which is used by some mem_cgroup.
> @@ -783,7 +465,7 @@ int mem_cgroup_shrink_usage(struct mm_st
>  	int progress = 0;
>  	int retry = MEM_CGROUP_RECLAIM_RETRIES;
>  
> -	if (mem_cgroup_subsys.disabled)
> +	if (mem_cgroup_disabled())
>  		return 0;
>  
>  	rcu_read_lock();
> @@ -1104,7 +786,7 @@ mem_cgroup_create(struct cgroup_subsys *
>  
>  	if (unlikely((cont->parent) == NULL)) {
>  		mem = &init_mem_cgroup;
> -		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
> +		page_cgroup_init();
>  	} else {
>  		mem = mem_cgroup_alloc();
>  		if (!mem)
> @@ -1188,3 +870,325 @@ struct cgroup_subsys mem_cgroup_subsys =
>  	.attach = mem_cgroup_move_task,
>  	.early_init = 0,
>  };
> +
> +#else /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +static inline int mem_cgroup_disabled(void)
> +{
> +	return 1;
> +}
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +static inline int page_cgroup_locked(struct page *page)
> +{
> +	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +}
> +
> +static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
> +{
> +	VM_BUG_ON(!page_cgroup_locked(page));
> +	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
> +}
> +
> +struct page_cgroup *page_get_page_cgroup(struct page *page)
> +{
> +	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
> +}
> +
> +/*
> + * Charge the memory controller for page usage.
> + * Return
> + * 0 if the charge was successful
> + * < 0 if the cgroup is over its limit
> + */
> +static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
> +				gfp_t gfp_mask, enum charge_type ctype,
> +				struct mem_cgroup *memcg)
> +{
> +	struct page_cgroup *pc;
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +	struct mem_cgroup *mem;
> +	unsigned long flags;
> +	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> +	struct mem_cgroup_per_zone *mz;
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
> +	if (unlikely(pc == NULL))
> +		goto err;
> +
> +	/*
> +	 * We always charge the cgroup the mm_struct belongs to.
> +	 * The mm_struct's mem_cgroup changes on task migration if the
> +	 * thread group leader migrates. It's possible that mm is not
> +	 * set, if so charge the init_mm (happens for pagecache usage).
> +	 */
> +	if (likely(!memcg)) {
> +		rcu_read_lock();
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
> +		/*
> +		 * For every charge from the cgroup, increment reference count
> +		 */
> +		css_get(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +		rcu_read_unlock();
> +	} else {
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +		mem = memcg;
> +		css_get(&memcg->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +	}
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
> +		if (!(gfp_mask & __GFP_WAIT))
> +			goto out;
> +
> +		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
> +			continue;
> +
> +		/*
> +		 * try_to_free_mem_cgroup_pages() might not give us a full
> +		 * picture of reclaim. Some pages are reclaimed and might be
> +		 * moved to swap cache or just unmapped from the cgroup.
> +		 * Check the limit again to see if the reclaim reduced the
> +		 * current usage of the cgroup before giving up
> +		 */
> +		if (res_counter_check_under_limit(&mem->res))
> +			continue;
> +
> +		if (!nr_retries--) {
> +			mem_cgroup_out_of_memory(mem, gfp_mask);
> +			goto out;
> +		}
> +	}
> +	pc->mem_cgroup = mem;
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +	pc->page = page;
> +	/*
> +	 * If a page is accounted as a page cache, insert to inactive list.
> +	 * If anon, insert to active list.
> +	 */
> +	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
> +		pc->flags = PAGE_CGROUP_FLAG_CACHE;
> +		if (page_is_file_cache(page))
> +			pc->flags |= PAGE_CGROUP_FLAG_FILE;
> +		else
> +			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
> +	} else
> +		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
> +
> +	lock_page_cgroup(page);
> +	if (unlikely(page_get_page_cgroup(page))) {
> +		unlock_page_cgroup(page);
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +		res_counter_uncharge(&mem->res, PAGE_SIZE);
> +		css_put(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +		kmem_cache_free(page_cgroup_cache, pc);
> +		goto done;
> +	}
> +	page_assign_page_cgroup(page, pc);
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +	mz = page_cgroup_zoneinfo(pc);
> +	spin_lock_irqsave(&mz->lru_lock, flags);
> +	__mem_cgroup_add_list(mz, pc);
> +	spin_unlock_irqrestore(&mz->lru_lock, flags);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +	unlock_page_cgroup(page);
> +done:
> +	return 0;
> +out:
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +	css_put(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +	kmem_cache_free(page_cgroup_cache, pc);
> +err:
> +	return -ENOMEM;
> +}
> +
> +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
> +{
> +	if (mem_cgroup_disabled())
> +		return 0;
> +
> +	/*
> +	 * If already mapped, we don't have to account.
> +	 * If page cache, page->mapping has address_space.
> +	 * But page->mapping may have out-of-use anon_vma pointer,
> +	 * detect it by PageAnon() check. newly-mapped-anon's page->mapping
> +	 * is NULL.
> +	 */
> +	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
> +		return 0;
> +	if (unlikely(!mm))
> +		mm = &init_mm;
> +	return mem_cgroup_charge_common(page, mm, gfp_mask,
> +				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
> +}
> +
> +int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
> +				gfp_t gfp_mask)
> +{
> +	if (mem_cgroup_disabled())
> +		return 0;
> +
> +	/*
> +	 * Corner case handling. This is called from add_to_page_cache()
> +	 * in usual. But some FS (shmem) precharges this page before calling it
> +	 * and call add_to_page_cache() with GFP_NOWAIT.
> +	 *
> +	 * For GFP_NOWAIT case, the page may be pre-charged before calling
> +	 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
> +	 * charge twice. (It works but has to pay a bit larger cost.)
> +	 */
> +	if (!(gfp_mask & __GFP_WAIT)) {
> +		struct page_cgroup *pc;
> +
> +		lock_page_cgroup(page);
> +		pc = page_get_page_cgroup(page);
> +		if (pc) {
> +			VM_BUG_ON(pc->page != page);
> +			VM_BUG_ON(!pc->mem_cgroup);
> +			unlock_page_cgroup(page);
> +			return 0;
> +		}
> +		unlock_page_cgroup(page);
> +	}
> +
> +	if (unlikely(!mm))
> +		mm = &init_mm;
> +
> +	return mem_cgroup_charge_common(page, mm, gfp_mask,
> +				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
> +}
> +
> +/*
> + * uncharge if !page_mapped(page)
> + */
> +static void
> +__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
> +{
> +	struct page_cgroup *pc;
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +	struct mem_cgroup *mem;
> +	struct mem_cgroup_per_zone *mz;
> +	unsigned long flags;
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +	if (mem_cgroup_disabled())
> +		return;
> +
> +	/*
> +	 * Check if our page_cgroup is valid
> +	 */
> +	lock_page_cgroup(page);
> +	pc = page_get_page_cgroup(page);
> +	if (unlikely(!pc))
> +		goto unlock;
> +
> +	VM_BUG_ON(pc->page != page);
> +
> +	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
> +	    && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
> +		|| page_mapped(page)
> +		|| PageSwapCache(page)))
> +		goto unlock;
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +	mz = page_cgroup_zoneinfo(pc);
> +	spin_lock_irqsave(&mz->lru_lock, flags);
> +	__mem_cgroup_remove_list(mz, pc);
> +	spin_unlock_irqrestore(&mz->lru_lock, flags);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +	page_assign_page_cgroup(page, NULL);
> +	unlock_page_cgroup(page);
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +	mem = pc->mem_cgroup;
> +	res_counter_uncharge(&mem->res, PAGE_SIZE);
> +	css_put(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +	kmem_cache_free(page_cgroup_cache, pc);
> +	return;
> +unlock:
> +	unlock_page_cgroup(page);
> +}
> +
> +void mem_cgroup_uncharge_page(struct page *page)
> +{
> +	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
> +}
> +
> +void mem_cgroup_uncharge_cache_page(struct page *page)
> +{
> +	VM_BUG_ON(page_mapped(page));
> +	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
> +}
> +
> +/*
> + * Before starting migration, account against new page.
> + */
> +int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
> +{
> +	struct page_cgroup *pc;
> +	struct mem_cgroup *mem = NULL;
> +	enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
> +	int ret = 0;
> +
> +	if (mem_cgroup_disabled())
> +		return 0;
> +
> +	lock_page_cgroup(page);
> +	pc = page_get_page_cgroup(page);
> +	if (pc) {
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +		mem = pc->mem_cgroup;
> +		css_get(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
> +			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
> +	}
> +	unlock_page_cgroup(page);
> +	if (mem) {
> +		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
> +			ctype, mem);
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +		css_put(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +	}
> +	return ret;
> +}
> +
> +/* remove redundant charge if migration failed*/
> +void mem_cgroup_end_migration(struct page *newpage)
> +{
> +	/*
> +	 * At success, page->mapping is not NULL.
> +	 * special rollback care is necessary when
> +	 * 1. at migration failure. (newpage->mapping is cleared in this case)
> +	 * 2. the newpage was moved but not remapped again because the task
> +	 *    exits and the newpage is obsolete. In this case, the new page
> +	 *    may be a swapcache. So, we just call mem_cgroup_uncharge_page()
> +	 *    always for avoiding mess. The  page_cgroup will be removed if
> +	 *    unnecessary. File cache pages is still on radix-tree. Don't
> +	 *    care it.
> +	 */
> +	if (!newpage->mapping)
> +		__mem_cgroup_uncharge_common(newpage,
> +					 MEM_CGROUP_CHARGE_TYPE_FORCE);
> +	else if (PageAnon(newpage))
> +		mem_cgroup_uncharge_page(newpage);
> +}
> +
> +void page_cgroup_init()
> +{
> +	if (!page_cgroup_cache)
> +		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
> +}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/