Message-Id: <20080910104940.a7ec9b5a.kamezawa.hiroyu@jp.fujitsu.com>
Date:	Wed, 10 Sep 2008 10:49:40 +0900
From:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To:	balbir@...ux.vnet.ibm.com
Cc:	Nick Piggin <nickpiggin@...oo.com.au>,
	Andrew Morton <akpm@...ux-foundation.org>, hugh@...itas.com,
	menage@...gle.com, xemul@...nvz.org, linux-kernel@...r.kernel.org,
	linux-mm@...ck.org
Subject: Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct
 page

On Tue, 9 Sep 2008 18:20:48 -0700
Balbir Singh <balbir@...ux.vnet.ibm.com> wrote:

> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com> [2008-09-09 21:30:12]:
> OK, here is approach #2, it works for me and gives me really good
> performance (surpassing even the current memory controller). I am
> seeing almost a 7% increase
This number probably comes from the pre-allocation.
Do we really allocate all page_cgroups at boot? That seems like a big change.

> 
> Caveats
> 
> 1. Uses more memory (since it allocates memory for each node based on
>    spanned_pages. Ignores holes, so might not be the most efficient,
>    but it is a tradeoff of complexity versus space. I propose refining it
>    as we go along.
> 2. Does not currently handle alloc_bootmem failure
> 3. Needs lots of testing/tuning and polishing
> 
If we can do alloc-at-boot, we can make memcg much simpler.
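As a rough back-of-envelope sketch (userspace C, for illustration only; the
struct below just mirrors the patch's 64-bit page_cgroup layout, roughly 40
bytes), the boot-time cost on the 4G box mentioned below would be:

#include <stdio.h>

/* cost model only -- mirrors the patch's struct page_cgroup on 64-bit */
struct pcg_sketch {
	void *lru_next, *lru_prev;	/* struct list_head lru */
	void *page;			/* struct page *page */
	void *mem_cgroup;		/* struct mem_cgroup *mem_cgroup */
	unsigned long flags;
};

int main(void)
{
	unsigned long mem = 4UL << 30;		/* 4G of RAM */
	unsigned long pages = mem >> 12;	/* 4K pages */
	unsigned long cost = pages * sizeof(struct pcg_sketch);

	/* prints ~40MB, i.e. about 1% of RAM, paid up front at boot */
	printf("%lu page_cgroups, %lu bytes (%.1f%% of RAM)\n",
	       pages, cost, 100.0 * cost / mem);
	return 0;
}

So the pre-allocation itself is on the order of 1% of RAM; the question is
whether paying it unconditionally at boot time is acceptable.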



> I've tested it on an x86_64 box with 4G of memory
> 
> Again, this is an early RFC patch, please review and test.
> 
> Comments/Reviews?
> 
> Signed-off-by: Balbir Singh <balbir@...ux.vnet.ibm.com>
> ---
> 
>  include/linux/memcontrol.h |   32 ++++++
>  include/linux/mm_types.h   |    4 
>  mm/memcontrol.c            |  212 +++++++++++++++++++++++++++------------------
>  mm/page_alloc.c            |   10 --
>  4 files changed, 162 insertions(+), 96 deletions(-)
> 
> diff -puN mm/memcontrol.c~memcg_move_to_radix_tree mm/memcontrol.c
> --- linux-2.6.27-rc5/mm/memcontrol.c~memcg_move_to_radix_tree	2008-09-04 03:15:54.000000000 -0700
> +++ linux-2.6.27-rc5-balbir/mm/memcontrol.c	2008-09-09 17:56:54.000000000 -0700
> @@ -18,6 +18,7 @@
>   */
>  
>  #include <linux/res_counter.h>
> +#include <linux/bootmem.h>
>  #include <linux/memcontrol.h>
>  #include <linux/cgroup.h>
>  #include <linux/mm.h>
> @@ -37,9 +38,10 @@
>  #include <asm/uaccess.h>
>  
>  struct cgroup_subsys mem_cgroup_subsys __read_mostly;
> -static struct kmem_cache *page_cgroup_cache __read_mostly;
>  #define MEM_CGROUP_RECLAIM_RETRIES	5
>  
> +static struct page_cgroup *pcg_map[MAX_NUMNODES];
> +
>  /*
>   * Statistics for memory cgroup.
>   */
> @@ -137,20 +139,6 @@ struct mem_cgroup {
>  static struct mem_cgroup init_mem_cgroup;
>  
>  /*
> - * We use the lower bit of the page->page_cgroup pointer as a bit spin
> - * lock.  We need to ensure that page->page_cgroup is at least two
> - * byte aligned (based on comments from Nick Piggin).  But since
> - * bit_spin_lock doesn't actually set that lock bit in a non-debug
> - * uniprocessor kernel, we should avoid setting it here too.
> - */
> -#define PAGE_CGROUP_LOCK_BIT 	0x0
> -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> -#define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
> -#else
> -#define PAGE_CGROUP_LOCK	0x0
> -#endif
> -
> -/*
>   * A page_cgroup page is associated with every page descriptor. The
>   * page_cgroup helps us identify information about the cgroup
>   */
> @@ -158,12 +146,26 @@ struct page_cgroup {
>  	struct list_head lru;		/* per cgroup LRU list */
>  	struct page *page;
>  	struct mem_cgroup *mem_cgroup;
> -	int flags;
> +	unsigned long flags;
>  };
> -#define PAGE_CGROUP_FLAG_CACHE	   (0x1)	/* charged as cache */
> -#define PAGE_CGROUP_FLAG_ACTIVE    (0x2)	/* page is active in this cgroup */
> -#define PAGE_CGROUP_FLAG_FILE	   (0x4)	/* page is file system backed */
> -#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictableable */
> +
> +/*
> + * LOCK_BIT is 0, with value 1
> + */
> +#define PAGE_CGROUP_FLAG_LOCK_BIT  (0x0)  /* lock bit */
> +
> +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> +#define PAGE_CGROUP_FLAG_LOCK      (0x1)  /* lock value */
> +#else
> +#define PAGE_CGROUP_FLAG_LOCK      (0x0)  /* lock value */
> +#endif
> +
> +#define PAGE_CGROUP_FLAG_CACHE	   (0x2)   /* charged as cache */
> +#define PAGE_CGROUP_FLAG_ACTIVE    (0x4)   /* page is active in this cgroup */
> +#define PAGE_CGROUP_FLAG_FILE	   (0x8)   /* page is file system backed */
> +#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x10)/* page is unevictableable */
> +#define PAGE_CGROUP_FLAG_INUSE     (0x20)/* pc is allocated and in use */
> +#define PAGE_CGROUP_FLAG_VALID     (0x40)/* pc is allocated and in use */
>  
>  static int page_cgroup_nid(struct page_cgroup *pc)
>  {
> @@ -248,35 +250,99 @@ struct mem_cgroup *mem_cgroup_from_task(
>  				struct mem_cgroup, css);
>  }
>  
> -static inline int page_cgroup_locked(struct page *page)
> +static inline void lock_page_cgroup(struct page_cgroup *pc)
>  {
> -	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +	bit_spin_lock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>  }
>  
> -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
> +static inline int trylock_page_cgroup(struct page_cgroup *pc)
>  {
> -	VM_BUG_ON(!page_cgroup_locked(page));
> -	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
> +	return bit_spin_trylock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>  }
>  
> -struct page_cgroup *page_get_page_cgroup(struct page *page)
> +static inline void unlock_page_cgroup(struct page_cgroup *pc)
>  {
> -	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
> +	bit_spin_unlock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
>  }
>  
> -static void lock_page_cgroup(struct page *page)
> +/*
> + * Called from memmap_init_zone(), has the advantage of dealing with
> + * memory_hotplug (Addition of memory)
> + */
> +int page_cgroup_alloc(int n)
>  {
> -	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +	struct pglist_data *pgdat;
> +	unsigned long size, start, end;
> +
> +	if (mem_cgroup_subsys.disabled)
> +		return;
> +
> +	pgdat = NODE_DATA(n);
> +	/*
> +	 * Already allocated, leave
> +	 */
> +	if (pcg_map[n])
> +		return 0;
> +
> +	start = pgdat->node_start_pfn;
> +	end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
> +	size = (end - start) * sizeof(struct page_cgroup);
> +	printk("Allocating %lu bytes for node %d\n", size, n);
^^^^^

1. This is nonsense... do you know the memory map of IBM's (maybe ppc) machines?
A node's memory can be split into several pieces that are not ordered by node number.
Example:
   Node 0 | Node 1 | Node 2 | Node 1 | Node 2 |

This may look like a special case, but when I helped with SPARSEMEM and MEMORY_HOTPLUG
I saw many kinds of memory maps. As you wrote, this should be re-designed
(see the sketch after item 2 below).

2. If pre-allocating everything is OK, I'll stop my work; mine is of no use.
But you have to know that with pre-allocation we can't avoid the lru_lock by
batching, as in the pagevec technique: we can't delay the uncharge, because a
page can be reused soon after it is freed.
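For illustration only, here is a minimal sketch of a lookup keyed purely by
pfn, split into fixed-size sections, so interleaved node ranges and holes stop
mattering. The names (pcg_section, PCG_SECTION_SHIFT, MAX_PCG_SECTIONS,
pfn_to_page_cgroup) are invented for this sketch and are not in the patch:

#define PCG_SECTION_SHIFT	18	/* 2^18 4K pages = 1GB per section */
#define PCG_PER_SECTION		(1UL << PCG_SECTION_SHIFT)
#define MAX_PCG_SECTIONS	1024	/* covers 1TB of physical space here */

static struct page_cgroup *pcg_section[MAX_PCG_SECTIONS];

/* allocate the array covering 'pfn' once; bootmem panics on failure */
static void pcg_section_alloc(unsigned long pfn)
{
	unsigned long sec = pfn >> PCG_SECTION_SHIFT;

	if (!pcg_section[sec])
		pcg_section[sec] = alloc_bootmem(PCG_PER_SECTION *
						 sizeof(struct page_cgroup));
}

/* the lookup never consults NODE_DATA(), so node interleaving is harmless */
static struct page_cgroup *pfn_to_page_cgroup(unsigned long pfn)
{
	return &pcg_section[pfn >> PCG_SECTION_SHIFT]
			   [pfn & (PCG_PER_SECTION - 1)];
}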




> +	pcg_map[n] = alloc_bootmem_node(pgdat, size);
> +	/*
> +	 * We can do smoother recovery
> +	 */
> +	BUG_ON(!pcg_map[n]);
> +	return 0;
>  }
>  
> -static int try_lock_page_cgroup(struct page *page)
> +void page_cgroup_init(int nid, unsigned long pfn)
>  {
> -	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +	unsigned long node_pfn;
> +	struct page_cgroup *pc;
> +
> +	if (mem_cgroup_subsys.disabled)
> +		return;
> +
> +	node_pfn = pfn - NODE_DATA(nid)->node_start_pfn;
> +	pc = &pcg_map[nid][node_pfn];
> +
> +	BUG_ON(!pc);
> +	pc->flags = PAGE_CGROUP_FLAG_VALID;
> +	INIT_LIST_HEAD(&pc->lru);
> +	pc->page = NULL;
This NULL is unnecessary: pc->page == pfn_to_page(pfn) always, so it can be set here.
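A minimal sketch of the initializer with that change applied (this is the
suggestion, not the code in the patch):

void page_cgroup_init(int nid, unsigned long pfn)
{
	struct page_cgroup *pc;

	if (mem_cgroup_subsys.disabled)
		return;

	pc = &pcg_map[nid][pfn - NODE_DATA(nid)->node_start_pfn];
	pc->flags = PAGE_CGROUP_FLAG_VALID;
	INIT_LIST_HEAD(&pc->lru);
	pc->page = pfn_to_page(pfn);	/* always known, no NULL needed */
	pc->mem_cgroup = NULL;
}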


> +	pc->mem_cgroup = NULL;
>  }
>  
> -static void unlock_page_cgroup(struct page *page)
> +struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
> +						bool trylock)
>  {
> -	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +	struct page_cgroup *pc;
> +	int ret;
> +	int node = page_to_nid(page);
> +	unsigned long pfn;
> +
> +	pfn = page_to_pfn(page) - NODE_DATA(node)->node_start_pfn;
> +	pc = &pcg_map[node][pfn];
> +	BUG_ON(!(pc->flags & PAGE_CGROUP_FLAG_VALID));
> +	if (lock)
> +		lock_page_cgroup(pc);
> +	else if (trylock) {
> +		ret = trylock_page_cgroup(pc);
> +		if (!ret)
> +			pc = NULL;
> +	}
> +
> +	return pc;
> +}
> +
> +/*
> + * Should be called with page_cgroup lock held. Any additions to pc->flags
> + * should be reflected here. This might seem ugly, refine it later.
> + */
> +void page_clear_page_cgroup(struct page_cgroup *pc)
> +{
> +	pc->flags &= ~PAGE_CGROUP_FLAG_INUSE;
>  }
>  
>  static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
> @@ -377,17 +443,15 @@ void mem_cgroup_move_lists(struct page *
>  	 * safely get to page_cgroup without it, so just try_lock it:
>  	 * mem_cgroup_isolate_pages allows for page left on wrong list.
>  	 */
> -	if (!try_lock_page_cgroup(page))
> +	pc = page_get_page_cgroup_trylock(page);
> +	if (!pc)
>  		return;
>  
> -	pc = page_get_page_cgroup(page);
> -	if (pc) {
> -		mz = page_cgroup_zoneinfo(pc);
> -		spin_lock_irqsave(&mz->lru_lock, flags);
> -		__mem_cgroup_move_lists(pc, lru);
> -		spin_unlock_irqrestore(&mz->lru_lock, flags);
> -	}
> -	unlock_page_cgroup(page);
> +	mz = page_cgroup_zoneinfo(pc);
> +	spin_lock_irqsave(&mz->lru_lock, flags);
> +	__mem_cgroup_move_lists(pc, lru);
> +	spin_unlock_irqrestore(&mz->lru_lock, flags);
> +	unlock_page_cgroup(pc);

What race is this lock/unlock_page_cgroup protecting against?

>  }
>  
>  /*
> @@ -521,10 +585,6 @@ static int mem_cgroup_charge_common(stru
>  	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
>  	struct mem_cgroup_per_zone *mz;
>  
> -	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
> -	if (unlikely(pc == NULL))
> -		goto err;
> -
>  	/*
>  	 * We always charge the cgroup the mm_struct belongs to.
>  	 * The mm_struct's mem_cgroup changes on task migration if the
> @@ -567,43 +627,40 @@ static int mem_cgroup_charge_common(stru
>  		}
>  	}
>  
> +	pc = page_get_page_cgroup_locked(page);
> +	if (pc->flags & PAGE_CGROUP_FLAG_INUSE) {
> +		unlock_page_cgroup(pc);
> +		res_counter_uncharge(&mem->res, PAGE_SIZE);
> +		css_put(&mem->css);
> +		goto done;
> +	}
> +
Can this happen? Our direction should rather be:
VM_BUG_ON(pc->flags & PAGE_CGROUP_FLAG_INUSE);
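i.e. a sketch of how the charge path would read with that check, based on the
quoted hunk (not actual code):

	pc = page_get_page_cgroup_locked(page);
	/* a page being charged must not already carry a live charge */
	VM_BUG_ON(pc->flags & PAGE_CGROUP_FLAG_INUSE);
	pc->mem_cgroup = mem;
	pc->page = page;
	pc->flags |= PAGE_CGROUP_FLAG_INUSE;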



>  	pc->mem_cgroup = mem;
>  	pc->page = page;
> +	pc->flags |= PAGE_CGROUP_FLAG_INUSE;
> +
>  	/*
>  	 * If a page is accounted as a page cache, insert to inactive list.
>  	 * If anon, insert to active list.
>  	 */
>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
> -		pc->flags = PAGE_CGROUP_FLAG_CACHE;
> +		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
>  		if (page_is_file_cache(page))
>  			pc->flags |= PAGE_CGROUP_FLAG_FILE;
>  		else
>  			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
>  	} else
> -		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
> -
> -	lock_page_cgroup(page);
> -	if (unlikely(page_get_page_cgroup(page))) {
> -		unlock_page_cgroup(page);
> -		res_counter_uncharge(&mem->res, PAGE_SIZE);
> -		css_put(&mem->css);
> -		kmem_cache_free(page_cgroup_cache, pc);
> -		goto done;
> -	}
> -	page_assign_page_cgroup(page, pc);
> +		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
>  
>  	mz = page_cgroup_zoneinfo(pc);
>  	spin_lock_irqsave(&mz->lru_lock, flags);
>  	__mem_cgroup_add_list(mz, pc);
>  	spin_unlock_irqrestore(&mz->lru_lock, flags);
> -
> -	unlock_page_cgroup(page);
> +	unlock_page_cgroup(pc);

What kind of race is this lock/unlock_page_cgroup for?

Thanks,
-Kame

