Date: Wed, 24 Sep 2008 08:48:39 +0900
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
Cc: "linux-mm@...ck.org" <linux-mm@...ck.org>,
	"balbir@...ux.vnet.ibm.com" <balbir@...ux.vnet.ibm.com>,
	"nishimura@....nes.nec.co.jp" <nishimura@....nes.nec.co.jp>,
	"xemul@...nvz.org" <xemul@...nvz.org>,
	LKML <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH 9/13] memcg: lookup page cgroup (and remove pointer from struct page)

After sleeping on it all day, I changed my mind and decided to drop this.
It seems no one likes this approach. I'll add FLATMEM/DISCONTIGMEM/SPARSEMEM
support directly. I have already wasted a month on this uninteresting work
and want to fix it soon.

I'd be glad if people helped me test FLATMEM/DISCONTIGMEM/SPARSEMEM, because
there are various kinds of memory map and I have only an x86-64 box.

Thanks,
-Kame

On Mon, 22 Sep 2008 20:12:06 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com> wrote:

> Remove the page_cgroup pointer from struct page.
> 
> This patch removes the page_cgroup pointer from struct page and instead
> makes the page_cgroup reachable from the pfn. The relationship between
> them is
> 
> Before this:
> 	pfn <-> struct page <-> struct page_cgroup
> After this:
> 	struct page <-> pfn -> struct page_cgroup -> struct page
> 
> The benefit of this approach is that we can remove 8 bytes from struct page.
> 
> Other changes are:
>  - lock/unlock_page_cgroup() uses its own bit on struct page_cgroup.
>  - all necessary page_cgroups are allocated at boot.
> 
> Characteristics:
>  - page_cgroups are allocated in chunks. This patch uses SECTION_SIZE as
>    the chunk size if 64bit/SPARSEMEM is enabled. If not, an appropriate
>    default is selected.
>  - all page_cgroup chunks are maintained in a hash. I think we have two
>    ways to handle a sparse index in general: a radix tree and a hash.
>    This uses a hash because a radix tree's layout is affected by the
>    memory map's layout.
>  - page_cgroup.h/page_cgroup.c are added.
> 
> Changelog: v3 -> v4
>  - changed the argument to lookup_page_cgroup() from "page" to "pfn".
> 
> Changelog: v2 -> v3
>  - changed arguments from pfn to struct page *.
>  - added a memory hotplug callback (no undo yet... needs more work.)
>  - adjusted to the new mmotm.
> 
> Changelog: v1 -> v2
>  - fixed memory allocation failure at boot to panic with a good message.
>  - rewrote the charge/uncharge path (no changes in logic.)
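[Editor's note: the chunk-and-hash scheme described above reduces to two
steps of index arithmetic: the pfn is shifted down to a chunk index, and
the chunk index is hashed to a bucket that holds the chunk. Below is a
minimal user-space sketch of just that arithmetic; the constants and the
multiplicative hash are simplified stand-ins (the patch itself derives
the chunk size from SECTION_SIZE_BITS and uses the kernel's hash_long()),
so treat it as an illustration, not kernel code.

	/* Sketch of the pfn -> chunk -> hash-bucket mapping (assumed values). */
	#include <stdio.h>

	#define ENTS_PER_CHUNK_SHIFT	14	/* 16384 pages per chunk */
	#define PCG_HASHSIZE		1024	/* power of two, like the patch */

	static unsigned long chunk_index(unsigned long pfn)
	{
		return pfn >> ENTS_PER_CHUNK_SHIFT; /* which chunk covers this pfn */
	}

	static unsigned long hash_bucket(unsigned long index)
	{
		/* stand-in for hash_long(index, pcg_hashshift) */
		return (index * 2654435761UL) & (PCG_HASHSIZE - 1);
	}

	int main(void)
	{
		unsigned long pfn = 0x123456;
		unsigned long idx = chunk_index(pfn);

		/*
		 * Each chunk stores ent->map pre-biased by (index << SHIFT),
		 * so once the chunk with a matching index is found in bucket
		 * hash_bucket(idx), the lookup is simply "ent->map + pfn".
		 */
		printf("pfn %#lx -> chunk %lu -> bucket %lu\n",
		       pfn, idx, hash_bucket(idx));
		return 0;
	}

End of editor's note.]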
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
> 
>  include/linux/mm_types.h    |    4 
>  include/linux/page_cgroup.h |   89 +++++++++++++++
>  mm/Makefile                 |    2 
>  mm/memcontrol.c             |  251 +++++++++++---------------------------
>  mm/page_alloc.c             |    9 -
>  mm/page_cgroup.c            |  235 +++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 394 insertions(+), 196 deletions(-)
> 
> Index: mmotm-2.6.27-rc6+/mm/page_cgroup.c
> ===================================================================
> --- /dev/null
> +++ mmotm-2.6.27-rc6+/mm/page_cgroup.c
> @@ -0,0 +1,235 @@
> +#include <linux/mm.h>
> +#include <linux/rcupdate.h>
> +#include <linux/rculist.h>
> +#include <linux/bootmem.h>
> +#include <linux/bit_spinlock.h>
> +#include <linux/page_cgroup.h>
> +#include <linux/hash.h>
> +#include <linux/memory.h>
> +
> +
> +
> +struct pcg_hash_head {
> +	spinlock_t	lock;
> +	struct hlist_head head;
> +};
> +
> +static struct pcg_hash_head *pcg_hashtable __read_mostly;
> +
> +struct pcg_hash {
> +	struct hlist_node node;
> +	unsigned long	index;
> +	struct page_cgroup *map;
> +};
> +
> +#if BITS_PER_LONG == 32 /* we use kmalloc() */
> +#define ENTS_PER_CHUNK_SHIFT	(7)
> +const bool chunk_vmalloc = false;
> +#else /* we'll use vmalloc */
> +#ifdef SECTION_SIZE_BITS
> +#define ENTS_PER_CHUNK_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
> +#else
> +#define ENTS_PER_CHUNK_SHIFT	(14) /* covers 128MB on x86-64 */
> +#endif
> +const bool chunk_vmalloc = true;
> +#endif
> +
> +#define ENTS_PER_CHUNK		(1 << (ENTS_PER_CHUNK_SHIFT))
> +#define ENTS_PER_CHUNK_MASK	(ENTS_PER_CHUNK - 1)
> +
> +static int pcg_hashshift __read_mostly;
> +static int pcg_hashmask __read_mostly;
> +
> +#define PCG_HASHSHIFT	(pcg_hashshift)
> +#define PCG_HASHMASK	(pcg_hashmask)
> +#define PCG_HASHSIZE	(1 << pcg_hashshift)
> +
> +static int pcg_hashfun(unsigned long index)
> +{
> +	return hash_long(index, pcg_hashshift);
> +}
> +
> +struct page_cgroup *lookup_page_cgroup(unsigned long pfn)
> +{
> +	unsigned long index = pfn >> ENTS_PER_CHUNK_SHIFT;
> +	struct pcg_hash *ent;
> +	struct pcg_hash_head *head;
> +	struct hlist_node *node;
> +	struct page_cgroup *pc = NULL;
> +	int hnum;
> +
> +	hnum = pcg_hashfun(index);
> +	head = pcg_hashtable + hnum;
> +	rcu_read_lock();
> +	hlist_for_each_entry(ent, node, &head->head, node) {
> +		if (ent->index == index) {
> +			pc = ent->map + pfn;
> +			break;
> +		}
> +	}
> +	rcu_read_unlock();
> +	return pc;
> +}
> +
> +static int __meminit alloc_page_cgroup(int node, unsigned long index)
> +{
> +	struct pcg_hash *ent;
> +	struct pcg_hash_head *head;
> +	struct page_cgroup *pc;
> +	unsigned long flags, base;
> +	int hnum, i;
> +	int mapsize = sizeof(struct page_cgroup) * ENTS_PER_CHUNK;
> +
> +	if (lookup_page_cgroup(index << ENTS_PER_CHUNK_SHIFT))
> +		return 0;
> +
> +	if (!chunk_vmalloc) {
> +		int ent_size = sizeof(*ent) + mapsize;
> +		ent = kmalloc_node(ent_size, GFP_KERNEL, node);
> +		if (!ent)
> +			return 1;
> +		pc = (void *)(ent + 1);
> +	} else {
> +		ent = kmalloc_node(sizeof(*ent), GFP_KERNEL, node);
> +		if (!ent)
> +			return 1;
> +		pc = vmalloc_node(mapsize, node);
> +		if (!pc) {
> +			kfree(ent);
> +			return 1;
> +		}
> +	}
> +	ent->map = pc - (index << ENTS_PER_CHUNK_SHIFT);
> +	ent->index = index;
> +	INIT_HLIST_NODE(&ent->node);
> +
> +	for (base = index << ENTS_PER_CHUNK_SHIFT, i = 0;
> +	     i < ENTS_PER_CHUNK; i++) {
> +		unsigned long pfn = base + i;
> +		pc = ent->map + pfn;
> +		pc->page = pfn_to_page(pfn);
> +		pc->mem_cgroup = NULL;
> +		pc->flags = 0;
> +	}
> +
> +	hnum = pcg_hashfun(index);
> +	head = &pcg_hashtable[hnum];
> +	spin_lock_irqsave(&head->lock, flags);
> +	hlist_add_head_rcu(&ent->node, &head->head);
> +	spin_unlock_irqrestore(&head->lock, flags);
> +	return 0;
> +}
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +
> +int online_page_cgroup(unsigned long start_pfn,
> +			unsigned long nr_pages,
> +			int nid)
> +{
> +	unsigned long index, end_pfn, start, end;
> +	int fail = 0;
> +
> +	end_pfn = start_pfn + nr_pages;
> +	start = start_pfn >> ENTS_PER_CHUNK_SHIFT;
> +	end = (end_pfn + ENTS_PER_CHUNK - 1) >> ENTS_PER_CHUNK_SHIFT;
> +
> +	for (index = start; (!fail) && (index < end); index++) {
> +		unsigned long pfn = index << ENTS_PER_CHUNK_SHIFT;
> +		if (lookup_page_cgroup(pfn))
> +			continue;
> +		fail = alloc_page_cgroup(nid, index);
> +	}
> +	return fail;
> +}
> +
> +static int pcg_memory_callback(struct notifier_block *self,
> +				unsigned long action, void *arg)
> +{
> +	struct memory_notify *mn = arg;
> +	int ret = 0;
> +	switch (action) {
> +	case MEM_GOING_ONLINE:
> +		ret = online_page_cgroup(mn->start_pfn,
> +				mn->nr_pages, mn->status_change_nid);
> +		break;
> +	case MEM_GOING_OFFLINE:
> +		break;
> +	case MEM_CANCEL_ONLINE:
> +	case MEM_OFFLINE:
> +	case MEM_ONLINE:
> +	case MEM_CANCEL_OFFLINE:
> +		break;
> +	}
> +	ret = notifier_from_errno(ret);
> +	return ret;
> +}
> +
> +#endif
> +
> +/* Called from mem_cgroup's initialization */
> +void __init page_cgroup_init(void)
> +{
> +	struct pcg_hash_head *head;
> +	int node, i, fail;
> +	unsigned long start, pfn, end, index, offset;
> +	long default_pcg_hash_size;
> +
> +	/* we don't need too large a hash */
> +	default_pcg_hash_size = (max_pfn/ENTS_PER_CHUNK);
> +	default_pcg_hash_size *= 2;
> +	/* if too big, use automatic calculation */
> +	if (default_pcg_hash_size > 1024 * 1024)
> +		default_pcg_hash_size = 0;
> +
> +	pcg_hashtable = alloc_large_system_hash("PageCgroup Hash",
> +			sizeof(struct pcg_hash_head),
> +			default_pcg_hash_size,
> +			13,
> +			0,
> +			&pcg_hashshift,
> +			&pcg_hashmask,
> +			0);
> +	if (!pcg_hashtable) {
> +		fail = 1;
> +		goto nomem;
> +	}
> +
> +	for (i = 0; i < PCG_HASHSIZE; i++) {
> +		head = &pcg_hashtable[i];
> +		spin_lock_init(&head->lock);
> +		INIT_HLIST_HEAD(&head->head);
> +	}
> +
> +	fail = 0;
> +	for_each_node(node) {
> +		start = NODE_DATA(node)->node_start_pfn;
> +		end = start + NODE_DATA(node)->node_spanned_pages;
> +		start >>= ENTS_PER_CHUNK_SHIFT;
> +		end = (end + ENTS_PER_CHUNK - 1) >> ENTS_PER_CHUNK_SHIFT;
> +		for (index = start; (!fail) && (index < end); index++) {
> +			pfn = index << ENTS_PER_CHUNK_SHIFT;
> +			/*
> +			 * Usually, this loop breaks at offset=0.
> +			 * Handle the case of a hole within MAX_ORDER
> +			 * (ia64 only...)
> +			 */
> +			for (offset = 0; offset < ENTS_PER_CHUNK; offset++) {
> +				if (pfn_valid(pfn + offset)) {
> +					fail = alloc_page_cgroup(node, index);
> +					break;
> +				}
> +			}
> +		}
> +		if (fail)
> +			break;
> +	}
> +
> +	hotplug_memory_notifier(pcg_memory_callback, 0);
> +nomem:
> +	if (fail) {
> +		printk("Not enough memory for memory resource controller.\n");
> +		panic("please try the cgroup_disable=memory boot option.");
> +	}
> +	return;
> +}
> +
> +
> Index: mmotm-2.6.27-rc6+/include/linux/mm_types.h
> ===================================================================
> --- mmotm-2.6.27-rc6+.orig/include/linux/mm_types.h
> +++ mmotm-2.6.27-rc6+/include/linux/mm_types.h
> @@ -94,10 +94,6 @@ struct page {
>  	void *virtual;			/* Kernel virtual address (NULL if
>  					   not kmapped, ie. highmem) */
>  #endif /* WANT_PAGE_VIRTUAL */
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> -	unsigned long page_cgroup;
> -#endif
> -
>  #ifdef CONFIG_KMEMCHECK
>  	void *shadow;
>  #endif
> 
> Index: mmotm-2.6.27-rc6+/mm/Makefile
> ===================================================================
> --- mmotm-2.6.27-rc6+.orig/mm/Makefile
> +++ mmotm-2.6.27-rc6+/mm/Makefile
> @@ -34,6 +34,6 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
>  obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_SMP) += allocpercpu.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
> -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
> +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
>  obj-$(CONFIG_CGROUP_MEMRLIMIT_CTLR) += memrlimitcgroup.o
>  obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
> Index: mmotm-2.6.27-rc6+/include/linux/page_cgroup.h
> ===================================================================
> --- /dev/null
> +++ mmotm-2.6.27-rc6+/include/linux/page_cgroup.h
> @@ -0,0 +1,89 @@
> +#ifndef __LINUX_PAGE_CGROUP_H
> +#define __LINUX_PAGE_CGROUP_H
> +
> +/*
> + * Page Cgroup can be considered as an extended mem_map.
> + * A page_cgroup is associated with every page descriptor. The
> + * page_cgroup helps us identify information about the cgroup.
> + * All page_cgroups are allocated at boot or at a memory hotplug event,
> + * so the page_cgroup for a pfn always exists.
> + */
> +struct page_cgroup {
> +	unsigned long flags;
> +	struct mem_cgroup *mem_cgroup;
> +	struct page *page;
> +	struct list_head lru;		/* per cgroup LRU list */
> +};
> +
> +void __init page_cgroup_init(void);
> +struct page_cgroup *lookup_page_cgroup(unsigned long pfn);
> +
> +enum {
> +	/* flags for mem_cgroup */
> +	PCG_LOCK,  /* page cgroup is locked */
> +	PCG_CACHE, /* charged as cache */
> +	PCG_USED,  /* this object is in use */
> +	/* flags for LRU placement */
> +	PCG_ACTIVE, /* page is active in this cgroup */
> +	PCG_FILE, /* page is file system backed */
> +	PCG_UNEVICTABLE, /* page is unevictable */
> +};
> +
> +#define TESTPCGFLAG(uname, lname)			\
> +static inline int PageCgroup##uname(struct page_cgroup *pc)	\
> +	{ return test_bit(PCG_##lname, &pc->flags); }
> +
> +#define SETPCGFLAG(uname, lname)			\
> +static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
> +	{ set_bit(PCG_##lname, &pc->flags); }
> +
> +#define CLEARPCGFLAG(uname, lname)			\
> +static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
> +	{ clear_bit(PCG_##lname, &pc->flags); }
> +
> +/* Cache flag is set only once (at allocation) */
> +TESTPCGFLAG(Cache, CACHE)
> +
> +TESTPCGFLAG(Used, USED)
> +CLEARPCGFLAG(Used, USED)
> +
> +/* LRU management flags (from global-lru definition) */
> +TESTPCGFLAG(File, FILE)
> +SETPCGFLAG(File, FILE)
> +CLEARPCGFLAG(File, FILE)
> +
> +TESTPCGFLAG(Active, ACTIVE)
> +SETPCGFLAG(Active, ACTIVE)
> +CLEARPCGFLAG(Active, ACTIVE)
> +
> +TESTPCGFLAG(Unevictable, UNEVICTABLE)
> +SETPCGFLAG(Unevictable, UNEVICTABLE)
> +CLEARPCGFLAG(Unevictable, UNEVICTABLE)
> +
> +static inline int page_cgroup_nid(struct page_cgroup *pc)
> +{
> +	return page_to_nid(pc->page);
> +}
> +
> +static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
> +{
> +	return page_zonenum(pc->page);
> +}
> +
> +static inline void lock_page_cgroup(struct page_cgroup *pc)
> +{
> +	bit_spin_lock(PCG_LOCK, &pc->flags);
> +}
> +
> +static inline int trylock_page_cgroup(struct page_cgroup *pc)
> +{
> +	return bit_spin_trylock(PCG_LOCK, &pc->flags);
> +}
> +
> +static inline void unlock_page_cgroup(struct page_cgroup *pc)
> +{
> +	bit_spin_unlock(PCG_LOCK, &pc->flags);
> +}
> +
> +
> +#endif
> Index: mmotm-2.6.27-rc6+/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.27-rc6+.orig/mm/memcontrol.c
> +++ mmotm-2.6.27-rc6+/mm/memcontrol.c
> @@ -34,11 +34,11 @@
>  #include <linux/seq_file.h>
>  #include <linux/vmalloc.h>
>  #include <linux/mm_inline.h>
> +#include <linux/page_cgroup.h>
>  
>  #include <asm/uaccess.h>
>  
>  struct cgroup_subsys mem_cgroup_subsys __read_mostly;
> -static struct kmem_cache *page_cgroup_cache __read_mostly;
>  #define MEM_CGROUP_RECLAIM_RETRIES	5
>  
>  /*
> @@ -138,80 +138,6 @@ static struct mem_cgroup init_mem_cgroup
>  
>  #define is_root_cgroup(cgrp)	((cgrp) == &init_mem_cgroup)
>  
> -
> -/*
> - * We use the lower bit of the page->page_cgroup pointer as a bit spin
> - * lock.  We need to ensure that page->page_cgroup is at least two
> - * byte aligned (based on comments from Nick Piggin).  But since
> - * bit_spin_lock doesn't actually set that lock bit in a non-debug
> - * uniprocessor kernel, we should avoid setting it here too.
> - */
> -#define PAGE_CGROUP_LOCK_BIT	0x0
> -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> -#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
> -#else
> -#define PAGE_CGROUP_LOCK	0x0
> -#endif
> -
> -/*
> - * A page_cgroup page is associated with every page descriptor.  The
> - * page_cgroup helps us identify information about the cgroup
> - */
> -struct page_cgroup {
> -	struct list_head lru;		/* per cgroup LRU list */
> -	struct page *page;
> -	struct mem_cgroup *mem_cgroup;
> -	unsigned long flags;
> -};
> -
> -enum {
> -	/* flags for mem_cgroup */
> -	PCG_CACHE, /* charged as cache */
> -	/* flags for LRU placement */
> -	PCG_ACTIVE, /* page is active in this cgroup */
> -	PCG_FILE, /* page is file system backed */
> -	PCG_UNEVICTABLE, /* page is unevictableable */
> -};
> -
> -#define TESTPCGFLAG(uname, lname)			\
> -static inline int PageCgroup##uname(struct page_cgroup *pc)	\
> -	{ return test_bit(PCG_##lname, &pc->flags); }
> -
> -#define SETPCGFLAG(uname, lname)			\
> -static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
> -	{ set_bit(PCG_##lname, &pc->flags); }
> -
> -#define CLEARPCGFLAG(uname, lname)			\
> -static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
> -	{ clear_bit(PCG_##lname, &pc->flags); }
> -
> -
> -/* Cache flag is set only once (at allocation) */
> -TESTPCGFLAG(Cache, CACHE)
> -
> -/* LRU management flags (from global-lru definition) */
> -TESTPCGFLAG(File, FILE)
> -SETPCGFLAG(File, FILE)
> -CLEARPCGFLAG(File, FILE)
> -
> -TESTPCGFLAG(Active, ACTIVE)
> -SETPCGFLAG(Active, ACTIVE)
> -CLEARPCGFLAG(Active, ACTIVE)
> -
> -TESTPCGFLAG(Unevictable, UNEVICTABLE)
> -SETPCGFLAG(Unevictable, UNEVICTABLE)
> -CLEARPCGFLAG(Unevictable, UNEVICTABLE)
> -
> -static int page_cgroup_nid(struct page_cgroup *pc)
> -{
> -	return page_to_nid(pc->page);
> -}
> -
> -static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
> -{
> -	return page_zonenum(pc->page);
> -}
> -
>  enum charge_type {
>  	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
>  	MEM_CGROUP_CHARGE_TYPE_MAPPED,
> @@ -222,9 +148,9 @@ enum charge_type {
>  
>  static const unsigned long
>  pcg_default_flags[NR_CHARGE_TYPE] = {
> -	((1 << PCG_CACHE) | (1 << PCG_FILE)),
> -	((1 << PCG_ACTIVE)),
> -	((1 << PCG_ACTIVE) | (1 << PCG_CACHE)),
> +	(1 << PCG_CACHE) | (1 << PCG_FILE) | (1 << PCG_USED) | (1 << PCG_LOCK),
> +	(1 << PCG_ACTIVE) | (1 << PCG_LOCK) | (1 << PCG_USED),
> +	(1 << PCG_ACTIVE) | (1 << PCG_CACHE) | (1 << PCG_USED)| (1 << PCG_LOCK),
>  	0,
>  };
>  
> @@ -307,37 +233,6 @@ struct mem_cgroup *mem_cgroup_from_task(
>  			struct mem_cgroup, css);
>  }
>  
> -static inline int page_cgroup_locked(struct page *page)
> -{
> -	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
> -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
> -{
> -	VM_BUG_ON(!page_cgroup_locked(page));
> -	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
> -}
> -
> -struct page_cgroup *page_get_page_cgroup(struct page *page)
> -{
> -	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
> -}
> -
> -static void lock_page_cgroup(struct page *page)
> -{
> -	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
> -static int try_lock_page_cgroup(struct page *page)
> -{
> -	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
> -static void unlock_page_cgroup(struct page *page)
> -{
> -	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
>  static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
>  			struct page_cgroup *pc)
>  {
> @@ -441,22 +336,19 @@ void mem_cgroup_move_lists(struct page *
>  	 * safely get to page_cgroup without it, so just try_lock it:
>  	 * mem_cgroup_isolate_pages allows for page left on wrong list.
>  	 */
> -	if (!try_lock_page_cgroup(page))
> +	pc = lookup_page_cgroup(page_to_pfn(page));
> +
> +	if (!trylock_page_cgroup(pc))
>  		return;
>  
> -	pc = page_get_page_cgroup(page);
> -	if (pc) {
> +	if (PageCgroupUsed(pc)) {
>  		mem = pc->mem_cgroup;
>  		mz = page_cgroup_zoneinfo(pc);
>  		spin_lock_irqsave(&mz->lru_lock, flags);
> -		/*
> -		 * check against the race with move_account.
> -		 */
> -		if (likely(mem == pc->mem_cgroup))
> -			__mem_cgroup_move_lists(pc, lru);
> +		__mem_cgroup_move_lists(pc, lru);
>  		spin_unlock_irqrestore(&mz->lru_lock, flags);
>  	}
> -	unlock_page_cgroup(page);
> +	unlock_page_cgroup(pc);
>  }
>  
>  /*
> @@ -543,6 +435,8 @@ unsigned long mem_cgroup_isolate_pages(u
>  	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
>  		if (scan >= nr_to_scan)
>  			break;
> +		if (unlikely(!PageCgroupUsed(pc)))
> +			continue;
>  		page = pc->page;
>  
>  		if (unlikely(!PageLRU(page)))
> @@ -611,12 +505,12 @@ int mem_cgroup_move_account(struct page 
>  		/* Now, we assume no_limit...no failure here. */
>  		return ret;
>  	}
> -	if (!try_lock_page_cgroup(page)) {
> +	if (!trylock_page_cgroup(pc)) {
>  		res_counter_uncharge(&to->res, PAGE_SIZE);
>  		return ret;
>  	}
>  
> -	if (page_get_page_cgroup(page) != pc) {
> +	if (!PageCgroupUsed(pc)) {
>  		res_counter_uncharge(&to->res, PAGE_SIZE);
>  		goto out;
>  	}
> @@ -634,7 +528,7 @@ int mem_cgroup_move_account(struct page 
>  		res_counter_uncharge(&to->res, PAGE_SIZE);
>  	}
>  out:
> -	unlock_page_cgroup(page);
> +	unlock_page_cgroup(pc);
>  
>  	return ret;
>  }
> @@ -651,26 +545,27 @@ static int mem_cgroup_charge_common(stru
>  {
>  	struct mem_cgroup *mem;
>  	struct page_cgroup *pc;
> -	unsigned long flags;
>  	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
>  	struct mem_cgroup_per_zone *mz;
> +	unsigned long flags;
>  
> -	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
> -	if (unlikely(pc == NULL))
> -		goto err;
> -
> +	pc = lookup_page_cgroup(page_to_pfn(page));
> +	/* can happen at boot */
> +	if (unlikely(!pc))
> +		return 0;
> +	prefetchw(pc);
>  	/*
>  	 * We always charge the cgroup the mm_struct belongs to.
>  	 * The mm_struct's mem_cgroup changes on task migration if the
>  	 * thread group leader migrates. It's possible that mm is not
>  	 * set, if so charge the init_mm (happens for pagecache usage).
>  	 */
> +
>  	if (likely(!memcg)) {
>  		rcu_read_lock();
>  		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
>  		if (unlikely(!mem)) {
>  			rcu_read_unlock();
> -			kmem_cache_free(page_cgroup_cache, pc);
>  			return 0;
>  		}
>  		/*
> @@ -706,36 +601,34 @@ static int mem_cgroup_charge_common(stru
>  		}
>  	}
>  
> +	preempt_disable();
> +	lock_page_cgroup(pc);
> +	if (unlikely(PageCgroupUsed(pc))) {
> +		unlock_page_cgroup(pc);
> +		res_counter_uncharge(&mem->res, PAGE_SIZE);
> +		css_put(&mem->css);
> +		preempt_enable();
> +		goto done;
> +	}
>  	pc->mem_cgroup = mem;
> -	pc->page = page;
>  	/*
>  	 * If a page is accounted as a page cache, insert to inactive list.
>  	 * If anon, insert to active list.
>  	 */
>  	pc->flags = pcg_default_flags[ctype];
>  
> -	lock_page_cgroup(page);
> -	if (unlikely(page_get_page_cgroup(page))) {
> -		unlock_page_cgroup(page);
> -		res_counter_uncharge(&mem->res, PAGE_SIZE);
> -		css_put(&mem->css);
> -		kmem_cache_free(page_cgroup_cache, pc);
> -		goto done;
> -	}
> -	page_assign_page_cgroup(page, pc);
> -
>  	mz = page_cgroup_zoneinfo(pc);
> +
>  	spin_lock_irqsave(&mz->lru_lock, flags);
>  	__mem_cgroup_add_list(mz, pc);
>  	spin_unlock_irqrestore(&mz->lru_lock, flags);
> +	unlock_page_cgroup(pc);
> +	preempt_enable();
>  
> -	unlock_page_cgroup(page);
>  done:
>  	return 0;
>  out:
>  	css_put(&mem->css);
> -	kmem_cache_free(page_cgroup_cache, pc);
> -err:
>  	return -ENOMEM;
>  }
>  
> @@ -743,7 +636,8 @@ int mem_cgroup_charge(struct page *page,
>  {
>  	if (mem_cgroup_subsys.disabled)
>  		return 0;
> -
> +	if (PageCompound(page))
> +		return 0;
>  	/*
>  	 * If already mapped, we don't have to account.
>  	 * If page cache, page->mapping has address_space.
> @@ -764,7 +658,8 @@ int mem_cgroup_cache_charge(struct page 
>  {
>  	if (mem_cgroup_subsys.disabled)
>  		return 0;
> -
> +	if (PageCompound(page))
> +		return 0;
>  	/*
>  	 * Corner case handling. This is called from add_to_page_cache()
>  	 * in usual. But some FS (shmem) precharges this page before calling it
> @@ -777,15 +672,16 @@ int mem_cgroup_cache_charge(struct page 
>  	if (!(gfp_mask & __GFP_WAIT)) {
>  		struct page_cgroup *pc;
>  
> -		lock_page_cgroup(page);
> -		pc = page_get_page_cgroup(page);
> -		if (pc) {
> -			VM_BUG_ON(pc->page != page);
> -			VM_BUG_ON(!pc->mem_cgroup);
> -			unlock_page_cgroup(page);
> +
> +		pc = lookup_page_cgroup(page_to_pfn(page));
> +		if (!pc)
> +			return 0;
> +		lock_page_cgroup(pc);
> +		if (PageCgroupUsed(pc)) {
> +			unlock_page_cgroup(pc);
>  			return 0;
>  		}
> -		unlock_page_cgroup(page);
> +		unlock_page_cgroup(pc);
>  	}
>  
>  	if (unlikely(!mm))
> @@ -808,53 +704,46 @@ __mem_cgroup_uncharge_common(struct page
>  	struct page_cgroup *pc;
>  	struct mem_cgroup *mem;
>  	struct mem_cgroup_per_zone *mz;
> +	unsigned long pfn = page_to_pfn(page);
>  	unsigned long flags;
>  
>  	if (mem_cgroup_subsys.disabled)
>  		return;
> +	/* check the condition we can know from page */
>  
> -	/*
> -	 * Check if our page_cgroup is valid
> -	 */
> -	lock_page_cgroup(page);
> -	pc = page_get_page_cgroup(page);
> -	if (unlikely(!pc))
> -		goto unlock;
> -
> -	VM_BUG_ON(pc->page != page);
> +	pc = lookup_page_cgroup(pfn);
> +	if (unlikely(!pc || !PageCgroupUsed(pc)))
> +		return;
> +	preempt_disable();
> +	lock_page_cgroup(pc);
> +	if (unlikely(page_mapped(page))) {
> +		unlock_page_cgroup(pc);
> +		preempt_enable();
> +		return;
> +	}
> +	ClearPageCgroupUsed(pc);
> +	unlock_page_cgroup(pc);
>  
> -	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
> -	    && ((PageCgroupCache(pc) || page_mapped(page))))
> -		goto unlock;
> -retry:
>  	mem = pc->mem_cgroup;
>  	mz = page_cgroup_zoneinfo(pc);
> +
>  	spin_lock_irqsave(&mz->lru_lock, flags);
> -	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED &&
> -	    unlikely(mem != pc->mem_cgroup)) {
> -		/* MAPPED account can be done without lock_page().
> -		   Check race with mem_cgroup_move_account() */
> -		spin_unlock_irqrestore(&mz->lru_lock, flags);
> -		goto retry;
> -	}
>  	__mem_cgroup_remove_list(mz, pc);
>  	spin_unlock_irqrestore(&mz->lru_lock, flags);
> -
> -	page_assign_page_cgroup(page, NULL);
> -	unlock_page_cgroup(page);
> -
> -
> -	res_counter_uncharge(&mem->res, PAGE_SIZE);
> +	pc->mem_cgroup = NULL;
>  	css_put(&mem->css);
> +	preempt_enable();
> +	res_counter_uncharge(&mem->res, PAGE_SIZE);
>  
> -	kmem_cache_free(page_cgroup_cache, pc);
>  	return;
> -unlock:
> -	unlock_page_cgroup(page);
>  }
>  
>  void mem_cgroup_uncharge_page(struct page *page)
>  {
> +	if (page_mapped(page))
> +		return;
> +	if (page->mapping && !PageAnon(page))
> +		return;
>  	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
>  }
>  
> @@ -878,9 +767,9 @@ int mem_cgroup_prepare_migration(struct 
>  	if (mem_cgroup_subsys.disabled)
>  		return 0;
>  
> -	lock_page_cgroup(page);
> -	pc = page_get_page_cgroup(page);
> -	if (pc) {
> +	pc = lookup_page_cgroup(page_to_pfn(page));
> +	lock_page_cgroup(pc);
> +	if (PageCgroupUsed(pc)) {
>  		mem = pc->mem_cgroup;
>  		css_get(&mem->css);
>  		if (PageCgroupCache(pc)) {
> @@ -890,7 +779,7 @@ int mem_cgroup_prepare_migration(struct 
>  			ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
>  		}
>  	}
> -	unlock_page_cgroup(page);
> +	unlock_page_cgroup(pc);
>  	if (mem) {
>  		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
>  			ctype, mem);
> @@ -1271,8 +1160,8 @@ mem_cgroup_create(struct cgroup_subsys *
>  	int node;
>  
>  	if (unlikely((cont->parent) == NULL)) {
> +		page_cgroup_init();
>  		mem = &init_mem_cgroup;
> -		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
>  	} else {
>  		mem = mem_cgroup_alloc();
>  		if (!mem)
> Index: mmotm-2.6.27-rc6+/mm/page_alloc.c
> ===================================================================
> --- mmotm-2.6.27-rc6+.orig/mm/page_alloc.c
> +++ mmotm-2.6.27-rc6+/mm/page_alloc.c
> @@ -223,17 +223,12 @@ static inline int bad_range(struct zone 
>  
>  static void bad_page(struct page *page)
>  {
> -	void *pc = page_get_page_cgroup(page);
> -
>  	printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
>  		"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
>  		current->comm, page, (int)(2*sizeof(unsigned long)),
>  		(unsigned long)page->flags, page->mapping,
>  		page_mapcount(page), page_count(page));
> -	if (pc) {
> -		printk(KERN_EMERG "cgroup:%p\n", pc);
> -		page_reset_bad_cgroup(page);
> -	}
> +
>  	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
>  		KERN_EMERG "Backtrace:\n");
>  	dump_stack();
> @@ -472,7 +467,6 @@ static inline void free_pages_check(stru
>  		free_page_mlock(page);
>  	if (unlikely(page_mapcount(page) |
>  		(page->mapping != NULL)  |
> -		(page_get_page_cgroup(page) != NULL) |
>  		(page_count(page) != 0)  |
>  		(page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
>  		bad_page(page);
> @@ -609,7 +603,6 @@ static void prep_new_page(struct page *p
>  {
>  	if (unlikely(page_mapcount(page) |
>  		(page->mapping != NULL)  |
> -		(page_get_page_cgroup(page) != NULL) |
>  		(page_count(page) != 0)  |
>  		(page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
>  		bad_page(page);
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
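[Editor's note: every memcontrol.c hunk above follows one pattern: look
up the preallocated page_cgroup for the pfn, take the PCG_LOCK bit
spinlock that now lives in page_cgroup->flags, and test PCG_USED where
the old code tested page->page_cgroup for NULL. Below is a condensed
user-space model of that pattern using C11 atomics as a stand-in for the
kernel's bit_spin_lock(); the PCG_* names mirror the patch, but the rest
is simplified scaffolding, not the actual kernel code.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	enum { PCG_LOCK, PCG_USED };		/* bit numbers, as in the patch */

	struct page_cgroup {
		atomic_ulong flags;		/* lock bit and state bits share one word */
	};

	static bool trylock_page_cgroup(struct page_cgroup *pc)
	{
		/* models bit_spin_trylock(PCG_LOCK, &pc->flags) */
		unsigned long old = atomic_fetch_or(&pc->flags, 1UL << PCG_LOCK);
		return !(old & (1UL << PCG_LOCK));
	}

	static void unlock_page_cgroup(struct page_cgroup *pc)
	{
		/* models bit_spin_unlock(PCG_LOCK, &pc->flags) */
		atomic_fetch_and(&pc->flags, ~(1UL << PCG_LOCK));
	}

	static bool page_cgroup_used(struct page_cgroup *pc)
	{
		/* models PageCgroupUsed(pc) */
		return atomic_load(&pc->flags) & (1UL << PCG_USED);
	}

	int main(void)
	{
		struct page_cgroup pc = { .flags = 1UL << PCG_USED };

		/* the lookup -> trylock -> used-check pattern from the patch */
		if (trylock_page_cgroup(&pc)) {
			if (page_cgroup_used(&pc))
				puts("entry is charged; safe to touch mem_cgroup");
			unlock_page_cgroup(&pc);
		}
		return 0;
	}

Because every pfn's page_cgroup exists from boot (or from a hotplug
event), "is this page charged?" becomes a flag test rather than a pointer
test, which is what lets the charge path drop kmem_cache_alloc() entirely.
End of editor's note.]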