The primary idea behind isolated pages is better isolation of a group from the activity of the global system and of other groups. At the moment, memory cgroups are mainly used to throttle processes in a group by placing a cap on their memory usage. However, memory cgroups don't protect their (charged) memory from being evicted by global reclaim, as all their pages are on the global LRU. This feature provides an easy way to set up an application in a memory-isolated environment without needing mlock to keep its pages in memory. Thanks to per-cgroup reclaim, we can eliminate interference from unrelated cgroups that exhibit a spike in memory usage. A similar setup could be achieved with the current implementation as well, by placing the critical application into the root group while all other processes are placed in another group (or groups). This is, however, much harder to configure, and we would have only one such "exclusive" group on the system, which is quite limiting. The goal is achieved by isolating those pages from the global LRU and keeping them on a per-cgroup LRU only, so the memory cgroup is not affected by global reclaim at all. If we isolate mem-cgroup pages from the global LRU we can still do per-cgroup reclaim, so isolation is not the same thing as mlocking that memory. is_mem_cgroup_isolated is not called directly by the code that adds (__add_page_to_lru_list) or moves (isolate_lru_pages, move_active_pages_to_lru, check_move_unevictable_page, pagevec_move_tail, lru_deactivate) pages into an LRU, because we would need to find the page_cgroup for the page and this would add overhead. Instead, we changed the semantics of the memcg LRU functions (which add or move pages to a mem cgroup LRU) so that they return a flag saying whether the page is global (return true) or mem cgroup isolated (return false). page->lru is initialized to an empty list whenever the page is not on the global LRU, so that the LRU removal path works without modifications. 
The page is still mark PageLRU so nobody else will misuse page->lru for other purposes. Signed-off-by: Michal Hocko --- include/linux/memcontrol.h | 22 ++++++++++++---------- include/linux/mm_inline.h | 10 ++++++++-- mm/memcontrol.c | 36 +++++++++++++++++++++--------------- mm/swap.c | 12 ++++++++---- mm/vmscan.c | 25 +++++++++++++++++-------- 5 files changed, 66 insertions(+), 39 deletions(-) Index: linux-2.6.38-rc8/include/linux/memcontrol.h =================================================================== --- linux-2.6.38-rc8.orig/include/linux/memcontrol.h 2011-03-28 11:23:58.000000000 +0200 +++ linux-2.6.38-rc8/include/linux/memcontrol.h 2011-03-28 11:24:20.000000000 +0200 @@ -60,12 +60,12 @@ extern void mem_cgroup_cancel_charge_swa extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); -extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru); +extern bool mem_cgroup_add_lru_list(struct page *page, enum lru_list lru); extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru); -extern void mem_cgroup_rotate_reclaimable_page(struct page *page); -extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru); +extern bool mem_cgroup_rotate_reclaimable_page(struct page *page); +extern bool mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru); extern void mem_cgroup_del_lru(struct page *page); -extern void mem_cgroup_move_lists(struct page *page, +extern bool mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to); /* For coalescing uncharge for reducing memcg' overhead*/ @@ -209,13 +209,14 @@ static inline int mem_cgroup_shmem_charg return 0; } -static inline void mem_cgroup_add_lru_list(struct page *page, int lru) +static inline bool mem_cgroup_add_lru_list(struct page *page, int lru) { + return true; } -static inline void mem_cgroup_del_lru_list(struct page *page, int lru) +static inline bool mem_cgroup_del_lru_list(struct page *page, int 
lru) { - return ; + return true; } static inline inline void mem_cgroup_rotate_reclaimable_page(struct page *page) @@ -223,9 +224,9 @@ static inline inline void mem_cgroup_rot return ; } -static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru) +static inline bool mem_cgroup_rotate_lru_list(struct page *page, int lru) { - return ; + return true; } static inline void mem_cgroup_del_lru(struct page *page) @@ -233,9 +234,10 @@ static inline void mem_cgroup_del_lru(st return ; } -static inline void +static inline bool mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) { + return true; } static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) Index: linux-2.6.38-rc8/include/linux/mm_inline.h =================================================================== --- linux-2.6.38-rc8.orig/include/linux/mm_inline.h 2011-03-28 11:23:58.000000000 +0200 +++ linux-2.6.38-rc8/include/linux/mm_inline.h 2011-03-28 11:24:20.000000000 +0200 @@ -25,9 +25,15 @@ static inline void __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, struct list_head *head) { - list_add(&page->lru, head); __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); - mem_cgroup_add_lru_list(page, l); + + /* Add to the global LRU only if cgroup doesn't want the page + * exclusively + */ + if (mem_cgroup_add_lru_list(page, l)) + list_add(&page->lru, head); + else + INIT_LIST_HEAD(&page->lru); } static inline void Index: linux-2.6.38-rc8/mm/memcontrol.c =================================================================== --- linux-2.6.38-rc8.orig/mm/memcontrol.c 2011-03-28 11:23:58.000000000 +0200 +++ linux-2.6.38-rc8/mm/memcontrol.c 2011-03-28 11:24:20.000000000 +0200 @@ -866,58 +866,62 @@ void mem_cgroup_del_lru(struct page *pag * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. 
*/ -void mem_cgroup_rotate_reclaimable_page(struct page *page) +bool mem_cgroup_rotate_reclaimable_page(struct page *page) { struct mem_cgroup_per_zone *mz; struct page_cgroup *pc; enum lru_list lru = page_lru(page); if (mem_cgroup_disabled()) - return; + return true; pc = lookup_page_cgroup(page); /* unused or root page is not rotated. */ if (!PageCgroupUsed(pc)) - return; + return true; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); if (mem_cgroup_is_root(pc->mem_cgroup)) - return; + return true; mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); list_move_tail(&pc->lru, &mz->lists[lru]); + + return !is_mem_cgroup_isolated(pc->mem_cgroup); } -void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) +bool mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) { struct mem_cgroup_per_zone *mz; struct page_cgroup *pc; if (mem_cgroup_disabled()) - return; + return true; pc = lookup_page_cgroup(page); /* unused or root page is not rotated. */ if (!PageCgroupUsed(pc)) - return; + return true; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); if (mem_cgroup_is_root(pc->mem_cgroup)) - return; + return true; mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); list_move(&pc->lru, &mz->lists[lru]); + + return !is_mem_cgroup_isolated(pc->mem_cgroup); } -void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) +bool mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; if (mem_cgroup_disabled()) - return; + return true; pc = lookup_page_cgroup(page); VM_BUG_ON(PageCgroupAcctLRU(pc)); if (!PageCgroupUsed(pc)) - return; + return true; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. 
*/ smp_rmb(); mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); @@ -925,8 +929,10 @@ void mem_cgroup_add_lru_list(struct page MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); SetPageCgroupAcctLRU(pc); if (mem_cgroup_is_root(pc->mem_cgroup)) - return; + return true; list_add(&pc->lru, &mz->lists[lru]); + + return !is_mem_cgroup_isolated(pc->mem_cgroup); } /* @@ -979,13 +985,13 @@ static void mem_cgroup_lru_add_after_com } -void mem_cgroup_move_lists(struct page *page, +bool mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) { if (mem_cgroup_disabled()) - return; + return true; mem_cgroup_del_lru_list(page, from); - mem_cgroup_add_lru_list(page, to); + return mem_cgroup_add_lru_list(page, to); } int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) Index: linux-2.6.38-rc8/mm/vmscan.c =================================================================== --- linux-2.6.38-rc8.orig/mm/vmscan.c 2011-03-28 11:23:58.000000000 +0200 +++ linux-2.6.38-rc8/mm/vmscan.c 2011-03-28 11:24:57.000000000 +0200 @@ -1049,8 +1049,10 @@ static unsigned long isolate_lru_pages(u case -EBUSY: /* else it is being freed elsewhere */ - list_move(&page->lru, src); - mem_cgroup_rotate_lru_list(page, page_lru(page)); + if (mem_cgroup_rotate_lru_list(page, page_lru(page))) + list_move(&page->lru, src); + else + list_del_init(&page->lru); continue; default: @@ -1482,8 +1484,11 @@ static void move_active_pages_to_lru(str VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); + if (mem_cgroup_add_lru_list(page, lru)) + list_move(&page->lru, &zone->lru[lru].list); + else + list_del_init(&page->lru); + pgmoved += hpage_nr_pages(page); if (!pagevec_add(&pvec, page) || list_empty(list)) { @@ -3133,8 +3138,10 @@ retry: enum lru_list l = page_lru_base_type(page); __dec_zone_state(zone, NR_UNEVICTABLE); - list_move(&page->lru, &zone->lru[l].list); - mem_cgroup_move_lists(page, 
LRU_UNEVICTABLE, l); + if (mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l)) + list_move(&page->lru, &zone->lru[l].list); + else + list_del_init(&page->lru); __inc_zone_state(zone, NR_INACTIVE_ANON + l); __count_vm_event(UNEVICTABLE_PGRESCUED); } else { @@ -3142,8 +3149,10 @@ retry: * rotate unevictable list */ SetPageUnevictable(page); - list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); - mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); + if (mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE)) + list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); + else + list_del_init(&page->lru); if (page_evictable(page, NULL)) goto retry; } Index: linux-2.6.38-rc8/mm/swap.c =================================================================== --- linux-2.6.38-rc8.orig/mm/swap.c 2011-03-28 11:23:58.000000000 +0200 +++ linux-2.6.38-rc8/mm/swap.c 2011-03-28 11:24:20.000000000 +0200 @@ -201,8 +201,10 @@ static void pagevec_move_tail(struct pag } if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - mem_cgroup_rotate_reclaimable_page(page); + if (mem_cgroup_rotate_reclaimable_page(page)) + list_move_tail(&page->lru, &zone->lru[lru].list); + else + list_del_init(&page->lru); pgmoved++; } } @@ -402,8 +404,10 @@ static void lru_deactivate(struct page * * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. */ - list_move_tail(&page->lru, &zone->lru[lru].list); - mem_cgroup_rotate_reclaimable_page(page); + if (mem_cgroup_rotate_reclaimable_page(page)) + list_move_tail(&page->lru, &zone->lru[lru].list); + else + list_del_init(&page->lru); __count_vm_event(PGROTATED); } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/