Subject: Keep freed pages in direct reclaim
Date: Thu, 9 Dec 2010 14:01:32 +0900
From: Minchan Kim

A direct reclaiming process often sleeps and races with other processes.
Even when a direct reclaimer needs a high-order page (order > 0) and
reclaims pages successfully, other processes that only need order-0 pages
can steal the freed pages from it.  The direct reclaimer then has to try
again, and the same scenario can repeat.  This has several bad effects:

1. long latency for the direct reclaiming process
2. eviction of working set pages due to lumpy reclaim
3. kswapd keeps being woken up

This patch solves the problem by letting a high-order direct reclaimer
keep the pages it reclaims on a private list, so that it can free and
allocate them itself.

Fengguang: fix

[ 1514.892933] BUG: unable to handle kernel
[ 1514.893589] NULL pointer dereference at (null)
[ 1514.893968] IP: [] shrink_page_list+0x3dc/0x501
[ 1514.892958] ---[ end trace be7cb17861e1d25b ]---

Signed-off-by: Minchan Kim
Signed-off-by: Wu Fengguang
---
 fs/buffer.c          |    2 +-
 include/linux/swap.h |    4 +++-
 mm/page_alloc.c      |   27 +++++++++++++++++++++++----
 mm/vmscan.c          |   23 +++++++++++++++++++----
 4 files changed, 46 insertions(+), 10 deletions(-)

--- linux-next.orig/fs/buffer.c	2011-05-02 10:34:06.000000000 +0800
+++ linux-next/fs/buffer.c	2011-05-02 10:45:24.000000000 +0800
@@ -289,7 +289,7 @@ static void free_more_memory(void)
 						&zone);
 		if (zone)
 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
-						GFP_NOFS, NULL);
+						GFP_NOFS, NULL, NULL);
 	}
 }

--- linux-next.orig/include/linux/swap.h	2011-05-02 10:34:06.000000000 +0800
+++ linux-next/include/linux/swap.h	2011-05-02 10:45:24.000000000 +0800
@@ -249,8 +249,10 @@ static inline void lru_cache_add_file(st
 #define ISOLATE_BOTH 2		/* Isolate both active and inactive pages. */

 /* linux/mm/vmscan.c */
+extern noinline_for_stack void free_page_list(struct list_head *free_pages);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-					gfp_t gfp_mask, nodemask_t *mask);
+					gfp_t gfp_mask, nodemask_t *mask,
+					struct list_head *freed_pages);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 						  gfp_t gfp_mask, bool noswap,
 						  unsigned int swappiness);

--- linux-next.orig/mm/page_alloc.c	2011-05-02 10:34:06.000000000 +0800
+++ linux-next/mm/page_alloc.c	2011-05-02 10:45:26.000000000 +0800
@@ -1890,8 +1890,11 @@ __alloc_pages_direct_reclaim(gfp_t gfp_m
 {
 	struct page *page = NULL;
 	struct reclaim_state reclaim_state;
+	bool high_order;
 	bool drained = false;
+	LIST_HEAD(freed_pages);

+	high_order = order ? true : false;
 	cond_resched();

 	/* We now go into synchronous reclaim */
@@ -1901,16 +1904,31 @@ __alloc_pages_direct_reclaim(gfp_t gfp_m
 	reclaim_state.reclaimed_slab = 0;
 	current->reclaim_state = &reclaim_state;

-	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
-
+	/*
+	 * If the request is high order, keep the reclaimed pages on our
+	 * own list so that other processes cannot steal them.
+	 */
+	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask,
+				nodemask, high_order ? &freed_pages : NULL);
 	current->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
 	current->flags &= ~PF_MEMALLOC;

+	if (high_order && !list_empty(&freed_pages)) {
+		free_page_list(&freed_pages);
+		drain_all_pages();
+		drained = true;
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+					zonelist, high_zoneidx,
+					alloc_flags, preferred_zone,
+					migratetype);
+		if (page)
+			goto out;
+	}
 	cond_resched();

 	if (unlikely(!(*did_some_progress)))
-		return NULL;
+		goto out;

 retry:
 	page = get_page_from_freelist(gfp_mask, nodemask, order,
@@ -1927,7 +1945,8 @@ retry:
 		drained = true;
 		goto retry;
 	}
-
+out:
+	VM_BUG_ON(!list_empty(&freed_pages));
 	return page;
 }

--- linux-next.orig/mm/vmscan.c	2011-05-02 10:34:06.000000000 +0800
+++ linux-next/mm/vmscan.c	2011-05-02 10:46:31.000000000 +0800
@@ -112,6 +112,9 @@ struct scan_control {
 	 * are scanned.
 	 */
 	nodemask_t *nodemask;
+
+	/* keep freed pages */
+	struct list_head *freed_pages;
 };

 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -681,7 +684,7 @@ static enum page_references page_check_r
 	return PAGEREF_RECLAIM;
 }

-static noinline_for_stack void free_page_list(struct list_head *free_pages)
+noinline_for_stack void free_page_list(struct list_head *free_pages)
 {
 	struct pagevec freed_pvec;
 	struct page *page, *tmp;
@@ -712,6 +715,10 @@ static unsigned long shrink_page_list(st
 	unsigned long nr_dirty = 0;
 	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
+	struct list_head *free_list = &free_pages;
+
+	if (sc->freed_pages)
+		free_list = sc->freed_pages;

 	cond_resched();

@@ -904,7 +911,7 @@ free_it:
 		 * Is there need to periodically free_page_list? It would
 		 * appear not as the counts should be low
 		 */
-		list_add(&page->lru, &free_pages);
+		list_add(&page->lru, free_list);
 		continue;

 cull_mlocked:
@@ -940,7 +947,13 @@ keep_lumpy:
 	if (nr_dirty == nr_congested && nr_dirty != 0)
 		zone_set_flag(zone, ZONE_CONGESTED);

-	free_page_list(&free_pages);
+	/*
+	 * For high-order direct reclaim, the caller frees the reclaimed
+	 * pages itself, so that other processes cannot steal them
+	 * before they can be allocated.
+	 */
+	if (!sc->freed_pages)
+		free_page_list(&free_pages);

 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
@@ -2118,7 +2131,8 @@ out:
 }

 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-				gfp_t gfp_mask, nodemask_t *nodemask)
+				gfp_t gfp_mask, nodemask_t *nodemask,
+				struct list_head *freed_pages)
 {
 	unsigned long nr_reclaimed;
 	struct scan_control sc = {
@@ -2131,6 +2145,7 @@ unsigned long try_to_free_pages(struct z
 		.order = order,
 		.mem_cgroup = NULL,
 		.nodemask = nodemask,
+		.freed_pages = freed_pages,
 	};

 	trace_mm_vmscan_direct_reclaim_begin(order,
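
For quick reference while reviewing, the resulting calling convention looks
roughly like this (an illustrative summary of the hunks above, not an
additional change):

	/* callers that do not keep the reclaimed pages pass NULL, as fs/buffer.c does */
	try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, GFP_NOFS, NULL, NULL);

	/* a high-order direct reclaimer hands in a private list ... */
	LIST_HEAD(freed_pages);
	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask,
				nodemask, order ? &freed_pages : NULL);

	/* ... and gives the pages back itself, right before reallocating them */
	if (order && !list_empty(&freed_pages))
		free_page_list(&freed_pages);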