Add a throttle to shrink_zone() to improve performance under heavy memory
pressure and to prevent incorrect OOM kills.

Only a limited number of tasks (CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE) may
reclaim pages from a zone at the same time; later arrivals sleep on a
per-zone waitqueue until a ticket is free, and a throttled reclaimer bails
out early with -EAGAIN when enough pages have already been freed since it
started.

Signed-off-by: KOSAKI Motohiro
---
 include/linux/mmzone.h |    2 +
 include/linux/sched.h  |    1 +
 mm/Kconfig             |   10 ++++++
 mm/page_alloc.c        |    3 ++
 mm/vmscan.c            |   62 +++++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 77 insertions(+), 1 deletion(-)

Index: b/include/linux/mmzone.h
===================================================================
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -328,6 +328,8 @@ struct zone {
 	unsigned long		spanned_pages;	/* total size, including holes */
 	unsigned long		present_pages;	/* amount of memory (excluding holes) */
 
+	atomic_t		nr_reclaimers;
+	wait_queue_head_t	reclaim_throttle_waitq;
 	/*
 	 * rarely used fields:
 	 */
Index: b/mm/page_alloc.c
===================================================================
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3500,6 +3500,9 @@ static void __paginginit free_area_init_
 		zone->nr_scan_inactive = 0;
 		zap_zone_vm_stats(zone);
 		zone->flags = 0;
+		atomic_set(&zone->nr_reclaimers, 0);
+		init_waitqueue_head(&zone->reclaim_throttle_waitq);
+
 		if (!size)
 			continue;
Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -74,6 +74,11 @@ struct scan_control {
 
 	int order;
 
+	/* Can shrinking be cut off if another task has freed enough pages? */
+	int may_cut_off;
+
+	unsigned long was_freed;
+
 	/* Which cgroup do we reclaim from */
 	struct mem_cgroup *mem_cgroup;
 
@@ -120,6 +125,7 @@ struct scan_control {
 int vm_swappiness = 60;
 long vm_total_pages;	/* The total number of pages which the VM controls */
 
+#define MAX_RECLAIM_TASKS CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
@@ -1187,7 +1193,52 @@ static int shrink_zone(int priority, str
 	unsigned long nr_inactive;
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;
+	int ret = 0;
+	int throttle_on = 0;
+	unsigned long freed;
+	unsigned long threshold;
+
+	/* A !__GFP_IO and/or !__GFP_FS task may hold some locks;
+	   if such tasks wait on each other, it may cause a deadlock. */
+	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS))
+		goto shrinking;
+
+	/* Avoid recursive wait_event(), which could deadlock. */
+	if (current->flags & PF_RECLAIMING)
+		goto shrinking;
+
+	throttle_on = 1;
+	current->flags |= PF_RECLAIMING;
+	wait_event(zone->reclaim_throttle_waitq,
+		   atomic_add_unless(&zone->nr_reclaimers, 1, MAX_RECLAIM_TASKS));
+
+	/* In some situations (e.g. hibernation), shrinking shouldn't be
+	   cut off even though a large amount of memory has been freed. */
+	if (!sc->may_cut_off)
+		goto shrinking;
+
+	/* kswapd is not relevant to user-visible latency. */
+	if (current->flags & PF_KSWAPD)
+		goto shrinking;
+
+	/* The x4 ratio means we check only rarely,
+	   because frequent checks hurt performance. */
+	threshold = ((1 << sc->order) + zone->pages_high) * 4;
+	freed = get_vm_event(PGFREE);
+
+	/* Is reclaim still necessary? */
+	if (scan_global_lru(sc) &&
+	    freed - sc->was_freed >= threshold) {
+		if (zone_watermark_ok(zone, sc->order, zone->pages_high,
+				      gfp_zone(sc->gfp_mask), 0)) {
+			ret = -EAGAIN;
+			goto out;
+		}
+		sc->was_freed = freed;
+	}
 
+shrinking:
 	if (scan_global_lru(sc)) {
 		/*
 		 * Add one to nr_to_scan just to make sure that the kernel
@@ -1239,9 +1290,16 @@ static int shrink_zone(int priority, str
 		}
 	}
 
+out:
+	if (throttle_on) {
+		current->flags &= ~PF_RECLAIMING;
+		atomic_dec(&zone->nr_reclaimers);
+		wake_up(&zone->reclaim_throttle_waitq);
+	}
+
 	sc->nr_reclaimed += nr_reclaimed;
 
 	throttle_vm_writeout(sc->gfp_mask);
-	return 0;
+	return ret;
 }
 
 /*
@@ -1439,6 +1497,8 @@ unsigned long try_to_free_pages(struct z
 		.order = order,
 		.mem_cgroup = NULL,
 		.isolate_pages = isolate_pages_global,
+		.may_cut_off = 1,
+		.was_freed = get_vm_event(PGFREE),
 	};
 
 	return do_try_to_free_pages(zonelist, &sc);
Index: b/mm/Kconfig
===================================================================
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -205,3 +205,13 @@ config NR_QUICK
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config NR_MAX_RECLAIM_TASKS_PER_ZONE
+	int "maximum number of reclaiming tasks at the same time"
+	default 3
+	help
+	  This value determines the number of threads that can perform
+	  page reclaim in a zone simultaneously. If it is too big,
+	  performance under heavy memory pressure will decrease.
+
+	  If unsure, use the default.
Index: b/include/linux/sched.h
===================================================================
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1510,6 +1510,7 @@ static inline void put_task_struct(struc
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezeable */
+#define PF_RECLAIMING	0x80000000	/* Task holds a page reclaim throttling ticket */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
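
For reviewers who want to experiment with the idea outside the kernel: the
throttle above is essentially a counting gate. wait_event() plus
atomic_add_unless() acquires one of MAX_RECLAIM_TASKS tickets, and
atomic_dec() plus wake_up() releases it. Below is a minimal userspace sketch
of the same pattern (not part of the patch; all names are illustrative),
with a pthread mutex/condvar standing in for the kernel waitqueue:

/*
 * Userspace sketch of the per-zone reclaim throttle: at most
 * MAX_RECLAIM_TASKS threads may be inside the "reclaim" section
 * at once. Illustrative only; not part of the patch.
 */
#include <pthread.h>
#include <stdio.h>

#define MAX_RECLAIM_TASKS 3

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;
static int nr_reclaimers;	/* plays the role of zone->nr_reclaimers */

static void throttle_enter(void)
{
	pthread_mutex_lock(&lock);
	/* wait_event(waitq, atomic_add_unless(&nr_reclaimers, 1, MAX)) */
	while (nr_reclaimers >= MAX_RECLAIM_TASKS)
		pthread_cond_wait(&waitq, &lock);
	nr_reclaimers++;
	pthread_mutex_unlock(&lock);
}

static void throttle_exit(void)
{
	pthread_mutex_lock(&lock);
	/* atomic_dec() + wake_up(): hand the ticket to one queued task */
	nr_reclaimers--;
	pthread_cond_signal(&waitq);
	pthread_mutex_unlock(&lock);
}

static void *reclaimer(void *arg)
{
	throttle_enter();
	printf("task %ld reclaiming\n", (long)arg);	/* "shrink_zone()" body */
	throttle_exit();
	return NULL;
}

int main(void)
{
	pthread_t tasks[8];
	long i;

	for (i = 0; i < 8; i++)
		pthread_create(&tasks[i], NULL, reclaimer, (void *)i);
	for (i = 0; i < 8; i++)
		pthread_join(tasks[i], NULL);
	return 0;
}

The while loop around pthread_cond_wait() mirrors wait_event() semantics: a
woken thread re-checks the condition before taking a ticket, so at most
MAX_RECLAIM_TASKS "reclaimers" ever run concurrently.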