[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1302777698-28237-13-git-send-email-mgorman@suse.de>
Date: Thu, 14 Apr 2011 11:41:38 +0100
From: Mel Gorman <mgorman@...e.de>
To: Linux-MM <linux-mm@...ck.org>,
Linux-Netdev <netdev@...r.kernel.org>
Cc: LKML <linux-kernel@...r.kernel.org>,
Peter Zijlstra <a.p.zijlstra@...llo.nl>,
Mel Gorman <mgorman@...e.de>
Subject: [PATCH 12/12] mm: Throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
If swap is backed by network storage such as NBD, there is a risk that a
large number of reclaimers can hang the system by consuming all
PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune
min_free_kbytes in advance. This patch will throttle direct reclaimers
if half the PF_MEMALLOC reserves are in use as the system is at risk of
hanging. A message will be displayed so the administrator knows that
min_free_kbytes should be tuned to a higher value to avoid the
throttling in the future.
Signed-off-by: Mel Gorman <mgorman@...e.de>
---
include/linux/mmzone.h | 1 +
mm/page_alloc.c | 1 +
mm/vmscan.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 68 insertions(+), 0 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 02ecb01..e86dcaf 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -638,6 +638,7 @@ typedef struct pglist_data {
range, including holes */
int node_id;
wait_queue_head_t kswapd_wait;
+ wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd;
int kswapd_max_order;
enum zone_type classzone_idx;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2b87dfd..4b1170f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4160,6 +4160,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
pgdat_resize_init(pgdat);
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat->kswapd_max_order = 0;
pgdat_page_cgroup_init(pgdat);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6771ea7..2dad23d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -42,6 +42,8 @@
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <net/sock.h>
+
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -2115,6 +2117,61 @@ out:
return 0;
}
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat, int high_zoneidx)
+{
+ struct zone *zone;
+ unsigned long pfmemalloc_reserve = 0;
+ unsigned long free_pages = 0;
+ int i;
+
+ for (i = 0; i <= high_zoneidx; i++) {
+ zone = &pgdat->node_zones[i];
+ pfmemalloc_reserve += min_wmark_pages(zone);
+ free_pages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+
+ return (free_pages > pfmemalloc_reserve / 2) ? true : false;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+ nodemask_t *nodemask)
+{
+ struct zone *zone;
+ int high_zoneidx = gfp_zone(gfp_mask);
+ DEFINE_WAIT(wait);
+
+ /*
+ * Only worry about the PFMEMALLOC reserves when network-backed
+ * storage is configured.
+ */
+ if (!sk_memalloc_socks())
+ return;
+
+ /* Check if the pfmemalloc reserves are ok */
+ first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+ if (pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx))
+ return;
+
+ /* Throttle */
+ if (printk_ratelimit())
+ printk(KERN_INFO "Throttling %s due to reclaim pressure on "
+ "network storage\n",
+ current->comm);
+ do {
+ prepare_to_wait(&zone->zone_pgdat->pfmemalloc_wait, &wait,
+ TASK_INTERRUPTIBLE);
+ schedule();
+ finish_wait(&zone->zone_pgdat->pfmemalloc_wait, &wait);
+ } while (!pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx) &&
+ !fatal_signal_pending(current));
+}
+
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
@@ -2131,6 +2188,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.nodemask = nodemask,
};
+ throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
gfp_mask);
@@ -2482,6 +2541,13 @@ loop_again:
}
}
+
+ /* Wake throttled direct reclaimers if low watermark is met */
+ if (sk_memalloc_socks() &&
+ waitqueue_active(&pgdat->pfmemalloc_wait) &&
+ pfmemalloc_watermark_ok(pgdat, MAX_NR_ZONES - 1))
+ wake_up_interruptible(&pgdat->pfmemalloc_wait);
+
if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
--
1.7.3.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists