lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170120155553.gjv2x5eycvdudnil@techsingularity.net>
Date:   Fri, 20 Jan 2017 15:55:53 +0000
From:   Mel Gorman <mgorman@...hsingularity.net>
To:     Trevor Cordes <trevor@...nopolis.ca>
Cc:     Michal Hocko <mhocko@...nel.org>, linux-kernel@...r.kernel.org,
        Joonsoo Kim <iamjoonsoo.kim@....com>,
        Minchan Kim <minchan@...nel.org>,
        Rik van Riel <riel@...riel.com>,
        Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
Subject: Re: mm, vmscan: commit makes PAE kernel crash nightly (bisected)

On Fri, Jan 20, 2017 at 11:02:32AM +0000, Mel Gorman wrote:
> On Fri, Jan 20, 2017 at 12:35:44AM -0600, Trevor Cordes wrote:
> > > > Hi!  The git tree above version oom'd after < 24 hours (3:02am) so
> > > > it doesn't solve the bug.  If you need a oom messages dump let me
> > > > know.  
> > > 
> > > Yes please.
> > 
> > The first oom from that night attached.  Note, the oom wasn't as dire
> > with your mhocko/4.9.0+ as it usually is with stock 4.8.x: my oom
> > detector and reboot script was able to do its thing cleanly before the
> > system became unusable.
> > 
> > I'll await further instructions and test right away.  Maybe I'll try a
> > few tuning ideas until then.  Thanks!
> > 
> 
> Thanks for the OOM report. I was expecting it to be a particular shape and
> my expectations were not matched so it took time to consider it further. Can
> you try the cumulative patch below? It combines three patches that
> 
> 1. Allow slab shrinking even if the LRU pages are unreclaimable in
>    direct reclaim
> 2. Shrinks slab once based on the contents of all memcgs instead
>    of shrinking one at a time
> 3. Tries to shrink slabs if the lowmem usage is too high
> 
> Unfortunately it's only boot tested on x86-64 as I didn't get the chance
> to setup an i386 test bed.
> 

There was one major flaw in that patch. This version fixes it and
addresses other minor issues. It may still be too aggressive shrinking
slab but worth trying out. Thanks.

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2281ad310d06..2c735ea24a85 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2318,6 +2318,60 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	}
 }
 
+#ifdef CONFIG_HIGHMEM
+static void balance_slab_lowmem(struct pglist_data *pgdat,
+				struct scan_control *sc)
+{
+	unsigned long lru_pages = 0;
+	unsigned long slab_pages = 0;
+	unsigned long managed_pages = 0;
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		struct zone *zone = &pgdat->node_zones[zid];
+
+		if (!populated_zone(zone) || is_highmem_idx(zid))
+			continue;
+
+		managed_pages += zone->managed_pages;
+		lru_pages += zone_page_state(zone, NR_ZONE_INACTIVE_FILE);
+		lru_pages += zone_page_state(zone, NR_ZONE_ACTIVE_FILE);
+		lru_pages += zone_page_state(zone, NR_ZONE_INACTIVE_ANON);
+		lru_pages += zone_page_state(zone, NR_ZONE_ACTIVE_ANON);
+		slab_pages += zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+		slab_pages += zone_page_state(zone, NR_SLAB_UNRECLAIMABLE);
+	}
+
+	/* Do not balance until LRU and slab exceeds 50% of lowmem */
+	if (lru_pages + slab_pages < (managed_pages >> 1))
+		return;
+
+	/*
+	 * Shrink reclaimable slabs if the number of lowmem slab pages is
+	 * over twice the size of LRU pages. Apply pressure relative to
+	 * the imbalance between LRU and slab pages.
+	 */
+	if (slab_pages > lru_pages << 1) {
+		struct reclaim_state *reclaim_state = current->reclaim_state;
+		unsigned long exceed = slab_pages - (lru_pages << 1);
+		int nid = pgdat->node_id;
+
+		exceed = min(exceed, slab_pages);
+		shrink_slab(sc->gfp_mask, nid, NULL, exceed >> 3, slab_pages);
+		if (reclaim_state) {
+			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+			reclaim_state->reclaimed_slab = 0;
+		}
+	}
+}
+#else
+static void balance_slab_lowmem(struct pglist_data *pgdat,
+				struct scan_control *sc)
+{
+	return;
+}
+#endif
+
 /*
  * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
  */
@@ -2336,6 +2389,27 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 
 	get_scan_count(lruvec, memcg, sc, nr, lru_pages);
 
+	/*
+	 * If direct reclaiming at elevated priority and the node is
+	 * unreclaimable then skip LRU reclaim and let kswapd poll it.
+	 */
+	if (!current_is_kswapd() &&
+	    sc->priority != DEF_PRIORITY &&
+	    !pgdat_reclaimable(pgdat)) {
+		unsigned long nr_scanned;
+
+		/*
+		 * Fake scanning so that slab shrinking will continue. For
+		 * lowmem restricted allocations, shrink aggressively.
+		 */
+		nr_scanned = SWAP_CLUSTER_MAX << (DEF_PRIORITY - sc->priority);
+		if (!(sc->gfp_mask & __GFP_HIGHMEM))
+			nr_scanned = max(nr_scanned, *lru_pages);
+		sc->nr_scanned += nr_scanned;
+
+		return;
+	}
+
 	/* Record the original scan target for proportional adjustments later */
 	memcpy(targets, nr, sizeof(nr));
 
@@ -2435,6 +2509,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 	if (inactive_list_is_low(lruvec, false, sc, true))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
+
+	balance_slab_lowmem(pgdat, sc);
 }
 
 /* Use reclaim/compaction for costly allocs or under memory pressure */
@@ -2533,7 +2609,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			.pgdat = pgdat,
 			.priority = sc->priority,
 		};
-		unsigned long node_lru_pages = 0;
+		unsigned long slab_pressure = 0;
+		unsigned long slab_eligible = 0;
 		struct mem_cgroup *memcg;
 
 		nr_reclaimed = sc->nr_reclaimed;
@@ -2555,12 +2632,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			scanned = sc->nr_scanned;
 
 			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
-			node_lru_pages += lru_pages;
-
-			if (memcg)
-				shrink_slab(sc->gfp_mask, pgdat->node_id,
-					    memcg, sc->nr_scanned - scanned,
-					    lru_pages);
+			slab_eligible += lru_pages;
+			slab_pressure += sc->nr_reclaimed - reclaimed;
 
 			/* Record the group's reclaim efficiency */
 			vmpressure(sc->gfp_mask, memcg, false,
@@ -2586,12 +2659,12 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 
 		/*
 		 * Shrink the slab caches in the same proportion that
-		 * the eligible LRU pages were scanned.
+		 * the eligible LRU pages were scanned. For memcg, this
+		 * will apply the cumulative scanning pressure over all
+		 * memcgs.
 		 */
-		if (global_reclaim(sc))
-			shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
-				    sc->nr_scanned - nr_scanned,
-				    node_lru_pages);
+		shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, slab_pressure,
+							slab_eligible);
 
 		if (reclaim_state) {
 			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2683,10 +2756,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 						 GFP_KERNEL | __GFP_HARDWALL))
 				continue;
 
-			if (sc->priority != DEF_PRIORITY &&
-			    !pgdat_reclaimable(zone->zone_pgdat))
-				continue;	/* Let kswapd poll it */
-
 			/*
 			 * If we already have plenty of memory free for
 			 * compaction in this zone, don't free any more.
-- 
Mel Gorman
SUSE Labs

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ