linux-kernel - Re: [PATCH 5/8] mm: vmscan: replace shrink

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20191022195629.GA24142@tower.DHCP.thefacebook.com>
Date:   Tue, 22 Oct 2019 19:56:33 +0000
From:   Roman Gushchin <guro@...com>
To:     Johannes Weiner <hannes@...xchg.org>
CC:     Andrew Morton <akpm@...ux-foundation.org>,
        Michal Hocko <mhocko@...e.com>,
        "linux-mm@...ck.org" <linux-mm@...ck.org>,
        "cgroups@...r.kernel.org" <cgroups@...r.kernel.org>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Kernel Team <Kernel-team@...com>
Subject: Re: [PATCH 5/8] mm: vmscan: replace shrink_node() loop with a retry
 jump

On Tue, Oct 22, 2019 at 10:48:00AM -0400, Johannes Weiner wrote:
> Most of the function body is inside a loop, which imposes an
> additional indentation and scoping level that makes the code a bit
> hard to follow and modify.
> 
> The looping only happens in case of reclaim-compaction, which isn't
> the common case. So rather than adding yet another function level to
> the reclaim path and have every reclaim invocation go through a level
> that only exists for one specific cornercase, use a retry goto.
> 
> Signed-off-by: Johannes Weiner <hannes@...xchg.org>
> ---
>  mm/vmscan.c | 231 ++++++++++++++++++++++++++--------------------------
>  1 file changed, 115 insertions(+), 116 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 302dad112f75..235d1fc72311 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2729,144 +2729,143 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
>  static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  {
>  	struct reclaim_state *reclaim_state = current->reclaim_state;
> +	struct mem_cgroup *root = sc->target_mem_cgroup;
>  	unsigned long nr_reclaimed, nr_scanned;
>  	bool reclaimable = false;
> +	struct mem_cgroup *memcg;
> +again:
> +	memset(&sc->nr, 0, sizeof(sc->nr));
>  
> -	do {
> -		struct mem_cgroup *root = sc->target_mem_cgroup;
> -		struct mem_cgroup *memcg;
> -
> -		memset(&sc->nr, 0, sizeof(sc->nr));
> -
> -		nr_reclaimed = sc->nr_reclaimed;
> -		nr_scanned = sc->nr_scanned;
> +	nr_reclaimed = sc->nr_reclaimed;
> +	nr_scanned = sc->nr_scanned;
>  
> -		memcg = mem_cgroup_iter(root, NULL, NULL);
> -		do {
> -			unsigned long reclaimed;
> -			unsigned long scanned;
> +	memcg = mem_cgroup_iter(root, NULL, NULL);
> +	do {
> +		unsigned long reclaimed;
> +		unsigned long scanned;
>  
> -			switch (mem_cgroup_protected(root, memcg)) {
> -			case MEMCG_PROT_MIN:
> -				/*
> -				 * Hard protection.
> -				 * If there is no reclaimable memory, OOM.
> -				 */
> +		switch (mem_cgroup_protected(root, memcg)) {
> +		case MEMCG_PROT_MIN:
> +			/*
> +			 * Hard protection.
> +			 * If there is no reclaimable memory, OOM.
> +			 */
> +			continue;
> +		case MEMCG_PROT_LOW:
> +			/*
> +			 * Soft protection.
> +			 * Respect the protection only as long as
> +			 * there is an unprotected supply
> +			 * of reclaimable memory from other cgroups.
> +			 */
> +			if (!sc->memcg_low_reclaim) {
> +				sc->memcg_low_skipped = 1;
>  				continue;
> -			case MEMCG_PROT_LOW:
> -				/*
> -				 * Soft protection.
> -				 * Respect the protection only as long as
> -				 * there is an unprotected supply
> -				 * of reclaimable memory from other cgroups.
> -				 */
> -				if (!sc->memcg_low_reclaim) {
> -					sc->memcg_low_skipped = 1;
> -					continue;
> -				}
> -				memcg_memory_event(memcg, MEMCG_LOW);
> -				break;
> -			case MEMCG_PROT_NONE:
> -				/*
> -				 * All protection thresholds breached. We may
> -				 * still choose to vary the scan pressure
> -				 * applied based on by how much the cgroup in
> -				 * question has exceeded its protection
> -				 * thresholds (see get_scan_count).
> -				 */
> -				break;
>  			}
> +			memcg_memory_event(memcg, MEMCG_LOW);
> +			break;
> +		case MEMCG_PROT_NONE:
> +			/*
> +			 * All protection thresholds breached. We may
> +			 * still choose to vary the scan pressure
> +			 * applied based on by how much the cgroup in
> +			 * question has exceeded its protection
> +			 * thresholds (see get_scan_count).
> +			 */
> +			break;
> +		}
>  
> -			reclaimed = sc->nr_reclaimed;
> -			scanned = sc->nr_scanned;
> -			shrink_node_memcg(pgdat, memcg, sc);
> -
> -			shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> -					sc->priority);
> -
> -			/* Record the group's reclaim efficiency */
> -			vmpressure(sc->gfp_mask, memcg, false,
> -				   sc->nr_scanned - scanned,
> -				   sc->nr_reclaimed - reclaimed);
> -
> -		} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> +		reclaimed = sc->nr_reclaimed;
> +		scanned = sc->nr_scanned;
> +		shrink_node_memcg(pgdat, memcg, sc);
>  
> -		if (reclaim_state) {
> -			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> -			reclaim_state->reclaimed_slab = 0;
> -		}
> +		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> +			    sc->priority);
>  
> -		/* Record the subtree's reclaim efficiency */
> -		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> -			   sc->nr_scanned - nr_scanned,
> -			   sc->nr_reclaimed - nr_reclaimed);
> +		/* Record the group's reclaim efficiency */
> +		vmpressure(sc->gfp_mask, memcg, false,
> +			   sc->nr_scanned - scanned,
> +			   sc->nr_reclaimed - reclaimed);

It doesn't look as a trivial change. I'd add some comments to the commit message
why it's safe to do.

Thanks!

>  
> -		if (sc->nr_reclaimed - nr_reclaimed)
> -			reclaimable = true;
> +	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
>  
> -		if (current_is_kswapd()) {
> -			/*
> -			 * If reclaim is isolating dirty pages under writeback,
> -			 * it implies that the long-lived page allocation rate
> -			 * is exceeding the page laundering rate. Either the
> -			 * global limits are not being effective at throttling
> -			 * processes due to the page distribution throughout
> -			 * zones or there is heavy usage of a slow backing
> -			 * device. The only option is to throttle from reclaim
> -			 * context which is not ideal as there is no guarantee
> -			 * the dirtying process is throttled in the same way
> -			 * balance_dirty_pages() manages.
> -			 *
> -			 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
> -			 * count the number of pages under pages flagged for
> -			 * immediate reclaim and stall if any are encountered
> -			 * in the nr_immediate check below.
> -			 */
> -			if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
> -				set_bit(PGDAT_WRITEBACK, &pgdat->flags);
> +	if (reclaim_state) {
> +		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> +		reclaim_state->reclaimed_slab = 0;
> +	}
>  
> -			/*
> -			 * Tag a node as congested if all the dirty pages
> -			 * scanned were backed by a congested BDI and
> -			 * wait_iff_congested will stall.
> -			 */
> -			if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> -				set_bit(PGDAT_CONGESTED, &pgdat->flags);
> +	/* Record the subtree's reclaim efficiency */
> +	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> +		   sc->nr_scanned - nr_scanned,
> +		   sc->nr_reclaimed - nr_reclaimed);
>  
> -			/* Allow kswapd to start writing pages during reclaim.*/
> -			if (sc->nr.unqueued_dirty == sc->nr.file_taken)
> -				set_bit(PGDAT_DIRTY, &pgdat->flags);
> +	if (sc->nr_reclaimed - nr_reclaimed)
> +		reclaimable = true;
>  
> -			/*
> -			 * If kswapd scans pages marked marked for immediate
> -			 * reclaim and under writeback (nr_immediate), it
> -			 * implies that pages are cycling through the LRU
> -			 * faster than they are written so also forcibly stall.
> -			 */
> -			if (sc->nr.immediate)
> -				congestion_wait(BLK_RW_ASYNC, HZ/10);
> -		}

Don't you want to separate the block below into a separate function?
It can probably make the big loop shorter and easier to follow.

Thanks!

> +	if (current_is_kswapd()) {
> +		/*
> +		 * If reclaim is isolating dirty pages under writeback,
> +		 * it implies that the long-lived page allocation rate
> +		 * is exceeding the page laundering rate. Either the
> +		 * global limits are not being effective at throttling
> +		 * processes due to the page distribution throughout
> +		 * zones or there is heavy usage of a slow backing
> +		 * device. The only option is to throttle from reclaim
> +		 * context which is not ideal as there is no guarantee
> +		 * the dirtying process is throttled in the same way
> +		 * balance_dirty_pages() manages.
> +		 *
> +		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
> +		 * count the number of pages under pages flagged for
> +		 * immediate reclaim and stall if any are encountered
> +		 * in the nr_immediate check below.
> +		 */
> +		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
> +			set_bit(PGDAT_WRITEBACK, &pgdat->flags);
>  
>  		/*
> -		 * Legacy memcg will stall in page writeback so avoid forcibly
> -		 * stalling in wait_iff_congested().
> +		 * Tag a node as congested if all the dirty pages
> +		 * scanned were backed by a congested BDI and
> +		 * wait_iff_congested will stall.
>  		 */
> -		if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
> -		    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> -			set_memcg_congestion(pgdat, root, true);
> +		if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> +			set_bit(PGDAT_CONGESTED, &pgdat->flags);
> +
> +		/* Allow kswapd to start writing pages during reclaim.*/
> +		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
> +			set_bit(PGDAT_DIRTY, &pgdat->flags);
>  
>  		/*
> -		 * Stall direct reclaim for IO completions if underlying BDIs
> -		 * and node is congested. Allow kswapd to continue until it
> -		 * starts encountering unqueued dirty pages or cycling through
> -		 * the LRU too quickly.
> +		 * If kswapd scans pages marked marked for immediate
> +		 * reclaim and under writeback (nr_immediate), it
> +		 * implies that pages are cycling through the LRU
> +		 * faster than they are written so also forcibly stall.
>  		 */
> -		if (!sc->hibernation_mode && !current_is_kswapd() &&
> -		   current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> -			wait_iff_congested(BLK_RW_ASYNC, HZ/10);
> +		if (sc->nr.immediate)
> +			congestion_wait(BLK_RW_ASYNC, HZ/10);
> +	}
> +
> +	/*
> +	 * Legacy memcg will stall in page writeback so avoid forcibly
> +	 * stalling in wait_iff_congested().
> +	 */
> +	if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
> +	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> +		set_memcg_congestion(pgdat, root, true);
> +
> +	/*
> +	 * Stall direct reclaim for IO completions if underlying BDIs
> +	 * and node is congested. Allow kswapd to continue until it
> +	 * starts encountering unqueued dirty pages or cycling through
> +	 * the LRU too quickly.
> +	 */
> +	if (!sc->hibernation_mode && !current_is_kswapd() &&
> +	    current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> +		wait_iff_congested(BLK_RW_ASYNC, HZ/10);
>  
> -	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> -					 sc));
> +	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> +				    sc))
> +		goto again;
>  
>  	/*
>  	 * Kswapd gives up on balancing particular nodes after too
> -- 
> 2.23.0
> 
>