[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <224545ab-9859-6f37-f58a-d5e04371258c@suse.cz>
Date:   Tue, 20 Mar 2018 10:56:17 +0100
From:   Vlastimil Babka <vbabka@...e.cz>
To:     David Rientjes <rientjes@...gle.com>,
        Andrew Morton <akpm@...ux-foundation.org>
Cc:     Mel Gorman <mgorman@...hsingularity.net>,
        linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [patch] mm, page_alloc: wakeup kcompactd even if kswapd cannot
 free more memory
On 03/12/2018 01:00 AM, David Rientjes wrote:
> Kswapd will not wakeup if per-zone watermarks are not failing or if too
> many previous attempts at background reclaim have failed.
> 
> This can be true if there is a lot of free memory available.  For high-
> order allocations, kswapd is responsible for waking up kcompactd for
> background compaction.  If the zone is now below its watermarks or
                                         not ?  (i.e. s/now/not/: "If the zone is not below its watermarks or ...")
> reclaim has recently failed (lots of free memory, nothing left to
> reclaim), kcompactd does not get woken up.
> 
> When __GFP_DIRECT_RECLAIM is not allowed, allow kcompactd to still be
> woken up even if kswapd will not reclaim.  This allows high-order
> allocations, such as thp, to still trigger background compaction even
> when the zone has an abundance of free memory.
> 
> Signed-off-by: David Rientjes <rientjes@...gle.com>
Acked-by: Vlastimil Babka <vbabka@...e.cz>
> ---
>  .../postprocess/trace-vmscan-postprocess.pl   |  4 +--
>  include/linux/mmzone.h                        |  3 +-
>  include/trace/events/vmscan.h                 | 17 ++++++----
>  mm/page_alloc.c                               | 14 ++++----
>  mm/vmscan.c                                   | 32 +++++++++++++------
>  5 files changed, 45 insertions(+), 25 deletions(-)
> 
> diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
> --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
> +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
> @@ -111,7 +111,7 @@ my $regex_direct_begin_default = 'order=([0-9]*) may_writepage=([0-9]*) gfp_flag
>  my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
>  my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
>  my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
> -my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
> +my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
>  my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
>  my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
>  my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
> @@ -201,7 +201,7 @@ $regex_kswapd_sleep = generate_traceevent_regex(
>  $regex_wakeup_kswapd = generate_traceevent_regex(
>  			"vmscan/mm_vmscan_wakeup_kswapd",
>  			$regex_wakeup_kswapd_default,
> -			"nid", "zid", "order");
> +			"nid", "zid", "order", "gfp_flags");
>  $regex_lru_isolate = generate_traceevent_regex(
>  			"vmscan/mm_vmscan_lru_isolate",
>  			$regex_lru_isolate_default,
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -775,7 +775,8 @@ static inline bool is_dev_zone(const struct zone *zone)
>  #include <linux/memory_hotplug.h>
>  
>  void build_all_zonelists(pg_data_t *pgdat);
> -void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
> +void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
> +		   enum zone_type classzone_idx);
>  bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
>  			 int classzone_idx, unsigned int alloc_flags,
>  			 long free_pages);
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -78,26 +78,29 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
>  
>  TRACE_EVENT(mm_vmscan_wakeup_kswapd,
>  
> -	TP_PROTO(int nid, int zid, int order),
> +	TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
>  
> -	TP_ARGS(nid, zid, order),
> +	TP_ARGS(nid, zid, order, gfp_flags),
>  
>  	TP_STRUCT__entry(
> -		__field(	int,		nid	)
> -		__field(	int,		zid	)
> -		__field(	int,		order	)
> +		__field(	int,	nid		)
> +		__field(	int,	zid		)
> +		__field(	int,	order		)
> +		__field(	gfp_t,	gfp_flags	)
>  	),
>  
>  	TP_fast_assign(
>  		__entry->nid		= nid;
>  		__entry->zid		= zid;
>  		__entry->order		= order;
> +		__entry->gfp_flags	= gfp_flags;
>  	),
>  
> -	TP_printk("nid=%d zid=%d order=%d",
> +	TP_printk("nid=%d zid=%d order=%d gfp_flags=%s",
>  		__entry->nid,
>  		__entry->zid,
> -		__entry->order)
> +		__entry->order,
> +		show_gfp_flags(__entry->gfp_flags))
>  );
>  
>  DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3683,16 +3683,18 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
>  	return page;
>  }
>  
> -static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
> +static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
> +			     const struct alloc_context *ac)
>  {
>  	struct zoneref *z;
>  	struct zone *zone;
>  	pg_data_t *last_pgdat = NULL;
> +	enum zone_type high_zoneidx = ac->high_zoneidx;
>  
> -	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
> -					ac->high_zoneidx, ac->nodemask) {
> +	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
> +					ac->nodemask) {
>  		if (last_pgdat != zone->zone_pgdat)
> -			wakeup_kswapd(zone, order, ac->high_zoneidx);
> +			wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
>  		last_pgdat = zone->zone_pgdat;
>  	}
>  }
> @@ -3971,7 +3973,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  		goto nopage;
>  
>  	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
> -		wake_all_kswapds(order, ac);
> +		wake_all_kswapds(order, gfp_mask, ac);
>  
>  	/*
>  	 * The adjusted alloc_flags might result in immediate success, so try
> @@ -4029,7 +4031,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  retry:
>  	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
>  	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
> -		wake_all_kswapds(order, ac);
> +		wake_all_kswapds(order, gfp_mask, ac);
>  
>  	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
>  	if (reserve_flags)
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3546,16 +3546,21 @@ static int kswapd(void *p)
>  }
>  
>  /*
> - * A zone is low on free memory, so wake its kswapd task to service it.
> + * A zone is low on free memory or too fragmented for high-order memory.  If
> + * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
> + * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
> + * has failed or is not needed, still wake up kcompactd if only compaction is
> + * needed.
>   */
> -void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
> +void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
> +		   enum zone_type classzone_idx)
>  {
>  	pg_data_t *pgdat;
>  
>  	if (!managed_zone(zone))
>  		return;
>  
> -	if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
> +	if (!cpuset_zone_allowed(zone, gfp_flags))
>  		return;
>  	pgdat = zone->zone_pgdat;
>  	pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
> @@ -3564,14 +3569,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
>  	if (!waitqueue_active(&pgdat->kswapd_wait))
>  		return;
>  
> -	/* Hopeless node, leave it to direct reclaim */
> -	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
> -		return;
> -
> -	if (pgdat_balanced(pgdat, order, classzone_idx))
> +	/* Hopeless node, leave it to direct reclaim if possible */
> +	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
> +	    pgdat_balanced(pgdat, order, classzone_idx)) {
> +		/*
> +		 * There may be plenty of free memory available, but it's too
> +		 * fragmented for high-order allocations.  Wake up kcompactd
> +		 * and rely on compaction_suitable() to determine if it's
> +		 * needed.  If it fails, it will defer subsequent attempts to
> +		 * ratelimit its work.
> +		 */
> +		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
> +			wakeup_kcompactd(pgdat, order, classzone_idx);
>  		return;
> +	}
>  
> -	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
> +	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
> +				      gfp_flags);
>  	wake_up_interruptible(&pgdat->kswapd_wait);
>  }
>  
> 
Powered by blists - more mailing lists
 
