Date:	Thu, 15 Nov 2012 02:14:49 +0100
From:	Marc Duponcheel <marc@...line.be>
To:	David Rientjes <rientjes@...gle.com>
Cc:	Mel Gorman <mgorman@...e.de>,
	Andy Lutomirski <luto@...capital.net>,
	linux-kernel@...r.kernel.org, linux-mm@...ck.org,
	Marc Duponcheel <marc@...line.be>
Subject: Re: [3.6 regression?] THP + migration/compaction livelock (I think)

 Hi David

Thanks for the changeset

I will test 3.6.6 with and without it this weekend.
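
In case it is useful, this is roughly what I plan to run for the
"with" kernel (just a sketch; the file name is simply wherever I save
the appended patch, and the -j value matches my box):

  cd linux-3.6.6
  patch -p1 --dry-run < thp-compaction-backport.patch  # check it applies cleanly
  patch -p1 < thp-compaction-backport.patch
  make oldconfig && make -j4
  make modules_install && make install

The "without" run is the same tree with the patch reverted
(patch -R -p1) and rebuilt.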

 Have a nice day

On 2012 Nov 14, David Rientjes wrote:
> On Wed, 14 Nov 2012, Marc Duponcheel wrote:
> 
> >  Hi all
> > 
> >  If someone can provide the patches (or teach me how to get them with
> > git (I apologise for not being git savvy)) then, this weekend, I can
> > apply them to 3.6.6 and compare before/after to check whether they fix
> > #49361.
> > 
> 
> I've backported all the commits that Mel quoted to 3.6.6 and appended them 
> to this email as one big patch.  It should apply cleanly to your kernel.
> 
> Now we are only missing these commits that weren't quoted:
> 
>  - 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page 
>                   immediately when it is made available"), and
> 
>  - 83fde0f22872 ("mm: vmscan: scale number of pages reclaimed by 
>                   reclaim/compaction based on failures").
> 
> Since your regression is easily reproducible, would it be possible to try 
> to reproduce the issue FIRST with 3.6.6 and, if still present as it was in 
> 3.6.2, then try reproducing it with the appended patch?
> 
> You earlier reported that khugepaged was taking the second-most cpu time 
> when this was happening, which initially pointed you to thp, so presumably 
> this isn't a case of kswapd running at 100%.  If both 3.6.6 kernels fail 
> (the ones with and without the appended patch), would it be possible to 
> try Mel's suggestion of patching with
> 
>  - https://lkml.org/lkml/2012/11/5/308 +
>    https://lkml.org/lkml/2012/11/12/113
> 
> to see if it helps and, if not, reverting the latter and trying
> 
>  - https://lkml.org/lkml/2012/11/5/308 +
>    https://lkml.org/lkml/2012/11/12/151
> 
> as the final test?  This will certainly help us to find out what needs to 
> be backported to 3.6 stable to prevent this issue for other users.
> 
> Thanks!
> ---
>  include/linux/compaction.h      |   15 ++
>  include/linux/mmzone.h          |    6 +-
>  include/linux/pageblock-flags.h |   19 +-
>  mm/compaction.c                 |  450 +++++++++++++++++++++++++--------------
>  mm/internal.h                   |   16 +-
>  mm/page_alloc.c                 |   42 ++--
>  mm/vmscan.c                     |    8 +
>  7 files changed, 366 insertions(+), 190 deletions(-)
> 
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -24,6 +24,7 @@ extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
>  			int order, gfp_t gfp_mask, nodemask_t *mask,
>  			bool sync, bool *contended);
>  extern int compact_pgdat(pg_data_t *pgdat, int order);
> +extern void reset_isolation_suitable(pg_data_t *pgdat);
>  extern unsigned long compaction_suitable(struct zone *zone, int order);
>  
>  /* Do not skip compaction more than 64 times */
> @@ -61,6 +62,16 @@ static inline bool compaction_deferred(struct zone *zone, int order)
>  	return zone->compact_considered < defer_limit;
>  }
>  
> +/* Returns true if restarting compaction after many failures */
> +static inline bool compaction_restarting(struct zone *zone, int order)
> +{
> +	if (order < zone->compact_order_failed)
> +		return false;
> +
> +	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
> +		zone->compact_considered >= 1UL << zone->compact_defer_shift;
> +}
> +
>  #else
>  static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
>  			int order, gfp_t gfp_mask, nodemask_t *nodemask,
> @@ -74,6 +85,10 @@ static inline int compact_pgdat(pg_data_t *pgdat, int order)
>  	return COMPACT_CONTINUE;
>  }
>  
> +static inline void reset_isolation_suitable(pg_data_t *pgdat)
> +{
> +}
> +
>  static inline unsigned long compaction_suitable(struct zone *zone, int order)
>  {
>  	return COMPACT_SKIPPED;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -369,8 +369,12 @@ struct zone {
>  	spinlock_t		lock;
>  	int                     all_unreclaimable; /* All pages pinned */
>  #if defined CONFIG_COMPACTION || defined CONFIG_CMA
> -	/* pfn where the last incremental compaction isolated free pages */
> +	/* Set to true when the PG_migrate_skip bits should be cleared */
> +	bool			compact_blockskip_flush;
> +
> +	/* pfns where compaction scanners should start */
>  	unsigned long		compact_cached_free_pfn;
> +	unsigned long		compact_cached_migrate_pfn;
>  #endif
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  	/* see spanned/present_pages for more description */
> diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
> --- a/include/linux/pageblock-flags.h
> +++ b/include/linux/pageblock-flags.h
> @@ -30,6 +30,9 @@ enum pageblock_bits {
>  	PB_migrate,
>  	PB_migrate_end = PB_migrate + 3 - 1,
>  			/* 3 bits required for migrate types */
> +#ifdef CONFIG_COMPACTION
> +	PB_migrate_skip,/* If set the block is skipped by compaction */
> +#endif /* CONFIG_COMPACTION */
>  	NR_PAGEBLOCK_BITS
>  };
>  
> @@ -65,10 +68,22 @@ unsigned long get_pageblock_flags_group(struct page *page,
>  void set_pageblock_flags_group(struct page *page, unsigned long flags,
>  					int start_bitidx, int end_bitidx);
>  
> +#ifdef CONFIG_COMPACTION
> +#define get_pageblock_skip(page) \
> +			get_pageblock_flags_group(page, PB_migrate_skip,     \
> +							PB_migrate_skip + 1)
> +#define clear_pageblock_skip(page) \
> +			set_pageblock_flags_group(page, 0, PB_migrate_skip,  \
> +							PB_migrate_skip + 1)
> +#define set_pageblock_skip(page) \
> +			set_pageblock_flags_group(page, 1, PB_migrate_skip,  \
> +							PB_migrate_skip + 1)
> +#endif /* CONFIG_COMPACTION */
> +
>  #define get_pageblock_flags(page) \
> -			get_pageblock_flags_group(page, 0, NR_PAGEBLOCK_BITS-1)
> +			get_pageblock_flags_group(page, 0, PB_migrate_end)
>  #define set_pageblock_flags(page, flags) \
>  			set_pageblock_flags_group(page, flags,	\
> -						  0, NR_PAGEBLOCK_BITS-1)
> +						  0, PB_migrate_end)
>  
>  #endif	/* PAGEBLOCK_FLAGS_H */
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype)
>  	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
>  }
>  
> +#ifdef CONFIG_COMPACTION
> +/* Returns true if the pageblock should be scanned for pages to isolate. */
> +static inline bool isolation_suitable(struct compact_control *cc,
> +					struct page *page)
> +{
> +	if (cc->ignore_skip_hint)
> +		return true;
> +
> +	return !get_pageblock_skip(page);
> +}
> +
> +/*
> + * This function is called to clear all cached information on pageblocks that
> + * should be skipped for page isolation when the migrate and free page scanner
> + * meet.
> + */
> +static void __reset_isolation_suitable(struct zone *zone)
> +{
> +	unsigned long start_pfn = zone->zone_start_pfn;
> +	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
> +	unsigned long pfn;
> +
> +	zone->compact_cached_migrate_pfn = start_pfn;
> +	zone->compact_cached_free_pfn = end_pfn;
> +	zone->compact_blockskip_flush = false;
> +
> +	/* Walk the zone and mark every pageblock as suitable for isolation */
> +	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
> +		struct page *page;
> +
> +		cond_resched();
> +
> +		if (!pfn_valid(pfn))
> +			continue;
> +
> +		page = pfn_to_page(pfn);
> +		if (zone != page_zone(page))
> +			continue;
> +
> +		clear_pageblock_skip(page);
> +	}
> +}
> +
> +void reset_isolation_suitable(pg_data_t *pgdat)
> +{
> +	int zoneid;
> +
> +	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
> +		struct zone *zone = &pgdat->node_zones[zoneid];
> +		if (!populated_zone(zone))
> +			continue;
> +
> +		/* Only flush if a full compaction finished recently */
> +		if (zone->compact_blockskip_flush)
> +			__reset_isolation_suitable(zone);
> +	}
> +}
> +
> +/*
> + * If no pages were isolated then mark this pageblock to be skipped in the
> + * future. The information is later cleared by __reset_isolation_suitable().
> + */
> +static void update_pageblock_skip(struct compact_control *cc,
> +			struct page *page, unsigned long nr_isolated,
> +			bool migrate_scanner)
> +{
> +	struct zone *zone = cc->zone;
> +	if (!page)
> +		return;
> +
> +	if (!nr_isolated) {
> +		unsigned long pfn = page_to_pfn(page);
> +		set_pageblock_skip(page);
> +
> +		/* Update where compaction should restart */
> +		if (migrate_scanner) {
> +			if (!cc->finished_update_migrate &&
> +			    pfn > zone->compact_cached_migrate_pfn)
> +				zone->compact_cached_migrate_pfn = pfn;
> +		} else {
> +			if (!cc->finished_update_free &&
> +			    pfn < zone->compact_cached_free_pfn)
> +				zone->compact_cached_free_pfn = pfn;
> +		}
> +	}
> +}
> +#else
> +static inline bool isolation_suitable(struct compact_control *cc,
> +					struct page *page)
> +{
> +	return true;
> +}
> +
> +static void update_pageblock_skip(struct compact_control *cc,
> +			struct page *page, unsigned long nr_isolated,
> +			bool migrate_scanner)
> +{
> +}
> +#endif /* CONFIG_COMPACTION */
> +
> +static inline bool should_release_lock(spinlock_t *lock)
> +{
> +	return need_resched() || spin_is_contended(lock);
> +}
> +
>  /*
>   * Compaction requires the taking of some coarse locks that are potentially
>   * very heavily contended. Check if the process needs to be scheduled or
> @@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype)
>  static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>  				      bool locked, struct compact_control *cc)
>  {
> -	if (need_resched() || spin_is_contended(lock)) {
> +	if (should_release_lock(lock)) {
>  		if (locked) {
>  			spin_unlock_irqrestore(lock, *flags);
>  			locked = false;
> @@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>  
>  		/* async aborts if taking too long or contended */
>  		if (!cc->sync) {
> -			if (cc->contended)
> -				*cc->contended = true;
> +			cc->contended = true;
>  			return false;
>  		}
>  
>  		cond_resched();
> -		if (fatal_signal_pending(current))
> -			return false;
>  	}
>  
>  	if (!locked)
> @@ -91,44 +193,85 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
>  	return compact_checklock_irqsave(lock, flags, false, cc);
>  }
>  
> +/* Returns true if the page is within a block suitable for migration to */
> +static bool suitable_migration_target(struct page *page)
> +{
> +	int migratetype = get_pageblock_migratetype(page);
> +
> +	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
> +	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
> +		return false;
> +
> +	/* If the page is a large free page, then allow migration */
> +	if (PageBuddy(page) && page_order(page) >= pageblock_order)
> +		return true;
> +
> +	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
> +	if (migrate_async_suitable(migratetype))
> +		return true;
> +
> +	/* Otherwise skip the block */
> +	return false;
> +}
> +
>  /*
>   * Isolate free pages onto a private freelist. Caller must hold zone->lock.
>   * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
>   * pages inside of the pageblock (even though it may still end up isolating
>   * some pages).
>   */
> -static unsigned long isolate_freepages_block(unsigned long blockpfn,
> +static unsigned long isolate_freepages_block(struct compact_control *cc,
> +				unsigned long blockpfn,
>  				unsigned long end_pfn,
>  				struct list_head *freelist,
>  				bool strict)
>  {
>  	int nr_scanned = 0, total_isolated = 0;
> -	struct page *cursor;
> +	struct page *cursor, *valid_page = NULL;
> +	unsigned long nr_strict_required = end_pfn - blockpfn;
> +	unsigned long flags;
> +	bool locked = false;
>  
>  	cursor = pfn_to_page(blockpfn);
>  
> -	/* Isolate free pages. This assumes the block is valid */
> +	/* Isolate free pages. */
>  	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
>  		int isolated, i;
>  		struct page *page = cursor;
>  
> -		if (!pfn_valid_within(blockpfn)) {
> -			if (strict)
> -				return 0;
> -			continue;
> -		}
>  		nr_scanned++;
> +		if (!pfn_valid_within(blockpfn))
> +			continue;
> +		if (!valid_page)
> +			valid_page = page;
> +		if (!PageBuddy(page))
> +			continue;
> +
> +		/*
> +		 * The zone lock must be held to isolate freepages.
> +		 * Unfortunately this is a very coarse lock and can be
> +		 * heavily contended if there are parallel allocations
> +		 * or parallel compactions. For async compaction do not
> +		 * spin on the lock and we acquire the lock as late as
> +		 * possible.
> +		 */
> +		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
> +								locked, cc);
> +		if (!locked)
> +			break;
> +
> +		/* Recheck this is a suitable migration target under lock */
> +		if (!strict && !suitable_migration_target(page))
> +			break;
>  
> -		if (!PageBuddy(page)) {
> -			if (strict)
> -				return 0;
> +		/* Recheck this is a buddy page under lock */
> +		if (!PageBuddy(page))
>  			continue;
> -		}
>  
>  		/* Found a free page, break it into order-0 pages */
>  		isolated = split_free_page(page);
>  		if (!isolated && strict)
> -			return 0;
> +			break;
>  		total_isolated += isolated;
>  		for (i = 0; i < isolated; i++) {
>  			list_add(&page->lru, freelist);
> @@ -143,6 +286,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
>  	}
>  
>  	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
> +
> +	/*
> +	 * If strict isolation is requested by CMA then check that all the
> +	 * pages requested were isolated. If there were any failures, 0 is
> +	 * returned and CMA will fail.
> +	 */
> +	if (strict && nr_strict_required > total_isolated)
> +		total_isolated = 0;
> +
> +	if (locked)
> +		spin_unlock_irqrestore(&cc->zone->lock, flags);
> +
> +	/* Update the pageblock-skip if the whole pageblock was scanned */
> +	if (blockpfn == end_pfn)
> +		update_pageblock_skip(cc, valid_page, total_isolated, false);
> +
>  	return total_isolated;
>  }
>  
> @@ -160,17 +319,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
>   * a free page).
>   */
>  unsigned long
> -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
> +isolate_freepages_range(struct compact_control *cc,
> +			unsigned long start_pfn, unsigned long end_pfn)
>  {
> -	unsigned long isolated, pfn, block_end_pfn, flags;
> -	struct zone *zone = NULL;
> +	unsigned long isolated, pfn, block_end_pfn;
>  	LIST_HEAD(freelist);
>  
> -	if (pfn_valid(start_pfn))
> -		zone = page_zone(pfn_to_page(start_pfn));
> -
>  	for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
> -		if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
> +		if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
>  			break;
>  
>  		/*
> @@ -180,10 +336,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
>  		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
>  		block_end_pfn = min(block_end_pfn, end_pfn);
>  
> -		spin_lock_irqsave(&zone->lock, flags);
> -		isolated = isolate_freepages_block(pfn, block_end_pfn,
> +		isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
>  						   &freelist, true);
> -		spin_unlock_irqrestore(&zone->lock, flags);
>  
>  		/*
>  		 * In strict mode, isolate_freepages_block() returns 0 if
> @@ -276,7 +430,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  	isolate_mode_t mode = 0;
>  	struct lruvec *lruvec;
>  	unsigned long flags;
> -	bool locked;
> +	bool locked = false;
> +	struct page *page = NULL, *valid_page = NULL;
>  
>  	/*
>  	 * Ensure that there are not too many pages isolated from the LRU
> @@ -296,23 +451,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  
>  	/* Time to isolate some pages for migration */
>  	cond_resched();
> -	spin_lock_irqsave(&zone->lru_lock, flags);
> -	locked = true;
>  	for (; low_pfn < end_pfn; low_pfn++) {
> -		struct page *page;
> -
>  		/* give a chance to irqs before checking need_resched() */
> -		if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
> -			spin_unlock_irqrestore(&zone->lru_lock, flags);
> -			locked = false;
> +		if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
> +			if (should_release_lock(&zone->lru_lock)) {
> +				spin_unlock_irqrestore(&zone->lru_lock, flags);
> +				locked = false;
> +			}
>  		}
>  
> -		/* Check if it is ok to still hold the lock */
> -		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
> -								locked, cc);
> -		if (!locked)
> -			break;
> -
>  		/*
>  		 * migrate_pfn does not necessarily start aligned to a
>  		 * pageblock. Ensure that pfn_valid is called when moving
> @@ -340,6 +487,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  		if (page_zone(page) != zone)
>  			continue;
>  
> +		if (!valid_page)
> +			valid_page = page;
> +
> +		/* If isolation recently failed, do not retry */
> +		pageblock_nr = low_pfn >> pageblock_order;
> +		if (!isolation_suitable(cc, page))
> +			goto next_pageblock;
> +
>  		/* Skip if free */
>  		if (PageBuddy(page))
>  			continue;
> @@ -349,24 +504,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  		 * migration is optimistic to see if the minimum amount of work
>  		 * satisfies the allocation
>  		 */
> -		pageblock_nr = low_pfn >> pageblock_order;
>  		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
>  		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
> -			low_pfn += pageblock_nr_pages;
> -			low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
> -			last_pageblock_nr = pageblock_nr;
> -			continue;
> +			cc->finished_update_migrate = true;
> +			goto next_pageblock;
>  		}
>  
> +		/* Check may be lockless but that's ok as we recheck later */
>  		if (!PageLRU(page))
>  			continue;
>  
>  		/*
> -		 * PageLRU is set, and lru_lock excludes isolation,
> -		 * splitting and collapsing (collapsing has already
> -		 * happened if PageLRU is set).
> +		 * PageLRU is set. lru_lock normally excludes isolation
> +		 * splitting and collapsing (collapsing has already happened
> +		 * if PageLRU is set) but the lock is not necessarily taken
> +		 * here and it is wasteful to take it just to check transhuge.
> +		 * Check TransHuge without lock and skip the whole pageblock if
> +		 * it's either a transhuge or hugetlbfs page, as calling
> +		 * compound_order() without preventing THP from splitting the
> +		 * page underneath us may return surprising results.
>  		 */
>  		if (PageTransHuge(page)) {
> +			if (!locked)
> +				goto next_pageblock;
> +			low_pfn += (1 << compound_order(page)) - 1;
> +			continue;
> +		}
> +
> +		/* Check if it is ok to still hold the lock */
> +		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
> +								locked, cc);
> +		if (!locked || fatal_signal_pending(current))
> +			break;
> +
> +		/* Recheck PageLRU and PageTransHuge under lock */
> +		if (!PageLRU(page))
> +			continue;
> +		if (PageTransHuge(page)) {
>  			low_pfn += (1 << compound_order(page)) - 1;
>  			continue;
>  		}
> @@ -383,6 +557,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  		VM_BUG_ON(PageTransCompound(page));
>  
>  		/* Successfully isolated */
> +		cc->finished_update_migrate = true;
>  		del_page_from_lru_list(page, lruvec, page_lru(page));
>  		list_add(&page->lru, migratelist);
>  		cc->nr_migratepages++;
> @@ -393,6 +568,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  			++low_pfn;
>  			break;
>  		}
> +
> +		continue;
> +
> +next_pageblock:
> +		low_pfn += pageblock_nr_pages;
> +		low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
> +		last_pageblock_nr = pageblock_nr;
>  	}
>  
>  	acct_isolated(zone, locked, cc);
> @@ -400,6 +582,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  	if (locked)
>  		spin_unlock_irqrestore(&zone->lru_lock, flags);
>  
> +	/* Update the pageblock-skip if the whole pageblock was scanned */
> +	if (low_pfn == end_pfn)
> +		update_pageblock_skip(cc, valid_page, nr_isolated, true);
> +
>  	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
>  
>  	return low_pfn;
> @@ -407,43 +593,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  
>  #endif /* CONFIG_COMPACTION || CONFIG_CMA */
>  #ifdef CONFIG_COMPACTION
> -
> -/* Returns true if the page is within a block suitable for migration to */
> -static bool suitable_migration_target(struct page *page)
> -{
> -
> -	int migratetype = get_pageblock_migratetype(page);
> -
> -	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
> -	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
> -		return false;
> -
> -	/* If the page is a large free page, then allow migration */
> -	if (PageBuddy(page) && page_order(page) >= pageblock_order)
> -		return true;
> -
> -	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
> -	if (migrate_async_suitable(migratetype))
> -		return true;
> -
> -	/* Otherwise skip the block */
> -	return false;
> -}
> -
> -/*
> - * Returns the start pfn of the last page block in a zone.  This is the starting
> - * point for full compaction of a zone.  Compaction searches for free pages from
> - * the end of each zone, while isolate_freepages_block scans forward inside each
> - * page block.
> - */
> -static unsigned long start_free_pfn(struct zone *zone)
> -{
> -	unsigned long free_pfn;
> -	free_pfn = zone->zone_start_pfn + zone->spanned_pages;
> -	free_pfn &= ~(pageblock_nr_pages-1);
> -	return free_pfn;
> -}
> -
>  /*
>   * Based on information in the current compact_control, find blocks
>   * suitable for isolating free pages from and then isolate them.
> @@ -453,7 +602,6 @@ static void isolate_freepages(struct zone *zone,
>  {
>  	struct page *page;
>  	unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
> -	unsigned long flags;
>  	int nr_freepages = cc->nr_freepages;
>  	struct list_head *freelist = &cc->freepages;
>  
> @@ -501,30 +649,16 @@ static void isolate_freepages(struct zone *zone,
>  		if (!suitable_migration_target(page))
>  			continue;
>  
> -		/*
> -		 * Found a block suitable for isolating free pages from. Now
> -		 * we disabled interrupts, double check things are ok and
> -		 * isolate the pages. This is to minimise the time IRQs
> -		 * are disabled
> -		 */
> -		isolated = 0;
> +		/* If isolation recently failed, do not retry */
> +		if (!isolation_suitable(cc, page))
> +			continue;
>  
> -		/*
> -		 * The zone lock must be held to isolate freepages. This
> -		 * unfortunately this is a very coarse lock and can be
> -		 * heavily contended if there are parallel allocations
> -		 * or parallel compactions. For async compaction do not
> -		 * spin on the lock
> -		 */
> -		if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
> -			break;
> -		if (suitable_migration_target(page)) {
> -			end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
> -			isolated = isolate_freepages_block(pfn, end_pfn,
> -							   freelist, false);
> -			nr_freepages += isolated;
> -		}
> -		spin_unlock_irqrestore(&zone->lock, flags);
> +		/* Found a block suitable for isolating free pages from */
> +		isolated = 0;
> +		end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
> +		isolated = isolate_freepages_block(cc, pfn, end_pfn,
> +						   freelist, false);
> +		nr_freepages += isolated;
>  
>  		/*
>  		 * Record the highest PFN we isolated pages from. When next
> @@ -532,17 +666,8 @@ static void isolate_freepages(struct zone *zone,
>  		 * page migration may have returned some pages to the allocator
>  		 */
>  		if (isolated) {
> +			cc->finished_update_free = true;
>  			high_pfn = max(high_pfn, pfn);
> -
> -			/*
> -			 * If the free scanner has wrapped, update
> -			 * compact_cached_free_pfn to point to the highest
> -			 * pageblock with free pages. This reduces excessive
> -			 * scanning of full pageblocks near the end of the
> -			 * zone
> -			 */
> -			if (cc->order > 0 && cc->wrapped)
> -				zone->compact_cached_free_pfn = high_pfn;
>  		}
>  	}
>  
> @@ -551,11 +676,6 @@ static void isolate_freepages(struct zone *zone,
>  
>  	cc->free_pfn = high_pfn;
>  	cc->nr_freepages = nr_freepages;
> -
> -	/* If compact_cached_free_pfn is reset then set it now */
> -	if (cc->order > 0 && !cc->wrapped &&
> -			zone->compact_cached_free_pfn == start_free_pfn(zone))
> -		zone->compact_cached_free_pfn = high_pfn;
>  }
>  
>  /*
> @@ -634,7 +754,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
>  
>  	/* Perform the isolation */
>  	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
> -	if (!low_pfn)
> +	if (!low_pfn || cc->contended)
>  		return ISOLATE_ABORT;
>  
>  	cc->migrate_pfn = low_pfn;
> @@ -651,27 +771,19 @@ static int compact_finished(struct zone *zone,
>  	if (fatal_signal_pending(current))
>  		return COMPACT_PARTIAL;
>  
> -	/*
> -	 * A full (order == -1) compaction run starts at the beginning and
> -	 * end of a zone; it completes when the migrate and free scanner meet.
> -	 * A partial (order > 0) compaction can start with the free scanner
> -	 * at a random point in the zone, and may have to restart.
> -	 */
> +	/* Compaction run completes if the migrate and free scanner meet */
>  	if (cc->free_pfn <= cc->migrate_pfn) {
> -		if (cc->order > 0 && !cc->wrapped) {
> -			/* We started partway through; restart at the end. */
> -			unsigned long free_pfn = start_free_pfn(zone);
> -			zone->compact_cached_free_pfn = free_pfn;
> -			cc->free_pfn = free_pfn;
> -			cc->wrapped = 1;
> -			return COMPACT_CONTINUE;
> -		}
> -		return COMPACT_COMPLETE;
> -	}
> +		/*
> +		 * Mark that the PG_migrate_skip information should be cleared
> +		 * by kswapd when it goes to sleep. kswapd does not set the
> +		 * flag itself as the decision to be clear should be directly
> +		 * based on an allocation request.
> +		 */
> +		if (!current_is_kswapd())
> +			zone->compact_blockskip_flush = true;
>  
> -	/* We wrapped around and ended up where we started. */
> -	if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
>  		return COMPACT_COMPLETE;
> +	}
>  
>  	/*
>  	 * order == -1 is expected when compacting via
> @@ -754,6 +866,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
>  static int compact_zone(struct zone *zone, struct compact_control *cc)
>  {
>  	int ret;
> +	unsigned long start_pfn = zone->zone_start_pfn;
> +	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
>  
>  	ret = compaction_suitable(zone, cc->order);
>  	switch (ret) {
> @@ -766,17 +880,29 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>  		;
>  	}
>  
> -	/* Setup to move all movable pages to the end of the zone */
> -	cc->migrate_pfn = zone->zone_start_pfn;
> -
> -	if (cc->order > 0) {
> -		/* Incremental compaction. Start where the last one stopped. */
> -		cc->free_pfn = zone->compact_cached_free_pfn;
> -		cc->start_free_pfn = cc->free_pfn;
> -	} else {
> -		/* Order == -1 starts at the end of the zone. */
> -		cc->free_pfn = start_free_pfn(zone);
> +	/*
> +	 * Setup to move all movable pages to the end of the zone. Used cached
> +	 * information on where the scanners should start but check that it
> +	 * is initialised by ensuring the values are within zone boundaries.
> +	 */
> +	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
> +	cc->free_pfn = zone->compact_cached_free_pfn;
> +	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
> +		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
> +		zone->compact_cached_free_pfn = cc->free_pfn;
>  	}
> +	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
> +		cc->migrate_pfn = start_pfn;
> +		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
> +	}
> +
> +	/*
> +	 * Clear pageblock skip if there were failures recently and compaction
> +	 * is about to be retried after being deferred. kswapd does not do
> +	 * this reset as it'll reset the cached information when going to sleep.
> +	 */
> +	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
> +		__reset_isolation_suitable(zone);
>  
>  	migrate_prep_local();
>  
> @@ -787,6 +913,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>  		switch (isolate_migratepages(zone, cc)) {
>  		case ISOLATE_ABORT:
>  			ret = COMPACT_PARTIAL;
> +			putback_lru_pages(&cc->migratepages);
> +			cc->nr_migratepages = 0;
>  			goto out;
>  		case ISOLATE_NONE:
>  			continue;
> @@ -831,6 +959,7 @@ static unsigned long compact_zone_order(struct zone *zone,
>  				 int order, gfp_t gfp_mask,
>  				 bool sync, bool *contended)
>  {
> +	unsigned long ret;
>  	struct compact_control cc = {
>  		.nr_freepages = 0,
>  		.nr_migratepages = 0,
> @@ -838,12 +967,17 @@ static unsigned long compact_zone_order(struct zone *zone,
>  		.migratetype = allocflags_to_migratetype(gfp_mask),
>  		.zone = zone,
>  		.sync = sync,
> -		.contended = contended,
>  	};
>  	INIT_LIST_HEAD(&cc.freepages);
>  	INIT_LIST_HEAD(&cc.migratepages);
>  
> -	return compact_zone(zone, &cc);
> +	ret = compact_zone(zone, &cc);
> +
> +	VM_BUG_ON(!list_empty(&cc.freepages));
> +	VM_BUG_ON(!list_empty(&cc.migratepages));
> +
> +	*contended = cc.contended;
> +	return ret;
>  }
>  
>  int sysctl_extfrag_threshold = 500;
> @@ -855,6 +989,8 @@ int sysctl_extfrag_threshold = 500;
>   * @gfp_mask: The GFP mask of the current allocation
>   * @nodemask: The allowed nodes to allocate from
>   * @sync: Whether migration is synchronous or not
> + * @contended: Return value that is true if compaction was aborted due to lock contention
> + * @page: Optionally capture a free page of the requested order during compaction
>   *
>   * This is the main entry point for direct page compaction.
>   */
> diff --git a/mm/internal.h b/mm/internal.h
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -118,23 +118,23 @@ struct compact_control {
>  	unsigned long nr_freepages;	/* Number of isolated free pages */
>  	unsigned long nr_migratepages;	/* Number of pages to migrate */
>  	unsigned long free_pfn;		/* isolate_freepages search base */
> -	unsigned long start_free_pfn;	/* where we started the search */
>  	unsigned long migrate_pfn;	/* isolate_migratepages search base */
>  	bool sync;			/* Synchronous migration */
> -	bool wrapped;			/* Order > 0 compactions are
> -					   incremental, once free_pfn
> -					   and migrate_pfn meet, we restart
> -					   from the top of the zone;
> -					   remember we wrapped around. */
> +	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
> +	bool finished_update_free;	/* True when the zone cached pfns are
> +					 * no longer being updated
> +					 */
> +	bool finished_update_migrate;
>  
>  	int order;			/* order a direct compactor needs */
>  	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
>  	struct zone *zone;
> -	bool *contended;		/* True if a lock was contended */
> +	bool contended;			/* True if a lock was contended */
>  };
>  
>  unsigned long
> -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn);
> +isolate_freepages_range(struct compact_control *cc,
> +			unsigned long start_pfn, unsigned long end_pfn);
>  unsigned long
>  isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>  			   unsigned long low_pfn, unsigned long end_pfn);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2131,6 +2131,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>  				alloc_flags & ~ALLOC_NO_WATERMARKS,
>  				preferred_zone, migratetype);
>  		if (page) {
> +			preferred_zone->compact_blockskip_flush = false;
>  			preferred_zone->compact_considered = 0;
>  			preferred_zone->compact_defer_shift = 0;
>  			if (order >= preferred_zone->compact_order_failed)
> @@ -4438,11 +4439,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>  
>  		zone->spanned_pages = size;
>  		zone->present_pages = realsize;
> -#if defined CONFIG_COMPACTION || defined CONFIG_CMA
> -		zone->compact_cached_free_pfn = zone->zone_start_pfn +
> -						zone->spanned_pages;
> -		zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
> -#endif
>  #ifdef CONFIG_NUMA
>  		zone->node = nid;
>  		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
> @@ -5632,7 +5628,8 @@ __alloc_contig_migrate_alloc(struct page *page, unsigned long private,
>  }
>  
>  /* [start, end) must belong to a single zone. */
> -static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
> +static int __alloc_contig_migrate_range(struct compact_control *cc,
> +					unsigned long start, unsigned long end)
>  {
>  	/* This function is based on compact_zone() from compaction.c. */
>  
> @@ -5640,25 +5637,17 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
>  	unsigned int tries = 0;
>  	int ret = 0;
>  
> -	struct compact_control cc = {
> -		.nr_migratepages = 0,
> -		.order = -1,
> -		.zone = page_zone(pfn_to_page(start)),
> -		.sync = true,
> -	};
> -	INIT_LIST_HEAD(&cc.migratepages);
> -
>  	migrate_prep_local();
>  
> -	while (pfn < end || !list_empty(&cc.migratepages)) {
> +	while (pfn < end || !list_empty(&cc->migratepages)) {
>  		if (fatal_signal_pending(current)) {
>  			ret = -EINTR;
>  			break;
>  		}
>  
> -		if (list_empty(&cc.migratepages)) {
> -			cc.nr_migratepages = 0;
> -			pfn = isolate_migratepages_range(cc.zone, &cc,
> +		if (list_empty(&cc->migratepages)) {
> +			cc->nr_migratepages = 0;
> +			pfn = isolate_migratepages_range(cc->zone, cc,
>  							 pfn, end);
>  			if (!pfn) {
>  				ret = -EINTR;
> @@ -5670,12 +5659,12 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
>  			break;
>  		}
>  
> -		ret = migrate_pages(&cc.migratepages,
> +		ret = migrate_pages(&cc->migratepages,
>  				    __alloc_contig_migrate_alloc,
>  				    0, false, MIGRATE_SYNC);
>  	}
>  
> -	putback_lru_pages(&cc.migratepages);
> +	putback_lru_pages(&cc->migratepages);
>  	return ret > 0 ? 0 : ret;
>  }
>  
> @@ -5754,6 +5743,15 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  	unsigned long outer_start, outer_end;
>  	int ret = 0, order;
>  
> +	struct compact_control cc = {
> +		.nr_migratepages = 0,
> +		.order = -1,
> +		.zone = page_zone(pfn_to_page(start)),
> +		.sync = true,
> +		.ignore_skip_hint = true,
> +	};
> +	INIT_LIST_HEAD(&cc.migratepages);
> +
>  	/*
>  	 * What we do here is we mark all pageblocks in range as
>  	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
> @@ -5783,7 +5781,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  	if (ret)
>  		goto done;
>  
> -	ret = __alloc_contig_migrate_range(start, end);
> +	ret = __alloc_contig_migrate_range(&cc, start, end);
>  	if (ret)
>  		goto done;
>  
> @@ -5832,7 +5830,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>  	__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
>  
>  	/* Grab isolated pages from freelists. */
> -	outer_end = isolate_freepages_range(outer_start, end);
> +	outer_end = isolate_freepages_range(&cc, outer_start, end);
>  	if (!outer_end) {
>  		ret = -EBUSY;
>  		goto done;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2839,6 +2839,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
>  		 */
>  		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
>  
> +		/*
> +		 * Compaction records what page blocks it recently failed to
> +		 * isolate pages from and skips them in the future scanning.
> +		 * When kswapd is going to sleep, it is reasonable to assume
> +		 * that pages and compaction may succeed so reset the cache.
> +		 */
> +		reset_isolation_suitable(pgdat);
> +
>  		if (!kthread_should_stop())
>  			schedule();
>  
> 

-- 
 Marc Duponcheel
 Velodroomstraat 74 - 2600 Berchem - Belgium
 +32 (0)478 68.10.91 - marc@...line.be
