lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <AF8E6576-CD38-4655-9CDE-DADB79DA1743@nvidia.com>
Date:   Mon, 14 Feb 2022 11:03:15 -0500
From:   Zi Yan <ziy@...dia.com>
To:     Christophe Leroy <christophe.leroy@...roup.eu>
Cc:     David Hildenbrand <david@...hat.com>, linux-mm@...ck.org,
        Mel Gorman <mgorman@...hsingularity.net>,
        Oscar Salvador <osalvador@...e.de>,
        Robin Murphy <robin.murphy@....com>,
        linux-kernel@...r.kernel.org, iommu@...ts.linux-foundation.org,
        Mike Rapoport <rppt@...nel.org>,
        Eric Ren <renzhengeek@...il.com>,
        virtualization@...ts.linux-foundation.org,
        linuxppc-dev@...ts.ozlabs.org, Christoph Hellwig <hch@....de>,
        Vlastimil Babka <vbabka@...e.cz>,
        Marek Szyprowski <m.szyprowski@...sung.com>
Subject: Re: [PATCH v5 3/6] mm: make alloc_contig_range work at pageblock granularity

On 14 Feb 2022, at 2:59, Christophe Leroy wrote:

> Le 11/02/2022 à 17:41, Zi Yan a écrit :
>> From: Zi Yan <ziy@...dia.com>
>>
>> alloc_contig_range() worked at MAX_ORDER-1 granularity to avoid merging
>> pageblocks with different migratetypes. It might unnecessarily convert
>> extra pageblocks at the beginning and at the end of the range. Change
>> alloc_contig_range() to work at pageblock granularity.
>>
>> Special handling is needed for free pages and in-use pages across the
>> boundaries of the range specified alloc_contig_range(). Because these
>> partially isolated pages causes free page accounting issues. The free
>> pages will be split and freed into separate migratetype lists; the
>> in-use pages will be migrated then the freed pages will be handled.
>>
>> Signed-off-by: Zi Yan <ziy@...dia.com>
>> ---
>>   include/linux/page-isolation.h |   2 +-
>>   mm/internal.h                  |   3 +
>>   mm/memory_hotplug.c            |   3 +-
>>   mm/page_alloc.c                | 235 +++++++++++++++++++++++++--------
>>   mm/page_isolation.c            |  33 ++++-
>>   5 files changed, 211 insertions(+), 65 deletions(-)
>>
>> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
>> index 4ef7be6def83..78ff940cc169 100644
>> --- a/include/linux/page-isolation.h
>> +++ b/include/linux/page-isolation.h
>> @@ -54,7 +54,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
>>    */
>>   int
>>   start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>> -			 unsigned migratetype, int flags);
>> +			 unsigned migratetype, int flags, gfp_t gfp_flags);
>>
>>   /*
>>    * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
>> diff --git a/mm/internal.h b/mm/internal.h
>> index 0d240e876831..509cbdc25992 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -319,6 +319,9 @@ isolate_freepages_range(struct compact_control *cc,
>>   int
>>   isolate_migratepages_range(struct compact_control *cc,
>>   			   unsigned long low_pfn, unsigned long end_pfn);
>> +
>> +int
>> +isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, int isolate_before_boundary);
>>   #endif
>>   int find_suitable_fallback(struct free_area *area, unsigned int order,
>>   			int migratetype, bool only_stealable, bool *can_steal);
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index ce68098832aa..82406d2f3e46 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1863,7 +1863,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>   	/* set above range as isolated */
>>   	ret = start_isolate_page_range(start_pfn, end_pfn,
>>   				       MIGRATE_MOVABLE,
>> -				       MEMORY_OFFLINE | REPORT_FAILURE);
>> +				       MEMORY_OFFLINE | REPORT_FAILURE,
>> +				       GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
>>   	if (ret) {
>>   		reason = "failure to isolate range";
>>   		goto failed_removal_pcplists_disabled;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 62ef78f3d771..7a4fa21aea5c 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -8985,7 +8985,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
>>   #endif
>>
>>   /* [start, end) must belong to a single zone. */
>> -static int __alloc_contig_migrate_range(struct compact_control *cc,
>> +int __alloc_contig_migrate_range(struct compact_control *cc,
>>   					unsigned long start, unsigned long end)
>>   {
>>   	/* This function is based on compact_zone() from compaction.c. */
>> @@ -9043,6 +9043,167 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
>>   	return 0;
>>   }
>>
>> +/**
>> + * split_free_page() -- split a free page at split_pfn_offset
>> + * @free_page:		the original free page
>> + * @order:		the order of the page
>> + * @split_pfn_offset:	split offset within the page
>> + *
>> + * It is used when the free page crosses two pageblocks with different migratetypes
>> + * at split_pfn_offset within the page. The split free page will be put into
>> + * separate migratetype lists afterwards. Otherwise, the function achieves
>> + * nothing.
>> + */
>> +static inline void split_free_page(struct page *free_page,
>> +				int order, unsigned long split_pfn_offset)
>> +{
>> +	struct zone *zone = page_zone(free_page);
>> +	unsigned long free_page_pfn = page_to_pfn(free_page);
>> +	unsigned long pfn;
>> +	unsigned long flags;
>> +	int free_page_order;
>> +
>> +	spin_lock_irqsave(&zone->lock, flags);
>> +	del_page_from_free_list(free_page, zone, order);
>> +	for (pfn = free_page_pfn;
>> +	     pfn < free_page_pfn + (1UL << order);) {
>> +		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
>> +
>> +		free_page_order = order_base_2(split_pfn_offset);
>> +		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
>> +				mt, FPI_NONE);
>> +		pfn += 1UL << free_page_order;
>> +		split_pfn_offset -= (1UL << free_page_order);
>> +		/* we have done the first part, now switch to second part */
>> +		if (split_pfn_offset == 0)
>> +			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
>> +	}
>> +	spin_unlock_irqrestore(&zone->lock, flags);
>> +}
>> +
>> +/**
>> + * isolate_single_pageblock() -- tries to isolate a pageblock that might be
>> + * within a free or in-use page.
>> + * @boundary_pfn:		pageblock-aligned pfn that a page might cross
>> + * @gfp_flags:			GFP flags used for migrating pages
>> + * @isolate_before_boundary:	isolate the pageblock before (1) or after (0)
>> + *				the boundary_pfn
>> + *
>> + * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
>> + * pageblock. When not all pageblocks within a page are isolated at the same
>> + * time, free page accounting can go wrong. For example, in the case of
>> + * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks.
>> + * [        MAX_ORDER-1          ]
>> + * [  pageblock0  |  pageblock1  ]
>> + * When either pageblock is isolated, if it is a free page, the page is not
>> + * split into separate migratetype lists, which is supposed to; if it is an
>> + * in-use page and freed later, __free_one_page() does not split the free page
>> + * either. The function handles this by splitting the free page or migrating
>> + * the in-use page then splitting the free page.
>> + */
>> +int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
>> +			int isolate_before_boundary)
>
> Do you need such big param name ?

I am happy to take any suggestion.

>
> See
> https://www.kernel.org/doc/html/latest/process/coding-style.html?highlight=style#naming
>
> isolate_before_boundary could probably be shorter.

isolate_before instead?

>
> And should be a bool by the way.

Sure.
>
>> +{
>> +	unsigned char saved_mt;
>> +	/*
>> +	 * scan at max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) aligned range to
>> +	 * avoid isolate pageblocks belonging to a bigger free or in-use page
>> +	 */
>> +	unsigned long start_pfn = pfn_max_align_down(boundary_pfn);
>> +	unsigned long isolated_pageblock_pfn;
>
> Variable name too long.
>
>> +	unsigned long pfn;
>> +
>> +	VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
>> +
>> +	if (isolate_before_boundary)
>> +		isolated_pageblock_pfn = boundary_pfn - pageblock_nr_pages;
>> +	else
>> +		isolated_pageblock_pfn = boundary_pfn;
>> +
>> +	saved_mt = get_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn));
>> +	set_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn), MIGRATE_ISOLATE);
>> +
>> +	for (pfn = start_pfn; pfn < boundary_pfn;) {
>
> This loop is a bit long a deep. Isn't there a way to put what's in "if
> (PageHuge(page) || PageTransCompound(page))" into a sub-function ?
>

Let me give it a try.

> See
> https://www.kernel.org/doc/html/latest/process/coding-style.html?highlight=style#functions
>

Thanks for the review.


>> +		struct page *page = pfn_to_page(pfn);
>> +
>> +		/*
>> +		 * start_pfn is max(MAX_ORDER_NR_PAGES, pageblock_nr_pages)
>> +		 * aligned, if there is any free pages in [start_pfn, boundary_pfn),
>> +		 * its head page will always be in the range.
>> +		 */
>> +		if (PageBuddy(page)) {
>> +			int order = buddy_order(page);
>> +
>> +			if (pfn + (1UL << order) > boundary_pfn)
>> +				split_free_page(page, order, boundary_pfn - pfn);
>> +			pfn += (1UL << order);
>> +			continue;
>> +		}
>> +		/*
>> +		 * migrate compound pages then let the free page handling code
>> +		 * above do the rest
>> +		 */
>> +		if (PageHuge(page) || PageTransCompound(page)) {
>> +			unsigned long nr_pages = compound_nr(page);
>> +			int order = compound_order(page);
>> +			struct page *head = compound_head(page);
>> +			unsigned long head_pfn = page_to_pfn(head);
>> +
>> +			if (head_pfn + nr_pages >= boundary_pfn) {
>> +				int ret;
>> +				struct compact_control cc = {
>> +					.nr_migratepages = 0,
>> +					.order = -1,
>> +					.zone = page_zone(pfn_to_page(head_pfn)),
>> +					.mode = MIGRATE_SYNC,
>> +					.ignore_skip_hint = true,
>> +					.no_set_skip_hint = true,
>> +					.gfp_mask = current_gfp_context(gfp_flags),
>> +					.alloc_contig = true,
>> +				};
>> +
>> +				INIT_LIST_HEAD(&cc.migratepages);
>> +
>> +				ret = __alloc_contig_migrate_range(&cc, head_pfn,
>> +							head_pfn + nr_pages);
>> +
>> +				if (ret) {
>> +					/* restore the original migratetype */
>> +					set_pageblock_migratetype(
>> +						pfn_to_page(isolated_pageblock_pfn),
>> +						saved_mt);
>> +					return -EBUSY;
>> +				}
>> +				/*
>> +				 * reset pfn, let the free page handling code
>> +				 * above split the free page to the right
>> +				 * migratetype list.
>> +				 *
>> +				 * head_pfn is not used here as a hugetlb page
>> +				 * order can be bigger than MAX_ORDER-1, but
>> +				 * after it is freed, the free page order is not.
>> +				 * Use pfn within the range to find the head of
>> +				 * the free page and reset order to 0 if a hugetlb
>> +				 * page with >MAX_ORDER-1 order is encountered.
>> +				 */
>> +				if (order > MAX_ORDER-1)
>> +					order = 0;
>> +				while (!PageBuddy(pfn_to_page(pfn))) {
>> +					order++;
>> +					pfn &= ~0UL << order;
>> +				}
>> +				continue;
>> +			}
>> +			pfn += nr_pages;
>> +			continue;
>> +		}
>> +
>> +		pfn++;
>> +	}
>> +	return 0;
>> +}
>> +
>> +
>>   /**
>>    * alloc_contig_range() -- tries to allocate given range of pages
>>    * @start:	start PFN to allocate
>> @@ -9067,8 +9228,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
>>   int alloc_contig_range(unsigned long start, unsigned long end,
>>   		       unsigned migratetype, gfp_t gfp_mask)
>>   {
>> -	unsigned long outer_start, outer_end;
>> -	unsigned int order;
>> +	unsigned long outer_end;
>> +	unsigned long alloc_start = ALIGN_DOWN(start, pageblock_nr_pages);
>> +	unsigned long alloc_end = ALIGN(end, pageblock_nr_pages);
>>   	int ret = 0;
>>
>>   	struct compact_control cc = {
>> @@ -9087,14 +9249,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>   	 * What we do here is we mark all pageblocks in range as
>>   	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
>>   	 * have different sizes, and due to the way page allocator
>> -	 * work, we align the range to biggest of the two pages so
>> -	 * that page allocator won't try to merge buddies from
>> -	 * different pageblocks and change MIGRATE_ISOLATE to some
>> -	 * other migration type.
>> +	 * work, start_isolate_page_range() has special handlings for this.
>>   	 *
>>   	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
>>   	 * migrate the pages from an unaligned range (ie. pages that
>> -	 * we are interested in).  This will put all the pages in
>> +	 * we are interested in). This will put all the pages in
>>   	 * range back to page allocator as MIGRATE_ISOLATE.
>>   	 *
>>   	 * When this is done, we take the pages in range from page
>> @@ -9107,9 +9266,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>   	 * put back to page allocator so that buddy can use them.
>>   	 */
>>
>> -	ret = start_isolate_page_range(start, end, migratetype, 0);
>> +	ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
>>   	if (ret)
>> -		return ret;
>> +		goto done;
>>
>>   	drain_all_pages(cc.zone);
>>
>> @@ -9128,68 +9287,28 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>>   		goto done;
>>   	ret = 0;
>>
>> -	/*
>> -	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
>> -	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
>> -	 * more, all pages in [start, end) are free in page allocator.
>> -	 * What we are going to do is to allocate all pages from
>> -	 * [start, end) (that is remove them from page allocator).
>> -	 *
>> -	 * The only problem is that pages at the beginning and at the
>> -	 * end of interesting range may be not aligned with pages that
>> -	 * page allocator holds, ie. they can be part of higher order
>> -	 * pages.  Because of this, we reserve the bigger range and
>> -	 * once this is done free the pages we are not interested in.
>> -	 *
>> -	 * We don't have to hold zone->lock here because the pages are
>> -	 * isolated thus they won't get removed from buddy.
>> -	 */
>> -
>> -	order = 0;
>> -	outer_start = start;
>> -	while (!PageBuddy(pfn_to_page(outer_start))) {
>> -		if (++order >= MAX_ORDER) {
>> -			outer_start = start;
>> -			break;
>> -		}
>> -		outer_start &= ~0UL << order;
>> -	}
>> -
>> -	if (outer_start != start) {
>> -		order = buddy_order(pfn_to_page(outer_start));
>> -
>> -		/*
>> -		 * outer_start page could be small order buddy page and
>> -		 * it doesn't include start page. Adjust outer_start
>> -		 * in this case to report failed page properly
>> -		 * on tracepoint in test_pages_isolated()
>> -		 */
>> -		if (outer_start + (1UL << order) <= start)
>> -			outer_start = start;
>> -	}
>> -
>>   	/* Make sure the range is really isolated. */
>> -	if (test_pages_isolated(outer_start, end, 0)) {
>> +	if (test_pages_isolated(alloc_start, alloc_end, 0)) {
>>   		ret = -EBUSY;
>>   		goto done;
>>   	}
>>
>>   	/* Grab isolated pages from freelists. */
>> -	outer_end = isolate_freepages_range(&cc, outer_start, end);
>> +	outer_end = isolate_freepages_range(&cc, alloc_start, alloc_end);
>>   	if (!outer_end) {
>>   		ret = -EBUSY;
>>   		goto done;
>>   	}
>>
>>   	/* Free head and tail (if any) */
>> -	if (start != outer_start)
>> -		free_contig_range(outer_start, start - outer_start);
>> -	if (end != outer_end)
>> -		free_contig_range(end, outer_end - end);
>> +	if (start != alloc_start)
>> +		free_contig_range(alloc_start, start - alloc_start);
>> +	if (end != alloc_end)
>> +		free_contig_range(end, alloc_end - end);
>>
>>   done:
>> -	undo_isolate_page_range(pfn_max_align_down(start),
>> -				pfn_max_align_up(end), migratetype);
>> +	undo_isolate_page_range(alloc_start,
>> +				alloc_end, migratetype);
>>   	return ret;
>>   }
>>   EXPORT_SYMBOL(alloc_contig_range);
>> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
>> index 64d093ab83ec..0256d5e1032c 100644
>> --- a/mm/page_isolation.c
>> +++ b/mm/page_isolation.c
>> @@ -285,6 +285,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>    *					 and PageOffline() pages.
>>    *			REPORT_FAILURE - report details about the failure to
>>    *			isolate the range
>> + * @gfp_flags:		GFP flags used for migrating pages that sit across the
>> + *			range boundaries.
>>    *
>>    * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
>>    * the range will never be allocated. Any free pages and pages freed in the
>> @@ -293,6 +295,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>    * pages in the range finally, the caller have to free all pages in the range.
>>    * test_page_isolated() can be used for test it.
>>    *
>> + * The function first tries to isolate the pageblocks at the beginning and end
>> + * of the range, since there might be pages across the range boundaries.
>> + * Afterwards, it isolates the rest of the range.
>> + *
>>    * There is no high level synchronization mechanism that prevents two threads
>>    * from trying to isolate overlapping ranges. If this happens, one thread
>>    * will notice pageblocks in the overlapping range already set to isolate.
>> @@ -313,21 +319,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>>    * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
>>    */
>>   int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>> -			     unsigned migratetype, int flags)
>> +			     unsigned migratetype, int flags, gfp_t gfp_flags)
>>   {
>>   	unsigned long pfn;
>>   	struct page *page;
>> +	/* isolation is done at page block granularity */
>> +	unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
>> +	unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
>> +	int ret;
>>
>> -	unsigned long isolate_start = pfn_max_align_down(start_pfn);
>> -	unsigned long isolate_end = pfn_max_align_up(end_pfn);
>> +	/* isolate [isolate_start, isolate_start + pageblock_nr_pages] pageblock */
>> +	ret = isolate_single_pageblock(isolate_start, gfp_flags, 0);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* isolate [isolate_end - pageblock_nr_pages, isolate_end] pageblock */
>> +	ret = isolate_single_pageblock(isolate_end, gfp_flags, 1);
>> +	if (ret) {
>> +		unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
>> +		return ret;
>> +	}
>>
>> -	for (pfn = isolate_start;
>> -	     pfn < isolate_end;
>> +	/* skip isolated pageblocks at the beginning and end */
>> +	for (pfn = isolate_start + pageblock_nr_pages;
>> +	     pfn < isolate_end - pageblock_nr_pages;
>>   	     pfn += pageblock_nr_pages) {
>>   		page = __first_valid_page(pfn, pageblock_nr_pages);
>>   		if (page && set_migratetype_isolate(page, migratetype, flags,
>>   					start_pfn, end_pfn)) {
>>   			undo_isolate_page_range(isolate_start, pfn, migratetype);
>> +			unset_migratetype_isolate(
>> +				pfn_to_page(isolate_end - pageblock_nr_pages),
>> +				migratetype);
>>   			return -EBUSY;
>>   		}
>>   	}

--
Best Regards,
Yan, Zi

Download attachment "signature.asc" of type "application/pgp-signature" (855 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ