[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <20220324224435.17794-4-zi.yan@sent.com>
Date: Thu, 24 Mar 2022 18:44:33 -0400
From: Zi Yan <zi.yan@...t.com>
To: David Hildenbrand <david@...hat.com>, linux-mm@...ck.org
Cc: linux-kernel@...r.kernel.org,
virtualization@...ts.linux-foundation.org,
Vlastimil Babka <vbabka@...e.cz>,
Mel Gorman <mgorman@...hsingularity.net>,
Eric Ren <renzhengeek@...il.com>,
Mike Rapoport <rppt@...nel.org>,
Oscar Salvador <osalvador@...e.de>,
Christophe Leroy <christophe.leroy@...roup.eu>,
Zi Yan <ziy@...dia.com>, kernel test robot <lkp@...el.com>
Subject: [PATCH v9 3/5] mm: make alloc_contig_range work at pageblock granularity
From: Zi Yan <ziy@...dia.com>
alloc_contig_range() worked at MAX_ORDER_NR_PAGES granularity to avoid
merging pageblocks with different migratetypes. It might unnecessarily
convert extra pageblocks at the beginning and at the end of the range.
Change alloc_contig_range() to work at pageblock granularity.
Special handling is needed for free pages and in-use pages across the
boundaries of the range specified alloc_contig_range(). Because these
partially isolated pages causes free page accounting issues. The free
pages will be split and freed into separate migratetype lists; the
in-use pages will be migrated then the freed pages will be handled.
Reported-by: kernel test robot <lkp@...el.com>
Signed-off-by: Zi Yan <ziy@...dia.com>
---
include/linux/page-isolation.h | 2 +-
mm/internal.h | 6 ++
mm/memory_hotplug.c | 3 +-
mm/page_alloc.c | 107 +++++++++----------
mm/page_isolation.c | 189 ++++++++++++++++++++++++++++++---
5 files changed, 236 insertions(+), 71 deletions(-)
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index e14eddf6741a..52060514f920 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -42,7 +42,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
*/
int
start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- unsigned migratetype, int flags);
+ unsigned migratetype, int flags, gfp_t gfp_flags);
/*
* Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
diff --git a/mm/internal.h b/mm/internal.h
index 9be0227ccc94..9d0a6a898ba8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -269,6 +269,9 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr,
int nid, bool exact_nid);
+void split_free_page(struct page *free_page,
+ int order, unsigned long split_pfn_offset);
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -332,6 +335,9 @@ isolate_freepages_range(struct compact_control *cc,
int
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
+
+int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end);
#endif
int find_suitable_fallback(struct free_area *area, unsigned int order,
int migratetype, bool only_stealable, bool *can_steal);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 416b38ca8def..1cf4d4b60772 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1836,7 +1836,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
- MEMORY_OFFLINE | REPORT_FAILURE);
+ MEMORY_OFFLINE | REPORT_FAILURE,
+ GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
if (ret) {
reason = "failure to isolate range";
goto failed_removal_pcplists_disabled;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f24fe057389f..57ebc9e41414 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1138,6 +1138,43 @@ static inline void __free_one_page(struct page *page,
page_reporting_notify_free(order);
}
+/**
+ * split_free_page() -- split a free page at split_pfn_offset
+ * @free_page: the original free page
+ * @order: the order of the page
+ * @split_pfn_offset: split offset within the page
+ *
+ * It is used when the free page crosses two pageblocks with different migratetypes
+ * at split_pfn_offset within the page. The split free page will be put into
+ * separate migratetype lists afterwards. Otherwise, the function achieves
+ * nothing.
+ */
+void split_free_page(struct page *free_page,
+ int order, unsigned long split_pfn_offset)
+{
+ struct zone *zone = page_zone(free_page);
+ unsigned long free_page_pfn = page_to_pfn(free_page);
+ unsigned long pfn;
+ unsigned long flags;
+ int free_page_order;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ del_page_from_free_list(free_page, zone, order);
+ for (pfn = free_page_pfn;
+ pfn < free_page_pfn + (1UL << order);) {
+ int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+ free_page_order = ffs(split_pfn_offset) - 1;
+ __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
+ mt, FPI_NONE);
+ pfn += 1UL << free_page_order;
+ split_pfn_offset -= (1UL << free_page_order);
+ /* we have done the first part, now switch to second part */
+ if (split_pfn_offset == 0)
+ split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
/*
* A bad page could be due to a number of fields. Instead of multiple branches,
* try and check multiple fields with one check. The caller must do a detailed
@@ -8959,7 +8996,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
#endif
/* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(struct compact_control *cc,
+int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
@@ -9041,8 +9078,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask)
{
- unsigned long outer_start, outer_end;
- unsigned int order;
+ unsigned long outer_end;
+ unsigned long alloc_start = ALIGN_DOWN(start, pageblock_nr_pages);
+ unsigned long alloc_end = ALIGN(end, pageblock_nr_pages);
int ret = 0;
struct compact_control cc = {
@@ -9061,14 +9099,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
* have different sizes, and due to the way page allocator
- * work, we align the range to biggest of the two pages so
- * that page allocator won't try to merge buddies from
- * different pageblocks and change MIGRATE_ISOLATE to some
- * other migration type.
+ * work, start_isolate_page_range() has special handlings for this.
*
* Once the pageblocks are marked as MIGRATE_ISOLATE, we
* migrate the pages from an unaligned range (ie. pages that
- * we are interested in). This will put all the pages in
+ * we are interested in). This will put all the pages in
* range back to page allocator as MIGRATE_ISOLATE.
*
* When this is done, we take the pages in range from page
@@ -9081,9 +9116,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(start, end, migratetype, 0);
+ ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
if (ret)
- return ret;
+ goto done;
drain_all_pages(cc.zone);
@@ -9102,64 +9137,24 @@ int alloc_contig_range(unsigned long start, unsigned long end,
goto done;
ret = 0;
- /*
- * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
- * aligned blocks that are marked as MIGRATE_ISOLATE. What's
- * more, all pages in [start, end) are free in page allocator.
- * What we are going to do is to allocate all pages from
- * [start, end) (that is remove them from page allocator).
- *
- * The only problem is that pages at the beginning and at the
- * end of interesting range may be not aligned with pages that
- * page allocator holds, ie. they can be part of higher order
- * pages. Because of this, we reserve the bigger range and
- * once this is done free the pages we are not interested in.
- *
- * We don't have to hold zone->lock here because the pages are
- * isolated thus they won't get removed from buddy.
- */
-
- order = 0;
- outer_start = start;
- while (!PageBuddy(pfn_to_page(outer_start))) {
- if (++order >= MAX_ORDER) {
- outer_start = start;
- break;
- }
- outer_start &= ~0UL << order;
- }
-
- if (outer_start != start) {
- order = buddy_order(pfn_to_page(outer_start));
-
- /*
- * outer_start page could be small order buddy page and
- * it doesn't include start page. Adjust outer_start
- * in this case to report failed page properly
- * on tracepoint in test_pages_isolated()
- */
- if (outer_start + (1UL << order) <= start)
- outer_start = start;
- }
-
/* Make sure the range is really isolated. */
- if (test_pages_isolated(outer_start, end, 0)) {
+ if (test_pages_isolated(alloc_start, alloc_end, 0)) {
ret = -EBUSY;
goto done;
}
/* Grab isolated pages from freelists. */
- outer_end = isolate_freepages_range(&cc, outer_start, end);
+ outer_end = isolate_freepages_range(&cc, alloc_start, alloc_end);
if (!outer_end) {
ret = -EBUSY;
goto done;
}
/* Free head and tail (if any) */
- if (start != outer_start)
- free_contig_range(outer_start, start - outer_start);
- if (end != outer_end)
- free_contig_range(end, outer_end - end);
+ if (start != alloc_start)
+ free_contig_range(alloc_start, start - alloc_start);
+ if (end != alloc_end)
+ free_contig_range(end, alloc_end - end);
done:
undo_isolate_page_range(start, end, migratetype);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 0223c9a4cff3..a24a521f62c6 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -284,16 +284,156 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
return NULL;
}
-static unsigned long pfn_max_align_down(unsigned long pfn)
+/**
+ * isolate_single_pageblock() -- tries to isolate a pageblock that might be
+ * within a free or in-use page.
+ * @boundary_pfn: pageblock-aligned pfn that a page might cross
+ * @gfp_flags: GFP flags used for migrating pages
+ * @isolate_before: isolate the pageblock before the boundary_pfn
+ *
+ * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
+ * pageblock. When not all pageblocks within a page are isolated at the same
+ * time, free page accounting can go wrong. For example, in the case of
+ * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks.
+ * [ MAX_ORDER-1 ]
+ * [ pageblock0 | pageblock1 ]
+ * When either pageblock is isolated, if it is a free page, the page is not
+ * split into separate migratetype lists, which is supposed to; if it is an
+ * in-use page and freed later, __free_one_page() does not split the free page
+ * either. The function handles this by splitting the free page or migrating
+ * the in-use page then splitting the free page.
+ */
+static int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
+ bool isolate_before)
{
- return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
-}
+ unsigned char saved_mt;
+ unsigned long start_pfn;
+ unsigned long isolate_pageblock;
+ unsigned long pfn;
+ struct zone *zone;
-static unsigned long pfn_max_align_up(unsigned long pfn)
-{
- return ALIGN(pfn, MAX_ORDER_NR_PAGES);
+ VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
+
+ if (isolate_before)
+ isolate_pageblock = boundary_pfn - pageblock_nr_pages;
+ else
+ isolate_pageblock = boundary_pfn;
+
+ /*
+ * scan at the beginning of MAX_ORDER_NR_PAGES aligned range to avoid
+ * only isolating a subset of pageblocks from a bigger than pageblock
+ * free or in-use page. Also make sure all to-be-isolated pageblocks
+ * are within the same zone.
+ */
+ zone = page_zone(pfn_to_page(isolate_pageblock));
+ start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
+ zone->zone_start_pfn);
+
+ saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+ set_pageblock_migratetype(pfn_to_page(isolate_pageblock), MIGRATE_ISOLATE);
+
+ /*
+ * Bail out early when the to-be-isolated pageblock does not form
+ * a free or in-use page across boundary_pfn:
+ *
+ * 1. isolate before boundary_pfn: the page after is not online
+ * 2. isolate after boundary_pfn: the page before is not online
+ *
+ * This also ensures correctness. Without it, when isolate_before is
+ * false, the page can be NULL in the for loop below.
+ */
+ if (isolate_before) {
+ if (!pfn_to_online_page(boundary_pfn))
+ return 0;
+ } else {
+ if (!pfn_to_online_page(boundary_pfn - 1))
+ return 0;
+ }
+
+ for (pfn = start_pfn; pfn < boundary_pfn;) {
+ struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
+
+ VM_BUG_ON(!page);
+ pfn = page_to_pfn(page);
+ /*
+ * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any
+ * free pages in [start_pfn, boundary_pfn), its head page will
+ * always be in the range.
+ */
+ if (PageBuddy(page)) {
+ int order = buddy_order(page);
+
+ if (pfn + (1UL << order) > boundary_pfn)
+ split_free_page(page, order, boundary_pfn - pfn);
+ pfn += (1UL << order);
+ continue;
+ }
+ /*
+ * migrate compound pages then let the free page handling code
+ * above do the rest. If migration is not enabled, just fail.
+ */
+ if (PageHuge(page) || PageTransCompound(page)) {
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ unsigned long nr_pages = compound_nr(page);
+ int order = compound_order(page);
+ struct page *head = compound_head(page);
+ unsigned long head_pfn = page_to_pfn(head);
+ int ret;
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(head_pfn)),
+ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
+ .no_set_skip_hint = true,
+ .gfp_mask = gfp_flags,
+ .alloc_contig = true,
+ };
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ if (head_pfn + nr_pages < boundary_pfn) {
+ pfn += nr_pages;
+ continue;
+ }
+
+ ret = __alloc_contig_migrate_range(&cc, head_pfn,
+ head_pfn + nr_pages);
+
+ if (ret)
+ goto failed;
+ /*
+ * reset pfn, let the free page handling code above
+ * split the free page to the right migratetype list.
+ *
+ * head_pfn is not used here as a hugetlb page order
+ * can be bigger than MAX_ORDER-1, but after it is
+ * freed, the free page order is not. Use pfn within
+ * the range to find the head of the free page and
+ * reset order to 0 if a hugetlb page with
+ * >MAX_ORDER-1 order is encountered.
+ */
+ if (order > MAX_ORDER-1)
+ order = 0;
+ while (!PageBuddy(pfn_to_page(pfn))) {
+ order++;
+ pfn &= ~0UL << order;
+ }
+ continue;
+#else
+ goto failed;
+#endif
+ }
+
+ pfn++;
+ }
+ return 0;
+failed:
+ /* restore the original migratetype */
+ set_pageblock_migratetype(pfn_to_page(isolate_pageblock), saved_mt);
+ return -EBUSY;
}
+
/**
* start_isolate_page_range() - make page-allocation-type of range of pages to
* be MIGRATE_ISOLATE.
@@ -307,6 +447,8 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
* and PageOffline() pages.
* REPORT_FAILURE - report details about the failure to
* isolate the range
+ * @gfp_flags: GFP flags used for migrating pages that sit across the
+ * range boundaries.
*
* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
* the range will never be allocated. Any free pages and pages freed in the
@@ -315,6 +457,10 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
* pages in the range finally, the caller have to free all pages in the range.
* test_page_isolated() can be used for test it.
*
+ * The function first tries to isolate the pageblocks at the beginning and end
+ * of the range, since there might be pages across the range boundaries.
+ * Afterwards, it isolates the rest of the range.
+ *
* There is no high level synchronization mechanism that prevents two threads
* from trying to isolate overlapping ranges. If this happens, one thread
* will notice pageblocks in the overlapping range already set to isolate.
@@ -335,21 +481,38 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
* Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- unsigned migratetype, int flags)
+ unsigned migratetype, int flags, gfp_t gfp_flags)
{
unsigned long pfn;
struct page *page;
+ /* isolation is done at page block granularity */
+ unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
+ unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
+ int ret;
- unsigned long isolate_start = pfn_max_align_down(start_pfn);
- unsigned long isolate_end = pfn_max_align_up(end_pfn);
+ /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
+ ret = isolate_single_pageblock(isolate_start, gfp_flags, false);
+ if (ret)
+ return ret;
- for (pfn = isolate_start;
- pfn < isolate_end;
+ /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
+ ret = isolate_single_pageblock(isolate_end, gfp_flags, true);
+ if (ret) {
+ unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
+ return ret;
+ }
+
+ /* skip isolated pageblocks at the beginning and end */
+ for (pfn = isolate_start + pageblock_nr_pages;
+ pfn < isolate_end - pageblock_nr_pages;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
if (page && set_migratetype_isolate(page, migratetype, flags,
start_pfn, end_pfn)) {
undo_isolate_page_range(isolate_start, pfn, migratetype);
+ unset_migratetype_isolate(
+ pfn_to_page(isolate_end - pageblock_nr_pages),
+ migratetype);
return -EBUSY;
}
}
@@ -364,8 +527,8 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
{
unsigned long pfn;
struct page *page;
- unsigned long isolate_start = pfn_max_align_down(start_pfn);
- unsigned long isolate_end = pfn_max_align_up(end_pfn);
+ unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
+ unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
for (pfn = isolate_start;
--
2.35.1
Powered by blists - more mailing lists