Message-ID: <a05642935d6e38963d8d71de1570d9ffbfa96ab6.1755677674.git.baolin.wang@linux.alibaba.com>
Date: Wed, 20 Aug 2025 17:07:15 +0800
From: Baolin Wang <baolin.wang@...ux.alibaba.com>
To: akpm@...ux-foundation.org,
hughd@...gle.com,
david@...hat.com,
lorenzo.stoakes@...cle.com
Cc: ziy@...dia.com,
Liam.Howlett@...cle.com,
npache@...hat.com,
ryan.roberts@....com,
dev.jain@....com,
baohua@...nel.org,
baolin.wang@...ux.alibaba.com,
linux-mm@...ck.org,
linux-kernel@...r.kernel.org
Subject: [RFC PATCH 04/11] mm: khugepaged: add shmem/file mTHP collapse support
Khugepaged already supports collapsing anonymous memory to mTHP. Similarly,
let khugepaged also support shmem/file mTHP collapse. The strategy for
shmem/file mTHP collapse follows that of the anonymous mTHP collapse,
which is, quoting from Nico:
"while scanning PMD ranges for potential collapse candidates, keep
track of pages in KHUGEPAGED_MIN_MTHP_ORDER chunks via a bitmap. Each bit
represents a utilized region of order KHUGEPAGED_MIN_MTHP_ORDER PTEs.
After the scan is complete, we will perform binary recursion on the bitmap
to determine which mTHP size would be most efficient to collapse to. The
'max_ptes_none' will be scaled by the attempted collapse order to determine
how full a THP must be to be eligible.
"
Moreover, to facilitate the scanning of shmem/file folios, extend the
'cc->mthp_bitmap_temp' bitmap to record whether each index within the
PMD range corresponds to a present page; this temp bitmap is then used
to determine whether each KHUGEPAGED_MIN_MTHP_NR-sized chunk should be
marked as present for mTHP collapse.
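As an illustration only (a minimal userspace sketch, not part of the patch;
the chunk size and threshold below are made-up stand-ins for
KHUGEPAGED_MIN_MTHP_NR and the scaled max_ptes_none), folding the per-index
presence bits into per-chunk bits works roughly like this:

#include <stdio.h>
#include <stdbool.h>

#define PMD_NR   512	/* pages per PMD range with 4K base pages */
#define CHUNK_NR  16	/* illustrative chunk size */

int main(void)
{
	bool present[PMD_NR] = { false };
	int scaled_none = 1;	/* illustrative per-chunk 'none' budget */
	int chunk, i;

	/* Pretend the scan found indexes 0-99 present in the page cache. */
	for (i = 0; i < 100; i++)
		present[i] = true;

	/* Mark a chunk as a candidate when it is "almost full". */
	for (chunk = 0; chunk < PMD_NR / CHUNK_NR; chunk++) {
		int weight = 0;

		for (i = 0; i < CHUNK_NR; i++)
			weight += present[chunk * CHUNK_NR + i];

		if (weight > CHUNK_NR - scaled_none)
			printf("chunk %d is a collapse candidate\n", chunk);
	}
	return 0;
}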
Currently, collapse_pte_mapped_thp() does not build the mapping for mTHP,
because we still expect to establish the mTHP mapping via refault under the
control of fault_around. So collapse_pte_mapped_thp() remains responsible
only for building the mapping for PMD-sized THP, which is reasonable and
makes life easier.
Note that we do not need to remove pte page tables for shmem/file mTHP
collapse.
Signed-off-by: Baolin Wang <baolin.wang@...ux.alibaba.com>
---
mm/khugepaged.c | 133 ++++++++++++++++++++++++++++++++++++++----------
1 file changed, 107 insertions(+), 26 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 195c26699118..53ca7bb72fbc 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -113,7 +113,7 @@ struct collapse_control {
* 1bit = order KHUGEPAGED_MIN_MTHP_ORDER mTHP
*/
DECLARE_BITMAP(mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
- DECLARE_BITMAP(mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
+ DECLARE_BITMAP(mthp_bitmap_temp, HPAGE_PMD_NR);
struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_SIZE];
};
@@ -147,6 +147,10 @@ static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
+static int collapse_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc, int order);
+
#ifdef CONFIG_SYSFS
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -1366,7 +1370,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
/* Recursive function to consume the bitmap */
static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
- int referenced, int unmapped, struct collapse_control *cc,
+ struct file *file, int referenced, int unmapped,
+ pgoff_t start, struct collapse_control *cc,
bool *mmap_locked, unsigned long enabled_orders)
{
u8 order, next_order;
@@ -1401,10 +1406,14 @@ static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
/* Check if the region is "almost full" based on the threshold */
if (bits_set > threshold_bits || is_pmd_only
- || test_bit(order, &huge_anon_orders_always)) {
- ret = collapse_huge_page(mm, address, referenced, unmapped,
- cc, mmap_locked, order,
- offset * KHUGEPAGED_MIN_MTHP_NR);
+ || (!file && test_bit(order, &huge_anon_orders_always))) {
+ if (file)
+ ret = collapse_file(mm, address, file,
+ start + offset * KHUGEPAGED_MIN_MTHP_NR, cc, order);
+ else
+ ret = collapse_huge_page(mm, address, referenced, unmapped,
+ cc, mmap_locked, order,
+ offset * KHUGEPAGED_MIN_MTHP_NR);
/*
* Analyze failure reason to determine next action:
@@ -1418,6 +1427,7 @@ static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
collapsed += (1 << order);
case SCAN_PAGE_RO:
case SCAN_PTE_MAPPED_HUGEPAGE:
+ case SCAN_PAGE_COMPOUND:
continue;
/* Cases were lower orders might still succeed */
case SCAN_LACK_REFERENCED_PAGE:
@@ -1481,7 +1491,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
goto out;
bitmap_zero(cc->mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
- bitmap_zero(cc->mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
+ bitmap_zero(cc->mthp_bitmap_temp, HPAGE_PMD_NR);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
@@ -1649,8 +1659,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
out_unmap:
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
- result = collapse_scan_bitmap(mm, address, referenced, unmapped, cc,
- mmap_locked, enabled_orders);
+ result = collapse_scan_bitmap(mm, address, NULL, referenced, unmapped,
+ 0, cc, mmap_locked, enabled_orders);
if (result > 0)
result = SCAN_SUCCEED;
else
@@ -2067,6 +2077,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
struct collapse_control *cc,
int order)
{
+ int max_scaled_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
struct address_space *mapping = file->f_mapping;
struct page *dst;
struct folio *folio, *tmp, *new_folio;
@@ -2128,9 +2139,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
}
nr_none++;
- if (cc->is_khugepaged && nr_none > khugepaged_max_ptes_none) {
+ if (cc->is_khugepaged && nr_none > max_scaled_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
goto xa_locked;
}
@@ -2223,6 +2235,18 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
goto out_unlock;
}
+ /*
+ * If the folio order is greater than or equal to the collapse order,
+ * there is no need to continue attempting the collapse. Return
+ * SCAN_PAGE_COMPOUND instead of SCAN_PTE_MAPPED_HUGEPAGE, so that the
+ * mapping can be built under the control of fault_around when
+ * refaulting.
+ */
+ if (folio_order(folio) >= order) {
+ result = SCAN_PAGE_COMPOUND;
+ goto out_unlock;
+ }
+
if (folio_mapping(folio) != mapping) {
result = SCAN_TRUNCATED;
goto out_unlock;
@@ -2443,12 +2467,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
xas_unlock_irq(&xas);
/*
- * Remove pte page tables, so we can re-fault the page as huge.
+ * Remove pte page tables for PMD-sized THP collapse, so we can re-fault
+ * the page as huge.
* If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
*/
- retract_page_tables(mapping, start);
- if (cc && !cc->is_khugepaged)
- result = SCAN_PTE_MAPPED_HUGEPAGE;
+ if (order == HPAGE_PMD_ORDER)
+ retract_page_tables(mapping, start);
folio_unlock(new_folio);
/*
@@ -2504,21 +2528,35 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
return result;
}
-static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
+static int collapse_scan_file(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, struct file *file, pgoff_t start,
struct collapse_control *cc)
{
+ int max_scaled_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER);
+ enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
- int present, swap;
+ int present, swap, nr_pages;
+ unsigned long enabled_orders;
int node = NUMA_NO_NODE;
int result = SCAN_SUCCEED;
+ bool is_pmd_only;
present = 0;
swap = 0;
+ bitmap_zero(cc->mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
+ bitmap_zero(cc->mthp_bitmap_temp, HPAGE_PMD_NR);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
+
+ if (cc->is_khugepaged)
+ enabled_orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+ type, THP_ORDERS_ALL_FILE_DEFAULT);
+ else
+ enabled_orders = BIT(HPAGE_PMD_ORDER);
+ is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));
+
rcu_read_lock();
xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
if (xas_retry(&xas, folio))
@@ -2587,7 +2625,20 @@ static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
* is just too costly...
*/
- present += folio_nr_pages(folio);
+ nr_pages = folio_nr_pages(folio);
+ present += nr_pages;
+
+ /*
+ * If there are folios present, keep track of them in the bitmap
+ * for file/shmem mTHP collapse.
+ */
+ if (!is_pmd_only) {
+ pgoff_t pgoff = max_t(pgoff_t, start, folio->index) - start;
+
+ nr_pages = min_t(int, HPAGE_PMD_NR - pgoff, nr_pages);
+ bitmap_set(cc->mthp_bitmap_temp, pgoff, nr_pages);
+ }
+
folio_put(folio);
if (need_resched()) {
@@ -2597,16 +2648,46 @@ static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
}
rcu_read_unlock();
- if (result == SCAN_SUCCEED) {
- if (cc->is_khugepaged &&
- present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
- result = SCAN_EXCEED_NONE_PTE;
- count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
- } else {
- result = collapse_file(mm, addr, file, start, cc, HPAGE_PMD_ORDER);
+ if (result != SCAN_SUCCEED)
+ goto out;
+
+ if (cc->is_khugepaged && is_pmd_only &&
+ present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ goto out;
+ }
+
+ /*
+ * Check each KHUGEPAGED_MIN_MTHP_NR-page chunk, and set its bit in
+ * the bitmap if the chunk has enough present folios.
+ */
+ if (!is_pmd_only) {
+ int i;
+
+ for (i = 0; i < HPAGE_PMD_NR; i += KHUGEPAGED_MIN_MTHP_NR) {
+ if (bitmap_weight(cc->mthp_bitmap_temp, KHUGEPAGED_MIN_MTHP_NR) >
+ KHUGEPAGED_MIN_MTHP_NR - max_scaled_none)
+ bitmap_set(cc->mthp_bitmap, i / KHUGEPAGED_MIN_MTHP_NR, 1);
+
+ bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap_temp,
+ KHUGEPAGED_MIN_MTHP_NR, HPAGE_PMD_NR);
}
+
+ bitmap_zero(cc->mthp_bitmap_temp, HPAGE_PMD_NR);
+ }
+ result = collapse_scan_bitmap(mm, addr, file, 0, 0, start,
+ cc, NULL, enabled_orders);
+ if (result > 0) {
+ if (cc && !cc->is_khugepaged)
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
+ else
+ result = SCAN_SUCCEED;
+ } else {
+ result = SCAN_FAIL;
}
+out:
trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
return result;
}
@@ -2628,7 +2709,7 @@ static int collapse_single_pmd(unsigned long addr,
mmap_read_unlock(mm);
*mmap_locked = false;
- result = collapse_scan_file(mm, addr, file, pgoff, cc);
+ result = collapse_scan_file(mm, vma, addr, file, pgoff, cc);
fput(file);
if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
mmap_read_lock(mm);
--
2.43.5