Message-Id: <20250211111326.14295-10-dev.jain@arm.com>
Date: Tue, 11 Feb 2025 16:43:18 +0530
From: Dev Jain <dev.jain@....com>
To: akpm@...ux-foundation.org,
	david@...hat.com,
	willy@...radead.org,
	kirill.shutemov@...ux.intel.com
Cc: npache@...hat.com,
	ryan.roberts@....com,
	anshuman.khandual@....com,
	catalin.marinas@....com,
	cl@...two.org,
	vbabka@...e.cz,
	mhocko@...e.com,
	apopple@...dia.com,
	dave.hansen@...ux.intel.com,
	will@...nel.org,
	baohua@...nel.org,
	jack@...e.cz,
	srivatsa@...il.mit.edu,
	haowenchao22@...il.com,
	hughd@...gle.com,
	aneesh.kumar@...nel.org,
	yang@...amperecomputing.com,
	peterx@...hat.com,
	ioworker0@...il.com,
	wangkefeng.wang@...wei.com,
	ziy@...dia.com,
	jglisse@...gle.com,
	surenb@...gle.com,
	vishal.moola@...il.com,
	zokeefe@...gle.com,
	zhengqi.arch@...edance.com,
	jhubbard@...dia.com,
	21cnbao@...il.com,
	linux-mm@...ck.org,
	linux-kernel@...r.kernel.org,
	Dev Jain <dev.jain@....com>
Subject: [PATCH v2 09/17] khugepaged: Define collapse policy if a larger folio is already mapped

As noted in [1], khugepaged's goal must be to collapse memory to the
highest aligned order possible. Suppose khugepaged is scanning for 64K
and encounters a 128K folio whose first 64K half is VA-PA aligned and
fully mapped. In that case it does not make sense to break the folio
down into two 64K folios. On the other hand, if the first half is not
aligned, or is only partially mapped, khugepaged should collapse that
portion into a VA-PA aligned, fully mapped 64K folio.

[1] https://lore.kernel.org/all/aa647830-cf55-48f0-98c2-8230796e35b3@arm.com/
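
For illustration, the skip policy can be sketched in plain userspace C
(a minimal sketch, not kernel code): a PTE range is left alone only
when every PFN is present, the PFNs are contiguous, and the first PFN
is aligned to the scan order. range_already_suitably_mapped() and
PFN_NOT_PRESENT are hypothetical names; the real patch additionally
checks via is_same_folio() that the first and last PTE map into the
same folio.

/*
 * Minimal userspace sketch of the "already suitably mapped" check
 * (assumed names, not the kernel implementation).  PFN_NOT_PRESENT
 * stands in for pte_none()/is_swap_pte(); folio membership, which the
 * patch checks via is_same_folio(), is ignored here.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PFN_NOT_PRESENT UINT64_MAX

static bool range_already_suitably_mapped(const uint64_t *pfns,
					  unsigned int order)
{
	unsigned long nr = 1UL << order;
	bool all_present = true, all_contig = true, first_aligned;
	unsigned long i;

	/* First PFN must be aligned to the order being scanned for */
	first_aligned = (pfns[0] != PFN_NOT_PRESENT) && (pfns[0] % nr == 0);

	for (i = 0; i < nr; i++) {
		if (pfns[i] == PFN_NOT_PRESENT) {
			all_present = false;
			continue;
		}
		/* Any hole or non-consecutive neighbour breaks contiguity */
		if (i && (!all_present || pfns[i] != pfns[i - 1] + 1))
			all_contig = false;
	}

	return all_present && all_contig && first_aligned;
}

int main(void)
{
	/* 16 PTEs (order 4): aligned, present, contiguous -> skip collapse */
	uint64_t good[16], bad[16];
	unsigned long i;

	for (i = 0; i < 16; i++) {
		good[i] = 0x1000 + i;	/* 0x1000 is a multiple of 16 */
		bad[i]  = 0x1001 + i;	/* first PFN is not order-aligned */
	}

	printf("good: %s\n",
	       range_already_suitably_mapped(good, 4) ? "skip" : "collapse");
	printf("bad:  %s\n",
	       range_already_suitably_mapped(bad, 4) ? "skip" : "collapse");
	return 0;
}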

Signed-off-by: Dev Jain <dev.jain@....com>
---
 mm/khugepaged.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a674014b6563..0d0d8f415a2e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -34,6 +34,7 @@ enum scan_result {
 	SCAN_PMD_NULL,
 	SCAN_PMD_NONE,
 	SCAN_PMD_MAPPED,
+	SCAN_PTE_MAPPED_THP,
 	SCAN_EXCEED_NONE_PTE,
 	SCAN_EXCEED_SWAP_PTE,
 	SCAN_EXCEED_SHARED_PTE,
@@ -562,6 +563,14 @@ static bool is_refcount_suitable(struct folio *folio)
 	return folio_ref_count(folio) == expected_refcount;
 }
 
+/* Assumes both PTEs are present and embed a valid PFN */
+static bool is_same_folio(pte_t *first_pte, pte_t *last_pte)
+{
+	struct folio *folio1 = page_folio(pte_page(ptep_get(first_pte)));
+	struct folio *folio2 = page_folio(pte_page(ptep_get(last_pte)));
+	return folio1 == folio2;
+}
+
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 					unsigned long address,
 					pte_t *pte,
@@ -575,13 +584,22 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	bool writable = false;
 	unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
 	unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
+	bool all_pfns_present = true;
+	bool all_pfns_contig = true;
+	bool first_pfn_aligned = true;
+	pte_t prev_pteval;
 
 	for (_pte = pte; _pte < pte + (1UL << order);
 	     _pte++, address += PAGE_SIZE) {
 		pte_t pteval = ptep_get(_pte);
+		if (_pte == pte) {
+			if (!IS_ALIGNED(pte_pfn(pteval), (1UL << order)))
+				first_pfn_aligned = false;
+		}
 		if (pte_none(pteval) || (pte_present(pteval) &&
 				is_zero_pfn(pte_pfn(pteval)))) {
 			++none_or_zero;
+			all_pfns_present = false;
 			if (!userfaultfd_armed(vma) &&
 			    (!cc->is_khugepaged ||
 			     none_or_zero <= max_ptes_none)) {
@@ -660,6 +678,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			goto out;
 		}
 
+		if (all_pfns_contig && (pte != _pte) && !(all_pfns_present &&
+		    (pte_pfn(pteval) == pte_pfn(prev_pteval) + 1)))
+			all_pfns_contig = false;
+
+		prev_pteval = pteval;
+
 		/*
 		 * Isolate the page to avoid collapsing an hugepage
 		 * currently in use by the VM.
@@ -696,6 +720,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		result = SCAN_PAGE_RO;
 	} else if (unlikely(cc->is_khugepaged && !referenced)) {
 		result = SCAN_LACK_REFERENCED_PAGE;
+	} else if ((result == SCAN_SUCCEED) && (order != HPAGE_PMD_ORDER) && all_pfns_present &&
+		    all_pfns_contig && first_pfn_aligned &&
+		    is_same_folio(pte, pte + (1UL << order) - 1)) {
+		result = SCAN_PTE_MAPPED_THP;
 	} else {
 		result = SCAN_SUCCEED;
 		trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
@@ -1398,6 +1426,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	bool writable = false;
 	unsigned long orders, orig_orders;
 	int order, prev_order;
+	bool all_pfns_present, all_pfns_contig, first_pfn_aligned;
+	pte_t prev_pteval;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1417,6 +1447,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
 	max_ptes_swap = khugepaged_max_ptes_swap >> (HPAGE_PMD_ORDER - order);
 	referenced = 0, shared = 0, none_or_zero = 0, unmapped = 0;
+	all_pfns_present = true, all_pfns_contig = true, first_pfn_aligned = true;
 
 	/* Check pmd after taking mmap lock */
 	result = find_pmd_or_thp_or_none(mm, address, &pmd);
@@ -1435,8 +1466,14 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	for (_address = address, _pte = pte; _pte < pte + (1UL << order);
 	     _pte++, _address += PAGE_SIZE) {
 		pte_t pteval = ptep_get(_pte);
+		if (_pte == pte) {
+			if (!IS_ALIGNED(pte_pfn(pteval), (1UL << order)))
+				first_pfn_aligned = false;
+		}
+
 		if (is_swap_pte(pteval)) {
 			++unmapped;
+			all_pfns_present = false;
 			if (!cc->is_khugepaged ||
 			    unmapped <= max_ptes_swap) {
 				/*
@@ -1457,6 +1494,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 		}
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
 			++none_or_zero;
+			all_pfns_present = false;
 			if (!userfaultfd_armed(vma) &&
 			    (!cc->is_khugepaged ||
 			     none_or_zero <= max_ptes_none)) {
@@ -1546,6 +1584,17 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 			goto out_unmap;
 		}
 
+
+		/*
+		 * The PFNs are not contiguous if at least one PFN is not present,
+		 * or if the previous and current PFNs are not consecutive.
+		 */
+		if (all_pfns_contig && (pte != _pte) && !(all_pfns_present &&
+		    (pte_pfn(pteval) == pte_pfn(prev_pteval) + 1)))
+			all_pfns_contig = false;
+
+		prev_pteval = pteval;
+
 		/*
 		 * If collapse was initiated by khugepaged, check that there is
 		 * enough young pte to justify collapsing the page
@@ -1567,15 +1616,30 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	}
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+
+	/*
+	 * Skip the collapse if all of the following conditions hold:
+	 * 1) All PTEs point to consecutive PFNs
+	 * 2) All PFNs belong to the same folio
+	 * 3) The PFNs are PA-aligned to the order we are scanning for
+	 */
+	if ((result == SCAN_SUCCEED) && (order != HPAGE_PMD_ORDER) && all_pfns_present &&
+	     all_pfns_contig && first_pfn_aligned &&
+	     is_same_folio(pte, pte + (1UL << order) - 1)) {
+		result = SCAN_PTE_MAPPED_THP;
+		goto decide_order;
+	}
+
 	if (result == SCAN_SUCCEED) {
 		result = collapse_huge_page(mm, address, referenced,
 					    unmapped, order, cc);
 		/* collapse_huge_page will return with the mmap_lock released */
 		*mmap_locked = false;
 		/* Skip over this range and decide order */
-		if (result == SCAN_SUCCEED)
+		if (result == SCAN_SUCCEED || result == SCAN_PTE_MAPPED_THP)
 			goto decide_order;
 	}
+
 	if (result != SCAN_SUCCEED) {
 
 		/* Go to the next order */
-- 
2.30.2

