lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240417141111.77855-1-ioworker0@gmail.com>
Date: Wed, 17 Apr 2024 22:11:11 +0800
From: Lance Yang <ioworker0@...il.com>
To: akpm@...ux-foundation.org
Cc: willy@...radead.org,
	maskray@...gle.com,
	ziy@...dia.com,
	ryan.roberts@....com,
	david@...hat.com,
	21cnbao@...il.com,
	mhocko@...e.com,
	fengwei.yin@...el.com,
	zokeefe@...gle.com,
	shy828301@...il.com,
	xiehuan09@...il.com,
	wangkefeng.wang@...wei.com,
	songmuchun@...edance.com,
	peterx@...hat.com,
	minchan@...nel.org,
	linux-mm@...ck.org,
	linux-kernel@...r.kernel.org,
	Lance Yang <ioworker0@...il.com>
Subject: [PATCH 1/1] mm/vmscan: avoid split PMD-mapped THP during shrink_folio_list()

When the user no longer requires the pages, they would use madvise(madv_free)
to mark the pages as lazy free. IMO, they would not typically rewrite to the
given range.

At present, a PMD-mapped THP marked as lazyfree during shrink_folio_list()
is unconditionally split, which may be unnecessary. If the THP is exclusively
mapped and clean, and the PMD associated with it is also clean, then we can
attempt to remove the PMD mapping from it. This change will improve the
efficiency of memory reclamation in this case.

On an Intel i5 CPU, reclaiming 1GiB of PMD-mapped THPs using
mem_cgroup_force_empty() results in the following runtimes in seconds
(shorter is better):

--------------------------------------------
|     Old       |      New       |  Change  |
--------------------------------------------
|   0.683426    |    0.049197    |  -92.80% |
--------------------------------------------

Signed-off-by: Lance Yang <ioworker0@...il.com>
---
 include/linux/huge_mm.h |  1 +
 include/linux/rmap.h    |  1 +
 mm/huge_memory.c        |  2 +-
 mm/rmap.c               | 81 +++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c             |  7 ++++
 5 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7cd07b83a3d0..02a71c05f68a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -36,6 +36,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
 		    unsigned long cp_flags);
+inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd);
 
 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 0f906dc6d280..8c2f45713351 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -100,6 +100,7 @@ enum ttu_flags {
 					 * do a final flush if necessary */
 	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
 					 * caller holds it */
+	TTU_LAZYFREE_THP	= 0x100, /* avoid split PMD-mapped THP */
 };
 
 #ifdef CONFIG_MMU
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 58f2c4745d80..309fba9624c2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1801,7 +1801,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	return ret;
 }
 
-static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
 {
 	pgtable_t pgtable;
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 2608c40dffad..4994f9e402d4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -77,6 +77,7 @@
 #include <linux/mm_inline.h>
 
 #include <asm/tlbflush.h>
+#include <asm/tlb.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/tlb.h>
@@ -1606,6 +1607,80 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
 #endif
 }
 
+static bool __try_to_unmap_lazyfree_thp(struct vm_area_struct *vma,
+					     unsigned long address,
+					     struct folio *folio)
+{
+	spinlock_t *ptl;
+	pmd_t *pmdp, orig_pmd;
+	struct mmu_notifier_range range;
+	struct mmu_gather tlb;
+	struct mm_struct *mm = vma->vm_mm;
+	struct page *page;
+	bool ret = false;
+
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+	VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
+	VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
+	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+
+	/*
+	 * If we encounter a PMD-mapped THP that marked as lazyfree, we
+	 * will try to unmap it without splitting.
+	 *
+	 * The folio exclusively mapped should only have two refs:
+	 * one from the isolation and one from the rmap.
+	 */
+	if (folio_entire_mapcount(folio) != 1 || folio_test_dirty(folio) ||
+	    folio_ref_count(folio) != 2)
+		return false;
+
+	pmdp = mm_find_pmd(mm, address);
+	if (unlikely(!pmdp))
+		return false;
+	if (pmd_dirty(*pmdp))
+		return false;
+
+	tlb_gather_mmu(&tlb, mm);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				address & HPAGE_PMD_MASK,
+				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	ptl = pmd_lock(mm, pmdp);
+	orig_pmd = *pmdp;
+	if (unlikely(!pmd_present(orig_pmd) || !pmd_trans_huge(orig_pmd)))
+		goto out;
+
+	page = pmd_page(orig_pmd);
+	if (unlikely(page_folio(page) != folio))
+		goto out;
+
+	orig_pmd = pmdp_huge_get_and_clear(mm, address, pmdp);
+	tlb_remove_pmd_tlb_entry(&tlb, pmdp, address);
+	/*
+	 * There is a race between the first check of the dirty bit
+	 * for the PMD and the TLB entry flush. If the PMD is re-dirty
+	 * at this point, we will return to try_to_unmap_one() to call
+	 * split_huge_pmd_address() to split it.
+	 */
+	if (pmd_dirty(orig_pmd))
+		set_pmd_at(mm, address, pmdp, orig_pmd);
+	else {
+		folio_remove_rmap_pmd(folio, page, vma);
+		zap_deposited_table(mm, pmdp);
+		add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+		folio_put(folio);
+		ret = true;
+	}
+
+out:
+	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(&range);
+
+	return ret;
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
@@ -1631,6 +1706,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 	if (flags & TTU_SYNC)
 		pvmw.flags = PVMW_SYNC;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (flags & TTU_LAZYFREE_THP)
+		if (__try_to_unmap_lazyfree_thp(vma, address, folio))
+			return true;
+#endif
+
 	if (flags & TTU_SPLIT_HUGE_PMD)
 		split_huge_pmd_address(vma, address, false, folio);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 49bd94423961..2358b1cff8bf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1277,6 +1277,13 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 
 			if (folio_test_pmd_mappable(folio))
 				flags |= TTU_SPLIT_HUGE_PMD;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+			if (folio_test_anon(folio) && !was_swapbacked &&
+			    flags & TTU_SPLIT_HUGE_PMD)
+				flags |= TTU_LAZYFREE_THP;
+#endif
+
 			/*
 			 * Without TTU_SYNC, try_to_unmap will only begin to
 			 * hold PTL from the first present PTE within a large
-- 
2.33.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ