Date:   Fri, 29 Apr 2022 21:35:49 +0800
From:   Qi Zheng <zhengqi.arch@...edance.com>
To:     akpm@...ux-foundation.org, tglx@...utronix.de,
        kirill.shutemov@...ux.intel.com, mika.penttila@...tfour.com,
        david@...hat.com, jgg@...dia.com, tj@...nel.org, dennis@...nel.org,
        ming.lei@...hat.com
Cc:     linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
        linux-mm@...ck.org, songmuchun@...edance.com,
        zhouchengming@...edance.com, Qi Zheng <zhengqi.arch@...edance.com>
Subject: [RFC PATCH 15/18] mm: use try_to_free_user_pte() in MADV_FREE case

Unlike the MADV_DONTNEED case, MADV_FREE just marks the physical
page as lazyfree instead of unmapping it immediately; the page is
not unmapped until system memory becomes tight. So we switch the
percpu_ref of the corresponding user PTE page table page to atomic
mode in madvise_free_pte_range(), and later check in
try_to_unmap_one() whether its count has dropped to 0. If it has,
the PTE page table page can be reclaimed safely at that point.
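
In case it helps review, below is a minimal, self-contained sketch of
the generic percpu_ref idiom this relies on ("switch to atomic mode,
then test for zero"). All names in it (demo_obj, demo_alloc,
demo_release, demo_mark_lazy, demo_can_reclaim) are hypothetical and
are not the pte_ref helpers added by this series; it only illustrates
why the mode switch has to happen before the zero test:

  /*
   * Illustrative only: a generic percpu_ref user, not the series'
   * actual pte_ref code.  Freeing is done explicitly by the owner
   * after the zero check, mirroring free_user_pte() in this patch,
   * so the release callback is a no-op here.
   */
  #include <linux/kernel.h>
  #include <linux/percpu-refcount.h>
  #include <linux/slab.h>

  struct demo_obj {
          struct percpu_ref ref;
  };

  static void demo_release(struct percpu_ref *ref)
  {
          /* Nothing to do: the owner frees explicitly after the zero check. */
  }

  static struct demo_obj *demo_alloc(void)
  {
          struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

          if (!obj)
                  return NULL;
          /* Start in the cheap percpu mode; users get/put refs as usual. */
          if (percpu_ref_init(&obj->ref, demo_release, 0, GFP_KERNEL)) {
                  kfree(obj);
                  return NULL;
          }
          return obj;
  }

  /* Roughly the role madvise_free_pte_range() plays for the PTE page. */
  static void demo_mark_lazy(struct demo_obj *obj)
  {
          /*
           * In percpu mode the per-CPU counters cannot be summed
           * cheaply, so percpu_ref_is_zero() simply reports false.
           * Switch to atomic mode so a later zero test is meaningful.
           */
          percpu_ref_switch_to_atomic_sync(&obj->ref);
  }

  /* Roughly the check try_to_unmap_one() relies on. */
  static bool demo_can_reclaim(struct demo_obj *obj)
  {
          if (percpu_ref_is_zero(&obj->ref))
                  return true;    /* no users left; caller may free obj */

          /* Still referenced: go back to the cheap percpu mode. */
          percpu_ref_switch_to_percpu(&obj->ref);
          return false;
  }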

Signed-off-by: Qi Zheng <zhengqi.arch@...edance.com>
---
 include/linux/rmap.h |  2 ++
 mm/madvise.c         |  7 ++++++-
 mm/page_vma_mapped.c | 46 ++++++++++++++++++++++++++++++++++++++++++--
 mm/rmap.c            |  9 +++++++++
 4 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 17230c458341..a3174d3bf118 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -204,6 +204,8 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
 #define PVMW_SYNC		(1 << 0)
 /* Look for migration entries rather than present PTEs */
 #define PVMW_MIGRATION		(1 << 1)
+/* Used for MADV_FREE page */
+#define PVMW_MADV_FREE		(1 << 2)
 
 struct page_vma_mapped_walk {
 	unsigned long pfn;
diff --git a/mm/madvise.c b/mm/madvise.c
index 8123397f14c8..bd4bcaad5a9f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -598,7 +598,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	pte_t *orig_pte, *pte, ptent;
 	struct page *page;
 	int nr_swap = 0;
+	bool have_lazyfree = false;
 	unsigned long next;
+	unsigned long start = addr;
 
 	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd))
@@ -709,6 +711,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			tlb_remove_tlb_entry(tlb, pte, addr);
 		}
 		mark_page_lazyfree(page);
+		have_lazyfree = true;
 	}
 out:
 	if (nr_swap) {
@@ -718,8 +721,10 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
 	}
 	arch_leave_lazy_mmu_mode();
-	if (orig_pte)
+	if (orig_pte) {
 		pte_unmap_unlock(orig_pte, ptl);
+		try_to_free_user_pte(mm, pmd, start, !have_lazyfree);
+	}
 	cond_resched();
 next:
 	return 0;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 8ecf8fd7cf5e..00bc09f57f48 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -266,8 +266,30 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 next_pte:
 		do {
 			pvmw->address += PAGE_SIZE;
-			if (pvmw->address >= end)
-				return not_found(pvmw);
+			if (pvmw->address >= end) {
+				not_found(pvmw);
+
+				if (pvmw->flags & PVMW_MADV_FREE) {
+					pgtable_t pte;
+					pmd_t pmdval;
+
+					pvmw->flags &= ~PVMW_MADV_FREE;
+					rcu_read_lock();
+					pmdval = READ_ONCE(*pvmw->pmd);
+					if (pmd_none(pmdval) || pmd_leaf(pmdval)) {
+						rcu_read_unlock();
+						return false;
+					}
+					pte = pmd_pgtable(pmdval);
+					if (percpu_ref_is_zero(pte->pte_ref)) {
+						rcu_read_unlock();
+						free_user_pte(mm, pvmw->pmd, pvmw->address);
+					} else {
+						rcu_read_unlock();
+					}
+				}
+				return false;
+			}
 			/* Did we cross page table boundary? */
 			if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
 				if (pvmw->ptl) {
@@ -275,6 +297,26 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 					pvmw->ptl = NULL;
 				}
 				pte_unmap(pvmw->pte);
+				if (pvmw->flags & PVMW_MADV_FREE) {
+					pgtable_t pte;
+					pmd_t pmdval;
+
+					pvmw->flags &= ~PVMW_MADV_FREE;
+					rcu_read_lock();
+					pmdval = READ_ONCE(*pvmw->pmd);
+					if (pmd_none(pmdval) || pmd_leaf(pmdval)) {
+						rcu_read_unlock();
+						pvmw->pte = NULL;
+						goto restart;
+					}
+					pte = pmd_pgtable(pmdval);
+					if (percpu_ref_is_zero(pte->pte_ref)) {
+						rcu_read_unlock();
+						free_user_pte(mm, pvmw->pmd, pvmw->address);
+					} else {
+						rcu_read_unlock();
+					}
+				}
 				pvmw->pte = NULL;
 				goto restart;
 			}
diff --git a/mm/rmap.c b/mm/rmap.c
index fedb82371efe..f978d324d4f9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1616,6 +1616,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 					mmu_notifier_invalidate_range(mm,
 						address, address + PAGE_SIZE);
 					dec_mm_counter(mm, MM_ANONPAGES);
+					if (IS_ENABLED(CONFIG_FREE_USER_PTE))
+						pvmw.flags |= PVMW_MADV_FREE;
 					goto discard;
 				}
 
@@ -1627,6 +1629,13 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				folio_set_swapbacked(folio);
 				ret = false;
 				page_vma_mapped_walk_done(&pvmw);
+				if (IS_ENABLED(CONFIG_FREE_USER_PTE) &&
+				    pte_tryget(mm, pvmw.pmd, address)) {
+					pgtable_t pte_page = pmd_pgtable(*pvmw.pmd);
+
+					percpu_ref_switch_to_percpu(pte_page->pte_ref);
+					__pte_put(pte_page);
+				}
 				break;
 			}
 
-- 
2.20.1
