linux-kernel - [RFC PATCH v2 24/47] hugetlb: update page_vma

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20221021163703.3218176-25-jthoughton@google.com>
Date:   Fri, 21 Oct 2022 16:36:40 +0000
From:   James Houghton <jthoughton@...gle.com>
To:     Mike Kravetz <mike.kravetz@...cle.com>,
        Muchun Song <songmuchun@...edance.com>,
        Peter Xu <peterx@...hat.com>
Cc:     David Hildenbrand <david@...hat.com>,
        David Rientjes <rientjes@...gle.com>,
        Axel Rasmussen <axelrasmussen@...gle.com>,
        Mina Almasry <almasrymina@...gle.com>,
        "Zach O'Keefe" <zokeefe@...gle.com>,
        Manish Mishra <manish.mishra@...anix.com>,
        Naoya Horiguchi <naoya.horiguchi@....com>,
        "Dr . David Alan Gilbert" <dgilbert@...hat.com>,
        "Matthew Wilcox (Oracle)" <willy@...radead.org>,
        Vlastimil Babka <vbabka@...e.cz>,
        Baolin Wang <baolin.wang@...ux.alibaba.com>,
        Miaohe Lin <linmiaohe@...wei.com>,
        Yang Shi <shy828301@...il.com>,
        Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org,
        James Houghton <jthoughton@...gle.com>
Subject: [RFC PATCH v2 24/47] hugetlb: update page_vma_mapped to do
 high-granularity walks

This updates the HugeTLB logic to look a lot more like the PTE-mapped
THP logic. When a user calls us in a loop, we will update pvmw->address
to walk to each page table entry that could possibly map the hugepage
containing pvmw->pfn.

This makes use of the new pte_order so callers know what size PTE
they're getting.

Signed-off-by: James Houghton <jthoughton@...gle.com>
---
 include/linux/rmap.h |  4 +++
 mm/page_vma_mapped.c | 59 ++++++++++++++++++++++++++++++++++++--------
 mm/rmap.c            | 48 +++++++++++++++++++++--------------
 3 files changed, 83 insertions(+), 28 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e0557ede2951..d7d2d9f65a01 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -13,6 +13,7 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/memremap.h>
+#include <linux/hugetlb.h>
 
 /*
  * The anon_vma heads a list of private "related" vmas, to scan if
@@ -409,6 +410,9 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
 		pte_unmap(pvmw->pte);
 	if (pvmw->ptl)
 		spin_unlock(pvmw->ptl);
+	if (pvmw->pte && is_vm_hugetlb_page(pvmw->vma) &&
+			hugetlb_hgm_enabled(pvmw->vma))
+		hugetlb_vma_unlock_read(pvmw->vma);
 }
 
 bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 395ca4e21c56..1994b3f9a4c2 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -133,7 +133,8 @@ static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
  *
  * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point
  * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is
- * adjusted if needed (for PTE-mapped THPs).
+ * adjusted if needed (for PTE-mapped THPs and high-granularity--mapped HugeTLB
+ * pages).
  *
  * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page
  * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in
@@ -166,19 +167,57 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 	if (unlikely(is_vm_hugetlb_page(vma))) {
 		struct hstate *hstate = hstate_vma(vma);
 		unsigned long size = huge_page_size(hstate);
-		/* The only possible mapping was handled on last iteration */
-		if (pvmw->pte)
-			return not_found(pvmw);
+		struct hugetlb_pte hpte;
+		pte_t *pte;
+		pte_t pteval;
+
+		end = (pvmw->address & huge_page_mask(hstate)) +
+			huge_page_size(hstate);
 
 		/* when pud is not present, pte will be NULL */
-		pvmw->pte = huge_pte_offset(mm, pvmw->address, size);
-		if (!pvmw->pte)
+		pte = huge_pte_offset(mm, pvmw->address, size);
+		if (!pte)
 			return false;
 
-		pvmw->pte_order = huge_page_order(hstate);
-		pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte);
-		if (!check_pte(pvmw))
-			return not_found(pvmw);
+		do {
+			hugetlb_pte_populate(&hpte, pte, huge_page_shift(hstate),
+					hpage_size_to_level(size));
+
+			/*
+			 * Do a high granularity page table walk. The vma lock
+			 * is grabbed to prevent the page table from being
+			 * collapsed mid-walk. It is dropped in
+			 * page_vma_mapped_walk_done().
+			 */
+			if (pvmw->pte) {
+				if (pvmw->ptl)
+					spin_unlock(pvmw->ptl);
+				pvmw->ptl = NULL;
+				pvmw->address += PAGE_SIZE << pvmw->pte_order;
+				if (pvmw->address >= end)
+					return not_found(pvmw);
+			} else if (hugetlb_hgm_enabled(vma))
+				/* Only grab the lock once. */
+				hugetlb_vma_lock_read(vma);
+
+retry_walk:
+			hugetlb_hgm_walk(mm, vma, &hpte, pvmw->address,
+					PAGE_SIZE, /*stop_at_none=*/true);
+
+			pvmw->pte = hpte.ptep;
+			pvmw->pte_order = hpte.shift - PAGE_SHIFT;
+			pvmw->ptl = hugetlb_pte_lock(mm, &hpte);
+			pteval = huge_ptep_get(hpte.ptep);
+			if (pte_present(pteval) && !hugetlb_pte_present_leaf(
+						&hpte, pteval)) {
+				/*
+				 * Someone split from under us, so keep
+				 * walking.
+				 */
+				spin_unlock(pvmw->ptl);
+				goto retry_walk;
+			}
+		} while (!check_pte(pvmw));
 		return true;
 	}
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 527463c1e936..a8359584467e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1552,17 +1552,23 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			flush_cache_range(vma, range.start, range.end);
 
 			/*
-			 * To call huge_pmd_unshare, i_mmap_rwsem must be
-			 * held in write mode.  Caller needs to explicitly
-			 * do this outside rmap routines.
-			 *
-			 * We also must hold hugetlb vma_lock in write mode.
-			 * Lock order dictates acquiring vma_lock BEFORE
-			 * i_mmap_rwsem.  We can only try lock here and fail
-			 * if unsuccessful.
+			 * If HGM is enabled, we have already grabbed the VMA
+			 * lock for reading, and we cannot safely release it.
+			 * Because HGM-enabled VMAs have already unshared all
+			 * PMDs, we can safely ignore PMD unsharing here.
 			 */
-			if (!anon) {
+			if (!anon && !hugetlb_hgm_enabled(vma)) {
 				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+				/*
+				 * To call huge_pmd_unshare, i_mmap_rwsem must
+				 * be held in write mode.  Caller needs to
+				 * explicitly do this outside rmap routines.
+				 *
+				 * We also must hold hugetlb vma_lock in write
+				 * mode. Lock order dictates acquiring vma_lock
+				 * BEFORE i_mmap_rwsem.  We can only try lock
+				 * here and fail if unsuccessful.
+				 */
 				if (!hugetlb_vma_trylock_write(vma)) {
 					page_vma_mapped_walk_done(&pvmw);
 					ret = false;
@@ -1946,17 +1952,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			flush_cache_range(vma, range.start, range.end);
 
 			/*
-			 * To call huge_pmd_unshare, i_mmap_rwsem must be
-			 * held in write mode.  Caller needs to explicitly
-			 * do this outside rmap routines.
-			 *
-			 * We also must hold hugetlb vma_lock in write mode.
-			 * Lock order dictates acquiring vma_lock BEFORE
-			 * i_mmap_rwsem.  We can only try lock here and
-			 * fail if unsuccessful.
+			 * If HGM is enabled, we have already grabbed the VMA
+			 * lock for reading, and we cannot safely release it.
+			 * Because HGM-enabled VMAs have already unshared all
+			 * PMDs, we can safely ignore PMD unsharing here.
 			 */
-			if (!anon) {
+			if (!anon && !hugetlb_hgm_enabled(vma)) {
 				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+				/*
+				 * To call huge_pmd_unshare, i_mmap_rwsem must
+				 * be held in write mode.  Caller needs to
+				 * explicitly do this outside rmap routines.
+				 *
+				 * We also must hold hugetlb vma_lock in write
+				 * mode. Lock order dictates acquiring vma_lock
+				 * BEFORE i_mmap_rwsem.  We can only try lock
+				 * here and fail if unsuccessful.
+				 */
 				if (!hugetlb_vma_trylock_write(vma)) {
 					page_vma_mapped_walk_done(&pvmw);
 					ret = false;
-- 
2.38.0.135.g90850a2211-goog