linux-kernel - [RFC PATCH v2 16/47] hugetlb: make unmapping compatible with high-granularity mappings

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20221021163703.3218176-17-jthoughton@google.com>
Date:   Fri, 21 Oct 2022 16:36:32 +0000
From:   James Houghton <jthoughton@...gle.com>
To:     Mike Kravetz <mike.kravetz@...cle.com>,
        Muchun Song <songmuchun@...edance.com>,
        Peter Xu <peterx@...hat.com>
Cc:     David Hildenbrand <david@...hat.com>,
        David Rientjes <rientjes@...gle.com>,
        Axel Rasmussen <axelrasmussen@...gle.com>,
        Mina Almasry <almasrymina@...gle.com>,
        "Zach O'Keefe" <zokeefe@...gle.com>,
        Manish Mishra <manish.mishra@...anix.com>,
        Naoya Horiguchi <naoya.horiguchi@....com>,
        "Dr . David Alan Gilbert" <dgilbert@...hat.com>,
        "Matthew Wilcox (Oracle)" <willy@...radead.org>,
        Vlastimil Babka <vbabka@...e.cz>,
        Baolin Wang <baolin.wang@...ux.alibaba.com>,
        Miaohe Lin <linmiaohe@...wei.com>,
        Yang Shi <shy828301@...il.com>,
        Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org,
        James Houghton <jthoughton@...gle.com>
Subject: [RFC PATCH v2 16/47] hugetlb: make unmapping compatible with
 high-granularity mappings

Enlighten __unmap_hugepage_range to deal with high-granularity mappings.
This doesn't change its API: start and end must still be
hugepage-aligned, but hugepages that have been mapped at high
granularity are now unmapped correctly.

The rules for mapcount and refcount here are:
 1. Refcount and mapcount are tracked on the head page.
 2. Each page table entry that maps any part of an hpage increases
    that hpage's mapcount and refcount by 1.

Eventually, functionality here can be expanded to allow users to call
MADV_DONTNEED on PAGE_SIZE-aligned sections of a hugepage, but that is
not done here.

Signed-off-by: James Houghton <jthoughton@...gle.com>
---
 include/asm-generic/tlb.h |  6 ++--
 mm/hugetlb.c              | 76 +++++++++++++++++++++++++--------------
 2 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 492dce43236e..c378a44915a9 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -566,9 +566,9 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
 		__tlb_remove_tlb_entry(tlb, ptep, address);	\
 	} while (0)
 
-#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
+#define tlb_remove_huge_tlb_entry(tlb, hpte, address)	\
 	do {							\
-		unsigned long _sz = huge_page_size(h);		\
+		unsigned long _sz = hugetlb_pte_size(&hpte);	\
 		if (_sz >= P4D_SIZE)				\
 			tlb_flush_p4d_range(tlb, address, _sz);	\
 		else if (_sz >= PUD_SIZE)			\
@@ -577,7 +577,7 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
 			tlb_flush_pmd_range(tlb, address, _sz);	\
 		else						\
 			tlb_flush_pte_range(tlb, address, _sz);	\
-		__tlb_remove_tlb_entry(tlb, ptep, address);	\
+		__tlb_remove_tlb_entry(tlb, hpte.ptep, address);\
 	} while (0)
 
 /**
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 74a4afda1a7e..227150c25763 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5221,10 +5221,10 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
-	pte_t *ptep;
+	struct hugetlb_pte hpte;
 	pte_t pte;
 	spinlock_t *ptl;
-	struct page *page;
+	struct page *hpage, *subpage;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 	struct mmu_notifier_range range;
@@ -5235,11 +5235,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
-	/*
-	 * This is a hugetlb vma, all the pte entries should point
-	 * to huge page.
-	 */
-	tlb_change_page_size(tlb, sz);
 	tlb_start_vma(tlb, vma);
 
 	/*
@@ -5251,26 +5246,35 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 	mmu_notifier_invalidate_range_start(&range);
 	last_addr_mask = hugetlb_mask_last_page(h);
 	address = start;
-	for (; address < end; address += sz) {
-		ptep = huge_pte_offset(mm, address, sz);
+
+	while (address < end) {
+		pte_t *ptep = huge_pte_offset(mm, address, sz);
+
 		if (!ptep) {
 			address |= last_addr_mask;
+			address += sz;
 			continue;
 		}
+		hugetlb_pte_populate(&hpte, ptep, huge_page_shift(h),
+				hpage_size_to_level(huge_page_size(h)));
+		hugetlb_hgm_walk(mm, vma, &hpte, address,
+				PAGE_SIZE, /*stop_at_none=*/true);
 
-		ptl = huge_pte_lock(h, mm, ptep);
-		if (huge_pmd_unshare(mm, vma, address, ptep)) {
+		ptl = hugetlb_pte_lock(mm, &hpte);
+		if (huge_pmd_unshare(mm, vma, address, hpte.ptep)) {
 			spin_unlock(ptl);
 			tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
 			force_flush = true;
 			address |= last_addr_mask;
+			address += sz;
 			continue;
 		}
 
-		pte = huge_ptep_get(ptep);
+		pte = huge_ptep_get(hpte.ptep);
+
 		if (huge_pte_none(pte)) {
 			spin_unlock(ptl);
-			continue;
+			goto next_hpte;
 		}
 
 		/*
@@ -5287,25 +5291,36 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 			 */
 			if (pte_swp_uffd_wp_any(pte) &&
 			    !(zap_flags & ZAP_FLAG_DROP_MARKER))
-				set_huge_pte_at(mm, address, ptep,
+				set_huge_pte_at(mm, address, hpte.ptep,
 						make_pte_marker(PTE_MARKER_UFFD_WP));
 			else
 #endif
-				huge_pte_clear(mm, address, ptep, sz);
+				huge_pte_clear(mm, address, hpte.ptep,
+						hugetlb_pte_size(&hpte));
+			spin_unlock(ptl);
+			goto next_hpte;
+		}
+
+		if (unlikely(!hugetlb_pte_present_leaf(&hpte, pte))) {
+			/*
+			 * We raced with someone splitting out from under us.
+			 * Retry the walk.
+			 */
 			spin_unlock(ptl);
 			continue;
 		}
 
-		page = pte_page(pte);
+		subpage = pte_page(pte);
+		hpage = compound_head(subpage);
 		/*
 		 * If a reference page is supplied, it is because a specific
 		 * page is being unmapped, not a range. Ensure the page we
 		 * are about to unmap is the actual page of interest.
 		 */
 		if (ref_page) {
-			if (page != ref_page) {
+			if (hpage != ref_page) {
 				spin_unlock(ptl);
-				continue;
+				goto next_hpte;
 			}
 			/*
 			 * Mark the VMA as having unmapped its page so that
@@ -5315,27 +5330,34 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
 		}
 
-		pte = huge_ptep_get_and_clear(mm, address, ptep);
-		tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
+		pte = huge_ptep_get_and_clear(mm, address, hpte.ptep);
+		tlb_change_page_size(tlb, hugetlb_pte_size(&hpte));
+		tlb_remove_huge_tlb_entry(tlb, hpte, address);
 		if (huge_pte_dirty(pte))
-			set_page_dirty(page);
+			set_page_dirty(hpage);
 #ifdef CONFIG_PTE_MARKER_UFFD_WP
 		/* Leave a uffd-wp pte marker if needed */
 		if (huge_pte_uffd_wp(pte) &&
 		    !(zap_flags & ZAP_FLAG_DROP_MARKER))
-			set_huge_pte_at(mm, address, ptep,
+			set_huge_pte_at(mm, address, hpte.ptep,
 					make_pte_marker(PTE_MARKER_UFFD_WP));
 #endif
-		hugetlb_count_sub(pages_per_huge_page(h), mm);
-		page_remove_rmap(page, vma, true);
+		hugetlb_count_sub(hugetlb_pte_size(&hpte)/PAGE_SIZE, mm);
+		page_remove_rmap(hpage, vma, true);
 
 		spin_unlock(ptl);
-		tlb_remove_page_size(tlb, page, huge_page_size(h));
 		/*
-		 * Bail out after unmapping reference page if supplied
+		 * Lower the reference count on the head page.
+		 */
+		tlb_remove_page_size(tlb, hpage, sz);
+		/*
+		 * Bail out after unmapping reference page if supplied,
+		 * and there's only one PTE mapping this page.
 		 */
-		if (ref_page)
+		if (ref_page && hugetlb_pte_size(&hpte) == sz)
 			break;
+next_hpte:
+		address += hugetlb_pte_size(&hpte);
 	}
 	mmu_notifier_invalidate_range_end(&range);
 	tlb_end_vma(tlb, vma);
-- 
2.38.0.135.g90850a2211-goog