lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20221021163703.3218176-30-jthoughton@google.com>
Date:   Fri, 21 Oct 2022 16:36:45 +0000
From:   James Houghton <jthoughton@...gle.com>
To:     Mike Kravetz <mike.kravetz@...cle.com>,
        Muchun Song <songmuchun@...edance.com>,
        Peter Xu <peterx@...hat.com>
Cc:     David Hildenbrand <david@...hat.com>,
        David Rientjes <rientjes@...gle.com>,
        Axel Rasmussen <axelrasmussen@...gle.com>,
        Mina Almasry <almasrymina@...gle.com>,
        "Zach O'Keefe" <zokeefe@...gle.com>,
        Manish Mishra <manish.mishra@...anix.com>,
        Naoya Horiguchi <naoya.horiguchi@....com>,
        "Dr . David Alan Gilbert" <dgilbert@...hat.com>,
        "Matthew Wilcox (Oracle)" <willy@...radead.org>,
        Vlastimil Babka <vbabka@...e.cz>,
        Baolin Wang <baolin.wang@...ux.alibaba.com>,
        Miaohe Lin <linmiaohe@...wei.com>,
        Yang Shi <shy828301@...il.com>,
        Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org,
        James Houghton <jthoughton@...gle.com>
Subject: [RFC PATCH v2 29/47] hugetlb: add high-granularity migration support

To prevent queueing a hugepage for migration multiple times, we use
last_page to keep track of the last page we saw in queue_pages_hugetlb,
and if the page we're looking at is last_page, then we skip it.

For the non-hugetlb cases, last_page, although unused, is still updated
so that it has a consistent meaning with the hugetlb case.

This commit adds a check in hugetlb_fault for high-granularity migration
PTEs.

Signed-off-by: James Houghton <jthoughton@...gle.com>
---
 include/linux/swapops.h |  8 ++++++--
 mm/hugetlb.c            | 15 ++++++++++++++-
 mm/mempolicy.c          | 24 +++++++++++++++++++-----
 mm/migrate.c            | 18 +++++++++++-------
 4 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 86b95ccb81bb..2939323d0fd2 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -66,6 +66,8 @@
 
 static inline bool is_pfn_swap_entry(swp_entry_t entry);
 
+struct hugetlb_pte;
+
 /* Clear all flags but only keep swp_entry_t related information */
 static inline pte_t pte_swp_clear_flags(pte_t pte)
 {
@@ -346,7 +348,8 @@ extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					unsigned long address);
 #ifdef CONFIG_HUGETLB_PAGE
 extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
-extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
+extern void migration_entry_wait_huge(struct vm_area_struct *vma,
+					struct hugetlb_pte *hpte);
 #endif	/* CONFIG_HUGETLB_PAGE */
 #else  /* CONFIG_MIGRATION */
 static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
@@ -375,7 +378,8 @@ static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					 unsigned long address) { }
 #ifdef CONFIG_HUGETLB_PAGE
 static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
-static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
+static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
+						struct hugetlb_pte *hpte) { }
 #endif	/* CONFIG_HUGETLB_PAGE */
 static inline int is_writable_migration_entry(swp_entry_t entry)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2ee2c48ee79c..8dba8d59ebe5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6100,9 +6100,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * OK as we are only making decisions based on content and
 		 * not actually modifying content here.
 		 */
+		hugetlb_pte_populate(&hpte, ptep, huge_page_shift(h),
+				hpage_size_to_level(huge_page_size(h)));
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
-			migration_entry_wait_huge(vma, ptep);
+			migration_entry_wait_huge(vma, &hpte);
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
@@ -6142,7 +6144,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	hugetlb_hgm_walk(mm, vma, &hpte, address, PAGE_SIZE,
 			/*stop_at_none=*/true);
 
+	/*
+	 * Now that we have done a high-granularity walk, check again if we are
+	 * looking at a migration entry.
+	 */
 	entry = huge_ptep_get(hpte.ptep);
+	if (unlikely(is_hugetlb_entry_migration(entry))) {
+		hugetlb_vma_unlock_read(vma);
+		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+		migration_entry_wait_huge(vma, &hpte);
+		return 0;
+	}
+
 	/* PTE markers should be handled the same way as none pte */
 	if (huge_pte_none_mostly(entry)) {
 		/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 275bc549590e..47bf9b16a9c0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -424,6 +424,7 @@ struct queue_pages {
 	unsigned long start;
 	unsigned long end;
 	struct vm_area_struct *first;
+	struct page *last_page;
 };
 
 /*
@@ -475,6 +476,7 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 	flags = qp->flags;
 	/* go to thp migration */
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+		qp->last_page = page;
 		if (!vma_migratable(walk->vma) ||
 		    migrate_page_add(page, qp->pagelist, flags)) {
 			ret = 1;
@@ -532,6 +534,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 			continue;
 		if (!queue_pages_required(page, qp))
 			continue;
+
 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 			/* MPOL_MF_STRICT must be specified if we get here */
 			if (!vma_migratable(vma)) {
@@ -539,6 +542,8 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 				break;
 			}
 
+			qp->last_page = page;
+
 			/*
 			 * Do not abort immediately since there may be
 			 * temporary off LRU pages in the range.  Still
@@ -570,15 +575,22 @@ static int queue_pages_hugetlb(struct hugetlb_pte *hpte,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	/* We don't migrate high-granularity HugeTLB mappings for now. */
-	if (hugetlb_hgm_enabled(walk->vma))
-		return -EINVAL;
-
 	ptl = hugetlb_pte_lock(walk->mm, hpte);
 	entry = huge_ptep_get(hpte->ptep);
 	if (!pte_present(entry))
 		goto unlock;
-	page = pte_page(entry);
+
+	if (!hugetlb_pte_present_leaf(hpte, entry)) {
+		ret = -EAGAIN;
+		goto unlock;
+	}
+
+	page = compound_head(pte_page(entry));
+
+	/* We already queued this page with another high-granularity PTE. */
+	if (page == qp->last_page)
+		goto unlock;
+
 	if (!queue_pages_required(page, qp))
 		goto unlock;
 
@@ -605,6 +617,7 @@ static int queue_pages_hugetlb(struct hugetlb_pte *hpte,
 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
+		qp->last_page = page;
 		if (isolate_hugetlb(page, qp->pagelist) &&
 			(flags & MPOL_MF_STRICT))
 			/*
@@ -740,6 +753,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		.start = start,
 		.end = end,
 		.first = NULL,
+		.last_page = NULL,
 	};
 
 	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
diff --git a/mm/migrate.c b/mm/migrate.c
index 8712b694c5a7..197662dd1dc0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -186,6 +186,9 @@ static bool remove_migration_pte(struct folio *folio,
 		/* pgoff is invalid for ksm pages, but they are never large */
 		if (folio_test_large(folio) && !folio_test_hugetlb(folio))
 			idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
+		else if (folio_test_hugetlb(folio))
+			idx = (pvmw.address & ~huge_page_mask(hstate_vma(vma)))/
+				PAGE_SIZE;
 		new = folio_page(folio, idx);
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -235,14 +238,15 @@ static bool remove_migration_pte(struct folio *folio,
 
 #ifdef CONFIG_HUGETLB_PAGE
 		if (folio_test_hugetlb(folio)) {
+			struct page *hpage = folio_page(folio, 0);
 			unsigned int shift = pvmw.pte_order + PAGE_SHIFT;
 
 			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
 			if (folio_test_anon(folio))
-				hugepage_add_anon_rmap(new, vma, pvmw.address,
+				hugepage_add_anon_rmap(hpage, vma, pvmw.address,
 						       rmap_flags);
 			else
-				page_dup_file_rmap(new, true);
+				page_dup_file_rmap(hpage, true);
 			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
 		} else
 #endif
@@ -258,7 +262,7 @@ static bool remove_migration_pte(struct folio *folio,
 			mlock_page_drain_local();
 
 		trace_remove_migration_pte(pvmw.address, pte_val(pte),
-					   compound_order(new));
+					   pvmw.pte_order);
 
 		/* No need to invalidate - it was non-present before */
 		update_mmu_cache(vma, pvmw.address, pvmw.pte);
@@ -332,12 +336,12 @@ void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
 		migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
 }
 
-void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
+void migration_entry_wait_huge(struct vm_area_struct *vma,
+				struct hugetlb_pte *hpte)
 {
-	spinlock_t *ptl = huge_pte_lockptr(huge_page_shift(hstate_vma(vma)),
-					   vma->vm_mm, pte);
+	spinlock_t *ptl = hugetlb_pte_lockptr(vma->vm_mm, hpte);
 
-	__migration_entry_wait_huge(pte, ptl);
+	__migration_entry_wait_huge(hpte->ptep, ptl);
 }
 #endif
 
-- 
2.38.0.135.g90850a2211-goog

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ