Message-Id: <20220420223753.386645-6-mike.kravetz@oracle.com>
Date:   Wed, 20 Apr 2022 15:37:52 -0700
From:   Mike Kravetz <mike.kravetz@...cle.com>
To:     linux-mm@...ck.org, linux-kernel@...r.kernel.org
Cc:     Michal Hocko <mhocko@...e.com>, Peter Xu <peterx@...hat.com>,
        Naoya Horiguchi <naoya.horiguchi@...ux.dev>,
        David Hildenbrand <david@...hat.com>,
        "Aneesh Kumar K . V" <aneesh.kumar@...ux.vnet.ibm.com>,
        Andrea Arcangeli <aarcange@...hat.com>,
        "Kirill A . Shutemov" <kirill.shutemov@...ux.intel.com>,
        Davidlohr Bueso <dave@...olabs.net>,
        Prakash Sangappa <prakash.sangappa@...cle.com>,
        James Houghton <jthoughton@...gle.com>,
        Mina Almasry <almasrymina@...gle.com>,
        Ray Fucillo <Ray.Fucillo@...ersystems.com>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Mike Kravetz <mike.kravetz@...cle.com>
Subject: [RFC PATCH v2 5/6] hugetlbfs: Do not use pmd locks if hugetlb sharing possible

In hugetlbfs, split pmd page table locks are generally used if
huge_page_size is equal to PMD_SIZE.  These locks are located in the
struct page of the corresponding pmd page.  A pmd pointer is used to
locate the page.
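
For a PMD-sized hugetlb page the split lock is found via pmd_lockptr(),
roughly (illustrative fragment only):

	/* ptep actually points at a pmd entry for PMD_SIZE hugetlb pages */
	spinlock_t *ptl = pmd_lockptr(mm, (pmd_t *)ptep);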

In the case of pmd sharing, pmd pointers can become invalid unless one
holds the page table lock.  This creates a chicken/egg problem as we
need to use the pointer to locate the lock.  To address this issue, use
the page_table_lock in the mm_struct if the pmd pointer is associated
with a sharable vma.

The routines dealing with huge pte locks (huge_pte_lockptr and
huge_pte_lock) are modified to take a vma pointer instead of mm pointer.
The vma is then checked to determine if sharing is possible.  If it is,
then the page table lock in the mm_struct is used.  Otherwise, the
lock in the pmd page's struct page is used.
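
Condensed, the lock selection made by the reworked helpers (matching the
mm/hugetlb.c change below) is:

	/* sketch of the new lock choice */
	if (huge_page_size(h) == PMD_SIZE &&
	    !vma_range_shareable(vma, vma->vm_start, vma->vm_end))
		ptl = pmd_lockptr(vma->vm_mm, (pmd_t *)pte);	/* split pmd lock */
	else
		ptl = &vma->vm_mm->page_table_lock;		/* per-mm lock */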

Note that the code uses the mm_struct lock if any part of the vma is
sharable.  This could be optimized by passing in the virtual address
associated with the pte pointer to determine if that specific address
is sharable.
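
A rough sketch of that possible optimization (not part of this series;
the helper name below is made up) would pass the address through and
reuse vma_addr_shareable():

/* Hypothetical variant, for illustration only. */
spinlock_t *huge_pte_lockptr_addr(struct hstate *h, struct vm_area_struct *vma,
				  pte_t *pte, unsigned long addr)
{
	struct mm_struct *mm = vma->vm_mm;

	if (huge_page_size(h) == PMD_SIZE && !vma_addr_shareable(vma, addr))
		return pmd_lockptr(mm, (pmd_t *)pte);
	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
	return &mm->page_table_lock;
}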

Signed-off-by: Mike Kravetz <mike.kravetz@...cle.com>
---
 arch/powerpc/mm/pgtable.c |  2 +-
 include/linux/hugetlb.h   | 27 ++++--------
 mm/damon/vaddr.c          |  4 +-
 mm/hmm.c                  |  2 +-
 mm/hugetlb.c              | 92 +++++++++++++++++++++++++++++----------
 mm/mempolicy.c            |  2 +-
 mm/migrate.c              |  2 +-
 mm/page_vma_mapped.c      |  2 +-
 8 files changed, 85 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 6ec5a7dd7913..02f76e8b735a 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -261,7 +261,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 
 		psize = hstate_get_psize(h);
 #ifdef CONFIG_DEBUG_VM
-		assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
+		assert_spin_locked(huge_pte_lockptr(h, vma, ptep));
 #endif
 
 #else
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 75f4ff481538..c37611eb8571 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -864,15 +864,8 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
 	return modified_mask;
 }
 
-static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
-					   struct mm_struct *mm, pte_t *pte)
-{
-	if (huge_page_size(h) == PMD_SIZE)
-		return pmd_lockptr(mm, (pmd_t *) pte);
-	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
-	return &mm->page_table_lock;
-}
-
+spinlock_t *huge_pte_lockptr(struct hstate *h, struct vm_area_struct *vma,
+					   pte_t *pte);
 #ifndef hugepages_supported
 /*
  * Some platform decide whether they support huge pages at boot
@@ -1073,8 +1066,11 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
 }
 
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
-					   struct mm_struct *mm, pte_t *pte)
+					   struct vm_area_struct *vma,
+					   pte_t *pte)
 {
+	struct mm_struct *mm = vma->vm_mm;
+
 	return &mm->page_table_lock;
 }
 
@@ -1096,15 +1092,8 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
 }
 #endif	/* CONFIG_HUGETLB_PAGE */
 
-static inline spinlock_t *huge_pte_lock(struct hstate *h,
-					struct mm_struct *mm, pte_t *pte)
-{
-	spinlock_t *ptl;
-
-	ptl = huge_pte_lockptr(h, mm, pte);
-	spin_lock(ptl);
-	return ptl;
-}
+spinlock_t *huge_pte_lock(struct hstate *h, struct vm_area_struct *vma,
+					pte_t *pte);
 
 #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
 extern void __init hugetlb_cma_reserve(int order);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index b2ec0aa1ff45..125439fc88b6 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -432,7 +432,7 @@ static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	ptl = huge_pte_lock(h, walk->mm, pte);
+	ptl = huge_pte_lock(h, walk->vma, pte);
 	entry = huge_ptep_get(pte);
 	if (!pte_present(entry))
 		goto out;
@@ -555,7 +555,7 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	ptl = huge_pte_lock(h, walk->mm, pte);
+	ptl = huge_pte_lock(h, walk->vma, pte);
 	entry = huge_ptep_get(pte);
 	if (!pte_present(entry))
 		goto out;
diff --git a/mm/hmm.c b/mm/hmm.c
index 3fd3242c5e50..95b443f2e48e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -486,7 +486,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
+	ptl = huge_pte_lock(hstate_vma(vma), vma, pte);
 	entry = huge_ptep_get(pte);
 
 	i = (start - range->start) >> PAGE_SHIFT;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e02df3527a9c..c1352ab7f941 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -94,8 +94,32 @@ DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
-/* Forward declaration */
+/* Forward declarations */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
+static bool vma_range_shareable(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end);
+
+spinlock_t *huge_pte_lockptr(struct hstate *h, struct vm_area_struct *vma,
+				pte_t *pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	if (huge_page_size(h) == PMD_SIZE &&
+			!vma_range_shareable(vma, vma->vm_start, vma->vm_end))
+		return pmd_lockptr(mm, (pmd_t *) pte);
+	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
+	return &mm->page_table_lock;
+}
+
+spinlock_t *huge_pte_lock(struct hstate *h, struct vm_area_struct *vma,
+				pte_t *pte)
+{
+	spinlock_t *ptl;
+
+	ptl = huge_pte_lockptr(h, vma, pte);
+	spin_lock(ptl);
+	return ptl;
+}
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -4753,8 +4777,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
 			continue;
 
-		dst_ptl = huge_pte_lock(h, dst, dst_pte);
-		src_ptl = huge_pte_lockptr(h, src, src_pte);
+		dst_ptl = huge_pte_lock(h, dst_vma, dst_pte);
+		src_ptl = huge_pte_lockptr(h, src_vma, src_pte);
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 		entry = huge_ptep_get(src_pte);
 		dst_entry = huge_ptep_get(dst_pte);
@@ -4830,8 +4854,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 				put_page(ptepage);
 
 				/* Install the new huge page if src pte stable */
-				dst_ptl = huge_pte_lock(h, dst, dst_pte);
-				src_ptl = huge_pte_lockptr(h, src, src_pte);
+				dst_ptl = huge_pte_lock(h, dst_vma, dst_pte);
+				src_ptl = huge_pte_lockptr(h, src_vma, src_pte);
 				spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 				entry = huge_ptep_get(src_pte);
 				if (!pte_same(src_pte_old, entry)) {
@@ -4882,8 +4906,8 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 	spinlock_t *src_ptl, *dst_ptl;
 	pte_t pte;
 
-	dst_ptl = huge_pte_lock(h, mm, dst_pte);
-	src_ptl = huge_pte_lockptr(h, mm, src_pte);
+	dst_ptl = huge_pte_lock(h, vma, dst_pte);
+	src_ptl = huge_pte_lockptr(h, vma, src_pte);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst ptlocks
@@ -4988,7 +5012,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 		if (!ptep)
 			continue;
 
-		ptl = huge_pte_lock(h, mm, ptep);
+		ptl = huge_pte_lock(h, vma, ptep);
 		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
 			spin_unlock(ptl);
 			tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
@@ -5485,7 +5509,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			 * here.  Before returning error, get ptl and make
 			 * sure there really is no pte entry.
 			 */
-			ptl = huge_pte_lock(h, mm, ptep);
+			ptl = huge_pte_lock(h, vma, ptep);
 			ret = 0;
 			if (huge_pte_none(huge_ptep_get(ptep)))
 				ret = vmf_error(PTR_ERR(page));
@@ -5553,7 +5577,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 		vma_end_reservation(h, vma, haddr);
 	}
 
-	ptl = huge_pte_lock(h, mm, ptep);
+	ptl = huge_pte_lock(h, vma, ptep);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size) {
 		beyond_i_size = true;
@@ -5733,7 +5757,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 								vma, haddr);
 	}
 
-	ptl = huge_pte_lock(h, mm, ptep);
+	ptl = huge_pte_lock(h, vma, ptep);
 
 	/* Check for a racing update before calling hugetlb_wp() */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -5935,7 +5959,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		page_in_pagecache = true;
 	}
 
-	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
+	ptl = huge_pte_lockptr(h, dst_vma, dst_pte);
 	spin_lock(ptl);
 
 	/*
@@ -6089,7 +6113,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
 				      huge_page_size(h));
 		if (pte)
-			ptl = huge_pte_lock(h, mm, pte);
+			ptl = huge_pte_lock(h, vma, pte);
 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
 
 		/*
@@ -6267,7 +6291,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		ptep = huge_pte_offset(mm, address, psize);
 		if (!ptep)
 			continue;
-		ptl = huge_pte_lock(h, mm, ptep);
+		ptl = huge_pte_lock(h, vma, ptep);
 		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
 			/*
 			 * When uffd-wp is enabled on the vma, unshare
@@ -6583,26 +6607,44 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
 	return saddr;
 }
 
-static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+static bool __vma_aligned_range_shareable(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
 {
-	unsigned long base = addr & PUD_MASK;
-	unsigned long end = base + PUD_SIZE;
-
 	/*
 	 * check on proper vm_flags and page table alignment
 	 */
-	if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
+	if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end))
 		return true;
 	return false;
 }
 
+static bool vma_range_shareable(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
+		      v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+	if (v_start >= v_end)
+		return false;
+
+	return __vma_aligned_range_shareable(vma, v_start, v_end);
+}
+
+static bool vma_addr_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+	unsigned long start = addr & PUD_MASK;
+	unsigned long end = start + PUD_SIZE;
+
+	return __vma_aligned_range_shareable(vma, start, end);
+}
+
 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
 {
 #ifdef CONFIG_USERFAULTFD
 	if (uffd_disable_huge_pmd_share(vma))
 		return false;
 #endif
-	return vma_shareable(vma, addr);
+	return vma_addr_shareable(vma, addr);
 }
 
 /*
@@ -6672,7 +6714,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!spte)
 		goto out;
 
-	ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
+	ptl = huge_pte_lock(hstate_vma(vma), vma, spte);
 	if (pud_none(*pud)) {
 		pud_populate(mm, pud,
 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
@@ -6719,6 +6761,12 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+static bool vma_range_shareable(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	return false;
+}
+
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, pud_t *pud)
 {
@@ -7034,7 +7082,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 		ptep = huge_pte_offset(mm, address, sz);
 		if (!ptep)
 			continue;
-		ptl = huge_pte_lock(h, mm, ptep);
+		ptl = huge_pte_lock(h, vma, ptep);
 		/* We don't want 'address' to be changed */
 		huge_pmd_unshare(mm, vma, &tmp, ptep);
 		spin_unlock(ptl);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 58af432a39b2..4692640847eb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -577,7 +577,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->vma, pte);
 	entry = huge_ptep_get(pte);
 	if (!pte_present(entry))
 		goto unlock;
diff --git a/mm/migrate.c b/mm/migrate.c
index b2678279eb43..3d765ee101ad 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -318,7 +318,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 void migration_entry_wait_huge(struct vm_area_struct *vma,
 		struct mm_struct *mm, pte_t *pte)
 {
-	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
+	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma, pte);
 	__migration_entry_wait(mm, pte, ptl);
 }
 
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index c10f839fc410..f09eaef2a828 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -174,7 +174,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 		if (!pvmw->pte)
 			return false;
 
-		pvmw->ptl = huge_pte_lockptr(hstate, mm, pvmw->pte);
+		pvmw->ptl = huge_pte_lockptr(hstate, vma, pvmw->pte);
 		spin_lock(pvmw->ptl);
 		if (!check_pte(pvmw))
 			return not_found(pvmw);
-- 
2.35.1
