Message-Id: <20211110105428.32458-6-zhengqi.arch@bytedance.com>
Date:   Wed, 10 Nov 2021 18:54:18 +0800
From:   Qi Zheng <zhengqi.arch@...edance.com>
To:     akpm@...ux-foundation.org, tglx@...utronix.de,
        kirill.shutemov@...ux.intel.com, mika.penttila@...tfour.com,
        david@...hat.com, jgg@...dia.com
Cc:     linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
        linux-mm@...ck.org, songmuchun@...edance.com,
        zhouchengming@...edance.com, Qi Zheng <zhengqi.arch@...edance.com>
Subject: [PATCH v3 05/15] mm: add pmd_installed_type return for __pte_alloc() and other friends

When we call __pte_alloc() or one of its friends, a huge pmd might be
created concurrently by another thread. This is why pmd_trans_unstable()
has to be called after __pte_alloc() or one of its friends returns.

This patch adds an enum pmd_installed_type return value to __pte_alloc()
and its friends, so that callers can detect a concurrently installed
huge pmd from the return value instead of calling pmd_trans_unstable()
again (see the sketch below).

This patch introduces no functional change; it is just preparation for
later patches in this series.
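
For illustration only, here is a minimal caller-side sketch of the new
convention; example_fault_path() is a made-up helper, while pte_alloc(),
INSTALLED_PTE, INSTALLED_HUGE_PMD and VM_FAULT_OOM are the real names
used in the diff below:

static vm_fault_t example_fault_path(struct mm_struct *mm, pmd_t *pmd)
{
	int ret = pte_alloc(mm, pmd);

	if (ret < 0)
		/* __pte_alloc() failed to allocate a pte page */
		return VM_FAULT_OOM;
	if (ret == INSTALLED_HUGE_PMD)
		/* a huge pmd was installed under us, let the fault retry */
		return 0;

	/* ret == INSTALLED_PTE: a pte page table is in place */
	return 0;
}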

Signed-off-by: Qi Zheng <zhengqi.arch@...edance.com>
---
 include/linux/mm.h    | 20 +++++++++++++++++---
 mm/debug_vm_pgtable.c |  2 +-
 mm/filemap.c          | 11 +++++++----
 mm/gup.c              |  2 +-
 mm/internal.h         |  3 ++-
 mm/memory.c           | 39 ++++++++++++++++++++++++++-------------
 mm/migrate.c          | 17 ++---------------
 mm/mremap.c           |  2 +-
 mm/userfaultfd.c      | 24 +++++++++++++++---------
 9 files changed, 72 insertions(+), 48 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 706da081b9f8..52f36fde2f11 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2306,13 +2306,27 @@ static inline void pgtable_pte_page_dtor(struct page *page)
 	dec_lruvec_page_state(page, NR_PAGETABLE);
 }
 
-#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))
+enum pmd_installed_type {
+	INSTALLED_PTE,
+	INSTALLED_HUGE_PMD,
+};
+
+static inline int pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+{
+	if (unlikely(pmd_none(*(pmd))))
+		return __pte_alloc(mm, pmd);
+	if (unlikely(is_huge_pmd(*pmd)))
+		return INSTALLED_HUGE_PMD;
+
+	return INSTALLED_PTE;
+}
+#define pte_alloc pte_alloc
 
 #define pte_alloc_map(mm, pmd, address)			\
-	(pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))
+	(pte_alloc(mm, pmd) < 0 ? NULL : pte_offset_map(pmd, address))
 
 #define pte_alloc_map_lock(mm, pmd, address, ptlp)	\
-	(pte_alloc(mm, pmd) ?			\
+	(pte_alloc(mm, pmd) < 0 ?			\
 		 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
 
 #define pte_alloc_kernel(pmd, address)			\
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 228e3954b90c..b8322c55e65d 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1170,7 +1170,7 @@ static int __init init_args(struct pgtable_debug_args *args)
 	args->start_pmdp = pmd_offset(args->pudp, 0UL);
 	WARN_ON(!args->start_pmdp);
 
-	if (pte_alloc(args->mm, args->pmdp)) {
+	if (pte_alloc(args->mm, args->pmdp) < 0) {
 		pr_err("Failed to allocate pte entries\n");
 		ret = -ENOMEM;
 		goto error;
diff --git a/mm/filemap.c b/mm/filemap.c
index ff8d19b7ce1d..23363f8ddbbe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3217,12 +3217,15 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
 		}
 	}
 
-	if (pmd_none(*vmf->pmd))
-		pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
+	if (pmd_none(*vmf->pmd)) {
+		int ret = pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
 
-	/* See comment in handle_pte_fault() */
-	if (pmd_devmap_trans_unstable(vmf->pmd))
+		if (unlikely(ret == INSTALLED_HUGE_PMD))
+			goto out;
+	} else if (pmd_devmap_trans_unstable(vmf->pmd)) {
+		/* See comment in handle_pte_fault() */
 		goto out;
+	}
 
 	return false;
 
diff --git a/mm/gup.c b/mm/gup.c
index 2c51e9748a6a..2def775232a3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -699,7 +699,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 		} else {
 			spin_unlock(ptl);
 			split_huge_pmd(vma, pmd, address);
-			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
+			ret = pte_alloc(mm, pmd) < 0 ? -ENOMEM : 0;
 		}
 
 		return ret ? ERR_PTR(ret) :
diff --git a/mm/internal.h b/mm/internal.h
index 3b79a5c9427a..474d6e3443f8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -67,7 +67,8 @@ bool __folio_end_writeback(struct folio *folio);
 
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
-void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
+enum pmd_installed_type pmd_install(struct mm_struct *mm, pmd_t *pmd,
+				    pgtable_t *pte);
 
 static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 {
diff --git a/mm/memory.c b/mm/memory.c
index bec6a5d5ee7c..8a39c0e58324 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -437,8 +437,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	}
 }
 
-void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
+enum pmd_installed_type pmd_install(struct mm_struct *mm, pmd_t *pmd,
+				    pgtable_t *pte)
 {
+	int ret = INSTALLED_PTE;
 	spinlock_t *ptl = pmd_lock(mm, pmd);
 
 	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
@@ -459,20 +461,26 @@ void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
 		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 		pmd_populate(mm, pmd, *pte);
 		*pte = NULL;
+	} else if (is_huge_pmd(*pmd)) {
+		/* See comment in handle_pte_fault() */
+		ret = INSTALLED_HUGE_PMD;
 	}
 	spin_unlock(ptl);
+
+	return ret;
 }
 
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
 {
+	enum pmd_installed_type ret;
 	pgtable_t new = pte_alloc_one(mm);
 	if (!new)
 		return -ENOMEM;
 
-	pmd_install(mm, pmd, &new);
+	ret = pmd_install(mm, pmd, &new);
 	if (new)
 		pte_free(mm, new);
-	return 0;
+	return ret;
 }
 
 int __pte_alloc_kernel(pmd_t *pmd)
@@ -1813,7 +1821,7 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
 
 	/* Allocate the PTE if necessary; takes PMD lock once only. */
 	ret = -ENOMEM;
-	if (pte_alloc(mm, pmd))
+	if (pte_alloc(mm, pmd) < 0)
 		goto out;
 
 	while (pages_to_write_in_pmd) {
@@ -3713,6 +3721,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	struct page *page;
 	vm_fault_t ret = 0;
 	pte_t entry;
+	int alloc_ret;
 
 	/* File mapping without ->vm_ops ? */
 	if (vma->vm_flags & VM_SHARED)
@@ -3728,11 +3737,11 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	 *
 	 * Here we only have mmap_read_lock(mm).
 	 */
-	if (pte_alloc(vma->vm_mm, vmf->pmd))
+	alloc_ret = pte_alloc(vma->vm_mm, vmf->pmd);
+	if (alloc_ret < 0)
 		return VM_FAULT_OOM;
-
 	/* See comment in handle_pte_fault() */
-	if (unlikely(pmd_trans_unstable(vmf->pmd)))
+	if (unlikely(alloc_ret == INSTALLED_HUGE_PMD))
 		return 0;
 
 	/* Use the zero-page for reads */
@@ -4023,6 +4032,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	}
 
 	if (pmd_none(*vmf->pmd)) {
+		int alloc_ret;
+
 		if (PageTransCompound(page)) {
 			ret = do_set_pmd(vmf, page);
 			if (ret != VM_FAULT_FALLBACK)
@@ -4030,14 +4041,16 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 		}
 
 		if (vmf->prealloc_pte)
-			pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
-		else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
-			return VM_FAULT_OOM;
-	}
+			alloc_ret = pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
+		else
+			alloc_ret = pte_alloc(vma->vm_mm, vmf->pmd);
 
-	/* See comment in handle_pte_fault() */
-	if (pmd_devmap_trans_unstable(vmf->pmd))
+		if (unlikely(alloc_ret != INSTALLED_PTE))
+			return alloc_ret < 0 ? VM_FAULT_OOM : 0;
+	} else if (pmd_devmap_trans_unstable(vmf->pmd)) {
+		/* See comment in handle_pte_fault() */
 		return 0;
+	}
 
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 				      vmf->address, &vmf->ptl);
diff --git a/mm/migrate.c b/mm/migrate.c
index cf25b00f03c8..bdfdfd3b50be 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2731,21 +2731,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
 		goto abort;
 
-	/*
-	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
-	 * pte_offset_map() on pmds where a huge pmd might be created
-	 * from a different thread.
-	 *
-	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
-	 * parallel threads are excluded by other means.
-	 *
-	 * Here we only have mmap_read_lock(mm).
-	 */
-	if (pte_alloc(mm, pmdp))
-		goto abort;
-
-	/* See the comment in pte_alloc_one_map() */
-	if (unlikely(pmd_trans_unstable(pmdp)))
+	/* See the comment in do_anonymous_page() */
+	if (unlikely(pte_alloc(mm, pmdp) != INSTALLED_PTE))
 		goto abort;
 
 	if (unlikely(anon_vma_prepare(vma)))
diff --git a/mm/mremap.c b/mm/mremap.c
index c6e9da09dd0a..fc5c56858883 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -551,7 +551,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 				continue;
 		}
 
-		if (pte_alloc(new_vma->vm_mm, new_pmd))
+		if (pte_alloc(new_vma->vm_mm, new_pmd) < 0)
 			break;
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
 			  new_pmd, new_addr, need_rmap_locks);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0780c2a57ff1..2cea08e7f076 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -592,15 +592,21 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 			err = -EEXIST;
 			break;
 		}
-		if (unlikely(pmd_none(dst_pmdval)) &&
-		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
-			err = -ENOMEM;
-			break;
-		}
-		/* If an huge pmd materialized from under us fail */
-		if (unlikely(pmd_trans_huge(*dst_pmd))) {
-			err = -EFAULT;
-			break;
+
+		if (unlikely(pmd_none(dst_pmdval))) {
+			int ret = __pte_alloc(dst_mm, dst_pmd);
+
+			/*
+			 * If there is not enough memory or a huge pmd
+			 * materialized from under us
+			 */
+			if (unlikely(ret < 0)) {
+				err = -ENOMEM;
+				break;
+			} else if (unlikely(ret == INSTALLED_HUGE_PMD)) {
+				err = -EFAULT;
+				break;
+			}
 		}
 
 		BUG_ON(pmd_none(*dst_pmd));
-- 
2.11.0
