linux-kernel - [RFC PATCH 21/37] mm: implement speculative handling in do_swap

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210407014502.24091-22-michel@lespinasse.org>
Date:   Tue,  6 Apr 2021 18:44:46 -0700
From:   Michel Lespinasse <michel@...pinasse.org>
To:     Linux-MM <linux-mm@...ck.org>
Cc:     Laurent Dufour <ldufour@...ux.ibm.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Michal Hocko <mhocko@...e.com>,
        Matthew Wilcox <willy@...radead.org>,
        Rik van Riel <riel@...riel.com>,
        Paul McKenney <paulmck@...nel.org>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Suren Baghdasaryan <surenb@...gle.com>,
        Joel Fernandes <joelaf@...gle.com>,
        Rom Lemarchand <romlem@...gle.com>,
        Linux-Kernel <linux-kernel@...r.kernel.org>,
        Michel Lespinasse <michel@...pinasse.org>
Subject: [RFC PATCH 21/37] mm: implement speculative handling in do_swap_page()

If the pte is larger than long, use pte_spinlock() to lock the page table
when verifying the pte - pte_spinlock() is necessary to ensure the page
table is still valid when we are locking it.

Abort speculative faults if the pte is not a swap entry, or if the desired
page is not found in swap cache, to keep things as simple as possible.

Only use trylock when locking the swapped page - again to keep things
simple, and also the usual lock_page_or_retry would otherwise try to
release the mmap lock which is not held in the speculative case.

Use pte_map_lock() to ensure proper synchronization when finally committing
the faulted page to the mm address space.

Signed-off-by: Michel Lespinasse <michel@...pinasse.org>
---
 mm/memory.c | 74 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 32 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index fc555fae0844..ab3160719bf3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2637,30 +2637,6 @@ bool __pte_map_lock(struct vm_fault *vmf)
 
 #endif	/* CONFIG_SPECULATIVE_PAGE_FAULT */
 
-/*
- * handle_pte_fault chooses page fault handler according to an entry which was
- * read non-atomically.  Before making any commitment, on those architectures
- * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
- * parts, do_swap_page must check under lock before unmapping the pte and
- * proceeding (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-				pte_t *page_table, pte_t orig_pte)
-{
-	int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
-	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spinlock_t *ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
-		same = pte_same(*page_table, orig_pte);
-		spin_unlock(ptl);
-	}
-#endif
-	pte_unmap(page_table);
-	return same;
-}
-
 static inline bool cow_user_page(struct page *dst, struct page *src,
 				 struct vm_fault *vmf)
 {
@@ -3369,12 +3345,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		return VM_FAULT_RETRY;
 	}
 
-	if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
-		goto out;
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
+	if (sizeof(pte_t) > sizeof(unsigned long)) {
+		/*
+		 * vmf->orig_pte was read non-atomically. Before making
+		 * any commitment, on those architectures or configurations
+		 * (e.g. i386 with PAE) which might give a mix of
+		 * unmatched parts, we must check under lock before
+		 * unmapping the pte and proceeding.
+		 *
+		 * (but do_wp_page is only called after already making
+		 * such a check; and do_anonymous_page can safely
+		 * check later on).
+		 */
+		if (!pte_spinlock(vmf))
+			return VM_FAULT_RETRY;
+		if (!pte_same(*vmf->pte, vmf->orig_pte))
+			goto unlock;
+		spin_unlock(vmf->ptl);
+	}
+#endif
+	pte_unmap(vmf->pte);
+	vmf->pte = NULL;
 
 	entry = pte_to_swp_entry(vmf->orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
-		if (is_migration_entry(entry)) {
+		if (vmf->flags & FAULT_FLAG_SPECULATIVE) {
+			ret = VM_FAULT_RETRY;
+		} else if (is_migration_entry(entry)) {
 			migration_entry_wait(vma->vm_mm, vmf->pmd,
 					     vmf->address);
 		} else if (is_device_private_entry(entry)) {
@@ -3395,8 +3393,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	swapcache = page;
 
 	if (!page) {
-		struct swap_info_struct *si = swp_swap_info(entry);
+		struct swap_info_struct *si;
 
+		if (vmf->flags & FAULT_FLAG_SPECULATIVE) {
+			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+			return VM_FAULT_RETRY;
+		}
+
+		si = swp_swap_info(entry);
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
 		    __swap_count(entry) == 1) {
 			/* skip swapcache */
@@ -3459,7 +3463,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		goto out_release;
 	}
 
-	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
+	if (vmf->flags & FAULT_FLAG_SPECULATIVE)
+		locked = trylock_page(page);
+	else
+		locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
 
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 	if (!locked) {
@@ -3487,10 +3494,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	cgroup_throttle_swaprate(page, GFP_KERNEL);
 
 	/*
-	 * Back out if somebody else already faulted in this pte.
+	 * Back out if the VMA has changed in our back during a speculative
+	 * page fault or if somebody else already faulted in this pte.
 	 */
-	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
-			&vmf->ptl);
+	if (!pte_map_lock(vmf)) {
+		ret = VM_FAULT_RETRY;
+		goto out_page;
+	}
 	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
 		goto out_nomap;
 
-- 
2.20.1