lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20180920200358.31680-8-keith.busch@intel.com>
Date:   Thu, 20 Sep 2018 14:03:58 -0600
From:   Keith Busch <keith.busch@...el.com>
To:     linux-kernel@...r.kernel.org
Cc:     Kirill Shutemov <kirill.shutemov@...ux.intel.com>,
        Dave Hansen <dave.hansen@...el.com>,
        Dan Williams <dan.j.williams@...el.com>,
        Keith Busch <keith.busch@...el.com>
Subject: [PATCHv2 7/7] mm/gup: Cache dev_pagemap while pinning pages

Pinning pages from ZONE_DEVICE memory needs to check the backing device's
liveness, which is tracked in the device's dev_pagemap metadata. This
metadata is stored in a radix tree, and looking it up adds measurable
software overhead.

This patch avoids repeating this relatively costly lookup, when a
dev_pagemap is used, by caching the last dev_pagemap while getting user
pages. The gup_benchmark reports that this reduces the time to get user
pages to as low as 1/3 of the previous time.

Cc: Kirill Shutemov <kirill.shutemov@...ux.intel.com>
Cc: Dave Hansen <dave.hansen@...el.com>
Cc: Dan Williams <dan.j.williams@...el.com>
Signed-off-by: Keith Busch <keith.busch@...el.com>
---
 include/linux/mm.h |  8 +++++++-
 mm/gup.c           | 41 ++++++++++++++++++++++++-----------------
 mm/huge_memory.c   | 35 +++++++++++++++--------------------
 3 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f1fd241c9071..d688e18a19c4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -380,6 +380,7 @@ struct vm_fault {
 
 struct follow_page_context {
 	struct vm_area_struct *vma;
+	struct dev_pagemap *pgmap;
 	unsigned long address;
 	unsigned int page_mask;
 	unsigned int flags;
@@ -2546,14 +2547,19 @@ struct page *follow_page_mask(struct follow_page_context *ctx);
 static inline struct page *follow_page(struct vm_area_struct *vma,
 		unsigned long address, unsigned int foll_flags)
 {
+	struct page *page;
 	struct follow_page_context ctx = {
 		.vma = vma,
+		.pgmap = NULL,
 		.address = address,
 		.page_mask = 0,
 		.flags = foll_flags,
 	};
 
-	return follow_page_mask(&ctx);
+	page = follow_page_mask(&ctx);
+	if (ctx.pgmap)
+		put_dev_pagemap(ctx.pgmap);
+	return page;
 }
 
 #define FOLL_WRITE	0x01	/* check pte is writable */
diff --git a/mm/gup.c b/mm/gup.c
index 4c4da54f8dbe..c98ea05eaa59 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -72,7 +72,6 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 static struct page *follow_page_pte(struct follow_page_context *ctx, pmd_t *pmd)
 {
 	struct mm_struct *mm = ctx->vma->vm_mm;
-	struct dev_pagemap *pgmap = NULL;
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t *ptep, pte;
@@ -114,8 +113,8 @@ static struct page *follow_page_pte(struct follow_page_context *ctx, pmd_t *pmd)
 		 * Only return device mapping pages in the FOLL_GET case since
 		 * they are only valid while holding the pgmap reference.
 		 */
-		pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
-		if (pgmap)
+		ctx->pgmap = get_dev_pagemap(pte_pfn(pte), ctx->pgmap);
+		if (ctx->pgmap)
 			page = pte_page(pte);
 		else
 			goto no_page;
@@ -154,9 +153,9 @@ static struct page *follow_page_pte(struct follow_page_context *ctx, pmd_t *pmd)
 		get_page(page);
 
 		/* drop the pgmap reference now that we hold the page */
-		if (pgmap) {
-			put_dev_pagemap(pgmap);
-			pgmap = NULL;
+		if (ctx->pgmap) {
+			put_dev_pagemap(ctx->pgmap);
+			ctx->pgmap = NULL;
 		}
 	}
 	if (ctx->flags & FOLL_TOUCH) {
@@ -645,7 +644,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned int gup_flags, struct page **pages,
 		struct vm_area_struct **vmas, int *nonblocking)
 {
-	long i = 0;
+	long ret = 0, i = 0;
 	struct vm_area_struct *vma = NULL;
 	struct follow_page_context ctx = {};
 
@@ -681,8 +680,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				goto next_page;
 			}
 
-			if (!vma || check_vma_flags(vma, gup_flags))
-				return i ? : -EFAULT;
+			if (!vma || check_vma_flags(vma, gup_flags)) {
+				ret = -EFAULT;
+				goto out;
+			}
 			if (is_vm_hugetlb_page(vma)) {
 				i = follow_hugetlb_page(mm, vma, pages, vmas,
 						&start, &nr_pages, i,
@@ -697,23 +698,25 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		 * If we have a pending SIGKILL, don't keep faulting pages and
 		 * potentially allocating memory.
 		 */
-		if (unlikely(fatal_signal_pending(current)))
-			return i ? i : -ERESTARTSYS;
+		if (unlikely(fatal_signal_pending(current))) {
+			ret = -ERESTARTSYS;
+			goto out;
+		}
 		cond_resched();
 
 		page = follow_page_mask(&ctx);
 		if (!page) {
-			int ret;
 			ret = faultin_page(tsk, &ctx, nonblocking);
 			switch (ret) {
 			case 0:
 				goto retry;
+			case -EBUSY:
+				ret = 0;
+				/* FALLTHRU */
 			case -EFAULT:
 			case -ENOMEM:
 			case -EHWPOISON:
-				return i ? i : ret;
-			case -EBUSY:
-				return i;
+				goto out;
 			case -ENOENT:
 				goto next_page;
 			}
@@ -725,7 +728,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			 */
 			goto next_page;
 		} else if (IS_ERR(page)) {
-			return i ? i : PTR_ERR(page);
+			ret = PTR_ERR(page);
+			goto out;
 		}
 		if (pages) {
 			pages[i] = page;
@@ -745,7 +749,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		start += page_increm * PAGE_SIZE;
 		nr_pages -= page_increm;
 	} while (nr_pages);
-	return i;
+out:
+	if (ctx.pgmap)
+		put_dev_pagemap(ctx.pgmap);
+	return i ? i : ret;
 }
 
 static bool vma_permits_fault(struct vm_area_struct *vma,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index abd36e6afe2c..6787011385ce 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -851,12 +851,23 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 		update_mmu_cache_pmd(vma, addr, pmd);
 }
 
+static struct page *pagemap_page(struct follow_page_context *ctx,
+				 unsigned long pfn)
+{
+	struct page *page;
+
+	ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
+	if (!ctx->pgmap)
+		return ERR_PTR(-EFAULT);
+	page = pfn_to_page(pfn);
+	get_page(page);
+	return page;
+}
+
 struct page *follow_devmap_pmd(struct follow_page_context *ctx, pmd_t *pmd)
 {
 	unsigned long pfn = pmd_pfn(*pmd);
 	struct mm_struct *mm = ctx->vma->vm_mm;
-	struct dev_pagemap *pgmap;
-	struct page *page;
 
 	assert_spin_locked(pmd_lockptr(mm, pmd));
 
@@ -885,14 +896,7 @@ struct page *follow_devmap_pmd(struct follow_page_context *ctx, pmd_t *pmd)
 		return ERR_PTR(-EEXIST);
 
 	pfn += (ctx->address & ~PMD_MASK) >> PAGE_SHIFT;
-	pgmap = get_dev_pagemap(pfn, NULL);
-	if (!pgmap)
-		return ERR_PTR(-EFAULT);
-	page = pfn_to_page(pfn);
-	get_page(page);
-	put_dev_pagemap(pgmap);
-
-	return page;
+	return pagemap_page(ctx, pfn);
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1002,8 +1006,6 @@ struct page *follow_devmap_pud(struct follow_page_context *ctx, pud_t *pud)
 {
 	unsigned long pfn = pud_pfn(*pud);
 	struct mm_struct *mm = ctx->vma->vm_mm;
-	struct dev_pagemap *pgmap;
-	struct page *page;
 
 	assert_spin_locked(pud_lockptr(mm, pud));
 
@@ -1026,14 +1028,7 @@ struct page *follow_devmap_pud(struct follow_page_context *ctx, pud_t *pud)
 		return ERR_PTR(-EEXIST);
 
 	pfn += (ctx->address & ~PUD_MASK) >> PAGE_SHIFT;
-	pgmap = get_dev_pagemap(pfn, NULL);
-	if (!pgmap)
-		return ERR_PTR(-EFAULT);
-	page = pfn_to_page(pfn);
-	get_page(page);
-	put_dev_pagemap(pgmap);
-
-	return page;
+	return pagemap_page(ctx, pfn);
 }
 
 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-- 
2.14.4

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ