Message-Id: <20200307010353.172991-1-cannonmatthews@google.com>
Date:   Fri,  6 Mar 2020 17:03:53 -0800
From:   Cannon Matthews <cannonmatthews@...gle.com>
To:     Mike Kravetz <mike.kravetz@...cle.com>,
        Andrew Morton <akpm@...ux-foundation.org>
Cc:     Matthew Wilcox <willy@...radead.org>,
        Michal Hocko <mhocko@...nel.org>,
        David Rientjes <rientjes@...gle.com>,
        Greg Thelen <gthelen@...gle.com>,
        Salman Qazi <sqazi@...gle.com>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org,
        Cannon Matthews <cannonmatthews@...gle.com>
Subject: [PATCH] mm: clear 1G pages with streaming stores on x86

Reimplement clear_gigantic_page() to clear gigantic (1G) pages using the
non-temporal streaming store instructions that bypass the cache
(movnti), since an entire 1GiB region will not fit in the cache anyway.

Doing an mlock() on a 512GiB 1G-hugetlb region previously took 134
seconds on average, about 260 ms/GiB, which is quite slow. Using
`movnti` and optimizing the control flow over the constituent small
pages improves this by roughly 3-4x: the same 512GiB mlock() now takes
only 34 seconds on average, or 67 ms/GiB.
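
(For reference, a measurement of this sort can be taken from userspace
with a sketch along the following lines. This is illustrative only, not
the exact tool behind the numbers above; it assumes enough 1G hugepages
have been reserved, e.g. hugepagesz=1G hugepages=512 on the kernel
command line.)

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <time.h>

#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB	(30 << 26)	/* 2^30 encoded at MAP_HUGE_SHIFT */
#endif

#define GiB	(1024ULL * 1024 * 1024)

int main(void)
{
	size_t len = 512 * GiB;		/* adjust to the pages available */
	struct timespec t0, t1;
	void *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_1GB,
		 -1, 0);
	if (p == MAP_FAILED)
		return 1;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	if (mlock(p, len))		/* faults in and clears every page */
		return 1;
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("mlock: %.3f s\n",
	       (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
	return 0;
}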

The assembly for the clear_page_nt routine is more or less taken
directly from the output of gcc -O3 for this function, with some tweaks
to support arbitrary sizes and to move the sfence barrier out to the
caller:

#include <emmintrin.h>

#define GiB (1024ULL * 1024 * 1024)

void clear_page_nt_64i (void *page)
{
  for (int i = 0; i < GiB / sizeof(long long int); ++i)
    {
      /* movnti: 64-bit non-temporal store, bypassing the cache */
      _mm_stream_si64 (((long long int*)page) + i, 0);
    }
  /* order the non-temporal stores before any later stores */
  _mm_sfence ();
}

Tested:
	Time to `mlock()` a 512GiB region on a Broadwell CPU
				AVG time (s)	% imp.	ms/page
	clear_page_erms		133.584		-	261
	clear_page_nt		34.154		74.43%	67
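
	(In the table, "ms/page" is per 1G page, i.e. the average time
	divided by the 512 pages in the region: 133.584 s / 512 ~= 261 ms
	and 34.154 s / 512 ~= 67 ms. "% imp." is the relative reduction,
	(133.584 - 34.154) / 133.584 ~= 74.43%.)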

An earlier version of this code was sent as an RFC patch around July
2018 (https://patchwork.kernel.org/patch/10543193/) but was never
merged.

Signed-off-by: Cannon Matthews <cannonmatthews@...gle.com>
---
 MAINTAINERS                        |  1 +
 arch/x86/Kconfig                   |  4 ++++
 arch/x86/include/asm/page_64.h     |  1 +
 arch/x86/lib/Makefile              |  2 +-
 arch/x86/lib/clear_gigantic_page.c | 28 ++++++++++++++++++++++++++++
 arch/x86/lib/clear_page_64.S       | 19 +++++++++++++++++++
 include/linux/mm.h                 |  2 ++
 mm/memory.c                        |  2 ++
 8 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/lib/clear_gigantic_page.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 68eebf3650ac..efe84f085404 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7702,6 +7702,7 @@ S:	Maintained
 F:	fs/hugetlbfs/
 F:	mm/hugetlb.c
 F:	include/linux/hugetlb.h
+F:	arch/x86/lib/clear_gigantic_page.c
 F:	Documentation/admin-guide/mm/hugetlbpage.rst
 F:	Documentation/vm/hugetlbfs_reserv.rst
 F:	Documentation/ABI/testing/sysfs-kernel-mm-hugepages
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index beea77046f9b..f49e7b6f6851 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -70,6 +70,7 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MEM_ENCRYPT
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
+	select ARCH_HAS_CLEAR_GIGANTIC_PAGE	if X86_64
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
@@ -290,6 +291,9 @@ config ARCH_MAY_HAVE_PC_FDC
 config GENERIC_CALIBRATE_DELAY
 	def_bool y
 
+config ARCH_HAS_CLEAR_GIGANTIC_PAGE
+	bool
+
 config ARCH_HAS_CPU_RELAX
 	def_bool y
 
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 939b1cff4a7b..6ea60883b6d6 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -55,6 +55,7 @@ static inline void clear_page(void *page)
 }
 
 void copy_page(void *to, void *from);
+void clear_page_nt(void *page, u64 page_size);
 
 #endif	/* !__ASSEMBLY__ */
 
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 5246db42de45..a620c6636210 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -56,7 +56,7 @@ endif
 else
         obj-y += iomap_copy_64.o
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
-        lib-y += clear_page_64.o copy_page_64.o
+        lib-y += clear_page_64.o copy_page_64.o clear_gigantic_page.o
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o
 	lib-y += cmpxchg16b_emu.o
diff --git a/arch/x86/lib/clear_gigantic_page.c b/arch/x86/lib/clear_gigantic_page.c
new file mode 100644
index 000000000000..6fcb494ec9bc
--- /dev/null
+++ b/arch/x86/lib/clear_gigantic_page.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/page.h>
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+
+void clear_gigantic_page(struct page *page, unsigned long addr,
+			 unsigned int pages)
+{
+	int i;
+	void *dest = page_to_virt(page);
+
+	/*
+	 * cond_resched() every 2M. Hypothetical page sizes not divisible by
+	 * this are not supported.
+	 */
+	BUG_ON(pages % HPAGE_PMD_NR != 0);
+	for (i = 0; i < pages; i += HPAGE_PMD_NR) {
+		clear_page_nt(dest + (i * PAGE_SIZE), HPAGE_PMD_NR * PAGE_SIZE);
+		cond_resched();
+	}
+	/* clear_page_nt requires an `sfence` barrier. */
+	wmb();
+}
+#endif /* defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) */
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index c4c7dd115953..1224094fd863 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -50,3 +50,22 @@ SYM_FUNC_START(clear_page_erms)
 	ret
 SYM_FUNC_END(clear_page_erms)
 EXPORT_SYMBOL_GPL(clear_page_erms)
+
+/*
+ * Zero memory using non-temporal stores, bypassing the cache.
+ * The caller must issue an `sfence` (wmb()) afterwards.
+ * %rdi - destination.
+ * %rsi - page size in bytes. Must be a multiple of 8 bytes.
+ */
+SYM_FUNC_START(clear_page_nt)
+	leaq	(%rdi,%rsi), %rdx
+	xorl	%eax, %eax
+	.p2align 4,,10
+	.p2align 3
+.L2:
+	movnti	%rax, (%rdi)
+	addq	$8, %rdi
+	cmpq	%rdx, %rdi
+	jne	.L2
+	ret
+SYM_FUNC_END(clear_page_nt)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c54fb96cb1e6..a57f9007374b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2856,6 +2856,8 @@ enum mf_action_page_type {
 };
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+extern void clear_gigantic_page(struct page *page, unsigned long addr,
+				unsigned int pages);
 extern void clear_huge_page(struct page *page,
 			    unsigned long addr_hint,
 			    unsigned int pages_per_huge_page);
diff --git a/mm/memory.c b/mm/memory.c
index e8bfdf0d9d1d..2a13bf102890 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4706,6 +4706,7 @@ static inline void process_huge_page(
 	}
 }
 
+#ifndef CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE
 static void clear_gigantic_page(struct page *page,
 				unsigned long addr,
 				unsigned int pages_per_huge_page)
@@ -4720,6 +4721,7 @@ static void clear_gigantic_page(struct page *page,
 		clear_user_highpage(p, addr + i * PAGE_SIZE);
 	}
 }
+#endif  /* CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE */
 
 static void clear_subpage(unsigned long addr, int idx, void *arg)
 {
-- 
2.25.1.481.gfbce0eb801-goog
