lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210225072910.2811795-4-namit@vmware.com>
Date:   Wed, 24 Feb 2021 23:29:07 -0800
From:   Nadav Amit <nadav.amit@...il.com>
To:     linux-mm@...ck.org, linux-kernel@...r.kernel.org
Cc:     Hugh Dickins <hughd@...gle.com>, Andy Lutomirski <luto@...nel.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
        Nadav Amit <namit@...are.com>,
        Sean Christopherson <seanjc@...gle.com>,
        Andrew Morton <akpm@...ux-foundation.org>, x86@...nel.org
Subject: [RFC 3/6] x86/vdso: introduce page_prefetch()

From: Nadav Amit <namit@...are.com>

Introduce a new vDSO function, page_prefetch(), which is to be used when
certain memory, which might be paged out, is expected to be used soon.
The function prefetches the page if needed. It returns zero if the page
is accessible after the call and -1 otherwise.

page_prefetch() is intended to be very lightweight both when the page is
already present and when the page is prefetched.

The implementation leverages the new vDSO exception tables mechanism.
page_prefetch() accesses the page for read and has a corresponding vDSO
exception-table entry that indicates that a #PF might occur and that, in
such a case, the page should be brought in asynchronously. If a #PF
indeed occurs, the page-fault handler sets the FAULT_FLAG_RETRY_NOWAIT
flag.

If the page-fault was not resolved, the page-fault handler does not
retry, and instead jumps to the new IP that is marked in the exception
table. The vDSO code at that address then returns -1 to the caller
accordingly.

Cc: Andy Lutomirski <luto@...nel.org>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Sean Christopherson <seanjc@...gle.com>
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: Ingo Molnar <mingo@...hat.com>
Cc: Borislav Petkov <bp@...en8.de>
Cc: Andrew Morton <akpm@...ux-foundation.org>
Cc: x86@...nel.org
Signed-off-by: Nadav Amit <namit@...are.com>
---
 arch/x86/Kconfig                |  1 +
 arch/x86/entry/vdso/Makefile    |  1 +
 arch/x86/entry/vdso/extable.c   | 59 +++++++++++++++++++++++++--------
 arch/x86/entry/vdso/vdso.lds.S  |  1 +
 arch/x86/entry/vdso/vprefetch.S | 39 ++++++++++++++++++++++
 arch/x86/include/asm/vdso.h     | 38 +++++++++++++++++++--
 arch/x86/mm/fault.c             | 11 ++++--
 lib/vdso/Kconfig                |  5 +++
 8 files changed, 136 insertions(+), 19 deletions(-)
 create mode 100644 arch/x86/entry/vdso/vprefetch.S

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 21f851179ff0..86a4c265e8af 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -136,6 +136,7 @@ config X86
 	select GENERIC_TIME_VSYSCALL
 	select GENERIC_GETTIMEOFDAY
 	select GENERIC_VDSO_TIME_NS
+	select GENERIC_VDSO_PREFETCH
 	select GUP_GET_PTE_LOW_HIGH		if X86_PAE
 	select HARDIRQS_SW_RESEND
 	select HARDLOCKUP_CHECK_TIMESTAMP	if X86_64
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 02e3e42f380b..e32ca1375b84 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -28,6 +28,7 @@ vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
 vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
 vobjs32-y += vdso32/vclock_gettime.o
 vobjs-$(CONFIG_X86_SGX)	+= vsgx.o
+vobjs-$(CONFIG_GENERIC_VDSO_PREFETCH) += vprefetch.o
 
 # files to link into kernel
 obj-y				+= vma.o extable.o
diff --git a/arch/x86/entry/vdso/extable.c b/arch/x86/entry/vdso/extable.c
index 93fb37bd32ad..e821887112ce 100644
--- a/arch/x86/entry/vdso/extable.c
+++ b/arch/x86/entry/vdso/extable.c
@@ -4,36 +4,67 @@
 #include <asm/current.h>
 #include <asm/traps.h>
 #include <asm/vdso.h>
+#include "extable.h"
 
 struct vdso_exception_table_entry {
 	int insn, fixup;
 	unsigned int mask, flags;
 };
 
-bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
-			  unsigned long error_code, unsigned long fault_addr)
+static unsigned long
+get_vdso_exception_table_entry(const struct pt_regs *regs, int trapnr,
+			       unsigned int *flags)
 {
 	const struct vdso_image *image = current->mm->context.vdso_image;
 	const struct vdso_exception_table_entry *extable;
 	unsigned int nr_entries, i;
 	unsigned long base;
+	unsigned long ip = regs->ip;
+	unsigned long vdso_base = (unsigned long)current->mm->context.vdso;
 
-	if (!current->mm->context.vdso)
-		return false;
-
-	base =  (unsigned long)current->mm->context.vdso + image->extable_base;
+	base = vdso_base + image->extable_base;
 	nr_entries = image->extable_len / (sizeof(*extable));
 	extable = image->extable;
 
 	for (i = 0; i < nr_entries; i++, base += sizeof(*extable)) {
-		if (regs->ip == base + extable[i].insn) {
-			regs->ip = base + extable[i].fixup;
-			regs->di = trapnr;
-			regs->si = error_code;
-			regs->dx = fault_addr;
-			return true;
-		}
+		if (ip != base + extable[i].insn)
+			continue;
+
+		if (!((1u << trapnr) & extable[i].mask))
+			continue;
+
+		/* found */
+		if (flags)
+			*flags = extable[i].flags;
+		return base + extable[i].fixup;
 	}
 
-	return false;
+	return 0;
+}
+
+bool __fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+			    unsigned long error_code, unsigned long fault_addr)
+{
+	unsigned long new_ip;
+
+	new_ip = get_vdso_exception_table_entry(regs, trapnr, NULL);
+	if (!new_ip)
+		return false;
+
+	instruction_pointer_set(regs, new_ip);
+	regs->di = trapnr;
+	regs->si = error_code;
+	regs->dx = fault_addr;
+	return true;
+}
+
+__attribute_const__ bool __is_async_vdso_exception(struct pt_regs *regs,
+						   int trapnr)
+{
+	unsigned long new_ip;
+	unsigned int flags;
+
+	new_ip = get_vdso_exception_table_entry(regs, trapnr, &flags);
+
+	return new_ip && (flags & ASM_VDSO_ASYNC_FLAGS);
 }
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index 4bf48462fca7..fd4ba24571c8 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -28,6 +28,7 @@ VERSION {
 		clock_getres;
 		__vdso_clock_getres;
 		__vdso_sgx_enter_enclave;
+		__vdso_prefetch_page;
 	local: *;
 	};
 }
diff --git a/arch/x86/entry/vdso/vprefetch.S b/arch/x86/entry/vdso/vprefetch.S
new file mode 100644
index 000000000000..a0fcafb7d546
--- /dev/null
+++ b/arch/x86/entry/vdso/vprefetch.S
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+#include <asm/errno.h>
+#include <asm/enclu.h>
+
+#include "extable.h"
+
+.code64
+.section .text, "ax"
+
+SYM_FUNC_START(__vdso_prefetch_page)
+	/* Prolog */
+	.cfi_startproc
+	push	%rbp
+	.cfi_adjust_cfa_offset	8
+	.cfi_rel_offset		%rbp, 0
+	mov	%rsp, %rbp
+	.cfi_def_cfa_register	%rbp
+
+	xor	%rax, %rax
+.Laccess_page:
+	movb	(%rdi), %dil
+.Lout:
+
+	/* Epilog */
+	pop	%rbp
+	.cfi_def_cfa		%rsp, 8
+	ret
+
+.Lhandle_exception:
+	mov	$-1ll, %rax
+	jmp	.Lout
+	.cfi_endproc
+ASM_VDSO_EXTABLE_HANDLE .Laccess_page, .Lhandle_exception,		\
+			(1<<X86_TRAP_PF), ASM_VDSO_ASYNC_FLAGS
+
+SYM_FUNC_END(__vdso_prefetch_page)
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 98aa103eb4ab..ee47660fcd0d 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -9,6 +9,7 @@
 #ifndef __ASSEMBLER__
 
 #include <linux/mm_types.h>
+#include <linux/sched.h>
 
 struct vdso_image {
 	void *data;
@@ -49,9 +50,40 @@ extern void __init init_vdso_image(const struct vdso_image *image);
 
 extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
 
-extern bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
-				 unsigned long error_code,
-				 unsigned long fault_addr);
+extern bool __fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+				   unsigned long error_code,
+				   unsigned long fault_addr);
+
+extern __attribute_const__ bool __is_async_vdso_exception(struct pt_regs *regs,
+							  int trapnr);
+
+static inline bool is_exception_in_vdso(struct pt_regs *regs)
+{
+	const struct vdso_image *image = current->mm->context.vdso_image;
+	unsigned long vdso_base = (unsigned long)current->mm->context.vdso;
+
+	return regs->ip >= vdso_base && regs->ip < vdso_base + image->size &&
+		vdso_base != 0;
+}
+
+static inline bool is_async_vdso_exception(struct pt_regs *regs, int trapnr)
+{
+	if (!is_exception_in_vdso(regs))
+		return false;
+
+	return __is_async_vdso_exception(regs, trapnr);
+}
+
+static inline bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+				   unsigned long error_code,
+				   unsigned long fault_addr)
+{
+	if (is_exception_in_vdso(regs))
+		return __fixup_vdso_exception(regs, trapnr, error_code,
+					      fault_addr);
+	return false;
+}
+
 #endif /* __ASSEMBLER__ */
 
 #endif /* _ASM_X86_VDSO_H */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f1f1b5a0956a..87d8ae46510c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1289,6 +1289,10 @@ void do_user_addr_fault(struct pt_regs *regs,
 	if (user_mode(regs)) {
 		local_irq_enable();
 		flags |= FAULT_FLAG_USER;
+		if (IS_ENABLED(CONFIG_GENERIC_VDSO_PREFETCH) &&
+		    is_async_vdso_exception(regs, X86_TRAP_PF))
+			flags |= FAULT_FLAG_ALLOW_RETRY |
+				 FAULT_FLAG_RETRY_NOWAIT;
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
 			local_irq_enable();
@@ -1407,8 +1411,11 @@ void do_user_addr_fault(struct pt_regs *regs,
 	 */
 	if (unlikely((fault & VM_FAULT_RETRY) &&
 		     (flags & FAULT_FLAG_ALLOW_RETRY))) {
-		flags |= FAULT_FLAG_TRIED;
-		goto retry;
+		if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			flags |= FAULT_FLAG_TRIED;
+			goto retry;
+		}
+		fixup_vdso_exception(regs, X86_TRAP_PF, hw_error_code, address);
 	}
 
 	mmap_read_unlock(mm);
diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
index d883ac299508..a64d2b08b6f4 100644
--- a/lib/vdso/Kconfig
+++ b/lib/vdso/Kconfig
@@ -30,4 +30,9 @@ config GENERIC_VDSO_TIME_NS
 	  Selected by architectures which support time namespaces in the
 	  VDSO
 
+config GENERIC_VDSO_PREFETCH
+	bool
+	help
+	  Selected by architectures which support page prefetch VDSO
+
 endif
-- 
2.25.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ