lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240919-selective-mitigation-v1-1-1846cf41895e@linux.intel.com>
Date: Thu, 19 Sep 2024 14:52:37 -0700
From: Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>
To: Andy Lutomirski <luto@...nel.org>, Thomas Gleixner <tglx@...utronix.de>,
	Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
	Dave Hansen <dave.hansen@...ux.intel.com>,
	David Kaplan <David.Kaplan@....com>,
	Daniel Sneddon <daniel.sneddon@...ux.intel.com>, x86@...nel.org,
	"H. Peter Anvin" <hpa@...or.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Josh Poimboeuf <jpoimboe@...nel.org>,
	Steven Rostedt <rostedt@...dmis.org>
Cc: linux-kernel@...r.kernel.org, cgroups@...r.kernel.org
Subject: [PATCH RFC 1/2] x86/entry_64: Add a separate unmitigated entry/exit
 path

CPU mitigations are deployed system-wide, but usually not all of
userspace is malicious. Yet all of userspace suffers from the
performance impact of the mitigations. This all-or-nothing approach is
due to the lack of a way for the kernel to know which userspace can be
trusted and which cannot.

For scenarios where an admin can decide which processes to trust, an
interface to tell the kernel to possibly skip the mitigation would be
useful.

In preparation for the kernel being able to selectively apply
mitigations per process, add a separate kernel entry/exit path that
skips the mitigations.

Originally-by: Josh Poimboeuf <jpoimboe@...nel.org>
Signed-off-by: Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>
---
 arch/x86/entry/entry_64.S     | 66 +++++++++++++++++++++++++++++++++++--------
 arch/x86/include/asm/proto.h  | 15 +++++++---
 arch/x86/include/asm/ptrace.h | 15 +++++++---
 arch/x86/kernel/cpu/common.c  |  2 +-
 4 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1b5be07f8669..eeaf4226d09c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -84,7 +84,7 @@
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
-SYM_CODE_START(entry_SYSCALL_64)
+.macro __entry_SYSCALL_64 mitigated=0
 	UNWIND_HINT_ENTRY
 	ENDBR
 
@@ -94,7 +94,12 @@ SYM_CODE_START(entry_SYSCALL_64)
 	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
 	movq	PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
 
-SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
+.if \mitigated
+SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack_mitigated, SYM_L_GLOBAL)
+.else
+SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack_unmitigated, SYM_L_GLOBAL)
+.endif
+
 	ANNOTATE_NOENDBR
 
 	/* Construct struct pt_regs on stack */
@@ -103,7 +108,11 @@ SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
 	pushq	%r11					/* pt_regs->flags */
 	pushq	$__USER_CS				/* pt_regs->cs */
 	pushq	%rcx					/* pt_regs->ip */
+
+.if \mitigated
 SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
+.endif
+
 	pushq	%rax					/* pt_regs->orig_ax */
 
 	PUSH_AND_CLEAR_REGS rax=$-ENOSYS
@@ -113,10 +122,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
 	/* Sign extend the lower 32bit as syscall numbers are treated as int */
 	movslq	%eax, %rsi
 
+.if \mitigated
 	/* clobbers %rax, make sure it is after saving the syscall nr */
 	IBRS_ENTER
 	UNTRAIN_RET
 	CLEAR_BRANCH_HISTORY
+.endif
 
 	call	do_syscall_64		/* returns with IRQs disabled */
 
@@ -127,15 +138,26 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
 	 * In the Xen PV case we must use iret anyway.
 	 */
 
-	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
-		"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+.if \mitigated
+	push %rax
+	IBRS_EXIT
+	CLEAR_CPU_BUFFERS
+	pop %rax
+.endif
+
+	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode_from_syscall", \
+		"jmp swapgs_restore_regs_and_return_to_usermode_from_syscall", X86_FEATURE_XENPV
 
 	/*
 	 * We win! This label is here just for ease of understanding
 	 * perf profiles. Nothing jumps here.
 	 */
-syscall_return_via_sysret:
-	IBRS_EXIT
+.if \mitigated
+syscall_return_via_sysret_mitigated:
+.else
+syscall_return_via_sysret_unmitigated:
+.endif
+
 	POP_REGS pop_rdi=0
 
 	/*
@@ -159,15 +181,36 @@ syscall_return_via_sysret:
 
 	popq	%rdi
 	popq	%rsp
-SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
+
+.if \mitigated
+SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack_mitigated, SYM_L_GLOBAL)
+.else
+SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack_unmitigated, SYM_L_GLOBAL)
+.endif
+
 	ANNOTATE_NOENDBR
 	swapgs
-	CLEAR_CPU_BUFFERS
+
+.if \mitigated
+SYM_INNER_LABEL(entry_SYSRETQ_end_mitigated, SYM_L_GLOBAL)
+.else
+SYM_INNER_LABEL(entry_SYSRETQ_end_unmitigated, SYM_L_GLOBAL)
+.endif
 	sysretq
-SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
+
+.endm /* __entry_SYSCALL_64 */
+
+SYM_CODE_START(entry_SYSCALL_64_unmitigated)
+	__entry_SYSCALL_64 mitigated=0
 	ANNOTATE_NOENDBR
 	int3
-SYM_CODE_END(entry_SYSCALL_64)
+SYM_CODE_END(entry_SYSCALL_64_unmitigated)
+
+SYM_CODE_START(entry_SYSCALL_64_mitigated)
+	__entry_SYSCALL_64 mitigated=1
+	ANNOTATE_NOENDBR
+	int3
+SYM_CODE_END(entry_SYSCALL_64_mitigated)
 
 /*
  * %rdi: prev task
@@ -559,6 +602,8 @@ __irqentry_text_end:
 SYM_CODE_START_LOCAL(common_interrupt_return)
 SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
 	IBRS_EXIT
+	CLEAR_CPU_BUFFERS
+SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode_from_syscall, SYM_L_GLOBAL)
 #ifdef CONFIG_XEN_PV
 	ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
 #endif
@@ -573,7 +618,6 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
 
 .Lswapgs_and_iret:
 	swapgs
-	CLEAR_CPU_BUFFERS
 	/* Assert that the IRET frame indicates user mode. */
 	testb	$3, 8(%rsp)
 	jnz	.Lnative_iret
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 484f4f0131a5..0936e0e70659 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -11,10 +11,17 @@ struct task_struct;
 void syscall_init(void);
 
 #ifdef CONFIG_X86_64
-void entry_SYSCALL_64(void);
-void entry_SYSCALL_64_safe_stack(void);
-void entry_SYSRETQ_unsafe_stack(void);
-void entry_SYSRETQ_end(void);
+
+void entry_SYSCALL_64_unmitigated(void);
+void entry_SYSCALL_64_safe_stack_unmitigated(void);
+void entry_SYSRETQ_unsafe_stack_unmitigated(void);
+void entry_SYSRETQ_end_unmitigated(void);
+
+void entry_SYSCALL_64_mitigated(void);
+void entry_SYSCALL_64_safe_stack_mitigated(void);
+void entry_SYSRETQ_unsafe_stack_mitigated(void);
+void entry_SYSRETQ_end_mitigated(void);
+
 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
 #endif
 
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5a83fbd9bc0b..74a13c76d241 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -261,11 +261,18 @@ static inline bool any_64bit_mode(struct pt_regs *regs)
 
 static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
 {
-	bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
-		    regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack);
+	bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64_unmitigated &&
+		    regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack_unmitigated);
+
+	ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack_unmitigated &&
+		      regs->ip <  (unsigned long)entry_SYSRETQ_end_unmitigated);
+
+	ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_64_mitigated &&
+		      regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack_mitigated);
+
+	ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack_mitigated &&
+		      regs->ip <  (unsigned long)entry_SYSRETQ_end_mitigated);
 
-	ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack &&
-		      regs->ip <  (unsigned long)entry_SYSRETQ_end);
 #ifdef CONFIG_IA32_EMULATION
 	ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
 		      regs->ip <  (unsigned long)entry_SYSCALL_compat_safe_stack);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d4e539d4e158..e72c37f3a437 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2026,7 +2026,7 @@ static void wrmsrl_cstar(unsigned long val)
 
 static inline void idt_syscall_init(void)
 {
-	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64_unmitigated);
 
 	if (ia32_enabled()) {
 		wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);

-- 
2.34.1



Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ