[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240919-selective-mitigation-v1-1-1846cf41895e@linux.intel.com>
Date: Thu, 19 Sep 2024 14:52:37 -0700
From: Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>
To: Andy Lutomirski <luto@...nel.org>, Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
Dave Hansen <dave.hansen@...ux.intel.com>,
David Kaplan <David.Kaplan@....com>,
Daniel Sneddon <daniel.sneddon@...ux.intel.com>, x86@...nel.org,
"H. Peter Anvin" <hpa@...or.com>,
Peter Zijlstra <peterz@...radead.org>,
Josh Poimboeuf <jpoimboe@...nel.org>,
Steven Rostedt <rostedt@...dmis.org>
Cc: linux-kernel@...r.kernel.org, cgroups@...r.kernel.org
Subject: [PATCH RFC 1/2] x86/entry_64: Add a separate unmitigated entry/exit
path
CPU mitigations are deployed system-wide, but usually not all of
userspace is malicious. Yet every process suffers the performance impact
of the mitigations. This all-or-nothing approach is due to the lack of a
way for the kernel to know which userspace can be trusted and which cannot.
For scenarios where an admin can decide which processes to trust, an
interface to tell the kernel to possibly skip the mitigation would be
useful.
In preparation for the kernel to be able to selectively apply mitigations
per-process, add a separate kernel entry/exit path that skips the
mitigations.
Originally-by: Josh Poimboeuf <jpoimboe@...nel.org>
Signed-off-by: Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>
---
arch/x86/entry/entry_64.S | 66 +++++++++++++++++++++++++++++++++++--------
arch/x86/include/asm/proto.h | 15 +++++++---
arch/x86/include/asm/ptrace.h | 15 +++++++---
arch/x86/kernel/cpu/common.c | 2 +-
4 files changed, 78 insertions(+), 20 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1b5be07f8669..eeaf4226d09c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -84,7 +84,7 @@
* with them due to bugs in both AMD and Intel CPUs.
*/
-SYM_CODE_START(entry_SYSCALL_64)
+.macro __entry_SYSCALL_64 mitigated=0
UNWIND_HINT_ENTRY
ENDBR
@@ -94,7 +94,12 @@ SYM_CODE_START(entry_SYSCALL_64)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
-SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
+.if \mitigated
+SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack_mitigated, SYM_L_GLOBAL)
+.else
+SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack_unmitigated, SYM_L_GLOBAL)
+.endif
+
ANNOTATE_NOENDBR
/* Construct struct pt_regs on stack */
@@ -103,7 +108,11 @@ SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+
+.if \mitigated
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
+.endif
+
pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
@@ -113,10 +122,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
/* Sign extend the lower 32bit as syscall numbers are treated as int */
movslq %eax, %rsi
+.if \mitigated
/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER
UNTRAIN_RET
CLEAR_BRANCH_HISTORY
+.endif
call do_syscall_64 /* returns with IRQs disabled */
@@ -127,15 +138,26 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* In the Xen PV case we must use iret anyway.
*/
- ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
- "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+.if \mitigated
+ push %rax
+ IBRS_EXIT
+ CLEAR_CPU_BUFFERS
+ pop %rax
+.endif
+
+ ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode_from_syscall", \
+ "jmp swapgs_restore_regs_and_return_to_usermode_from_syscall", X86_FEATURE_XENPV
/*
* We win! This label is here just for ease of understanding
* perf profiles. Nothing jumps here.
*/
-syscall_return_via_sysret:
- IBRS_EXIT
+.if \mitigated
+syscall_return_via_sysret_mitigated:
+.else
+syscall_return_via_sysret_unmitigated:
+.endif
+
POP_REGS pop_rdi=0
/*
@@ -159,15 +181,36 @@ syscall_return_via_sysret:
popq %rdi
popq %rsp
-SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
+
+.if \mitigated
+SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack_mitigated, SYM_L_GLOBAL)
+.else
+SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack_unmitigated, SYM_L_GLOBAL)
+.endif
+
ANNOTATE_NOENDBR
swapgs
- CLEAR_CPU_BUFFERS
+
+.if \mitigated
+SYM_INNER_LABEL(entry_SYSRETQ_end_mitigated, SYM_L_GLOBAL)
+.else
+SYM_INNER_LABEL(entry_SYSRETQ_end_unmitigated, SYM_L_GLOBAL)
+.endif
sysretq
-SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
+
+.endm /* __entry_SYSCALL_64 */
+
+SYM_CODE_START(entry_SYSCALL_64_unmitigated)
+ __entry_SYSCALL_64 mitigated=0
ANNOTATE_NOENDBR
int3
-SYM_CODE_END(entry_SYSCALL_64)
+SYM_CODE_END(entry_SYSCALL_64_unmitigated)
+
+SYM_CODE_START(entry_SYSCALL_64_mitigated)
+ __entry_SYSCALL_64 mitigated=1
+ ANNOTATE_NOENDBR
+ int3
+SYM_CODE_END(entry_SYSCALL_64_mitigated)
/*
* %rdi: prev task
@@ -559,6 +602,8 @@ __irqentry_text_end:
SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
IBRS_EXIT
+ CLEAR_CPU_BUFFERS
+SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode_from_syscall, SYM_L_GLOBAL)
#ifdef CONFIG_XEN_PV
ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif
@@ -573,7 +618,6 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
.Lswapgs_and_iret:
swapgs
- CLEAR_CPU_BUFFERS
/* Assert that the IRET frame indicates user mode. */
testb $3, 8(%rsp)
jnz .Lnative_iret
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 484f4f0131a5..0936e0e70659 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -11,10 +11,17 @@ struct task_struct;
void syscall_init(void);
#ifdef CONFIG_X86_64
-void entry_SYSCALL_64(void);
-void entry_SYSCALL_64_safe_stack(void);
-void entry_SYSRETQ_unsafe_stack(void);
-void entry_SYSRETQ_end(void);
+
+void entry_SYSCALL_64_unmitigated(void);
+void entry_SYSCALL_64_safe_stack_unmitigated(void);
+void entry_SYSRETQ_unsafe_stack_unmitigated(void);
+void entry_SYSRETQ_end_unmitigated(void);
+
+void entry_SYSCALL_64_mitigated(void);
+void entry_SYSCALL_64_safe_stack_mitigated(void);
+void entry_SYSRETQ_unsafe_stack_mitigated(void);
+void entry_SYSRETQ_end_mitigated(void);
+
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
#endif
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5a83fbd9bc0b..74a13c76d241 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -261,11 +261,18 @@ static inline bool any_64bit_mode(struct pt_regs *regs)
static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
{
- bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
- regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack);
+ bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64_unmitigated &&
+ regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack_unmitigated);
+
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack_unmitigated &&
+ regs->ip < (unsigned long)entry_SYSRETQ_end_unmitigated);
+
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_64_mitigated &&
+ regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack_mitigated);
+
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack_mitigated &&
+ regs->ip < (unsigned long)entry_SYSRETQ_end_mitigated);
- ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack &&
- regs->ip < (unsigned long)entry_SYSRETQ_end);
#ifdef CONFIG_IA32_EMULATION
ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d4e539d4e158..e72c37f3a437 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2026,7 +2026,7 @@ static void wrmsrl_cstar(unsigned long val)
static inline void idt_syscall_init(void)
{
- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64_mitigated);
if (ia32_enabled()) {
wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
--
2.34.1
Powered by blists - more mailing lists