[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <52d3fdde-7611-f766-eb2e-9dd1dde61760@intel.com>
Date: Fri, 3 Nov 2017 15:03:39 -0700
From: Dave Hansen <dave.hansen@...el.com>
To: unlisted-recipients:; (no To-header on input)
Cc: moritz.lipp@...k.tugraz.at, daniel.gruss@...k.tugraz.at,
michael.schwarz@...k.tugraz.at, richard.fellner@...dent.tugraz.at,
luto@...nel.org, torvalds@...ux-foundation.org,
keescook@...gle.com, hughd@...gle.com, mingo@...nel.org,
x86@...nel.org, LKML <linux-kernel@...r.kernel.org>,
Borislav Petkov <bp@...en8.de>
Subject: [RFC][PATCH] x86, kaiser: do not require mapping process kernel
stacks
With the KAISER code that I posted a few days ago, we map and unmap
each of the kernel stacks when they are created. That's slow and
it is also the single largest thing still mapped into the user
address space.
This patch is on top of Andy's new trampoline stack code[1] plus
the previous KAISER patches I posted. I figured I'd post this now
so folks can take a look before I go re-spin the whole thing.
Andy, I think the do_double_fault() handling is arguably buggy
with your trampoline stacks: it points the iret frame to the task
stack instead of the trampoline stack. It survives that normally,
but it breaks with KAISER of course.
The other oddity in this is the SWITCH_TO_KERNEL_CR3 in the
entry_SYSCALL_64 path. We do not have a stack to use or any
regs to clobber, so we have to fall back to a per-cpu area to
stash a register. I'm open to any better ideas.
1. https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/log/?h=x86/entry_consolidation
---
b/arch/x86/entry/calling.h | 15 +++++++++++++
b/arch/x86/entry/entry_64.S | 8 ++++--
b/arch/x86/include/asm/kaiser.h | 2 -
b/arch/x86/kernel/traps.c | 46 +++++++++++++++++++++++++++++++++-------
b/arch/x86/mm/kaiser.c | 12 +---------
b/include/linux/kaiser.h | 5 ----
b/kernel/fork.c | 5 ----
7 files changed, 60 insertions(+), 33 deletions(-)
diff -puN arch/x86/mm/kaiser.c~kaiser-no-process-stacks arch/x86/mm/kaiser.c
--- a/arch/x86/mm/kaiser.c~kaiser-no-process-stacks 2017-11-02 11:07:35.624523990 -0700
+++ b/arch/x86/mm/kaiser.c 2017-11-03 14:22:34.878872556 -0700
@@ -30,6 +30,8 @@
#include <asm/tlbflush.h>
#include <asm/desc.h>
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, syscall_r11_tmp);
+
/*
* At runtime, the only things we map are some things for CPU
* hotplug, and stacks for new processes. No two CPUs will ever
@@ -251,16 +253,6 @@ int kaiser_add_user_map(const void *__st
return 0;
}
-/*
- * The stack mapping is called in generic code and can't use
- * __PAGE_KERNEL
- */
-int kaiser_map_stack(struct task_struct *tsk)
-{
- return kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE,
- __PAGE_KERNEL);
-}
-
int kaiser_add_user_map_ptrs(const void *__start_addr,
const void *__end_addr,
unsigned long flags)
diff -puN kernel/fork.c~kaiser-no-process-stacks kernel/fork.c
--- a/kernel/fork.c~kaiser-no-process-stacks 2017-11-02 11:08:33.103722857 -0700
+++ b/kernel/fork.c 2017-11-03 14:19:35.720775925 -0700
@@ -248,8 +248,6 @@ static unsigned long *alloc_thread_stack
static inline void free_thread_stack(struct task_struct *tsk)
{
- kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
-
#ifdef CONFIG_VMAP_STACK
if (task_stack_vm_area(tsk)) {
int i;
@@ -539,9 +537,6 @@ static struct task_struct *dup_task_stru
* functions again.
*/
tsk->stack = stack;
- err = kaiser_map_stack(tsk);
- if (err)
- goto free_stack;
#ifdef CONFIG_VMAP_STACK
tsk->stack_vm_area = stack_vm_area;
#endif
diff -puN arch/x86/entry/entry_64.S~kaiser-no-process-stacks arch/x86/entry/entry_64.S
--- a/arch/x86/entry/entry_64.S~kaiser-no-process-stacks 2017-11-02 11:31:30.904900271 -0700
+++ b/arch/x86/entry/entry_64.S 2017-11-03 11:20:26.825735849 -0700
@@ -145,6 +145,9 @@ ENTRY(entry_SYSCALL_64)
swapgs
movq %rsp, PER_CPU_VAR(rsp_scratch)
+
+ SWITCH_TO_KERNEL_CR3_SYSCALL
+
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
@@ -169,8 +172,6 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
/* NB: right here, all regs except r11 are live. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
-
/* Must wait until we have the kernel CR3 to call C functions: */
TRACE_IRQS_OFF
@@ -1270,6 +1271,7 @@ ENTRY(error_entry)
* gsbase and proceed. We'll fix up the exception and land in
* .Lgs_change's error handler with kernel gsbase.
*/
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
SWAPGS
jmp .Lerror_entry_done
@@ -1379,6 +1381,7 @@ ENTRY(nmi)
swapgs
cld
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1407,7 +1410,6 @@ ENTRY(nmi)
UNWIND_HINT_REGS
ENCODE_FRAME_POINTER
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
/*
* At this point we no longer need to worry about stack damage
* due to nesting -- we're on the normal thread stack and we're
diff -puN arch/x86/entry/calling.h~kaiser-no-process-stacks arch/x86/entry/calling.h
--- a/arch/x86/entry/calling.h~kaiser-no-process-stacks 2017-11-02 12:19:37.999595414 -0700
+++ b/arch/x86/entry/calling.h 2017-11-02 12:24:14.784007027 -0700
@@ -3,6 +3,7 @@
#include <asm/cpufeatures.h>
#include <asm/page_types.h>
#include <asm/pgtable_types.h>
+#include <asm/percpu.h>
/*
@@ -214,6 +215,18 @@ For 32-bit we have the following convent
mov \scratch_reg, %cr3
.endm
+/*
+ * At syscall entry, we do not have a register to clobber and
+ * do not yet have a stack. Even if we did, we no longer map
+ * the process stack, so it does us no good. Just use some
+ * percpu space to stash r11.
+ */
+.macro SWITCH_TO_KERNEL_CR3_SYSCALL
+ movq %r11, PER_CPU_VAR(syscall_r11_tmp)
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
+ movq PER_CPU_VAR(syscall_r11_tmp), %r11
+.endm
+
.macro SWITCH_TO_USER_CR3 scratch_reg:req
mov %cr3, \scratch_reg
ADJUST_USER_CR3 \scratch_reg
@@ -254,6 +267,8 @@ For 32-bit we have the following convent
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm
+.macro SWITCH_TO_KERNEL_CR3_SYSCALL
+.endm
.macro SWITCH_TO_USER_CR3 scratch_reg:req
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
diff -puN arch/x86/kernel/traps.c~kaiser-no-process-stacks arch/x86/kernel/traps.c
--- a/arch/x86/kernel/traps.c~kaiser-no-process-stacks 2017-11-02 12:46:27.978440257 -0700
+++ b/arch/x86/kernel/traps.c 2017-11-03 14:34:21.658546483 -0700
@@ -329,6 +329,43 @@ __visible void __noreturn handle_stack_o
}
#endif
+/*
+ * This "fakes" a #GP from userspace upon returning (iret'ing)
+ * from this double fault.
+ */
+void setup_fake_gp_at_iret(struct pt_regs *regs)
+{
+ unsigned long *new_stack_top = (unsigned long *)
+ (this_cpu_read(cpu_tss.x86_tss.ist[0]) - 0x1500);
+
+ /*
+ * Set up a stack just like the hardware would for a #GP.
+ *
+ * This format is an "iret frame", plus the error code
+ * that the hardware puts on the stack for us for
+ * exceptions. (see struct pt_regs).
+ */
+ new_stack_top[-1] = regs->ss;
+ new_stack_top[-2] = regs->sp;
+ new_stack_top[-3] = regs->flags;
+ new_stack_top[-4] = regs->cs;
+ new_stack_top[-5] = regs->ip;
+ new_stack_top[-6] = 0; /* faked #GP error code */
+
+ /*
+ * 'regs' points to the "iret frame" for *this*
+ * exception, *not* the #GP we are faking. Here,
+ * we are telling 'iret' to jump to general_protection
+ * when returning from this double fault.
+ */
+ regs->ip = (unsigned long)general_protection;
+ /*
+ * Make iret move the stack to the "fake #GP" stack
+ * we created above.
+ */
+ regs->sp = (unsigned long)&new_stack_top[-6];
+}
+
#ifdef CONFIG_X86_64
/* Runs on IST stack */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
@@ -354,14 +391,7 @@ dotraplinkage void do_double_fault(struc
regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret)
{
- struct pt_regs *normal_regs = task_pt_regs(current);
-
- /* Fake a #GP(0) from userspace. */
- memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
- normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
- regs->ip = (unsigned long)general_protection;
- regs->sp = (unsigned long)&normal_regs->orig_ax;
-
+ setup_fake_gp_at_iret(regs);
return;
}
#endif
diff -puN arch/x86/include/asm/kaiser.h~kaiser-no-process-stacks arch/x86/include/asm/kaiser.h
--- a/arch/x86/include/asm/kaiser.h~kaiser-no-process-stacks 2017-11-03 14:22:37.594995310 -0700
+++ b/arch/x86/include/asm/kaiser.h 2017-11-03 14:22:40.848142333 -0700
@@ -33,8 +33,6 @@
extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
unsigned long flags);
-extern int kaiser_map_stack(struct task_struct *tsk);
-
/**
* kaiser_remove_mapping - remove a kernel mapping from the userpage tables
* @addr: the start address of the range
diff -puN include/linux/kaiser.h~kaiser-no-process-stacks include/linux/kaiser.h
--- a/include/linux/kaiser.h~kaiser-no-process-stacks 2017-11-03 14:22:37.654998020 -0700
+++ b/include/linux/kaiser.h 2017-11-03 14:22:48.646494777 -0700
@@ -11,11 +11,6 @@
* disabled.
*/
-static inline int kaiser_map_stack(struct task_struct *tsk)
-{
- return 0;
-}
-
static inline void kaiser_init(void)
{
}
_
Powered by blists - more mailing lists