Date:   Fri, 3 Nov 2017 15:03:39 -0700
From:   Dave Hansen <dave.hansen@...el.com>
To:     unlisted-recipients:; (no To-header on input)
Cc:     moritz.lipp@...k.tugraz.at, daniel.gruss@...k.tugraz.at,
        michael.schwarz@...k.tugraz.at, richard.fellner@...dent.tugraz.at,
        luto@...nel.org, torvalds@...ux-foundation.org,
        keescook@...gle.com, hughd@...gle.com, mingo@...nel.org,
        x86@...nel.org, LKML <linux-kernel@...r.kernel.org>,
        Borislav Petkov <bp@...en8.de>
Subject: [RFC][PATCH] x86, kaiser: do not require mapping process kernel
 stacks


With the KAISER code that I posted a few days ago, we map each kernel
stack into the user page tables when it is created and unmap it when
it is freed.  That is slow, and the stacks are also the single largest
thing still mapped into the user address space.

This patch is on top of Andy's new trampoline stack code[1] plus
the previous KAISER patches I posted.  I figured I'd post this now
so folks can take a look before I go re-spin the whole thing.

Andy, I think the do_double_fault() handling is arguably buggy
with your trampoline stacks: it points the iret frame to the task
stack instead of the trampoline stack.  It survives that normally,
but it breaks with KAISER of course.
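
FWIW, the frame that setup_fake_gp_at_iret() below hands to iret is
just the hardware #GP layout plus an error code slot.  A standalone
userspace sketch of that layout, purely illustrative and not part of
the patch (the struct name and the values are made up):

#include <stdio.h>

/*
 * The six slots, lowest address first: the error code the #GP entry
 * code expects, then the hardware iret frame (ip, cs, flags, sp, ss).
 * This mirrors the new_stack_top[-6]..new_stack_top[-1] stores in
 * setup_fake_gp_at_iret().
 */
struct fake_gp_frame {
	unsigned long error_code;	/* faked #GP error code: 0 */
	unsigned long ip;
	unsigned long cs;
	unsigned long flags;
	unsigned long sp;
	unsigned long ss;		/* highest address */
};

int main(void)
{
	unsigned long stack[512];
	unsigned long *stack_top = &stack[512];
	/* Build the frame just below the chosen stack top: */
	struct fake_gp_frame *frame = (struct fake_gp_frame *)(stack_top - 6);

	frame->ss         = 0x2b;		/* made-up user-mode values */
	frame->sp         = 0x7ffffffde000UL;
	frame->flags      = 0x246;
	frame->cs         = 0x33;
	frame->ip         = 0x400000;
	frame->error_code = 0;

	printf("fake #GP frame at %p, %zu bytes\n",
	       (void *)frame, sizeof(*frame));
	return 0;
}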

The other oddity in this is the SWITCH_TO_KERNEL_CR3 in the
entry_SYSCALL_64 path.  We do not have a stack to use or any
regs to clobber, so we have to fall back to a per-cpu area to
stash a register.  I'm open to any better ideas.
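
The pattern itself is dead simple: spill one GPR to always-mapped
memory, use it as the CR3 scratch, then reload it.  A rough userspace
analogue, purely illustrative and not part of the patch (a TLS
variable stands in for the per-cpu slot, and a dummy constant stands
in for the actual CR3 work):

/* Userspace analogue of the SWITCH_TO_KERNEL_CR3_SYSCALL dance. */
#include <stdio.h>

static __thread unsigned long scratch_slot;	/* stands in for the per-cpu area */

static unsigned long borrow_r11(void)
{
	unsigned long out;

	asm volatile(
		"movq	%%r11, %1\n\t"		/* stash the caller's r11 (the real macro has no stack to push to) */
		"movq	$0x1234, %%r11\n\t"	/* pretend to use r11 as the CR3 scratch register */
		"movq	%%r11, %0\n\t"
		"movq	%1, %%r11\n\t"		/* put the caller's r11 back */
		: "=&r" (out), "+m" (scratch_slot)
		:
		: "r11");
	return out;
}

int main(void)
{
	printf("scratch value: %#lx\n", borrow_r11());
	return 0;
}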

1. https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/log/?h=x86/entry_consolidation

---

 b/arch/x86/entry/calling.h      |   15 +++++++++++++
 b/arch/x86/entry/entry_64.S     |    8 ++++--
 b/arch/x86/include/asm/kaiser.h |    2 -
 b/arch/x86/kernel/traps.c       |   46 +++++++++++++++++++++++++++++++++-------
 b/arch/x86/mm/kaiser.c          |   12 +---------
 b/include/linux/kaiser.h        |    5 ----
 b/kernel/fork.c                 |    5 ----
 7 files changed, 60 insertions(+), 33 deletions(-)

diff -puN arch/x86/mm/kaiser.c~kaiser-no-process-stacks arch/x86/mm/kaiser.c
--- a/arch/x86/mm/kaiser.c~kaiser-no-process-stacks	2017-11-02 11:07:35.624523990 -0700
+++ b/arch/x86/mm/kaiser.c	2017-11-03 14:22:34.878872556 -0700
@@ -30,6 +30,8 @@
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
 
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, syscall_r11_tmp);
+
 /*
  * At runtime, the only things we map are some things for CPU
  * hotplug, and stacks for new processes.  No two CPUs will ever
@@ -251,16 +253,6 @@ int kaiser_add_user_map(const void *__st
 	return 0;
 }
 
-/*
- * The stack mapping is called in generic code and can't use
- * __PAGE_KERNEL
- */
-int kaiser_map_stack(struct task_struct *tsk)
-{
-	return kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE,
-				  __PAGE_KERNEL);
-}
-
 int kaiser_add_user_map_ptrs(const void *__start_addr,
 			     const void *__end_addr,
 			     unsigned long flags)
diff -puN kernel/fork.c~kaiser-no-process-stacks kernel/fork.c
--- a/kernel/fork.c~kaiser-no-process-stacks	2017-11-02 11:08:33.103722857 -0700
+++ b/kernel/fork.c	2017-11-03 14:19:35.720775925 -0700
@@ -248,8 +248,6 @@ static unsigned long *alloc_thread_stack
 
 static inline void free_thread_stack(struct task_struct *tsk)
 {
-	kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
-
 #ifdef CONFIG_VMAP_STACK
 	if (task_stack_vm_area(tsk)) {
 		int i;
@@ -539,9 +537,6 @@ static struct task_struct *dup_task_stru
 	 * functions again.
 	 */
 	tsk->stack = stack;
-	err = kaiser_map_stack(tsk);
-	if (err)
-		goto free_stack;
 #ifdef CONFIG_VMAP_STACK
 	tsk->stack_vm_area = stack_vm_area;
 #endif
diff -puN arch/x86/entry/entry_64.S~kaiser-no-process-stacks arch/x86/entry/entry_64.S
--- a/arch/x86/entry/entry_64.S~kaiser-no-process-stacks	2017-11-02 11:31:30.904900271 -0700
+++ b/arch/x86/entry/entry_64.S	2017-11-03 11:20:26.825735849 -0700
@@ -145,6 +145,9 @@ ENTRY(entry_SYSCALL_64)
 
 	swapgs
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
+
+	SWITCH_TO_KERNEL_CR3_SYSCALL
+
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/* Construct struct pt_regs on stack */
@@ -169,8 +172,6 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
 
 	/* NB: right here, all regs except r11 are live. */
 
-	SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
-
 	/* Must wait until we have the kernel CR3 to call C functions: */
 	TRACE_IRQS_OFF
 
@@ -1270,6 +1271,7 @@ ENTRY(error_entry)
 	 * gsbase and proceed.  We'll fix up the exception and land in
 	 * .Lgs_change's error handler with kernel gsbase.
 	 */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 	SWAPGS
 	jmp .Lerror_entry_done
 
@@ -1379,6 +1381,7 @@ ENTRY(nmi)
 
 	swapgs
 	cld
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1407,7 +1410,6 @@ ENTRY(nmi)
 	UNWIND_HINT_REGS
 	ENCODE_FRAME_POINTER
 
-	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
 	/*
 	 * At this point we no longer need to worry about stack damage
 	 * due to nesting -- we're on the normal thread stack and we're
diff -puN arch/x86/entry/calling.h~kaiser-no-process-stacks arch/x86/entry/calling.h
--- a/arch/x86/entry/calling.h~kaiser-no-process-stacks	2017-11-02 12:19:37.999595414 -0700
+++ b/arch/x86/entry/calling.h	2017-11-02 12:24:14.784007027 -0700
@@ -3,6 +3,7 @@
 #include <asm/cpufeatures.h>
 #include <asm/page_types.h>
 #include <asm/pgtable_types.h>
+#include <asm/percpu.h>
 
 /*
 
@@ -214,6 +215,18 @@ For 32-bit we have the following convent
 	mov	\scratch_reg, %cr3
 .endm
 
+/*
+ * At syscall entry, we do not have a register to clobber and
+ * do not yet have a stack.  Even if we did, we no longer map
+ * the process stack, so it does us no good.  Just use some
+ * percpu space to stash r11.
+ */
+.macro SWITCH_TO_KERNEL_CR3_SYSCALL
+	movq	%r11, PER_CPU_VAR(syscall_r11_tmp)
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
+	movq	PER_CPU_VAR(syscall_r11_tmp), %r11
+.endm
+
 .macro SWITCH_TO_USER_CR3 scratch_reg:req
 	mov	%cr3, \scratch_reg
 	ADJUST_USER_CR3 \scratch_reg
@@ -254,6 +267,8 @@ For 32-bit we have the following convent
 
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
 .endm
+.macro SWITCH_TO_KERNEL_CR3_SYSCALL
+.endm
 .macro SWITCH_TO_USER_CR3 scratch_reg:req
 .endm
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
diff -puN arch/x86/kernel/traps.c~kaiser-no-process-stacks arch/x86/kernel/traps.c
--- a/arch/x86/kernel/traps.c~kaiser-no-process-stacks	2017-11-02 12:46:27.978440257 -0700
+++ b/arch/x86/kernel/traps.c	2017-11-03 14:34:21.658546483 -0700
@@ -329,6 +329,43 @@ __visible void __noreturn handle_stack_o
 }
 #endif
 
+/*
+ * This "fakes" a #GP from userspace upon returning (iret'ing)
+ * from this double fault.
+ */
+void setup_fake_gp_at_iret(struct pt_regs *regs)
+{
+	unsigned long *new_stack_top = (unsigned long *)
+		(this_cpu_read(cpu_tss.x86_tss.ist[0]) - 0x1500);
+
+	/*
+ 	 * Set up a stack just like the hardware would for a #GP.
+	 *
+	 * This format is an "iret frame", plus the error code
+	 * that the hardware puts on the stack for us for
+	 * exceptions.  (see struct pt_regs).
+	 */
+	new_stack_top[-1] = regs->ss;
+	new_stack_top[-2] = regs->sp;
+	new_stack_top[-3] = regs->flags;
+	new_stack_top[-4] = regs->cs;
+	new_stack_top[-5] = regs->ip;
+	new_stack_top[-6] = 0;	/* faked #GP error code */
+
+	/*
+	 * 'regs' points to the "iret frame" for *this*
+	 * exception, *not* the #GP we are faking.  Here,
+	 * we are telling 'iret' to jump to general_protection
+	 * when returning from this double fault.
+	 */
+	regs->ip = (unsigned long)general_protection;
+	/*
+	 * Make iret move the stack to the "fake #GP" stack
+	 * we created above.
+	 */
+	regs->sp = (unsigned long)&new_stack_top[-6];
+}
+
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
@@ -354,14 +391,7 @@ dotraplinkage void do_double_fault(struc
 		regs->cs == __KERNEL_CS &&
 		regs->ip == (unsigned long)native_irq_return_iret)
 	{
-		struct pt_regs *normal_regs = task_pt_regs(current);
-
-		/* Fake a #GP(0) from userspace. */
-		memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
-		normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
-		regs->ip = (unsigned long)general_protection;
-		regs->sp = (unsigned long)&normal_regs->orig_ax;
-
+		setup_fake_gp_at_iret(regs);
 		return;
 	}
 #endif
diff -puN arch/x86/include/asm/kaiser.h~kaiser-no-process-stacks arch/x86/include/asm/kaiser.h
--- a/arch/x86/include/asm/kaiser.h~kaiser-no-process-stacks	2017-11-03 14:22:37.594995310 -0700
+++ b/arch/x86/include/asm/kaiser.h	2017-11-03 14:22:40.848142333 -0700
@@ -33,8 +33,6 @@
 extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
 			      unsigned long flags);
 
-extern int kaiser_map_stack(struct task_struct *tsk);
-
 /**
  *  kaiser_remove_mapping - remove a kernel mapping from the userpage tables
  *  @addr: the start address of the range
diff -puN include/linux/kaiser.h~kaiser-no-process-stacks include/linux/kaiser.h
--- a/include/linux/kaiser.h~kaiser-no-process-stacks	2017-11-03 14:22:37.654998020 -0700
+++ b/include/linux/kaiser.h	2017-11-03 14:22:48.646494777 -0700
@@ -11,11 +11,6 @@
  * disabled.
  */
 
-static inline int kaiser_map_stack(struct task_struct *tsk)
-{
-	return 0;
-}
-
 static inline void kaiser_init(void)
 {
 }
_
