lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CALCETrXiM1kVJ_bkBqyX2GbB+raA6XOK-BgO5n+aJMXMxeUYUg@mail.gmail.com>
Date:   Sat, 4 Nov 2017 04:24:47 -0700
From:   Andy Lutomirski <luto@...nel.org>
To:     Dave Hansen <dave.hansen@...el.com>
Cc:     moritz.lipp@...k.tugraz.at,
        Daniel Gruss <daniel.gruss@...k.tugraz.at>,
        michael.schwarz@...k.tugraz.at, richard.fellner@...dent.tugraz.at,
        Andrew Lutomirski <luto@...nel.org>,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Kees Cook <keescook@...gle.com>,
        Hugh Dickins <hughd@...gle.com>,
        Ingo Molnar <mingo@...nel.org>, X86 ML <x86@...nel.org>,
        LKML <linux-kernel@...r.kernel.org>,
        Borislav Petkov <bp@...en8.de>
Subject: Re: [RFC][PATCH] x86, kaiser: do not require mapping process kernel stacks

On Fri, Nov 3, 2017 at 3:03 PM, Dave Hansen <dave.hansen@...el.com> wrote:
>
> With the KAISER code that I posted a few days ago, we map and unmap
> each of the kernel stacks when they are created.  That's slow and
> it is also the single largest thing still mapped into the user
> address space.
>
> This patch is on top of Andy's new trampoline stack code[1] plus
> the previous KAISER patches I posted.  I figured I'd post this now
> so folks can take a look before I go re-spin the whole thing.
>
> Andy, I think the do_double_fault() handling is arguably buggy
> with your trampoline stacks: it points the iret frame to the task
> stack instead of the trampoline stack.  It survives that normally,
> but it breaks with KAISER of course.

That is, indeed, a bug.  Will fix.

>
> The other oddity in this is the SWITCH_TO_KERNEL_CR3 in the
> entry_SYSCALL_64 path.  We do not have a stack to use or any
> regs to clobber, so we have to fall back to a per-cpu area to
> stash a register.  I'm open to any better ideas.

I think we have no choice but to use *two* words of temporary storage,
one for rsp and one for whatever scratch reg we use.  Unless we're
sneaky and use rsp as our scratch reg.  *snicker*

>
> 1. https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/log/?h=x86/entry_consolidation
>
> ---
>
>  b/arch/x86/entry/calling.h      |   15 +++++++++++++
>  b/arch/x86/entry/entry_64.S     |    8 ++++--
>  b/arch/x86/include/asm/kaiser.h |    2 -
>  b/arch/x86/kernel/traps.c       |   46 +++++++++++++++++++++++++++++++++-------
>  b/arch/x86/mm/kaiser.c          |   12 +---------
>  b/include/linux/kaiser.h        |    5 ----
>  b/kernel/fork.c                 |    5 ----
>  7 files changed, 60 insertions(+), 33 deletions(-)
>
> diff -puN arch/x86/mm/kaiser.c~kaiser-no-process-stacks arch/x86/mm/kaiser.c
> --- a/arch/x86/mm/kaiser.c~kaiser-no-process-stacks     2017-11-02 11:07:35.624523990 -0700
> +++ b/arch/x86/mm/kaiser.c      2017-11-03 14:22:34.878872556 -0700
> @@ -30,6 +30,8 @@
>  #include <asm/tlbflush.h>
>  #include <asm/desc.h>
>  +DEFINE_PER_CPU_USER_MAPPED(unsigned long, syscall_r11_tmp);
> +
>  /*
>   * At runtime, the only things we map are some things for CPU
>   * hotplug, and stacks for new processes.  No two CPUs will ever
> @@ -251,16 +253,6 @@ int kaiser_add_user_map(const void *__st
>         return 0;
>  }
>  -/*
> - * The stack mapping is called in generic code and can't use
> - * __PAGE_KERNEL
> - */
> -int kaiser_map_stack(struct task_struct *tsk)
> -{
> -       return kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE,
> -                                 __PAGE_KERNEL);
> -}
> -
>  int kaiser_add_user_map_ptrs(const void *__start_addr,
>                              const void *__end_addr,
>                              unsigned long flags)
> diff -puN kernel/fork.c~kaiser-no-process-stacks kernel/fork.c
> --- a/kernel/fork.c~kaiser-no-process-stacks    2017-11-02 11:08:33.103722857 -0700
> +++ b/kernel/fork.c     2017-11-03 14:19:35.720775925 -0700
> @@ -248,8 +248,6 @@ static unsigned long *alloc_thread_stack
>   static inline void free_thread_stack(struct task_struct *tsk)
>  {
> -       kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
> -
>  #ifdef CONFIG_VMAP_STACK
>         if (task_stack_vm_area(tsk)) {
>                 int i;
> @@ -539,9 +537,6 @@ static struct task_struct *dup_task_stru
>          * functions again.
>          */
>         tsk->stack = stack;
> -       err = kaiser_map_stack(tsk);
> -       if (err)
> -               goto free_stack;
>  #ifdef CONFIG_VMAP_STACK
>         tsk->stack_vm_area = stack_vm_area;
>  #endif
> diff -puN arch/x86/entry/entry_64.S~kaiser-no-process-stacks arch/x86/entry/entry_64.S
> --- a/arch/x86/entry/entry_64.S~kaiser-no-process-stacks        2017-11-02 11:31:30.904900271 -0700
> +++ b/arch/x86/entry/entry_64.S 2017-11-03 11:20:26.825735849 -0700
> @@ -145,6 +145,9 @@ ENTRY(entry_SYSCALL_64)
>         swapgs
>         movq    %rsp, PER_CPU_VAR(rsp_scratch)
> +
> +       SWITCH_TO_KERNEL_CR3_SYSCALL
> +
>         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
>         /* Construct struct pt_regs on stack */
> @@ -169,8 +172,6 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>         /* NB: right here, all regs except r11 are live. */
>  -      SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
> -
>         /* Must wait until we have the kernel CR3 to call C functions: */
>         TRACE_IRQS_OFF
>  @@ -1270,6 +1271,7 @@ ENTRY(error_entry)
>          * gsbase and proceed.  We'll fix up the exception and land in
>          * .Lgs_change's error handler with kernel gsbase.
>          */
> +       SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
>         SWAPGS
>         jmp .Lerror_entry_done
>  @@ -1379,6 +1381,7 @@ ENTRY(nmi)
>         swapgs
>         cld
> +       SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
>         movq    %rsp, %rdx
>         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
>         UNWIND_HINT_IRET_REGS base=%rdx offset=8
> @@ -1407,7 +1410,6 @@ ENTRY(nmi)
>         UNWIND_HINT_REGS
>         ENCODE_FRAME_POINTER
>  -      SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
>         /*
>          * At this point we no longer need to worry about stack damage
>          * due to nesting -- we're on the normal thread stack and we're
> diff -puN arch/x86/entry/calling.h~kaiser-no-process-stacks arch/x86/entry/calling.h
> --- a/arch/x86/entry/calling.h~kaiser-no-process-stacks 2017-11-02 12:19:37.999595414 -0700
> +++ b/arch/x86/entry/calling.h  2017-11-02 12:24:14.784007027 -0700
> @@ -3,6 +3,7 @@
>  #include <asm/cpufeatures.h>
>  #include <asm/page_types.h>
>  #include <asm/pgtable_types.h>
> +#include <asm/percpu.h>
>   /*
>  @@ -214,6 +215,18 @@ For 32-bit we have the following convent
>         mov     \scratch_reg, %cr3
>  .endm
>  +/*
> + * At syscall entry, we do not have a register to clobber and
> + * do not yet have a stack.  Even if we did, we no longer map
> + * the process stack, so it does us no good.  Just use some
> + * percpu space to stash r11.
> + */
> +.macro SWITCH_TO_KERNEL_CR3_SYSCALL
> +       movq    %r11, PER_CPU_VAR(syscall_r11_tmp)
> +       SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
> +       movq    PER_CPU_VAR(syscall_r11_tmp), %r11
> +.endm
> +
>  .macro SWITCH_TO_USER_CR3 scratch_reg:req
>         mov     %cr3, \scratch_reg
>         ADJUST_USER_CR3 \scratch_reg
> @@ -254,6 +267,8 @@ For 32-bit we have the following convent
>   .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
>  .endm
> +.macro SWITCH_TO_KERNEL_CR3_SYSCALL
> +.endm
>  .macro SWITCH_TO_USER_CR3 scratch_reg:req
>  .endm
>  .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
> diff -puN arch/x86/kernel/traps.c~kaiser-no-process-stacks arch/x86/kernel/traps.c
> --- a/arch/x86/kernel/traps.c~kaiser-no-process-stacks  2017-11-02 12:46:27.978440257 -0700
> +++ b/arch/x86/kernel/traps.c   2017-11-03 14:34:21.658546483 -0700
> @@ -329,6 +329,43 @@ __visible void __noreturn handle_stack_o
>  }
>  #endif
>  +/*
> + * This "fakes" a #GP from userspace upon returning (iret'ing)
> + * from this double fault.
> + */
> +void setup_fake_gp_at_iret(struct pt_regs *regs)
> +{
> +       unsigned long *new_stack_top = (unsigned long *)
> +               (this_cpu_read(cpu_tss.x86_tss.ist[0]) - 0x1500);
> +
> +       /*
> +        * Set up a stack just like the hardware would for a #GP.
> +        *
> +        * This format is an "iret frame", plus the error code
> +        * that the hardware puts on the stack for us for
> +        * exceptions.  (see struct pt_regs).
> +        */
> +       new_stack_top[-1] = regs->ss;
> +       new_stack_top[-2] = regs->sp;
> +       new_stack_top[-3] = regs->flags;
> +       new_stack_top[-4] = regs->cs;
> +       new_stack_top[-5] = regs->ip;
> +       new_stack_top[-6] = 0;  /* faked #GP error code */
> +
> +       /*
> +        * 'regs' points to the "iret frame" for *this*
> +        * exception, *not* the #GP we are faking.  Here,
> +        * we are telling 'iret' to jump to general_protection
> +        * when returning from this double fault.
> +        */
> +       regs->ip = (unsigned long)general_protection;
> +       /*
> +        * Make iret move the stack to the "fake #GP" stack
> +        * we created above.
> +        */
> +       regs->sp = (unsigned long)&new_stack_top[-6];
> +}
> +
>  #ifdef CONFIG_X86_64
>  /* Runs on IST stack */
>  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
> @@ -354,14 +391,7 @@ dotraplinkage void do_double_fault(struc
>                 regs->cs == __KERNEL_CS &&
>                 regs->ip == (unsigned long)native_irq_return_iret)
>         {
> -               struct pt_regs *normal_regs = task_pt_regs(current);
> -
> -               /* Fake a #GP(0) from userspace. */
> -               memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
> -               normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
> -               regs->ip = (unsigned long)general_protection;
> -               regs->sp = (unsigned long)&normal_regs->orig_ax;
> -
> +               setup_fake_gp_at_iret(regs);
>                 return;
>         }
>  #endif
> diff -puN arch/x86/include/asm/kaiser.h~kaiser-no-process-stacks arch/x86/include/asm/kaiser.h
> --- a/arch/x86/include/asm/kaiser.h~kaiser-no-process-stacks    2017-11-03 14:22:37.594995310 -0700
> +++ b/arch/x86/include/asm/kaiser.h     2017-11-03 14:22:40.848142333 -0700
> @@ -33,8 +33,6 @@
>  extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
>                               unsigned long flags);
>  -extern int kaiser_map_stack(struct task_struct *tsk);
> -
>  /**
>   *  kaiser_remove_mapping - remove a kernel mapping from the userpage tables
>   *  @addr: the start address of the range
> diff -puN include/linux/kaiser.h~kaiser-no-process-stacks include/linux/kaiser.h
> --- a/include/linux/kaiser.h~kaiser-no-process-stacks   2017-11-03 14:22:37.654998020 -0700
> +++ b/include/linux/kaiser.h    2017-11-03 14:22:48.646494777 -0700
> @@ -11,11 +11,6 @@
>   * disabled.
>   */
>  -static inline int kaiser_map_stack(struct task_struct *tsk)
> -{
> -       return 0;
> -}
> -
>  static inline void kaiser_init(void)
>  {
>  }
> _

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ