lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <18bd920a-8b68-37a9-771e-b486b3801739@suse.com>
Date:   Wed, 2 Aug 2017 13:08:42 +0200
From:   Juergen Gross <jgross@...e.com>
To:     Andy Lutomirski <luto@...capital.net>
Cc:     "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "xen-devel@...ts.xenproject.org" <xen-devel@...ts.xenproject.org>,
        X86 ML <x86@...nel.org>,
        Boris Ostrovsky <boris.ostrovsky@...cle.com>,
        "H. Peter Anvin" <hpa@...or.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>
Subject: Re: [PATCH v2] xen: get rid of paravirt op adjust_exception_frame

On 01/08/17 21:45, Andy Lutomirski wrote:
> On Tue, Aug 1, 2017 at 3:39 AM, Juergen Gross <jgross@...e.com> wrote:
>> When running as Xen pv-guest the exception frame on the stack contains
>> %r11 and %rcx additional to the other data pushed by the processor.
>>
>> Instead of having a paravirt op being called for each exception type
>> prepend the Xen specific code to each exception entry. When running as
>> Xen pv-guest just use the exception entry with prepended instructions,
>> otherwise use the entry without the Xen specific code.
>>
>> Signed-off-by: Juergen Gross <jgross@...e.com>
>> ---
>>  arch/x86/entry/entry_64.S             | 22 ++------------
>>  arch/x86/entry/entry_64_compat.S      |  1 -
>>  arch/x86/include/asm/desc.h           | 16 ++++++++++
>>  arch/x86/include/asm/paravirt.h       |  5 ---
>>  arch/x86/include/asm/paravirt_types.h |  4 ---
>>  arch/x86/include/asm/proto.h          |  3 ++
>>  arch/x86/include/asm/traps.h          | 51 +++++++++++++++++++++++++++++--
>>  arch/x86/kernel/asm-offsets_64.c      |  1 -
>>  arch/x86/kernel/paravirt.c            |  3 --
>>  arch/x86/kernel/traps.c               | 57 ++++++++++++++++++-----------------
>>  arch/x86/xen/enlighten_pv.c           | 16 +++++-----
>>  arch/x86/xen/irq.c                    |  3 --
>>  arch/x86/xen/xen-asm_64.S             | 45 ++++++++++++++++++++++++---
>>  arch/x86/xen/xen-ops.h                |  1 -
>>  14 files changed, 147 insertions(+), 81 deletions(-)
>>
>> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>> index a9a8027a6c0e..602bcf68a32c 100644
>> --- a/arch/x86/entry/entry_64.S
>> +++ b/arch/x86/entry/entry_64.S
>> @@ -745,7 +745,6 @@ ENTRY(\sym)
>>         .endif
>>
>>         ASM_CLAC
>> -       PARAVIRT_ADJUST_EXCEPTION_FRAME
>>
>>         .ifeq \has_error_code
>>         pushq   $-1                             /* ORIG_RAX: no syscall to restart */
>> @@ -901,7 +900,7 @@ ENTRY(do_softirq_own_stack)
>>  END(do_softirq_own_stack)
>>
>>  #ifdef CONFIG_XEN
>> -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
>> +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
>>
>>  /*
>>   * A note on the "critical region" in our callback handler.
>> @@ -967,8 +966,6 @@ ENTRY(xen_failsafe_callback)
>>         movq    8(%rsp), %r11
>>         addq    $0x30, %rsp
>>         pushq   $0                              /* RIP */
>> -       pushq   %r11
>> -       pushq   %rcx
>>         jmp     general_protection
>>  1:     /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
>>         movq    (%rsp), %rcx
>> @@ -997,9 +994,8 @@ idtentry int3                       do_int3                 has_error_code=0        paranoid=1 shift_ist=DEBUG_STACK
>>  idtentry stack_segment         do_stack_segment        has_error_code=1
>>
>>  #ifdef CONFIG_XEN
>> -idtentry xen_debug             do_debug                has_error_code=0
>> -idtentry xen_int3              do_int3                 has_error_code=0
>> -idtentry xen_stack_segment     do_stack_segment        has_error_code=1
>> +idtentry xendebug              do_debug                has_error_code=0
>> +idtentry xenint3               do_int3                 has_error_code=0
>>  #endif
>>
>>  idtentry general_protection    do_general_protection   has_error_code=1
>> @@ -1161,18 +1157,6 @@ END(error_exit)
>>  /* Runs on exception stack */
>>  ENTRY(nmi)
>>         /*
>> -        * Fix up the exception frame if we're on Xen.
>> -        * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
>> -        * one value to the stack on native, so it may clobber the rdx
>> -        * scratch slot, but it won't clobber any of the important
>> -        * slots past it.
>> -        *
>> -        * Xen is a different story, because the Xen frame itself overlaps
>> -        * the "NMI executing" variable.
>> -        */
> 
> Based on Andrew Cooper's email, it sounds like this function is just
> straight-up broken on Xen PV.  Maybe change the comment to "XXX:
> broken on Xen PV" or similar.

Fine with me.

> 
>> +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
>> +#define pv_trap_entry(name) (void *)(xen_pv_domain() ? xen_ ## name : name)
>> +#else
>> +#define pv_trap_entry(name) (void *)(name)
>> +#endif
>> +
> 
> Seems reasonable to me.
> 
>>  #ifdef CONFIG_X86_64
>>
>>  static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
>> @@ -482,6 +490,14 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
>>                                 0, 0, __KERNEL_CS);                     \
>>         } while (0)
>>
>> +#define set_intr_gate_pv(n, addr)                                      \
>> +       do {                                                            \
>> +               set_intr_gate_notrace(n, pv_trap_entry(addr));          \
>> +               _trace_set_gate(n, GATE_INTERRUPT,                      \
>> +                               pv_trap_entry(trace_##addr),            \
>> +                               0, 0, __KERNEL_CS);                     \
>> +       } while (0)
> 
> Any reason this can't be set_intr_gate((n), pv_trap_entry(addr))?  Or
> does that get preprocessed wrong?

trace_##addr won't look like anything the compiler will accept with addr
being "(void *)(xen_pv_domain() ? xen_foo : foo)". :-)

>> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
>> index 01fd0a7f48cd..e2ae5f3b9c2c 100644
>> --- a/arch/x86/include/asm/traps.h
>> +++ b/arch/x86/include/asm/traps.h
>> @@ -13,9 +13,8 @@ asmlinkage void divide_error(void);
>>  asmlinkage void debug(void);
>>  asmlinkage void nmi(void);
>>  asmlinkage void int3(void);
>> -asmlinkage void xen_debug(void);
>> -asmlinkage void xen_int3(void);
>> -asmlinkage void xen_stack_segment(void);
>> +asmlinkage void xendebug(void);
>> +asmlinkage void xenint3(void);
> 
> What are xendebug and xenint3 for?  Are they because they don't use IST on Xen?

I believe so.

>>  __visible struct pv_cpu_ops pv_cpu_ops = {
>> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
>> index bf54309b85da..a79a97a46a59 100644
>> --- a/arch/x86/kernel/traps.c
>> +++ b/arch/x86/kernel/traps.c
>> @@ -946,15 +946,15 @@ void __init early_trap_init(void)
>>          * early stage. DEBUG_STACK will be equipped after cpu_init() in
>>          * trap_init().
>>          *
>> -        * We don't need to set trace_idt_table like set_intr_gate(),
>> +        * We don't need to set trace_idt_table like set_intr_gate_pv(),
>>          * since we don't have trace_debug and it will be reset to
>>          * 'debug' in trap_init() by set_intr_gate_ist().
>>          */
>>         set_intr_gate_notrace(X86_TRAP_DB, debug);
> 
> This is confusing IMO.  Maybe just drop the comment change?  But how
> does this work at all on Xen?

TBH: I don't whether it works on Xen. The changes I did here were just
mechanical ones.

>> -       set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
>> -       set_intr_gate(X86_TRAP_TS, invalid_TSS);
>> -       set_intr_gate(X86_TRAP_NP, segment_not_present);
>> -       set_intr_gate(X86_TRAP_SS, stack_segment);
>> -       set_intr_gate(X86_TRAP_GP, general_protection);
>> -       set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
>> -       set_intr_gate(X86_TRAP_MF, coprocessor_error);
>> -       set_intr_gate(X86_TRAP_AC, alignment_check);
>> +       set_intr_gate_pv(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
>> +       set_intr_gate_pv(X86_TRAP_TS, invalid_TSS);
>> +       set_intr_gate_pv(X86_TRAP_NP, segment_not_present);
>> +       set_intr_gate_pv(X86_TRAP_SS, stack_segment);
>> +       set_intr_gate_pv(X86_TRAP_GP, general_protection);
>> +       set_intr_gate_pv(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
>> +       set_intr_gate_pv(X86_TRAP_MF, coprocessor_error);
>> +       set_intr_gate_pv(X86_TRAP_AC, alignment_check);
> 
> Hmm.  I'm okay with this, but I'm wondering whether it might be nice
> to try to have a pv op that changes what IDT writes do and remaps the
> function pointer.  Like...
> 
>> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
>> index 811e4ddb3f37..7e107142bc4f 100644
>> --- a/arch/x86/xen/enlighten_pv.c
>> +++ b/arch/x86/xen/enlighten_pv.c
>> @@ -598,24 +598,22 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
>>          * so we should never see them.  Warn if
>>          * there's an unexpected IST-using fault handler.
>>          */
>> -       if (addr == (unsigned long)debug)
>> -               addr = (unsigned long)xen_debug;
>> -       else if (addr == (unsigned long)int3)
>> -               addr = (unsigned long)xen_int3;
>> -       else if (addr == (unsigned long)stack_segment)
>> -               addr = (unsigned long)xen_stack_segment;
>> -       else if (addr == (unsigned long)double_fault) {
>> +       if (addr == (unsigned long)xen_debug)
>> +               addr = (unsigned long)xen_xendebug;
>> +       else if (addr == (unsigned long)xen_int3)
>> +               addr = (unsigned long)xen_xenint3;
>> +       else if (addr == (unsigned long)xen_double_fault) {
>>                 /* Don't need to handle these */
>>                 return 0;
> 
> ...this?  Can't this list just be extended to handle all of the pv
> entries instead of using the macro magic?

I'll have a try.

> 
>> diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
>> index c3df43141e70..f72ff71cc897 100644
>> --- a/arch/x86/xen/xen-asm_64.S
>> +++ b/arch/x86/xen/xen-asm_64.S
>> @@ -22,11 +22,46 @@
>>
>>  #include "xen-asm.h"
>>
>> -ENTRY(xen_adjust_exception_frame)
>> -       mov 8+0(%rsp), %rcx
>> -       mov 8+8(%rsp), %r11
>> -       ret $16
>> -ENDPROC(xen_adjust_exception_frame)
>> +.macro xen_pv_trap name
>> +ENTRY(xen_\name)
>> +       pop %rcx
>> +       pop %r11
>> +       jmp  \name
>> +END(xen_\name)
>> +.endm
>> +
>> +xen_pv_trap divide_error
>> +xen_pv_trap debug
>> +xen_pv_trap xendebug
>> +xen_pv_trap int3
>> +xen_pv_trap xenint3
>> +xen_pv_trap nmi
>> +xen_pv_trap overflow
>> +xen_pv_trap bounds
>> +xen_pv_trap invalid_op
>> +xen_pv_trap device_not_available
>> +xen_pv_trap double_fault
>> +xen_pv_trap coprocessor_segment_overrun
>> +xen_pv_trap invalid_TSS
>> +xen_pv_trap segment_not_present
>> +xen_pv_trap stack_segment
>> +xen_pv_trap general_protection
>> +xen_pv_trap page_fault
>> +xen_pv_trap async_page_fault
>> +xen_pv_trap spurious_interrupt_bug
>> +xen_pv_trap coprocessor_error
>> +xen_pv_trap alignment_check
>> +#ifdef CONFIG_X86_MCE
>> +xen_pv_trap machine_check
>> +#endif /* CONFIG_X86_MCE */
>> +xen_pv_trap simd_coprocessor_error
>> +#ifdef CONFIG_IA32_EMULATION
>> +xen_pv_trap entry_INT80_compat
>> +#endif
>> +#ifdef CONFIG_TRACING
>> +xen_pv_trap trace_page_fault
>> +#endif
>> +xen_pv_trap hypervisor_callback
> 
> I like this.
> 
> Also, IMO it would be nice to fully finish the job.  Remaining steps are:
> 
> 1. Unsuck the SYSCALL entries on Xen PV.
> 2. Unsuck the SYENTER entry on Xen PV.
> 3. Make a xen_nmi that's actually correct (should be trivial)
> 
> #1 is here:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/entry_syscall&id=14fee348e3e86c994400d68085217d1232a637d6
> 
> Can you test it and, if you like it, either add it to your series or
> ack it?  If you have extra spare cycles, you could try to do #2 and
> #3, too :)  For #2, it might actually make sense to rig up the Xen
> sysenter entry to redirect to entry_SYSCALL_compat or even an entirely
> new entry point -- SYSENTER is really weird.

I'll look into it.


Juergen

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ