Message-ID: <48063E19.9090701@goop.org>
Date: Wed, 16 Apr 2008 10:57:45 -0700
From: Jeremy Fitzhardinge <jeremy@...p.org>
To: Mathieu Desnoyers <compudj@...stal.dyndns.org>
CC: Ingo Molnar <mingo@...e.hu>, Andi Kleen <andi@...stfloor.org>,
akpm@...l.org, "H. Peter Anvin" <hpa@...or.com>,
Steven Rostedt <rostedt@...dmis.org>,
"Frank Ch. Eigler" <fche@...hat.com>, linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH] x86 NMI-safe INT3 and Page Fault (v3)
Mathieu Desnoyers wrote:
> "This way lies madness. Don't go there."
>
It is a large amount of... stuff. Does this immediate values thing
really make a big enough improvement to justify it all?
> Index: linux-2.6-lttng/include/linux/hardirq.h
> ===================================================================
> --- linux-2.6-lttng.orig/include/linux/hardirq.h 2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/include/linux/hardirq.h 2008-04-16 11:29:30.000000000 -0400
> @@ -22,10 +22,13 @@
> * PREEMPT_MASK: 0x000000ff
> * SOFTIRQ_MASK: 0x0000ff00
> * HARDIRQ_MASK: 0x0fff0000
> + * HARDNMI_MASK: 0x40000000
> */
> #define PREEMPT_BITS 8
> #define SOFTIRQ_BITS 8
>
> +#define HARDNMI_BITS 1
> +
> #ifndef HARDIRQ_BITS
> #define HARDIRQ_BITS 12
>
> @@ -45,16 +48,19 @@
> #define PREEMPT_SHIFT 0
> #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
> #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
> +#define HARDNMI_SHIFT (30)
>
Why at 30, rather than HARDIRQ_SHIFT+HARDIRQ_BITS?
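(i.e. presumably something like

	#define HARDNMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)

unless the bit has to stay clear of PREEMPT_ACTIVE, which IIRC sits
right above the hardirq field on x86 -- in which case a comment
explaining the magic 30 would help.)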
>
> #define __IRQ_MASK(x) ((1UL << (x))-1)
>
> #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
> #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
> #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
> +#define HARDNMI_MASK (__IRQ_MASK(HARDNMI_BITS) << HARDNMI_SHIFT)
>
> #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
> #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
> #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
> +#define HARDNMI_OFFSET (1UL << HARDNMI_SHIFT)
>
> #if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS))
> #error PREEMPT_ACTIVE is too low!
> @@ -63,6 +69,7 @@
> #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
> #define softirq_count() (preempt_count() & SOFTIRQ_MASK)
> #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
> +#define hardnmi_count() (preempt_count() & HARDNMI_MASK)
>
> /*
> * Are we doing bottom half or hardware interrupt processing?
> @@ -71,6 +78,7 @@
> #define in_irq() (hardirq_count())
> #define in_softirq() (softirq_count())
> #define in_interrupt() (irq_count())
> +#define in_nmi() (hardnmi_count())
>
> /*
> * Are we running in atomic context? WARNING: this macro cannot
> @@ -159,7 +167,19 @@ extern void irq_enter(void);
> */
> extern void irq_exit(void);
>
> -#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0)
> -#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0)
> +#define nmi_enter() \
> + do { \
> + lockdep_off(); \
> + BUG_ON(hardnmi_count()); \
> + add_preempt_count(HARDNMI_OFFSET); \
> + __irq_enter(); \
> + } while (0)
> +
> +#define nmi_exit() \
> + do { \
> + __irq_exit(); \
> + sub_preempt_count(HARDNMI_OFFSET); \
> + lockdep_on(); \
> + } while (0)
>
> #endif /* LINUX_HARDIRQ_H */
> Index: linux-2.6-lttng/arch/x86/kernel/entry_32.S
> ===================================================================
> --- linux-2.6-lttng.orig/arch/x86/kernel/entry_32.S 2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/arch/x86/kernel/entry_32.S 2008-04-16 12:06:30.000000000 -0400
> @@ -79,7 +79,6 @@ VM_MASK = 0x00020000
> #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
> #else
> #define preempt_stop(clobbers)
> -#define resume_kernel restore_nocheck
> #endif
>
> .macro TRACE_IRQS_IRET
> @@ -265,6 +264,8 @@ END(ret_from_exception)
> #ifdef CONFIG_PREEMPT
> ENTRY(resume_kernel)
> DISABLE_INTERRUPTS(CLBR_ANY)
> + testl $0x40000000,TI_preempt_count(%ebp) # nested over NMI ?
> + jnz return_to_nmi
> cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
> jnz restore_nocheck
> need_resched:
> @@ -276,6 +277,12 @@ need_resched:
> call preempt_schedule_irq
> jmp need_resched
> END(resume_kernel)
> +#else
> +ENTRY(resume_kernel)
> + testl $0x40000000,TI_preempt_count(%ebp) # nested over NMI ?
>
HARDNMI_MASK?
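Something along these lines would avoid the magic constant -- a rough
sketch, assuming hardirq.h can be pulled into asm-offsets_32.c (the
HARDNMI_MASK_asm name is made up here):

	/* arch/x86/kernel/asm-offsets_32.c */
	DEFINE(HARDNMI_MASK_asm, HARDNMI_MASK);

	/* entry_32.S */
	testl $HARDNMI_MASK_asm,TI_preempt_count(%ebp)	# nested over NMI ?

so the 0x40000000 doesn't have to be kept in sync with hardirq.h by
hand.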
> + jnz return_to_nmi
> + jmp restore_nocheck
> +END(resume_kernel)
> #endif
> CFI_ENDPROC
>
> @@ -411,6 +418,22 @@ restore_nocheck_notrace:
> CFI_ADJUST_CFA_OFFSET -4
> irq_return:
> INTERRUPT_RETURN
> +return_to_nmi:
> + testl $X86_EFLAGS_TF, PT_EFLAGS(%esp)
> + jnz restore_nocheck /*
> + * If single-stepping an NMI handler,
> + * use the normal iret path instead of
> + * the popf/lret because lret would be
> + * single-stepped. It should not
> + * happen : it will reactivate NMIs
> + * prematurely.
> + */
> + TRACE_IRQS_IRET
> + RESTORE_REGS
> + addl $4, %esp # skip orig_eax/error_code
> + CFI_ADJUST_CFA_OFFSET -4
> + INTERRUPT_RETURN_NMI_SAFE
> +
> .section .fixup,"ax"
> iret_exc:
> pushl $0 # no error code
> Index: linux-2.6-lttng/arch/x86/kernel/entry_64.S
> ===================================================================
> --- linux-2.6-lttng.orig/arch/x86/kernel/entry_64.S 2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/arch/x86/kernel/entry_64.S 2008-04-16 12:06:31.000000000 -0400
> @@ -581,12 +581,27 @@ retint_restore_args: /* return to kernel
> * The iretq could re-enable interrupts:
> */
> TRACE_IRQS_IRETQ
> + testl $0x40000000,threadinfo_preempt_count(%rcx) /* Nested over NMI ? */
>
HARDNMI_MASK? (ditto below)
> + jnz return_to_nmi
> restore_args:
> RESTORE_ARGS 0,8,0
>
> irq_return:
> INTERRUPT_RETURN
>
> +return_to_nmi: /*
> + * If single-stepping an NMI handler,
> + * use the normal iret path instead of
> + * the popf/lret because lret would be
> + * single-stepped. It should not
> + * happen : it will reactivate NMIs
> + * prematurely.
> + */
> + bt $8,EFLAGS-ARGOFFSET(%rsp) /* trap flag? */
>
test[bwl] is a bit more usual; then you can use X86_EFLAGS_TF.
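Something like (assuming X86_EFLAGS_TF is visible in entry_64.S, or
the right header gets included):

	testl $X86_EFLAGS_TF,EFLAGS-ARGOFFSET(%rsp)	/* single-stepping? */
	jnz restore_args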
> + jc restore_args
> + RESTORE_ARGS 0,8,0
> + INTERRUPT_RETURN_NMI_SAFE
> +
> .section __ex_table, "a"
> .quad irq_return, bad_iret
> .previous
> @@ -802,6 +817,10 @@ END(spurious_interrupt)
> .macro paranoidexit trace=1
> /* ebx: no swapgs flag */
> paranoid_exit\trace:
> + GET_THREAD_INFO(%rcx)
> + testl $0x40000000,threadinfo_preempt_count(%rcx) /* Nested over NMI ? */
> + jnz paranoid_return_to_nmi\trace
> +paranoid_exit_no_nmi\trace:
> testl %ebx,%ebx /* swapgs needed? */
> jnz paranoid_restore\trace
> testl $3,CS(%rsp)
> @@ -814,6 +833,18 @@ paranoid_swapgs\trace:
> paranoid_restore\trace:
> RESTORE_ALL 8
> jmp irq_return
> +paranoid_return_to_nmi\trace: /*
> + * If single-stepping an NMI handler,
> + * use the normal iret path instead of
> + * the popf/lret because lret would be
> + * single-stepped. It should not
> + * happen : it will reactivate NMIs
> + * prematurely.
> + */
> + bt $8,EFLAGS-0(%rsp) /* trap flag? */
> + jc paranoid_exit_no_nmi\trace
> + RESTORE_ALL 8
> + INTERRUPT_RETURN_NMI_SAFE
>
Does this need to be repeated verbatim?
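A small helper macro could carry the comment and the TF test once --
rough sketch, name invented here, same X86_EFLAGS_TF caveat as above:

	.macro NMI_RETURN_CHECK eflags_off, iret_path
	/*
	 * If single-stepping the NMI handler, fall back to the normal
	 * iret path: the popf/lret sequence would itself get
	 * single-stepped and re-enable NMIs too early.
	 */
	testl $X86_EFLAGS_TF,\eflags_off(%rsp)
	jnz \iret_path
	.endm

with each call site passing its own EFLAGS offset and fall-back label,
and keeping its own RESTORE_*/INTERRUPT_RETURN_NMI_SAFE tail.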
> paranoid_userspace\trace:
> GET_THREAD_INFO(%rcx)
> movl threadinfo_flags(%rcx),%ebx
> Index: linux-2.6-lttng/include/asm-x86/irqflags.h
> ===================================================================
> --- linux-2.6-lttng.orig/include/asm-x86/irqflags.h 2008-04-16 11:25:18.000000000 -0400
> +++ linux-2.6-lttng/include/asm-x86/irqflags.h 2008-04-16 11:29:30.000000000 -0400
> @@ -138,12 +138,73 @@ static inline unsigned long __raw_local_
>
> #ifdef CONFIG_X86_64
> #define INTERRUPT_RETURN iretq
> +
> +/*
> + * Only returns from a trap or exception to a NMI context (intra-privilege
> + * level near return) to the same SS and CS segments. Should be used
> + * upon trap or exception return when nested over a NMI context so no iret is
> + * issued. It takes care of modifying the eflags, rsp and returning to the
> + * previous function.
> + *
> + * The stack, at that point, looks like :
> + *
> + * 0(rsp) RIP
> + * 8(rsp) CS
> + * 16(rsp) EFLAGS
> + * 24(rsp) RSP
> + * 32(rsp) SS
> + *
> + * Upon execution :
> + * Copy EIP to the top of the return stack
> + * Update top of return stack address
> + * Pop eflags into the eflags register
> + * Make the return stack current
> + * Near return (popping the return address from the return stack)
> + */
> +#define INTERRUPT_RETURN_NMI_SAFE pushq %rax; \
> + pushq %rbx; \
> + movq 40(%rsp), %rax; \
> + movq 16(%rsp), %rbx; \
>
Use X+16(%rsp) notation here, so that the offsets correspond to the
comment above.
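i.e. something like

	movq 24+16(%rsp), %rax	/* saved RSP (24(%rsp) before the two pushes) */
	movq 0+16(%rsp), %rbx	/* return RIP */

so the reader can match each load against the layout table without
redoing the arithmetic.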
> + subq $8, %rax; \
> + movq %rbx, (%rax); \
> + movq %rax, 40(%rsp); \
> + popq %rbx; \
> + popq %rax; \
> + addq $16, %rsp; \
> + popfq; \
> + movq (%rsp), %rsp; \
> + ret; \
>
How about something like
pushq %rax
mov %rsp, %rax /* old stack */
mov 24+8(%rax), %rsp /* switch stacks */
pushq 0+8(%rax) /* push return rip */
pushq 16+8(%rax) /* push return rflags */
movq (%rax), %rax /* restore %rax */
popfq /* restore flags */
ret /* restore rip */
> +
> #define ENABLE_INTERRUPTS_SYSCALL_RET \
> movq %gs:pda_oldrsp, %rsp; \
> swapgs; \
> sysretq;
> #else
> #define INTERRUPT_RETURN iret
> +
> +/*
> + * Protected mode only, no V8086. Implies that protected mode must
> + * be entered before NMIs or MCEs are enabled. Only returns from a trap or
> + * exception to a NMI context (intra-privilege level far return). Should be used
> + * upon trap or exception return when nested over a NMI context so no iret is
> + * issued.
> + *
> + * The stack, at that point, looks like :
> + *
> + * 0(esp) EIP
> + * 4(esp) CS
> + * 8(esp) EFLAGS
> + *
> + * Upon execution :
> + * Copy the stack eflags to top of stack
> + * Pop eflags into the eflags register
> + * Far return: pop EIP and CS into their register, and additionally pop EFLAGS.
> + */
> +#define INTERRUPT_RETURN_NMI_SAFE pushl 8(%esp); \
> + popfl; \
> + .byte 0xCA; \
> + .word 4;
>
Why not "lret $4"?
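i.e.

	#define INTERRUPT_RETURN_NMI_SAFE	pushl 8(%esp);	\
						popfl;		\
						lret $4

which should assemble to exactly the same CA 04 00 bytes, and is far
easier to read.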
J