Add a thread flag to activate system-wide syscall tracing. Make x86 support the TIF_KERNEL_TRACE flag being set asynchronously in entry_32.S/entry_64.S. x86_64: When the flag is inactive upon syscall entry and concurrently activated before exit, we seem to reach a state where the top of stack is incorrect upon return to user space. Fix this by repairing the top of stack and jumping to int_ret_from_sys_call if we detect that the thread flags have been modified. We make sure that the thread flags read is coherent between our new test and the ALLWORK_MASK test by first saving it in a register used for both comparisons. Signed-off-by: Mathieu Desnoyers CC: Andi Kleen CC: Thomas Gleixner CC: Ingo Molnar CC: H. Peter Anvin --- arch/x86/include/asm/thread_info.h | 9 ++++++--- arch/x86/kernel/entry_32.S | 3 ++- arch/x86/kernel/entry_64.S | 12 ++++++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) Index: linux-2.6-lttng/arch/x86/include/asm/thread_info.h =================================================================== --- linux-2.6-lttng.orig/arch/x86/include/asm/thread_info.h 2009-03-15 15:51:26.000000000 -0400 +++ linux-2.6-lttng/arch/x86/include/asm/thread_info.h 2009-03-15 15:57:19.000000000 -0400 @@ -81,6 +81,7 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_KERNEL_TRACE 9 /* kernel trace active */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ @@ -103,6 +104,7 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) @@ -117,17 +119,18 @@ struct thread_info 
{ /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \ + (_TIF_SYSCALL_TRACE | _TIF_KERNEL_TRACE | _TIF_SYSCALL_EMU | \ _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP) + (_TIF_SYSCALL_TRACE | _TIF_KERNEL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SINGLESTEP) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ (0x0000FFFF & \ - ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \ + ~(_TIF_SYSCALL_TRACE|_TIF_KERNEL_TRACE|_TIF_SYSCALL_AUDIT| \ _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) /* work to do on any return to user space */ Index: linux-2.6-lttng/arch/x86/kernel/entry_32.S =================================================================== --- linux-2.6-lttng.orig/arch/x86/kernel/entry_32.S 2009-03-15 15:51:26.000000000 -0400 +++ linux-2.6-lttng/arch/x86/kernel/entry_32.S 2009-03-15 15:57:19.000000000 -0400 @@ -571,7 +571,8 @@ END(syscall_trace_entry) # perform syscall exit tracing ALIGN syscall_exit_work: - testb $_TIF_WORK_SYSCALL_EXIT, %cl + /* Note, _TIF_KERNEL_TRACE is bit number 9, and so it needs testw and not testb */ + testw $_TIF_WORK_SYSCALL_EXIT, %cx jz work_pending TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call Index: linux-2.6-lttng/arch/x86/kernel/entry_64.S =================================================================== --- linux-2.6-lttng.orig/arch/x86/kernel/entry_64.S 2009-03-15 15:51:19.000000000 -0400 +++ linux-2.6-lttng/arch/x86/kernel/entry_64.S 2009-03-15 15:57:19.000000000 -0400 @@ -530,6 +530,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: + testl $_TIF_KERNEL_TRACE,%edx /* Re-read : concurrently changed */ + jnz ret_from_sys_call_trace bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON @@ -541,6 +543,16 @@ sysret_careful: 
CFI_ADJUST_CFA_OFFSET -8 jmp sysret_check +ret_from_sys_call_trace: + TRACE_IRQS_ON + sti + SAVE_REST + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + jmp int_ret_from_sys_call + /* Handle a signal */ sysret_signal: TRACE_IRQS_ON -- Mathieu Desnoyers OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/