Message-Id: <E1HrDuQ-0005m7-00@dorka.pomaz.szeredi.hu>
Date:	Thu, 24 May 2007 16:03:18 +0200
From:	Miklos Szeredi <miklos@...redi.hu>
To:	mingo@...e.hu
CC:	linux-kernel@...r.kernel.org, linux-acpi@...r.kernel.org,
	tglx@...utronix.de
Subject: Re: [BUG] long freezes on thinkpad t60

> > On some strange workload involving strace and fuse I get occasional
> > long periods (10-100s) of total unresponsiveness, not even SysRq-* 
> > working.  Then the machine continues as normal.  Nothing in dmesg, 
> > absolutely no indication about what is happening.
> 
> > Any ideas?  Possibly something ACPI related?
> 
> > how reproducible are these lockups - could you possibly trace it? If yes
> then please apply:
> 
>   http://www.tglx.de/private/tglx/ht-debug/tracer.diff

With this patch, boot stops at a segfaulting fsck.  I enabled all the new
config options; is that not a good idea?  Which one exactly do I need?

Thanks,
Miklos

PS. tracer.diff needed some hacking to make it apply/compile.
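
A quick note on the "notrace" annotation the patch introduces: it expands
to gcc's no_instrument_function attribute, which tells the compiler not to
emit its profiling call (mcount under -pg, or the __cyg_profile hooks
under -finstrument-functions) inside that function.  The tracer needs this
on the low-level helpers it itself uses while recording (the clocksource
read routines it takes timestamps from, delay_loop() and so on); otherwise
the hook would recurse into the tracer or flood the trace.  The patch
wires the compiler hook up via -pg and its own mcount(); the userspace
sketch below (file and helper names are made up, it is not code from the
patch) only illustrates the attribute itself, using -finstrument-functions
so it can be run outside the kernel:

    /* Build and run:  gcc -finstrument-functions notrace-demo.c && ./a.out */
    #include <stdio.h>

    /* same expansion the patch adds to include/linux/linkage.h */
    #define notrace __attribute__((no_instrument_function))

    /* the hooks themselves must be notrace, or they would instrument themselves */
    notrace void __cyg_profile_func_enter(void *fn, void *call_site)
    {
            fprintf(stderr, "enter %p (called from %p)\n", fn, call_site);
    }

    notrace void __cyg_profile_func_exit(void *fn, void *call_site)
    {
            (void)fn; (void)call_site;      /* exit hook kept silent */
    }

    static void traced_helper(void)        { }  /* gcc emits enter/exit hooks here */
    static notrace void quiet_helper(void) { }  /* no hooks emitted here */

    int main(void)                /* main() itself is instrumented too */
    {
            traced_helper();      /* logs an "enter ..." line */
            quiet_helper();       /* logs nothing */
            return 0;
    }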

Index: linux-2.6.22-rc2/include/linux/linkage.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/linkage.h	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/linkage.h	2007-05-24 15:57:43.000000000 +0200
@@ -3,6 +3,8 @@
 
 #include <asm/linkage.h>
 
+#define notrace __attribute ((no_instrument_function))
+
 #ifdef __cplusplus
 #define CPP_ASMLINKAGE extern "C"
 #else
Index: linux-2.6.22-rc2/Documentation/stable_api_nonsense.txt
===================================================================
--- linux-2.6.22-rc2.orig/Documentation/stable_api_nonsense.txt	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/Documentation/stable_api_nonsense.txt	2007-05-24 15:57:42.000000000 +0200
@@ -62,6 +62,9 @@ consider the following facts about the L
       - different structures can contain different fields
       - Some functions may not be implemented at all, (i.e. some locks
 	compile away to nothing for non-SMP builds.)
+      - Parameter passing of variables from function to function can be
+	done in different ways (the CONFIG_REGPARM option controls
+	this.)
       - Memory within the kernel can be aligned in different ways,
 	depending on the build options.
   - Linux runs on a wide range of different processor architectures.
Index: linux-2.6.22-rc2/arch/i386/Kconfig
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/Kconfig	2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/Kconfig	2007-05-24 15:57:42.000000000 +0200
@@ -764,6 +764,14 @@ config BOOT_IOREMAP
 	depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI))
 	default y
 
+#
+# function tracing might turn this off:
+#
+config REGPARM
+	bool
+	depends on !MCOUNT
+	default y
+
 config SECCOMP
 	bool "Enable seccomp to safely compute untrusted bytecode"
 	depends on PROC_FS
Index: linux-2.6.22-rc2/arch/i386/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/Makefile	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/Makefile	2007-05-24 15:57:42.000000000 +0200
@@ -31,7 +31,7 @@ LDFLAGS_vmlinux := --emit-relocs
 endif
 CHECKFLAGS	+= -D__i386__
 
-CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return
+CFLAGS += -pipe -msoft-float
 
 # prevent gcc from keeping the stack 16 byte aligned
 CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
@@ -39,6 +39,8 @@ CFLAGS += $(call cc-option,-mpreferred-s
 # CPU-specific tuning. Anything which can be shared with UML should go here.
 include $(srctree)/arch/i386/Makefile.cpu
 
+cflags-$(CONFIG_REGPARM) += -mregparm=3 -freg-struct-return
+
 # temporary until string.h is fixed
 cflags-y += -ffreestanding
 
Index: linux-2.6.22-rc2/include/asm-i386/module.h
===================================================================
--- linux-2.6.22-rc2.orig/include/asm-i386/module.h	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/asm-i386/module.h	2007-05-24 15:57:43.000000000 +0200
@@ -64,12 +64,18 @@ struct mod_arch_specific
 #error unknown processor family
 #endif
 
+#ifdef CONFIG_REGPARM
+#define MODULE_REGPARM "REGPARM "
+#else
+#define MODULE_REGPARM ""
+#endif
+
 #ifdef CONFIG_4KSTACKS
 #define MODULE_STACKSIZE "4KSTACKS "
 #else
 #define MODULE_STACKSIZE ""
 #endif
 
-#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
+#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE
 
 #endif /* _ASM_I386_MODULE_H */
Index: linux-2.6.22-rc2/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/Makefile	2007-05-22 16:25:01.000000000 +0200
+++ linux-2.6.22-rc2/Makefile	2007-05-24 15:57:42.000000000 +0200
@@ -490,10 +490,14 @@ endif
 
 include $(srctree)/arch/$(ARCH)/Makefile
 
-ifdef CONFIG_FRAME_POINTER
-CFLAGS		+= -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+ifdef CONFIG_MCOUNT
+CFLAGS                += -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
 else
-CFLAGS		+= -fomit-frame-pointer
+  ifdef CONFIG_FRAME_POINTER
+    CFLAGS		+= -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+  else
+    CFLAGS		+= -fomit-frame-pointer
+  endif
 endif
 
 ifdef CONFIG_DEBUG_INFO
Index: linux-2.6.22-rc2/arch/i386/lib/delay.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/lib/delay.c	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/lib/delay.c	2007-05-24 15:57:43.000000000 +0200
@@ -23,7 +23,7 @@
 #endif
 
 /* simple loop based delay: */
-static void delay_loop(unsigned long loops)
+static notrace void delay_loop(unsigned long loops)
 {
 	int d0;
 
@@ -38,7 +38,7 @@ static void delay_loop(unsigned long loo
 }
 
 /* TSC based delay: */
-static void delay_tsc(unsigned long loops)
+static notrace void delay_tsc(unsigned long loops)
 {
 	unsigned long bclock, now;
 
@@ -69,7 +69,7 @@ int read_current_timer(unsigned long *ti
 	return -1;
 }
 
-void __delay(unsigned long loops)
+void notrace __delay(unsigned long loops)
 {
 	delay_fn(loops);
 }
Index: linux-2.6.22-rc2/arch/x86_64/kernel/tsc.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/tsc.c	2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/tsc.c	2007-05-24 15:57:42.000000000 +0200
@@ -176,13 +176,13 @@ __setup("notsc", notsc_setup);
 
 
 /* clock source code: */
-static cycle_t read_tsc(void)
+static notrace cycle_t read_tsc(void)
 {
 	cycle_t ret = (cycle_t)get_cycles_sync();
 	return ret;
 }
 
-static cycle_t __vsyscall_fn vread_tsc(void)
+static notrace cycle_t __vsyscall_fn vread_tsc(void)
 {
 	cycle_t ret = (cycle_t)get_cycles_sync();
 	return ret;
Index: linux-2.6.22-rc2/drivers/clocksource/acpi_pm.c
===================================================================
--- linux-2.6.22-rc2.orig/drivers/clocksource/acpi_pm.c	2007-05-22 16:25:11.000000000 +0200
+++ linux-2.6.22-rc2/drivers/clocksource/acpi_pm.c	2007-05-24 15:57:42.000000000 +0200
@@ -30,13 +30,13 @@
  */
 u32 pmtmr_ioport __read_mostly;
 
-static inline u32 read_pmtmr(void)
+static notrace inline u32 read_pmtmr(void)
 {
 	/* mask the output to 24 bits */
 	return inl(pmtmr_ioport) & ACPI_PM_MASK;
 }
 
-u32 acpi_pm_read_verified(void)
+u32 notrace acpi_pm_read_verified(void)
 {
 	u32 v1 = 0, v2 = 0, v3 = 0;
 
@@ -56,12 +56,12 @@ u32 acpi_pm_read_verified(void)
 	return v2;
 }
 
-static cycle_t acpi_pm_read_slow(void)
+static notrace cycle_t acpi_pm_read_slow(void)
 {
 	return (cycle_t)acpi_pm_read_verified();
 }
 
-static cycle_t acpi_pm_read(void)
+static notrace cycle_t acpi_pm_read(void)
 {
 	return (cycle_t)read_pmtmr();
 }
Index: linux-2.6.22-rc2/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.22-rc2.orig/fs/proc/proc_misc.c	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/fs/proc/proc_misc.c	2007-05-24 15:57:42.000000000 +0200
@@ -623,6 +623,20 @@ static int execdomains_read_proc(char *p
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
+#ifdef CONFIG_EVENT_TRACE
+extern struct seq_operations latency_trace_op;
+static int latency_trace_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &latency_trace_op);
+}
+static struct file_operations proc_latency_trace_operations = {
+	.open		= latency_trace_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+#endif
+
 #ifdef CONFIG_MAGIC_SYSRQ
 /*
  * writing 'C' to /proc/sysrq-trigger is like sysrq-C
@@ -716,6 +730,9 @@ void __init proc_misc_init(void)
 #ifdef CONFIG_SCHEDSTATS
 	create_seq_entry("schedstat", 0, &proc_schedstat_operations);
 #endif
+#ifdef CONFIG_EVENT_TRACE
+	create_seq_entry("latency_trace", 0, &proc_latency_trace_operations);
+#endif
 #ifdef CONFIG_PROC_KCORE
 	proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
 	if (proc_root_kcore) {
Index: linux-2.6.22-rc2/include/linux/clocksource.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/clocksource.h	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/clocksource.h	2007-05-24 15:57:43.000000000 +0200
@@ -21,6 +21,9 @@
 typedef u64 cycle_t;
 struct clocksource;
 
+extern unsigned long preempt_max_latency;
+extern unsigned long preempt_thresh;
+
 /**
  * struct clocksource - hardware abstraction for a free running counter
  *	Provides mostly state-free accessors to the underlying hardware.
@@ -172,8 +175,20 @@ static inline cycle_t clocksource_read(s
  */
 static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles)
 {
-	u64 ret = (u64)cycles;
-	ret = (ret * cs->mult) >> cs->shift;
+	return ((u64)cycles * cs->mult) >> cs->shift;
+}
+
+/**
+ * ns2cyc - converts nanoseconds to clocksource cycles
+ * @cs:		Pointer to clocksource
+ * @nsecs:	Nanoseconds
+ */
+static inline cycles_t ns2cyc(struct clocksource *cs, u64 nsecs)
+{
+	cycles_t ret = nsecs << cs->shift;
+
+	do_div(ret, cs->mult + 1);
+
 	return ret;
 }
 
@@ -221,4 +236,8 @@ static inline void update_vsyscall(struc
 }
 #endif
 
+extern cycle_t get_monotonic_cycles(void);
+extern unsigned long cycles_to_usecs(cycle_t);
+extern cycle_t usecs_to_cycles(unsigned long);
+
 #endif /* _LINUX_CLOCKSOURCE_H */
Index: linux-2.6.22-rc2/include/linux/kernel.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/kernel.h	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/kernel.h	2007-05-24 15:57:43.000000000 +0200
@@ -156,6 +156,8 @@ asmlinkage int vprintk(const char *fmt, 
 	__attribute__ ((format (printf, 1, 0)));
 asmlinkage int printk(const char * fmt, ...)
 	__attribute__ ((format (printf, 1, 2)));
+extern void early_printk(const char *fmt, ...)
+	__attribute__ ((format (printf, 1, 2)));
 #else
 static inline int vprintk(const char *s, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
Index: linux-2.6.22-rc2/include/linux/latency_hist.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.22-rc2/include/linux/latency_hist.h	2007-05-24 15:57:43.000000000 +0200
@@ -0,0 +1,32 @@
+/*
+ * kernel/latency_hist.h
+ *
+ * Add support for histograms of preemption-off latency and
+ * interrupt-off latency and wakeup latency, it depends on
+ * Real-Time Preemption Support.
+ *
+ *  Copyright (C) 2005 MontaVista Software, Inc.
+ *  Yi Yang <yyang@...mvista.com>
+ *
+ */
+#ifndef _LINUX_LATENCY_HIST_H_
+#define _LINUX_LATENCY_HIST_H_
+
+enum {
+        INTERRUPT_LATENCY = 0,
+        PREEMPT_LATENCY,
+        WAKEUP_LATENCY
+};
+
+#define MAX_ENTRY_NUM 10240
+#define LATENCY_TYPE_NUM 3
+
+#ifdef CONFIG_LATENCY_HIST
+extern void latency_hist(int latency_type, int cpu, unsigned long latency);
+# define latency_hist_flag 1
+#else
+# define latency_hist(a,b,c) do { (void)(cpu); } while (0)
+# define latency_hist_flag 0
+#endif /* CONFIG_LATENCY_HIST */
+
+#endif /* ifndef _LINUX_LATENCY_HIST_H_ */
Index: linux-2.6.22-rc2/include/linux/preempt.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/preempt.h	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/preempt.h	2007-05-24 15:57:43.000000000 +0200
@@ -9,12 +9,26 @@
 #include <linux/thread_info.h>
 #include <linux/linkage.h>
 
-#ifdef CONFIG_DEBUG_PREEMPT
-  extern void fastcall add_preempt_count(int val);
-  extern void fastcall sub_preempt_count(int val);
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING)
+  extern void notrace add_preempt_count(unsigned int val);
+  extern void notrace sub_preempt_count(unsigned int val);
+  extern void notrace mask_preempt_count(unsigned int mask);
+  extern void notrace unmask_preempt_count(unsigned int mask);
 #else
 # define add_preempt_count(val)	do { preempt_count() += (val); } while (0)
 # define sub_preempt_count(val)	do { preempt_count() -= (val); } while (0)
+# define mask_preempt_count(mask) \
+		do { preempt_count() |= (mask); } while (0)
+# define unmask_preempt_count(mask) \
+		do { preempt_count() &= ~(mask); } while (0)
+#endif
+
+#ifdef CONFIG_CRITICAL_TIMING
+  extern void touch_critical_timing(void);
+  extern void stop_critical_timing(void);
+#else
+# define touch_critical_timing()	do { } while (0)
+# define stop_critical_timing()	do { } while (0)
 #endif
 
 #define inc_preempt_count() add_preempt_count(1)
Index: linux-2.6.22-rc2/include/linux/sched.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/sched.h	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/sched.h	2007-05-24 15:57:43.000000000 +0200
@@ -215,6 +215,7 @@ static inline void show_state(void)
 }
 
 extern void show_regs(struct pt_regs *);
+extern void irq_show_regs_callback(int cpu, struct pt_regs *regs);
 
 /*
  * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
@@ -251,6 +252,105 @@ static inline void touch_all_softlockup_
 }
 #endif
 
+#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_EVENT_TRACE)
+  extern void print_traces(struct task_struct *task);
+#else
+# define print_traces(task)			do { } while (0)
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+# ifndef CONFIG_ARM
+#  define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#  define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+#  define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+#  define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
+#  define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4))
+#  define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5))
+# else
+   extern unsigned long arm_return_addr(int level);
+#  define CALLER_ADDR0 arm_return_addr(0)
+#  define CALLER_ADDR1 arm_return_addr(1)
+#  define CALLER_ADDR2 arm_return_addr(2)
+#  define CALLER_ADDR3 arm_return_addr(3)
+#  define CALLER_ADDR4 arm_return_addr(4)
+#  define CALLER_ADDR5 arm_return_addr(5)
+#endif
+#else
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 0UL
+# define CALLER_ADDR2 0UL
+# define CALLER_ADDR3 0UL
+# define CALLER_ADDR4 0UL
+# define CALLER_ADDR5 0UL
+#endif
+
+#ifdef CONFIG_MCOUNT
+  extern void notrace mcount(void);
+#else
+# define mcount() do { } while (0)
+#endif
+
+#ifdef CONFIG_EVENT_TRACE
+  extern int mcount_enabled, trace_enabled, trace_user_triggered,
+		trace_user_trigger_irq, trace_freerunning, trace_verbose,
+		trace_print_on_crash, trace_all_cpus, print_functions,
+		syscall_tracing, stackframe_tracing, trace_use_raw_cycles,
+		trace_all_runnable;
+  extern void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3);
+  extern void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2);
+  extern void notrace trace_special_u64(unsigned long long v1, unsigned long v2);
+  extern void notrace trace_special_sym(void);
+  extern void stop_trace(void);
+# define start_trace() do { trace_enabled = 1; } while (0)
+  extern void print_last_trace(void);
+  extern void nmi_trace(unsigned long eip, unsigned long parent_eip,
+			unsigned long flags);
+  extern long user_trace_start(void);
+  extern long user_trace_stop(void);
+  extern void trace_cmdline(void);
+  extern void init_tracer(void);
+#else
+# define mcount_enabled				0
+# define trace_enabled				0
+# define syscall_tracing			0
+# define stackframe_tracing			0
+# define trace_user_triggered			0
+# define trace_freerunning			0
+# define trace_all_cpus				0
+# define trace_verbose				0
+# define trace_special(v1,v2,v3)		do { } while (0)
+# define trace_special_pid(pid,v1,v2)		do { } while (0)
+# define trace_special_u64(v1,v2)		do { } while (0)
+# define trace_special_sym()			do { } while (0)
+# define stop_trace()				do { } while (0)
+# define start_trace()				do { } while (0)
+# define print_last_trace()			do { } while (0)
+# define nmi_trace(eip, parent_eip, flags)	do { } while (0)
+# define user_trace_start()			do { } while (0)
+# define user_trace_stop()			do { } while (0)
+# define trace_cmdline()			do { } while (0)
+# define init_tracer()				do { } while (0)
+#endif
+
+#ifdef CONFIG_WAKEUP_TIMING
+  extern int wakeup_timing;
+  extern void __trace_start_sched_wakeup(struct task_struct *p);
+  extern void trace_stop_sched_switched(struct task_struct *p);
+  extern void trace_change_sched_cpu(struct task_struct *p, int new_cpu);
+#else
+# define wakeup_timing 0
+# define __trace_start_sched_wakeup(p)		do { } while (0)
+# define trace_stop_sched_switched(p)		do { } while (0)
+# define trace_change_sched_cpu(p, cpu)		do { } while (0)
+#endif
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+  extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1);
+  extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1);
+#else
+# define time_hardirqs_on(a0, a1)		do { } while (0)
+# define time_hardirqs_off(a0, a1)		do { } while (0)
+#endif
 
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__attribute__((__section__(".sched.text")))
@@ -1014,6 +1114,13 @@ struct task_struct {
 	unsigned int lockdep_recursion;
 #endif
 
+#define MAX_PREEMPT_TRACE 16
+
+#ifdef CONFIG_PREEMPT_TRACE
+	unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE];
+	unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE];
+#endif
+
 /* journalling filesystem info */
 	void *journal_info;
 
@@ -1636,6 +1743,7 @@ static inline unsigned int task_cpu(cons
 
 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
+	trace_change_sched_cpu(p, cpu);
 	task_thread_info(p)->cpu = cpu;
 }
 
Index: linux-2.6.22-rc2/init/main.c
===================================================================
--- linux-2.6.22-rc2.orig/init/main.c	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/init/main.c	2007-05-24 15:57:42.000000000 +0200
@@ -576,6 +576,8 @@ asmlinkage void __init start_kernel(void
 	if (panic_later)
 		panic(panic_later, panic_param);
 
+	init_tracer();
+
 	lockdep_info();
 
 	/*
Index: linux-2.6.22-rc2/kernel/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/kernel/Makefile	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/Makefile	2007-05-24 15:57:42.000000000 +0200
@@ -38,6 +38,11 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_DEBUG_PREEMPT) += latency_trace.o
+obj-$(CONFIG_WAKEUP_TIMING) += latency_trace.o
+obj-$(CONFIG_EVENT_TRACE) += latency_trace.o
+obj-$(CONFIG_CRITICAL_TIMING) += latency_trace.o
+obj-$(CONFIG_LATENCY_HIST) += latency_hist.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
Index: linux-2.6.22-rc2/kernel/fork.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/fork.c	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/fork.c	2007-05-24 15:57:42.000000000 +0200
@@ -990,7 +990,7 @@ static struct task_struct *copy_process(
 
 	rt_mutex_init_task(p);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP)
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
Index: linux-2.6.22-rc2/kernel/latency_hist.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.22-rc2/kernel/latency_hist.c	2007-05-24 15:57:42.000000000 +0200
@@ -0,0 +1,267 @@
+/*
+ * kernel/latency_hist.c
+ *
+ * Add support for histograms of preemption-off latency and
+ * interrupt-off latency and wakeup latency, it depends on
+ * Real-Time Preemption Support.
+ *
+ *  Copyright (C) 2005 MontaVista Software, Inc.
+ *  Yi Yang <yyang@...mvista.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/latency_hist.h>
+#include <linux/calc64.h>
+#include <asm/atomic.h>
+
+typedef struct hist_data_struct {
+	atomic_t hist_mode; /* 0 log, 1 don't log */
+	unsigned long min_lat;
+	unsigned long avg_lat;
+	unsigned long max_lat;
+	unsigned long long beyond_hist_bound_samples;
+	unsigned long long accumulate_lat;
+	unsigned long long total_samples;
+	unsigned long long hist_array[MAX_ENTRY_NUM];
+} hist_data_t;
+
+static struct proc_dir_entry * latency_hist_root = NULL;
+static char * latency_hist_proc_dir_root = "latency_hist";
+
+static char * percpu_proc_name = "CPU";
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+static DEFINE_PER_CPU(hist_data_t, interrupt_off_hist);
+static char * interrupt_off_hist_proc_dir = "interrupt_off_latency";
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+static DEFINE_PER_CPU(hist_data_t, preempt_off_hist);
+static char * preempt_off_hist_proc_dir = "preempt_off_latency";
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+static DEFINE_PER_CPU(hist_data_t, wakeup_latency_hist);
+static char * wakeup_latency_hist_proc_dir = "wakeup_latency";
+#endif
+
+static struct proc_dir_entry *entry[LATENCY_TYPE_NUM][NR_CPUS];
+
+static inline u64 u64_div(u64 x, u64 y)
+{
+        do_div(x, y);
+        return x;
+}
+
+void latency_hist(int latency_type, int cpu, unsigned long latency)
+{
+	hist_data_t * my_hist;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS) || (latency_type < INTERRUPT_LATENCY)
+			|| (latency_type > WAKEUP_LATENCY) || (latency < 0))
+		return;
+
+	switch(latency_type) {
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+	case INTERRUPT_LATENCY:
+		my_hist = (hist_data_t *)&per_cpu(interrupt_off_hist, cpu);
+		break;
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+	case PREEMPT_LATENCY:
+		my_hist = (hist_data_t *)&per_cpu(preempt_off_hist, cpu);
+		break;
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+	case WAKEUP_LATENCY:
+		my_hist = (hist_data_t *)&per_cpu(wakeup_latency_hist, cpu);
+		break;
+#endif
+	default:
+		return;
+	}
+
+	if (atomic_read(&my_hist->hist_mode) == 0)
+		return;
+
+	if (latency >= MAX_ENTRY_NUM)
+		my_hist->beyond_hist_bound_samples++;
+	else
+		my_hist->hist_array[latency]++;
+
+	if (latency < my_hist->min_lat)
+		my_hist->min_lat = latency;
+	else if (latency > my_hist->max_lat)
+		my_hist->max_lat = latency;
+
+	my_hist->total_samples++;
+	my_hist->accumulate_lat += latency;
+	my_hist->avg_lat = (unsigned long) u64_div(my_hist->accumulate_lat,
+						  my_hist->total_samples);
+	return;
+}
+
+static void *l_start(struct seq_file *m, loff_t * pos)
+{
+	loff_t *index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
+	loff_t index = *pos;
+	hist_data_t *my_hist = (hist_data_t *) m->private;
+
+	if (!index_ptr)
+		return NULL;
+
+	if (index == 0) {
+		atomic_dec(&my_hist->hist_mode);
+		seq_printf(m, "#Minimum latency: %lu microseconds.\n"
+			   "#Average latency: %lu microseconds.\n"
+			   "#Maximum latency: %lu microseconds.\n"
+			   "#Total samples: %llu\n"
+			   "#There are %llu samples greater or equal than %d microseconds\n"
+			   "#usecs\t%16s\n"
+			   , my_hist->min_lat
+			   , my_hist->avg_lat
+			   , my_hist->max_lat
+			   , my_hist->total_samples
+			   , my_hist->beyond_hist_bound_samples
+			   , MAX_ENTRY_NUM, "samples");
+	}
+	if (index >= MAX_ENTRY_NUM)
+		return NULL;
+
+	*index_ptr = index;
+	return index_ptr;
+}
+
+static void *l_next(struct seq_file *m, void *p, loff_t * pos)
+{
+	loff_t *index_ptr = p;
+	hist_data_t *my_hist = (hist_data_t *) m->private;
+
+	if (++*pos >= MAX_ENTRY_NUM) {
+		atomic_inc(&my_hist->hist_mode);
+		return NULL;
+	}
+	*index_ptr = *pos;
+	return index_ptr;
+}
+
+static void l_stop(struct seq_file *m, void *p)
+{
+	kfree(p);
+}
+
+static int l_show(struct seq_file *m, void *p)
+{
+	int index = *(loff_t *) p;
+	hist_data_t *my_hist = (hist_data_t *) m->private;
+
+	seq_printf(m, "%5d\t%16llu\n", index, my_hist->hist_array[index]);
+	return 0;
+}
+
+static struct seq_operations latency_hist_seq_op = {
+	.start = l_start,
+	.next  = l_next,
+	.stop  = l_stop,
+	.show  = l_show
+};
+
+static int latency_hist_seq_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *entry_ptr = NULL;
+	int ret, i, j, break_flags = 0;
+	struct seq_file *seq;
+
+	entry_ptr = PDE(file->f_dentry->d_inode);
+	for (i = 0; i < LATENCY_TYPE_NUM; i++) {
+		for (j = 0; j < NR_CPUS; j++) {
+			if (entry[i][j] == NULL)
+				continue;
+			if (entry_ptr->low_ino == entry[i][j]->low_ino) {
+				break_flags = 1;
+				break;
+			}
+		}
+		if (break_flags == 1)
+			break;
+	}
+	ret = seq_open(file, &latency_hist_seq_op);
+	if (break_flags == 1) {
+		seq = (struct seq_file *)file->private_data;
+		seq->private = entry[i][j]->data;
+	}
+	return ret;
+}
+
+static struct file_operations latency_hist_seq_fops = {
+	.open = latency_hist_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static __init int latency_hist_init(void)
+{
+	struct proc_dir_entry *tmp_parent_proc_dir;
+	int i = 0, len = 0;
+	hist_data_t *my_hist;
+	char procname[64];
+
+	latency_hist_root = proc_mkdir(latency_hist_proc_dir_root, NULL);
+
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+	tmp_parent_proc_dir = proc_mkdir(interrupt_off_hist_proc_dir, latency_hist_root);
+	for (i = 0; i < NR_CPUS; i++) {
+		len = sprintf(procname, "%s%d", percpu_proc_name, i);
+		procname[len] = '\0';
+		entry[INTERRUPT_LATENCY][i] =
+			create_proc_entry(procname, 0, tmp_parent_proc_dir);
+		entry[INTERRUPT_LATENCY][i]->data = (void *)&per_cpu(interrupt_off_hist, i);
+		entry[INTERRUPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops;
+		my_hist = (hist_data_t *) entry[INTERRUPT_LATENCY][i]->data;
+		atomic_set(&my_hist->hist_mode,1);
+		my_hist->min_lat = 0xFFFFFFFFUL;
+	}
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+	tmp_parent_proc_dir = proc_mkdir(preempt_off_hist_proc_dir, latency_hist_root);
+	for (i = 0; i < NR_CPUS; i++) {
+		len = sprintf(procname, "%s%d", percpu_proc_name, i);
+		procname[len] = '\0';
+		entry[PREEMPT_LATENCY][i] =
+			create_proc_entry(procname, 0, tmp_parent_proc_dir);
+		entry[PREEMPT_LATENCY][i]->data = (void *)&per_cpu(preempt_off_hist, i);
+		entry[PREEMPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops;
+		my_hist = (hist_data_t *) entry[PREEMPT_LATENCY][i]->data;
+		atomic_set(&my_hist->hist_mode,1);
+		my_hist->min_lat = 0xFFFFFFFFUL;
+	}
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+	tmp_parent_proc_dir = proc_mkdir(wakeup_latency_hist_proc_dir, latency_hist_root);
+	for (i = 0; i < NR_CPUS; i++) {
+		len = sprintf(procname, "%s%d", percpu_proc_name, i);
+		procname[len] = '\0';
+		entry[WAKEUP_LATENCY][i] =
+			create_proc_entry(procname, 0, tmp_parent_proc_dir);
+		entry[WAKEUP_LATENCY][i]->data = (void *)&per_cpu(wakeup_latency_hist, i);
+		entry[WAKEUP_LATENCY][i]->proc_fops = &latency_hist_seq_fops;
+		my_hist = (hist_data_t *) entry[WAKEUP_LATENCY][i]->data;
+		atomic_set(&my_hist->hist_mode,1);
+		my_hist->min_lat = 0xFFFFFFFFUL;
+	}
+#endif
+	return 0;
+
+}
+
+__initcall(latency_hist_init);
+
Index: linux-2.6.22-rc2/kernel/latency_trace.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.22-rc2/kernel/latency_trace.c	2007-05-24 15:57:42.000000000 +0200
@@ -0,0 +1,2763 @@
+/*
+ *  kernel/latency_trace.c
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/mm.h>
+#include <linux/nmi.h>
+#include <linux/rtc.h>
+#include <linux/sched.h>
+#include <linux/percpu.h>
+
+#include <linux/module.h>
+#include <linux/profile.h>
+#include <linux/bootmem.h>
+#include <linux/version.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <linux/proc_fs.h>
+#include <linux/latency_hist.h>
+#include <linux/utsrelease.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/rtc.h>
+#include <asm/asm-offsets.h>
+#include <linux/stacktrace.h>
+
+#ifndef DEFINE_RAW_SPINLOCK
+# define DEFINE_RAW_SPINLOCK		DEFINE_SPINLOCK
+#endif
+
+#ifndef RAW_SPIN_LOCK_UNLOCKED
+# define RAW_SPIN_LOCK_UNLOCKED		SPIN_LOCK_UNLOCKED
+#endif
+
+int trace_use_raw_cycles = 0;
+
+#define __raw_spinlock_t raw_spinlock_t
+#define need_resched_delayed() 0
+
+#ifdef CONFIG_EVENT_TRACE
+/*
+ * Convert raw cycles to usecs.
+ * Note: this is not the 'clocksource cycles' value, it's the raw
+ * cycle counter cycles. We use GTOD to timestamp latency start/end
+ * points, but the trace entries inbetween are timestamped with
+ * get_cycles().
+ */
+static unsigned long notrace cycles_to_us(cycle_t delta)
+{
+	if (!trace_use_raw_cycles)
+		return cycles_to_usecs(delta);
+#ifdef CONFIG_X86
+	do_div(delta, cpu_khz/1000+1);
+#elif defined(CONFIG_PPC)
+	delta = mulhwu(tb_to_us, delta);
+#elif defined(CONFIG_ARM)
+	delta = mach_cycles_to_usecs(delta);
+#else
+	#error Implement cycles_to_usecs.
+#endif
+
+	return (unsigned long) delta;
+}
+#endif
+
+static notrace inline cycle_t now(void)
+{
+	if (trace_use_raw_cycles)
+		return get_cycles();
+	return get_monotonic_cycles();
+}
+
+#ifndef irqs_off
+# define irqs_off()			0
+#endif
+
+#ifndef DEBUG_WARN_ON
+static inline int DEBUG_WARN_ON(int cond)
+{
+	WARN_ON(cond);
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+# ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+#  define irqs_off_preempt_count() preempt_count()
+# else
+#  define irqs_off_preempt_count() 0
+# endif
+#endif
+
+#ifdef CONFIG_WAKEUP_TIMING
+struct sch_struct {
+	__raw_spinlock_t trace_lock;
+	struct task_struct *task;
+	int cpu;
+	struct cpu_trace *tr;
+} ____cacheline_aligned_in_smp;
+
+static __cacheline_aligned_in_smp struct sch_struct sch =
+		{ trace_lock: __RAW_SPIN_LOCK_UNLOCKED };
+
+int wakeup_timing = 1;
+#endif
+
+/*
+ * Track maximum latencies and save the trace:
+ */
+
+/*
+ * trace_stop_sched_switched must not be called with runqueue locks held!
+ */
+static __cacheline_aligned_in_smp DECLARE_MUTEX(max_mutex);
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesnt
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp unsigned long max_sequence;
+
+enum trace_type
+{
+	__TRACE_FIRST_TYPE = 0,
+
+	TRACE_FN,
+	TRACE_SPECIAL,
+	TRACE_SPECIAL_PID,
+	TRACE_SPECIAL_U64,
+	TRACE_SPECIAL_SYM,
+	TRACE_CMDLINE,
+	TRACE_SYSCALL,
+	TRACE_SYSRET,
+
+	__TRACE_LAST_TYPE
+};
+
+enum trace_flag_type
+{
+	TRACE_FLAG_IRQS_OFF		= 0x01,
+	TRACE_FLAG_NEED_RESCHED		= 0x02,
+	TRACE_FLAG_NEED_RESCHED_DELAYED	= 0x04,
+	TRACE_FLAG_HARDIRQ		= 0x08,
+	TRACE_FLAG_SOFTIRQ		= 0x10,
+	TRACE_FLAG_IRQS_HARD_OFF	= 0x20,
+};
+
+/*
+ * Maximum preemption latency measured. Initialize to maximum,
+ * we clear it after bootup.
+ */
+#ifdef CONFIG_LATENCY_HIST
+unsigned long preempt_max_latency = (cycle_t)0UL;
+#else
+unsigned long preempt_max_latency = (cycle_t)ULONG_MAX;
+#endif
+
+unsigned long preempt_thresh;
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycle_t delta)
+{
+	if (latency_hist_flag && !trace_user_triggered)
+		return 1;
+
+	if (preempt_thresh) {
+		if (delta < preempt_thresh)
+			return 0;
+	} else {
+		if (delta <= preempt_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+#ifdef CONFIG_EVENT_TRACE
+
+/*
+ * Number of per-CPU trace entries:
+ */
+#define MAX_TRACE (65536UL*16UL)
+
+#define CMDLINE_BYTES 16
+
+/*
+ * 32 bytes on 32-bit platforms:
+ */
+struct trace_entry {
+	char type;
+	char cpu;
+	char flags;
+	char preempt_count; // assumes PREEMPT_MASK is 8 bits or less
+	int pid;
+	cycle_t timestamp;
+	union {
+		struct {
+			unsigned long eip;
+			unsigned long parent_eip;
+		} fn;
+		struct {
+			unsigned long eip;
+			unsigned long v1, v2, v3;
+		} special;
+		struct {
+			unsigned char str[CMDLINE_BYTES];
+		} cmdline;
+		struct {
+			unsigned long nr; // highest bit: compat call
+			unsigned long p1, p2, p3;
+		} syscall;
+		struct {
+			unsigned long ret;
+		} sysret;
+		struct {
+			unsigned long __pad3[4];
+		} pad;
+	} u;
+} __attribute__((packed));
+
+#endif
+
+struct cpu_trace {
+	atomic_t disabled;
+	unsigned long trace_idx;
+	cycle_t preempt_timestamp;
+	unsigned long critical_start, critical_end;
+	unsigned long critical_sequence;
+	atomic_t underrun;
+	atomic_t overrun;
+	int early_warning;
+	int latency_type;
+	int cpu;
+
+#ifdef CONFIG_EVENT_TRACE
+	struct trace_entry *trace;
+	char comm[CMDLINE_BYTES];
+	pid_t pid;
+	unsigned long uid;
+	unsigned long nice;
+	unsigned long policy;
+	unsigned long rt_priority;
+	unsigned long saved_latency;
+#endif
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+	unsigned long stack_check;
+#endif
+} ____cacheline_aligned_in_smp;
+
+static struct cpu_trace cpu_traces[NR_CPUS] ____cacheline_aligned_in_smp =
+{ [0 ... NR_CPUS-1] = {
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ .stack_check = 1
+#endif
+ } };
+
+#ifdef CONFIG_EVENT_TRACE
+
+int trace_enabled = 0;
+int syscall_tracing = 1;
+int stackframe_tracing = 0;
+int mcount_enabled = 0;
+int trace_freerunning = 0;
+int trace_print_on_crash = 0;
+int trace_verbose = 0;
+int trace_all_cpus = 0;
+int print_functions = 0;
+int trace_all_runnable = 0;
+
+/*
+ * user-triggered via gettimeofday(0,1)/gettimeofday(0,0)
+ */
+int trace_user_triggered = 0;
+int trace_user_trigger_irq = -1;
+
+void trace_start_ht_debug(void)
+{
+	trace_all_cpus = 0;
+	trace_freerunning = 1;
+	trace_user_triggered = 1;
+	mcount_enabled = 1;
+	trace_enabled = 1;
+	user_trace_start();
+}
+
+struct saved_trace_struct {
+	int cpu;
+	cycle_t first_timestamp, last_timestamp;
+	struct cpu_trace traces[NR_CPUS];
+} ____cacheline_aligned_in_smp;
+
+/*
+ * The current worst-case trace:
+ */
+static struct saved_trace_struct max_tr;
+
+/*
+ * /proc/latency_trace atomicity:
+ */
+static DECLARE_MUTEX(out_mutex);
+
+static struct saved_trace_struct out_tr;
+
+static void notrace printk_name(unsigned long eip)
+{
+	char namebuf[KSYM_NAME_LEN+1];
+	unsigned long size, offset;
+	const char *sym_name;
+	char *modname;
+
+	sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+	if (sym_name)
+		printk("%s+%#lx/%#lx", sym_name, offset, size);
+	else
+		printk("<%08lx>", eip);
+}
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+
+#ifndef STACK_WARN
+# define STACK_WARN (THREAD_SIZE/8)
+#endif
+
+#define MIN_STACK_NEEDED (sizeof(struct thread_info) + STACK_WARN)
+#define MAX_STACK (THREAD_SIZE - sizeof(struct thread_info))
+
+#if (defined(__i386__) || defined(__x86_64__)) && defined(CONFIG_FRAME_POINTER)
+# define PRINT_EXACT_STACKFRAME
+#endif
+
+#ifdef PRINT_EXACT_STACKFRAME
+static unsigned long *worst_stack_bp;
+#endif
+static DEFINE_RAW_SPINLOCK(worst_stack_lock);
+unsigned long worst_stack_left = THREAD_SIZE;
+static unsigned long worst_stack_printed = THREAD_SIZE;
+static char worst_stack_comm[TASK_COMM_LEN+1];
+static int worst_stack_pid;
+static unsigned long worst_stack_sp;
+static char worst_stack[THREAD_SIZE];
+
+static notrace void fill_worst_stack(unsigned long stack_left)
+{
+	unsigned long flags;
+
+	/*
+	 * On x64, we must not read the PDA during early bootup:
+	 */
+#ifdef CONFIG_X86_64
+	if (system_state == SYSTEM_BOOTING)
+		return;
+#endif
+	spin_lock_irqsave(&worst_stack_lock, flags);
+	if (likely(stack_left < worst_stack_left)) {
+		worst_stack_left = stack_left;
+		memcpy(worst_stack, current_thread_info(), THREAD_SIZE);
+		worst_stack_sp = (unsigned long)&stack_left;
+		memcpy(worst_stack_comm, current->comm, TASK_COMM_LEN);
+		worst_stack_pid = current->pid;
+#ifdef PRINT_EXACT_STACKFRAME
+# ifdef __i386__
+		asm ("mov %%ebp, %0\n" :"=g"(worst_stack_bp));
+# elif defined(__x86_64__)
+		asm ("mov %%rbp, %0\n" :"=g"(worst_stack_bp));
+# else
+#  error Poke the author of above asm code lines !
+# endif
+#endif
+	}
+	spin_unlock_irqrestore(&worst_stack_lock, flags);
+}
+
+#ifdef PRINT_EXACT_STACKFRAME
+
+/*
+ * This takes a BP offset to point the BP back into the saved stack,
+ * the original stack might be long gone (but the stackframe within
+ * the saved copy still contains references to it).
+ */
+#define CONVERT_TO_SAVED_STACK(bp) \
+	((void *)worst_stack + ((unsigned long)bp & (THREAD_SIZE-1)))
+
+static void show_stackframe(void)
+{
+	unsigned long addr, frame_size, *bp, *prev_bp, sum = 0;
+
+	bp = CONVERT_TO_SAVED_STACK(worst_stack_bp);
+
+	while (bp[0]) {
+		addr = bp[1];
+		if (!kernel_text_address(addr))
+			break;
+
+		prev_bp = bp;
+		bp = CONVERT_TO_SAVED_STACK((unsigned long *)bp[0]);
+
+		frame_size = (bp - prev_bp) * sizeof(long);
+
+		if (frame_size < THREAD_SIZE) {
+			printk("{ %4ld} ", frame_size);
+			sum += frame_size;
+		} else
+			printk("{=%4ld} ", sum);
+
+		printk("[<%08lx>] ", addr);
+		printk_name(addr);
+		printk("\n");
+	}
+}
+
+#else
+
+static inline int valid_stack_ptr(void *p)
+{
+	return  p > (void *)worst_stack &&
+                p < (void *)worst_stack + THREAD_SIZE - 3;
+}
+
+static void show_stackframe(void)
+{
+	unsigned long prev_frame, addr;
+	unsigned long *stack;
+
+	prev_frame = (unsigned long)(worst_stack +
+					(worst_stack_sp & (THREAD_SIZE-1)));
+	stack = (unsigned long *)prev_frame;
+
+	while (valid_stack_ptr(stack)) {
+		addr = *stack++;
+		if (__kernel_text_address(addr)) {
+			printk("(%4ld) ", (unsigned long)stack - prev_frame);
+			printk("[<%08lx>] ", addr);
+			print_symbol("%s\n", addr);
+			prev_frame = (unsigned long)stack;
+		}
+		if ((char *)stack >= worst_stack + THREAD_SIZE)
+			break;
+	}
+}
+
+#endif
+
+static notrace void __print_worst_stack(void)
+{
+	unsigned long fill_ratio;
+	printk("----------------------------->\n");
+	printk("| new stack fill maximum: %s/%d, %ld bytes (out of %ld bytes).\n",
+		worst_stack_comm, worst_stack_pid,
+		MAX_STACK-worst_stack_left, (long)MAX_STACK);
+	fill_ratio = (MAX_STACK-worst_stack_left)*100/(long)MAX_STACK;
+	printk("| Stack fill ratio: %02ld%%", fill_ratio);
+	if (fill_ratio >= 90)
+		printk(" - BUG: that's quite high, please report this!\n");
+	else
+		printk(" - that's still OK, no need to report this.\n");
+	printk("------------|\n");
+
+	show_stackframe();
+	printk("<---------------------------\n\n");
+}
+
+static notrace void print_worst_stack(void)
+{
+	unsigned long flags;
+
+	if (irqs_disabled() || preempt_count())
+		return;
+
+	spin_lock_irqsave(&worst_stack_lock, flags);
+	if (worst_stack_printed == worst_stack_left) {
+		spin_unlock_irqrestore(&worst_stack_lock, flags);
+		return;
+	}
+	worst_stack_printed = worst_stack_left;
+	spin_unlock_irqrestore(&worst_stack_lock, flags);
+
+	__print_worst_stack();
+}
+
+static notrace void debug_stackoverflow(struct cpu_trace *tr)
+{
+	long stack_left;
+
+	if (unlikely(tr->stack_check <= 0))
+		return;
+	atomic_inc(&tr->disabled);
+
+	/* Debugging check for stack overflow: is there less than 1KB free? */
+#ifdef __i386__
+	__asm__ __volatile__("and %%esp,%0" :
+				"=r" (stack_left) : "0" (THREAD_SIZE - 1));
+#elif defined(__x86_64__)
+	__asm__ __volatile__("and %%rsp,%0" :
+				"=r" (stack_left) : "0" (THREAD_SIZE - 1));
+#else
+# error Poke the author of above asm code lines !
+#endif
+	if (unlikely(stack_left < MIN_STACK_NEEDED)) {
+		tr->stack_check = 0;
+		printk(KERN_ALERT "BUG: stack overflow: only %ld bytes left! [%08lx...(%08lx-%08lx)]\n",
+			stack_left - sizeof(struct thread_info),
+			(long)&stack_left,
+			(long)current_thread_info(),
+			(long)current_thread_info() + THREAD_SIZE);
+		fill_worst_stack(stack_left);
+		__print_worst_stack();
+		goto out;
+	}
+	if (unlikely(stack_left < worst_stack_left)) {
+		tr->stack_check--;
+		fill_worst_stack(stack_left);
+		print_worst_stack();
+		tr->stack_check++;
+	} else
+		if (worst_stack_printed != worst_stack_left) {
+			tr->stack_check--;
+			print_worst_stack();
+			tr->stack_check++;
+		}
+out:
+	atomic_dec(&tr->disabled);
+}
+
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
+static void notrace early_printk_name(unsigned long eip)
+{
+	char namebuf[KSYM_NAME_LEN+1];
+	unsigned long size, offset;
+	const char *sym_name;
+	char *modname;
+
+	sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+	if (sym_name)
+		early_printk("%s <%08lx>", sym_name, eip);
+	else
+		early_printk("<%08lx>", eip);
+}
+
+static __raw_spinlock_t early_print_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
+static void notrace early_print_entry(struct trace_entry *entry)
+{
+	int hardirq, softirq;
+
+	__raw_spin_lock(&early_print_lock);
+	early_printk("%-5d ", entry->pid);
+
+	early_printk("%d%c%c",
+		entry->cpu,
+		(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+		(entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.',
+		(entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' :
+ 		((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (hardirq && softirq)
+		early_printk("H");
+	else {
+		if (hardirq)
+			early_printk("h");
+		else {
+			if (softirq)
+				early_printk("s");
+			else
+				early_printk(".");
+		}
+	}
+
+	early_printk(":%d: ", entry->preempt_count);
+
+	if (entry->type == TRACE_FN) {
+		early_printk_name(entry->u.fn.eip);
+		early_printk("  <= (");
+		early_printk_name(entry->u.fn.parent_eip);
+		early_printk(")\n");
+	} else {
+		/* special entries: */
+		early_printk_name(entry->u.special.eip);
+		early_printk(": <%08lx> <%08lx> <%08lx>\n",
+			entry->u.special.v1,
+			entry->u.special.v2,
+			entry->u.special.v3);
+	}
+	__raw_spin_unlock(&early_print_lock);
+}
+#else
+#  define early_print_entry(x) do { } while(0)
+#endif
+
+static void notrace
+____trace(int cpu, enum trace_type type, struct cpu_trace *tr,
+	  unsigned long eip, unsigned long parent_eip,
+	  unsigned long v1, unsigned long v2, unsigned long v3,
+	  unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long idx, idx_next;
+	cycle_t timestamp;
+	u32 pc;
+
+#ifdef CONFIG_DEBUG_PREEMPT
+//	WARN_ON(!atomic_read(&tr->disabled));
+#endif
+	if (!tr->critical_start && !trace_user_triggered && !trace_all_cpus &&
+	    !trace_print_on_crash && !print_functions)
+		goto out;
+	/*
+	 * Allocate the next index. Make sure an NMI (or interrupt)
+	 * has not taken it away. Potentially redo the timestamp as
+	 * well to make sure the trace timestamps are in chronologic
+	 * order.
+	 */
+again:
+	idx = tr->trace_idx;
+	idx_next = idx + 1;
+	timestamp = now();
+
+	if (unlikely((trace_freerunning || print_functions || atomic_read(&tr->underrun)) &&
+		     (idx_next >= MAX_TRACE) && !atomic_read(&tr->overrun))) {
+		atomic_inc(&tr->underrun);
+		idx_next = 0;
+	}
+	if (unlikely(idx >= MAX_TRACE)) {
+		atomic_inc(&tr->overrun);
+		goto out;
+	}
+#ifdef __HAVE_ARCH_CMPXCHG
+	if (unlikely(cmpxchg(&tr->trace_idx, idx, idx_next) != idx)) {
+		if (idx_next == 0)
+			atomic_dec(&tr->underrun);
+		goto again;
+	}
+#else
+# ifdef CONFIG_SMP
+#  error CMPXCHG missing
+# else
+	/* No worry, we are protected by the atomic_incr(&tr->disabled)
+	 * in __trace further down
+	 */
+	tr->trace_idx = idx_next;
+# endif
+#endif
+	if (unlikely(idx_next != 0 && atomic_read(&tr->underrun)))
+		atomic_inc(&tr->underrun);
+
+	pc = preempt_count();
+
+	if (unlikely(!tr->trace))
+		goto out;
+	entry = tr->trace + idx;
+	entry->type = type;
+#ifdef CONFIG_SMP
+	entry->cpu = cpu;
+#endif
+	entry->flags = (irqs_off() ? TRACE_FLAG_IRQS_OFF : 0) |
+		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_HARD_OFF : 0)|
+		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
+		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
+		(need_resched_delayed() ? TRACE_FLAG_NEED_RESCHED_DELAYED : 0);
+	entry->preempt_count = pc & 0xff;
+	entry->pid = current->pid;
+	entry->timestamp = timestamp;
+
+	switch (type) {
+	case TRACE_FN:
+		entry->u.fn.eip = eip;
+		entry->u.fn.parent_eip = parent_eip;
+		if (unlikely(print_functions && !in_interrupt()))
+			early_print_entry(entry);
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_SPECIAL_PID:
+	case TRACE_SPECIAL_U64:
+	case TRACE_SPECIAL_SYM:
+		entry->u.special.eip = eip;
+		entry->u.special.v1 = v1;
+		entry->u.special.v2 = v2;
+		entry->u.special.v3 = v3;
+		if (unlikely(print_functions && !in_interrupt()))
+			early_print_entry(entry);
+		break;
+	case TRACE_SYSCALL:
+		entry->u.syscall.nr = eip;
+		entry->u.syscall.p1 = v1;
+		entry->u.syscall.p2 = v2;
+		entry->u.syscall.p3 = v3;
+		break;
+	case TRACE_SYSRET:
+		entry->u.sysret.ret = eip;
+		break;
+	case TRACE_CMDLINE:
+		memcpy(entry->u.cmdline.str, current->comm, CMDLINE_BYTES);
+		break;
+	default:
+		break;
+	}
+out:
+	;
+}
+
+static inline void notrace
+___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip,
+		unsigned long v1, unsigned long v2,
+			unsigned long v3)
+{
+	struct cpu_trace *tr;
+	unsigned long flags;
+	int cpu;
+
+	if (unlikely(trace_enabled <= 0))
+		return;
+
+#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_X86)
+	debug_stackoverflow(cpu_traces + raw_smp_processor_id());
+#endif
+
+	raw_local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	/*
+	 * Trace on the CPU where the current highest-prio task
+	 * is waiting to become runnable:
+	 */
+#ifdef CONFIG_WAKEUP_TIMING
+	if (wakeup_timing && !trace_all_cpus && !trace_print_on_crash &&
+	    !print_functions) {
+		if (!sch.tr || cpu != sch.cpu)
+			goto out;
+		tr = sch.tr;
+	} else
+		tr = cpu_traces + cpu;
+#else
+	tr = cpu_traces + cpu;
+#endif
+	atomic_inc(&tr->disabled);
+	if (likely(atomic_read(&tr->disabled) == 1)) {
+//#define DEBUG_STACK_POISON
+#ifdef DEBUG_STACK_POISON
+		char stack;
+
+		memset(&stack - 128, 0x34, 128);
+#endif
+		____trace(cpu, type, tr, eip, parent_eip, v1, v2, v3, flags);
+	}
+	atomic_dec(&tr->disabled);
+#ifdef CONFIG_WAKEUP_TIMING
+out:
+#endif
+	raw_local_irq_restore(flags);
+}
+
+/*
+ * Special, ad-hoc tracepoints:
+ */
+void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3)
+{
+	___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, v1, v2, v3);
+}
+
+EXPORT_SYMBOL(trace_special);
+
+void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2)
+{
+	___trace(TRACE_SPECIAL_PID, CALLER_ADDR0, 0, pid, v1, v2);
+}
+
+EXPORT_SYMBOL(trace_special_pid);
+
+void notrace trace_special_u64(unsigned long long v1, unsigned long v2)
+{
+	___trace(TRACE_SPECIAL_U64, CALLER_ADDR0, 0,
+		 (unsigned long) (v1 >> 32), (unsigned long) (v1 & 0xFFFFFFFF),
+		 v2);
+}
+
+EXPORT_SYMBOL(trace_special_u64);
+
+void notrace trace_special_sym(void)
+{
+#define STACK_ENTRIES 8
+	unsigned long entries[STACK_ENTRIES];
+	struct stack_trace trace;
+
+	if (!trace_enabled)
+		return;
+
+	if (!stackframe_tracing)
+		return 	___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, CALLER_ADDR1, 0, 0);
+
+	trace.entries = entries;
+	trace.skip = 3;
+#if 0
+	trace.all_contexts = 1;
+#endif
+	trace.max_entries = STACK_ENTRIES;
+	trace.nr_entries = 0;
+
+#if 0
+	save_stack_trace(&trace, NULL);
+#else
+	save_stack_trace(&trace);
+#endif
+	/*
+	 * clear out the rest:
+	 */
+	while (trace.nr_entries < trace.max_entries)
+		entries[trace.nr_entries++] = 0;
+
+	___trace(TRACE_SPECIAL_SYM, entries[0], 0,
+					entries[1], entries[2], entries[3]);
+	___trace(TRACE_SPECIAL_SYM, entries[4], 0,
+					entries[5], entries[6], entries[7]);
+}
+
+EXPORT_SYMBOL(trace_special_sym);
+
+/*
+ * Non-inlined function:
+ */
+void notrace __trace(unsigned long eip, unsigned long parent_eip)
+{
+	___trace(TRACE_FN, eip, parent_eip, 0, 0, 0);
+}
+
+#ifdef CONFIG_MCOUNT
+
+extern void mcount(void);
+
+EXPORT_SYMBOL(mcount);
+
+void notrace __mcount(void)
+{
+	___trace(TRACE_FN, CALLER_ADDR1, CALLER_ADDR2, 0, 0, 0);
+}
+
+#endif
+
+void notrace
+sys_call(unsigned long nr, unsigned long p1, unsigned long p2, unsigned long p3)
+{
+	if (syscall_tracing)
+		___trace(TRACE_SYSCALL, nr, 0, p1, p2, p3);
+}
+
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86)
+
+void notrace
+sys_ia32_call(unsigned long nr, unsigned long p1, unsigned long p2,
+	      unsigned long p3)
+{
+	if (syscall_tracing)
+		___trace(TRACE_SYSCALL, nr | 0x80000000, 0, p1, p2, p3);
+}
+
+#endif
+
+void notrace sys_ret(unsigned long ret)
+{
+	if (syscall_tracing)
+		___trace(TRACE_SYSRET, ret, 0, 0, 0, 0);
+}
+
+static void notrace print_name(struct seq_file *m, unsigned long eip)
+{
+	char namebuf[KSYM_NAME_LEN+1];
+	unsigned long size, offset;
+	const char *sym_name;
+	char *modname;
+
+	/*
+	 * Special trace values:
+	 */
+	if (((long)eip < 10000L) && ((long)eip > -10000L)) {
+		seq_printf(m, "(%5ld)", eip);
+		return;
+	}
+	sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+	if (sym_name)
+		seq_puts(m, sym_name);
+	else
+		seq_printf(m, "<%08lx>", eip);
+}
+
+static void notrace print_name_offset(struct seq_file *m, unsigned long eip)
+{
+	char namebuf[KSYM_NAME_LEN+1];
+	unsigned long size, offset;
+	const char *sym_name;
+	char *modname;
+
+	sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+	if (sym_name)
+		seq_printf(m, "%s+%#lx/%#lx <%08lx>",
+					sym_name, offset, size, eip);
+	else
+		seq_printf(m, "<%08lx>", eip);
+}
+
+static unsigned long out_sequence = -1;
+
+static int pid_to_cmdline_array[PID_MAX_DEFAULT+1];
+
+static void notrace _trace_cmdline(int cpu, struct cpu_trace *tr)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+	____trace(cpu, TRACE_CMDLINE, tr, 0, 0, 0, 0, 0, flags);
+}
+
+void notrace trace_cmdline(void)
+{
+	___trace(TRACE_CMDLINE, 0, 0, 0, 0, 0);
+}
+
+static void construct_pid_to_cmdline(struct cpu_trace *tr)
+{
+	unsigned int i, j, entries, pid;
+
+	if (tr->critical_sequence == out_sequence)
+		return;
+	out_sequence = tr->critical_sequence;
+
+	memset(pid_to_cmdline_array, -1, sizeof(int) * (PID_MAX_DEFAULT + 1));
+
+	if (!tr->trace)
+		return;
+
+	entries = min(tr->trace_idx, MAX_TRACE);
+
+	for (i = 0; i < entries; i++) {
+		struct trace_entry *entry = tr->trace + i;
+
+		if (entry->type != TRACE_CMDLINE)
+			continue;
+		pid = entry->pid;
+		if (pid < PID_MAX_DEFAULT) {
+			pid_to_cmdline_array[pid] = i;
+			/*
+			 * Replace space with underline - makes it easier
+			 * to process for tools:
+			 */
+			for (j = 0; j < CMDLINE_BYTES; j++)
+				if (entry->u.cmdline.str[j] == ' ')
+					entry->u.cmdline.str[j] = '_';
+		}
+	}
+}
+
+char *pid_to_cmdline(unsigned long pid)
+{
+	struct cpu_trace *tr = out_tr.traces + 0;
+	char *cmdline = "<...>";
+	int idx;
+
+	pid = min(pid, (unsigned long)PID_MAX_DEFAULT);
+	if (!pid)
+		return "<idle>";
+
+	if (pid_to_cmdline_array[pid] != -1) {
+		idx = pid_to_cmdline_array[pid];
+		if (tr->trace[idx].type == TRACE_CMDLINE)
+			cmdline = tr->trace[idx].u.cmdline.str;
+	}
+	return cmdline;
+}
+
+static void copy_trace(struct cpu_trace *save, struct cpu_trace *tr, int reorder)
+{
+	if (!save->trace || !tr->trace)
+		return;
+	/* free-running needs reordering */
+	if (reorder && atomic_read(&tr->underrun)) {
+		int i, idx, idx0 = tr->trace_idx;
+
+		for (i = 0; i < MAX_TRACE; i++) {
+			idx = (idx0 + i) % MAX_TRACE;
+			save->trace[i] = tr->trace[idx];
+		}
+		save->trace_idx = MAX_TRACE;
+	} else {
+		save->trace_idx = tr->trace_idx;
+
+		memcpy(save->trace, tr->trace,
+			min(save->trace_idx, MAX_TRACE) *
+					sizeof(struct trace_entry));
+	}
+	save->underrun = tr->underrun;
+	save->overrun = tr->overrun;
+}
+
+
+struct block_idx {
+	int idx[NR_CPUS];
+};
+
+/*
+ * return the trace entry (position) of the smallest-timestamp
+ * one (that is still in the valid idx range):
+ */
+static int min_idx(struct block_idx *bidx)
+{
+	cycle_t min_stamp = (cycle_t) -1;
+	struct trace_entry *entry;
+	int cpu, min_cpu = -1, idx;
+
+	for_each_online_cpu(cpu) {
+		idx = bidx->idx[cpu];
+		if (idx >= min(max_tr.traces[cpu].trace_idx, MAX_TRACE))
+			continue;
+		if (idx > MAX_TRACE*NR_CPUS) {
+			printk("huh: idx (%d) > %ld*%d!\n", idx, MAX_TRACE,
+				NR_CPUS);
+			WARN_ON(1);
+			break;
+		}
+		entry = max_tr.traces[cpu].trace + bidx->idx[cpu];
+		if (entry->timestamp < min_stamp) {
+			min_cpu = cpu;
+			min_stamp = entry->timestamp;
+		}
+	}
+
+	return min_cpu;
+}
+
+/*
+ * This code is called to construct an output trace from
+ * the maximum trace. Having separate traces serves both
+ * atomicity (a new max might be saved while we are busy
+ * accessing /proc/latency_trace) and it is also used to
+ * delay the (expensive) sorting of the output trace by
+ * timestamps, in the trace_all_cpus case.
+ */
+static void update_out_trace(void)
+{
+	struct trace_entry *out_entry, *entry, *tmp;
+	cycle_t stamp, first_stamp, last_stamp;
+	struct block_idx bidx = { { 0, }, };
+	struct cpu_trace *tmp_max, *tmp_out;
+	int cpu, sum, entries, underrun_sum, overrun_sum;
+
+	/*
+	 * For out_tr we only have the first array's trace entries
+	 * allocated - and they have are larger on SMP to make room
+	 * for all trace entries from all CPUs.
+	 */
+	tmp_out = out_tr.traces + 0;
+	tmp_max = max_tr.traces + max_tr.cpu;
+	/*
+	 * Easier to copy this way. Note: the trace buffer is private
+	 * to the output buffer, so preserve it:
+	 */
+	copy_trace(tmp_out, tmp_max, 0);
+	tmp = tmp_out->trace;
+	*tmp_out = *tmp_max;
+	tmp_out->trace = tmp;
+
+	out_tr.cpu = max_tr.cpu;
+
+	if (!tmp_out->trace)
+		return;
+
+	out_entry = tmp_out->trace + 0;
+
+	if (!trace_all_cpus) {
+		entries = min(tmp_out->trace_idx, MAX_TRACE);
+		if (!entries)
+			return;
+		out_tr.first_timestamp = tmp_out->trace[0].timestamp;
+		out_tr.last_timestamp = tmp_out->trace[entries-1].timestamp;
+		return;
+	}
+	/*
+	 * Find the range of timestamps that are fully traced in
+	 * all CPU traces. (since CPU traces can cover a variable
+	 * range of time, we have to find the best range.)
+	 */
+	first_stamp = 0;
+	for_each_online_cpu(cpu) {
+		tmp_max = max_tr.traces + cpu;
+		stamp = tmp_max->trace[0].timestamp;
+		if (stamp > first_stamp)
+			first_stamp = stamp;
+	}
+	/*
+	 * Save the timestamp range:
+	 */
+	tmp_max = max_tr.traces + max_tr.cpu;
+	entries = min(tmp_max->trace_idx, MAX_TRACE);
+	/*
+	 * No saved trace yet?
+	 */
+	if (!entries) {
+		out_tr.traces[0].trace_idx = 0;
+		return;
+	}
+
+	last_stamp = tmp_max->trace[entries-1].timestamp;
+
+	if (last_stamp < first_stamp) {
+		WARN_ON(1);
+
+		for_each_online_cpu(cpu) {
+			tmp_max = max_tr.traces + cpu;
+			entries = min(tmp_max->trace_idx, MAX_TRACE);
+			printk("CPU%d: %016Lx (%016Lx) ... #%d (%016Lx) %016Lx\n",
+				cpu,
+				tmp_max->trace[0].timestamp,
+				tmp_max->trace[1].timestamp,
+				entries,
+				tmp_max->trace[entries-2].timestamp,
+				tmp_max->trace[entries-1].timestamp);
+		}
+		tmp_max = max_tr.traces + max_tr.cpu;
+		entries = min(tmp_max->trace_idx, MAX_TRACE);
+
+		printk("CPU%d entries: %d\n", max_tr.cpu, entries);
+		printk("first stamp: %016Lx\n", first_stamp);
+		printk(" last stamp: %016Lx\n", first_stamp);
+	}
+
+#if 0
+	printk("first_stamp: %Ld [%016Lx]\n", first_stamp, first_stamp);
+	printk(" last_stamp: %Ld [%016Lx]\n", last_stamp, last_stamp);
+	printk("   +1 stamp: %Ld [%016Lx]\n",
+		tmp_max->trace[entries].timestamp,
+		tmp_max->trace[entries].timestamp);
+	printk("   +2 stamp: %Ld [%016Lx]\n",
+		tmp_max->trace[entries+1].timestamp,
+		tmp_max->trace[entries+1].timestamp);
+	printk("      delta: %Ld\n", last_stamp-first_stamp);
+	printk("    entries: %d\n", entries);
+#endif
+
+	out_tr.first_timestamp = first_stamp;
+	out_tr.last_timestamp = last_stamp;
+
+	/*
+	 * Fetch trace entries one by one, in increasing timestamp
+	 * order. Start at first_stamp, stop at last_stamp:
+	 */
+	sum = 0;
+	for (;;) {
+		cpu = min_idx(&bidx);
+		if (cpu == -1)
+			break;
+		entry = max_tr.traces[cpu].trace + bidx.idx[cpu];
+		if (entry->timestamp > last_stamp)
+			break;
+
+		bidx.idx[cpu]++;
+		if (entry->timestamp < first_stamp)
+			continue;
+		*out_entry = *entry;
+		out_entry++;
+		sum++;
+		if (sum > MAX_TRACE*NR_CPUS) {
+			printk("huh: sum (%d) > %ld*%d!\n", sum, MAX_TRACE,
+				NR_CPUS);
+			WARN_ON(1);
+			break;
+		}
+	}
+
+	sum = 0;
+	underrun_sum = 0;
+	overrun_sum = 0;
+	for_each_online_cpu(cpu) {
+		sum += max_tr.traces[cpu].trace_idx;
+		underrun_sum += atomic_read(&max_tr.traces[cpu].underrun);
+		overrun_sum += atomic_read(&max_tr.traces[cpu].overrun);
+	}
+	tmp_out->trace_idx = sum;
+	atomic_set(&tmp_out->underrun, underrun_sum);
+	atomic_set(&tmp_out->overrun, overrun_sum);
+}
+
+static void notrace print_help_header(struct seq_file *m)
+{
+	seq_puts(m, "                 _------=> CPU#            \n");
+	seq_puts(m, "                / _-----=> irqs-off        \n");
+	seq_puts(m, "               | / _----=> need-resched    \n");
+	seq_puts(m, "               || / _---=> hardirq/softirq \n");
+	seq_puts(m, "               ||| / _--=> preempt-depth   \n");
+	seq_puts(m, "               |||| /                      \n");
+	seq_puts(m, "               |||||     delay             \n");
+	seq_puts(m, "   cmd     pid ||||| time  |   caller      \n");
+	seq_puts(m, "      \\   /    |||||   \\   |   /           \n");
+}
+
+static void * notrace l_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t n = *pos;
+	unsigned long entries;
+	struct cpu_trace *tr = out_tr.traces + 0;
+
+	down(&out_mutex);
+	/*
+	 * if the file is being read from the start, update the output trace:
+	 */
+	if (!n) {
+		// TODO: use the sequence counter here to optimize
+		down(&max_mutex);
+		update_out_trace();
+		up(&max_mutex);
+#if 0
+		if (!tr->trace_idx) {
+			up(&out_mutex);
+			return NULL;
+		}
+#endif
+		construct_pid_to_cmdline(tr);
+	}
+	entries = min(tr->trace_idx, MAX_TRACE);
+
+	if (!n) {
+		seq_printf(m, "preemption latency trace v1.1.5 on %s\n",
+			   UTS_RELEASE);
+		seq_puts(m, "--------------------------------------------------------------------\n");
+		seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d | (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+			cycles_to_usecs(tr->saved_latency),
+			entries,
+			(entries + atomic_read(&tr->underrun) +
+			 atomic_read(&tr->overrun)),
+			out_tr.cpu,
+#if defined(CONFIG_PREEMPT_NONE)
+			"server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+			"desktop",
+#elif defined(CONFIG_PREEMPT_DESKTOP)
+			"preempt",
+#else
+			"rt",
+#endif
+			0, 0,
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+			softirq_preemption
+#else
+			0
+#endif
+			,
+#ifdef CONFIG_PREEMPT_HARDIRQS
+			hardirq_preemption
+#else
+			0
+#endif
+		);
+#ifdef CONFIG_SMP
+		seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+		seq_puts(m, ")\n");
+#endif
+		seq_puts(m, "    -----------------\n");
+		seq_printf(m, "    | task: %.16s-%d (uid:%ld nice:%ld policy:%ld rt_prio:%ld)\n",
+			tr->comm, tr->pid, tr->uid, tr->nice,
+			tr->policy, tr->rt_priority);
+		seq_puts(m, "    -----------------\n");
+		if (trace_user_triggered) {
+			seq_puts(m, " => started at: ");
+			print_name_offset(m, tr->critical_start);
+			seq_puts(m, "\n => ended at:   ");
+			print_name_offset(m, tr->critical_end);
+			seq_puts(m, "\n");
+		}
+		seq_puts(m, "\n");
+
+		if (!trace_verbose)
+			print_help_header(m);
+	}
+	if (n >= entries || !tr->trace)
+		return NULL;
+
+	return tr->trace + n;
+}
+
+static void * notrace l_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct cpu_trace *tr = out_tr.traces;
+	unsigned long entries = min(tr->trace_idx, MAX_TRACE);
+
+	WARN_ON(!tr->trace);
+
+	if (++*pos >= entries) {
+		if (*pos == entries)
+			seq_puts(m, "\n\nvim:ft=help\n");
+		return NULL;
+	}
+	return tr->trace + *pos;
+}
+
+static void notrace l_stop(struct seq_file *m, void *p)
+{
+	up(&out_mutex);
+}
+
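+/*
+ * Print the absolute timestamp and annotate the delay to the next
+ * entry: '!' if it is more than 100 usecs away, '+' if it is more
+ * than 1 usec away, ' ' otherwise:
+ */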
+static void print_timestamp(struct seq_file *m, unsigned long abs_usecs,
+						unsigned long rel_usecs)
+{
+	seq_printf(m, " %4ldus", abs_usecs);
+	if (rel_usecs > 100)
+		seq_puts(m, "!: ");
+	else if (rel_usecs > 1)
+		seq_puts(m, "+: ");
+	else
+		seq_puts(m, " : ");
+}
+
+static void
+print_timestamp_short(struct seq_file *m, unsigned long abs_usecs,
+			unsigned long rel_usecs)
+{
+	seq_printf(m, " %4ldus", abs_usecs);
+	if (rel_usecs > 100)
+		seq_putc(m, '!');
+	else if (rel_usecs > 1)
+		seq_putc(m, '+');
+	else
+		seq_putc(m, ' ');
+}
+
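+/*
+ * Print the common entry prefix: command/pid, CPU number and the
+ * irqs-off, need-resched, hardirq/softirq and preempt-depth flag
+ * characters - matching the columns of the help header above:
+ */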
+static void
+print_generic(struct seq_file *m, struct trace_entry *entry)
+{
+	int hardirq, softirq;
+
+	seq_printf(m, "%8.8s-%-5d ", pid_to_cmdline(entry->pid), entry->pid);
+	seq_printf(m, "%d", entry->cpu);
+	seq_printf(m, "%c%c",
+		(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+		(entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.',
+		(entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' :
+ 		((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (hardirq && softirq)
+		seq_putc(m, 'H');
+	else {
+		if (hardirq)
+			seq_putc(m, 'h');
+		else {
+			if (softirq)
+				seq_putc(m, 's');
+			else
+				seq_putc(m, '.');
+		}
+	}
+
+	if (entry->preempt_count)
+		seq_printf(m, "%x", entry->preempt_count);
+	else
+		seq_puts(m, ".");
+}
+
+
+static int notrace l_show_fn(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+
+	abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+	if (trace_verbose) {
+		seq_printf(m, "%16s %5d %d %d %08x %08lx [%016Lx] %ld.%03ldms (+%ld.%03ldms): ",
+			pid_to_cmdline(entry->pid),
+			entry->pid, entry->cpu, entry->flags,
+			entry->preempt_count, trace_idx,
+			entry->timestamp, abs_usecs/1000,
+			abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000);
+		print_name_offset(m, entry->u.fn.eip);
+		seq_puts(m, " (");
+		print_name_offset(m, entry->u.fn.parent_eip);
+		seq_puts(m, ")\n");
+	} else {
+		print_generic(m, entry);
+		print_timestamp(m, abs_usecs, rel_usecs);
+		print_name(m, entry->u.fn.eip);
+		seq_puts(m, " (");
+		print_name(m, entry->u.fn.parent_eip);
+		seq_puts(m, ")\n");
+	}
+	return 0;
+}
+
+static int notrace l_show_special(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry, int mode64)
+{
+	unsigned long abs_usecs, rel_usecs;
+
+	abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp(m, abs_usecs, rel_usecs);
+	if (trace_verbose)
+		print_name_offset(m, entry->u.special.eip);
+	else
+		print_name(m, entry->u.special.eip);
+
+	if (!mode64) {
+		/*
+		 * For convenience, print small numbers in decimal:
+		 */
+		if (abs((int)entry->u.special.v1) < 10000)
+			seq_printf(m, " (%5ld ", entry->u.special.v1);
+		else
+			seq_printf(m, " (%lx ", entry->u.special.v1);
+		if (abs((int)entry->u.special.v2) < 10000)
+			seq_printf(m, "%5ld ", entry->u.special.v2);
+		else
+			seq_printf(m, "%lx ", entry->u.special.v2);
+		if (abs((int)entry->u.special.v3) < 10000)
+			seq_printf(m, "%5ld)\n", entry->u.special.v3);
+		else
+			seq_printf(m, "%lx)\n", entry->u.special.v3);
+	} else {
+		seq_printf(m, " (%13Ld %ld)\n",
+			   ((u64)entry->u.special.v1 << 32)
+			   + (u64)entry->u.special.v2, entry->u.special.v3);
+	}
+	return 0;
+}
+
+static int notrace
+l_show_special_pid(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+	unsigned int pid;
+
+	pid = entry->u.special.v1;
+
+	abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp(m, abs_usecs, rel_usecs);
+	if (trace_verbose)
+		print_name_offset(m, entry->u.special.eip);
+	else
+		print_name(m, entry->u.special.eip);
+	seq_printf(m, " <%.8s-%d> (%ld %ld)\n",
+		pid_to_cmdline(pid), pid,
+		entry->u.special.v2, entry->u.special.v3);
+
+	return 0;
+}
+
+static int notrace
+l_show_special_sym(struct seq_file *m, unsigned long trace_idx,
+		   struct trace_entry *entry, struct trace_entry *entry0,
+		   struct trace_entry *next_entry, int mode64)
+{
+	unsigned long abs_usecs, rel_usecs;
+
+	abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp(m, abs_usecs, rel_usecs);
+	if (trace_verbose)
+		print_name_offset(m, entry->u.special.eip);
+	else
+		print_name(m, entry->u.special.eip);
+
+	seq_puts(m, "()<-");
+	print_name(m, entry->u.special.v1);
+	seq_puts(m, "()<-");
+	print_name(m, entry->u.special.v2);
+	seq_puts(m, "()<-");
+	print_name(m, entry->u.special.v3);
+	seq_puts(m, "()\n");
+
+	return 0;
+}
+
+
+static int notrace l_show_cmdline(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+
+	if (!trace_verbose)
+		return 0;
+
+	abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+	seq_printf(m,
+		"[ => %16s ] %ld.%03ldms (+%ld.%03ldms)\n",
+			entry->u.cmdline.str,
+			abs_usecs/1000, abs_usecs % 1000,
+			rel_usecs/1000, rel_usecs % 1000);
+
+	return 0;
+}
+
+extern unsigned long sys_call_table[NR_syscalls];
+
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86)
+extern unsigned long ia32_sys_call_table[], ia32_syscall_end[];
+#define IA32_NR_syscalls (ia32_syscall_end - ia32_sys_call_table)
+#endif
+
+static int notrace l_show_syscall(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+	unsigned long nr;
+
+	abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp_short(m, abs_usecs, rel_usecs);
+
+	seq_puts(m, "> ");
+	nr = entry->u.syscall.nr;
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86)
+	if (nr & 0x80000000) {
+		nr &= ~0x80000000;
+		if (nr < IA32_NR_syscalls)
+			print_name(m, ia32_sys_call_table[nr]);
+		else
+			seq_printf(m, "<badsys(%lu)>", nr);
+	} else
+#endif
+	if (nr < NR_syscalls)
+		print_name(m, sys_call_table[nr]);
+	else
+		seq_printf(m, "<badsys(%lu)>", nr);
+
+#ifdef CONFIG_64BIT
+	seq_printf(m, " (%016lx %016lx %016lx)\n",
+		entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3);
+#else
+	seq_printf(m, " (%08lx %08lx %08lx)\n",
+		entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3);
+#endif
+
+	return 0;
+}
+
+static int notrace l_show_sysret(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+
+	abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp_short(m, abs_usecs, rel_usecs);
+
+	seq_printf(m, "< (%ld)\n", entry->u.sysret.ret);
+
+	return 0;
+}
+
+
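+/*
+ * seq_file ->show() handler: skip entries outside the saved
+ * [first_timestamp, last_timestamp] window and dispatch the rest
+ * to the type-specific formatters above:
+ */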
+static int notrace l_show(struct seq_file *m, void *p)
+{
+	struct cpu_trace *tr = out_tr.traces;
+	struct trace_entry *entry, *entry0, *next_entry;
+	unsigned long trace_idx;
+
+	cond_resched();
+	entry = p;
+	if (entry->timestamp < out_tr.first_timestamp)
+		return 0;
+	if (entry->timestamp > out_tr.last_timestamp)
+		return 0;
+
+	entry0 = tr->trace;
+	trace_idx = entry - entry0;
+
+	if (trace_idx + 1 < tr->trace_idx)
+		next_entry = entry + 1;
+	else
+		next_entry = entry;
+
+	if (trace_verbose)
+		seq_printf(m, "(T%d/#%ld) ", entry->type, trace_idx);
+
+	switch (entry->type) {
+		case TRACE_FN:
+			l_show_fn(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SPECIAL:
+			l_show_special(m, trace_idx, entry, entry0, next_entry, 0);
+			break;
+		case TRACE_SPECIAL_PID:
+			l_show_special_pid(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SPECIAL_U64:
+			l_show_special(m, trace_idx, entry, entry0, next_entry, 1);
+			break;
+		case TRACE_SPECIAL_SYM:
+			l_show_special_sym(m, trace_idx, entry, entry0,
+					   next_entry, 1);
+			break;
+		case TRACE_CMDLINE:
+			l_show_cmdline(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SYSCALL:
+			l_show_syscall(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SYSRET:
+			l_show_sysret(m, trace_idx, entry, entry0, next_entry);
+			break;
+		default:
+			seq_printf(m, "unknown trace type %d\n", entry->type);
+	}
+	return 0;
+}
+
+struct seq_operations latency_trace_op = {
+	.start	= l_start,
+	.next	= l_next,
+	.stop	= l_stop,
+	.show	= l_show
+};
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /proc/latency_trace)
+ */
+static void update_max_tr(struct cpu_trace *tr)
+{
+	struct cpu_trace *save;
+	int cpu, all_cpus = 0;
+
+#ifdef CONFIG_PREEMPT
+	WARN_ON(!preempt_count() && !irqs_disabled());
+#endif
+
+	max_tr.cpu = tr->cpu;
+	save = max_tr.traces + tr->cpu;
+
+	if ((wakeup_timing || trace_user_triggered || trace_print_on_crash ||
+	     print_functions) && trace_all_cpus) {
+		all_cpus = 1;
+		for_each_online_cpu(cpu)
+			atomic_inc(&cpu_traces[cpu].disabled);
+	}
+
+	save->saved_latency = preempt_max_latency;
+	save->preempt_timestamp = tr->preempt_timestamp;
+	save->critical_start = tr->critical_start;
+	save->critical_end = tr->critical_end;
+	save->critical_sequence = tr->critical_sequence;
+
+	memcpy(save->comm, current->comm, CMDLINE_BYTES);
+	save->pid = current->pid;
+	save->uid = current->uid;
+	save->nice = current->static_prio - 20 - MAX_RT_PRIO;
+	save->policy = current->policy;
+	save->rt_priority = current->rt_priority;
+
+	if (all_cpus) {
+		for_each_online_cpu(cpu) {
+			copy_trace(max_tr.traces + cpu, cpu_traces + cpu, 1);
+			atomic_dec(&cpu_traces[cpu].disabled);
+		}
+	} else
+		copy_trace(save, tr, 1);
+}
+
+#else /* !EVENT_TRACE */
+
+static inline void notrace
+____trace(int cpu, enum trace_type type, struct cpu_trace *tr,
+	  unsigned long eip, unsigned long parent_eip,
+	  unsigned long v1, unsigned long v2, unsigned long v3,
+	  unsigned long flags)
+{
+}
+
+static inline void notrace
+___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip,
+		unsigned long v1, unsigned long v2,
+			unsigned long v3)
+{
+}
+
+static inline void notrace __trace(unsigned long eip, unsigned long parent_eip)
+{
+}
+
+static inline void update_max_tr(struct cpu_trace *tr)
+{
+}
+
+static inline void notrace _trace_cmdline(int cpu, struct cpu_trace *tr)
+{
+}
+
+#endif
+
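+/*
+ * 'preempt_thresh=<usecs>' boot parameter: set the latency
+ * reporting threshold (stored internally in cycles):
+ */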
+static int setup_preempt_thresh(char *s)
+{
+	int thresh;
+
+	get_option(&s, &thresh);
+	if (thresh > 0) {
+		preempt_thresh = usecs_to_cycles(thresh);
+		printk("Preemption threshold = %u us\n", thresh);
+	}
+	return 1;
+}
+__setup("preempt_thresh=", setup_preempt_thresh);
+
+static inline void notrace reset_trace_idx(int cpu, struct cpu_trace *tr)
+{
+	if (trace_all_cpus)
+		for_each_online_cpu(cpu) {
+			tr = cpu_traces + cpu;
+			tr->trace_idx = 0;
+			atomic_set(&tr->underrun, 0);
+			atomic_set(&tr->overrun, 0);
+		}
+	else {
+		tr->trace_idx = 0;
+		atomic_set(&tr->underrun, 0);
+		atomic_set(&tr->overrun, 0);
+	}
+}
+
+#ifdef CONFIG_CRITICAL_TIMING
+
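+/*
+ * Measure the critical section that started at
+ * tr->preempt_timestamp: if it qualifies (see report_latency()),
+ * feed it into the latency histogram and - unless a newer maximum
+ * was saved meanwhile - update preempt_max_latency and save the
+ * trace via update_max_tr():
+ */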
+static void notrace
+check_critical_timing(int cpu, struct cpu_trace *tr, unsigned long parent_eip)
+{
+	unsigned long latency, t0, t1;
+	cycle_t T0, T1, T2, delta;
+	unsigned long flags;
+
+	if (trace_user_triggered)
+		return;
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = tr->preempt_timestamp;
+	T1 = get_monotonic_cycles();
+	delta = T1-T0;
+
+	local_save_flags(flags);
+
+	if (!report_latency(delta))
+		goto out;
+
+	____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags);
+	/*
+	 * Update the timestamp, because the trace entry above
+	 * might have changed it (it can only get larger, so it
+	 * is still fair to report the latency):
+	 */
+	T2 = get_monotonic_cycles();
+
+	delta = T2-T0;
+
+	latency = cycles_to_usecs(delta);
+	latency_hist(tr->latency_type, cpu, latency);
+
+	if (latency_hist_flag) {
+		if (preempt_max_latency >= delta)
+			goto out;
+	}
+
+	if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex))
+		goto out;
+
+#ifndef CONFIG_CRITICAL_LATENCY_HIST
+	if (!preempt_thresh && preempt_max_latency > delta) {
+		printk("bug: updating %016Lx > %016Lx?\n",
+			preempt_max_latency, delta);
+		printk("  [%016Lx %016Lx %016Lx]\n", T0, T1, T2);
+	}
+#endif
+
+	preempt_max_latency = delta;
+	t0 = cycles_to_usecs(T0);
+	t1 = cycles_to_usecs(T1);
+
+	tr->critical_end = parent_eip;
+
+	update_max_tr(tr);
+
+#ifndef CONFIG_CRITICAL_LATENCY_HIST
+	if (preempt_thresh)
+		printk("(%16s-%-5d|#%d): %lu us critical section "
+			"violates %lu us threshold.\n"
+			" => started at timestamp %lu: ",
+				current->comm, current->pid,
+				raw_smp_processor_id(),
+				latency, cycles_to_usecs(preempt_thresh), t0);
+	else
+		printk("(%16s-%-5d|#%d): new %lu us maximum-latency "
+			"critical section.\n => started at timestamp %lu: ",
+				current->comm, current->pid,
+				raw_smp_processor_id(),
+				latency, t0);
+
+	print_symbol("<%s>\n", tr->critical_start);
+	printk(" =>   ended at timestamp %lu: ", t1);
+	print_symbol("<%s>\n", tr->critical_end);
+	dump_stack();
+	t1 = cycles_to_usecs(get_monotonic_cycles());
+	printk(" =>   dump-end timestamp %lu\n\n", t1);
+#endif
+
+	max_sequence++;
+
+	up(&max_mutex);
+
+out:
+	tr->critical_sequence = max_sequence;
+	tr->preempt_timestamp = get_monotonic_cycles();
+	tr->early_warning = 0;
+	reset_trace_idx(cpu, tr);
+	_trace_cmdline(cpu, tr);
+	____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags);
+}
+
+void notrace touch_critical_timing(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct cpu_trace *tr = cpu_traces + cpu;
+
+	if (!tr->critical_start || atomic_read(&tr->disabled) ||
+			trace_user_triggered || wakeup_timing)
+		return;
+
+	if (preempt_count() > 0 && tr->critical_start) {
+		atomic_inc(&tr->disabled);
+		check_critical_timing(cpu, tr, CALLER_ADDR0);
+		tr->critical_start = CALLER_ADDR0;
+		tr->critical_sequence = max_sequence;
+		atomic_dec(&tr->disabled);
+	}
+}
+EXPORT_SYMBOL(touch_critical_timing);
+
+void notrace stop_critical_timing(void)
+{
+	struct cpu_trace *tr = cpu_traces + raw_smp_processor_id();
+
+	tr->critical_start = 0;
+}
+EXPORT_SYMBOL(stop_critical_timing);
+
+static inline void notrace
+__start_critical_timing(unsigned long eip, unsigned long parent_eip,
+			int latency_type)
+{
+	int cpu = raw_smp_processor_id();
+	struct cpu_trace *tr = cpu_traces + cpu;
+	unsigned long flags;
+
+	if (tr->critical_start || atomic_read(&tr->disabled) ||
+			trace_user_triggered || wakeup_timing)
+		return;
+
+	atomic_inc(&tr->disabled);
+
+	tr->critical_sequence = max_sequence;
+	tr->preempt_timestamp = get_monotonic_cycles();
+	tr->critical_start = eip;
+	reset_trace_idx(cpu, tr);
+	tr->latency_type = latency_type;
+	_trace_cmdline(cpu, tr);
+
+	local_save_flags(flags);
+	____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags);
+
+	atomic_dec(&tr->disabled);
+}
+
+static inline void notrace
+__stop_critical_timing(unsigned long eip, unsigned long parent_eip)
+{
+	int cpu = raw_smp_processor_id();
+	struct cpu_trace *tr = cpu_traces + cpu;
+	unsigned long flags;
+
+	if (!tr->critical_start || atomic_read(&tr->disabled) ||
+			trace_user_triggered || wakeup_timing)
+		return;
+
+	atomic_inc(&tr->disabled);
+	local_save_flags(flags);
+	____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags);
+	check_critical_timing(cpu, tr, eip);
+	tr->critical_start = 0;
+	atomic_dec(&tr->disabled);
+}
+
+#endif
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+
+#ifdef CONFIG_LOCKDEP
+
+void notrace time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (!irqs_off_preempt_count() && irqs_disabled_flags(flags))
+		__stop_critical_timing(a0, a1);
+}
+
+void notrace time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (!irqs_off_preempt_count() && irqs_disabled_flags(flags))
+		__start_critical_timing(a0, a1, INTERRUPT_LATENCY);
+}
+
+#else /* !CONFIG_LOCKDEP */
+
+/*
+ * Dummy:
+ */
+
+void early_boot_irqs_off(void)
+{
+}
+
+void early_boot_irqs_on(void)
+{
+}
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off events:
+ */
+void notrace trace_hardirqs_on(void)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (!irqs_off_preempt_count() && irqs_disabled_flags(flags))
+		__stop_critical_timing(CALLER_ADDR0, 0 /* CALLER_ADDR1 */);
+}
+
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void notrace trace_hardirqs_off(void)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (!irqs_off_preempt_count() && irqs_disabled_flags(flags))
+		__start_critical_timing(CALLER_ADDR0, 0 /* CALLER_ADDR1 */,
+					INTERRUPT_LATENCY);
+}
+
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+#endif /* !CONFIG_LOCKDEP */
+
+#endif /* CONFIG_CRITICAL_IRQSOFF_TIMING */
+
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING)
+
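+/*
+ * Find the interesting return address for the preempt-count
+ * tracepoints: skip over the spinlock helper functions so the
+ * trace points at their caller instead:
+ */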
+static inline unsigned long get_parent_eip(void)
+{
+	unsigned long parent_eip = CALLER_ADDR1;
+
+	if (in_lock_functions(parent_eip)) {
+		parent_eip = CALLER_ADDR2;
+		if (in_lock_functions(parent_eip))
+			parent_eip = CALLER_ADDR3;
+	}
+
+	return parent_eip;
+}
+
+void notrace add_preempt_count(unsigned int val)
+{
+	unsigned long eip = CALLER_ADDR0;
+	unsigned long parent_eip = get_parent_eip();
+
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	if (DEBUG_WARN_ON(((int)preempt_count() < 0)))
+		return;
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	if (DEBUG_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10))
+		return;
+#endif
+
+	preempt_count() += val;
+#ifdef CONFIG_PREEMPT_TRACE
+	if (val <= 10) {
+		unsigned int idx = preempt_count() & PREEMPT_MASK;
+		if (idx < MAX_PREEMPT_TRACE) {
+			current->preempt_trace_eip[idx] = eip;
+			current->preempt_trace_parent_eip[idx] = parent_eip;
+		}
+	}
+#endif
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+	{
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+		unsigned long flags;
+
+		local_save_flags(flags);
+
+		if (!irqs_disabled_flags(flags))
+#endif
+			if (preempt_count() == val)
+				__start_critical_timing(eip, parent_eip,
+							PREEMPT_LATENCY);
+	}
+#endif
+	(void)eip, (void)parent_eip;
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void notrace sub_preempt_count(unsigned int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	if (DEBUG_WARN_ON(unlikely(val > preempt_count())))
+		return;
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	if (DEBUG_WARN_ON((val < PREEMPT_MASK) &&
+			  !(preempt_count() & PREEMPT_MASK)))
+		return;
+#endif
+
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+	{
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+		unsigned long flags;
+
+		local_save_flags(flags);
+
+		if (!irqs_disabled_flags(flags))
+#endif
+			if (preempt_count() == val)
+				__stop_critical_timing(CALLER_ADDR0,
+						       CALLER_ADDR1);
+	}
+#endif
+	preempt_count() -= val;
+}
+
+EXPORT_SYMBOL(sub_preempt_count);
+
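+/*
+ * mask/unmask_preempt_count(): like add/sub_preempt_count(), but
+ * set or clear a whole mask of preempt-count bits at once, with
+ * the same critical-section timing hooks:
+ */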
+void notrace mask_preempt_count(unsigned int mask)
+{
+	unsigned long eip = CALLER_ADDR0;
+	unsigned long parent_eip = get_parent_eip();
+
+	preempt_count() |= mask;
+
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+	{
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+		unsigned long flags;
+
+		local_save_flags(flags);
+
+		if (!irqs_disabled_flags(flags))
+#endif
+			if (preempt_count() == mask)
+				__start_critical_timing(eip, parent_eip,
+							PREEMPT_LATENCY);
+	}
+#endif
+	(void) eip, (void) parent_eip;
+}
+EXPORT_SYMBOL(mask_preempt_count);
+
+void notrace unmask_preempt_count(unsigned int mask)
+{
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+	{
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+		unsigned long flags;
+
+		local_save_flags(flags);
+
+		if (!irqs_disabled_flags(flags))
+#endif
+			if (preempt_count() == mask)
+				__stop_critical_timing(CALLER_ADDR0,
+						       CALLER_ADDR1);
+	}
+#endif
+	preempt_count() &= ~mask;
+}
+EXPORT_SYMBOL(unmask_preempt_count);
+
+
+#endif
+
+/*
+ * Wakeup latency timing/tracing. We get upcalls from the scheduler
+ * when a task is being woken up and we time/trace it until it gets
+ * to a CPU - or until an even higher-prio task supersedes it. (In
+ * that case we throw away the currently traced task - we don't try
+ * to handle nesting, which simplifies things significantly.)
+ */
+#ifdef CONFIG_WAKEUP_TIMING
+
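+/*
+ * Called with sch.trace_lock held and interrupts disabled once the
+ * woken-up task reaches a CPU: measure the wakeup latency and save
+ * a new maximum trace if it qualifies. Note that it drops the
+ * trace_lock and restores *flags itself before returning.
+ */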
+static void notrace
+check_wakeup_timing(struct cpu_trace *tr, unsigned long parent_eip,
+		    unsigned long *flags)
+{
+	int cpu = raw_smp_processor_id();
+	unsigned long latency, t0, t1;
+	cycle_t T0, T1, delta;
+
+	if (trace_user_triggered)
+		return;
+
+	atomic_inc(&tr->disabled);
+	if (atomic_read(&tr->disabled) != 1)
+		goto out;
+
+	T0 = tr->preempt_timestamp;
+	T1 = get_monotonic_cycles();
+	/*
+	 * Any wraparound or time warp and we are out:
+	 */
+	if (T0 > T1)
+		goto out;
+	delta = T1-T0;
+
+	if (!report_latency(delta))
+		goto out;
+
+	____trace(smp_processor_id(), TRACE_FN, tr, CALLER_ADDR0, parent_eip,
+		  0, 0, 0, *flags);
+
+	latency = cycles_to_usecs(delta);
+	latency_hist(tr->latency_type, cpu, latency);
+
+	if (latency_hist_flag) {
+		if (preempt_max_latency >= delta)
+			goto out;
+	}
+
+	if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex))
+		goto out;
+
+#ifndef CONFIG_WAKEUP_LATENCY_HIST
+	if (!preempt_thresh && preempt_max_latency > delta) {
+		printk("bug2: updating %016Lx > %016Lx?\n",
+			preempt_max_latency, delta);
+		printk("  [%016Lx %016Lx]\n", T0, T1);
+	}
+#endif
+
+	preempt_max_latency = delta;
+	t0 = cycles_to_usecs(T0);
+	t1 = cycles_to_usecs(T1);
+	tr->critical_end = parent_eip;
+
+	update_max_tr(tr);
+
+	atomic_dec(&tr->disabled);
+	__raw_spin_unlock(&sch.trace_lock);
+	local_irq_restore(*flags);
+
+#ifndef CONFIG_WAKEUP_LATENCY_HIST
+	if (preempt_thresh)
+		printk("(%16s-%-5d|#%d): %lu us wakeup latency "
+			"violates %lu us threshold.\n",
+				current->comm, current->pid,
+				raw_smp_processor_id(), latency,
+				cycles_to_usecs(preempt_thresh));
+	else
+		printk("(%16s-%-5d|#%d): new %lu us maximum-latency "
+			"wakeup.\n", current->comm, current->pid,
+				raw_smp_processor_id(), latency);
+#endif
+
+	max_sequence++;
+
+	up(&max_mutex);
+
+	return;
+
+out:
+	atomic_dec(&tr->disabled);
+	__raw_spin_unlock(&sch.trace_lock);
+	local_irq_restore(*flags);
+}
+
+/*
+ * Start wakeup latency tracing - called with the runqueue lock
+ * held and interrupts disabled:
+ */
+void __trace_start_sched_wakeup(struct task_struct *p)
+{
+	struct cpu_trace *tr;
+	int cpu;
+
+	if (trace_user_triggered || !wakeup_timing) {
+		trace_special_pid(p->pid, p->prio, -1);
+		return;
+	}
+
+	__raw_spin_lock(&sch.trace_lock);
+	if (sch.task && (sch.task->prio <= p->prio))
+		goto out_unlock;
+
+	/*
+	 * New highest-prio task just woke up - start tracing:
+	 */
+	sch.task = p;
+	cpu = task_cpu(p);
+	sch.cpu = cpu;
+	/*
+	 * We keep using this CPU's trace buffer even if the task
+	 * gets migrated to another CPU. Tracing only happens on
+	 * the CPU that 'owns' the highest-prio task so it's
+	 * fundamentally single-threaded.
+	 */
+	sch.tr = tr = cpu_traces + cpu;
+	reset_trace_idx(cpu, tr);
+
+//	if (!atomic_read(&tr->disabled)) {
+		atomic_inc(&tr->disabled);
+		tr->critical_sequence = max_sequence;
+		tr->preempt_timestamp = get_monotonic_cycles();
+		tr->latency_type = WAKEUP_LATENCY;
+		tr->critical_start = CALLER_ADDR0;
+		_trace_cmdline(raw_smp_processor_id(), tr);
+		atomic_dec(&tr->disabled);
+//	}
+
+	mcount();
+	trace_special_pid(p->pid, p->prio, cpu);
+	trace_special_sym();
+out_unlock:
+	__raw_spin_unlock(&sch.trace_lock);
+}
+
+void trace_stop_sched_switched(struct task_struct *p)
+{
+	struct cpu_trace *tr;
+	unsigned long flags;
+
+	if (trace_user_triggered || !wakeup_timing)
+		return;
+
+	local_irq_save(flags);
+	__raw_spin_lock(&sch.trace_lock);
+	if (p == sch.task) {
+		trace_special_pid(p->pid, p->prio, task_cpu(p));
+
+		sch.task = NULL;
+		tr = sch.tr;
+		sch.tr = NULL;
+		WARN_ON(!tr);
+		/* auto-unlocks the spinlock: */
+		check_wakeup_timing(tr, CALLER_ADDR0, &flags);
+	} else {
+		if (sch.task)
+			trace_special_pid(sch.task->pid, sch.task->prio,
+					  p->prio);
+		if (sch.task && (sch.task->prio >= p->prio))
+			sch.task = NULL;
+		__raw_spin_unlock(&sch.trace_lock);
+	}
+	local_irq_restore(flags);
+}
+
+void trace_change_sched_cpu(struct task_struct *p, int new_cpu)
+{
+	unsigned long flags;
+
+	if (!wakeup_timing)
+		return;
+
+	trace_special_pid(p->pid, task_cpu(p), new_cpu);
+	trace_special_sym();
+	local_irq_save(flags);
+	__raw_spin_lock(&sch.trace_lock);
+	if (p == sch.task && task_cpu(p) != new_cpu) {
+		sch.cpu = new_cpu;
+		trace_special(task_cpu(p), new_cpu, 0);
+	}
+	__raw_spin_unlock(&sch.trace_lock);
+	local_irq_restore(flags);
+}
+
+#endif
+
+#ifdef CONFIG_EVENT_TRACE
+
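+/*
+ * Begin a user-triggered trace section: reset the current CPU's
+ * trace buffer and record the start timestamp. Only valid if the
+ * trace_user_triggered sysctl is set (and the crash-print and
+ * function-print modes are not active):
+ */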
+long user_trace_start(void)
+{
+	struct cpu_trace *tr;
+	unsigned long flags;
+	int cpu;
+
+	if (!trace_user_triggered || trace_print_on_crash || print_functions)
+		return -EINVAL;
+
+	/*
+	 * If the user has not yet reset the max latency after
+	 * bootup then we assume that this was the intention
+	 * (we won't get any tracing done otherwise):
+	 */
+	if (preempt_max_latency == (cycle_t)ULONG_MAX)
+		preempt_max_latency = 0;
+
+	/*
+	 * user_trace_start() might be called from hardirq
+	 * context, if trace_user_trigger_irq is set, so
+	 * be careful about locking:
+	 */
+	if (preempt_count() || irqs_disabled()) {
+		if (down_trylock(&max_mutex))
+			return -EAGAIN;
+	} else
+		down(&max_mutex);
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	tr = cpu_traces + cpu;
+
+#ifdef CONFIG_WAKEUP_TIMING
+	if (wakeup_timing) {
+		__raw_spin_lock(&sch.trace_lock);
+		sch.task = current;
+		sch.cpu = cpu;
+		sch.tr = tr;
+		__raw_spin_unlock(&sch.trace_lock);
+	}
+#endif
+	reset_trace_idx(cpu, tr);
+
+	tr->critical_sequence = max_sequence;
+	tr->preempt_timestamp = get_monotonic_cycles();
+	tr->critical_start = CALLER_ADDR0;
+	_trace_cmdline(cpu, tr);
+	mcount();
+
+	WARN_ON(!irqs_disabled());
+	local_irq_restore(flags);
+
+	up(&max_mutex);
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(user_trace_start);
+
+long user_trace_stop(void)
+{
+	unsigned long latency = 0, flags;
+	struct cpu_trace *tr;
+	cycle_t delta;
+
+	if (!trace_user_triggered || trace_print_on_crash || print_functions)
+		return -EINVAL;
+
+	local_irq_save(flags);
+	mcount();
+
+#ifdef CONFIG_WAKEUP_TIMING
+	if (wakeup_timing) {
+		struct task_struct *t;
+
+		__raw_spin_lock(&sch.trace_lock);
+#if 0
+		t = sch.task;
+		if (current != t) {
+			__raw_spin_unlock(&sch.trace_lock);
+			local_irq_restore(flags);
+			printk("wrong stop: curr: %s/%d[%d] => %p\n",
+			       current->comm, current->pid,
+			       task_thread_info(current)->cpu, t);
+			if (t)
+				printk("wrong stop: curr: %s/%d[%d]\n",
+				       t->comm, t->pid, task_thread_info(t)->cpu);
+			return -EINVAL;
+		}
+#endif
+		sch.task = NULL;
+		tr = sch.tr;
+		sch.tr = NULL;
+		__raw_spin_unlock(&sch.trace_lock);
+	} else
+#endif
+		tr = cpu_traces + smp_processor_id();
+
+	atomic_inc(&tr->disabled);
+	if (tr->preempt_timestamp) {
+		cycle_t T0, T1;
+		unsigned long long tmp0;
+
+		T0 = tr->preempt_timestamp;
+		T1 = get_monotonic_cycles();
+		tmp0 = preempt_max_latency;
+		if (T1 < T0)
+			T0 = T1;
+		delta = T1 - T0;
+		if (!report_latency(delta))
+			goto out;
+		if (tr->critical_sequence != max_sequence ||
+						down_trylock(&max_mutex))
+			goto out;
+
+		WARN_ON(!preempt_thresh && preempt_max_latency > delta);
+
+		preempt_max_latency = delta;
+		update_max_tr(tr);
+
+		latency = cycles_to_usecs(delta);
+
+		max_sequence++;
+		up(&max_mutex);
+out:
+		tr->preempt_timestamp = 0;
+	}
+	atomic_dec(&tr->disabled);
+	local_irq_restore(flags);
+
+	if (latency) {
+		if (preempt_thresh)
+			printk("(%16s-%-5d|#%d): %lu us user-latency "
+				"violates %lu us threshold.\n",
+					current->comm, current->pid,
+					raw_smp_processor_id(), latency,
+					cycles_to_usecs(preempt_thresh));
+		else
+			printk("(%16s-%-5d|#%d): new %lu us user-latency.\n",
+				current->comm, current->pid,
+					raw_smp_processor_id(), latency);
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL(user_trace_stop);
+
+static int trace_print_cpu = -1;
+
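+/*
+ * Freeze tracing at crash time (when trace_print_on_crash is set)
+ * and remember which CPU did so, so that print_last_trace() only
+ * dumps that CPU's buffer:
+ */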
+void notrace stop_trace(void)
+{
+	if (trace_print_on_crash && trace_print_cpu == -1) {
+		trace_enabled = -1;
+		trace_print_cpu = raw_smp_processor_id();
+	}
+}
+
+EXPORT_SYMBOL(stop_trace);
+
+static void print_entry(struct trace_entry *entry, struct trace_entry *entry0)
+{
+	unsigned long abs_usecs;
+	int hardirq, softirq;
+
+	abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp);
+
+	printk("%-5d ", entry->pid);
+
+	printk("%d%c%c",
+		entry->cpu,
+		(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+		(entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.',
+		(entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' :
+ 		((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (hardirq && softirq)
+		printk("H");
+	else {
+		if (hardirq)
+			printk("h");
+		else {
+			if (softirq)
+				printk("s");
+			else
+				printk(".");
+		}
+	}
+
+	if (entry->preempt_count)
+		printk(":%x ", entry->preempt_count);
+	else
+		printk(":. ");
+
+	printk("%ld.%03ldms: ", abs_usecs/1000, abs_usecs % 1000);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		printk_name(entry->u.fn.eip);
+		printk("  <= (");
+		printk_name(entry->u.fn.parent_eip);
+		printk(")\n");
+		break;
+	case TRACE_SPECIAL:
+		printk(" special: %lx %lx %lx\n",
+		       entry->u.special.v1, entry->u.special.v2,
+		       entry->u.special.v3);
+		break;
+	case TRACE_SPECIAL_U64:
+		printk("  spec64: %lx%08lx %lx\n",
+		       entry->u.special.v1, entry->u.special.v2,
+		       entry->u.special.v3);
+		break;
+	}
+}
+
+/*
+ * Print the current trace at crash time.
+ *
+ * We print it backwards, so that the newest (most interesting) entries
+ * are printed first.
+ */
+void print_last_trace(void)
+{
+	unsigned int idx0, idx, i, cpu;
+	struct cpu_trace *tr;
+	struct trace_entry *entry0, *entry;
+
+	preempt_disable();
+	cpu = smp_processor_id();
+	if (trace_enabled != -1 || trace_print_cpu != cpu ||
+						!trace_print_on_crash) {
+		if (trace_print_on_crash)
+			printk("skipping trace printing on CPU#%d != %d\n",
+				cpu, trace_print_cpu);
+		preempt_enable();
+		return;
+	}
+
+	trace_print_on_crash = 0;
+
+	tr = cpu_traces + cpu;
+	if (!tr->trace)
+		goto out;
+
+	printk("Last %ld trace entries:\n", MAX_TRACE);
+	idx0 = tr->trace_idx;
+	printk("curr idx: %d\n", idx0);
+	if (idx0 >= MAX_TRACE)
+		idx0 = 0;
+	idx = idx0;
+	entry0 = tr->trace + idx0;
+
+	for (i = 0; i < MAX_TRACE; i++) {
+		if (idx == 0)
+			idx = MAX_TRACE-1;
+		else
+			idx--;
+		entry = tr->trace + idx;
+		switch (entry->type) {
+		case TRACE_FN:
+		case TRACE_SPECIAL:
+		case TRACE_SPECIAL_U64:
+			print_entry(entry, entry0);
+			break;
+		}
+	}
+	printk("printed %ld entries\n", MAX_TRACE);
+out:
+	preempt_enable();
+}
+
+#ifdef CONFIG_SMP
+/*
+ * On SMP, try to 'peek' on other CPU's traces and record them
+ * in this CPU's trace. This way we get a rough idea about what's
+ * going on there, without the overhead of global tracing.
+ *
+ * (no need to make this PER_CPU, we bounce it around anyway.)
+ */
+unsigned long nmi_eips[NR_CPUS];
+unsigned long nmi_flags[NR_CPUS];
+
+void notrace nmi_trace(unsigned long eip, unsigned long parent_eip,
+			unsigned long flags)
+{
+	int cpu, this_cpu = smp_processor_id();
+
+	__trace(eip, parent_eip);
+
+	nmi_eips[this_cpu] = parent_eip;
+	nmi_flags[this_cpu] = flags;
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		if (cpu_online(cpu) && cpu != this_cpu) {
+			__trace(eip, nmi_eips[cpu]);
+			__trace(eip, nmi_flags[cpu]);
+		}
+}
+#else
+/*
+ * On UP, NMI tracing is quite simple:
+ */
+void notrace nmi_trace(unsigned long eip, unsigned long parent_eip,
+			unsigned long flags)
+{
+	__trace(eip, parent_eip);
+}
+#endif
+
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACE
+
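+/*
+ * Dump the per-task record of who incremented the preempt count:
+ * one line per nesting level, showing the call site and its caller
+ * (as recorded by add_preempt_count()):
+ */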
+static void print_preempt_trace(struct task_struct *task)
+{
+	unsigned int count = task_thread_info(task)->preempt_count;
+	unsigned int i, lim = count & PREEMPT_MASK;
+	unsigned int i, lim = count & PREEMPT_MASK;
+
+		lim = MAX_PREEMPT_TRACE-1;
+	printk("---------------------------\n");
+	printk("| preempt count: %08x ]\n", count);
+	printk("| %d-level deep critical section nesting:\n", lim);
+	printk("----------------------------------------\n");
+	for (i = 1; i <= lim; i++) {
+		printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]);
+		print_symbol("%s\n", task->preempt_trace_eip[i]);
+		printk(".....[<%08lx>] ..   ( <= ",
+				task->preempt_trace_parent_eip[i]);
+		print_symbol("%s)\n", task->preempt_trace_parent_eip[i]);
+	}
+	printk("\n");
+}
+
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_EVENT_TRACE)
+void print_traces(struct task_struct *task)
+{
+	if (!task)
+		task = current;
+
+#ifdef CONFIG_PREEMPT_TRACE
+	print_preempt_trace(task);
+#endif
+#ifdef CONFIG_EVENT_TRACE
+	print_last_trace();
+#endif
+}
+#endif
+
+#ifdef CONFIG_EVENT_TRACE
+/*
+ * Allocate all the per-CPU trace buffers and the
+ * save-maximum/save-output staging buffers:
+ */
+void __init init_tracer(void)
+{
+	unsigned long size, total_size = 0;
+	struct trace_entry *array;
+	struct cpu_trace *tr;
+	int cpu;
+
+	printk("num_possible_cpus(): %d\n", num_possible_cpus());
+
+	size = sizeof(struct trace_entry)*MAX_TRACE;
+
+	for_each_possible_cpu(cpu) {
+		tr = cpu_traces + cpu;
+		array = alloc_bootmem(size);
+		if (!array) {
+			printk(KERN_ERR
+			"CPU#%d: failed to allocate %ld bytes trace buffer!\n",
+				cpu, size);
+		} else {
+			printk(KERN_INFO
+			"CPU#%d: allocated %ld bytes trace buffer.\n",
+				cpu, size);
+			total_size += size;
+		}
+		tr->cpu = cpu;
+		tr->trace = array;
+
+		array = alloc_bootmem(size);
+		if (!array) {
+			printk(KERN_ERR
+			"CPU#%d: failed to allocate %ld bytes max-trace buffer!\n",
+				cpu, size);
+		} else {
+			printk(KERN_INFO
+			"CPU#%d: allocated %ld bytes max-trace buffer.\n",
+				cpu, size);
+			total_size += size;
+		}
+		max_tr.traces[cpu].trace = array;
+	}
+
+	/*
+	 * The output trace buffer is a special one that only has
+	 * trace entries for the first cpu-trace structure:
+	 */
+	size = sizeof(struct trace_entry)*MAX_TRACE*num_possible_cpus();
+	array = alloc_bootmem(size);
+	if (!array) {
+		printk(KERN_ERR
+			"failed to allocate %ld bytes out-trace buffer!\n",
+			size);
+	} else {
+		printk(KERN_INFO "allocated %ld bytes out-trace buffer.\n",
+			size);
+		total_size += size;
+	}
+	out_tr.traces[0].trace = array;
+	printk(KERN_INFO
+		"tracer: a total of %ld bytes allocated.\n",
+		total_size);
+}
+#endif
Index: linux-2.6.22-rc2/kernel/lockdep.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/lockdep.c	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/lockdep.c	2007-05-24 15:57:42.000000000 +0200
@@ -166,14 +166,14 @@ static struct list_head chainhash_table[
 	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
 	(key2))
 
-void lockdep_off(void)
+void notrace lockdep_off(void)
 {
 	current->lockdep_recursion++;
 }
 
 EXPORT_SYMBOL(lockdep_off);
 
-void lockdep_on(void)
+void notrace lockdep_on(void)
 {
 	current->lockdep_recursion--;
 }
@@ -706,7 +706,7 @@ find_usage_forwards(struct lock_class *s
  * Return 1 otherwise and keep <backwards_match> unchanged.
  * Return 0 on error.
  */
-static noinline int
+static noinline notrace int
 find_usage_backwards(struct lock_class *source, unsigned int depth)
 {
 	struct lock_list *entry;
@@ -1386,7 +1386,7 @@ cache_hit:
  * We are building curr_chain_key incrementally, so double-check
  * it from scratch, to make sure that it's done correctly:
  */
-static void check_chain_key(struct task_struct *curr)
+static void notrace check_chain_key(struct task_struct *curr)
 {
 #ifdef CONFIG_DEBUG_LOCKDEP
 	struct held_lock *hlock, *prev_hlock = NULL;
@@ -1573,8 +1573,8 @@ valid_state(struct task_struct *curr, st
 /*
  * Mark a lock with a usage bit, and validate the state transition:
  */
-static int mark_lock(struct task_struct *curr, struct held_lock *this,
-		     enum lock_usage_bit new_bit)
+static int notrace mark_lock(struct task_struct *curr, struct held_lock *this,
+			     enum lock_usage_bit new_bit)
 {
 	unsigned int new_mask = 1 << new_bit, ret = 1;
 
@@ -1781,6 +1781,7 @@ static int mark_lock(struct task_struct 
 	 * We must printk outside of the graph_lock:
 	 */
 	if (ret == 2) {
+		user_trace_stop();
 		printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
 		print_lock(this);
 		print_irqtrace_events(curr);
@@ -1794,7 +1795,7 @@ static int mark_lock(struct task_struct 
 /*
  * Mark all held locks with a usage bit:
  */
-static int
+static int notrace
 mark_held_locks(struct task_struct *curr, int hardirq)
 {
 	enum lock_usage_bit usage_bit;
@@ -1841,7 +1842,7 @@ void early_boot_irqs_on(void)
 /*
  * Hardirqs will be enabled:
  */
-void trace_hardirqs_on(void)
+void notrace trace_hardirqs_on(void)
 {
 	struct task_struct *curr = current;
 	unsigned long ip;
@@ -1882,6 +1883,9 @@ void trace_hardirqs_on(void)
 	curr->hardirq_enable_ip = ip;
 	curr->hardirq_enable_event = ++curr->irq_events;
 	debug_atomic_inc(&hardirqs_on_events);
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+	time_hardirqs_on(CALLER_ADDR0, 0 /* CALLER_ADDR1 */);
+#endif
 }
 
 EXPORT_SYMBOL(trace_hardirqs_on);
@@ -1889,7 +1893,7 @@ EXPORT_SYMBOL(trace_hardirqs_on);
 /*
  * Hardirqs were disabled:
  */
-void trace_hardirqs_off(void)
+void notrace trace_hardirqs_off(void)
 {
 	struct task_struct *curr = current;
 
@@ -1907,6 +1911,9 @@ void trace_hardirqs_off(void)
 		curr->hardirq_disable_ip = _RET_IP_;
 		curr->hardirq_disable_event = ++curr->irq_events;
 		debug_atomic_inc(&hardirqs_off_events);
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+		time_hardirqs_off(CALLER_ADDR0, 0 /* CALLER_ADDR1 */);
+#endif
 	} else
 		debug_atomic_inc(&redundant_hardirqs_off);
 }
@@ -2404,7 +2411,7 @@ __lock_release(struct lockdep_map *lock,
 /*
  * Check whether we follow the irq-flags state precisely:
  */
-static void check_flags(unsigned long flags)
+static notrace void check_flags(unsigned long flags)
 {
 #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS)
 	if (!debug_locks)
@@ -2436,8 +2443,9 @@ static void check_flags(unsigned long fl
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
  */
-void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
-		  int trylock, int read, int check, unsigned long ip)
+void notrace
+lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+	     int trylock, int read, int check, unsigned long ip)
 {
 	unsigned long flags;
 
@@ -2445,9 +2453,9 @@ void lock_acquire(struct lockdep_map *lo
 		return;
 
 	raw_local_irq_save(flags);
+	current->lockdep_recursion = 1;
 	check_flags(flags);
 
-	current->lockdep_recursion = 1;
 	__lock_acquire(lock, subclass, trylock, read, check,
 		       irqs_disabled_flags(flags), ip);
 	current->lockdep_recursion = 0;
@@ -2456,7 +2464,8 @@ void lock_acquire(struct lockdep_map *lo
 
 EXPORT_SYMBOL_GPL(lock_acquire);
 
-void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+void notrace
+lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 {
 	unsigned long flags;
 
@@ -2464,8 +2473,8 @@ void lock_release(struct lockdep_map *lo
 		return;
 
 	raw_local_irq_save(flags);
-	check_flags(flags);
 	current->lockdep_recursion = 1;
+	check_flags(flags);
 	__lock_release(lock, nested, ip);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
Index: linux-2.6.22-rc2/kernel/panic.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/panic.c	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/kernel/panic.c	2007-05-24 15:57:42.000000000 +0200
@@ -66,6 +66,8 @@ NORET_TYPE void panic(const char * fmt, 
         unsigned long caller = (unsigned long) __builtin_return_address(0);
 #endif
 
+	stop_trace();
+
 	/*
 	 * It's possible to come here directly from a panic-assertion and not
 	 * have preempt disabled. Some functions called from here want
Index: linux-2.6.22-rc2/kernel/printk.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/printk.c	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/printk.c	2007-05-24 15:57:42.000000000 +0200
@@ -324,12 +324,29 @@ static void __call_console_drivers(unsig
 {
 	struct console *con;
 
+	touch_critical_timing();
 	for (con = console_drivers; con; con = con->next) {
 		if ((con->flags & CON_ENABLED) && con->write &&
 				(cpu_online(smp_processor_id()) ||
-				(con->flags & CON_ANYTIME)))
+				(con->flags & CON_ANYTIME))) {
+			/*
+			 * Disable tracing of printk details - it just
+			 * clobbers the trace output with lots of
+			 * repetitive lines (especially if console is
+			 * on a serial line):
+			 */
+#ifdef CONFIG_EVENT_TRACE
+			int trace_save = trace_enabled;
+
+			trace_enabled = 0;
+			con->write(con, &LOG_BUF(start), end - start);
+			trace_enabled = trace_save;
+#else
 			con->write(con, &LOG_BUF(start), end - start);
+#endif
+		}
 	}
+	touch_critical_timing();
 }
 
 static int __read_mostly ignore_loglevel;
Index: linux-2.6.22-rc2/kernel/sched.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/sched.c	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/sched.c	2007-05-24 15:57:42.000000000 +0200
@@ -76,6 +76,10 @@ unsigned long long __attribute__((weak))
 #define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
 #define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
 
+#define __PRIO(prio) \
+	((prio) <= 99 ? 199 - (prio) : (prio) - 120)
+
+#define PRIO(p) __PRIO((p)->prio)
 /*
  * 'User priority' is the nice value converted to something we
  * can work with better when scaling various scheduler parameters,
@@ -1043,6 +1047,12 @@ static void deactivate_task(struct task_
 	p->array = NULL;
 }
 
+static inline void trace_start_sched_wakeup(struct task_struct *p, struct rq *rq)
+{
+	if (p != rq->curr)
+		__trace_start_sched_wakeup(p);
+}
+
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -1060,6 +1070,8 @@ static void resched_task(struct task_str
 {
 	int cpu;
 
+	trace_start_sched_wakeup(p, task_rq(p));
+
 	assert_spin_locked(&task_rq(p)->lock);
 
 	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
@@ -1090,6 +1102,8 @@ static void resched_cpu(int cpu)
 #else
 static inline void resched_task(struct task_struct *p)
 {
+	trace_start_sched_wakeup(p, task_rq(p));
+
 	assert_spin_locked(&task_rq(p)->lock);
 	set_tsk_need_resched(p);
 }
@@ -1615,14 +1629,19 @@ out:
 
 int fastcall wake_up_process(struct task_struct *p)
 {
-	return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
+	int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
 				 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
+	mcount();
+	return ret;
 }
 EXPORT_SYMBOL(wake_up_process);
 
 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
 {
-	return try_to_wake_up(p, state, 0);
+	int ret = try_to_wake_up(p, state, 0);
+
+	mcount();
+	return ret;
 }
 
 static void task_running_tick(struct rq *rq, struct task_struct *p);
@@ -1860,6 +1879,7 @@ static inline void finish_task_switch(st
 	prev_state = prev->state;
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
+	trace_stop_sched_switched(current);
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
@@ -1930,6 +1950,8 @@ context_switch(struct rq *rq, struct tas
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 #endif
 
+	trace_cmdline();
+
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 
@@ -3461,41 +3483,39 @@ void scheduler_tick(void)
 #endif
 }
 
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_EVENT_TRACE) && defined(CONFIG_DEBUG_RT_MUTEXES)
 
-void fastcall add_preempt_count(int val)
+static void trace_array(struct prio_array *array)
 {
-	/*
-	 * Underflow?
-	 */
-	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
-		return;
-	preempt_count() += val;
-	/*
-	 * Spinlock count overflowing soon?
-	 */
-	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
-				PREEMPT_MASK - 10);
+	int i;
+	struct task_struct *p;
+	struct list_head *head, *tmp;
+
+	for (i = 0; i < MAX_RT_PRIO; i++) {
+		head = array->queue + i;
+		if (list_empty(head)) {
+			WARN_ON(test_bit(i, array->bitmap));
+			continue;
+		}
+		WARN_ON(!test_bit(i, array->bitmap));
+		list_for_each(tmp, head) {
+			p = list_entry(tmp, struct task_struct, run_list);
+			trace_special_pid(p->pid, p->prio, PRIO(p));
+		}
+	}
 }
-EXPORT_SYMBOL(add_preempt_count);
 
-void fastcall sub_preempt_count(int val)
+static inline void trace_all_runnable_tasks(struct rq *rq)
 {
-	/*
-	 * Underflow?
-	 */
-	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
-		return;
-	/*
-	 * Is the spinlock portion underflowing?
-	 */
-	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
-			!(preempt_count() & PREEMPT_MASK)))
-		return;
+	if (trace_enabled)
+		trace_array(rq->active);
+}
+
+#else
 
-	preempt_count() -= val;
+static inline void trace_all_runnable_tasks(struct rq *rq)
+{
 }
-EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
@@ -3641,6 +3661,8 @@ switch_tasks:
 		prev->sleep_avg = 0;
 	prev->timestamp = prev->last_ran = now;
 
+	trace_all_runnable_tasks(rq);
+
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
 		next->timestamp = next->last_ran = now;
@@ -3651,14 +3673,17 @@ switch_tasks:
 		prepare_task_switch(rq, next);
 		prev = context_switch(rq, prev, next);
 		barrier();
+		trace_special_pid(prev->pid, PRIO(prev), PRIO(current));
 		/*
 		 * this_rq must be evaluated again because prev may have moved
 		 * CPUs since it called schedule(), thus the 'rq' on its stack
 		 * frame will be invalid.
 		 */
 		finish_task_switch(this_rq(), prev);
-	} else
+	} else {
 		spin_unlock_irq(&rq->lock);
+		trace_stop_sched_switched(next);
+	}
 
 	prev = current;
 	if (unlikely(reacquire_kernel_lock(prev) < 0))
@@ -4108,6 +4133,7 @@ void rt_mutex_setprio(struct task_struct
 		} else if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
 	}
+
 	task_rq_unlock(rq, &flags);
 }
 
@@ -7055,6 +7081,7 @@ void __might_sleep(char *file, int line)
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
+		stop_trace();
 		printk(KERN_ERR "BUG: sleeping function called from invalid"
 				" context at %s:%d\n", file, line);
 		printk("in_atomic():%d, irqs_disabled():%d\n",
Index: linux-2.6.22-rc2/kernel/softlockup.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/softlockup.c	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/softlockup.c	2007-05-24 15:57:42.000000000 +0200
@@ -100,6 +100,7 @@ void softlockup_tick(void)
 	if (now > (touch_timestamp + 10)) {
 		per_cpu(print_timestamp, this_cpu) = touch_timestamp;
 
+		stop_trace();
 		spin_lock(&print_lock);
 		printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
 			this_cpu);
Index: linux-2.6.22-rc2/kernel/sysctl.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/sysctl.c	2007-05-22 16:59:43.000000000 +0200
+++ linux-2.6.22-rc2/kernel/sysctl.c	2007-05-24 15:57:42.000000000 +0200
@@ -29,6 +29,7 @@
 #include <linux/utsname.h>
 #include <linux/capability.h>
 #include <linux/smp_lock.h>
+#include <linux/clocksource.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/kobject.h>
@@ -43,6 +44,7 @@
 #include <linux/limits.h>
 #include <linux/dcache.h>
 #include <linux/syscalls.h>
+#include <linux/profile.h>
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
 
@@ -215,6 +217,132 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_WAKEUP_TIMING
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "wakeup_timing",
+		.data		= &wakeup_timing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#if defined(CONFIG_WAKEUP_TIMING) || defined(CONFIG_EVENT_TRACE)
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "preempt_max_latency",
+		.data		= &preempt_max_latency,
+		.maxlen		= sizeof(preempt_max_latency),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "preempt_thresh",
+		.data		= &preempt_thresh,
+		.maxlen		= sizeof(preempt_thresh),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+	},
+#endif
+#ifdef CONFIG_EVENT_TRACE
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "trace_enabled",
+		.data		= &trace_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "syscall_tracing",
+		.data		= &syscall_tracing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "stackframe_tracing",
+		.data		= &stackframe_tracing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "mcount_enabled",
+		.data		= &mcount_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "trace_user_triggered",
+		.data		= &trace_user_triggered,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "trace_user_trigger_irq",
+		.data		= &trace_user_trigger_irq,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "trace_freerunning",
+		.data		= &trace_freerunning,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "trace_print_on_crash",
+		.data		= &trace_print_on_crash,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "trace_verbose",
+		.data		= &trace_verbose,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "trace_all_cpus",
+		.data		= &trace_all_cpus,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "trace_use_raw_cycles",
+		.data		= &trace_use_raw_cycles,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "trace_all_runnable",
+		.data		= &trace_all_runnable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{
 		.ctl_name	= KERN_CORE_USES_PID,
 		.procname	= "core_uses_pid",
Index: linux-2.6.22-rc2/lib/Kconfig.debug
===================================================================
--- linux-2.6.22-rc2.orig/lib/Kconfig.debug	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/lib/Kconfig.debug	2007-05-24 15:57:43.000000000 +0200
@@ -296,6 +296,192 @@ config STACKTRACE
 	depends on DEBUG_KERNEL
 	depends on STACKTRACE_SUPPORT
 
+config PREEMPT_TRACE
+	bool
+	default y
+	depends on DEBUG_PREEMPT
+
+config EVENT_TRACE
+	bool "Kernel event tracing"
+	default n
+	depends on GENERIC_TIME
+	select FRAME_POINTER
+	select STACKTRACE
+	help
+	  This option enables a kernel tracing mechanism that will track
+	  certain kernel events such as system call entry and return,
+	  IRQ entry, context-switching, etc.
+
+	  Run the scripts/trace-it utility on a kernel with this option
+	  enabled for sample output.
+
+config FUNCTION_TRACE
+	bool "Kernel function call tracing"
+	default n
+	depends on !REORDER
+	select EVENT_TRACE
+	help
+	  This option enables a kernel tracing mechanism that will track
+	  kernel execution at function-call granularity. Sample
+	  output:
+
+           pcscd-1772  0D..2 6867us : deactivate_task <pcscd-1772> (-2 1)
+           pcscd-1772  0D..2 6867us : dequeue_task (deactivate_task)
+          <idle>-0     0D..2 6870us : __switch_to (__schedule)
+          <idle>-0     0D..2 6871us : __schedule <pcscd-1772> (-2 20)
+          <idle>-0     0D..2 6871us : __lock_acquire (lock_acquire)
+          <idle>-0     0D..2 6872us : __spin_unlock_irq (__schedule)
+
+	  Run the scripts/trace-it sample utility on a kernel with this
+	  option enabled to capture about 10 seconds' worth of events.
+
+	  (Note that kernel size and overhead increase noticeably
+	  with this option enabled.)
+
+config WAKEUP_TIMING
+	bool "Wakeup latency timing"
+	depends on GENERIC_TIME
+	help
+	  This option measures the time from a high-priority thread being
+	  woken up until it gets scheduled on a CPU, with microsecond
+	  accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be (re-)started at runtime via:
+
+	      echo 0 > /proc/sys/kernel/preempt_max_latency
+
+config LATENCY_TRACE
+	bool "Latency tracing"
+	default n
+	depends on LATENCY_TIMING && !REORDER && GENERIC_TIME
+	select FRAME_POINTER
+	select FUNCTION_TRACE
+	help
+	  When this option is enabled, the full trace of the last maximum
+	  latency timing event can be found in /proc/latency_trace, in
+	  human-readable (or rather, as some would say,
+	  kernel-developer-readable) form.
+
+	  (Note that kernel size and overhead increase noticeably
+	  with this option enabled.)
+
+config CRITICAL_PREEMPT_TIMING
+	bool "Non-preemptible critical section latency timing"
+	default n
+	depends on PREEMPT
+	depends on GENERIC_TIME
+	help
+	  This option measures the time spent in preempt-off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be (re-)started at runtime via:
+
+	      echo 0 > /proc/sys/kernel/preempt_max_latency
+
+	  (Note that kernel size and overhead increase with this option
+	  enabled. This option and the irqs-off timing option can be
+	  used together or separately.)
+
+config CRITICAL_IRQSOFF_TIMING
+	bool "Interrupts-off critical section latency timing"
+	default n
+	depends on GENERIC_TIME
+	select TRACE_IRQFLAGS
+	help
+	  This option measures the time spent in irqs-off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be (re-)started at runtime via:
+
+	      echo 0 > /proc/sys/kernel/preempt_max_latency
+
+	  (Note that kernel size and overhead increase with this option
+	  enabled. This option and the preempt-off timing option can be
+	  used together or separately.)
+
+config WAKEUP_LATENCY_HIST
+	bool "wakeup latency histogram"
+	default n
+	depends on WAKEUP_TIMING
+	help
+	  This option logs all wakeup latency timing into a per-CPU
+	  histogram and suppresses the printk output that wakeup latency
+	  timing would otherwise produce.
+
+	  The wakeup latency timing histogram can be viewed via:
+
+	      cat /proc/latency_hist/wakeup_latency/CPU*
+
+	  (Note: * represents the CPU ID.)
+
+config PREEMPT_OFF_HIST
+	bool "non-preemptible critical section latency histogram"
+	default n
+	depends on CRITICAL_PREEMPT_TIMING
+	help
+	  This option logs all non-preemptible critical section latency
+	  timing into a per-CPU histogram and suppresses the printk
+	  output that non-preemptible critical section latency timing
+	  would otherwise produce.
+
+	  The non-preemptible critical section latency histogram can
+	  be viewed via:
+
+	      cat /proc/latency_hist/preempt_off_latency/CPU*
+
+	  (Note: * represents the CPU ID.)
+
+config INTERRUPT_OFF_HIST
+	bool "interrupts-off critical section latency histogram"
+	default n
+	depends on CRITICAL_IRQSOFF_TIMING
+	help
+	  This option logs all interrupts-off critical section latency
+	  timing into a per-CPU histogram and suppresses the printk
+	  output that interrupts-off critical section latency timing
+	  would otherwise produce.
+
+	  The interrupts-off critical section latency histogram can
+	  be viewed via:
+
+	      cat /proc/latency_hist/interrupt_off_latency/CPU*
+
+	  (Note: * represents the CPU ID.)
+
+config CRITICAL_TIMING
+	bool
+	default y
+	depends on CRITICAL_PREEMPT_TIMING || CRITICAL_IRQSOFF_TIMING
+
+config DEBUG_TRACE_IRQFLAGS
+	bool
+	default y
+	depends on CRITICAL_IRQSOFF_TIMING
+
+config LATENCY_TIMING
+	bool
+	default y
+	depends on WAKEUP_TIMING || CRITICAL_TIMING
+	select SYSCTL
+
+config CRITICAL_LATENCY_HIST
+	bool
+	default y
+	depends on PREEMPT_OFF_HIST || INTERRUPT_OFF_HIST
+
+config LATENCY_HIST
+	bool
+	default y
+	depends on WAKEUP_LATENCY_HIST || CRITICAL_LATENCY_HIST
+
+config MCOUNT
+	bool
+	depends on FUNCTION_TRACE
+	default y
+
 config DEBUG_KOBJECT
 	bool "kobject debugging"
 	depends on DEBUG_KERNEL
Index: linux-2.6.22-rc2/lib/debug_locks.c
===================================================================
--- linux-2.6.22-rc2.orig/lib/debug_locks.c	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/lib/debug_locks.c	2007-05-24 15:57:43.000000000 +0200
@@ -36,7 +36,16 @@ int debug_locks_silent;
 int debug_locks_off(void)
 {
 	if (xchg(&debug_locks, 0)) {
+#if 0
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+		if (spin_is_locked(&current->pi_lock))
+			spin_unlock(&current->pi_lock);
+#endif
+#endif
 		if (!debug_locks_silent) {
+			stop_trace();
+			user_trace_stop();
+			printk("stopped custom tracer.\n");
 			console_verbose();
 			return 1;
 		}
Index: linux-2.6.22-rc2/scripts/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/scripts/Makefile	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/scripts/Makefile	2007-05-24 15:57:43.000000000 +0200
@@ -7,6 +7,7 @@
 # conmakehash:   Create chartable
 # conmakehash:	 Create arrays for initializing the kernel console tables
 
+hostprogs-$(CONFIG_EVENT_TRACE)  += trace-it
 hostprogs-$(CONFIG_KALLSYMS)     += kallsyms
 hostprogs-$(CONFIG_LOGO)         += pnmtologo
 hostprogs-$(CONFIG_VT)           += conmakehash
Index: linux-2.6.22-rc2/scripts/trace-it.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.22-rc2/scripts/trace-it.c	2007-05-24 15:57:43.000000000 +0200
@@ -0,0 +1,79 @@
+
+/*
+ * Copyright (C) 2005, Ingo Molnar <mingo@...hat.com>
+ *
+ * user-triggered tracing.
+ *
+ * The -rt kernel has a built-in kernel tracer, which will trace
+ * all kernel function calls (and a couple of special events as well),
+ * by using a build-time gcc feature that instruments all kernel
+ * functions.
+ *
+ * The tracer is highly automated for a number of latency tracing purposes,
+ * but it can also be switched into 'user-triggered' mode, which is a
+ * half-automatic tracing mode where userspace apps start and stop the
+ * tracer. This file shows a dumb example how to turn user-triggered
+ * tracing on, and how to start/stop tracing. Note that if you do
+ * multiple start/stop sequences, the kernel will do a maximum search
+ * over their latencies, and will keep the trace of the largest latency
+ * in /proc/latency_trace. The maximums are also reported to the kernel
+ * log. (but can also be read from /proc/sys/kernel/preempt_max_latency)
+ *
+ * For the tracer to be activated, turn on CONFIG_EVENT_TRACING
+ * in the .config, rebuild the kernel and boot into it. The trace will
+ * get _alot_ more verbose if you also turn on CONFIG_FUNCTION_TRACING,
+ * every kernel function call will be put into the trace. Note that
+ * CONFIG_FUNCTION_TRACING has significant runtime overhead, so you dont
+ * want to use it for performance testing :)
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <sys/prctl.h>
+#include <linux/unistd.h>
+
+int main (int argc, char **argv)
+{
+	int ret;
+
+	if (getuid() != 0) {
+		fprintf(stderr, "needs to run as root.\n");
+		exit(1);
+	}
+	ret = system("cat /proc/sys/kernel/mcount_enabled >/dev/null 2>/dev/null");
+	if (ret) {
+		fprintf(stderr, "CONFIG_EVENT_TRACE not enabled?\n");
+		exit(1);
+	}
+	system("echo 1 > /proc/sys/kernel/trace_user_triggered");
+	system("[ -e /proc/sys/kernel/wakeup_timing ] && echo 0 > /proc/sys/kernel/wakeup_timing");
+	system("echo 1 > /proc/sys/kernel/trace_enabled");
+	system("echo 1 > /proc/sys/kernel/mcount_enabled");
+	system("echo 0 > /proc/sys/kernel/trace_freerunning");
+	system("echo 0 > /proc/sys/kernel/trace_print_on_crash");
+	system("echo 0 > /proc/sys/kernel/trace_verbose");
+	system("echo 0 > /proc/sys/kernel/preempt_thresh 2>/dev/null");
+	system("echo 0 > /proc/sys/kernel/preempt_max_latency 2>/dev/null");
+
+	// start tracing
+	if (prctl(0, 1)) {
+		fprintf(stderr, "trace-it: couldn't start tracing!\n");
+		return 1;
+	}
+	usleep(10000000);
+	if (prctl(0, 0)) {
+		fprintf(stderr, "trace-it: couldn't stop tracing!\n");
+		return 1;
+	}
+
+	system("echo 0 > /proc/sys/kernel/trace_user_triggered");
+	system("echo 0 > /proc/sys/kernel/trace_enabled");
+	system("cat /proc/latency_trace");
+
+	return 0;
+}
+
+
Index: linux-2.6.22-rc2/arch/i386/boot/compressed/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/boot/compressed/Makefile	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/boot/compressed/Makefile	2007-05-24 15:57:42.000000000 +0200
@@ -9,6 +9,7 @@ targets		:= vmlinux vmlinux.bin vmlinux.
 EXTRA_AFLAGS	:= -traditional
 
 LDFLAGS_vmlinux := -T
+CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2  -fno-strict-aliasing
 CFLAGS_misc.o += -fPIC
 hostprogs-y	:= relocs
 
Index: linux-2.6.22-rc2/arch/i386/kernel/Makefile
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/Makefile	2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/Makefile	2007-05-24 15:57:42.000000000 +0200
@@ -21,6 +21,7 @@ obj-$(CONFIG_APM)		+= apm.o
 obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o tsc_sync.o
 obj-$(CONFIG_SMP)		+= smpcommon.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
+obj-$(CONFIG_MCOUNT)		+= mcount-wrapper.o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
Index: linux-2.6.22-rc2/arch/i386/kernel/apic.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/apic.c	2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/apic.c	2007-05-24 15:57:43.000000000 +0200
@@ -577,6 +577,8 @@ void fastcall smp_apic_timer_interrupt(s
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
+	trace_special(regs->eip, 1, 0);
+
 	/*
 	 * NOTE! We'd better ACK the irq immediately,
 	 * because timer handling can be slow.
Index: linux-2.6.22-rc2/arch/i386/kernel/entry.S
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/entry.S	2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/entry.S	2007-05-24 15:57:42.000000000 +0200
@@ -329,6 +329,11 @@ sysenter_past_esp:
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
+#ifdef CONFIG_EVENT_TRACE
+	pushl %edx; pushl %ecx; pushl %ebx; pushl %eax
+	call sys_call
+	popl %eax; popl %ebx; popl %ecx; popl %edx
+#endif
 	GET_THREAD_INFO(%ebp)
 
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
@@ -343,6 +348,11 @@ sysenter_past_esp:
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
 	jne syscall_exit_work
+#ifdef CONFIG_EVENT_TRACE
+	pushl %eax
+	call sys_ret
+	popl %eax
+#endif
 /* if something modifies registers it must also disable sysexit */
 	movl PT_EIP(%esp), %edx
 	movl PT_OLDESP(%esp), %ecx
@@ -366,6 +376,11 @@ ENTRY(system_call)
 	pushl %eax			# save orig_eax
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
+#ifdef CONFIG_EVENT_TRACE
+	pushl %edx; pushl %ecx; pushl %ebx; pushl %eax
+	call sys_call
+	popl %eax; popl %ebx; popl %ecx; popl %edx
+#endif
 	GET_THREAD_INFO(%ebp)
 	testl $TF_MASK,PT_EFLAGS(%esp)
 	jz no_singlestep
Index: linux-2.6.22-rc2/arch/i386/kernel/hpet.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/hpet.c	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/hpet.c	2007-05-24 15:57:42.000000000 +0200
@@ -205,7 +205,7 @@ static int hpet_next_event(unsigned long
 /*
  * Clock source related code
  */
-static cycle_t read_hpet(void)
+static cycle_t notrace read_hpet(void)
 {
 	return (cycle_t)hpet_readl(HPET_COUNTER);
 }
Index: linux-2.6.22-rc2/arch/i386/kernel/irq.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/irq.c	2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/irq.c	2007-05-24 15:57:42.000000000 +0200
@@ -68,7 +68,7 @@ static union irq_ctx *softirq_ctx[NR_CPU
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
-fastcall unsigned int do_IRQ(struct pt_regs *regs)
+fastcall notrace unsigned int do_IRQ(struct pt_regs *regs)
 {	
 	struct pt_regs *old_regs;
 	/* high bit used in ret_from_ code */
@@ -87,6 +87,11 @@ fastcall unsigned int do_IRQ(struct pt_r
 
 	old_regs = set_irq_regs(regs);
 	irq_enter();
+#ifdef CONFIG_EVENT_TRACE
+	if (irq == trace_user_trigger_irq)
+		user_trace_start();
+#endif
+	trace_special(regs->eip, irq, 0);
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 1KB free? */
 	{
Index: linux-2.6.22-rc2/arch/i386/kernel/mcount-wrapper.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.22-rc2/arch/i386/kernel/mcount-wrapper.S	2007-05-24 15:57:42.000000000 +0200
@@ -0,0 +1,27 @@
+/*
+ *  linux/arch/i386/mcount-wrapper.S
+ *
+ *  Copyright (C) 2004 Ingo Molnar
+ */
+
+.globl mcount
+mcount:
+
+	cmpl $0, mcount_enabled
+	jz out
+
+	push %ebp
+	mov %esp, %ebp
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+
+	call __mcount
+
+	popl %edx
+	popl %ecx
+	popl %eax
+	popl %ebp
+out:
+	ret
+
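
For illustration only (not part of the diff): the wrapper above is the low-level half of the MCOUNT machinery. With function tracing configured in, every instrumented kernel function calls mcount on entry; the wrapper falls through almost for free while mcount_enabled is 0 and otherwise saves the call-clobbered registers and hands off to the C-level __mcount. A rough userspace sketch of the same check-a-flag-then-call-the-hook pattern, using GCC's -finstrument-functions hooks instead of the kernel's -pg/mcount wiring (all names here are illustrative, not the kernel's):

	/* build with: cc -finstrument-functions demo.c -o demo */
	#include <stdio.h>

	static volatile int trace_enabled = 1;	/* plays the role of mcount_enabled */

	__attribute__((no_instrument_function))
	void __cyg_profile_func_enter(void *fn, void *call_site)
	{
		if (!trace_enabled)		/* cheap early-out, like the jz above */
			return;
		fprintf(stderr, "enter %p (called from %p)\n", fn, call_site);
	}

	__attribute__((no_instrument_function))
	void __cyg_profile_func_exit(void *fn, void *call_site)
	{
	}

	static void traced_work(void)
	{
	}

	int main(void)
	{
		traced_work();
		return 0;
	}
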
Index: linux-2.6.22-rc2/arch/i386/kernel/traps.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/traps.c	2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/traps.c	2007-05-24 15:57:42.000000000 +0200
@@ -136,7 +136,7 @@ static inline unsigned long print_contex
 
 #define MSG(msg) ops->warning(data, msg)
 
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
+void notrace dump_trace(struct task_struct *task, struct pt_regs *regs,
 	        unsigned long *stack,
 		struct stacktrace_ops *ops, void *data)
 {
@@ -222,6 +222,7 @@ show_trace_log_lvl(struct task_struct *t
 {
 	dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
 	printk("%s =======================\n", log_lvl);
+	print_traces(task);
 }
 
 void show_trace(struct task_struct *task, struct pt_regs *regs,
Index: linux-2.6.22-rc2/arch/i386/kernel/tsc.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/kernel/tsc.c	2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/kernel/tsc.c	2007-05-24 15:57:42.000000000 +0200
@@ -255,7 +255,7 @@ core_initcall(cpufreq_tsc);
 
 static unsigned long current_tsc_khz = 0;
 
-static cycle_t read_tsc(void)
+static notrace cycle_t read_tsc(void)
 {
 	cycle_t ret;
 
Index: linux-2.6.22-rc2/arch/i386/mm/fault.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/mm/fault.c	2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/mm/fault.c	2007-05-24 15:57:42.000000000 +0200
@@ -483,6 +483,7 @@ bad_area_nosemaphore:
 		nr = (address - idt_descr.address) >> 3;
 
 		if (nr == 6) {
+			stop_trace();
 			do_invalid_op(regs, 0);
 			return;
 		}
Index: linux-2.6.22-rc2/arch/i386/mm/init.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/i386/mm/init.c	2007-05-22 16:25:05.000000000 +0200
+++ linux-2.6.22-rc2/arch/i386/mm/init.c	2007-05-24 15:57:42.000000000 +0200
@@ -193,7 +193,7 @@ static inline int page_kills_ppro(unsign
 	return 0;
 }
 
-int page_is_ram(unsigned long pagenr)
+int notrace page_is_ram(unsigned long pagenr)
 {
 	int i;
 	unsigned long addr, end;
Index: linux-2.6.22-rc2/include/asm-i386/processor.h
===================================================================
--- linux-2.6.22-rc2.orig/include/asm-i386/processor.h	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/asm-i386/processor.h	2007-05-24 15:57:43.000000000 +0200
@@ -602,7 +602,9 @@ static inline void load_esp0(struct tss_
  * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
  * resulting in stale register contents being returned.
  */
-static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
+static inline void
+cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx,
+      unsigned int *ecx, unsigned int *edx)
 {
 	*eax = op;
 	*ecx = 0;
@@ -610,8 +612,9 @@ static inline void cpuid(unsigned int op
 }
 
 /* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
-			       int *edx)
+static inline void
+cpuid_count(int op, int count, unsigned int *eax, unsigned int *ebx,
+	    unsigned int *ecx, unsigned int *edx)
 {
 	*eax = op;
 	*ecx = count;
Index: linux-2.6.22-rc2/arch/x86_64/ia32/ia32entry.S
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/ia32/ia32entry.S	2007-05-22 16:25:09.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/ia32/ia32entry.S	2007-05-24 15:57:42.000000000 +0200
@@ -120,7 +120,9 @@ sysenter_do_call:	
 	cmpl	$(IA32_NR_syscalls-1),%eax
 	ja	ia32_badsys
 	IA32_ARG_FIXUP 1
+	TRACE_SYS_IA32_CALL
 	call	*ia32_sys_call_table(,%rax,8)
+	TRACE_SYS_RET
 	movq	%rax,RAX-ARGOFFSET(%rsp)
 	GET_THREAD_INFO(%r10)
 	cli
@@ -229,7 +231,9 @@ cstar_do_call:	
 	cmpl $IA32_NR_syscalls-1,%eax
 	ja  ia32_badsys
 	IA32_ARG_FIXUP 1
+	TRACE_SYS_IA32_CALL
 	call *ia32_sys_call_table(,%rax,8)
+	TRACE_SYS_RET
 	movq %rax,RAX-ARGOFFSET(%rsp)
 	GET_THREAD_INFO(%r10)
 	cli
@@ -323,8 +327,10 @@ ia32_do_syscall:	
 	cmpl $(IA32_NR_syscalls-1),%eax
 	ja  ia32_badsys
 	IA32_ARG_FIXUP
+	TRACE_SYS_IA32_CALL
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
+	TRACE_SYS_RET
 	movq %rax,RAX-ARGOFFSET(%rsp)
 	jmp int_ret_from_sys_call 
 
@@ -394,7 +400,7 @@ END(ia32_ptregs_common)
 
 	.section .rodata,"a"
 	.align 8
-ia32_sys_call_table:
+ENTRY(ia32_sys_call_table)
 	.quad sys_restart_syscall
 	.quad sys_exit
 	.quad stub32_fork
@@ -719,4 +725,7 @@ ia32_sys_call_table:
 	.quad compat_sys_signalfd
 	.quad compat_sys_timerfd
 	.quad sys_eventfd
+#ifdef CONFIG_EVENT_TRACE
+ .globl ia32_syscall_end
+#endif
 ia32_syscall_end:
Index: linux-2.6.22-rc2/arch/x86_64/kernel/entry.S
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/entry.S	2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/entry.S	2007-05-24 15:57:42.000000000 +0200
@@ -53,6 +53,47 @@
 
 	.code64
 
+#ifdef CONFIG_EVENT_TRACE
+
+ENTRY(mcount)
+	cmpl $0, mcount_enabled
+	jz out
+
+	push %rbp
+	mov %rsp,%rbp
+
+	push %r11
+	push %r10
+	push %r9
+	push %r8
+	push %rdi
+	push %rsi
+	push %rdx
+	push %rcx
+	push %rax
+
+	mov 0x0(%rbp),%rax
+	mov 0x8(%rbp),%rdi
+	mov 0x8(%rax),%rsi
+
+	call   __trace
+
+	pop %rax
+	pop %rcx
+	pop %rdx
+	pop %rsi
+	pop %rdi
+	pop %r8
+	pop %r9
+	pop %r10
+	pop %r11
+
+	pop %rbp
+out:
+	ret
+
+#endif
+
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif	
@@ -234,7 +275,9 @@ ENTRY(system_call)
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
 	movq %r10,%rcx
+	TRACE_SYS_CALL
 	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
+	TRACE_SYS_RET
 	movq %rax,RAX-ARGOFFSET(%rsp)
 /*
  * Syscall return path ending with SYSRET (fast path)
@@ -316,7 +359,9 @@ tracesys:			 
 	cmova %rcx,%rax
 	ja  1f
 	movq %r10,%rcx	/* fixup for C */
+	TRACE_SYS_CALL
 	call *sys_call_table(,%rax,8)
+	TRACE_SYS_RET
 1:	movq %rax,RAX-ARGOFFSET(%rsp)
 	/* Use IRET because user could have changed frame */
 		
Index: linux-2.6.22-rc2/arch/x86_64/kernel/head64.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/head64.c	2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/head64.c	2007-05-24 15:57:42.000000000 +0200
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/percpu.h>
+#include <linux/sched.h>
 
 #include <asm/processor.h>
 #include <asm/proto.h>
@@ -58,7 +59,7 @@ static void __init copy_bootdata(char *r
 	memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
 }
 
-void __init x86_64_start_kernel(char * real_mode_data)
+void __init notrace x86_64_start_kernel(char * real_mode_data)
 {
 	int i;
 
Index: linux-2.6.22-rc2/arch/x86_64/kernel/irq.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/irq.c	2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/irq.c	2007-05-24 15:57:42.000000000 +0200
@@ -115,6 +115,12 @@ asmlinkage unsigned int do_IRQ(struct pt
 	irq_enter();
 	irq = __get_cpu_var(vector_irq)[vector];
 
+#ifdef CONFIG_EVENT_TRACE
+	if (irq == trace_user_trigger_irq)
+		user_trace_start();
+#endif
+	trace_special(regs->rip, irq, 0);
+
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	stack_overflow_check(regs);
 #endif
Index: linux-2.6.22-rc2/arch/x86_64/kernel/setup64.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/setup64.c	2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/setup64.c	2007-05-24 15:57:42.000000000 +0200
@@ -114,7 +114,7 @@ void __init setup_per_cpu_areas(void)
 	}
 } 
 
-void pda_init(int cpu)
+void notrace pda_init(int cpu)
 { 
 	struct x8664_pda *pda = cpu_pda(cpu);
 
@@ -188,7 +188,7 @@ unsigned long kernel_eflags;
  * 'CPU state barrier', nothing should get across.
  * A lot of state is already set up in PDA init.
  */
-void __cpuinit cpu_init (void)
+void __cpuinit notrace cpu_init (void)
 {
 	int cpu = stack_smp_processor_id();
 	struct tss_struct *t = &per_cpu(init_tss, cpu);
Index: linux-2.6.22-rc2/arch/x86_64/kernel/smpboot.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/smpboot.c	2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/smpboot.c	2007-05-24 15:57:42.000000000 +0200
@@ -318,7 +318,7 @@ static inline void set_cpu_sibling_map(i
 /*
  * Setup code on secondary processor (after comming out of the trampoline)
  */
-void __cpuinit start_secondary(void)
+void __cpuinit notrace start_secondary(void)
 {
 	/*
 	 * Dont put anything before smp_callin(), SMP
Index: linux-2.6.22-rc2/arch/x86_64/kernel/traps.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/traps.c	2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/traps.c	2007-05-24 15:57:42.000000000 +0200
@@ -346,6 +346,7 @@ show_trace(struct task_struct *tsk, stru
 	printk("\nCall Trace:\n");
 	dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
 	printk("\n");
+	print_traces(tsk);
 }
 
 static void
Index: linux-2.6.22-rc2/arch/x86_64/kernel/vsyscall.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/vsyscall.c	2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/vsyscall.c	2007-05-24 15:57:42.000000000 +0200
@@ -43,7 +43,7 @@
 #include <asm/desc.h>
 #include <asm/topology.h>
 
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace
 #define __syscall_clobber "r11","rcx","memory"
 #define __pa_vsymbol(x)			\
 	({unsigned long v;  		\
Index: linux-2.6.22-rc2/include/asm-x86_64/calling.h
===================================================================
--- linux-2.6.22-rc2.orig/include/asm-x86_64/calling.h	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/include/asm-x86_64/calling.h	2007-05-24 15:57:43.000000000 +0200
@@ -160,3 +160,53 @@
 	.macro icebp
 	.byte 0xf1
 	.endm
+
+/*
+ * latency-tracing helpers:
+ */
+
+	.macro TRACE_SYS_CALL
+
+#ifdef CONFIG_EVENT_TRACE
+	SAVE_ARGS
+
+	mov     %rdx, %rcx
+	mov     %rsi, %rdx
+	mov     %rdi, %rsi
+	mov     %rax, %rdi
+
+	call sys_call
+
+	RESTORE_ARGS
+#endif
+	.endm
+
+
+	.macro TRACE_SYS_IA32_CALL
+
+#ifdef CONFIG_EVENT_TRACE
+	SAVE_ARGS
+
+	mov     %rdx, %rcx
+	mov     %rsi, %rdx
+	mov     %rdi, %rsi
+	mov     %rax, %rdi
+
+	call sys_ia32_call
+
+	RESTORE_ARGS
+#endif
+	.endm
+
+	.macro TRACE_SYS_RET
+
+#ifdef CONFIG_EVENT_TRACE
+	SAVE_ARGS
+
+	mov     %rax, %rdi
+
+	call sys_ret
+
+	RESTORE_ARGS
+#endif
+	.endm
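
For illustration only (not part of the diff): the three macros above just marshal values into the x86_64 C argument registers before calling into the tracer. The syscall number moves from %rax to %rdi, the first three syscall arguments shift one register down, and TRACE_SYS_RET passes the return value in %rdi. Read back into C, the hooks are therefore invoked roughly as if declared like this (hypothetical prototypes reconstructed from the register moves, not taken from the tracer's headers):

	/* hypothetical prototypes, inferred from the register shuffling above */
	extern void sys_call(unsigned long nr, unsigned long arg1,
			     unsigned long arg2, unsigned long arg3);
	extern void sys_ia32_call(unsigned long nr, unsigned long arg1,
				  unsigned long arg2, unsigned long arg3);
	extern void sys_ret(unsigned long ret);
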
Index: linux-2.6.22-rc2/include/asm-x86_64/unistd.h
===================================================================
--- linux-2.6.22-rc2.orig/include/asm-x86_64/unistd.h	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/include/asm-x86_64/unistd.h	2007-05-24 15:57:43.000000000 +0200
@@ -11,6 +11,8 @@
  * Note: holes are not allowed.
  */
 
+#define NR_syscalls (__NR_syscall_max+1)
+
 /* at least 8 syscall per cacheline */
 #define __NR_read                                0
 __SYSCALL(__NR_read, sys_read)
Index: linux-2.6.22-rc2/include/linux/prctl.h
===================================================================
--- linux-2.6.22-rc2.orig/include/linux/prctl.h	2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/include/linux/prctl.h	2007-05-24 15:57:43.000000000 +0200
@@ -3,6 +3,7 @@
 
 /* Values to pass as first argument to prctl() */
 
+#define PR_SET_TRACING    0  /* Second arg is tracing on/off */
 #define PR_SET_PDEATHSIG  1  /* Second arg is a signal */
 #define PR_GET_PDEATHSIG  2  /* Second arg is a ptr to return the signal */
 
Index: linux-2.6.22-rc2/kernel/sys.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/sys.c	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/sys.c	2007-05-24 15:57:42.000000000 +0200
@@ -2149,6 +2149,14 @@ asmlinkage long sys_prctl(int option, un
 {
 	long error;
 
+#ifdef CONFIG_EVENT_TRACE
+	if (option == PR_SET_TRACING) {
+		if (arg2)
+			return user_trace_start();
+		return user_trace_stop();
+	}
+#endif
+
 	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
 	if (error)
 		return error;
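
For illustration only (not part of the diff): with the hook above in place, user-triggered tracing can be bracketed around an arbitrary code path from userspace. A minimal sketch, assuming a kernel carrying this patch with CONFIG_EVENT_TRACE enabled (on an unpatched kernel the prctl simply fails), and assuming the sysctl setup that scripts/trace-it performs (trace_user_triggered, trace_enabled, ...) has already been done:

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_SET_TRACING
	#define PR_SET_TRACING	0	/* added to include/linux/prctl.h above */
	#endif

	int main(void)
	{
		if (prctl(PR_SET_TRACING, 1))		/* user_trace_start() */
			perror("start tracing");

		/* ... the code path being investigated runs here ... */

		if (prctl(PR_SET_TRACING, 0))		/* user_trace_stop() */
			perror("stop tracing");

		/* the captured trace can then be read from /proc/latency_trace */
		return 0;
	}

scripts/trace-it earlier in this patch does the same thing with the raw value 0, plus the surrounding sysctl setup.
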
Index: linux-2.6.22-rc2/kernel/time/timekeeping.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/time/timekeeping.c	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/time/timekeeping.c	2007-05-24 15:57:42.000000000 +0200
@@ -195,6 +195,34 @@ static void change_clocksource(void)
 	printk(KERN_INFO "Time: %s clocksource has been installed.\n",
 	       clock->name);
 }
+
+cycle_t notrace get_monotonic_cycles(void)
+{
+	cycle_t cycle_now, cycle_delta;
+
+	/* read clocksource: */
+	cycle_now = clocksource_read(clock);
+
+	/* calculate the delta since the last update_wall_time: */
+	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+	return clock->cycle_last + cycle_delta;
+}
+
+unsigned long notrace cycles_to_usecs(cycle_t cycles)
+{
+	u64 ret = cyc2ns(clock, cycles);
+
+	do_div(ret, 1000);
+
+	return ret;
+}
+
+cycle_t notrace usecs_to_cycles(unsigned long usecs)
+{
+	return ns2cyc(clock, (u64)usecs * 1000);
+}
+
 #else
 static inline void change_clocksource(void) { }
 #endif
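
For illustration only (not part of the diff): together these helpers give the tracer a cheap monotonic cycle counter plus cycle/microsecond conversions on top of the current clocksource. A hedged kernel-side sketch of timing an interval with them; the extern declarations are assumptions standing in for whatever header the full tracer patch exports them from:

	#include <linux/clocksource.h>

	extern cycle_t get_monotonic_cycles(void);		/* added above */
	extern unsigned long cycles_to_usecs(cycle_t cycles);	/* added above */

	static unsigned long time_section_usecs(void (*section)(void))
	{
		cycle_t t0, t1;

		t0 = get_monotonic_cycles();
		section();				/* the code being timed */
		t1 = get_monotonic_cycles();

		return cycles_to_usecs(t1 - t0);
	}
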
Index: linux-2.6.22-rc2/arch/x86_64/kernel/stacktrace.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/x86_64/kernel/stacktrace.c	2007-05-22 16:25:10.000000000 +0200
+++ linux-2.6.22-rc2/arch/x86_64/kernel/stacktrace.c	2007-05-24 15:57:42.000000000 +0200
@@ -24,7 +24,7 @@ static int save_stack_stack(void *data, 
 	return -1;
 }
 
-static void save_stack_address(void *data, unsigned long addr)
+static void notrace save_stack_address(void *data, unsigned long addr)
 {
 	struct stack_trace *trace = (struct stack_trace *)data;
 	if (trace->skip > 0) {
Index: linux-2.6.22-rc2/kernel/hrtimer.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/hrtimer.c	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/hrtimer.c	2007-05-24 15:57:42.000000000 +0200
@@ -553,6 +553,8 @@ static inline int hrtimer_enqueue_reprog
 	return 0;
 }
 
+void trace_start_ht_debug(void);
+
 /*
  * Switch to high resolution mode
  */
@@ -576,6 +578,9 @@ static int hrtimer_switch_to_hres(void)
 
 	tick_setup_sched_timer();
 
+	if (!cpu)
+		trace_start_ht_debug();
+
 	/* "Retrigger" the interrupt to get things going */
 	retrigger_next_event(NULL);
 	local_irq_restore(flags);
Index: linux-2.6.22-rc2/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.22-rc2.orig/kernel/time/tick-sched.c	2007-05-22 16:25:19.000000000 +0200
+++ linux-2.6.22-rc2/kernel/time/tick-sched.c	2007-05-24 15:57:42.000000000 +0200
@@ -167,9 +167,21 @@ void tick_nohz_stop_sched_tick(void)
 		goto end;
 
 	cpu = smp_processor_id();
-	if (unlikely(local_softirq_pending()))
-		printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
-		       local_softirq_pending());
+	if (unlikely(local_softirq_pending())) {
+		static int ratelimit;
+
+		if (ratelimit < 10) {
+			if (!cpu) {
+				trace_special(0, 0, 0);
+				user_trace_stop();
+				ratelimit = 10;
+			}
+			printk(KERN_ERR
+			       "NOHZ: local_softirq_pending %02x on CPU %d\n",
+			       local_softirq_pending(), cpu);
+			ratelimit++;
+		}
+	}
 
 	now = ktime_get();
 	/*