This patch adds context switch tracing. The output has the following format:

                 _------=> CPU#
                / _-----=> irqs-off
               | / _----=> need-resched
               || / _---=> hardirq/softirq
               ||| / _--=> preempt-depth
               |||| /
               |||||     delay
   cmd     pid ||||| time  |   caller
      \   /    |||||   \   |   /
 swapper-0     1d..3    137us+: 0:140:R --> 2912:120
    sshd-2912  1d..3    216us+: 2912:120:S --> 0:140
 swapper-0     1d..3    261us+: 0:140:R --> 2912:120
    bash-2920  0d..3    267us+: 2920:120:S --> 0:140
    sshd-2912  1d..3    330us!: 2912:120:S --> 0:140
 swapper-0     1d..3   2389us+: 0:140:R --> 2847:120
yum-upda-2847  1d..3   2411us!: 2847:120:S --> 0:140
 swapper-0     0d..3  11089us+: 0:140:R --> 3139:120
gdm-bina-3139  0d..3  11113us!: 3139:120:S --> 0:140
 swapper-0     1d..3 102328us+: 0:140:R --> 2847:120
yum-upda-2847  1d..3 102348us!: 2847:120:S --> 0:140

Two files are added to /debugfs/tracing:

  sched_trace - outputs the above format.

  sched_trace_ctrl
        0 - turns off context switch tracing.
        1 - turns on context switch tracing.

Signed-off-by: Steven Rostedt
Cc: Mathieu Desnoyers
---
 lib/tracing/Kconfig              |   10 +
 lib/tracing/Makefile             |    1
 lib/tracing/trace_sched_switch.c |  205 +++++++++++++++++++++++++++++++++++++++
 lib/tracing/tracer.c             |   32 ++++++
 lib/tracing/tracer.h             |   18 +++
 5 files changed, 265 insertions(+), 1 deletion(-)
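[Usage illustration only, not part of the patch: a minimal userspace sketch of
driving the two new files. It assumes debugfs is mounted at /debugfs as in the
paths above; the helper name, buffer size and the one second delay are
arbitrary choices.]

/*
 * Minimal usage sketch: turn the tracer on, let the scheduler run for
 * a moment, turn it off again and dump the buffer.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void sched_trace_ctrl(const char *val)
{
        int fd = open("/debugfs/tracing/sched_trace_ctrl", O_WRONLY);

        if (fd < 0)
                return;
        if (write(fd, val, 1) != 1)             /* "0" or "1", as documented above */
                perror("sched_trace_ctrl");
        close(fd);
}

int main(void)
{
        char buf[4096];
        ssize_t r;
        int fd;

        sched_trace_ctrl("1");                  /* 1 - turns on context switch tracing  */
        sleep(1);                               /* collect a few context switches       */
        sched_trace_ctrl("0");                  /* 0 - turns off context switch tracing */

        fd = open("/debugfs/tracing/sched_trace", O_RDONLY);
        if (fd < 0)
                return 1;
        while ((r = read(fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, r, stdout);      /* the format shown above */
        close(fd);
        return 0;
}

[Strictly speaking the tracer does not have to be stopped before reading: the
sched_trace open callback below already disables tracing while the buffer is
being dumped and re-enables it on close.]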
Index: linux-mcount.git/lib/tracing/Kconfig
===================================================================
--- linux-mcount.git.orig/lib/tracing/Kconfig	2008-01-21 09:59:15.000000000 -0500
+++ linux-mcount.git/lib/tracing/Kconfig	2008-01-21 10:03:57.000000000 -0500
@@ -24,3 +24,13 @@ config FUNCTION_TRACER
 	  insert a call to an architecture specific __mcount routine,
 	  that the debugging mechanism using this facility will hook by
 	  providing a set of inline routines.
+
+config CONTEXT_SWITCH_TRACER
+	bool "Trace process context switches"
+	depends on DEBUG_KERNEL
+	default n
+	select TRACING
+	help
+	  This tracer hooks into the context switch and records
+	  all switching of tasks.
+
Index: linux-mcount.git/lib/tracing/Makefile
===================================================================
--- linux-mcount.git.orig/lib/tracing/Makefile	2008-01-21 09:59:15.000000000 -0500
+++ linux-mcount.git/lib/tracing/Makefile	2008-01-21 10:03:57.000000000 -0500
@@ -1,6 +1,7 @@
 obj-$(CONFIG_MCOUNT) += libmcount.o
 
 obj-$(CONFIG_TRACING) += tracer.o
+obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_function.o
 
 libmcount-y := mcount.o
Index: linux-mcount.git/lib/tracing/trace_sched_switch.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-mcount.git/lib/tracing/trace_sched_switch.c	2008-01-21 10:03:57.000000000 -0500
@@ -0,0 +1,205 @@
+/*
+ * trace context switch
+ *
+ * Copyright (C) 2007 Steven Rostedt
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/marker.h>
+#include <linux/mcount.h>
+
+#include "tracer.h"
+
+static struct tracing_trace sched_switch_trace __read_mostly;
+static DEFINE_PER_CPU(struct tracing_trace_cpu, sched_switch_trace_cpu);
+
+static int trace_enabled __read_mostly;
+static unsigned long trace_nr_entries = (16384UL);
+
+static int __init set_nr_entries(char *str)
+{
+	if (!str)
+		return 0;
+	trace_nr_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("trace_ctx_entries=", set_nr_entries);
+
+static notrace void sched_switch_callback(const struct marker *mdata,
+					  void *private_data,
+					  const char *format, ...)
+{
+	struct tracing_trace *tr = mdata->private;
+	struct tracing_trace_cpu *data;
+	struct task_struct *prev;
+	struct task_struct *next;
+	unsigned long flags;
+	va_list ap;
+	int cpu;
+
+	if (likely(!trace_enabled))
+		return;
+
+	va_start(ap, format);
+	prev = va_arg(ap, typeof(prev));
+	next = va_arg(ap, typeof(next));
+	va_end(ap);
+
+	raw_local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	atomic_inc(&data->disabled);
+
+	if (likely(atomic_read(&data->disabled) == 1))
+		tracing_sched_switch_trace(tr, data, prev, next, flags);
+
+	atomic_dec(&data->disabled);
+	raw_local_irq_restore(flags);
+}
+
+static notrace void sched_switch_reset(struct tracing_trace *tr)
+{
+	int cpu;
+
+	tr->time_start = now();
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void sched_switch_trace_ctrl_update(struct tracing_trace *tr,
+					   unsigned long val)
+{
+	val = !!val;
+
+	/* When starting a new trace, reset the buffers */
+	if (val)
+		sched_switch_reset(tr);
+
+	if (tr->ctrl ^ val) {
+		if (val)
+			trace_enabled = 1;
+		else
+			trace_enabled = 0;
+		tr->ctrl = val;
+	}
+}
+
+static __init void sched_switch_trace_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	sched_switch_trace.ctrl_update = sched_switch_trace_ctrl_update;
+
+	entry = debugfs_create_file("sched_trace_ctrl", 0644, d_tracer,
+				    &sched_switch_trace, &tracing_ctrl_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'sched_trace_ctrl' entry\n");
+
+	entry = debugfs_create_file("sched_trace", 0444, d_tracer,
+				    &sched_switch_trace, &tracing_lt_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'sched_trace' entry\n");
+}
+
+#else
+static __init void sched_switch_trace_init_debugfs(void)
+{
+	/*
+	 * No way to turn on or off the trace function
+	 * without debugfs, so we just turn it on.
+	 */
+}
+#endif
+
+static void sched_switch_trace_open(struct tracing_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		trace_enabled = 0;
+}
+
+static void sched_switch_trace_close(struct tracing_iterator *iter)
+{
+	if (iter->tr->ctrl)
+		trace_enabled = 1;
+}
+
+__init static int sched_switch_trace_alloc_buffers(void)
+{
+	const int order = page_order(trace_nr_entries * TRACING_ENTRY_SIZE);
+	const unsigned long size = (1UL << order) << PAGE_SHIFT;
+	struct tracing_entry *array;
+	int ret;
+	int i;
+
+	for_each_possible_cpu(i) {
+		sched_switch_trace.data[i] =
+			&per_cpu(sched_switch_trace_cpu, i);
+		array = (struct tracing_entry *)
+			__get_free_pages(GFP_KERNEL, order);
+		if (array == NULL) {
+			printk(KERN_ERR "sched_switch tracer: failed to"
+			       " allocate %ld bytes for trace buffer!\n", size);
+			goto free_buffers;
+		}
+		sched_switch_trace.data[i]->trace = array;
+	}
+
+	/*
+	 * Since we allocate by orders of pages, we may be able to
+	 * round up a bit.
+	 */
+	sched_switch_trace.entries = size / TRACING_ENTRY_SIZE;
+
+	pr_info("sched_switch tracer: %ld bytes allocated for %ld",
+		size, trace_nr_entries);
+	pr_info(" entries of %ld bytes\n", (long)TRACING_ENTRY_SIZE);
+	pr_info("   actual entries %ld\n", sched_switch_trace.entries);
+
+	sched_switch_trace_init_debugfs();
+
+	sched_switch_trace.open = sched_switch_trace_open;
+	sched_switch_trace.close = sched_switch_trace_close;
+
+	ret = marker_probe_register("kernel_sched_schedule",
+				    "prev %p next %p",
+				    sched_switch_callback,
+				    &sched_switch_trace);
+	if (ret) {
+		pr_info("sched trace: Couldn't add marker"
+			" probe to switch_to\n");
+		goto out;
+	}
+
+	ret = marker_arm("kernel_sched_schedule");
+	if (ret) {
+		pr_info("sched trace: Couldn't arm probe switch_to\n");
+		goto out;
+	}
+
+ out:
+	return 0;
+
+ free_buffers:
+	for (i-- ; i >= 0; i--) {
+		struct tracing_trace_cpu *data = sched_switch_trace.data[i];
+
+		if (data && data->trace) {
+			free_pages((unsigned long)data->trace, order);
+			data->trace = NULL;
+		}
+	}
+	return -ENOMEM;
+}
+
+device_initcall(sched_switch_trace_alloc_buffers);
Index: linux-mcount.git/lib/tracing/tracer.c
===================================================================
--- linux-mcount.git.orig/lib/tracing/tracer.c	2008-01-21 10:01:49.000000000 -0500
+++ linux-mcount.git/lib/tracing/tracer.c	2008-01-21 10:03:57.000000000 -0500
@@ -31,6 +31,7 @@ enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
 	TRACE_FN,
+	TRACE_CTX,
 
 	__TRACE_LAST_TYPE
 };
@@ -107,6 +108,24 @@ notrace void tracing_function_trace(stru
 	entry->fn.parent_ip = parent_ip;
 }
 
+notrace void tracing_sched_switch_trace(struct tracing_trace *tr,
+					struct tracing_trace_cpu *data,
+					struct task_struct *prev,
+					struct task_struct *next,
+					unsigned long flags)
+{
+	struct tracing_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type = TRACE_CTX;
+	entry->ctx.prev_pid = prev->pid;
+	entry->ctx.prev_prio = prev->prio;
+	entry->ctx.prev_state = prev->state;
+	entry->ctx.next_pid = next->pid;
+	entry->ctx.next_prio = next->prio;
+}
+
 #ifdef CONFIG_DEBUG_FS
 enum trace_iterator {
 	TRACE_ITER_SYM_ONLY = 1,
@@ -414,6 +433,8 @@ lat_print_timestamp(struct seq_file *m,
 	seq_puts(m, " : ");
 }
 
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
 static void notrace
 print_lat_fmt(struct seq_file *m, struct tracing_iterator *iter,
 	      unsigned int trace_idx, int cpu)
@@ -424,6 +445,7 @@ print_lat_fmt(struct seq_file *m, struct
 	unsigned long rel_usecs;
 	int sym_only = !!(trace_flags & TRACE_ITER_SYM_ONLY);
 	int verbose = !!(trace_flags & TRACE_ITER_VERBOSE);
+	int S;
 
 	if (!next_entry)
 		next_entry = entry;
@@ -450,6 +472,16 @@ print_lat_fmt(struct seq_file *m, struct
 		seq_print_ip_sym(m, entry->fn.parent_ip, sym_only);
 		seq_puts(m, ")\n");
 		break;
+	case TRACE_CTX:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		seq_printf(m, " %d:%d:%c --> %d:%d\n",
+			   entry->ctx.prev_pid,
+			   entry->ctx.prev_prio,
+			   S,
+			   entry->ctx.next_pid,
+			   entry->ctx.next_prio);
+		break;
 	}
 }
Index: linux-mcount.git/lib/tracing/tracer.h
===================================================================
--- linux-mcount.git.orig/lib/tracing/tracer.h	2008-01-21 09:59:15.000000000 -0500
+++ linux-mcount.git/lib/tracing/tracer.h	2008-01-21 10:03:57.000000000 -0500
@@ -10,6 +10,14 @@ struct tracing_function {
 	unsigned long parent_ip;
 };
 
+struct tracing_sched_switch {
+	unsigned int prev_pid;
+	unsigned char prev_prio;
+	unsigned char prev_state;
+	unsigned int next_pid;
+	unsigned char next_prio;
+};
+
 struct tracing_entry {
 	char type;
 	char cpu;  /* who will want to trace more than 256 CPUS? */
@@ -18,7 +26,10 @@ struct tracing_entry {
 	int pid;
 	cycle_t t;
 	char comm[TASK_COMM_LEN];
-	struct tracing_function fn;
+	union {
+		struct tracing_function fn;
+		struct tracing_sched_switch ctx;
+	};
 };
 
 struct tracing_trace_cpu {
@@ -75,6 +86,11 @@ void tracing_function_trace(struct traci
 			    unsigned long ip,
 			    unsigned long parent_ip,
 			    unsigned long flags);
+void tracing_sched_switch_trace(struct tracing_trace *tr,
+				struct tracing_trace_cpu *data,
+				struct task_struct *prev,
+				struct task_struct *next,
+				unsigned long flags);
 
 extern struct file_operations tracing_fops;
 extern struct file_operations tracing_lt_fops;
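[For reference: the probe registered above expects a "kernel_sched_schedule"
marker with format "prev %p next %p" in the scheduler. That marker is added to
sched.c by a separate patch, so the sketch below only illustrates the call site
this tracer assumes; the wrapper name and its exact placement in the scheduler
are not part of this patch.]

/*
 * Illustrative sketch only -- the real marker is added to kernel/sched.c
 * by a companion patch.  The marker name and format string must match
 * the marker_probe_register() call above, and the probe pulls prev/next
 * back out with va_arg() in the same order they are passed here.
 */
#include <linux/marker.h>
#include <linux/sched.h>

static inline void sketch_mark_sched_switch(struct task_struct *prev,
					    struct task_struct *next)
{
	trace_mark(kernel_sched_schedule, "prev %p next %p", prev, next);
}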