This patch adds hooks to trace the wake up latency of the highest
priority waking task.

  "wakeup" is added to /debugfs/tracing/available_tracers

Also added to /debugfs/tracing

  tracing_max_latency
     holds the current max latency for the wakeup

  wakeup_thresh
     if set to other than zero, a log will be recorded for every
     wakeup that takes longer than the number entered in here
     (usecs for all counters) (deletes previous trace)

Examples:

  (with mcount_enabled = 0)

============
preemption latency trace v1.1.5 on 2.6.24-rc8
--------------------------------------------------------------------
 latency: 26 us, #2/2, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2)
    -----------------
    | task: migration/0-3 (uid:0 nice:-5 policy:1 rt_prio:99)
    -----------------

                 _------=> CPU#
                / _-----=> irqs-off
               | / _----=> need-resched
               || / _---=> hardirq/softirq
               ||| / _--=> preempt-depth
               |||| /
               |||||     delay
   cmd     pid ||||| time  |   caller
      \   /    |||||   \   |   /
  quilt-8551   0d..3    0us+: wake_up_process+0x15/0x17 (sched_exec+0xc9/0x100 )
  quilt-8551   0d..4   26us : sched_switch_callback+0x73/0x81 (schedule+0x483/0x6d5 )

vim:ft=help
============

  (with mcount_enabled = 1)

============
preemption latency trace v1.1.5 on 2.6.24-rc8
--------------------------------------------------------------------
 latency: 36 us, #45/45, CPU#0 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2)
    -----------------
    | task: migration/1-5 (uid:0 nice:-5 policy:1 rt_prio:99)
    -----------------

                 _------=> CPU#
                / _-----=> irqs-off
               | / _----=> need-resched
               || / _---=> hardirq/softirq
               ||| / _--=> preempt-depth
               |||| /
               |||||     delay
   cmd     pid ||||| time  |   caller
      \   /    |||||   \   |   /
    bash-10653 1d..3    0us : wake_up_process+0x15/0x17 (sched_exec+0xc9/0x100 )
    bash-10653 1d..3    1us : try_to_wake_up+0x271/0x2e7 (sub_preempt_count+0xc/0x7a )
    bash-10653 1d..2    2us : try_to_wake_up+0x296/0x2e7 (update_rq_clock+0x9/0x20 )
    bash-10653 1d..2    2us : update_rq_clock+0x1e/0x20 (__update_rq_clock+0xc/0x90 )
    bash-10653 1d..2    3us : __update_rq_clock+0x1b/0x90 (sched_clock+0x9/0x29 )
    bash-10653 1d..2    4us : try_to_wake_up+0x2a6/0x2e7 (activate_task+0xc/0x3f )
    bash-10653 1d..2    4us : activate_task+0x2d/0x3f (enqueue_task+0xe/0x66 )
    bash-10653 1d..2    5us : enqueue_task+0x5b/0x66 (enqueue_task_rt+0x9/0x3c )
    bash-10653 1d..2    6us : try_to_wake_up+0x2ba/0x2e7 (check_preempt_wakeup+0x12/0x99 )
[...]
    bash-10653 1d..5   33us : tracing_record_cmdline+0xcf/0xd4 (_spin_unlock+0x9/0x33 )
    bash-10653 1d..5   34us : _spin_unlock+0x19/0x33 (sub_preempt_count+0xc/0x7a )
    bash-10653 1d..4   35us : wakeup_sched_switch+0x65/0x2ff (_spin_lock_irqsave+0xc/0xa9 )
    bash-10653 1d..4   35us : _spin_lock_irqsave+0x19/0xa9 (add_preempt_count+0xe/0x77 )
    bash-10653 1d..4   36us : sched_switch_callback+0x73/0x81 (schedule+0x483/0x6d5 )

vim:ft=help
============

The [...] was added here to not waste your email box space.
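For reference, the debugfs workflow this patch exposes looks roughly like the
shell session below. This is only a sketch: tracing_max_latency, tracing_thresh,
available_tracers and latency_trace are the files named in this patch, but the
names of the tracer-selection file and of the on/off control file are
assumptions, since those are created by earlier patches in this series, and the
debugfs mount point may differ on your system.

    # the files below live under the tracing debugfs directory
    cd /debugfs/tracing

    cat available_tracers          # should now include "wakeup"
    echo wakeup > current_tracer   # select the tracer (file name assumed)
    echo 0 > tracing_max_latency   # reset the recorded maximum (usecs)
    echo 1 > tracing_enabled       # turn tracing on (file name assumed)

    # ... let the system run ...

    cat tracing_max_latency        # worst-case wakeup latency seen, in usecs
    cat latency_trace              # saved trace of that worst-case wakeup

    # alternatively, log every wakeup slower than 50 usecs:
    echo 50 > tracing_thresh

Writes to tracing_max_latency go through tracing_max_lat_write() below, so the
value is taken in microseconds and converted with usecs_to_cycles().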
Signed-off-by: Steven Rostedt
---
 lib/tracing/Kconfig        |   14 +
 lib/tracing/Makefile       |    1 
 lib/tracing/trace_wakeup.c |  377 +++++++++++++++++++++++++++++++++++++++++++++
 lib/tracing/tracer.c       |  127 +++++++++++++++
 lib/tracing/tracer.h       |    5 
 5 files changed, 523 insertions(+), 1 deletion(-)

Index: linux-mcount.git/lib/tracing/Kconfig
===================================================================
--- linux-mcount.git.orig/lib/tracing/Kconfig	2008-01-30 15:33:38.000000000 -0500
+++ linux-mcount.git/lib/tracing/Kconfig	2008-01-30 15:38:44.000000000 -0500
@@ -9,6 +9,9 @@ config MCOUNT
 	bool
 	select FRAME_POINTER
 
+config TRACER_MAX_TRACE
+	bool
+
 config TRACING
 	bool
 	select DEBUG_FS
@@ -25,6 +28,17 @@ config FUNCTION_TRACER
 	  that the debugging mechanism using this facility will hook by
 	  providing a set of inline routines.
 
+config WAKEUP_TRACER
+	bool "Trace wakeup latencies"
+	depends on DEBUG_KERNEL
+	select TRACING
+	select CONTEXT_SWITCH_TRACER
+	select TRACER_MAX_TRACE
+	help
+	  This tracer adds hooks into scheduling to time the latency
+	  of the highest priority task to be scheduled in
+	  after it has woken up.
+
 config CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
 	depends on DEBUG_KERNEL

Index: linux-mcount.git/lib/tracing/Makefile
===================================================================
--- linux-mcount.git.orig/lib/tracing/Makefile	2008-01-30 15:32:34.000000000 -0500
+++ linux-mcount.git/lib/tracing/Makefile	2008-01-30 15:38:44.000000000 -0500
@@ -3,5 +3,6 @@ obj-$(CONFIG_MCOUNT) += libmcount.o
 obj-$(CONFIG_TRACING) += tracer.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_function.o
+obj-$(CONFIG_WAKEUP_TRACER) += trace_wakeup.o
 
 libmcount-y := mcount.o

Index: linux-mcount.git/lib/tracing/trace_wakeup.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-mcount.git/lib/tracing/trace_wakeup.c	2008-01-30 15:38:44.000000000 -0500
@@ -0,0 +1,377 @@
+/*
+ * trace task wakeup timings
+ *
+ * Copyright (C) 2007 Steven Rostedt
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "tracer.h"
+
+static struct tracing_trace *tracer_trace __read_mostly;
+static int trace_enabled __read_mostly;
+
+static struct task_struct *wakeup_task;
+static int wakeup_cpu;
+static unsigned wakeup_prio = -1;
+
+static DEFINE_SPINLOCK(wakeup_lock);
+
+static void notrace __wakeup_reset(struct tracing_trace *tr);
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int notrace report_latency(cycle_t delta)
+{
+	if (tracing_thresh) {
+		if (delta < tracing_thresh)
+			return 0;
+	} else {
+		if (delta <= tracing_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+static void notrace wakeup_sched_switch(void *private,
+					struct task_struct *prev,
+					struct task_struct *next)
+{
+	struct tracing_trace **ptr = private;
+	struct tracing_trace *tr = *ptr;
+	struct tracing_trace_cpu *data;
+	unsigned long latency = 0, t0 = 0, t1 = 0;
+	cycle_t T0, T1, delta;
+	unsigned long flags;
+	int cpu;
+
+	if (unlikely(!trace_enabled))
+		return;
+
+	/*
+	 * When we start a new trace, we set wakeup_task to NULL
+	 * and then set trace_enabled = 1. We want to make sure
+	 * that another CPU does not see the trace_enabled = 1
+	 * and the wakeup_task with an older task, that might
+	 * actually be the same as next.
+	 */
+	smp_rmb();
+
+	if (next != wakeup_task)
+		return;
+
+	/* The task we are waiting for is waking up */
+	data = tr->data[wakeup_cpu];
+
+	if (unlikely(!data) || unlikely(!data->trace) ||
+	    unlikely(atomic_read(&data->disabled)))
+		return;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+
+	/* disable local data, not wakeup_cpu data */
+	cpu = raw_smp_processor_id();
+	atomic_inc(&tr->data[cpu]->disabled);
+
+	/* We could race with grabbing wakeup_lock */
+	if (unlikely(!trace_enabled || next != wakeup_task))
+		goto out;
+
+	tracing_function_trace(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
+
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = data->preempt_timestamp;
+	T1 = now();
+	delta = T1-T0;
+
+	if (!report_latency(delta))
+		goto out;
+
+	latency = cycles_to_usecs(delta);
+
+	tracing_max_latency = delta;
+	t0 = cycles_to_usecs(T0);
+	t1 = cycles_to_usecs(T1);
+
+	update_max_tr(tr, wakeup_task, wakeup_cpu);
+
+	if (tracing_thresh)
+		printk(KERN_WARNING "(%16s-%-5d|#%d): %lu us wakeup latency "
+		       "violates %lu us threshold.\n"
+		       " => started at timestamp %lu: ",
+		       wakeup_task->comm, wakeup_task->pid,
+		       raw_smp_processor_id(),
+		       latency, cycles_to_usecs(tracing_thresh), t0);
+	else
+		printk(KERN_WARNING "(%16s-%-5d|#%d): new %lu us maximum "
+		       "wakeup latency.\n => started at timestamp %lu: ",
+		       wakeup_task->comm, wakeup_task->pid,
+		       cpu, latency, t0);
+
+	printk(KERN_CONT " ended at timestamp %lu: ", t1);
+	dump_stack();
+	t1 = cycles_to_usecs(now());
+	printk(KERN_CONT " dump-end timestamp %lu\n\n", t1);
+
+out:
+	__wakeup_reset(tr);
+	atomic_dec(&tr->data[cpu]->disabled);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+
+}
+
+static struct tracer_switch_ops switch_ops __read_mostly = {
+	.func		= wakeup_sched_switch,
+	.private	= &tracer_trace,
+};
+
+static void notrace __wakeup_reset(struct tracing_trace *tr)
+{
+	struct tracing_trace_cpu *data;
+	int cpu;
+
+	assert_spin_locked(&wakeup_lock);
+
+	for_each_possible_cpu(cpu) {
+		data = tr->data[cpu];
+		tracing_reset(data);
+	}
+
+	wakeup_cpu = -1;
+	wakeup_prio = -1;
+	if (wakeup_task) {
+		put_task_struct(wakeup_task);
+		tracing_stop_function_trace();
+	}
+
+	wakeup_task = NULL;
+}
+
+static void notrace wakeup_reset(struct tracing_trace *tr)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+	__wakeup_reset(tr);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+
+static notrace void wakeup_check_start(struct tracing_trace *tr,
+				       struct task_struct *p,
+				       struct task_struct *curr)
+{
+	unsigned long flags;
+	long disabled;
+	int cpu = smp_processor_id();
+
+	if (likely(!rt_task(p)) ||
+	    p->prio >= wakeup_prio ||
+	    p->prio >= curr->prio)
+		return;
+
+	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	/* interrupts should be off from try_to_wake_up */
+	spin_lock(&wakeup_lock);
+
+	/* check for races. */
+	if (!trace_enabled || p->prio >= wakeup_prio)
+		goto out_locked;
+
+	/* reset the trace */
+	__wakeup_reset(tr);
+
+	wakeup_cpu = task_cpu(p);
+	wakeup_prio = p->prio;
+
+	wakeup_task = p;
+	get_task_struct(wakeup_task);
+
+	local_save_flags(flags);
+
+	tr->data[wakeup_cpu]->preempt_timestamp = now();
+	tracing_start_function_trace();
+	tracing_function_trace(tr, tr->data[wakeup_cpu],
+			       CALLER_ADDR1, CALLER_ADDR2, flags);
+
+
+ out_locked:
+	spin_unlock(&wakeup_lock);
+ out:
+	atomic_dec(&tr->data[cpu]->disabled);
+}
+
+static notrace void wake_up_callback(const struct marker *mdata,
+				     void *private_data,
+				     const char *format, ...)
+{
+	struct tracing_trace **ptr = mdata->private;
+	struct tracing_trace *tr = *ptr;
+	struct task_struct *curr;
+	struct task_struct *p;
+	va_list ap;
+
+	if (likely(!trace_enabled))
+		return;
+
+	va_start(ap, format);
+
+	/* now get the meat: "p %p rq->curr %p" */
+	p = va_arg(ap, typeof(p));
+	curr = va_arg(ap, typeof(curr));
+
+	va_end(ap);
+
+	wakeup_check_start(tr, p, curr);
+}
+
+static notrace void start_wakeup_trace(struct tracing_trace *tr)
+{
+	int ret;
+
+	ret = marker_arm("kernel_sched_wakeup");
+	if (ret) {
+		pr_info("wakeup trace: Couldn't arm probe"
+			" kernel_sched_wakeup\n");
+		return;
+	}
+
+	ret = marker_arm("kernel_sched_wakeup_new");
+	if (ret) {
+		pr_info("wakeup trace: Couldn't arm probe"
+			" kernel_sched_wakeup_new\n");
+		goto out;
+	}
+
+	register_tracer_switch(&switch_ops);
+	tracing_start_sched_switch();
+
+	wakeup_reset(tr);
+
+	/*
+	 * Don't let the trace_enabled = 1 show up before
+	 * the wakeup_task is reset. This may be overkill since
+	 * wakeup_reset does a spin_unlock after setting the
+	 * wakeup_task to NULL, but I want to be safe.
+	 * This is a slow path anyway.
+	 */
+	smp_wmb();
+
+	trace_enabled = 1;
+
+	return;
+
+ out:
+	marker_disarm("kernel_sched_wakeup");
+}
+
+static notrace void stop_wakeup_trace(struct tracing_trace *tr)
+{
+	trace_enabled = 0;
+	tracing_stop_sched_switch();
+	unregister_tracer_switch(&switch_ops);
+	marker_disarm("kernel_sched_wakeup");
+	marker_disarm("kernel_sched_wakeup_new");
+}
+
+static notrace void wakeup_trace_init(struct tracing_trace *tr)
+{
+	tracer_trace = tr;
+
+	if (tr->ctrl)
+		start_wakeup_trace(tr);
+}
+
+static notrace void wakeup_trace_reset(struct tracing_trace *tr)
+{
+	if (tr->ctrl) {
+		stop_wakeup_trace(tr);
+		/* make sure we put back any tasks we are tracing */
+		wakeup_reset(tr);
+	}
+}
+
+static void wakeup_trace_ctrl_update(struct tracing_trace *tr)
+{
+	if (tr->ctrl)
+		start_wakeup_trace(tr);
+	else
+		stop_wakeup_trace(tr);
+}
+
+static void notrace wakeup_trace_open(struct tracing_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		stop_wakeup_trace(iter->tr);
+}
+
+static void notrace wakeup_trace_close(struct tracing_iterator *iter)
+{
+	/* forget about any processes we were recording */
+	if (iter->tr->ctrl)
+		start_wakeup_trace(iter->tr);
+}
+
+static struct trace_types_struct wakeup_trace __read_mostly =
+{
+	.name		= "wakeup",
+	.init		= wakeup_trace_init,
+	.reset		= wakeup_trace_reset,
+	.open		= wakeup_trace_open,
+	.close		= wakeup_trace_close,
+	.ctrl_update	= wakeup_trace_ctrl_update,
+	.print_max	= 1,
+};
+
+__init static int init_wakeup_trace(void)
+{
+	int ret;
+
+	ret = register_trace(&wakeup_trace);
+	if (ret)
+		return ret;
+
+	ret = marker_probe_register("kernel_sched_wakeup",
+				    "p %p rq->curr %p",
+				    wake_up_callback,
+				    &tracer_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		goto fail;
+	}
+
+	ret = marker_probe_register("kernel_sched_wakeup_new",
+				    "p %p rq->curr %p",
+				    wake_up_callback,
+				    &tracer_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	return 0;
+ fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup");
+ fail:
+	unregister_trace(&wakeup_trace);
+	return 0;
+}
+
+device_initcall(init_wakeup_trace);

Index: linux-mcount.git/lib/tracing/tracer.c
===================================================================
--- linux-mcount.git.orig/lib/tracing/tracer.c	2008-01-30 15:33:38.000000000 -0500
+++ linux-mcount.git/lib/tracing/tracer.c	2008-01-30 15:38:44.000000000 -0500
@@ -28,8 +28,13 @@
 
 #include "tracer.h"
 
+unsigned long tracing_max_latency __read_mostly = (cycle_t)ULONG_MAX;
+unsigned long tracing_thresh __read_mostly;
+
 static struct tracing_trace tracer_trace __read_mostly;
 static DEFINE_PER_CPU(struct tracing_trace_cpu, tracer_trace_cpu);
+static struct tracing_trace max_tr __read_mostly;
+static DEFINE_PER_CPU(struct tracing_trace_cpu, max_data);
 static int trace_enabled __read_mostly;
 static unsigned long trace_nr_entries = (65536UL);
 
@@ -64,6 +69,43 @@ enum trace_flag_type {
 	TRACE_FLAG_SOFTIRQ		= 0x08,
 };
 
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /debugfs/tracing/latency_trace)
+ */
+void update_max_tr(struct tracing_trace *tr, struct task_struct *tsk, int cpu)
+{
+	struct tracing_trace_cpu *data = tr->data[cpu];
+	void *save_trace;
+	int i;
+
+	max_tr.cpu = cpu;
+	max_tr.time_start = data->preempt_timestamp;
+
+
+	/* clear out all the previous traces */
+	for_each_possible_cpu(i) {
+		data = tr->data[i];
+		save_trace = max_tr.data[i]->trace;
+		memcpy(max_tr.data[i], data, sizeof(*data));
+		data->trace = save_trace;
+	}
+
+	data = max_tr.data[cpu];
+	data->saved_latency = tracing_max_latency;
+
+	memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+	data->pid = tsk->pid;
+	data->uid = tsk->uid;
+	data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+	data->policy = tsk->policy;
+	data->rt_priority = tsk->rt_priority;
+
+	/* record this tasks comm */
+	tracing_record_cmdline(current);
+}
+
 int register_trace(struct trace_types_struct *type)
 {
 	struct trace_types_struct *t;
@@ -813,7 +855,10 @@ __tracing_open(struct inode *inode, stru
 	}
 
 	mutex_lock(&trace_types_lock);
-	iter->tr = inode->i_private;
+	if (current_trace && current_trace->print_max)
+		iter->tr = &max_tr;
+	else
+		iter->tr = inode->i_private;
 	iter->trace = current_trace;
 	iter->pos = -1;
 
@@ -1169,6 +1214,50 @@ static ssize_t tracing_set_trace_write(s
 	return cnt;
 }
 
+static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf,
+				    size_t cnt, loff_t *ppos)
+{
+	unsigned long *ptr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = snprintf(buf, 64, "%ld\n",
+		     *ptr == (unsigned long)-1 ? -1 : cycles_to_usecs(*ptr));
+	if (r > 64)
+		r = 64;
+	return simple_read_from_buffer(ubuf, cnt, ppos,
+				       buf, r);
+}
+
+static ssize_t tracing_max_lat_write(struct file *filp,
+				     const char __user *ubuf,
+				     size_t cnt, loff_t *ppos)
+{
+	long *ptr = filp->private_data;
+	long val;
+	char buf[64];
+
+	if (cnt > 63)
+		cnt = 63;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	val = simple_strtoul(buf, NULL, 10);
+
+	*ptr = usecs_to_cycles(val);
+
+	return cnt;
+}
+
+static struct file_operations tracing_max_lat_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_max_lat_read,
+	.write		= tracing_max_lat_write,
+};
+
 static struct file_operations tracing_ctrl_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_ctrl_read,
@@ -1237,6 +1326,19 @@ static __init void tracer_init_debugfs(v
 				    &tracer_trace, &set_tracer_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
+				    &tracing_max_latency,
+				    &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_max_latency' entry\n");
+
+	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
+				    &tracing_thresh, &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_thresh' entry\n");
 }
 
 /* dummy trace to disable tracing */
@@ -1260,6 +1362,8 @@ __init static int tracer_alloc_buffers(v
 	for_each_possible_cpu(i) {
 		tracer_trace.data[i] = &per_cpu(tracer_trace_cpu, i);
+		max_tr.data[i] = &per_cpu(max_data, i);
+
 		array = (struct tracing_entry *)
 			__get_free_pages(GFP_KERNEL, order);
 		if (array == NULL) {
@@ -1268,6 +1372,18 @@ __init static int tracer_alloc_buffers(v
 			goto free_buffers;
 		}
 		tracer_trace.data[i]->trace = array;
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (struct tracing_entry *)
+			__get_free_pages(GFP_KERNEL, order);
+		if (array == NULL) {
+			printk(KERN_ERR "wakeup tracer: failed to allocate"
+			       " %ld bytes for trace buffer!\n", size);
+			goto free_buffers;
+		}
+		max_tr.data[i]->trace = array;
+#endif
 	}
 
 	/*
@@ -1275,6 +1391,7 @@ __init static int tracer_alloc_buffers(v
 	 * round up a bit.
 	 */
 	tracer_trace.entries = size / TRACING_ENTRY_SIZE;
+	max_tr.entries = tracer_trace.entries;
 
 	pr_info("tracer: %ld bytes allocated for %ld",
 		size, trace_nr_entries);
@@ -1297,6 +1414,14 @@ __init static int tracer_alloc_buffers(v
 			free_pages((unsigned long)data->trace, order);
 			data->trace = NULL;
 		}
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		if (data && data->trace) {
+			free_pages((unsigned long)data->trace, order);
+			data->trace = NULL;
+		}
+#endif
 	}
 	return -ENOMEM;
 }

Index: linux-mcount.git/lib/tracing/tracer.h
===================================================================
--- linux-mcount.git.orig/lib/tracing/tracer.h	2008-01-30 15:34:21.000000000 -0500
+++ linux-mcount.git/lib/tracing/tracer.h	2008-01-30 15:38:44.000000000 -0500
@@ -69,6 +69,7 @@ struct trace_types_struct {
 	void (*stop)(struct tracing_iterator *iter);
 	void (*ctrl_update)(struct tracing_trace *tr);
 	struct trace_types_struct *next;
+	int print_max;
 };
 
 struct tracing_iterator {
@@ -107,6 +108,10 @@ void tracing_start_sched_switch(void);
 void tracing_stop_sched_switch(void);
 extern int tracing_sched_switch_enabled;
 
+extern unsigned long tracing_max_latency;
+extern unsigned long tracing_thresh;
+
+void update_max_tr(struct tracing_trace *tr, struct task_struct *tsk, int cpu);
 
 static inline notrace cycle_t now(void)
 {