[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4A698FFE.8030906@cn.fujitsu.com>
Date: Fri, 24 Jul 2009 18:42:06 +0800
From: Zhaolei <zhaolei@...fujitsu.com>
To: Steven Rostedt <rostedt@...dmis.org>
CC: Frederic Weisbecker <fweisbec@...il.com>,
Ingo Molnar <mingo@...e.hu>,
LKML <linux-kernel@...r.kernel.org>
Subject: [PATCH] Add walltime support for ring-buffer
Note:
It is only a prototype patch to demonstrate what the result
looks like.
There is still much work to do (bugs to fix) before applying.
Signed-off-by: Zhao Lei <zhaolei@...fujitsu.com>
---
drivers/oprofile/cpu_buffer.c | 4 +-
include/linux/ring_buffer.h | 12 ++-
kernel/trace/ring_buffer.c | 159 ++++++++++++++++++++++++++++++----
kernel/trace/ring_buffer_benchmark.c | 3 +-
kernel/trace/trace.c | 8 +-
kernel/trace/trace_functions_graph.c | 8 +-
kernel/trace/trace_selftest.c | 2 +-
7 files changed, 161 insertions(+), 35 deletions(-)
diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index a7aae24..1b9b36d 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -187,14 +187,14 @@ int op_cpu_buffer_write_commit(struct op_entry *entry)
struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
struct ring_buffer_event *e;
- e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
+ e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
if (e)
goto event;
if (ring_buffer_swap_cpu(op_ring_buffer_read,
op_ring_buffer_write,
cpu))
return NULL;
- e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
+ e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
if (e)
goto event;
return NULL;
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 7fca716..eafffca 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -134,18 +134,22 @@ int ring_buffer_write(struct ring_buffer *buffer,
unsigned long length, void *data);
struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+ struct timespec *timespec);
struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+ struct timespec *timespec);
struct ring_buffer_iter *
ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
void ring_buffer_read_finish(struct ring_buffer_iter *iter);
struct ring_buffer_event *
-ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts,
+ struct timespec *timespec);
struct ring_buffer_event *
-ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts,
+ struct timespec *timespec);
void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 51633d7..698e47f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -318,6 +318,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
+ u32 walltime[3]; /* page wall time */
local_t commit; /* write committed index */
unsigned char data[]; /* data of buffer page */
};
@@ -442,6 +443,8 @@ struct ring_buffer_per_cpu {
unsigned long read;
u64 write_stamp;
u64 read_stamp;
+ u64 read_walltimestamp;
+ u32 read_walltime[3];
atomic_t record_disabled;
};
@@ -469,6 +472,8 @@ struct ring_buffer_iter {
unsigned long head;
struct buffer_page *head_page;
u64 read_stamp;
+ u64 read_walltimestamp;
+ u32 read_walltime[3];
};
/* buffer may be either ring_buffer or ring_buffer_per_cpu */
@@ -1499,6 +1504,14 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
+ cpu_buffer->read_walltimestamp =
+ cpu_buffer->reader_page->page->time_stamp;
+ cpu_buffer->read_walltime[0] =
+ cpu_buffer->reader_page->page->walltime[0];
+ cpu_buffer->read_walltime[1] =
+ cpu_buffer->reader_page->page->walltime[1];
+ cpu_buffer->read_walltime[2] =
+ cpu_buffer->reader_page->page->walltime[2];
cpu_buffer->reader_page->read = 0;
}
@@ -1518,6 +1531,10 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
rb_inc_page(cpu_buffer, &iter->head_page);
iter->read_stamp = iter->head_page->page->time_stamp;
+ iter->read_walltimestamp = iter->head_page->page->time_stamp;
+ iter->read_walltime[0] = iter->head_page->page->walltime[0];
+ iter->read_walltime[1] = iter->head_page->page->walltime[1];
+ iter->read_walltime[2] = iter->head_page->page->walltime[2];
iter->head = 0;
}
@@ -1927,8 +1944,48 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
* If this is the first commit on the page, then update
* its timestamp.
*/
- if (!tail)
+ if (!tail) {
+ struct timespec timespec;
tail_page->page->time_stamp = *ts;
+ /*
+ * Fixme:
+ * 1: Get walltime close to get timestamp
+ * 2: don't need to set timestamp in rb_move_tail(), timestamp
+ * will be rewritten here
+ */
+ /*
+ * Fixme:
+ * We can't call getnstimeofday() here,
+ * because when a function is called with xtime_lock held,
+ * kernel will run to here by function_trace, then
+ * getnstimeofday() will run into an infinite loop, for example:
+ * tick_do_update_jiffies64() * write_seqlock(xtime_lock) *
+ * ->do_timer()
+ * ->update_wall_time()
+ * ->update_xtime_cache()
+ * ->ftrace_call() * here into function trace *
+ * ->function_test_events_call()
+ * ->trace_current_buffer_lock_reserve()
+ * -> ...
+ * -> here
+ *
+ * So we use get_seconds() when xtime_lock is held,
+ * it is only a temporary way just to get walltime function
+ * work, we must fix it.
+ *
+ * Another way is call ftrace_disable_cpu() before
+ * write_seqlock().
+ */
+ if (xtime_lock.sequence & 1) {
+ timespec.tv_sec = get_seconds();
+ timespec.tv_nsec = 0;
+ } else {
+ getnstimeofday(&timespec);
+ }
+ tail_page->page->walltime[0] = timespec.tv_nsec;
+ *(__kernel_time_t *)(tail_page->page->walltime + 1) =
+ timespec.tv_sec;
+ }
return event;
}
@@ -2674,10 +2731,19 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
}
- if (iter->head)
+ if (iter->head) {
iter->read_stamp = cpu_buffer->read_stamp;
- else
+ iter->read_walltimestamp = cpu_buffer->read_walltimestamp;
+ iter->read_walltime[0] = cpu_buffer->read_walltime[0];
+ iter->read_walltime[1] = cpu_buffer->read_walltime[1];
+ iter->read_walltime[2] = cpu_buffer->read_walltime[2];
+ } else {
iter->read_stamp = iter->head_page->page->time_stamp;
+ iter->read_walltimestamp = iter->head_page->page->time_stamp;
+ iter->read_walltime[0] = iter->head_page->page->walltime[0];
+ iter->read_walltime[1] = iter->head_page->page->walltime[1];
+ iter->read_walltime[2] = iter->head_page->page->walltime[2];
+ }
}
/**
@@ -2780,6 +2846,21 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
return;
}
+static void rb_cal_walltime(u32 *walltime, u64 ts, u32 *base_walltime,
+ u64 base_ts)
+{
+ u64 sec;
+ u64 nsec;
+ sec = *(__kernel_time_t *)(base_walltime + 1);
+ nsec = base_walltime[0] + ts - base_ts;
+ while (nsec >= NSEC_PER_SEC) {
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ walltime[0] = nsec;
+ *(__kernel_time_t *)(walltime + 1) = sec;
+}
+
static struct buffer_page *
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -2953,12 +3034,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
}
static struct ring_buffer_event *
-rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+ struct timespec *timespec)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
struct buffer_page *reader;
int nr_loops = 0;
+ u64 timestamp;
cpu_buffer = buffer->buffers[cpu];
@@ -3004,10 +3087,21 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
goto again;
case RINGBUF_TYPE_DATA:
- if (ts) {
- *ts = cpu_buffer->read_stamp + event->time_delta;
+ if (ts || timespec) {
+ timestamp = cpu_buffer->read_stamp + event->time_delta;
ring_buffer_normalize_time_stamp(buffer,
- cpu_buffer->cpu, ts);
+ cpu_buffer->cpu,
+ &timestamp);
+ }
+ if (ts)
+ *ts = timestamp;
+ if (timespec) {
+ u32 walltime[3];
+ rb_cal_walltime(walltime, timestamp,
+ cpu_buffer->read_walltime,
+ cpu_buffer->read_walltimestamp);
+ timespec->tv_nsec = walltime[0];
+ timespec->tv_sec = *(__kernel_time_t *)(walltime + 1);
}
return event;
@@ -3020,12 +3114,13 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
EXPORT_SYMBOL_GPL(ring_buffer_peek);
static struct ring_buffer_event *
-rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts, struct timespec *timespec)
{
struct ring_buffer *buffer;
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
int nr_loops = 0;
+ u64 timestamp;
if (ring_buffer_iter_empty(iter))
return NULL;
@@ -3070,10 +3165,20 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
goto again;
case RINGBUF_TYPE_DATA:
- if (ts) {
- *ts = iter->read_stamp + event->time_delta;
+ if (ts || timespec) {
+ timestamp = iter->read_stamp + event->time_delta;
ring_buffer_normalize_time_stamp(buffer,
- cpu_buffer->cpu, ts);
+ cpu_buffer->cpu,
+ &timestamp);
+ }
+ if (ts)
+ *ts = timestamp;
+ if (timespec) {
+ u32 walltime[3];
+ rb_cal_walltime(walltime, timestamp,
+ iter->read_walltime, iter->read_walltimestamp);
+ timespec->tv_nsec = walltime[0];
+ timespec->tv_sec = *(__kernel_time_t *)(walltime + 1);
}
return event;
@@ -3110,7 +3215,8 @@ static inline int rb_ok_to_lock(void)
* not consume the data.
*/
struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+ struct timespec *timespec)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
@@ -3125,7 +3231,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
local_irq_save(flags);
if (dolock)
spin_lock(&cpu_buffer->reader_lock);
- event = rb_buffer_peek(buffer, cpu, ts);
+ event = rb_buffer_peek(buffer, cpu, ts, timespec);
+
if (dolock)
spin_unlock(&cpu_buffer->reader_lock);
local_irq_restore(flags);
@@ -3147,7 +3254,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
* not increment the iterator.
*/
struct ring_buffer_event *
-ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts,
+ struct timespec *timespec)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
struct ring_buffer_event *event;
@@ -3155,7 +3263,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
again:
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- event = rb_iter_peek(iter, ts);
+ event = rb_iter_peek(iter, ts, timespec);
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
if (event && event->type_len == RINGBUF_TYPE_PADDING) {
@@ -3175,7 +3283,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
* and eventually empty the ring buffer if the producer is slower.
*/
struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+ struct timespec *timespec)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event = NULL;
@@ -3196,7 +3305,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
if (dolock)
spin_lock(&cpu_buffer->reader_lock);
- event = rb_buffer_peek(buffer, cpu, ts);
+ event = rb_buffer_peek(buffer, cpu, ts, timespec);
if (!event)
goto out_unlock;
@@ -3287,7 +3396,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
* This reads the next event in the ring buffer and increments the iterator.
*/
struct ring_buffer_event *
-ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts,
+ struct timespec *timespec)
{
struct ring_buffer_event *event;
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
@@ -3295,7 +3405,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
again:
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- event = rb_iter_peek(iter, ts);
+ event = rb_iter_peek(iter, ts, timespec);
if (!event)
goto out;
@@ -3353,6 +3463,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->write_stamp = 0;
cpu_buffer->read_stamp = 0;
+ cpu_buffer->read_walltimestamp = 0;
+ cpu_buffer->read_walltime[0] = 0;
+ cpu_buffer->read_walltime[1] = 0;
+ cpu_buffer->read_walltime[2] = 0;
rb_head_page_activate(cpu_buffer);
}
@@ -3620,6 +3734,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
unsigned int commit;
unsigned int read;
u64 save_timestamp;
+ u32 save_walltime[3];
int ret = -1;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -3679,6 +3794,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
/* save the current timestamp, since the user will need it */
save_timestamp = cpu_buffer->read_stamp;
+ rb_cal_walltime(save_walltime, cpu_buffer->read_stamp,
+ cpu_buffer->read_walltime,
+ cpu_buffer->read_walltimestamp);
/* Need to copy one event at a time */
do {
@@ -3697,6 +3815,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
/* update bpage */
local_set(&bpage->commit, pos);
bpage->time_stamp = save_timestamp;
+ bpage->walltime[0] = save_walltime[0];
+ bpage->walltime[1] = save_walltime[1];
+ bpage->walltime[2] = save_walltime[2];
/* we copied everything to the beginning */
read = 0;
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 573d3cc..059a728 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -57,8 +57,9 @@ static enum event_status read_event(int cpu)
struct ring_buffer_event *event;
int *entry;
u64 ts;
+ struct timespec timespec;
- event = ring_buffer_consume(buffer, cpu, &ts);
+ event = ring_buffer_consume(buffer, cpu, &ts, &timespec);
if (!event)
return EVENT_DROPPED;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e30e6b1..22787b3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1393,7 +1393,7 @@ static void trace_iterator_increment(struct trace_iterator *iter)
iter->idx++;
if (iter->buffer_iter[iter->cpu])
- ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+ ring_buffer_read(iter->buffer_iter[iter->cpu], NULL, NULL);
ftrace_enable_cpu();
}
@@ -1408,9 +1408,9 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
ftrace_disable_cpu();
if (buf_iter)
- event = ring_buffer_iter_peek(buf_iter, ts);
+ event = ring_buffer_iter_peek(buf_iter, ts, NULL);
else
- event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
+ event = ring_buffer_peek(iter->tr->buffer, cpu, ts, NULL);
ftrace_enable_cpu();
@@ -1489,7 +1489,7 @@ static void trace_consume(struct trace_iterator *iter)
{
/* Don't allow ftrace to trace into the ring buffers */
ftrace_disable_cpu();
- ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
+ ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, NULL);
ftrace_enable_cpu();
}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index abf7c4a..03520a5 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -340,12 +340,12 @@ get_return_for_leaf(struct trace_iterator *iter,
/* First peek to compare current entry and the next one */
if (ring_iter)
- event = ring_buffer_iter_peek(ring_iter, NULL);
+ event = ring_buffer_iter_peek(ring_iter, NULL, NULL);
else {
/* We need to consume the current entry to see the next one */
- ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+ ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL, NULL);
event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
- NULL);
+ NULL, NULL);
}
if (!event)
@@ -362,7 +362,7 @@ get_return_for_leaf(struct trace_iterator *iter,
/* this is a leaf, now advance the iterator */
if (ring_iter)
- ring_buffer_read(ring_iter, NULL);
+ ring_buffer_read(ring_iter, NULL, NULL);
return next;
}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 71f2edb..e6fa0d1 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
struct trace_entry *entry;
unsigned int loops = 0;
- while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
+ while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
entry = ring_buffer_event_data(event);
/*
--
1.5.5.3
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists