Message-ID: <1332433596.2487.33.camel@twins>
Date: Thu, 22 Mar 2012 17:26:36 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: mingo@...nel.org
Cc: Stephane Eranian <eranian@...gle.com>,
Vince Weaver <vince@...ter.net>,
Arnaldo Carvalho de Melo <acme@...radead.org>,
Jiri Olsa <jolsa@...hat.com>,
linux-kernel <linux-kernel@...r.kernel.org>
Subject: [PATCH] perf: mmap_page capabilities and docs
And now with LKML cc'ed as well.
I saw that Ingo merged my RDPMC patches this merge window... sadly they
weren't quite ready; however, this forced me into finishing the work, so
it's not too bad.
Find below the patch that should complete the feature and address all
complaints, namely:
- capabilities, so we can detect what is actually available at runtime;
- rdpmc weirdness due to the counters being 40/48 bits wide and not
  sign-extending properly (see the sketch below);
- documentation as to how all this stuff works.
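For reference, the user-space read sequence using the new bits then looks
roughly like the below (untested sketch; rdpmc() and barrier() are the
obvious helpers, and the time scaling is left out -- see the mmap_page
comments in the patch for the full story):

  #include <stdint.h>
  #include <linux/perf_event.h>

  #define barrier() asm volatile("" ::: "memory")

  static uint64_t rdpmc(uint32_t counter)
  {
          uint32_t low, high;

          asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
          return low | ((uint64_t)high << 32);
  }

  /* pc is the mmap()'ed first page of the perf event fd */
  static uint64_t read_count(volatile struct perf_event_mmap_page *pc)
  {
          uint32_t seq, idx, width;
          uint64_t count;
          int64_t pmc;

          do {
                  seq = pc->lock;
                  barrier();

                  idx = pc->index;
                  count = pc->offset;
                  if (pc->cap_usr_rdpmc && idx) {
                          width = pc->pmc_width;
                          pmc = rdpmc(idx - 1);
                          pmc <<= 64 - width;     /* sign-extend the   */
                          pmc >>= 64 - width;     /* 40/48 bit counter */
                          count += pmc;
                  }

                  barrier();
          } while (pc->lock != seq);

          return count;
  }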
Also, find attached a tool that uses this feature. It's not quite solid
yet, but it should illustrate enough to get there.
The tool, profviz, uses -finstrument-functions hooks to measure events
at function-level granularity. It also constructs a call-graph using
these hooks and presents the output in a (graphviz) .dot file.
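(For reference: -finstrument-functions makes gcc emit calls to
__cyg_profile_func_enter()/__cyg_profile_func_exit(), so very roughly the
preloaded library does something like the below -- the attached profviz.c
has the real thing.)

  /* rough sketch only, not the actual profviz.c */
  void __attribute__((no_instrument_function))
  __cyg_profile_func_enter(void *func, void *caller)
  {
          /* push func on a per-thread call stack, snapshot the counters */
  }

  void __attribute__((no_instrument_function))
  __cyg_profile_func_exit(void *func, void *caller)
  {
          /* snapshot again, charge the delta to func and the caller->func arc */
  }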
The call arcs should be weighted by the fraction (percentage) of time
spent in the callee vs the caller -- XXX not quite working right.
Nodes are coloured (red) according to their contribution to total
runtime. Arcs are coloured (green) according to their fraction of time.
Usage:
gcc -finstrument-functions -o my_prog main.c
LD_PRELOAD=profviz.so ./my_prog
xdot prof.dot
or
dot -Tsvg -o prof.svg prof.dot
The attached prof.svg is example output of:
LD_PRELOAD=./profviz.so ./perf report
---
Subject: perf: mmap_page capabilities and docs
From: Peter Zijlstra <a.p.zijlstra@...llo.nl>
Date: Thu Mar 22 15:39:40 CET 2012
Add a capabilities field to perf_event_mmap_page to indicate what is
actually available for use.
Also improve the documentation for the new features.
Cc: Stephane Eranian <eranian@...gle.com>
Cc: Vince Weaver <vweaver1@...s.utk.edu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
---
arch/x86/kernel/cpu/perf_event.c | 10 ++++
include/linux/perf_event.h | 83 ++++++++++++++++++++++++++++++++++-----
kernel/events/core.c | 4 -
3 files changed, 84 insertions(+), 13 deletions(-)
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1621,6 +1621,9 @@ static int x86_pmu_event_idx(struct perf
 {
 	int idx = event->hw.idx;
 
+	if (!x86_pmu.attr_rdpmc)
+		return 0;
+
 	if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
 		idx -= X86_PMC_IDX_FIXED;
 		idx |= 1 << 30;
@@ -1705,14 +1708,19 @@ static struct pmu pmu = {
 	.flush_branch_stack	= x86_pmu_flush_branch_stack,
 };
 
-void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
+	userpg->cap_usr_time = 0;
+	userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
+	userpg->pmc_width = x86_pmu.cntval_bits;
+
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
 		return;
 
 	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 		return;
 
+	userpg->cap_usr_time = 1;
 	userpg->time_mult = this_cpu_read(cyc2ns);
 	userpg->time_shift = CYC2NS_SCALE_FACTOR;
 	userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -299,18 +299,31 @@ struct perf_event_mmap_page {
/*
* Bits needed to read the hw events in user-space.
*
- * u32 seq;
- * s64 count;
+ * u32 seq, time_mult, time_shift, idx, width;
+ * u64 count, enabled, running;
+ * u64 cyc, time_offset;
+ * s64 pmc = 0;
*
* do {
* seq = pc->lock;
- *
* barrier()
- * if (pc->index) {
- * count = pmc_read(pc->index - 1);
- * count += pc->offset;
- * } else
- * goto regular_read;
+ *
+ * enabled = pc->time_enabled;
+ * running = pc->time_running;
+ *
+ * if (pc->cap_usr_time && enabled != running) {
+ * cyc = rdtsc();
+ * time_offset = pc->time_offset;
+ * time_mult = pc->time_mult;
+ * time_shift = pc->time_shift;
+ * }
+ *
+ * idx = pc->index;
+ * count = pc->offset;
+ * if (pc->cap_usr_rdpmc && idx) {
+ * width = pc->pmc_width;
+ * pmc = rdpmc(idx - 1);
+ * }
*
* barrier();
* } while (pc->lock != seq);
@@ -323,14 +336,57 @@ struct perf_event_mmap_page {
__s64 offset; /* add to hardware event value */
__u64 time_enabled; /* time event active */
__u64 time_running; /* time event on cpu */
- __u32 time_mult, time_shift;
+ union {
+ __u64 capabilities;
+ __u64 cap_usr_time : 1,
+ cap_usr_rdpmc : 1,
+ cap_____res : 62;
+ };
+
+ /*
+ * If cap_usr_rdpmc, this field provides the bit-width of the value
+ * read using the rdpmc() or equivalent instruction. This can be used
+ * to sign extend the result like:
+ *
+ * pmc <<= 64 - width;
+ * pmc >>= 64 - width; // signed shift right
+ * count += pmc;
+ */
+ __u16 pmc_width;
+
+ /*
+ * If cap_usr_time, the fields below can be used to compute the time
+ * delta since time_enabled (in ns) using rdtsc or similar.
+ *
+ * u64 quot, rem;
+ * u64 delta;
+ *
+ * quot = (cyc >> time_shift);
+ * rem = cyc & ((1 << time_shift) - 1);
+ * delta = time_offset + quot * time_mult +
+ * ((rem * time_mult) >> time_shift);
+ *
+ * Where time_offset, time_mult, time_shift and cyc are read in the
+ * seqcount loop described above. This delta can then be added to
+ * enabled and possibly running (if idx), improving the scaling:
+ *
+ * enabled += delta;
+ * if (idx)
+ * running += delta;
+ *
+ * quot = count / running;
+ * rem = count % running;
+ * count = quot * enabled + (rem * enabled) / running;
+ */
+ __u16 time_shift;
+ __u32 time_mult;
__u64 time_offset;
/*
* Hole for extension of the self monitor capabilities
*/
- __u64 __reserved[121]; /* align to 1k */
+ __u64 __reserved[120]; /* align to 1k */
/*
* Control data for the mmap() data buffer.
@@ -347,6 +403,13 @@ struct perf_event_mmap_page {
__u64 data_tail; /* user-space written tail */
};
+/*
+ * Build time assertion that we keep the data_head at the intended location.
+ * IOW, validation that we got the __reserved[] size right.
+ */
+extern char __assert_mmap_data_head_offset
+ [1 - 2*!!(offsetof(struct perf_event_mmap_page, data_head) != 1024)];
+
#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0)
#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0)
#define PERF_RECORD_MISC_KERNEL (1 << 0)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3348,7 +3348,7 @@ static void calc_timer_values(struct per
*running = ctx_time - event->tstamp_running;
}
-void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
{
}
@@ -3398,7 +3398,7 @@ void perf_event_update_userpage(struct p
userpg->time_running = running +
atomic64_read(&event->child_total_time_running);
- perf_update_user_clock(userpg, now);
+ arch_perf_update_userpage(userpg, now);
barrier();
++userpg->lock;
View attachment "profviz.c" of type "text/x-csrc" (14549 bytes)
Download attachment "prof.svg" of type "image/svg+xml" (255756 bytes)