lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Wed,  3 Mar 2010 07:55:01 +0100
From:	Frederic Weisbecker <fweisbec@...il.com>
To:	Ingo Molnar <mingo@...e.hu>
Cc:	LKML <linux-kernel@...r.kernel.org>,
	Frederic Weisbecker <fweisbec@...il.com>,
	Ingo Molnar <mingo@...e.hu>,
	Thomas Gleixner <tglx@...utronix.de>,
	"H. Peter Anvin" <hpa@...or.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Paul Mackerras <paulus@...ba.org>,
	Steven Rostedt <rostedt@...dmis.org>,
	Arnaldo Carvalho de Melo <acme@...hat.com>
Subject: [RFC][PATCH 2/3] perf: Take a hot regs snapshot for trace events

We are taking a wrong regs snapshot when a trace event triggers.
Either we use get_irq_regs(), which gives us the interrupted
registers if we are in an interrupt, or we use task_pt_regs()
which gives us the state before we entered the kernel, assuming
we are lucky enough to be no kernel thread, in which case
task_pt_regs() returns the initial set of regs when the kernel
thread was started.

What we want is different. We need a hot snapshot of the regs,
so that we can get the instruction pointer to record in the
sample, the frame pointer for the callchain, and some other
things.

In order to achieve that, we introduce PERF_SAVE_REGS() that
takes a live snapshot of the regs we are interested in, and
we use it from perf_tp_event().

Comparison with perf record -e lock: -R -a -f -g
Before:

        perf  [kernel]                   [k] __do_softirq
               |
               --- __do_softirq
                  |
                  |--55.16%-- __open
                  |
                   --44.84%-- __write_nocancel

After:

            perf  [kernel]           [k] perf_tp_event
               |
               --- perf_tp_event
                  |
                  |--41.07%-- ftrace_profile_lock_acquire
                  |          lock_acquire
                  |          |
                  |          |--39.36%-- _raw_spin_lock
                  |          |          |
                  |          |          |--7.81%-- hrtimer_interrupt
                  |          |          |          smp_apic_timer_interrupt
                  |          |          |          apic_timer_interrupt

The old case was producing unreliable callchains. Now having
right frame and instruction pointers, we have the trace we
want. We can even think about implementing a "skip" field in
PERF_SAVE_REGS() to rewind to the caller "n" (here n would
be equal to 2 so that we start on lock_acquire()).

Signed-off-by: Frederic Weisbecker <fweisbec@...il.com>
Cc: Ingo Molnar <mingo@...e.hu>
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: H. Peter Anvin <hpa@...or.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Paul Mackerras <paulus@...ba.org>
Cc: Steven Rostedt <rostedt@...dmis.org>
Cc: Arnaldo Carvalho de Melo <acme@...hat.com>
---
 arch/x86/include/asm/asm.h        |    1 +
 arch/x86/include/asm/perf_event.h |   25 +++++++++++++++++++++++++
 kernel/perf_event.c               |   19 ++++++++++++++-----
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index b3ed1e1..d86e7dc 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -27,6 +27,7 @@
 #define _ASM_ADD	__ASM_SIZE(add)
 #define _ASM_SUB	__ASM_SIZE(sub)
 #define _ASM_XADD	__ASM_SIZE(xadd)
+#define _ASM_POP	__ASM_SIZE(pop)
 
 #define _ASM_AX		__ASM_REG(ax)
 #define _ASM_BX		__ASM_REG(bx)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index befd172..a166fa5 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -122,6 +122,31 @@ union cpuid10_edx {
 extern void init_hw_perf_events(void);
 extern void perf_events_lapic_init(void);
 
+/*
+ * Take a snapshot of the regs. We only need a few of them:
+ * ip for PERF_SAMPLE_IP
+ * cs for user_mode() tests
+ * bp for callchains
+ * eflags, for future purposes, just in case
+ */
+#define PERF_SAVE_REGS(regs)				{\
+	memset((regs), 0, sizeof(*(regs)));		\
+	(regs)->cs = __KERNEL_CS;			\
+							\
+	asm volatile(					\
+		"call 1f\n"				\
+		"1:\n"					\
+		_ASM_POP " %[ip]\n"			\
+		_ASM_MOV "%%" _ASM_BP ", %[bp]\n"	\
+		"pushf\n"				\
+		_ASM_POP " %[flags]\n"			\
+		:  [ip]    "=m" ((regs)->ip),		\
+		   [bp]    "=m" ((regs)->bp),		\
+		   [flags] "=m" ((regs)->flags)		\
+		:: "memory"				\
+	);						\
+}
+
 #define PERF_EVENT_INDEX_OFFSET			0
 
 #else
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a661e79..f412be1 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2807,6 +2807,16 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 }
 
 /*
+ * Hot regs snapshot support -- arch specific
+ * This needs to be a macro because we want the current
+ * frame pointer.
+ */
+#ifndef PERF_SAVE_REGS
+#define PERF_SAVE_REGS(regs)	memset(regs, 0, sizeof(*regs));
+#endif
+
+
+/*
  * Output
  */
 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
@@ -4337,6 +4347,8 @@ static const struct pmu perf_ops_task_clock = {
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 			  int entry_size)
 {
+	struct pt_regs regs;
+
 	struct perf_raw_record raw = {
 		.size = entry_size,
 		.data = record,
@@ -4347,14 +4359,11 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 		.raw = &raw,
 	};
 
-	struct pt_regs *regs = get_irq_regs();
-
-	if (!regs)
-		regs = task_pt_regs(current);
+	PERF_SAVE_REGS(&regs);
 
 	/* Trace events already protected against recursion */
 	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-				&data, regs);
+				&data, &regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
-- 
1.6.2.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists