lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Fri, 25 Mar 2016 16:02:38 -0700
From:	Andi Kleen <andi@...stfloor.org>
To:	acme@...nel.org
Cc:	jolsa@...nel.org, linux-kernel@...r.kernel.org,
	Andi Kleen <ak@...ux.intel.com>
Subject: [PATCH 4/4] perf, tools, script: Add brstackasm output for branch stacks

From: Andi Kleen <ak@...ux.intel.com>

Implement printing of fully disassembled instruction sequences for branch
stacks in perf script. This makes it possible to print the hot paths of
individual samples directly, together with branch misprediction and even
cycle count information.

% perf record -b ...
% perf script -F brstackasm
...
        00007f0668d54e88        movsx (%rsi), %ecx
        00007f0668d54e8b        lea -0x30(%rcx), %eax
        00007f0668d54e8e        cmp $0x9, %al
        00007f0668d54e90        jbe 0x68d54eaf
        00007f0668d54e92        cmp %cl, %dl
        00007f0668d54e94        jnz 0x68d54eb5
        00007f0668d54e96        add $0x1, %rdi
        00007f0668d54e9a        movsx (%rdi), %edx
        00007f0668d54e9d        add $0x1, %rsi
        00007f0668d54ea1        test %dl, %dl
        00007f0668d54ea3        jnz _dl_cache_libcmp+11       # PRED 21 cycles
        00007f0668d54dfb        lea -0x30(%rdx), %eax
        00007f0668d54dfe        cmp $0x9, %al
        00007f0668d54e00        ja _dl_cache_libcmp+152       # PRED 2 cycles
        00007f0668d54e88        movsx (%rsi), %ecx
        00007f0668d54e8b        lea -0x30(%rcx), %eax
        00007f0668d54e8e        cmp $0x9, %al
        00007f0668d54e90        jbe 0x68d54eaf
        00007f0668d54e92        cmp %cl, %dl
        00007f0668d54e94        jnz 0x68d54eb5                # PRED 3 cycles
        00007f0668d54eb5        movsx %dl, %eax
        00007f0668d54eb8        sub %ecx, %eax
        00007f0668d54eba        ret                           # PRED 1 cycles
        00007f0668d54fae        test %eax, %eax
        00007f0668d54fb0        jz _dl_load_cache_lookup+688
        00007f0668d54fb6        jns 0x68d54f70
        00007f0668d54fb8        lea 0x1(%r14), %ebx
        00007f0668d54fbc        cmp %r15d, %ebx
        00007f0668d54fbf        nop
        00007f0668d54fc0        jle 0x68d54f79                # PRED 2 cycles

Open issues:
- Occasionally the path does not reach up to the sample IP, as the LBRs
may have been frozen earlier.

Signed-off-by: Andi Kleen <ak@...ux.intel.com>
---
 tools/perf/Documentation/perf-script.txt |   9 +-
 tools/perf/builtin-script.c              | 191 ++++++++++++++++++++++++++++++-
 2 files changed, 195 insertions(+), 5 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index c834f4d..4a30f02 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@ OPTIONS
 --fields::
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
-	srcline, period, iregs, brstack, brstacksym, flags, asm.
+	srcline, period, iregs, brstack, brstacksym, flags, asm, brstackasm.
         Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -f sw:comm,tid,time,ip,sym  and -f trace:time,cpu,trace
@@ -176,17 +176,22 @@ OPTIONS
 	i.e., -f "" is not allowed.
 
 	The brstack output includes branch related information with raw addresses using the
-	/v/v/v/v/ syntax in the following order:
+	/v/v/v/v/cycles syntax in the following order:
 	FROM: branch source instruction
 	TO  : branch target instruction
         M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
 	X/- : X=branch inside a transactional region, -=not in transaction region or not supported
 	A/- : A=TSX abort entry, -=not aborted region or not supported
+	cycles
 
 	The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
 
 	When asm is specified the assembler instruction of each sample is printed in disassembled form.
 
+	When brstackasm is specified the fully disassembled instruction sequences between the branches
+	of each sample are printed. This is the full execution path leading to the sample. This is only
+	supported when the sample was recorded with perf record -b or -j any.
+
 -k::
 --vmlinux=<file>::
         vmlinux pathname
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 706ece8..766242b 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -42,6 +42,7 @@ static bool			nanosecs;
 static const char		*cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 static struct perf_stat_config	stat_config;
+static int 			max_blocks;
 
 unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
 
@@ -67,6 +68,7 @@ enum perf_output_field {
 	PERF_OUTPUT_WEIGHT	    = 1U << 18,
 	PERF_OUTPUT_BPF_OUTPUT	    = 1U << 19,
 	PERF_OUTPUT_ASM		    = 1U << 20,
+	PERF_OUTPUT_BRSTACKASM	    = 1U << 21,
 };
 
 struct output_option {
@@ -94,6 +96,7 @@ struct output_option {
 	{.str = "weight",   .field = PERF_OUTPUT_WEIGHT},
 	{.str = "bpf-output",   .field = PERF_OUTPUT_BPF_OUTPUT},
 	{.str = "asm", .field = PERF_OUTPUT_ASM},
+	{.str = "brstackasm", .field = PERF_OUTPUT_BRSTACKASM},
 };
 
 /* default set to maintain compatibility with current format */
@@ -293,6 +296,13 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 		       "selected.\n");
 		return -EINVAL;
 	}
+	if (PRINT_FIELD(BRSTACKASM) &&
+	    !(perf_evlist__combined_branch_type(session->evlist) &
+	      PERF_SAMPLE_BRANCH_ANY)) {
+		pr_err("Display of branch stack assembler requested, but non all-branch filter set\n");
+		return -EINVAL;
+	}
+
 	if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
 		perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
 					PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -462,10 +472,10 @@ static const char *dis_resolve(struct ud *u, uint64_t addr, int64_t *off)
 	if (!al.sym)
 		return NULL;
 
-	if (addr < al.sym->end)
-		*off = addr - al.sym->start;
+	if (al.addr < al.sym->end)
+		*off = al.addr - al.sym->start;
 	else
-		*off = addr - al.map->start - al.sym->start;
+		*off = al.addr - al.map->start - al.sym->start;
 	return al.sym->name;
 }
 #endif
@@ -630,6 +640,176 @@ static void print_sample_brstacksym(union perf_event *event __maybe_unused,
 	}
 }
 
+#ifdef HAVE_UDIS86
+#define MAXBB 16384UL
+#define MAXINSN 16
+
+/*
+ * Read the code bytes for start..end, plus MAXINSN slack, into buffer.
+ * Returns dso__data_read_offset()'s result, or 0 if the range is unusable.
+ */
+static int grab_bb(char *buffer, u64 start, u64 end,
+		    struct machine *machine, struct thread *thread,
+		    bool *is64bit, u8 *cpumode)
+{
+	u64 offset;
+	int len;
+	struct addr_location al;
+	bool kernel;
+
+	if (!start || !end)
+		return 0;
+
+	kernel = machine__kernel_ip(machine, start);
+	if (kernel)
+		*cpumode = PERF_RECORD_MISC_KERNEL;
+	else
+		*cpumode = PERF_RECORD_MISC_USER;
+	if (kernel != machine__kernel_ip(machine, end))
+		return 0;
+
+	memset(&al, 0, sizeof(al));
+	if (end - start > MAXBB - MAXINSN) {
+		pr_debug("\tbasic block %" PRIx64 "-%" PRIx64 " (%" PRIu64 ") too long to dump\n",
+		       start, end, end - start);
+		return 0;
+	}
+
+	thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
+	if (!al.map || !al.map->dso ||
+	    al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
+		printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end);
+		return 0;
+	}
+
+	/* Load maps to ensure dso->is_64_bit has been updated */
+	map__load(al.map, machine->symbol_filter);
+
+	/* Keep the map_ip() result in a u64 so large offsets are not truncated */
+	offset = al.map->map_ip(al.map, start);
+	len = dso__data_read_offset(al.map->dso, machine, offset,
+				    (u8 *)buffer, end - start + MAXINSN);
+
+	*is64bit = al.map->dso->is_64_bit;
+	return len;
+}
+#endif
+
+/*
+ * Print the disassembled code between consecutive branch stack entries,
+ * from the last used entry down to entry 0, tagging each block's final
+ * branch with its flags and cycles, then entry 0's target up to sample->ip.
+ */
+static void print_sample_brstackasm(union perf_event *event __maybe_unused,
+				    struct perf_sample *sample,
+				    struct thread *thread __maybe_unused,
+				    struct perf_event_attr *attr __maybe_unused,
+				    struct machine *machine __maybe_unused)
+{
+#ifdef HAVE_UDIS86
+	struct branch_stack *br = sample->branch_stack;
+	u64 start, end;
+	static bool ud_initialized = false;
+	static struct perf_ud ud;
+	char buffer[MAXBB];
+	int i, len, nr;
+	bool last, is64bit;
+
+	if (!(br && br->nr))
+		return;
+	nr = br->nr;
+	if (max_blocks && nr > max_blocks + 1)
+		nr = max_blocks + 1;
+
+	if (!ud_initialized) {
+		ud_initialized = true;
+		ud_init(&ud.ud_obj);
+		ud_set_syntax(&ud.ud_obj, UD_SYN_ATT);
+		ud_set_sym_resolver(&ud.ud_obj, dis_resolve);
+	}
+	ud.thread = thread;
+	ud.cpu = sample->cpu;
+
+	putchar('\n');
+	for (i = nr - 2; i >= 0; i--) {
+		if (br->entries[i].from || br->entries[i].to)
+			printf("%d: %" PRIx64 "-%" PRIx64 "\n", i,
+				br->entries[i].from, br->entries[i].to);
+		start = br->entries[i + 1].to;
+		end = br->entries[i].from;
+
+		/* Extra bytes cover the final jump, whose length is unknown */
+		len = grab_bb(buffer, start, end + MAXINSN,
+				machine, thread, &is64bit,
+				&ud.cpumode);
+		if (len <= 0)
+			continue;
+
+		ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+		ud_set_pc(&ud.ud_obj, start);
+		ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+		last = false;
+		while (!last && ud_disassemble(&ud.ud_obj)) {
+			if (ud_insn_ptr(&ud.ud_obj) ==
+					(uint8_t *)buffer + end - start) {
+				printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s",
+					ud_insn_off(&ud.ud_obj),
+					ud_insn_asm(&ud.ud_obj),
+					br->entries[i].flags.predicted ? " PRED" : "",
+					br->entries[i].flags.mispred ? " MISPRED" : "",
+					br->entries[i].flags.in_tx ? " INTX" : "",
+					br->entries[i].flags.abort ? " ABORT" : "");
+				if (br->entries[i].flags.cycles)
+					printf(" %d cycles", br->entries[i].flags.cycles);
+				/* End the line only after the cycle count */
+				putchar('\n');
+				last = true;
+			} else {
+				printf("\t%016" PRIx64 "\t%s\n",
+					ud_insn_off(&ud.ud_obj),
+					ud_insn_asm(&ud.ud_obj));
+			}
+		}
+	}
+
+	/*
+	 * Hit the branch? In this case we are already done, and the target
+	 * has not been executed yet.
+	 */
+	if (br->entries[0].from == sample->ip)
+		return;
+	if (br->entries[0].flags.abort)
+		return;
+
+	/* Print final block up to sample */
+	start = br->entries[0].to;
+	end = sample->ip;
+	len = grab_bb(buffer, start, end, machine, thread, &is64bit,
+			&ud.cpumode);
+	if (len <= 0) {
+		/* Print at least last IP if basic block did not work */
+		len = grab_bb(buffer, sample->ip, sample->ip + MAXINSN,
+				machine, thread, &is64bit, &ud.cpumode);
+		if (len <= 0)
+			return;
+		/* Feed the refilled buffer to udis86 with its valid length */
+		ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+		ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+		ud_set_pc(&ud.ud_obj, sample->ip);
+		if (ud_disassemble(&ud.ud_obj))
+			printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+			       ud_insn_asm(&ud.ud_obj));
+		return;
+	}
+	ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+	ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+	ud_set_pc(&ud.ud_obj, start);
+	while (ud_disassemble(&ud.ud_obj) &&
+		ud_insn_ptr(&ud.ud_obj) <= (uint8_t *)buffer + end - start)
+		printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+			       ud_insn_asm(&ud.ud_obj));
+#endif
+}
 
 static void print_sample_addr(union perf_event *event,
 			  struct perf_sample *sample,
@@ -909,6 +1089,9 @@ print_rest:
 	if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
 		print_sample_bpf_output(sample);
 
+	if (PRINT_FIELD(BRSTACKASM))
+		print_sample_brstackasm(event, sample, thread, attr,
+					machine);
 	if (PRINT_FIELD(ASM))
 		print_sample_asm(event, sample, thread, attr, al, machine);
 
@@ -2140,6 +2323,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "Show the mmap events"),
 	OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
 		    "Show context switch events (if recorded)"),
+	OPT_INTEGER(0, "max-blocks", &max_blocks,
+		    "Maximum number of code blocks to dump with brstackasm"),
 	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
 	OPT_BOOLEAN(0, "ns", &nanosecs,
 		    "Use 9 decimal places when displaying time"),
-- 
2.5.5

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ