lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190222194244.GF26132@kernel.org>
Date:   Fri, 22 Feb 2019 16:42:44 -0300
From:   Arnaldo Carvalho de Melo <arnaldo.melo@...il.com>
To:     Adrian Hunter <adrian.hunter@...el.com>
Cc:     Jiri Olsa <jolsa@...hat.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 6/6] perf thread-stack: Hide x86 retpolines

Em Wed, Jan 09, 2019 at 11:18:35AM +0200, Adrian Hunter escreveu:
> x86 retpoline functions pollute the call graph by showing up everywhere
> there is an indirect branch, but they do not really mean anything. Make
> changes so that the default retpoline functions will no longer appear in
> the call graph. Note this only affects the call graph, since all the
> original branches are left unchanged.
> 
> This does not handle function return thunks, nor is there any improvement
> for the handling of inline thunks or extern thunks.
> 
> Example:
> 
> $ cat simple-retpoline.c
> __attribute__((noinline)) int bar(void)
> {
>         return -1;
> }
> 
> int foo(void)
> {
>         return bar() + 1;
> }
> 
> __attribute__((indirect_branch("thunk"))) int main()
> {
>         int (*volatile fn)(void) = foo;
> 
>         fn();
>         return fn();
> }
> $ gcc -ggdb3 -Wall -Wextra -O2 -o simple-retpoline simple-retpoline.c
> $ objdump -d simple-retpoline
> <SNIP>
> 0000000000001040 <main>:
>     1040:       48 83 ec 18             sub    $0x18,%rsp
>     1044:       48 8d 05 25 01 00 00    lea    0x125(%rip),%rax        # 1170 <foo>
>     104b:       48 89 44 24 08          mov    %rax,0x8(%rsp)
>     1050:       48 8b 44 24 08          mov    0x8(%rsp),%rax
>     1055:       e8 1f 01 00 00          callq  1179 <__x86_indirect_thunk_rax>
>     105a:       48 8b 44 24 08          mov    0x8(%rsp),%rax
>     105f:       48 83 c4 18             add    $0x18,%rsp
>     1063:       e9 11 01 00 00          jmpq   1179 <__x86_indirect_thunk_rax>
> <SNIP>
> 0000000000001160 <bar>:
>     1160:       b8 ff ff ff ff          mov    $0xffffffff,%eax
>     1165:       c3                      retq
> <SNIP>
> 0000000000001170 <foo>:
>     1170:       e8 eb ff ff ff          callq  1160 <bar>
>     1175:       83 c0 01                add    $0x1,%eax
>     1178:       c3                      retq
> 0000000000001179 <__x86_indirect_thunk_rax>:
>     1179:       e8 07 00 00 00          callq  1185 <__x86_indirect_thunk_rax+0xc>
>     117e:       f3 90                   pause
>     1180:       0f ae e8                lfence
>     1183:       eb f9                   jmp    117e <__x86_indirect_thunk_rax+0x5>
>     1185:       48 89 04 24             mov    %rax,(%rsp)
>     1189:       c3                      retq
> <SNIP>
> $ perf record -o simple-retpoline.perf.data -e intel_pt/cyc/u ./simple-retpoline
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0,017 MB simple-retpoline.perf.data ]
> $ perf script -i simple-retpoline.perf.data --itrace=be -s ~/libexec/perf-core/scripts/python/export-to-sqlite.py simple-retpoline.db branches calls
> 2019-01-08 14:03:37.851655 Creating database...
> 2019-01-08 14:03:37.863256 Writing records...
> 2019-01-08 14:03:38.069750 Adding indexes
> 2019-01-08 14:03:38.078799 Done
> $ ~/libexec/perf-core/scripts/python/exported-sql-viewer.py simple-retpoline.db
> 
> Before:
> 
>     main
>         -> __x86_indirect_thunk_rax
>             -> __x86_indirect_thunk_rax
>                 -> foo
>                     -> bar
> 
> After:
> 
>     main
>         -> foo
>             -> bar
> 
> Signed-off-by: Adrian Hunter <adrian.hunter@...el.com>
> ---
>  tools/perf/util/thread-stack.c | 112 ++++++++++++++++++++++++++++++++-
>  1 file changed, 109 insertions(+), 3 deletions(-)
> 
> diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c
> index 632c07a125ab..805e30836460 100644
> --- a/tools/perf/util/thread-stack.c
> +++ b/tools/perf/util/thread-stack.c
> @@ -20,6 +20,7 @@
>  #include "thread.h"
>  #include "event.h"
>  #include "machine.h"
> +#include "env.h"
>  #include "util.h"
>  #include "debug.h"
>  #include "symbol.h"
> @@ -29,6 +30,19 @@
>  
>  #define STACK_GROWTH 2048
>  
> +/*
> + * State of retpoline detection.
> + *
> + * RETPOLINE_NONE: no retpoline detection
> + * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
> + * X86_RETPOLINE_DETECTED: x86 retpoline detected
> + */
> +enum retpoline_state_t {
> +	RETPOLINE_NONE,
> +	X86_RETPOLINE_POSSIBLE,
> +	X86_RETPOLINE_DETECTED,
> +};
> +
>  /**
>   * struct thread_stack_entry - thread stack entry.
>   * @ret_addr: return address
> @@ -64,6 +78,7 @@ struct thread_stack_entry {
>   * @crp: call/return processor
>   * @comm: current comm
>   * @arr_sz: size of array if this is the first element of an array
> + * @rstate: used to detect retpolines
>   */
>  struct thread_stack {
>  	struct thread_stack_entry *stack;
> @@ -76,6 +91,7 @@ struct thread_stack {
>  	struct call_return_processor *crp;
>  	struct comm *comm;
>  	unsigned int arr_sz;
> +	enum retpoline_state_t rstate;
>  };
>  
>  /*
> @@ -115,10 +131,16 @@ static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
>  	if (err)
>  		return err;
>  
> -	if (thread->mg && thread->mg->machine)
> -		ts->kernel_start = machine__kernel_start(thread->mg->machine);
> -	else
> +	if (thread->mg && thread->mg->machine) {
> +		struct machine *machine = thread->mg->machine;
> +		const char *arch = perf_env__arch(machine->env);
> +
> +		ts->kernel_start = machine__kernel_start(machine);
> +		if (!strcmp(arch, "x86"))
> +			ts->rstate = X86_RETPOLINE_POSSIBLE;
> +	} else {
>  		ts->kernel_start = 1ULL << 63;
> +	}
>  	ts->crp = crp;
>  
>  	return 0;
> @@ -733,6 +755,70 @@ static int thread_stack__trace_end(struct thread_stack *ts,
>  				     false, true);
>  }
>  
> +static bool is_x86_retpoline(const char *name)
> +{
> +	const char *p = strstr(name, "__x86_indirect_thunk_");
> +
> +	return p == name || !strcmp(name, "__indirect_thunk_start");
> +}
> +
> +/*
> + * x86 retpoline functions pollute the call graph. This function removes them.
> + * This does not handle function return thunks, nor is there any improvement
> + * for the handling of inline thunks or extern thunks.
> + */
> +static int thread_stack__x86_retpoline(struct thread_stack *ts,
> +				       struct perf_sample *sample,
> +				       struct addr_location *to_al)
> +{
> +	struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
> +	struct call_path_root *cpr = ts->crp->cpr;
> +	struct symbol *sym = tse->cp->sym;
> +	struct symbol *tsym = to_al->sym;
> +	struct call_path *cp;
> +
> +	if (sym && sym->name && is_x86_retpoline(sym->name)) {


  CC       /tmp/build/perf/util/scripting-engines/trace-event-perl.o
  CC       /tmp/build/perf/util/intel-pt.o
  CC       /tmp/build/perf/util/intel-pt-decoder/intel-pt-log.o
util/thread-stack.c:780:18: error: address of array 'sym->name' will always evaluate to 'true' [-Werror,-Wpointer-bool-conversion]
        if (sym && sym->name && is_x86_retpoline(sym->name)) {
                ~~ ~~~~~^~~~
1 error generated.
mv: cannot stat '/tmp/build/perf/util/.thread-stack.o.tmp': No such file or directory
make[4]: *** [/git/linux/tools/build/Makefile.build:96: /tmp/build/perf/util/thread-stack.o] Error 1


[acme@...co perf]$ pahole -C symbol ~/bin/perf
struct symbol {
	struct rb_node             rb_node;              /*     0    24 */
	u64                        start;                /*    24     8 */
	u64                        end;                  /*    32     8 */
	u16                        namelen;              /*    40     2 */
	u8                         type:4;               /*    42: 4  1 */
	u8                         binding:4;            /*    42: 0  1 */
	u8                         idle:1;               /*    43: 7  1 */
	u8                         ignore:1;             /*    43: 6  1 */
	u8                         inlined:1;            /*    43: 5  1 */

	/* XXX 5 bits hole, try to pack */

	u8                         arch_sym;             /*    44     1 */
	_Bool                      annotate2;            /*    45     1 */
	char                       name[0];              /*    46     0 */

	/* size: 48, cachelines: 1, members: 12 */
	/* bit holes: 1, sum bit holes: 5 bits */
	/* padding: 2 */
	/* last cacheline: 48 bytes */
};
[acme@...co perf]$

I'm removing that sym->name test.

> +		/*
> +		 * This is a x86 retpoline fn. It pollutes the call graph by
> +		 * showing up everywhere there is an indirect branch, but does
> +		 * not itself mean anything. Here the top-of-stack is removed,
> +		 * by decrementing the stack count, and then further down, the
> +		 * resulting top-of-stack is replaced with the actual target.
> +		 * The result is that the retpoline functions will no longer
> +		 * appear in the call graph. Note this only affects the call
> +		 * graph, since all the original branches are left unchanged.
> +		 */
> +		ts->cnt -= 1;
> +		sym = ts->stack[ts->cnt - 2].cp->sym;
> +		if (sym && sym == tsym && to_al->addr != tsym->start) {
> +			/*
> +			 * Target is back to the middle of the symbol we came
> +			 * from so assume it is an indirect jmp and forget it
> +			 * altogether.
> +			 */
> +			ts->cnt -= 1;
> +			return 0;
> +		}
> +	} else if (sym && sym == tsym) {
> +		/*
> +		 * Target is back to the symbol we came from so assume it is an
> +		 * indirect jmp and forget it altogether.
> +		 */
> +		ts->cnt -= 1;
> +		return 0;
> +	}
> +
> +	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
> +				sample->addr, ts->kernel_start);
> +	if (!cp)
> +		return -ENOMEM;
> +
> +	/* Replace the top-of-stack with the actual target */
> +	ts->stack[ts->cnt - 1].cp = cp;
> +
> +	return 0;
> +}
> +
>  int thread_stack__process(struct thread *thread, struct comm *comm,
>  			  struct perf_sample *sample,
>  			  struct addr_location *from_al,
> @@ -740,6 +826,7 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
>  			  struct call_return_processor *crp)
>  {
>  	struct thread_stack *ts = thread__stack(thread, sample->cpu);
> +	enum retpoline_state_t rstate;
>  	int err = 0;
>  
>  	if (ts && !ts->crp) {
> @@ -755,6 +842,10 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
>  		ts->comm = comm;
>  	}
>  
> +	rstate = ts->rstate;
> +	if (rstate == X86_RETPOLINE_DETECTED)
> +		ts->rstate = X86_RETPOLINE_POSSIBLE;
> +
>  	/* Flush stack on exec */
>  	if (ts->comm != comm && thread->pid_ == thread->tid) {
>  		err = __thread_stack__flush(thread, ts);
> @@ -791,10 +882,25 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
>  					ts->kernel_start);
>  		err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
>  					    cp, false, trace_end);
> +
> +		/*
> +		 * A call to the same symbol but not the start of the symbol,
> +		 * may be the start of a x86 retpoline.
> +		 */
> +		if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
> +		    from_al->sym == to_al->sym &&
> +		    to_al->addr != to_al->sym->start)
> +			ts->rstate = X86_RETPOLINE_DETECTED;
> +
>  	} else if (sample->flags & PERF_IP_FLAG_RETURN) {
>  		if (!sample->ip || !sample->addr)
>  			return 0;
>  
> +		/* x86 retpoline 'return' doesn't match the stack */
> +		if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
> +		    ts->stack[ts->cnt - 1].ret_addr != sample->addr)
> +			return thread_stack__x86_retpoline(ts, sample, to_al);
> +
>  		err = thread_stack__pop_cp(thread, ts, sample->addr,
>  					   sample->time, ref, from_al->sym);
>  		if (err) {
> -- 
> 2.17.1

-- 

- Arnaldo

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ