lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <55E6F51B.9010203@intel.com>
Date:	Wed, 2 Sep 2015 16:09:47 +0300
From:	Adrian Hunter <adrian.hunter@...el.com>
To:	Arnaldo Carvalho de Melo <acme@...nel.org>
Cc:	linux-kernel@...r.kernel.org, Jiri Olsa <jolsa@...hat.com>
Subject: Re: [PATCH 2/2] perf tools: Add support for PERF_RECORD_SWITCH to
 Intel PT

On 13/08/15 12:40, Adrian Hunter wrote:
> Add support for selecting and processing PERF_RECORD_SWITCH
> events for use by Intel PT.  If they are available, they will be
> used in preference to sched_switch events.
> 
> This enables an unprivileged user to trace multi-threaded or
> multi-process workloads with any level of perf_event_paranoid.
> However, it depends on kernel support for PERF_RECORD_SWITCH.
> 
> Without this patch, tracing a multi-threaded workload will
> decode without error but all the data will be attributed to
> the main thread.
> 
> Without this patch, tracing a multi-process workload will
> result in decoder errors because the decoder will not know
> which executable is executing.
> 
> Signed-off-by: Adrian Hunter <adrian.hunter@...el.com>

This one still applies.

> ---
>  tools/perf/arch/x86/util/intel-pt.c |  55 ++++++++++++---
>  tools/perf/util/intel-pt.c          | 129 +++++++++++++++++++++++++++++-------
>  2 files changed, 151 insertions(+), 33 deletions(-)
> 
> diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
> index 2ca10d796c0b..b02af064f0f9 100644
> --- a/tools/perf/arch/x86/util/intel-pt.c
> +++ b/tools/perf/arch/x86/util/intel-pt.c
> @@ -624,13 +624,49 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
>  	 * threads.
>  	 */
>  	if (have_timing_info && !cpu_map__empty(cpus)) {
> -		err = intel_pt_track_switches(evlist);
> -		if (err == -EPERM)
> -			pr_debug2("Unable to select sched:sched_switch\n");
> -		else if (err)
> -			return err;
> -		else
> -			ptr->have_sched_switch = 1;
> +		if (perf_can_record_switch_events()) {
> +			bool cpu_wide = !target__none(&opts->target) &&
> +					!target__has_task(&opts->target);
> +
> +			if (!cpu_wide && perf_can_record_cpu_wide()) {
> +				struct perf_evsel *switch_evsel;
> +
> +				err = parse_events(evlist, "dummy:u", NULL);
> +				if (err)
> +					return err;
> +
> +				switch_evsel = perf_evlist__last(evlist);
> +
> +				switch_evsel->attr.freq = 0;
> +				switch_evsel->attr.sample_period = 1;
> +				switch_evsel->attr.context_switch = 1;
> +
> +				switch_evsel->system_wide = true;
> +				switch_evsel->no_aux_samples = true;
> +				switch_evsel->immediate = true;
> +
> +				perf_evsel__set_sample_bit(switch_evsel, TID);
> +				perf_evsel__set_sample_bit(switch_evsel, TIME);
> +				perf_evsel__set_sample_bit(switch_evsel, CPU);
> +
> +				opts->record_switch_events = false;
> +				ptr->have_sched_switch = 3;
> +			} else {
> +				opts->record_switch_events = true;
> +				if (cpu_wide)
> +					ptr->have_sched_switch = 3;
> +				else
> +					ptr->have_sched_switch = 2;
> +			}
> +		} else {
> +			err = intel_pt_track_switches(evlist);
> +			if (err == -EPERM)
> +				pr_debug2("Unable to select sched:sched_switch\n");
> +			else if (err)
> +				return err;
> +			else
> +				ptr->have_sched_switch = 1;
> +		}
>  	}
>  
>  	if (intel_pt_evsel) {
> @@ -663,8 +699,11 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
>  		tracking_evsel->attr.sample_period = 1;
>  
>  		/* In per-cpu case, always need the time of mmap events etc */
> -		if (!cpu_map__empty(cpus))
> +		if (!cpu_map__empty(cpus)) {
>  			perf_evsel__set_sample_bit(tracking_evsel, TIME);
> +			/* And the CPU for switch events */
> +			perf_evsel__set_sample_bit(tracking_evsel, CPU);
> +		}
>  	}
>  
>  	/*
> diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
> index 4bae958096d4..1f6aab58e931 100644
> --- a/tools/perf/util/intel-pt.c
> +++ b/tools/perf/util/intel-pt.c
> @@ -1145,11 +1145,13 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
>  	return 0;
>  }
>  
> -static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip)
> +static u64 intel_pt_switch_ip(struct intel_pt *pt, u64 *ptss_ip)
>  {
> +	struct machine *machine = pt->machine;
>  	struct map *map;
>  	struct symbol *sym, *start;
>  	u64 ip, switch_ip = 0;
> +	const char *ptss;
>  
>  	if (ptss_ip)
>  		*ptss_ip = 0;
> @@ -1177,8 +1179,13 @@ static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip)
>  	if (!switch_ip || !ptss_ip)
>  		return 0;
>  
> +	if (pt->have_sched_switch == 1)
> +		ptss = "perf_trace_sched_switch";
> +	else
> +		ptss = "__perf_event_task_sched_out";
> +
>  	for (sym = start; sym; sym = dso__next_symbol(sym)) {
> -		if (!strcmp(sym->name, "perf_trace_sched_switch")) {
> +		if (!strcmp(sym->name, ptss)) {
>  			ip = map->unmap_ip(map, sym->start);
>  			if (ip >= map->start && ip < map->end) {
>  				*ptss_ip = ip;
> @@ -1198,11 +1205,11 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp)
>  
>  	if (!pt->kernel_start) {
>  		pt->kernel_start = machine__kernel_start(pt->machine);
> -		if (pt->per_cpu_mmaps && pt->have_sched_switch &&
> +		if (pt->per_cpu_mmaps &&
> +		    (pt->have_sched_switch == 1 || pt->have_sched_switch == 3) &&
>  		    !pt->timeless_decoding && intel_pt_tracing_kernel(pt) &&
>  		    !pt->sampling_mode) {
> -			pt->switch_ip = intel_pt_switch_ip(pt->machine,
> -							   &pt->ptss_ip);
> +			pt->switch_ip = intel_pt_switch_ip(pt, &pt->ptss_ip);
>  			if (pt->switch_ip) {
>  				intel_pt_log("switch_ip: %"PRIx64" ptss_ip: %"PRIx64"\n",
>  					     pt->switch_ip, pt->ptss_ip);
> @@ -1387,31 +1394,18 @@ static struct intel_pt_queue *intel_pt_cpu_to_ptq(struct intel_pt *pt, int cpu)
>  	return NULL;
>  }
>  
> -static int intel_pt_process_switch(struct intel_pt *pt,
> -				   struct perf_sample *sample)
> +static int intel_pt_sync_switch(struct intel_pt *pt, int cpu, pid_t tid,
> +				u64 timestamp)
>  {
>  	struct intel_pt_queue *ptq;
> -	struct perf_evsel *evsel;
> -	pid_t tid;
> -	int cpu, err;
> -
> -	evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id);
> -	if (evsel != pt->switch_evsel)
> -		return 0;
> -
> -	tid = perf_evsel__intval(evsel, sample, "next_pid");
> -	cpu = sample->cpu;
> -
> -	intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
> -		     cpu, tid, sample->time, perf_time_to_tsc(sample->time,
> -		     &pt->tc));
> +	int err;
>  
>  	if (!pt->sync_switch)
> -		goto out;
> +		return 1;
>  
>  	ptq = intel_pt_cpu_to_ptq(pt, cpu);
>  	if (!ptq)
> -		goto out;
> +		return 1;
>  
>  	switch (ptq->switch_state) {
>  	case INTEL_PT_SS_NOT_TRACING:
> @@ -1424,7 +1418,7 @@ static int intel_pt_process_switch(struct intel_pt *pt,
>  		return 0;
>  	case INTEL_PT_SS_EXPECTING_SWITCH_EVENT:
>  		if (!ptq->on_heap) {
> -			ptq->timestamp = perf_time_to_tsc(sample->time,
> +			ptq->timestamp = perf_time_to_tsc(timestamp,
>  							  &pt->tc);
>  			err = auxtrace_heap__add(&pt->heap, ptq->queue_nr,
>  						 ptq->timestamp);
> @@ -1441,10 +1435,76 @@ static int intel_pt_process_switch(struct intel_pt *pt,
>  	default:
>  		break;
>  	}
> -out:
> +
> +	return 1;
> +}
> +
> +static int intel_pt_process_switch(struct intel_pt *pt,
> +				   struct perf_sample *sample)
> +{
> +	struct perf_evsel *evsel;
> +	pid_t tid;
> +	int cpu, ret;
> +
> +	evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id);
> +	if (evsel != pt->switch_evsel)
> +		return 0;
> +
> +	tid = perf_evsel__intval(evsel, sample, "next_pid");
> +	cpu = sample->cpu;
> +
> +	intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
> +		     cpu, tid, sample->time, perf_time_to_tsc(sample->time,
> +		     &pt->tc));
> +
> +	ret = intel_pt_sync_switch(pt, cpu, tid, sample->time);
> +	if (ret <= 0)
> +		return ret;
> +
>  	return machine__set_current_tid(pt->machine, cpu, -1, tid);
>  }
>  
> +static int intel_pt_context_switch(struct intel_pt *pt, union perf_event *event,
> +				   struct perf_sample *sample)
> +{
> +	bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT;
> +	pid_t pid, tid;
> +	int cpu, ret;
> +
> +	cpu = sample->cpu;
> +
> +	if (pt->have_sched_switch == 3) {
> +		if (!out)
> +			return 0;
> +		if (event->header.type != PERF_RECORD_SWITCH_CPU_WIDE) {
> +			pr_err("Expecting CPU-wide context switch event\n");
> +			return -EINVAL;
> +		}
> +		pid = event->context_switch.next_prev_pid;
> +		tid = event->context_switch.next_prev_tid;
> +	} else {
> +		if (out)
> +			return 0;
> +		pid = sample->pid;
> +		tid = sample->tid;
> +	}
> +
> +	if (tid == -1) {
> +		pr_err("context_switch event has no tid\n");
> +		return -EINVAL;
> +	}
> +
> +	intel_pt_log("context_switch: cpu %d pid %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
> +		     cpu, pid, tid, sample->time, perf_time_to_tsc(sample->time,
> +		     &pt->tc));
> +
> +	ret = intel_pt_sync_switch(pt, cpu, tid, sample->time);
> +	if (ret <= 0)
> +		return ret;
> +
> +	return machine__set_current_tid(pt->machine, cpu, pid, tid);
> +}
> +
>  static int intel_pt_process_itrace_start(struct intel_pt *pt,
>  					 union perf_event *event,
>  					 struct perf_sample *sample)
> @@ -1515,6 +1575,9 @@ static int intel_pt_process_event(struct perf_session *session,
>  		err = intel_pt_process_switch(pt, sample);
>  	else if (event->header.type == PERF_RECORD_ITRACE_START)
>  		err = intel_pt_process_itrace_start(pt, event, sample);
> +	else if (event->header.type == PERF_RECORD_SWITCH ||
> +		 event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
> +		err = intel_pt_context_switch(pt, event, sample);
>  
>  	intel_pt_log("event %s (%u): cpu %d time %"PRIu64" tsc %#"PRIx64"\n",
>  		     perf_event__name(event->header.type), event->header.type,
> @@ -1777,6 +1840,18 @@ static struct perf_evsel *intel_pt_find_sched_switch(struct perf_evlist *evlist)
>  	return NULL;
>  }
>  
> +static bool intel_pt_find_switch(struct perf_evlist *evlist)
> +{
> +	struct perf_evsel *evsel;
> +
> +	evlist__for_each(evlist, evsel) {
> +		if (evsel->attr.context_switch)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
>  static const char * const intel_pt_info_fmts[] = {
>  	[INTEL_PT_PMU_TYPE]		= "  PMU Type            %"PRId64"\n",
>  	[INTEL_PT_TIME_SHIFT]		= "  Time Shift          %"PRIu64"\n",
> @@ -1888,6 +1963,10 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
>  			pr_err("%s: missing sched_switch event\n", __func__);
>  			goto err_delete_thread;
>  		}
> +	} else if (pt->have_sched_switch == 2 &&
> +		   !intel_pt_find_switch(session->evlist)) {
> +		pr_err("%s: missing context_switch attribute flag\n", __func__);
> +		goto err_delete_thread;
>  	}
>  
>  	if (session->itrace_synth_opts && session->itrace_synth_opts->set) {
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ