linux-kernel - RE: [PATCH v3] perf evsel: Refactor tool events

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CO6PR11MB56353A69AEF31BA2731920EBEEF02@CO6PR11MB5635.namprd11.prod.outlook.com>
Date: Mon, 27 May 2024 04:15:32 +0000
From: "Wang, Weilin" <weilin.wang@...el.com>
To: Ian Rogers <irogers@...gle.com>, Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>, Arnaldo Carvalho de Melo <acme@...nel.org>,
	Namhyung Kim <namhyung@...nel.org>, Mark Rutland <mark.rutland@....com>,
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>, Jiri Olsa
	<jolsa@...nel.org>, "Hunter, Adrian" <adrian.hunter@...el.com>, Kan Liang
	<kan.liang@...ux.intel.com>, Ze Gao <zegao2021@...il.com>, Leo Yan
	<leo.yan@...ux.dev>, Ravi Bangoria <ravi.bangoria@....com>, Dmitrii Dolgov
	<9erthalion6@...il.com>, Song Liu <song@...nel.org>, James Clark
	<james.clark@....com>, "linux-perf-users@...r.kernel.org"
	<linux-perf-users@...r.kernel.org>, "linux-kernel@...r.kernel.org"
	<linux-kernel@...r.kernel.org>
Subject: RE: [PATCH v3] perf evsel: Refactor tool events



> -----Original Message-----
> From: Ian Rogers <irogers@...gle.com>
> Sent: Friday, May 3, 2024 4:29 PM
> To: Wang, Weilin <weilin.wang@...el.com>; Peter Zijlstra
> <peterz@...radead.org>; Ingo Molnar <mingo@...hat.com>; Arnaldo
> Carvalho de Melo <acme@...nel.org>; Namhyung Kim
> <namhyung@...nel.org>; Mark Rutland <mark.rutland@....com>;
> Alexander Shishkin <alexander.shishkin@...ux.intel.com>; Jiri Olsa
> <jolsa@...nel.org>; Ian Rogers <irogers@...gle.com>; Hunter, Adrian
> <adrian.hunter@...el.com>; Kan Liang <kan.liang@...ux.intel.com>; Ze Gao
> <zegao2021@...il.com>; Leo Yan <leo.yan@...ux.dev>; Ravi Bangoria
> <ravi.bangoria@....com>; Dmitrii Dolgov <9erthalion6@...il.com>; Song
> Liu <song@...nel.org>; James Clark <james.clark@....com>; linux-perf-
> users@...r.kernel.org; linux-kernel@...r.kernel.org
> Subject: [PATCH v3] perf evsel: Refactor tool events
> 
> Tool events unnecessarily open a dummy perf event which is useless
> even with `perf record` which will still open a dummy event. Change
> the behavior of tool events so:
> 
>  - duration_time - call `rdclock` on open and then report the count as
>    a delta since the start in evsel__read_counter. This moves code out
>    of builtin-stat making it more general purpose.
> 
>  - user_time/system_time - open the fd as either `/proc/pid/stat` or
>    `/proc/stat` for cases like system wide. evsel__read_counter will
>    read the appropriate field out of the procfs file. These values
>    were previously supplied by wait4, if the procfs read fails then
>    the wait4 values are used, assuming the process/thread terminated.
>    By reading user_time and system_time this way, interval mode, per
>    PID and per CPU can be supported although there are restrictions
>    given what the files provide (e.g. per PID can't be combined with
>    per CPU).
> 
> Opening any of the tool events for `perf record` is changed to return
> invalid.
> 
> Signed-off-by: Ian Rogers <irogers@...gle.com>
Tested-by: Weilin Wang <weilin.wang@...el.com>
> ---
> v3. v1 and v2 were posted as RFCs with retirement latency patches,
>     this patch is now posted as standalone cleanup as we consider
>     alternatives with the retirement latency implementation. The patch
>     is updated to hopefully fix Namhyung's review feedback. user_time
>     and system_time are opened with an all online CPUs cpu
>     map. evsel's saved_times needs to be an xyarray to handle saving
>     the original time value per PID or per CPU.
> ---
>  tools/perf/builtin-stat.c      |  75 +++++------
>  tools/perf/util/evsel.c        | 226 ++++++++++++++++++++++++++++++++-
>  tools/perf/util/evsel.h        |  14 ++
>  tools/perf/util/parse-events.c |   7 +-
>  4 files changed, 277 insertions(+), 45 deletions(-)
> 
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 65a3dd7ffac3..01fc61d69b43 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -255,45 +255,38 @@ static int evsel__write_stat_event(struct evsel
> *counter, int cpu_map_idx, u32 t
>  					   process_synthesized_event, NULL);
>  }
> 
> -static int read_single_counter(struct evsel *counter, int cpu_map_idx,
> -			       int thread, struct timespec *rs)
> -{
> -	switch(counter->tool_event) {
> -		case PERF_TOOL_DURATION_TIME: {
> -			u64 val = rs->tv_nsec + rs->tv_sec*1000000000ULL;
> -			struct perf_counts_values *count =
> -				perf_counts(counter->counts, cpu_map_idx,
> thread);
> -			count->ena = count->run = val;
> -			count->val = val;
> -			return 0;
> -		}
> -		case PERF_TOOL_USER_TIME:
> -		case PERF_TOOL_SYSTEM_TIME: {
> -			u64 val;
> -			struct perf_counts_values *count =
> -				perf_counts(counter->counts, cpu_map_idx,
> thread);
> -			if (counter->tool_event == PERF_TOOL_USER_TIME)
> -				val = ru_stats.ru_utime_usec_stat.mean;
> -			else
> -				val = ru_stats.ru_stime_usec_stat.mean;
> -			count->ena = count->run = val;
> -			count->val = val;
> -			return 0;
> -		}
> -		default:
> -		case PERF_TOOL_NONE:
> -			return evsel__read_counter(counter, cpu_map_idx,
> thread);
> -		case PERF_TOOL_MAX:
> -			/* This should never be reached */
> -			return 0;
> +static int read_single_counter(struct evsel *counter, int cpu_map_idx, int
> thread)
> +{
> +	int err = evsel__read_counter(counter, cpu_map_idx, thread);
> +
> +	/*
> +	 * Reading user and system time will fail when the process
> +	 * terminates. Use the wait4 values in that case.
> +	 */
> +	if (err && cpu_map_idx == 0 &&
> +	    (counter->tool_event == PERF_TOOL_USER_TIME ||
> +	     counter->tool_event == PERF_TOOL_SYSTEM_TIME)) {
> +		u64 val, *start_time;
> +		struct perf_counts_values *count =
> +			perf_counts(counter->counts, cpu_map_idx, thread);
> +
> +		start_time = xyarray__entry(counter->start_times,
> cpu_map_idx, thread);
> +		if (counter->tool_event == PERF_TOOL_USER_TIME)
> +			val = ru_stats.ru_utime_usec_stat.mean;
> +		else
> +			val = ru_stats.ru_stime_usec_stat.mean;
> +		count->ena = count->run = *start_time + val;
> +		count->val = val;
> +		return 0;
>  	}
> +	return err;
>  }
> 
>  /*
>   * Read out the results of a single counter:
>   * do not aggregate counts across CPUs in system-wide mode
>   */
> -static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int
> cpu_map_idx)
> +static int read_counter_cpu(struct evsel *counter, int cpu_map_idx)
>  {
>  	int nthreads = perf_thread_map__nr(evsel_list->core.threads);
>  	int thread;
> @@ -311,7 +304,7 @@ static int read_counter_cpu(struct evsel *counter,
> struct timespec *rs, int cpu_
>  		 * (via evsel__read_counter()) and sets their count->loaded.
>  		 */
>  		if (!perf_counts__is_loaded(counter->counts, cpu_map_idx,
> thread) &&
> -		    read_single_counter(counter, cpu_map_idx, thread, rs)) {
> +		    read_single_counter(counter, cpu_map_idx, thread)) {
>  			counter->counts->scaled = -1;
>  			perf_counts(counter->counts, cpu_map_idx, thread)-
> >ena = 0;
>  			perf_counts(counter->counts, cpu_map_idx, thread)-
> >run = 0;
> @@ -340,7 +333,7 @@ static int read_counter_cpu(struct evsel *counter,
> struct timespec *rs, int cpu_
>  	return 0;
>  }
> 
> -static int read_affinity_counters(struct timespec *rs)
> +static int read_affinity_counters(void)
>  {
>  	struct evlist_cpu_iterator evlist_cpu_itr;
>  	struct affinity saved_affinity, *affinity;
> @@ -361,10 +354,8 @@ static int read_affinity_counters(struct timespec *rs)
>  		if (evsel__is_bpf(counter))
>  			continue;
> 
> -		if (!counter->err) {
> -			counter->err = read_counter_cpu(counter, rs,
> -
> 	evlist_cpu_itr.cpu_map_idx);
> -		}
> +		if (!counter->err)
> +			counter->err = read_counter_cpu(counter,
> evlist_cpu_itr.cpu_map_idx);
>  	}
>  	if (affinity)
>  		affinity__cleanup(&saved_affinity);
> @@ -388,11 +379,11 @@ static int read_bpf_map_counters(void)
>  	return 0;
>  }
> 
> -static int read_counters(struct timespec *rs)
> +static int read_counters(void)
>  {
>  	if (!stat_config.stop_read_counter) {
>  		if (read_bpf_map_counters() ||
> -		    read_affinity_counters(rs))
> +		    read_affinity_counters())
>  			return -1;
>  	}
>  	return 0;
> @@ -423,7 +414,7 @@ static void process_interval(void)
> 
>  	evlist__reset_aggr_stats(evsel_list);
> 
> -	if (read_counters(&rs) == 0)
> +	if (read_counters() == 0)
>  		process_counters();
> 
>  	if (STAT_RECORD) {
> @@ -911,7 +902,7 @@ static int __run_perf_stat(int argc, const char **argv,
> int run_idx)
>  	 * avoid arbitrary skew, we must read all counters before closing any
>  	 * group leaders.
>  	 */
> -	if (read_counters(&(struct timespec) { .tv_nsec = t1-t0 }) == 0)
> +	if (read_counters() == 0)
>  		process_counters();
> 
>  	/*
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 3536404e9447..d98415e1f0b7 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -10,6 +10,7 @@
>  #include <errno.h>
>  #include <inttypes.h>
>  #include <linux/bitops.h>
> +#include <api/io.h>
>  #include <api/fs/fs.h>
>  #include <api/fs/tracing_path.h>
>  #include <linux/hw_breakpoint.h>
> @@ -30,6 +31,7 @@
>  #include "counts.h"
>  #include "event.h"
>  #include "evsel.h"
> +#include "time-utils.h"
>  #include "util/env.h"
>  #include "util/evsel_config.h"
>  #include "util/evsel_fprintf.h"
> @@ -1487,6 +1489,9 @@ void evsel__exit(struct evsel *evsel)
>  	evsel->per_pkg_mask = NULL;
>  	zfree(&evsel->metric_events);
>  	perf_evsel__object.fini(evsel);
> +	if (evsel->tool_event == PERF_TOOL_SYSTEM_TIME ||
> +	    evsel->tool_event == PERF_TOOL_USER_TIME)
> +		xyarray__delete(evsel->start_times);
>  }
> 
>  void evsel__delete(struct evsel *evsel)
> @@ -1600,11 +1605,173 @@ static int evsel__read_group(struct evsel
> *leader, int cpu_map_idx, int thread)
>  	return evsel__process_group_data(leader, cpu_map_idx, thread,
> data);
>  }
> 
> +static bool read_until_char(struct io *io, char e)
> +{
> +	char c;
> +
> +	do {
> +		c = io__get_char(io);
> +		if (c == -1)
> +			return false;
> +	} while (c != e);
> +	return true;
> +}
> +
> +static int read_stat_field(int fd, struct perf_cpu cpu, int field, __u64 *val)
> +{
> +	char buf[256];
> +	struct io io;
> +	int i;
> +
> +	io__init(&io, fd, buf, sizeof(buf));
> +
> +	/* Skip lines to relevant CPU. */
> +	for (i = -1; i < cpu.cpu; i++) {
> +		if (!read_until_char(&io, '\n'))
> +			return -EINVAL;
> +	}
> +	/* Skip to "cpu". */
> +	if (io__get_char(&io) != 'c') return -EINVAL;
> +	if (io__get_char(&io) != 'p') return -EINVAL;
> +	if (io__get_char(&io) != 'u') return -EINVAL;
> +
> +	/* Skip N of cpuN. */
> +	if (!read_until_char(&io, ' '))
> +		return -EINVAL;
> +
> +	i = 1;
> +	while (true) {
> +		if (io__get_dec(&io, val) != ' ')
> +			break;
> +		if (field == i)
> +			return 0;
> +		i++;
> +	}
> +	return -EINVAL;
> +}
> +
> +static int read_pid_stat_field(int fd, int field, __u64 *val)
> +{
> +	char buf[256];
> +	struct io io;
> +	int c, i;
> +
> +	io__init(&io, fd, buf, sizeof(buf));
> +	if (io__get_dec(&io, val) != ' ')
> +		return -EINVAL;
> +	if (field == 1)
> +		return 0;
> +
> +	/* Skip comm. */
> +	if (io__get_char(&io) != '(' || !read_until_char(&io, ')'))
> +		return -EINVAL;
> +	if (field == 2)
> +		return -EINVAL; /* String can't be returned. */
> +
> +	/* Skip state */
> +	if (io__get_char(&io) != ' ' || io__get_char(&io) == -1)
> +		return -EINVAL;
> +	if (field == 3)
> +		return -EINVAL; /* String can't be returned. */
> +
> +	/* Loop over numeric fields*/
> +	if (io__get_char(&io) != ' ')
> +		return -EINVAL;
> +
> +	i = 4;
> +	while (true) {
> +		c = io__get_dec(&io, val);
> +		if (c == -1)
> +			return -EINVAL;
> +		if (c == -2) {
> +			/* Assume a -ve was read */
> +			c = io__get_dec(&io, val);
> +			*val *= -1;
> +		}
> +		if (c != ' ')
> +			return -EINVAL;
> +		if (field == i)
> +			return 0;
> +		i++;
> +	}
> +	return -EINVAL;
> +}
> +
> +static int evsel__read_tool(struct evsel *evsel, int cpu_map_idx, int thread)
> +{
> +	__u64 *start_time, cur_time, delta_start;
> +	int fd, err = 0;
> +	struct perf_counts_values *count;
> +	bool adjust = false;
> +
> +	count = perf_counts(evsel->counts, cpu_map_idx, thread);
> +
> +	switch (evsel->tool_event) {
> +	case PERF_TOOL_DURATION_TIME:
> +		/*
> +		 * Pretend duration_time is only on the first CPU and thread,
> or
> +		 * else aggregation will scale duration_time by the number of
> +		 * CPUs/threads.
> +		 */
> +		start_time = &evsel->start_time;
> +		if (cpu_map_idx == 0 && thread == 0)
> +			cur_time = rdclock();
> +		else
> +			cur_time = *start_time;
> +		break;
> +	case PERF_TOOL_USER_TIME:
> +	case PERF_TOOL_SYSTEM_TIME: {
> +		bool system = evsel->tool_event ==
> PERF_TOOL_SYSTEM_TIME;
> +
> +		start_time = xyarray__entry(evsel->start_times, cpu_map_idx,
> thread);
> +		fd = FD(evsel, cpu_map_idx, thread);
> +		lseek(fd, SEEK_SET, 0);
> +		if (evsel->pid_stat) {
> +			/* The event exists solely on 1 CPU. */
> +			if (cpu_map_idx == 0)
> +				err = read_pid_stat_field(fd, system ? 15 : 14,
> &cur_time);
> +			else
> +				cur_time = 0;
> +		} else {
> +			/* The event is for all threads. */
> +			if (thread == 0) {
> +				struct perf_cpu cpu =
> perf_cpu_map__cpu(evsel->core.cpus,
> +
> 	cpu_map_idx);
> +
> +				err = read_stat_field(fd, cpu, system ? 3 : 1,
> &cur_time);
> +			} else {
> +				cur_time = 0;
> +			}
> +		}
> +		adjust = true;
> +		break;
> +	}
> +	case PERF_TOOL_NONE:
> +	case PERF_TOOL_MAX:
> +	default:
> +		err = -EINVAL;
> +	}
> +	if (err)
> +		return err;
> +
> +	delta_start = cur_time - *start_time;
> +	if (adjust) {
> +		__u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
> +
> +		delta_start *= 1000000000 / ticks_per_sec;
> +	}
> +	count->val    = delta_start;
> +	count->ena    = count->run = delta_start;
> +	count->lost   = 0;
> +	return 0;
> +}
> +
>  int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
>  {
> -	u64 read_format = evsel->core.attr.read_format;
> +	if (evsel__is_tool(evsel))
> +		return evsel__read_tool(evsel, cpu_map_idx, thread);
> 
> -	if (read_format & PERF_FORMAT_GROUP)
> +	if (evsel->core.attr.read_format & PERF_FORMAT_GROUP)
>  		return evsel__read_group(evsel, cpu_map_idx, thread);
> 
>  	return evsel__read_one(evsel, cpu_map_idx, thread);
> @@ -1823,6 +1990,14 @@ static int __evsel__prepare_open(struct evsel
> *evsel, struct perf_cpu_map *cpus,
>  	    perf_evsel__alloc_fd(&evsel->core, perf_cpu_map__nr(cpus),
> nthreads) < 0)
>  		return -ENOMEM;
> 
> +	if ((evsel->tool_event == PERF_TOOL_SYSTEM_TIME ||
> +	     evsel->tool_event == PERF_TOOL_USER_TIME) &&
> +	    !evsel->start_times) {
> +		evsel->start_times = xyarray__new(perf_cpu_map__nr(cpus),
> nthreads, sizeof(__u64));
> +		if (!evsel->start_times)
> +			return -ENOMEM;
> +	}
> +
>  	evsel->open_flags = PERF_FLAG_FD_CLOEXEC;
>  	if (evsel->cgrp)
>  		evsel->open_flags |= PERF_FLAG_PID_CGROUP;
> @@ -2005,6 +2180,13 @@ static int evsel__open_cpu(struct evsel *evsel,
> struct perf_cpu_map *cpus,
>  	int pid = -1, err, old_errno;
>  	enum rlimit_action set_rlimit = NO_CHANGE;
> 
> +	if (evsel->tool_event == PERF_TOOL_DURATION_TIME) {
> +		if (evsel->core.attr.sample_period) /* no sampling */
> +			return -EINVAL;
> +		evsel->start_time = rdclock();
> +		return 0;
> +	}
> +
>  	err = __evsel__prepare_open(evsel, cpus, threads);
>  	if (err)
>  		return err;
> @@ -2037,6 +2219,46 @@ static int evsel__open_cpu(struct evsel *evsel,
> struct perf_cpu_map *cpus,
>  			if (!evsel->cgrp && !evsel->core.system_wide)
>  				pid = perf_thread_map__pid(threads, thread);
> 
> +			if (evsel->tool_event == PERF_TOOL_USER_TIME ||
> +			    evsel->tool_event == PERF_TOOL_SYSTEM_TIME) {
> +				bool system = evsel->tool_event ==
> PERF_TOOL_SYSTEM_TIME;
> +				__u64 *start_time = NULL;
> +
> +				if (evsel->core.attr.sample_period) {
> +					/* no sampling */
> +					err = -EINVAL;
> +					goto out_close;
> +				}
> +				if (pid > -1) {
> +					char buf[64];
> +
> +					snprintf(buf, sizeof(buf),
> "/proc/%d/stat", pid);
> +					fd = open(buf, O_RDONLY);
> +					evsel->pid_stat = true;
> +				} else {
> +					fd = open("/proc/stat", O_RDONLY);
> +				}
> +				FD(evsel, idx, thread) = fd;
> +				if (fd < 0) {
> +					err = -errno;
> +					goto out_close;
> +				}
> +				start_time = xyarray__entry(evsel-
> >start_times, idx, thread);
> +				if (pid > -1) {
> +					err = read_pid_stat_field(fd, system ?
> 15 : 14,
> +								  start_time);
> +				} else {
> +					struct perf_cpu cpu;
> +
> +					cpu = perf_cpu_map__cpu(evsel-
> >core.cpus, idx);
> +					err = read_stat_field(fd, cpu, system ?
> 3 : 1,
> +							      start_time);
> +				}
> +				if (err)
> +					goto out_close;
> +				continue;
> +			}
> +
>  			group_fd = get_group_fd(evsel, idx, thread);
> 
>  			if (group_fd == -2) {
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index 517cff431de2..d592dd9fb969 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -170,6 +170,20 @@ struct evsel {
> 
>  	/* for missing_features */
>  	struct perf_pmu		*pmu;
> +
> +	/* For tool events */
> +	/* Beginning time subtracted when the counter is read. */
> +	union {
> +		/* duration_time is a single global time. */
> +		__u64 start_time;
> +		/*
> +		 * user_time and system_time read an initial value potentially
> +		 * per-CPU or per-pid.
> +		 */
> +		struct xyarray *start_times;
> +	};
> +	/* Is the tool's fd for /proc/pid/stat or /proc/stat. */
> +	bool pid_stat;
>  };
> 
>  struct perf_missing_features {
> diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
> index 0f308b4db2b9..a8866c0ce4b6 100644
> --- a/tools/perf/util/parse-events.c
> +++ b/tools/perf/util/parse-events.c
> @@ -308,11 +308,16 @@ static int add_event_tool(struct list_head *list, int
> *idx,
>  		.type = PERF_TYPE_SOFTWARE,
>  		.config = PERF_COUNT_SW_DUMMY,
>  	};
> +	const char *cpu_list = NULL;
> 
> +	if (tool_event == PERF_TOOL_DURATION_TIME) {
> +		/* Duration time is gathered globally, pretend it is only on
> CPU0. */
> +		cpu_list = "0";
> +	}
>  	evsel = __add_event(list, idx, &attr, /*init_attr=*/true,
> /*name=*/NULL,
>  			    /*metric_id=*/NULL, /*pmu=*/NULL,
>  			    /*config_terms=*/NULL,
> /*auto_merge_stats=*/false,
> -			    /*cpu_list=*/"0");
> +			    cpu_list);
>  	if (!evsel)
>  		return -ENOMEM;
>  	evsel->tool_event = tool_event;
> --
> 2.45.0.rc1.225.g2a3ae87e7f-goog