lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Z9Tk-HFcMVvDViiO@google.com>
Date: Fri, 14 Mar 2025 19:24:56 -0700
From: Namhyung Kim <namhyung@...nel.org>
To: Swapnil Sapkal <swapnil.sapkal@....com>
Cc: peterz@...radead.org, mingo@...hat.com, acme@...nel.org,
	irogers@...gle.com, james.clark@....com, ravi.bangoria@....com,
	yu.c.chen@...el.com, mark.rutland@....com,
	alexander.shishkin@...ux.intel.com, jolsa@...nel.org,
	rostedt@...dmis.org, vincent.guittot@...aro.org,
	adrian.hunter@...el.com, kan.liang@...ux.intel.com,
	gautham.shenoy@....com, kprateek.nayak@....com,
	juri.lelli@...hat.com, yangjihong@...edance.com, void@...ifault.com,
	tj@...nel.org, sshegde@...ux.ibm.com, linux-kernel@...r.kernel.org,
	linux-perf-users@...r.kernel.org, santosh.shukla@....com,
	ananth.narayan@....com, sandipan.das@....com,
	James Clark <james.clark@...aro.org>
Subject: Re: [PATCH v3 1/8] perf sched stats: Add record and rawdump support

Hello,

On Tue, Mar 11, 2025 at 12:02:23PM +0000, Swapnil Sapkal wrote:
> Define new, perf tool only, sample types and their layouts. Add logic
> to parse /proc/schedstat, convert it to perf sample format and save
> samples to perf.data file with `perf sched stats record` command. Also
> add logic to read perf.data file, interpret schedstat samples and
> print rawdump of samples with `perf script -D`.
> 
> Note that, /proc/schedstat file output is standardized with version
> number. The patch supports v15 but older or newer version can be added
> easily.
> 
> Co-developed-by: Ravi Bangoria <ravi.bangoria@....com>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@....com>
> Tested-by: James Clark <james.clark@...aro.org>
> Signed-off-by: Swapnil Sapkal <swapnil.sapkal@....com>
> ---
>  tools/lib/perf/Documentation/libperf.txt    |   2 +
>  tools/lib/perf/Makefile                     |   2 +-
>  tools/lib/perf/include/perf/event.h         |  42 ++++
>  tools/lib/perf/include/perf/schedstat-v15.h |  52 +++++
>  tools/perf/builtin-inject.c                 |   2 +
>  tools/perf/builtin-sched.c                  | 226 +++++++++++++++++-
>  tools/perf/util/event.c                     |  98 ++++++++
>  tools/perf/util/event.h                     |   2 +
>  tools/perf/util/session.c                   |  20 ++
>  tools/perf/util/synthetic-events.c          | 239 ++++++++++++++++++++
>  tools/perf/util/synthetic-events.h          |   3 +
>  tools/perf/util/tool.c                      |  20 ++
>  tools/perf/util/tool.h                      |   4 +-
>  13 files changed, 709 insertions(+), 3 deletions(-)
>  create mode 100644 tools/lib/perf/include/perf/schedstat-v15.h
> 
> diff --git a/tools/lib/perf/Documentation/libperf.txt b/tools/lib/perf/Documentation/libperf.txt
> index 59aabdd3cabf..3f295639903d 100644
> --- a/tools/lib/perf/Documentation/libperf.txt
> +++ b/tools/lib/perf/Documentation/libperf.txt
> @@ -210,6 +210,8 @@ SYNOPSIS
>    struct perf_record_time_conv;
>    struct perf_record_header_feature;
>    struct perf_record_compressed;
> +  struct perf_record_schedstat_cpu;
> +  struct perf_record_schedstat_domain;
>  --
>  
>  DESCRIPTION
> diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile
> index e9a7ac2c062e..4b60804aa0b6 100644
> --- a/tools/lib/perf/Makefile
> +++ b/tools/lib/perf/Makefile
> @@ -174,7 +174,7 @@ install_lib: libs
>  		$(call do_install_mkdir,$(libdir_SQ)); \
>  		cp -fpR $(LIBPERF_ALL) $(DESTDIR)$(libdir_SQ)
>  
> -HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h
> +HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h schedstat-v15.h
>  INTERNAL_HDRS := cpumap.h evlist.h evsel.h lib.h mmap.h rc_check.h threadmap.h xyarray.h
>  
>  INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/perf
> diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h
> index 37bb7771d914..189106874063 100644
> --- a/tools/lib/perf/include/perf/event.h
> +++ b/tools/lib/perf/include/perf/event.h
> @@ -457,6 +457,44 @@ struct perf_record_compressed {
>  	char			 data[];
>  };
>  
> +struct perf_record_schedstat_cpu_v15 {
> +#define CPU_FIELD(_type, _name, _ver)		_type _name
> +#include "schedstat-v15.h"
> +#undef CPU_FIELD
> +};
> +
> +struct perf_record_schedstat_cpu {
> +	struct perf_event_header header;
> +	__u64			 timestamp;
> +	union {
> +		struct perf_record_schedstat_cpu_v15 v15;
> +	};
> +	__u32			 cpu;
> +	__u16			 version;

Why not put these before the union?  I think it'll have a variable
size once you add different versions, and then it'd be hard to access
the fields after the union.  You may want to add padding explicitly.

> +};
> +
> +struct perf_record_schedstat_domain_v15 {
> +#define DOMAIN_FIELD(_type, _name, _ver)	_type _name
> +#include "schedstat-v15.h"
> +#undef DOMAIN_FIELD
> +};
> +
> +#define DOMAIN_NAME_LEN		16
> +
> +struct perf_record_schedstat_domain {
> +	struct perf_event_header header;
> +	__u16			 version;
> +	__u64			 timestamp;
> +	__u32			 cpu;
> +	__u16			 domain;

If this has similar information to schedstat_cpu, I think it's better
to start with the same layout.  And having version before timestamp
would add unnecessary padding.


> +	char			 name[DOMAIN_NAME_LEN];
> +	union {
> +		struct perf_record_schedstat_domain_v15 v15;
> +	};
> +	__u16			 nr_cpus;
> +	__u8			 cpu_mask[];

Does cpu_mask represent the domain membership?  Maybe you can split
that info into a separate record or put it in a header feature, like
the topology information we already have there.


> +};
> +
>  enum perf_user_event_type { /* above any possible kernel type */
>  	PERF_RECORD_USER_TYPE_START		= 64,
>  	PERF_RECORD_HEADER_ATTR			= 64,
> @@ -478,6 +516,8 @@ enum perf_user_event_type { /* above any possible kernel type */
>  	PERF_RECORD_HEADER_FEATURE		= 80,
>  	PERF_RECORD_COMPRESSED			= 81,
>  	PERF_RECORD_FINISHED_INIT		= 82,
> +	PERF_RECORD_SCHEDSTAT_CPU		= 83,
> +	PERF_RECORD_SCHEDSTAT_DOMAIN		= 84,
>  	PERF_RECORD_HEADER_MAX
>  };
>  
> @@ -518,6 +558,8 @@ union perf_event {
>  	struct perf_record_time_conv		time_conv;
>  	struct perf_record_header_feature	feat;
>  	struct perf_record_compressed		pack;
> +	struct perf_record_schedstat_cpu	schedstat_cpu;
> +	struct perf_record_schedstat_domain	schedstat_domain;
>  };
>  
>  #endif /* __LIBPERF_EVENT_H */
> diff --git a/tools/lib/perf/include/perf/schedstat-v15.h b/tools/lib/perf/include/perf/schedstat-v15.h
> new file mode 100644
> index 000000000000..43f8060c5337
> --- /dev/null
> +++ b/tools/lib/perf/include/perf/schedstat-v15.h
> @@ -0,0 +1,52 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifdef CPU_FIELD
> +CPU_FIELD(__u32, yld_count, v15);
> +CPU_FIELD(__u32, array_exp, v15);
> +CPU_FIELD(__u32, sched_count, v15);
> +CPU_FIELD(__u32, sched_goidle, v15);
> +CPU_FIELD(__u32, ttwu_count, v15);
> +CPU_FIELD(__u32, ttwu_local, v15);
> +CPU_FIELD(__u64, rq_cpu_time, v15);
> +CPU_FIELD(__u64, run_delay, v15);
> +CPU_FIELD(__u64, pcount, v15);
> +#endif
> +
> +#ifdef DOMAIN_FIELD
> +DOMAIN_FIELD(__u32, idle_lb_count, v15);
> +DOMAIN_FIELD(__u32, idle_lb_balanced, v15);
> +DOMAIN_FIELD(__u32, idle_lb_failed, v15);
> +DOMAIN_FIELD(__u32, idle_lb_imbalance, v15);
> +DOMAIN_FIELD(__u32, idle_lb_gained, v15);
> +DOMAIN_FIELD(__u32, idle_lb_hot_gained, v15);
> +DOMAIN_FIELD(__u32, idle_lb_nobusyq, v15);
> +DOMAIN_FIELD(__u32, idle_lb_nobusyg, v15);
> +DOMAIN_FIELD(__u32, busy_lb_count, v15);
> +DOMAIN_FIELD(__u32, busy_lb_balanced, v15);
> +DOMAIN_FIELD(__u32, busy_lb_failed, v15);
> +DOMAIN_FIELD(__u32, busy_lb_imbalance, v15);
> +DOMAIN_FIELD(__u32, busy_lb_gained, v15);
> +DOMAIN_FIELD(__u32, busy_lb_hot_gained, v15);
> +DOMAIN_FIELD(__u32, busy_lb_nobusyq, v15);
> +DOMAIN_FIELD(__u32, busy_lb_nobusyg, v15);
> +DOMAIN_FIELD(__u32, newidle_lb_count, v15);
> +DOMAIN_FIELD(__u32, newidle_lb_balanced, v15);
> +DOMAIN_FIELD(__u32, newidle_lb_failed, v15);
> +DOMAIN_FIELD(__u32, newidle_lb_imbalance, v15);
> +DOMAIN_FIELD(__u32, newidle_lb_gained, v15);
> +DOMAIN_FIELD(__u32, newidle_lb_hot_gained, v15);
> +DOMAIN_FIELD(__u32, newidle_lb_nobusyq, v15);
> +DOMAIN_FIELD(__u32, newidle_lb_nobusyg, v15);
> +DOMAIN_FIELD(__u32, alb_count, v15);
> +DOMAIN_FIELD(__u32, alb_failed, v15);
> +DOMAIN_FIELD(__u32, alb_pushed, v15);
> +DOMAIN_FIELD(__u32, sbe_count, v15);
> +DOMAIN_FIELD(__u32, sbe_balanced, v15);
> +DOMAIN_FIELD(__u32, sbe_pushed, v15);
> +DOMAIN_FIELD(__u32, sbf_count, v15);
> +DOMAIN_FIELD(__u32, sbf_balanced, v15);
> +DOMAIN_FIELD(__u32, sbf_pushed, v15);
> +DOMAIN_FIELD(__u32, ttwu_wake_remote, v15);
> +DOMAIN_FIELD(__u32, ttwu_move_affine, v15);
> +DOMAIN_FIELD(__u32, ttwu_move_balance, v15);
> +#endif
> diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
> index 11e49cafa3af..af1add2abf72 100644
> --- a/tools/perf/builtin-inject.c
> +++ b/tools/perf/builtin-inject.c
> @@ -2530,6 +2530,8 @@ int cmd_inject(int argc, const char **argv)
>  	inject.tool.finished_init	= perf_event__repipe_op2_synth;
>  	inject.tool.compressed		= perf_event__repipe_op4_synth;
>  	inject.tool.auxtrace		= perf_event__repipe_auxtrace;
> +	inject.tool.schedstat_cpu	= perf_event__repipe_op2_synth;
> +	inject.tool.schedstat_domain	= perf_event__repipe_op2_synth;
>  	inject.tool.dont_split_sample_group = true;
>  	inject.session = __perf_session__new(&data, &inject.tool,
>  					     /*trace_event_repipe=*/inject.output.is_pipe);
> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
> index 26ece6e9bfd1..1c3b56013164 100644
> --- a/tools/perf/builtin-sched.c
> +++ b/tools/perf/builtin-sched.c
> @@ -28,6 +28,8 @@
>  #include "util/debug.h"
>  #include "util/event.h"
>  #include "util/util.h"
> +#include "util/synthetic-events.h"
> +#include "util/target.h"
>  
>  #include <linux/kernel.h>
>  #include <linux/log2.h>
> @@ -55,6 +57,7 @@
>  #define MAX_PRIO		140
>  
>  static const char *cpu_list;
> +static struct perf_cpu_map *user_requested_cpus;

I guess this can be in evlist.


>  static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
>  
>  struct sched_atom;
> @@ -236,6 +239,9 @@ struct perf_sched {
>  	volatile bool   thread_funcs_exit;
>  	const char	*prio_str;
>  	DECLARE_BITMAP(prio_bitmap, MAX_PRIO);
> +
> +	struct perf_session *session;
> +	struct perf_data *data;
>  };
>  
>  /* per thread run time data */
> @@ -3670,6 +3676,199 @@ static void setup_sorting(struct perf_sched *sched, const struct option *options
>  	sort_dimension__add("pid", &sched->cmp_pid);
>  }
>  
> +static int process_synthesized_schedstat_event(const struct perf_tool *tool,
> +					       union perf_event *event,
> +					       struct perf_sample *sample __maybe_unused,
> +					       struct machine *machine __maybe_unused)
> +{
> +	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
> +
> +	if (perf_data__write(sched->data, event, event->header.size) <= 0) {
> +		pr_err("failed to write perf data, error: %m\n");
> +		return -1;
> +	}
> +
> +	sched->session->header.data_size += event->header.size;
> +	return 0;
> +}
> +
> +static void sighandler(int sig __maybe_unused)
> +{
> +}
> +
> +static int enable_sched_schedstats(int *reset)
> +{
> +	char path[PATH_MAX];
> +	FILE *fp;
> +	char ch;
> +
> +	snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint());
> +	fp = fopen(path, "w+");
> +	if (!fp) {
> +		pr_err("Failed to open %s\n", path);
> +		return -1;
> +	}
> +
> +	ch = getc(fp);
> +	if (ch == '0') {
> +		*reset = 1;
> +		rewind(fp);
> +		putc('1', fp);
> +		fclose(fp);
> +	}
> +	return 0;
> +}
> +
> +static int disable_sched_schedstat(void)
> +{
> +	char path[PATH_MAX];
> +	FILE *fp;
> +
> +	snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint());
> +	fp = fopen(path, "w");
> +	if (!fp) {
> +		pr_err("Failed to open %s\n", path);
> +		return -1;
> +	}
> +
> +	putc('0', fp);
> +	fclose(fp);
> +	return 0;
> +}
> +
> +/* perf.data or any other output file name used by stats subcommand (only). */
> +const char *output_name;
> +
> +static int perf_sched__schedstat_record(struct perf_sched *sched,
> +					int argc, const char **argv)
> +{
> +	struct perf_session *session;
> +	struct evlist *evlist;
> +	struct target *target;
> +	int reset = 0;
> +	int err = 0;
> +	int fd;
> +	struct perf_data data = {
> +		.path  = output_name,
> +		.mode  = PERF_DATA_MODE_WRITE,
> +	};
> +
> +	signal(SIGINT, sighandler);
> +	signal(SIGCHLD, sighandler);
> +	signal(SIGTERM, sighandler);
> +
> +	evlist = evlist__new();
> +	if (!evlist)
> +		return -ENOMEM;
> +
> +	session = perf_session__new(&data, &sched->tool);
> +	if (IS_ERR(session)) {
> +		pr_err("Perf session creation failed.\n");

This error path also needs evlist__delete().


> +		return PTR_ERR(session);
> +	}
> +
> +	session->evlist = evlist;
> +
> +	sched->session = session;
> +	sched->data = &data;
> +
> +	fd = perf_data__fd(&data);
> +
> +	/*
> +	 * Capture all important metadata about the system. Although they are
> +	 * not used by `perf sched stats` tool directly, they provide useful
> +	 * information about profiled environment.
> +	 */
> +	perf_header__set_feat(&session->header, HEADER_HOSTNAME);
> +	perf_header__set_feat(&session->header, HEADER_OSRELEASE);
> +	perf_header__set_feat(&session->header, HEADER_VERSION);
> +	perf_header__set_feat(&session->header, HEADER_ARCH);
> +	perf_header__set_feat(&session->header, HEADER_NRCPUS);
> +	perf_header__set_feat(&session->header, HEADER_CPUDESC);
> +	perf_header__set_feat(&session->header, HEADER_CPUID);
> +	perf_header__set_feat(&session->header, HEADER_TOTAL_MEM);
> +	perf_header__set_feat(&session->header, HEADER_CMDLINE);
> +	perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY);
> +	perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY);
> +	perf_header__set_feat(&session->header, HEADER_CACHE);
> +	perf_header__set_feat(&session->header, HEADER_MEM_TOPOLOGY);
> +	perf_header__set_feat(&session->header, HEADER_CPU_PMU_CAPS);
> +	perf_header__set_feat(&session->header, HEADER_HYBRID_TOPOLOGY);
> +	perf_header__set_feat(&session->header, HEADER_PMU_CAPS);

Probably you don't need {CPU_,}PMU_CAPS.  Also I wonder if it's possible
to add cpu-domain info here.

> +
> +	err = perf_session__write_header(session, evlist, fd, false);
> +	if (err < 0)
> +		goto out;
> +
> +	/*
> +	 * `perf sched stats` does not support workload profiling (-p pid)
> +	 * since /proc/schedstat file contains cpu specific data only. Hence, a
> +	 * profile target is either set of cpus or systemwide, never a process.
> +	 * Note that, although `-- <workload>` is supported, profile data are
> +	 * still cpu/systemwide.
> +	 */
> +	target = zalloc(sizeof(struct target));

There seems to be no need to allocate the target; just putting it on
the stack would be fine.


> +	if (cpu_list)
> +		target->cpu_list = cpu_list;
> +	else
> +		target->system_wide = true;
> +
> +	if (argc) {
> +		err = evlist__prepare_workload(evlist, target, argv, false, NULL);
> +		if (err)
> +			goto out_target;
> +	}
> +
> +	if (cpu_list) {
> +		user_requested_cpus = perf_cpu_map__new(cpu_list);

Where is this freed?


> +		if (!user_requested_cpus)
> +			goto out_target;
> +	}
> +
> +	err = perf_event__synthesize_schedstat(&(sched->tool),
> +					       process_synthesized_schedstat_event,
> +					       user_requested_cpus);
> +	if (err < 0)
> +		goto out_target;
> +
> +	err = enable_sched_schedstats(&reset);
> +	if (err < 0)
> +		goto out_target;
> +
> +	if (argc)
> +		evlist__start_workload(evlist);
> +
> +	/* wait for signal */
> +	pause();
> +
> +	if (reset) {
> +		err = disable_sched_schedstat();
> +		if (err < 0)
> +			goto out_target;
> +	}
> +
> +	err = perf_event__synthesize_schedstat(&(sched->tool),
> +					       process_synthesized_schedstat_event,
> +					       user_requested_cpus);
> +	if (err < 0)
> +		goto out_target;
> +
> +	err = perf_session__write_header(session, evlist, fd, true);
> +
> +out_target:
> +	free(target);
> +out:
> +	if (!err)
> +		fprintf(stderr, "[ perf sched stats: Wrote samples to %s ]\n", data.path);
> +	else
> +		fprintf(stderr, "[ perf sched stats: Failed !! ]\n");
> +
> +	close(fd);
> +	perf_session__delete(session);

It seems session->evlist is deleted only when the data is in read mode.

> +
> +	return err;
> +}
> +
>  static bool schedstat_events_exposed(void)
>  {
>  	/*
> @@ -3846,6 +4045,12 @@ int cmd_sched(int argc, const char **argv)
>  	OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
>  	OPT_PARENT(sched_options)
>  	};
> +	const struct option stats_options[] = {
> +	OPT_STRING('o', "output", &output_name, "file",
> +		   "`stats record` with output filename"),
> +	OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
> +	OPT_END()
> +	};
>  
>  	const char * const latency_usage[] = {
>  		"perf sched latency [<options>]",
> @@ -3863,9 +4068,13 @@ int cmd_sched(int argc, const char **argv)
>  		"perf sched timehist [<options>]",
>  		NULL
>  	};
> +	const char *stats_usage[] = {
> +		"perf sched stats {record} [<options>]",
> +		NULL
> +	};
>  	const char *const sched_subcommands[] = { "record", "latency", "map",
>  						  "replay", "script",
> -						  "timehist", NULL };
> +						  "timehist", "stats", NULL };
>  	const char *sched_usage[] = {
>  		NULL,
>  		NULL
> @@ -3961,6 +4170,21 @@ int cmd_sched(int argc, const char **argv)
>  			return ret;
>  
>  		return perf_sched__timehist(&sched);
> +	} else if (!strcmp(argv[0], "stats")) {
> +		const char *const stats_subcommands[] = {"record", NULL};
> +
> +		argc = parse_options_subcommand(argc, argv, stats_options,
> +						stats_subcommands,
> +						stats_usage,
> +						PARSE_OPT_STOP_AT_NON_OPTION);
> +
> +		if (argv[0] && !strcmp(argv[0], "record")) {
> +			if (argc)
> +				argc = parse_options(argc, argv, stats_options,
> +						     stats_usage, 0);
> +			return perf_sched__schedstat_record(&sched, argc, argv);
> +		}
> +		usage_with_options(stats_usage, stats_options);
>  	} else {
>  		usage_with_options(sched_usage, sched_options);
>  	}
> diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
> index aac96d5d1917..0f863d38abe2 100644
> --- a/tools/perf/util/event.c
> +++ b/tools/perf/util/event.c
> @@ -77,6 +77,8 @@ static const char *perf_event__names[] = {
>  	[PERF_RECORD_HEADER_FEATURE]		= "FEATURE",
>  	[PERF_RECORD_COMPRESSED]		= "COMPRESSED",
>  	[PERF_RECORD_FINISHED_INIT]		= "FINISHED_INIT",
> +	[PERF_RECORD_SCHEDSTAT_CPU]		= "SCHEDSTAT_CPU",
> +	[PERF_RECORD_SCHEDSTAT_DOMAIN]		= "SCHEDSTAT_DOMAIN",
>  };
>  
>  const char *perf_event__name(unsigned int id)
> @@ -550,6 +552,102 @@ size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *ma
>  	return ret;
>  }
>  
> +size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp)
> +{
> +	struct perf_record_schedstat_cpu *cs = &event->schedstat_cpu;
> +	__u16 version = cs->version;
> +	size_t size = 0;
> +
> +	size = fprintf(fp, "\ncpu%u ", cs->cpu);
> +
> +#define CPU_FIELD(_type, _name, _ver)						\
> +	size += fprintf(fp, "%" PRIu64 " ", (unsigned long)cs->_ver._name)
> +
> +	if (version == 15) {
> +#include <perf/schedstat-v15.h>
> +		return size;
> +	}
> +#undef CPU_FIELD
> +
> +	return fprintf(fp, "Unsupported /proc/schedstat version %d.\n",
> +		       event->schedstat_cpu.version);
> +}
> +
> +size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp)
> +{
> +	struct perf_record_schedstat_domain *ds = &event->schedstat_domain;
> +	__u16 version = ds->version;
> +	size_t cpu_mask_len_2;
> +	size_t cpu_mask_len;
> +	size_t size = 0;
> +	char *cpu_mask;
> +	int idx;
> +	int i, j;
> +	bool low;
> +
> +	if (ds->name[0])
> +		size = fprintf(fp, "\ndomain%u:%s ", ds->domain, ds->name);
> +	else
> +		size = fprintf(fp, "\ndomain%u ", ds->domain);
> +
> +	cpu_mask_len = ((ds->nr_cpus + 3) >> 2);
> +	cpu_mask_len_2 = cpu_mask_len + ((cpu_mask_len - 1) / 8);
> +
> +	cpu_mask = zalloc(cpu_mask_len_2 + 1);
> +	if (!cpu_mask)
> +		return fprintf(fp, "Cannot allocate memory for cpumask\n");
> +
> +	idx = ((ds->nr_cpus + 7) >> 3) - 1;
> +
> +	i = cpu_mask_len_2 - 1;
> +
> +	low = true;
> +	j = 1;
> +	while (i >= 0) {
> +		__u8 m;
> +
> +		if (low)
> +			m = ds->cpu_mask[idx] & 0xf;
> +		else
> +			m = (ds->cpu_mask[idx] & 0xf0) >> 4;
> +
> +		if (m >= 0 && m <= 9)
> +			m += '0';
> +		else if (m >= 0xa && m <= 0xf)
> +			m = m + 'a' - 10;
> +		else if (m >= 0xA && m <= 0xF)
> +			m = m + 'A' - 10;
> +
> +		cpu_mask[i] = m;
> +
> +		if (j == 8 && i != 0) {
> +			cpu_mask[i - 1] = ',';
> +			j = 0;
> +			i--;
> +		}
> +
> +		if (!low)
> +			idx--;
> +		low = !low;
> +		i--;
> +		j++;
> +	}
> +	size += fprintf(fp, "%s ", cpu_mask);
> +	free(cpu_mask);
> +
> +#define DOMAIN_FIELD(_type, _name, _ver)					\
> +	size += fprintf(fp, "%" PRIu64 " ", (unsigned long)ds->_ver._name)
> +
> +	if (version == 15) {
> +#include <perf/schedstat-v15.h>
> +		return size;
> +	}
> +#undef DOMAIN_FIELD
> +
> +	return fprintf(fp, "Unsupported /proc/schedstat version %d.\n",
> +		       event->schedstat_domain.version);
> +}
> +
>  size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FILE *fp)
>  {
>  	size_t ret = fprintf(fp, "PERF_RECORD_%s",
> diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
> index 2744c54f404e..333f2405cd5a 100644
> --- a/tools/perf/util/event.h
> +++ b/tools/perf/util/event.h
> @@ -361,6 +361,8 @@ size_t perf_event__fprintf_cgroup(union perf_event *event, FILE *fp);
>  size_t perf_event__fprintf_ksymbol(union perf_event *event, FILE *fp);
>  size_t perf_event__fprintf_bpf(union perf_event *event, FILE *fp);
>  size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *machine,FILE *fp);
> +size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp);
> +size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp);
>  size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FILE *fp);
>  
>  int kallsyms__get_function_start(const char *kallsyms_filename,
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index c06e3020a976..bcffee2b7239 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -692,6 +692,20 @@ static void perf_event__time_conv_swap(union perf_event *event,
>  	}
>  }
>  
> +static void
> +perf_event__schedstat_cpu_swap(union perf_event *event __maybe_unused,
> +			       bool sample_id_all __maybe_unused)
> +{
> +	/* FIXME */
> +}
> +
> +static void
> +perf_event__schedstat_domain_swap(union perf_event *event __maybe_unused,
> +				  bool sample_id_all __maybe_unused)
> +{
> +	/* FIXME */
> +}
> +
>  typedef void (*perf_event__swap_op)(union perf_event *event,
>  				    bool sample_id_all);
>  
> @@ -730,6 +744,8 @@ static perf_event__swap_op perf_event__swap_ops[] = {
>  	[PERF_RECORD_STAT_ROUND]	  = perf_event__stat_round_swap,
>  	[PERF_RECORD_EVENT_UPDATE]	  = perf_event__event_update_swap,
>  	[PERF_RECORD_TIME_CONV]		  = perf_event__time_conv_swap,
> +	[PERF_RECORD_SCHEDSTAT_CPU]	  = perf_event__schedstat_cpu_swap,
> +	[PERF_RECORD_SCHEDSTAT_DOMAIN]	  = perf_event__schedstat_domain_swap,
>  	[PERF_RECORD_HEADER_MAX]	  = NULL,
>  };
>  
> @@ -1455,6 +1471,10 @@ static s64 perf_session__process_user_event(struct perf_session *session,
>  		return err;
>  	case PERF_RECORD_FINISHED_INIT:
>  		return tool->finished_init(session, event);
> +	case PERF_RECORD_SCHEDSTAT_CPU:
> +		return tool->schedstat_cpu(session, event);
> +	case PERF_RECORD_SCHEDSTAT_DOMAIN:
> +		return tool->schedstat_domain(session, event);
>  	default:
>  		return -EINVAL;
>  	}
> diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
> index 6923b0d5efed..f928f07bea15 100644
> --- a/tools/perf/util/synthetic-events.c
> +++ b/tools/perf/util/synthetic-events.c
> @@ -2511,3 +2511,242 @@ int parse_synth_opt(char *synth)
>  
>  	return ret;
>  }
> +
> +static union perf_event *__synthesize_schedstat_cpu(struct io *io, __u16 version,
> +						    __u64 *cpu, __u64 timestamp)
> +{
> +	struct perf_record_schedstat_cpu *cs;
> +	union perf_event *event;
> +	size_t size;
> +	char ch;
> +
> +	size = sizeof(struct perf_record_schedstat_cpu);

I think the kernel code prefers sizeof(*cs) instead.


> +	size = PERF_ALIGN(size, sizeof(u64));
> +	event = zalloc(size);

The size is static; do you really need a dynamic allocation?

Thanks,
Namhyung

> +
> +	if (!event)
> +		return NULL;
> +
> +	cs = &event->schedstat_cpu;
> +	cs->header.type = PERF_RECORD_SCHEDSTAT_CPU;
> +	cs->header.size = size;
> +	cs->timestamp = timestamp;
> +
> +	if (io__get_char(io) != 'p' || io__get_char(io) != 'u')
> +		goto out_cpu;
> +
> +	if (io__get_dec(io, (__u64 *)cpu) != ' ')
> +		goto out_cpu;
> +
> +#define CPU_FIELD(_type, _name, _ver)					\
> +	do {								\
> +		__u64 _tmp;						\
> +		ch = io__get_dec(io, &_tmp);				\
> +		if (ch != ' ' && ch != '\n')				\
> +			goto out_cpu;					\
> +		cs->_ver._name = _tmp;					\
> +	} while (0)
> +
> +	if (version == 15) {
> +#include <perf/schedstat-v15.h>
> +	}
> +#undef CPU_FIELD
> +
> +	cs->cpu = *cpu;
> +	cs->version = version;
> +
> +	return event;
> +out_cpu:
> +	free(event);
> +	return NULL;
> +}
> +
> +static size_t schedstat_sanitize_cpumask(char *cpu_mask, size_t cpu_mask_len)
> +{
> +	char *dst = cpu_mask;
> +	char *src = cpu_mask;
> +	int i = 0;
> +
> +	for ( ; src < cpu_mask + cpu_mask_len; dst++, src++) {
> +		while (*src == ',')
> +			src++;
> +
> +		*dst = *src;
> +	}
> +
> +	for ( ; dst < src; dst++, i++)
> +		*dst = '\0';
> +
> +	return cpu_mask_len - i;
> +}
> +
> +static union perf_event *__synthesize_schedstat_domain(struct io *io, __u16 version,
> +						       __u64 cpu, __u64 timestamp)
> +{
> +	struct perf_env env = { .total_mem = 0, };
> +	int nr_cpus_avail = perf_env__nr_cpus_avail(&env);
> +	struct perf_record_schedstat_domain *ds;
> +	union perf_event *event;
> +	char *d_name = NULL;
> +	size_t cpu_mask_len = 0;
> +	char *cpu_mask = NULL;
> +	__u64 d_num;
> +	size_t size;
> +	int i = 0;
> +	bool low;
> +	char ch;
> +	int idx;
> +
> +	if (io__get_char(io) != 'o' || io__get_char(io) != 'm' || io__get_char(io) != 'a' ||
> +	    io__get_char(io) != 'i' || io__get_char(io) != 'n')
> +		return NULL;
> +
> +	ch = io__get_dec(io, &d_num);
> +
> +	if (io__getdelim(io, &cpu_mask, &cpu_mask_len, ' ') < 0 || !cpu_mask_len)
> +		goto out;
> +
> +	cpu_mask[cpu_mask_len - 1] = '\0';
> +	cpu_mask_len--;
> +	cpu_mask_len = schedstat_sanitize_cpumask(cpu_mask, cpu_mask_len);
> +
> +	size = sizeof(struct perf_record_schedstat_domain) + ((nr_cpus_avail + 7) >> 3);
> +	size = PERF_ALIGN(size, sizeof(u64));
> +	event = zalloc(size);
> +
> +	if (!event)
> +		goto out_cpu_mask;
> +
> +	ds = &event->schedstat_domain;
> +	ds->header.type = PERF_RECORD_SCHEDSTAT_DOMAIN;
> +	ds->header.size = size;
> +	ds->version = version;
> +	ds->timestamp = timestamp;
> +	if (d_name)
> +		strncpy(ds->name, d_name, DOMAIN_NAME_LEN - 1);
> +	ds->domain = d_num;
> +	ds->nr_cpus = nr_cpus_avail;
> +
> +	idx = ((nr_cpus_avail + 7) >> 3) - 1;
> +	low = true;
> +	for (i = cpu_mask_len - 1; i >= 0 && idx >= 0; i--) {
> +		char mask = cpu_mask[i];
> +
> +		if (mask >= '0' && mask <= '9')
> +			mask -= '0';
> +		else if (mask >= 'a' && mask <= 'f')
> +			mask = mask - 'a' + 10;
> +		else if (mask >= 'A' && mask <= 'F')
> +			mask = mask - 'A' + 10;
> +
> +		if (low) {
> +			ds->cpu_mask[idx] = mask;
> +		} else {
> +			ds->cpu_mask[idx] |= (mask << 4);
> +			idx--;
> +		}
> +		low = !low;
> +	}
> +
> +	free(cpu_mask);
> +
> +#define DOMAIN_FIELD(_type, _name, _ver)				\
> +	do {								\
> +		__u64 _tmp;						\
> +		ch = io__get_dec(io, &_tmp);				\
> +		if (ch != ' ' && ch != '\n')				\
> +			goto out_domain;				\
> +		ds->_ver._name = _tmp;					\
> +	} while (0)
> +
> +	if (version == 15) {
> +#include <perf/schedstat-v15.h>
> +	}
> +#undef DOMAIN_FIELD
> +
> +	ds->cpu = cpu;
> +	return event;
> +
> +out_domain:
> +	free(event);
> +out_cpu_mask:
> +	free(cpu_mask);
> +out:
> +	return NULL;
> +}
> +
> +int perf_event__synthesize_schedstat(const struct perf_tool *tool,
> +				     perf_event__handler_t process,
> +				     struct perf_cpu_map *user_requested_cpus)
> +{
> +	char *line = NULL, path[PATH_MAX];
> +	union perf_event *event = NULL;
> +	size_t line_len = 0;
> +	char bf[BUFSIZ];
> +	__u64 timestamp;
> +	__u64 cpu = -1;
> +	__u16 version;
> +	struct io io;
> +	int ret = -1;
> +	char ch;
> +
> +	snprintf(path, PATH_MAX, "%s/schedstat", procfs__mountpoint());
> +	io.fd = open(path, O_RDONLY, 0);
> +	if (io.fd < 0) {
> +		pr_err("Failed to open %s. Possibly CONFIG_SCHEDSTAT is disabled.\n", path);
> +		return -1;
> +	}
> +	io__init(&io, io.fd, bf, sizeof(bf));
> +
> +	if (io__getline(&io, &line, &line_len) < 0 || !line_len)
> +		goto out;
> +
> +	if (!strcmp(line, "version 15\n")) {
> +		version = 15;
> +	} else {
> +		pr_err("Unsupported %s version: %s", path, line + 8);
> +		goto out_free_line;
> +	}
> +
> +	if (io__getline(&io, &line, &line_len) < 0 || !line_len)
> +		goto out_free_line;
> +	timestamp = atol(line + 10);
> +
> +	/*
> +	 * FIXME: Can be optimized a bit by not synthesizing domain samples
> +	 * for filtered out cpus.
> +	 */
> +	for (ch = io__get_char(&io); !io.eof; ch = io__get_char(&io)) {
> +		struct perf_cpu this_cpu;
> +
> +		if (ch == 'c') {
> +			event = __synthesize_schedstat_cpu(&io, version,
> +							   &cpu, timestamp);
> +		} else if (ch == 'd') {
> +			event = __synthesize_schedstat_domain(&io, version,
> +							      cpu, timestamp);
> +		}
> +		if (!event)
> +			goto out_free_line;
> +
> +		this_cpu.cpu = cpu;
> +
> +		if (user_requested_cpus && !perf_cpu_map__has(user_requested_cpus, this_cpu))
> +			continue;
> +
> +		if (process(tool, event, NULL, NULL) < 0) {
> +			free(event);
> +			goto out_free_line;
> +		}
> +
> +		free(event);
> +	}
> +
> +	ret = 0;
> +
> +out_free_line:
> +	free(line);
> +out:
> +	close(io.fd);
> +	return ret;
> +}
> diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h
> index b9c936b5cfeb..eab914c238df 100644
> --- a/tools/perf/util/synthetic-events.h
> +++ b/tools/perf/util/synthetic-events.h
> @@ -141,4 +141,7 @@ int perf_event__synthesize_for_pipe(const struct perf_tool *tool,
>  				    struct perf_data *data,
>  				    perf_event__handler_t process);
>  
> +int perf_event__synthesize_schedstat(const struct perf_tool *tool,
> +				     perf_event__handler_t process,
> +				     struct perf_cpu_map *user_requested_cpu);
>  #endif // __PERF_SYNTHETIC_EVENTS_H
> diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c
> index 3b7f390f26eb..9f81d720735f 100644
> --- a/tools/perf/util/tool.c
> +++ b/tools/perf/util/tool.c
> @@ -230,6 +230,24 @@ static int perf_session__process_compressed_event_stub(struct perf_session *sess
>  	return 0;
>  }
>  
> +static int process_schedstat_cpu_stub(struct perf_session *perf_session __maybe_unused,
> +				      union perf_event *event)
> +{
> +	if (dump_trace)
> +		perf_event__fprintf_schedstat_cpu(event, stdout);
> +	dump_printf(": unhandled!\n");
> +	return 0;
> +}
> +
> +static int process_schedstat_domain_stub(struct perf_session *perf_session __maybe_unused,
> +					 union perf_event *event)
> +{
> +	if (dump_trace)
> +		perf_event__fprintf_schedstat_domain(event, stdout);
> +	dump_printf(": unhandled!\n");
> +	return 0;
> +}
> +
>  void perf_tool__init(struct perf_tool *tool, bool ordered_events)
>  {
>  	tool->ordered_events = ordered_events;
> @@ -286,6 +304,8 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events)
>  	tool->compressed = perf_session__process_compressed_event_stub;
>  #endif
>  	tool->finished_init = process_event_op2_stub;
> +	tool->schedstat_cpu = process_schedstat_cpu_stub;
> +	tool->schedstat_domain = process_schedstat_domain_stub;
>  }
>  
>  bool perf_tool__compressed_is_stub(const struct perf_tool *tool)
> diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
> index db1c7642b0d1..d289a5396b01 100644
> --- a/tools/perf/util/tool.h
> +++ b/tools/perf/util/tool.h
> @@ -77,7 +77,9 @@ struct perf_tool {
>  			stat,
>  			stat_round,
>  			feature,
> -			finished_init;
> +			finished_init,
> +			schedstat_cpu,
> +			schedstat_domain;
>  	event_op4	compressed;
>  	event_op3	auxtrace;
>  	bool		ordered_events;
> -- 
> 2.43.0
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ