Message-ID: <CABPqkBQj4o0CCPL8d3eU2fCpjBnwLduni7MsiMR+-o=g=Ws7Mg@mail.gmail.com>
Date: Thu, 5 Nov 2020 00:40:36 -0800
From: Stephane Eranian <eranian@...gle.com>
To: Namhyung Kim <namhyung@...nel.org>
Cc: Arnaldo Carvalho de Melo <acme@...nel.org>,
Jiri Olsa <jolsa@...hat.com>,
Peter Zijlstra <a.p.zijlstra@...llo.nl>,
Andi Kleen <ak@...ux.intel.com>,
Ingo Molnar <mingo@...nel.org>,
Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
LKML <linux-kernel@...r.kernel.org>,
Ian Rogers <irogers@...gle.com>,
"Paul E. McKenney" <paulmck@...ibm.com>
Subject: Re: [RFC] perf evlist: Warn if event group has mixed sw/hw events
On Mon, Oct 26, 2020 at 7:19 AM Namhyung Kim <namhyung@...nel.org> wrote:
>
> I found that order of events in a group impacts performance during the
> open. If a group has a software event as a leader and has other
> hardware events, the leader needs to be moved to a hardware context.
> This includes RCU synchronization, which takes about 20 msec on my
> system. And this is just for a single group, so the total time increases
> in proportion to the number of event groups and the number of cpus.
>
> On my 36 cpu system, opening 3 groups system-wide takes more than 2
> seconds. You can see and compare it easily with the following:
>
> $ time ./perf stat -a -e '{cs,cycles},{cs,cycles},{cs,cycles}' sleep 1
> ...
> 1.006333430 seconds time elapsed
>
> real 0m3.969s
> user 0m0.089s
> sys 0m0.074s
>
> $ time ./perf stat -a -e '{cycles,cs},{cycles,cs},{cycles,cs}' sleep 1
> ...
> 1.006755292 seconds time elapsed
>
> real 0m1.144s
> user 0m0.067s
> sys 0m0.083s
>
> This patch just adds a warning before opening the events. I'd really
> like to fix the kernel if possible but don't have a good idea. Thoughts?
>
This is a problem for us. It has caused the perf command on our systems
to take much longer than expected and to trigger timeouts. The cost of
perf_event_open() should not depend so heavily on the order of the
events in a group. The penalty incurred by synchronize_rcu() is very
large and likely does not scale well, and scalability may not only be
impacted by the number of CPUs in the machine. I am not an RCU expert,
but it seems this exposes perf_event_open() to penalties caused by
operations in other subsystems. I am wondering if there is a different
way of handling the change of group type that would avoid the high cost
of synchronize_rcu().
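
To make the ordering concrete at the syscall level, here is a minimal,
illustrative sketch (not from the patch, error handling trimmed) that
opens a cycles+context-switches group directly with perf_event_open()
in both orders. With the hardware event leading, the group sits in the
hardware context from the start; with the software event leading and a
hardware sibling attached afterwards, the kernel has to move the group,
which is where the synchronize_rcu() cost shows up:

/* group_order.c: compare the two group creation orders (illustrative only) */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <unistd.h>

static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
			       int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

/* Open a two-event group on the current task, any CPU. */
static void open_group(struct perf_event_attr *leader_attr,
		       struct perf_event_attr *sibling_attr, const char *name)
{
	int leader, sibling;

	leader = sys_perf_event_open(leader_attr, 0, -1, -1, 0);
	if (leader < 0) {
		perror(name);
		return;
	}
	/* group_fd = leader attaches the sibling to the existing group */
	sibling = sys_perf_event_open(sibling_attr, 0, -1, leader, 0);
	if (sibling < 0)
		perror(name);
	else
		close(sibling);
	close(leader);
}

int main(void)
{
	struct perf_event_attr hw = {
		.size		= sizeof(hw),
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.disabled	= 1,
	};
	struct perf_event_attr sw = {
		.size		= sizeof(sw),
		.type		= PERF_TYPE_SOFTWARE,
		.config		= PERF_COUNT_SW_CONTEXT_SWITCHES,
		.disabled	= 1,
	};

	open_group(&hw, &sw, "hw leader");	/* group stays in the hw context */
	open_group(&sw, &hw, "sw leader");	/* group has to move to the hw context */
	return 0;
}

Timing the two open_group() calls (with clock_gettime(), for example)
reproduces the difference that the perf stat numbers above show.
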
> Signed-off-by: Namhyung Kim <namhyung@...nel.org>
> ---
> tools/perf/builtin-record.c | 2 +
> tools/perf/builtin-stat.c | 2 +
> tools/perf/builtin-top.c | 2 +
> tools/perf/util/evlist.c | 78 +++++++++++++++++++++++++++++++++++++
> tools/perf/util/evlist.h | 1 +
> 5 files changed, 85 insertions(+)
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index adf311d15d3d..c0b08cacbae0 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -912,6 +912,8 @@ static int record__open(struct record *rec)
>
> 	perf_evlist__config(evlist, opts, &callchain_param);
>
> +	evlist__warn_mixed_group(evlist);
> +
> 	evlist__for_each_entry(evlist, pos) {
> try_again:
> 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index b01af171d94f..d5d4e02bda69 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -738,6 +738,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
> 	if (affinity__setup(&affinity) < 0)
> 		return -1;
>
> +	evlist__warn_mixed_group(evsel_list);
> +
> 	evlist__for_each_cpu (evsel_list, i, cpu) {
> 		affinity__set(&affinity, cpu);
>
> diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
> index 7c64134472c7..9ad319cea948 100644
> --- a/tools/perf/builtin-top.c
> +++ b/tools/perf/builtin-top.c
> @@ -1027,6 +1027,8 @@ static int perf_top__start_counters(struct perf_top *top)
>
> 	perf_evlist__config(evlist, opts, &callchain_param);
>
> +	evlist__warn_mixed_group(evlist);
> +
> 	evlist__for_each_entry(evlist, counter) {
> try_again:
> 		if (evsel__open(counter, top->evlist->core.cpus,
> diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
> index 8bdf3d2c907c..02cff39e509e 100644
> --- a/tools/perf/util/evlist.c
> +++ b/tools/perf/util/evlist.c
> @@ -28,6 +28,7 @@
> #include <unistd.h>
> #include <sched.h>
> #include <stdlib.h>
> +#include <dirent.h>
>
> #include "parse-events.h"
> #include <subcmd/parse-options.h>
> @@ -1980,3 +1981,80 @@ struct evsel *evlist__find_evsel(struct evlist *evlist, int idx)
> 	}
> 	return NULL;
> }
> +
> +static int *sw_types;
> +static int nr_sw_types;
> +
> +static void collect_software_pmu_types(void)
> +{
> +	const char *known_sw_pmu[] = {
> +		"software", "tracepoint", "breakpoint", "kprobe", "uprobe", "msr"
> +	};
> +	DIR *dir;
> +	struct dirent *d;
> +	char path[PATH_MAX];
> +	int i;
> +
> +	if (sw_types != NULL)
> +		return;
> +
> +	nr_sw_types = ARRAY_SIZE(known_sw_pmu);
> +	sw_types = calloc(nr_sw_types, sizeof(int));
> +	if (sw_types == NULL) {
> +		pr_err("Memory allocation failed!\n");
> +		return;
> +	}
> +
> +	dir = opendir("/sys/bus/event_source/devices");
> +	if (dir == NULL) {
> +		nr_sw_types = 0;
> +		return;
> +	}
> +
> +	while ((d = readdir(dir)) != NULL) {
> +		for (i = 0; i < nr_sw_types; i++) {
> +			if (strcmp(d->d_name, known_sw_pmu[i]))
> +				continue;
> +
> +			snprintf(path, sizeof(path), "%s/%s/type",
> +				 "bus/event_source/devices", d->d_name);
> +			sysfs__read_int(path, &sw_types[i]);
> +		}
> +	}
> +	closedir(dir);
> +}
> +
> +static bool is_software_event(struct evsel *evsel)
> +{
> +	int i;
> +
> +	for (i = 0; i < nr_sw_types; i++) {
> +		if (evsel->core.attr.type == (unsigned)sw_types[i])
> +			return true;
> +	}
> +	return false;
> +}
> +
> +void evlist__warn_mixed_group(struct evlist *evlist)
> +{
> +	struct evsel *leader, *evsel;
> +	bool warn = true;
> +
> +	collect_software_pmu_types();
> +
> +	/* Warn if an event group has a sw leader and hw siblings */
> +	evlist__for_each_entry(evlist, leader) {
> +		if (!evsel__is_group_event(leader))
> +			continue;
> +
> +		if (!is_software_event(leader))
> +			continue;
> +
> +		for_each_group_member(evsel, leader) {
> +			if (is_software_event(evsel))
> +				continue;
> +			if (!warn)
> +				continue;
> +
> +			pr_warning("WARNING: Event group has mixed hw/sw events.\n"
> +				   "This will slow down the perf_event_open syscall.\n"
> +				   "Consider making a hw event the leader.\n\n");
> +			warn = false;
> +		}
> +	}
> +}
> diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
> index e1a450322bc5..a5b0a1d03a00 100644
> --- a/tools/perf/util/evlist.h
> +++ b/tools/perf/util/evlist.h
> @@ -387,4 +387,5 @@ int evlist__ctlfd_ack(struct evlist *evlist);
> #define EVLIST_DISABLED_MSG "Events disabled\n"
>
> struct evsel *evlist__find_evsel(struct evlist *evlist, int idx);
> +void evlist__warn_mixed_group(struct evlist *evlist);
> #endif /* __PERF_EVLIST_H */
> --
> 2.29.0.rc1.297.gfa9743e501-goog
>
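
A side note on collect_software_pmu_types(): the values it caches are just
the integers the kernel exports in /sys/bus/event_source/devices/<pmu>/type,
which the patch later compares against evsel->core.attr.type. The following
standalone sketch (illustrative only; it uses plain stdio instead of perf's
sysfs__read_int() helper and assumes sysfs is mounted at /sys) dumps the
same values:

/* dump_sw_pmu_types.c: print the type numbers of the known software PMUs */
#include <stdio.h>

int main(void)
{
	static const char * const known_sw_pmu[] = {
		"software", "tracepoint", "breakpoint", "kprobe", "uprobe", "msr"
	};
	char path[256];
	unsigned int i;

	for (i = 0; i < sizeof(known_sw_pmu) / sizeof(known_sw_pmu[0]); i++) {
		FILE *f;
		int type;

		snprintf(path, sizeof(path),
			 "/sys/bus/event_source/devices/%s/type", known_sw_pmu[i]);
		f = fopen(path, "r");
		if (f == NULL)
			continue;	/* this PMU is not present on this kernel */
		if (fscanf(f, "%d", &type) == 1)
			printf("%-10s type=%d\n", known_sw_pmu[i], type);
		fclose(f);
	}
	return 0;
}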