Message-ID: <20251106071241.141234-8-irogers@google.com>
Date: Wed, 5 Nov 2025 23:12:38 -0800
From: Ian Rogers <irogers@...gle.com>
To: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>, Namhyung Kim <namhyung@...nel.org>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>, Jiri Olsa <jolsa@...nel.org>,
Ian Rogers <irogers@...gle.com>, Adrian Hunter <adrian.hunter@...el.com>,
"Dr. David Alan Gilbert" <linux@...blig.org>, Yang Li <yang.lee@...ux.alibaba.com>,
James Clark <james.clark@...aro.org>, Thomas Falcon <thomas.falcon@...el.com>,
Thomas Richter <tmricht@...ux.ibm.com>, linux-perf-users@...r.kernel.org,
linux-kernel@...r.kernel.org, Andi Kleen <ak@...ux.intel.com>,
Dapeng Mi <dapeng1.mi@...ux.intel.com>
Subject: [PATCH v3 7/9] perf evlist: Reduce affinity use and move into
iterator, fix no affinity

The evlist__for_each_cpu iterator will call sched_setaffinity when
moving between CPUs to avoid IPIs. If only 1 IPI is saved then this
may be unprofitable, as the delay to get scheduled again may be
considerable. This may be particularly true when reading an event
group in `perf stat` in interval mode.

Move the affinity handling completely into the iterator so that a
single evlist__use_affinity check can determine whether CPU affinities
will be used. For `perf record` the change is minimal, as the dummy
event and the real event always make using affinities worthwhile. In
`perf stat`, tool events are ignored and affinities are only used if
>1 event occurs on the same CPU. Whether an event benefits from
affinity is determined per event by the new
perf_pmu__benefits_from_affinity function.

Fix a bug where, when there are no affinities, the CPU map iterator
may reference a CPU not present in the initial evsel. Fix this by
making the iterator and non-iterator code paths common.

Fix a bug where closing events on an evlist wasn't closing TPEBS
events.

Signed-off-by: Ian Rogers <irogers@...gle.com>
---
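
Not part of the patch, for review only: a minimal sketch of the new
calling convention. enable_until_error() is a hypothetical helper and
its error handling is assumed; the iterator API and evsel__enable_cpu()
are the ones used in the hunks below. Callers no longer pass a struct
affinity, and a loop that is left early must call
evlist_cpu_iterator__exit() to restore the saved affinity (reaching the
natural end of the iteration cleans up automatically, and repeated
calls are safe).

#include "util/evlist.h"	/* evlist__for_each_cpu() and the iterator API */
#include "util/evsel.h"		/* evsel__enable_cpu() */

static int enable_until_error(struct evlist *evlist)
{
	struct evlist_cpu_iterator evlist_cpu_itr;

	evlist__for_each_cpu(evlist_cpu_itr, evlist) {
		/* Affinity is already pinned to evlist_cpu_itr.cpu here. */
		int err = evsel__enable_cpu(evlist_cpu_itr.evsel,
					    evlist_cpu_itr.cpu_map_idx);

		if (err) {
			/* Leaving the loop early: restore the saved affinity. */
			evlist_cpu_iterator__exit(&evlist_cpu_itr);
			return err;
		}
	}
	return 0;
}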
tools/perf/builtin-stat.c | 128 ++++++++++++++----------------
tools/perf/util/evlist.c | 160 ++++++++++++++++++++++++--------------
tools/perf/util/evlist.h | 26 +++++--
tools/perf/util/pmu.c | 12 +++
tools/perf/util/pmu.h | 1 +
5 files changed, 189 insertions(+), 138 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 3cd663b3b357..5f31cd5bb03b 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -360,22 +360,14 @@ static int read_counter_cpu(struct evsel *counter, int cpu_map_idx)
return 0;
}
-static int read_affinity_counters(void)
+static int read_counters_with_affinity(void)
{
struct evlist_cpu_iterator evlist_cpu_itr;
- struct affinity saved_affinity, *affinity;
if (all_counters_use_bpf)
return 0;
- if (!target__has_cpu(&target) || target__has_per_thread(&target))
- affinity = NULL;
- else if (affinity__setup(&saved_affinity) < 0)
- return -1;
- else
- affinity = &saved_affinity;
-
- evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
+ evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
struct evsel *counter = evlist_cpu_itr.evsel;
if (evsel__is_bpf(counter))
@@ -384,8 +376,6 @@ static int read_affinity_counters(void)
if (!counter->err)
counter->err = read_counter_cpu(counter, evlist_cpu_itr.cpu_map_idx);
}
- if (affinity)
- affinity__cleanup(&saved_affinity);
return 0;
}
@@ -408,12 +398,18 @@ static int read_bpf_map_counters(void)
static int read_counters(void)
{
- if (!stat_config.stop_read_counter) {
- if (read_bpf_map_counters() ||
- read_affinity_counters())
- return -1;
- }
- return 0;
+ int ret;
+
+ if (stat_config.stop_read_counter)
+ return 0;
+
+ // Read all BPF counters first.
+ ret = read_bpf_map_counters();
+ if (ret)
+ return ret;
+
+ // Read non-BPF and non-tool counters next.
+ return read_counters_with_affinity();
}
static void process_counters(void)
@@ -754,7 +750,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
const bool forks = (argc > 0);
bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false;
struct evlist_cpu_iterator evlist_cpu_itr;
- struct affinity saved_affinity, *affinity = NULL;
int err, open_err = 0;
bool second_pass = false, has_supported_counters;
@@ -766,14 +761,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
child_pid = evsel_list->workload.pid;
}
- if (!cpu_map__is_dummy(evsel_list->core.user_requested_cpus)) {
- if (affinity__setup(&saved_affinity) < 0) {
- err = -1;
- goto err_out;
- }
- affinity = &saved_affinity;
- }
-
evlist__for_each_entry(evsel_list, counter) {
counter->reset_group = false;
if (bpf_counter__load(counter, &target)) {
@@ -786,49 +773,48 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
evlist__reset_aggr_stats(evsel_list);
- evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
- counter = evlist_cpu_itr.evsel;
+ /*
+ * bperf calls evsel__open_per_cpu() in bperf__load(), so
+ * no need to call it again here.
+ */
+ if (!target.use_bpf) {
+ evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
+ counter = evlist_cpu_itr.evsel;
- /*
- * bperf calls evsel__open_per_cpu() in bperf__load(), so
- * no need to call it again here.
- */
- if (target.use_bpf)
- break;
+ if (counter->reset_group || !counter->supported)
+ continue;
+ if (evsel__is_bperf(counter))
+ continue;
- if (counter->reset_group || !counter->supported)
- continue;
- if (evsel__is_bperf(counter))
- continue;
+ while (true) {
+ if (create_perf_stat_counter(counter, &stat_config,
+ evlist_cpu_itr.cpu_map_idx) == 0)
+ break;
- while (true) {
- if (create_perf_stat_counter(counter, &stat_config,
- evlist_cpu_itr.cpu_map_idx) == 0)
- break;
+ open_err = errno;
+ /*
+ * Weak group failed. We cannot just undo this
+ * here because earlier CPUs might be in group
+ * mode, and the kernel doesn't support mixing
+ * group and non group reads. Defer it to later.
+ * Don't close here because we're in the wrong
+ * affinity.
+ */
+ if ((open_err == EINVAL || open_err == EBADF) &&
+ evsel__leader(counter) != counter &&
+ counter->weak_group) {
+ evlist__reset_weak_group(evsel_list, counter, false);
+ assert(counter->reset_group);
+ counter->supported = true;
+ second_pass = true;
+ break;
+ }
- open_err = errno;
- /*
- * Weak group failed. We cannot just undo this here
- * because earlier CPUs might be in group mode, and the kernel
- * doesn't support mixing group and non group reads. Defer
- * it to later.
- * Don't close here because we're in the wrong affinity.
- */
- if ((open_err == EINVAL || open_err == EBADF) &&
- evsel__leader(counter) != counter &&
- counter->weak_group) {
- evlist__reset_weak_group(evsel_list, counter, false);
- assert(counter->reset_group);
- counter->supported = true;
- second_pass = true;
- break;
+ if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
+ break;
}
-
- if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
- break;
}
}
-
if (second_pass) {
/*
* Now redo all the weak group after closing them,
@@ -836,7 +822,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
*/
/* First close errored or weak retry */
- evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
+ evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
counter = evlist_cpu_itr.evsel;
if (!counter->reset_group && counter->supported)
@@ -845,7 +831,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
perf_evsel__close_cpu(&counter->core, evlist_cpu_itr.cpu_map_idx);
}
/* Now reopen weak */
- evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
+ evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
counter = evlist_cpu_itr.evsel;
if (!counter->reset_group)
@@ -854,17 +840,18 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
while (true) {
pr_debug2("reopening weak %s\n", evsel__name(counter));
if (create_perf_stat_counter(counter, &stat_config,
- evlist_cpu_itr.cpu_map_idx) == 0)
+ evlist_cpu_itr.cpu_map_idx) == 0) {
+ evlist_cpu_iterator__exit(&evlist_cpu_itr);
break;
-
+ }
open_err = errno;
- if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
+ if (stat_handle_error(counter, open_err) != COUNTER_RETRY) {
+ evlist_cpu_iterator__exit(&evlist_cpu_itr);
break;
+ }
}
}
}
- affinity__cleanup(affinity);
- affinity = NULL;
has_supported_counters = false;
evlist__for_each_entry(evsel_list, counter) {
@@ -1015,7 +1002,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
if (forks)
evlist__cancel_workload(evsel_list);
- affinity__cleanup(affinity);
return err;
}
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index e8217efdda53..b6df81b8a236 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -358,36 +358,111 @@ int evlist__add_newtp(struct evlist *evlist, const char *sys, const char *name,
}
#endif
-struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affinity *affinity)
+/*
+ * Should sched_setaffinity be used with evlist__for_each_cpu? Determine if
+ * migrating the thread will avoid possibly numerous IPIs.
+ */
+static bool evlist__use_affinity(struct evlist *evlist)
+{
+ struct evsel *pos;
+ struct perf_cpu_map *used_cpus = NULL;
+ bool ret = false;
+
+ /*
+ * With perf record core.user_requested_cpus is usually NULL.
+ * Use the old method to handle this for now.
+ */
+ if (!evlist->core.user_requested_cpus ||
+ cpu_map__is_dummy(evlist->core.user_requested_cpus))
+ return false;
+
+ evlist__for_each_entry(evlist, pos) {
+ struct perf_cpu_map *intersect;
+
+ if (!perf_pmu__benefits_from_affinity(pos->pmu))
+ continue;
+
+ if (evsel__is_dummy_event(pos)) {
+ /*
+ * The dummy event is opened on all CPUs so assume >1
+ * event with shared CPUs.
+ */
+ ret = true;
+ break;
+ }
+ if (evsel__is_retire_lat(pos)) {
+ /*
+ * Retirement latency events are similar to tool ones in
+ * their implementation, and so don't require affinity.
+ */
+ continue;
+ }
+ if (perf_cpu_map__is_empty(used_cpus)) {
+ /* First benefitting event, we want >1 on a common CPU. */
+ used_cpus = perf_cpu_map__get(pos->core.cpus);
+ continue;
+ }
+ if ((pos->core.attr.read_format & PERF_FORMAT_GROUP) &&
+ evsel__leader(pos) != pos) {
+ /* Skip members of the same sample group. */
+ continue;
+ }
+ intersect = perf_cpu_map__intersect(used_cpus, pos->core.cpus);
+ if (!perf_cpu_map__is_empty(intersect)) {
+ /* >1 event with shared CPUs. */
+ perf_cpu_map__put(intersect);
+ ret = true;
+ break;
+ }
+ perf_cpu_map__put(intersect);
+ perf_cpu_map__merge(&used_cpus, pos->core.cpus);
+ }
+ perf_cpu_map__put(used_cpus);
+ return ret;
+}
+
+void evlist_cpu_iterator__init(struct evlist_cpu_iterator *itr, struct evlist *evlist)
{
- struct evlist_cpu_iterator itr = {
+ *itr = (struct evlist_cpu_iterator){
.container = evlist,
.evsel = NULL,
.cpu_map_idx = 0,
.evlist_cpu_map_idx = 0,
.evlist_cpu_map_nr = perf_cpu_map__nr(evlist->core.all_cpus),
.cpu = (struct perf_cpu){ .cpu = -1},
- .affinity = affinity,
+ .affinity = NULL,
};
if (evlist__empty(evlist)) {
/* Ensure the empty list doesn't iterate. */
- itr.evlist_cpu_map_idx = itr.evlist_cpu_map_nr;
- } else {
- itr.evsel = evlist__first(evlist);
- if (itr.affinity) {
- itr.cpu = perf_cpu_map__cpu(evlist->core.all_cpus, 0);
- affinity__set(itr.affinity, itr.cpu.cpu);
- itr.cpu_map_idx = perf_cpu_map__idx(itr.evsel->core.cpus, itr.cpu);
- /*
- * If this CPU isn't in the evsel's cpu map then advance
- * through the list.
- */
- if (itr.cpu_map_idx == -1)
- evlist_cpu_iterator__next(&itr);
- }
+ itr->evlist_cpu_map_idx = itr->evlist_cpu_map_nr;
+ return;
}
- return itr;
+
+ if (evlist__use_affinity(evlist)) {
+ if (affinity__setup(&itr->saved_affinity) == 0)
+ itr->affinity = &itr->saved_affinity;
+ }
+ itr->evsel = evlist__first(evlist);
+ itr->cpu = perf_cpu_map__cpu(evlist->core.all_cpus, 0);
+ if (itr->affinity)
+ affinity__set(itr->affinity, itr->cpu.cpu);
+ itr->cpu_map_idx = perf_cpu_map__idx(itr->evsel->core.cpus, itr->cpu);
+ /*
+ * If this CPU isn't in the evsel's cpu map then advance
+ * through the list.
+ */
+ if (itr->cpu_map_idx == -1)
+ evlist_cpu_iterator__next(itr);
+}
+
+void evlist_cpu_iterator__exit(struct evlist_cpu_iterator *itr)
+{
+ if (!itr->affinity)
+ return;
+
+ affinity__cleanup(itr->affinity);
+ itr->affinity = NULL;
}
void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr)
@@ -417,14 +492,11 @@ void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr)
*/
if (evlist_cpu_itr->cpu_map_idx == -1)
evlist_cpu_iterator__next(evlist_cpu_itr);
+ } else {
+ evlist_cpu_iterator__exit(evlist_cpu_itr);
}
}
-bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr)
-{
- return evlist_cpu_itr->evlist_cpu_map_idx >= evlist_cpu_itr->evlist_cpu_map_nr;
-}
-
static int evsel__strcmp(struct evsel *pos, char *evsel_name)
{
if (!evsel_name)
@@ -452,19 +524,11 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl
{
struct evsel *pos;
struct evlist_cpu_iterator evlist_cpu_itr;
- struct affinity saved_affinity, *affinity = NULL;
bool has_imm = false;
- // See explanation in evlist__close()
- if (!cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
- if (affinity__setup(&saved_affinity) < 0)
- return;
- affinity = &saved_affinity;
- }
-
/* Disable 'immediate' events last */
for (int imm = 0; imm <= 1; imm++) {
- evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) {
+ evlist__for_each_cpu(evlist_cpu_itr, evlist) {
pos = evlist_cpu_itr.evsel;
if (evsel__strcmp(pos, evsel_name))
continue;
@@ -482,7 +546,6 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl
break;
}
- affinity__cleanup(affinity);
evlist__for_each_entry(evlist, pos) {
if (evsel__strcmp(pos, evsel_name))
continue;
@@ -522,16 +585,8 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_
{
struct evsel *pos;
struct evlist_cpu_iterator evlist_cpu_itr;
- struct affinity saved_affinity, *affinity = NULL;
- // See explanation in evlist__close()
- if (!cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
- if (affinity__setup(&saved_affinity) < 0)
- return;
- affinity = &saved_affinity;
- }
-
- evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) {
+ evlist__for_each_cpu(evlist_cpu_itr, evlist) {
pos = evlist_cpu_itr.evsel;
if (evsel__strcmp(pos, evsel_name))
continue;
@@ -541,7 +596,6 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_
continue;
evsel__enable_cpu(pos, evlist_cpu_itr.cpu_map_idx);
}
- affinity__cleanup(affinity);
evlist__for_each_entry(evlist, pos) {
if (evsel__strcmp(pos, evsel_name))
continue;
@@ -1338,28 +1392,14 @@ void evlist__close(struct evlist *evlist)
{
struct evsel *evsel;
struct evlist_cpu_iterator evlist_cpu_itr;
- struct affinity affinity;
-
- /*
- * With perf record core.user_requested_cpus is usually NULL.
- * Use the old method to handle this for now.
- */
- if (!evlist->core.user_requested_cpus ||
- cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
- evlist__for_each_entry_reverse(evlist, evsel)
- evsel__close(evsel);
- return;
- }
-
- if (affinity__setup(&affinity) < 0)
- return;
- evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) {
+ evlist__for_each_cpu(evlist_cpu_itr, evlist) {
+ if (evlist_cpu_itr.cpu_map_idx == 0 && evsel__is_retire_lat(evlist_cpu_itr.evsel))
+ evsel__tpebs_close(evlist_cpu_itr.evsel);
perf_evsel__close_cpu(&evlist_cpu_itr.evsel->core,
evlist_cpu_itr.cpu_map_idx);
}
- affinity__cleanup(&affinity);
evlist__for_each_entry_reverse(evlist, evsel) {
perf_evsel__free_fd(&evsel->core);
perf_evsel__free_id(&evsel->core);
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 5e71e3dc6042..b4604c3f03d6 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -10,6 +10,7 @@
#include <internal/evlist.h>
#include <internal/evsel.h>
#include <perf/evlist.h>
+#include "affinity.h"
#include "events_stats.h"
#include "evsel.h"
#include "rblist.h"
@@ -361,6 +362,8 @@ struct evlist_cpu_iterator {
struct perf_cpu cpu;
/** If present, used to set the affinity when switching between CPUs. */
struct affinity *affinity;
+ /** May be used to hold affinity state prior to iterating. */
+ struct affinity saved_affinity;
};
/**
@@ -368,22 +371,31 @@ struct evlist_cpu_iterator {
* affinity, iterate over all CPUs and then the evlist
* for each evsel on that CPU. When switching between
* CPUs the affinity is set to the CPU to avoid IPIs
- * during syscalls.
+ * during syscalls. The affinity is set up and removed
+ * automatically; if the loop is broken early, a call to
+ * evlist_cpu_iterator__exit() is necessary.
* @evlist_cpu_itr: the iterator instance.
* @evlist: evlist instance to iterate.
- * @affinity: NULL or used to set the affinity to the current CPU.
*/
-#define evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) \
- for ((evlist_cpu_itr) = evlist__cpu_begin(evlist, affinity); \
+#define evlist__for_each_cpu(evlist_cpu_itr, evlist) \
+ for (evlist_cpu_iterator__init(&(evlist_cpu_itr), evlist); \
!evlist_cpu_iterator__end(&evlist_cpu_itr); \
evlist_cpu_iterator__next(&evlist_cpu_itr))
-/** Returns an iterator set to the first CPU/evsel of evlist. */
-struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affinity *affinity);
+/** Setup an iterator set to the first CPU/evsel of evlist. */
+void evlist_cpu_iterator__init(struct evlist_cpu_iterator *itr, struct evlist *evlist);
+/**
+ * Cleans up the iterator, automatically done by evlist_cpu_iterator__next when
+ * the end of the list is reached. Multiple calls are safe.
+ */
+void evlist_cpu_iterator__exit(struct evlist_cpu_iterator *itr);
/** Move to next element in iterator, updating CPU, evsel and the affinity. */
void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr);
/** Returns true when iterator is at the end of the CPUs and evlist. */
-bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr);
+static inline bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr)
+{
+ return evlist_cpu_itr->evlist_cpu_map_idx >= evlist_cpu_itr->evlist_cpu_map_nr;
+}
struct evsel *evlist__get_tracking_event(struct evlist *evlist);
void evlist__set_tracking_event(struct evlist *evlist, struct evsel *tracking_evsel);
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index d597263fab4f..caa3bd5e0956 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -2402,6 +2402,18 @@ bool perf_pmu__is_software(const struct perf_pmu *pmu)
return false;
}
+bool perf_pmu__benefits_from_affinity(struct perf_pmu *pmu)
+{
+ if (!pmu)
+ return true; /* Assume a core PMU. */
+
+ /*
+ * All perf event PMUs should benefit from accessing the perf event
+ * contexts on the local CPU.
+ */
+ return pmu->type <= PERF_PMU_TYPE_PE_END;
+}
+
FILE *perf_pmu__open_file(const struct perf_pmu *pmu, const char *name)
{
char path[PATH_MAX];
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 1ebcf0242af8..87e12a9a0e67 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -259,6 +259,7 @@ bool perf_pmu__name_no_suffix_match(const struct perf_pmu *pmu, const char *to_m
* perf_sw_context in the kernel?
*/
bool perf_pmu__is_software(const struct perf_pmu *pmu);
+bool perf_pmu__benefits_from_affinity(struct perf_pmu *pmu);
FILE *perf_pmu__open_file(const struct perf_pmu *pmu, const char *name);
FILE *perf_pmu__open_file_at(const struct perf_pmu *pmu, int dirfd, const char *name);
--
2.51.2.1041.gc1ab5b90ca-goog