Message-ID: <20250716223924.825772-1-irogers@google.com>
Date: Wed, 16 Jul 2025 15:39:24 -0700
From: Ian Rogers <irogers@...gle.com>
To: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
Andi Kleen <ak@...ux.intel.com>, Dapeng Mi <dapeng1.mi@...ux.intel.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>, Namhyung Kim <namhyung@...nel.org>,
Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>, Jiri Olsa <jolsa@...nel.org>,
Ian Rogers <irogers@...gle.com>, Adrian Hunter <adrian.hunter@...el.com>,
Kan Liang <kan.liang@...ux.intel.com>, linux-perf-users@...r.kernel.org,
linux-kernel@...r.kernel.org, Thomas Falcon <thomas.falcon@...el.com>,
Stephane Eranian <eranian@...gle.com>
Subject: [RFC PATCH v1] perf/core: [Untested] Perf event state changes to fix
enable time

Perf event filters (event_filter_match) may mean that an event is
inactive even though sched_in wants to make it active; the filter
stops it from being scheduled. While the event is inactive its enabled
time accrues but its running time does not increase. Tools use the
ratio of enabled to running time to scale event counts, incorrectly
assuming that the only reason an event accrues enabled time without
running time is multiplexing, when a filter can have the same effect.
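
For reference, a minimal sketch of the scaling applied in tools (the
helper name is illustrative, but the count * enabled / running
correction matches what perf's userspace does):

  #include <stdint.h>

  /*
   * Scale a raw count by the ratio of enabled to running time, as tools
   * do to correct for multiplexing. If running time stays near zero
   * while enabled time keeps growing (the filter case above), the
   * result is wildly inflated.
   */
  static uint64_t scale_count(uint64_t count, uint64_t enabled, uint64_t running)
  {
          if (!running)
                  return 0; /* tools typically report "not counted" */
          return (uint64_t)((double)count * enabled / running);
  }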

On hybrid systems with a task-scoped p-core event, the filter means
the enabled time increases while the task runs on e-cores, making it
look as if the event is multiplexing on the p-cores. This is reported
here:
https://lore.kernel.org/lkml/20250627192417.1157736-13-irogers@google.com/

For a perf event opened on both a task and a CPU, the filter means the
event is only active while the task runs on the matching CPU. The
enabled time increases but the running time does not, again looking
like multiplexing and causing massive, incorrect scaling in tools.
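
As an untested illustration of this second case (the event choice and
CPU number are placeholders), the sketch below opens an event on both
the current task and CPU 0 and reads the two times; once the task
spends time on other CPUs, time_enabled keeps growing while
time_running does not:

  #include <linux/perf_event.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  int main(void)
  {
          struct perf_event_attr attr;
          uint64_t buf[3]; /* value, time_enabled, time_running */
          int fd;

          memset(&attr, 0, sizeof(attr));
          attr.size = sizeof(attr);
          attr.type = PERF_TYPE_HARDWARE;
          attr.config = PERF_COUNT_HW_CPU_CYCLES;
          attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
                             PERF_FORMAT_TOTAL_TIME_RUNNING;

          /* Count the current task, but only while it runs on CPU 0. */
          fd = syscall(SYS_perf_event_open, &attr, 0 /* pid: self */,
                       0 /* cpu */, -1 /* group_fd */, 0);
          if (fd < 0) {
                  perror("perf_event_open");
                  return 1;
          }

          /* ... run a workload that also migrates to other CPUs ... */

          if (read(fd, buf, sizeof(buf)) != sizeof(buf)) {
                  perror("read");
                  return 1;
          }
          printf("count=%llu enabled=%llu running=%llu\n",
                 (unsigned long long)buf[0],
                 (unsigned long long)buf[1],
                 (unsigned long long)buf[2]);
          close(fd);
          return 0;
  }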

This patch proposes a solution by adding a new PERF_EVENT_STATE_ON
state that takes over the current role of PERF_EVENT_STATE_INACTIVE.
PERF_EVENT_STATE_INACTIVE is now used only for flexible events that
failed to be scheduled due to insufficient counters.

This patch highlights the problem and the changes to the perf event
states and their associated state machine. It is uncompiled/untested
and intended for discussion purposes only.
---
include/linux/perf_event.h | 50 ++++++++++++++++++++++----------------
kernel/events/core.c | 34 ++++++++++++++------------
2 files changed, 47 insertions(+), 37 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ec9d96025683..8c4ffd7e2a02 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -638,29 +638,36 @@ struct perf_addr_filter_range {
/*
* The normal states are:
*
- * ACTIVE --.
- * ^ |
- * | |
- * sched_{in,out}() |
- * | |
- * v |
- * ,---> INACTIVE --+ <-.
- * | | |
- * | {dis,en}able()
- * sched_in() | |
- * | OFF <--' --+
- * | |
- * `---> ERROR ------'
+ * .-- disable() -----> OFF <----disable()-------.
+ * | | ^ |
+ * +-----------------> enable() | |
+ * | | .------. |
+ * | v / |
+ * ERROR .---------> ON <-----------. |
+ * ^ | | | |
+ * | filter <-- sched_in() sched_out() |
+ * | | | |
+ * +-(lack of counters)--+ | |
+ * v v | |
+ * INACTIVE ACTIVE | |
+ * | | | |
+ * .---------------------+-------------+---------.
*
* That is:
*
- * sched_in: INACTIVE -> {ACTIVE,ERROR}
- * sched_out: ACTIVE -> INACTIVE
- * disable: {ACTIVE,INACTIVE} -> OFF
- * enable: {OFF,ERROR} -> INACTIVE
+ * sched_in: ON -> {ON,ERROR,INACTIVE,ACTIVE}
+ * sched_out: {INACTIVE,ACTIVE} -> ON
+ * disable: {ON,ERROR,INACTIVE,ACTIVE} -> OFF
+ * enable: {OFF,ERROR} -> ON
*
- * Where {OFF,ERROR} are disabled states.
+ * Where {ON,INACTIVE,ACTIVE} are enabled states and {OFF,ERROR} are disabled
+ * states.
*
+ * ON differs from INACTIVE as enabled time only increases for INACTIVE and
+ * ACTIVE events, running time only increases for ACTIVE events. A filter, such
+ * as the task or CPU an event should be scheduled on, can cause an event during
+ * sched_in to remain in the ON state.
+ *
* Then we have the {EXIT,REVOKED,DEAD} states which are various shades of
* defunct events:
*
@@ -681,9 +688,10 @@ enum perf_event_state {
PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */
PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */
PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */
- PERF_EVENT_STATE_OFF = -1,
- PERF_EVENT_STATE_INACTIVE = 0,
- PERF_EVENT_STATE_ACTIVE = 1,
+ PERF_EVENT_STATE_OFF = -1, /* regular disabled event */
+ PERF_EVENT_STATE_ON = 0, /* enabled but not scheduled */
+ PERF_EVENT_STATE_INACTIVE = 1, /* not scheduled due to lack of counters */
+ PERF_EVENT_STATE_ACTIVE = 2, /* scheduled and counting */
};
struct file;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1f746469fda5..ef76c3887f71 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1920,7 +1920,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
static inline void perf_event__state_init(struct perf_event *event)
{
event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
- PERF_EVENT_STATE_INACTIVE;
+ PERF_EVENT_STATE_ON;
}
static int __perf_event_read_size(u64 read_format, int nr_siblings)
@@ -2362,14 +2362,14 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_event_pmu_context *epc = event->pmu_ctx;
struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
- enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
+ enum perf_event_state state = PERF_EVENT_STATE_ON;
// XXX cpc serialization, probably per-cpu IRQ disabled
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
- if (event->state != PERF_EVENT_STATE_ACTIVE)
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
/*
@@ -2409,7 +2409,7 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event;
- if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+ if (group_event->state < PERF_EVENT_STATE_INACTIVE)
return;
perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
@@ -2583,7 +2583,7 @@ static void __perf_event_disable(struct perf_event *event,
struct perf_event_context *ctx,
void *info)
{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
+ if (event->state < PERF_EVENT_STATE_ON)
return;
perf_pmu_disable(event->pmu_ctx->pmu);
@@ -3135,13 +3135,13 @@ static void __perf_event_enable(struct perf_event *event,
struct perf_event *leader = event->group_leader;
struct perf_event_context *task_ctx;
- if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ if (event->state >= PERF_EVENT_STATE_ON ||
event->state <= PERF_EVENT_STATE_ERROR)
return;
ctx_time_freeze(cpuctx, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
+ perf_event_set_state(event, PERF_EVENT_STATE_ON);
perf_cgroup_event_enable(event, ctx);
if (!ctx->is_active)
@@ -3150,6 +3150,8 @@ static void __perf_event_enable(struct perf_event *event,
if (!event_filter_match(event))
return;
+ perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
+
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
@@ -3178,7 +3180,7 @@ static void _perf_event_enable(struct perf_event *event)
struct perf_event_context *ctx = event->ctx;
raw_spin_lock_irq(&ctx->lock);
- if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ if (event->state >= PERF_EVENT_STATE_ON ||
event->state < PERF_EVENT_STATE_ERROR) {
out:
raw_spin_unlock_irq(&ctx->lock);
@@ -4525,10 +4527,10 @@ static int event_enable_on_exec(struct perf_event *event,
return 0;
event->attr.enable_on_exec = 0;
- if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ if (event->state >= PERF_EVENT_STATE_ON)
return 0;
- perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
+ perf_event_set_state(event, PERF_EVENT_STATE_ON);
return 1;
}
@@ -8544,7 +8546,7 @@ perf_iterate_ctx(struct perf_event_context *ctx,
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (!all) {
- if (event->state < PERF_EVENT_STATE_INACTIVE)
+ if (event->state < PERF_EVENT_STATE_ON)
continue;
if (!event_filter_match(event))
continue;
@@ -8568,7 +8570,7 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
if (!smp_load_acquire(&event->ctx))
continue;
- if (event->state < PERF_EVENT_STATE_INACTIVE)
+ if (event->state < PERF_EVENT_STATE_ON)
continue;
if (!event_filter_match(event))
continue;
@@ -12864,7 +12866,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->ns = get_pid_ns(task_active_pid_ns(current));
event->id = atomic64_inc_return(&perf_event_id);
- event->state = PERF_EVENT_STATE_INACTIVE;
+ event->state = PERF_EVENT_STATE_ON;
if (parent_event)
event->event_caps = parent_event->event_caps;
@@ -13875,7 +13877,7 @@ static void __perf_pmu_install_event(struct pmu *pmu,
event->pmu_ctx = epc;
if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
+ event->state = PERF_EVENT_STATE_ON;
perf_install_in_context(ctx, event, cpu);
/*
@@ -14288,8 +14290,8 @@ inherit_event(struct perf_event *parent_event,
* not its attr.disabled bit. We hold the parent's mutex,
* so we won't race with perf_event_{en, dis}able_family.
*/
- if (parent_state >= PERF_EVENT_STATE_INACTIVE)
- child_event->state = PERF_EVENT_STATE_INACTIVE;
+ if (parent_state >= PERF_EVENT_STATE_ON)
+ child_event->state = PERF_EVENT_STATE_ON;
else
child_event->state = PERF_EVENT_STATE_OFF;
--
2.50.0.727.gbf7dc18ff4-goog