Message-ID: <YUxSMqrJK/rKPZS3@hirez.programming.kicks-ass.net>
Date: Thu, 23 Sep 2021 12:08:50 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Song Liu <songliubraving@...com>
Cc: linux-kernel@...r.kernel.org, acme@...nel.org, mingo@...hat.com,
kernel-team@...com, eranian@...gle.com,
Lucian Grijincu <lucian@...com>
Subject: Re: [PATCH] perf/core: fix userpage->time_enabled of inactive events
On Tue, Sep 21, 2021 at 06:17:15PM -0700, Song Liu wrote:
> Users of rdpmc rely on the mmapped user page to calculate accurate
> time_enabled. Currently, userpage->time_enabled is only updated when the
> event is added to the pmu. As a result, an inactive event (due to counter
> multiplexing) does not have an accurate userpage->time_enabled. This can
> be reproduced with something like:
>
> /* open 20 task perf_event "cycles", to create multiplexing */
>
> fd = perf_event_open(); /* open task perf_event "cycles" */
> userpage = mmap(fd); /* use mmap and rdpmc */
>
> while (true) {
> time_enabled_mmap = xxx; /* use logic in perf_event_mmap_page */
> time_enabled_read = read(fd).time_enabled;
> if (time_enabled_mmap > time_enabled_read)
> BUG();
> }
*groan*, yes I fear you're right.
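For reference, userspace computes time_enabled from the mmap page along
these lines -- a rough sketch following the pseudocode documented in
include/uapi/linux/perf_event.h (helper name mine; x86-only, full fences
in place of barrier(), and the running/index handling omitted). This
mmap-derived value is what the reproducer above compares against read():

#include <stdint.h>
#include <linux/perf_event.h>
#include <x86intrin.h>		/* __rdtsc() */

static uint64_t mmap_read_time_enabled(volatile struct perf_event_mmap_page *pc)
{
	uint64_t enabled, running, cyc = 0;
	uint64_t time_offset = 0, quot, rem, delta;
	uint32_t seq, time_mult = 0, time_shift = 0;

	do {
		seq = pc->lock;
		__sync_synchronize();

		enabled = pc->time_enabled;
		running = pc->time_running;

		if (pc->cap_user_time && enabled != running) {
			cyc         = __rdtsc();
			time_offset = pc->time_offset;
			time_mult   = pc->time_mult;
			time_shift  = pc->time_shift;
		}

		__sync_synchronize();
	} while (pc->lock != seq);

	if (cyc) {
		/* delta (ns) to bring enabled up to date, per the header */
		quot  = cyc >> time_shift;
		rem   = cyc & (((uint64_t)1 << time_shift) - 1);
		delta = time_offset + quot * time_mult +
			((rem * time_mult) >> time_shift);
		enabled += delta;
	}

	return enabled;
}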
> @@ -3807,6 +3816,23 @@ ctx_sched_in(struct perf_event_context *ctx,
>  	/* Then walk through the lower prio flexible groups */
>  	if (is_active & EVENT_FLEXIBLE)
>  		ctx_flexible_sched_in(ctx, cpuctx);
> +
> +	/*
> +	 * Update userpage for inactive events. This is needed for accurate
> +	 * time_enabled.
> +	 */
> +	if (unlikely(ctx->rotate_necessary)) {
> +		struct perf_event *event;
> +
> +		perf_event_groups_for_each(event, &ctx->pinned_groups) {
> +			if (event->state == PERF_EVENT_STATE_INACTIVE)
> +				perf_event_update_inactive_userpage(event, ctx);
> +		}
That's a straight-up error: if a pinned event doesn't get scheduled,
it's game over.
> +		perf_event_groups_for_each(event, &ctx->flexible_groups) {
> +			if (event->state == PERF_EVENT_STATE_INACTIVE)
> +				perf_event_update_inactive_userpage(event, ctx);
> +		}
> +	}
>  }
That's terrible though, and also wrong, I think.
It's wrong because:
- you should only do this for (is_active & EVENT_TIME)
- you should only consider the events visit_groups_merge() would have
visited
- you miss updating group-sibling events
(I also think it's possible to try harder to avoid the work)
Now, looking at visit_groups_merge(), we only terminate the iteration
when func returns !0; func here is merge_sched_in(), and that *never*
returns !0.
So we already iterate all the right events. So I'm thinking we can do
something like the below, hmm?
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0c000cb01eeb..4d1e962c2ebe 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3707,6 +3712,28 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
 	return 0;
 }
 
+static inline bool event_update_userpage(struct perf_event *event)
+{
+	if (!atomic_read(&event->mmap_count))
+		return false;
+
+	perf_event_update_time(event);
+	perf_set_shadow_time(event, event->ctx);
+	perf_event_update_userpage(event);
+	return true;
+}
+
+static inline void group_update_userpage(struct perf_event *group_event)
+{
+	struct perf_event *event;
+
+	if (!event_update_userpage(group_event))
+		return;
+
+	for_each_sibling_event(event, group_event)
+		event_update_userpage(event);
+}
+
 static int merge_sched_in(struct perf_event *event, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -3725,14 +3752,18 @@ static int merge_sched_in(struct perf_event *event, void *data)
 	}
 
 	if (event->state == PERF_EVENT_STATE_INACTIVE) {
+		*can_add_hw = 0;
+
 		if (event->attr.pinned) {
 			perf_cgroup_event_disable(event, ctx);
 			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
-		}
-		*can_add_hw = 0;
-		ctx->rotate_necessary = 1;
-		perf_mux_hrtimer_restart(cpuctx);
+		} else {
+			ctx->rotate_necessary = 1;
+			perf_mux_hrtimer_restart(cpuctx);
+
+			group_update_userpage(event);
+		}
 	}
 
 	return 0;
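For completeness, the changelog's reproducer fleshed out into something
runnable -- a hypothetical sketch (error handling omitted; uses the
mmap_read_time_enabled() helper sketched earlier in this message):

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/perf_event.h>

struct read_format {
	uint64_t value;
	uint64_t time_enabled;
	uint64_t time_running;
};

int main(void)
{
	struct perf_event_attr attr;
	struct perf_event_mmap_page *pc;
	struct read_format rf;
	int i, fds[20];

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
			   PERF_FORMAT_TOTAL_TIME_RUNNING;

	/* open 20 task events on the same task to force multiplexing */
	for (i = 0; i < 20; i++)
		fds[i] = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);

	pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED,
		  fds[0], 0);

	for (;;) {
		uint64_t t_mmap = mmap_read_time_enabled(pc);

		read(fds[0], &rf, sizeof(rf));

		/* the mmap-derived time must never run ahead of read() */
		assert(t_mmap <= rf.time_enabled);
	}
}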