Message-ID: <b38cc358-8e46-48bd-88c0-ff4b8db6bd15@linux.intel.com>
Date: Wed, 7 Aug 2024 11:17:18 -0400
From: "Liang, Kan" <kan.liang@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>, mingo@...nel.org
Cc: acme@...nel.org, namhyung@...nel.org, mark.rutland@....com,
alexander.shishkin@...ux.intel.com, jolsa@...nel.org, irogers@...gle.com,
adrian.hunter@...el.com, linux-perf-users@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 4/5] perf: Add context time freeze
On 2024-08-07 7:29 a.m., Peter Zijlstra wrote:
> Many of the context reschedule users are of the form:
>
> ctx_sched_out(.type = EVENT_TIME);
> ... modify context
> ctx_resched();
>
> With the idea that the whole reschedule happens with a single
> time-stamp, rather than with each ctx_sched_out() advancing time and
> ctx_sched_in() re-starting time, creating a non-atomic experience.
>
> However, Kan noticed that since this completely stops time, it
> actually loses a bit of time between the stop and start. Worse, now
> that we can do partial (per PMU) reschedules, the PMUs that are not
> scheduled out still observe the time glitch.
>
> Replace this with:
>
> ctx_time_freeze();
> ... modify context
> ctx_resched();
>
> With the assumption that this happens in a perf_ctx_lock() /
> perf_ctx_unlock() pair.
>
> The new ctx_time_freeze() will update time and set EVENT_FROZEN, and
> ensure that EVENT_TIME and EVENT_FROZEN remain set; this prevents
> perf_event_time_now() from observing a time wobble from not seeing
> EVENT_TIME for a little while.
>
> Additionally, this avoids losing time between
> ctx_sched_out(EVENT_TIME) and ctx_sched_in(), which would re-set the
> timestamp.
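
For reference, a minimal userspace sketch of the intended time
bookkeeping (not kernel code; the flag values match the patch, but the
clock, the struct layout and ctx_time_now() are simplified stand-ins):

#include <assert.h>
#include <stdio.h>

#define EVENT_TIME	0x04
#define EVENT_FROZEN	0x08

struct ctx {
	int is_active;
	unsigned long long time;	/* accumulated ctx time */
	unsigned long long timestamp;	/* last update point */
	long long timeoffset;		/* ctx time - wall time */
};

static void update_context_time(struct ctx *ctx, unsigned long long now)
{
	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
	ctx->timeoffset = (long long)ctx->time - (long long)now;
}

/* Roughly what a reader such as perf_event_time_now() sees. */
static unsigned long long ctx_time_now(struct ctx *ctx, unsigned long long now)
{
	if (!(ctx->is_active & EVENT_TIME))
		return ctx->time;		/* stopped view: the "wobble" */
	return now + ctx->timeoffset;		/* live view */
}

static void ctx_time_freeze(struct ctx *ctx, unsigned long long now)
{
	/* Bring time up to date once, then suppress further updates. */
	if (ctx->is_active & EVENT_TIME) {
		if (!(ctx->is_active & EVENT_FROZEN))
			update_context_time(ctx, now);
		ctx->is_active |= EVENT_FROZEN;
	}
}

int main(void)
{
	struct ctx ctx = { .is_active = EVENT_TIME };

	update_context_time(&ctx, 100);
	ctx_time_freeze(&ctx, 100);

	/* ... modify context: EVENT_TIME stays set, readers see live time */
	assert(ctx_time_now(&ctx, 120) == 120);

	/* perf_ctx_unlock() equivalent: just drop EVENT_FROZEN */
	ctx.is_active &= ~EVENT_FROZEN;
	assert(ctx_time_now(&ctx, 150) == 150);

	printf("no stall and no lost time across the reschedule\n");
	return 0;
}
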
>
> Reported-by: Kan Liang <kan.liang@...ux.intel.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> ---
> kernel/events/core.c | 128 ++++++++++++++++++++++++++++++++++-----------------
> 1 file changed, 86 insertions(+), 42 deletions(-)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, re
> return data.ret;
> }
>
> +enum event_type_t {
> + EVENT_FLEXIBLE = 0x01,
> + EVENT_PINNED = 0x02,
> + EVENT_TIME = 0x04,
> + EVENT_FROZEN = 0x08,
> + /* see ctx_resched() for details */
> + EVENT_CPU = 0x10,
> + EVENT_CGROUP = 0x20,
> +
> + /* compound helpers */
> + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
> + EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
> +};
> +
> +static inline void __perf_ctx_lock(struct perf_event_context *ctx)
> +{
> + raw_spin_lock(&ctx->lock);
> + WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
> +}
> +
> static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
> struct perf_event_context *ctx)
> {
> - raw_spin_lock(&cpuctx->ctx.lock);
> + __perf_ctx_lock(&cpuctx->ctx);
> if (ctx)
> - raw_spin_lock(&ctx->lock);
> + __perf_ctx_lock(ctx);
> +}
> +
> +static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
> +{
> + /*
> + * If ctx_sched_in() didn't again set any ALL flags, clean up
> + * after ctx_sched_out() by clearing is_active.
> + */
> + if (ctx->is_active & EVENT_FROZEN) {
> + if (!(ctx->is_active & EVENT_ALL))
Nit:
It may be better to add a macro/inline function to replace all of the
(ctx->is_active & EVENT_ALL) checks. For example:
+static inline bool perf_ctx_has_active_events(struct perf_event_context *ctx)
+{
+	return ctx->is_active & EVENT_ALL;
+}
 ...
+	if (ctx->is_active & EVENT_FROZEN) {
+		if (!perf_ctx_has_active_events(ctx))
+			ctx->is_active = 0;
+		else
+			ctx->is_active &= ~EVENT_FROZEN;
This makes it straightforward to tell that we want to clear all flags
when there are no active events.
EVENT_ALL can be confusing: it actually means all events, not all event
types, so a developer may have to go to the definition to figure out
exactly what EVENT_ALL includes.
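
Just a sketch on top of this patch: the other EVENT_ALL checks converted
later in the patch (__pmu_ctx_sched_out() and ctx_sched_out()) could
then read the same way, e.g.:

-	if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
+	if (ctx->task && !perf_ctx_has_active_events(ctx)) {
 ...
-	if (!(ctx->is_active & EVENT_ALL)) {
+	if (!perf_ctx_has_active_events(ctx)) {

(The ctx_sched_in() check uses the local is_active snapshot, so it
would stay as-is.)
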
Thanks,
Kan
> + ctx->is_active = 0;
> + else
> + ctx->is_active &= ~EVENT_FROZEN;
> + }
> + raw_spin_unlock(&ctx->lock);
> }
>
> static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
> struct perf_event_context *ctx)
> {
> if (ctx)
> - raw_spin_unlock(&ctx->lock);
> - raw_spin_unlock(&cpuctx->ctx.lock);
> + __perf_ctx_unlock(ctx);
> + __perf_ctx_unlock(&cpuctx->ctx);
> }
>
> #define TASK_TOMBSTONE ((void *)-1L)
> @@ -370,16 +405,6 @@ static void event_function_local(struct
> (PERF_SAMPLE_BRANCH_KERNEL |\
> PERF_SAMPLE_BRANCH_HV)
>
> -enum event_type_t {
> - EVENT_FLEXIBLE = 0x1,
> - EVENT_PINNED = 0x2,
> - EVENT_TIME = 0x4,
> - /* see ctx_resched() for details */
> - EVENT_CPU = 0x8,
> - EVENT_CGROUP = 0x10,
> - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
> -};
> -
> /*
> * perf_sched_events : >0 events exist
> */
> @@ -2332,18 +2357,39 @@ group_sched_out(struct perf_event *group
> }
>
> static inline void
> -ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
> {
> if (ctx->is_active & EVENT_TIME) {
> + if (ctx->is_active & EVENT_FROZEN)
> + return;
> update_context_time(ctx);
> - update_cgrp_time_from_cpuctx(cpuctx, false);
> + update_cgrp_time_from_cpuctx(cpuctx, final);
> }
> }
>
> static inline void
> +ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +{
> + __ctx_time_update(cpuctx, ctx, false);
> +}
> +
> +/*
> + * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
> + */
> +static inline void
> +ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +{
> + ctx_time_update(cpuctx, ctx);
> + if (ctx->is_active & EVENT_TIME)
> + ctx->is_active |= EVENT_FROZEN;
> +}
> +
> +static inline void
> ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
> {
> if (ctx->is_active & EVENT_TIME) {
> + if (ctx->is_active & EVENT_FROZEN)
> + return;
> update_context_time(ctx);
> update_cgrp_time_from_event(event);
> }
> @@ -2822,7 +2868,7 @@ static int __perf_install_in_context(vo
> #endif
>
> if (reprogram) {
> - ctx_sched_out(ctx, NULL, EVENT_TIME);
> + ctx_time_freeze(cpuctx, ctx);
> add_event_to_ctx(event, ctx);
> ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> get_event_type(event));
> @@ -2968,8 +3014,7 @@ static void __perf_event_enable(struct p
> event->state <= PERF_EVENT_STATE_ERROR)
> return;
>
> - if (ctx->is_active)
> - ctx_sched_out(ctx, NULL, EVENT_TIME);
> + ctx_time_freeze(cpuctx, ctx);
>
> perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
> perf_cgroup_event_enable(event, ctx);
> @@ -2977,19 +3022,15 @@ static void __perf_event_enable(struct p
> if (!ctx->is_active)
> return;
>
> - if (!event_filter_match(event)) {
> - ctx_sched_in(ctx, NULL, EVENT_TIME);
> + if (!event_filter_match(event))
> return;
> - }
>
> /*
> * If the event is in a group and isn't the group leader,
> * then don't put it on unless the group is on.
> */
> - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
> - ctx_sched_in(ctx, NULL, EVENT_TIME);
> + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
> return;
> - }
>
> task_ctx = cpuctx->task_ctx;
> if (ctx->task)
> @@ -3263,7 +3304,7 @@ static void __pmu_ctx_sched_out(struct p
> struct perf_event *event, *tmp;
> struct pmu *pmu = pmu_ctx->pmu;
>
> - if (ctx->task && !ctx->is_active) {
> + if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
> struct perf_cpu_pmu_context *cpc;
>
> cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> @@ -3338,24 +3379,29 @@ ctx_sched_out(struct perf_event_context
> *
> * would only update time for the pinned events.
> */
> - if (is_active & EVENT_TIME) {
> - /* update (and stop) ctx time */
> - update_context_time(ctx);
> - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
> + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
> +
> + /*
> + * CPU-release for the below ->is_active store,
> + * see __load_acquire() in perf_event_time_now()
> + */
> + barrier();
> + ctx->is_active &= ~event_type;
> +
> + if (!(ctx->is_active & EVENT_ALL)) {
> /*
> - * CPU-release for the below ->is_active store,
> - * see __load_acquire() in perf_event_time_now()
> + * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
> + * does not observe a hole. perf_ctx_unlock() will clean up.
> */
> - barrier();
> + if (ctx->is_active & EVENT_FROZEN)
> + ctx->is_active &= EVENT_TIME_FROZEN;
> + else
> + ctx->is_active = 0;
> }
>
> - ctx->is_active &= ~event_type;
> - if (!(ctx->is_active & EVENT_ALL))
> - ctx->is_active = 0;
> -
> if (ctx->task) {
> WARN_ON_ONCE(cpuctx->task_ctx != ctx);
> - if (!ctx->is_active)
> + if (!(ctx->is_active & EVENT_ALL))
> cpuctx->task_ctx = NULL;
> }
>
> @@ -3943,7 +3989,7 @@ ctx_sched_in(struct perf_event_context *
>
> ctx->is_active |= (event_type | EVENT_TIME);
> if (ctx->task) {
> - if (!is_active)
> + if (!(is_active & EVENT_ALL))
> cpuctx->task_ctx = ctx;
> else
> WARN_ON_ONCE(cpuctx->task_ctx != ctx);
> @@ -4424,7 +4470,7 @@ static void perf_event_enable_on_exec(st
>
> cpuctx = this_cpu_ptr(&perf_cpu_context);
> perf_ctx_lock(cpuctx, ctx);
> - ctx_sched_out(ctx, NULL, EVENT_TIME);
> + ctx_time_freeze(cpuctx, ctx);
>
> list_for_each_entry(event, &ctx->event_list, event_entry) {
> enabled |= event_enable_on_exec(event, ctx);
> @@ -4437,8 +4483,6 @@ static void perf_event_enable_on_exec(st
> if (enabled) {
> clone_ctx = unclone_ctx(ctx);
> ctx_resched(cpuctx, ctx, NULL, event_type);
> - } else {
> - ctx_sched_in(ctx, NULL, EVENT_TIME);
> }
> perf_ctx_unlock(cpuctx, ctx);
>
>
>
>