Message-ID: <b38cc358-8e46-48bd-88c0-ff4b8db6bd15@linux.intel.com>
Date: Wed, 7 Aug 2024 11:17:18 -0400
From: "Liang, Kan" <kan.liang@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>, mingo@...nel.org
Cc: acme@...nel.org, namhyung@...nel.org, mark.rutland@....com,
alexander.shishkin@...ux.intel.com, jolsa@...nel.org, irogers@...gle.com,
adrian.hunter@...el.com, linux-perf-users@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 4/5] perf: Add context time freeze
On 2024-08-07 7:29 a.m., Peter Zijlstra wrote:
> Many of the context reschedule users are of the form:
>
> ctx_sched_out(.type = EVENT_TIME);
> ... modify context
> ctx_resched();
>
> With the idea that the whole reschedule happens with a single
> time-stamp, rather than with each ctx_sched_out() advancing time and
> ctx_sched_in() re-starting time, creating a non-atomic experience.
>
> However, Kan noticed that since this completely stops time, it
> actually loses a bit of time between the stop and start. Worse, now
> that we can do partial (per PMU) reschedules, the PMUs that are not
> scheduled out still observe the time glitch.
>
> Replace this with:
>
> ctx_time_freeze();
> ... modify context
> ctx_resched();
>
> With the assumption that this happens in a perf_ctx_lock() /
> perf_ctx_unlock() pair.
>
> The new ctx_time_freeze() will update time and set EVENT_FROZEN, and
> ensure that EVENT_TIME and EVENT_FROZEN remain set; this prevents
> perf_event_time_now() from observing a time wobble from not seeing
> EVENT_TIME for a little while.
>
> Additionally, this avoids losing time between
> ctx_sched_out(EVENT_TIME) and ctx_sched_in(), which would re-set the
> timestamp.
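
For reference, a minimal userspace sketch of the intended time
bookkeeping (not kernel code; the flag values match the patch, but the
clock, the struct layout and ctx_time_now() are simplified stand-ins):

#include <assert.h>
#include <stdio.h>

#define EVENT_TIME	0x04
#define EVENT_FROZEN	0x08

struct ctx {
	int is_active;
	unsigned long long time;	/* accumulated ctx time */
	unsigned long long timestamp;	/* last update point */
	long long timeoffset;		/* ctx time - wall time */
};

static void update_context_time(struct ctx *ctx, unsigned long long now)
{
	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
	ctx->timeoffset = (long long)ctx->time - (long long)now;
}

/* Roughly what a reader such as perf_event_time_now() sees. */
static unsigned long long ctx_time_now(struct ctx *ctx, unsigned long long now)
{
	if (!(ctx->is_active & EVENT_TIME))
		return ctx->time;		/* stopped view: the "wobble" */
	return now + ctx->timeoffset;		/* live view */
}

static void ctx_time_freeze(struct ctx *ctx, unsigned long long now)
{
	/* Bring time up to date once, then suppress further updates. */
	if (ctx->is_active & EVENT_TIME) {
		if (!(ctx->is_active & EVENT_FROZEN))
			update_context_time(ctx, now);
		ctx->is_active |= EVENT_FROZEN;
	}
}

int main(void)
{
	struct ctx ctx = { .is_active = EVENT_TIME };

	update_context_time(&ctx, 100);
	ctx_time_freeze(&ctx, 100);

	/* ... modify context: EVENT_TIME stays set, readers see live time */
	assert(ctx_time_now(&ctx, 120) == 120);

	/* perf_ctx_unlock() equivalent: just drop EVENT_FROZEN */
	ctx.is_active &= ~EVENT_FROZEN;
	assert(ctx_time_now(&ctx, 150) == 150);

	printf("no stall and no lost time across the reschedule\n");
	return 0;
}
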
>
> Reported-by: Kan Liang <kan.liang@...ux.intel.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> ---
> kernel/events/core.c | 128 ++++++++++++++++++++++++++++++++++-----------------
> 1 file changed, 86 insertions(+), 42 deletions(-)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, re
> return data.ret;
> }
>
> +enum event_type_t {
> + EVENT_FLEXIBLE = 0x01,
> + EVENT_PINNED = 0x02,
> + EVENT_TIME = 0x04,
> + EVENT_FROZEN = 0x08,
> + /* see ctx_resched() for details */
> + EVENT_CPU = 0x10,
> + EVENT_CGROUP = 0x20,
> +
> + /* compound helpers */
> + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
> + EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
> +};
> +
> +static inline void __perf_ctx_lock(struct perf_event_context *ctx)
> +{
> + raw_spin_lock(&ctx->lock);
> + WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
> +}
> +
> static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
> struct perf_event_context *ctx)
> {
> - raw_spin_lock(&cpuctx->ctx.lock);
> + __perf_ctx_lock(&cpuctx->ctx);
> if (ctx)
> - raw_spin_lock(&ctx->lock);
> + __perf_ctx_lock(ctx);
> +}
> +
> +static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
> +{
> + /*
> + * If ctx_sched_in() didn't again set any ALL flags, clean up
> + * after ctx_sched_out() by clearing is_active.
> + */
> + if (ctx->is_active & EVENT_FROZEN) {
> + if (!(ctx->is_active & EVENT_ALL))
Nit:
It may be better to add a macro/inline function to replace all of the
(ctx->is_active & EVENT_ALL) checks. For example:
+static inline bool perf_ctx_has_active_events(struct perf_event_context *ctx)
+{
+	return ctx->is_active & EVENT_ALL;
+}
 ...
+	if (ctx->is_active & EVENT_FROZEN) {
+		if (!perf_ctx_has_active_events(ctx))
+			ctx->is_active = 0;
+		else
+			ctx->is_active &= ~EVENT_FROZEN;
This makes it straightforward to tell that we want to clear all flags
when there are no active events.
EVENT_ALL can be confusing: it actually means all events, not all event
types, so a developer may have to go to the definition to figure out
exactly what EVENT_ALL includes.
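
Just a sketch on top of this patch: the other EVENT_ALL checks converted
later in the patch (__pmu_ctx_sched_out() and ctx_sched_out()) could
then read the same way, e.g.:

-	if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
+	if (ctx->task && !perf_ctx_has_active_events(ctx)) {
 ...
-	if (!(ctx->is_active & EVENT_ALL)) {
+	if (!perf_ctx_has_active_events(ctx)) {

(The ctx_sched_in() check uses the local is_active snapshot, so it
would stay as-is.)
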
Thanks,
Kan
> + ctx->is_active = 0;
> + else
> + ctx->is_active &= ~EVENT_FROZEN;
> + }
> + raw_spin_unlock(&ctx->lock);
> }
>
> static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
> struct perf_event_context *ctx)
> {
> if (ctx)
> - raw_spin_unlock(&ctx->lock);
> - raw_spin_unlock(&cpuctx->ctx.lock);
> + __perf_ctx_unlock(ctx);
> + __perf_ctx_unlock(&cpuctx->ctx);
> }
>
> #define TASK_TOMBSTONE ((void *)-1L)
> @@ -370,16 +405,6 @@ static void event_function_local(struct
> (PERF_SAMPLE_BRANCH_KERNEL |\
> PERF_SAMPLE_BRANCH_HV)
>
> -enum event_type_t {
> - EVENT_FLEXIBLE = 0x1,
> - EVENT_PINNED = 0x2,
> - EVENT_TIME = 0x4,
> - /* see ctx_resched() for details */
> - EVENT_CPU = 0x8,
> - EVENT_CGROUP = 0x10,
> - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
> -};
> -
> /*
> * perf_sched_events : >0 events exist
> */
> @@ -2332,18 +2357,39 @@ group_sched_out(struct perf_event *group
> }
>
> static inline void
> -ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
> {
> if (ctx->is_active & EVENT_TIME) {
> + if (ctx->is_active & EVENT_FROZEN)
> + return;
> update_context_time(ctx);
> - update_cgrp_time_from_cpuctx(cpuctx, false);
> + update_cgrp_time_from_cpuctx(cpuctx, final);
> }
> }
>
> static inline void
> +ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +{
> + __ctx_time_update(cpuctx, ctx, false);
> +}
> +
> +/*
> + * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
> + */
> +static inline void
> +ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
> +{
> + ctx_time_update(cpuctx, ctx);
> + if (ctx->is_active & EVENT_TIME)
> + ctx->is_active |= EVENT_FROZEN;
> +}
> +
> +static inline void
> ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
> {
> if (ctx->is_active & EVENT_TIME) {
> + if (ctx->is_active & EVENT_FROZEN)
> + return;
> update_context_time(ctx);
> update_cgrp_time_from_event(event);
> }
> @@ -2822,7 +2868,7 @@ static int __perf_install_in_context(vo
> #endif
>
> if (reprogram) {
> - ctx_sched_out(ctx, NULL, EVENT_TIME);
> + ctx_time_freeze(cpuctx, ctx);
> add_event_to_ctx(event, ctx);
> ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> get_event_type(event));
> @@ -2968,8 +3014,7 @@ static void __perf_event_enable(struct p
> event->state <= PERF_EVENT_STATE_ERROR)
> return;
>
> - if (ctx->is_active)
> - ctx_sched_out(ctx, NULL, EVENT_TIME);
> + ctx_time_freeze(cpuctx, ctx);
>
> perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
> perf_cgroup_event_enable(event, ctx);
> @@ -2977,19 +3022,15 @@ static void __perf_event_enable(struct p
> if (!ctx->is_active)
> return;
>
> - if (!event_filter_match(event)) {
> - ctx_sched_in(ctx, NULL, EVENT_TIME);
> + if (!event_filter_match(event))
> return;
> - }
>
> /*
> * If the event is in a group and isn't the group leader,
> * then don't put it on unless the group is on.
> */
> - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
> - ctx_sched_in(ctx, NULL, EVENT_TIME);
> + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
> return;
> - }
>
> task_ctx = cpuctx->task_ctx;
> if (ctx->task)
> @@ -3263,7 +3304,7 @@ static void __pmu_ctx_sched_out(struct p
> struct perf_event *event, *tmp;
> struct pmu *pmu = pmu_ctx->pmu;
>
> - if (ctx->task && !ctx->is_active) {
> + if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
> struct perf_cpu_pmu_context *cpc;
>
> cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> @@ -3338,24 +3379,29 @@ ctx_sched_out(struct perf_event_context
> *
> * would only update time for the pinned events.
> */
> - if (is_active & EVENT_TIME) {
> - /* update (and stop) ctx time */
> - update_context_time(ctx);
> - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
> + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
> +
> + /*
> + * CPU-release for the below ->is_active store,
> + * see __load_acquire() in perf_event_time_now()
> + */
> + barrier();
> + ctx->is_active &= ~event_type;
> +
> + if (!(ctx->is_active & EVENT_ALL)) {
> /*
> - * CPU-release for the below ->is_active store,
> - * see __load_acquire() in perf_event_time_now()
> + * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
> + * does not observe a hole. perf_ctx_unlock() will clean up.
> */
> - barrier();
> + if (ctx->is_active & EVENT_FROZEN)
> + ctx->is_active &= EVENT_TIME_FROZEN;
> + else
> + ctx->is_active = 0;
> }
>
> - ctx->is_active &= ~event_type;
> - if (!(ctx->is_active & EVENT_ALL))
> - ctx->is_active = 0;
> -
> if (ctx->task) {
> WARN_ON_ONCE(cpuctx->task_ctx != ctx);
> - if (!ctx->is_active)
> + if (!(ctx->is_active & EVENT_ALL))
> cpuctx->task_ctx = NULL;
> }
>
> @@ -3943,7 +3989,7 @@ ctx_sched_in(struct perf_event_context *
>
> ctx->is_active |= (event_type | EVENT_TIME);
> if (ctx->task) {
> - if (!is_active)
> + if (!(is_active & EVENT_ALL))
> cpuctx->task_ctx = ctx;
> else
> WARN_ON_ONCE(cpuctx->task_ctx != ctx);
> @@ -4424,7 +4470,7 @@ static void perf_event_enable_on_exec(st
>
> cpuctx = this_cpu_ptr(&perf_cpu_context);
> perf_ctx_lock(cpuctx, ctx);
> - ctx_sched_out(ctx, NULL, EVENT_TIME);
> + ctx_time_freeze(cpuctx, ctx);
>
> list_for_each_entry(event, &ctx->event_list, event_entry) {
> enabled |= event_enable_on_exec(event, ctx);
> @@ -4437,8 +4483,6 @@ static void perf_event_enable_on_exec(st
> if (enabled) {
> clone_ctx = unclone_ctx(ctx);
> ctx_resched(cpuctx, ctx, NULL, event_type);
> - } else {
> - ctx_sched_in(ctx, NULL, EVENT_TIME);
> }
> perf_ctx_unlock(cpuctx, ctx);
>
>
>
>