linux-kernel - Re: [PATCH v2] perf/core: Optimize event reschedule for a PMU

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <476e7cea-f987-432a-995b-f7d52a123c9d@linux.intel.com>
Date: Fri, 2 Aug 2024 14:30:19 -0400
From: "Liang, Kan" <kan.liang@...ux.intel.com>
To: Namhyung Kim <namhyung@...nel.org>, Peter Zijlstra
 <peterz@...radead.org>, Ingo Molnar <mingo@...nel.org>
Cc: Mark Rutland <mark.rutland@....com>,
 Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
 Arnaldo Carvalho de Melo <acme@...nel.org>,
 LKML <linux-kernel@...r.kernel.org>, Ravi Bangoria <ravi.bangoria@....com>,
 Stephane Eranian <eranian@...gle.com>, Ian Rogers <irogers@...gle.com>,
 Mingwei Zhang <mizhang@...gle.com>
Subject: Re: [PATCH v2] perf/core: Optimize event reschedule for a PMU



On 2024-07-30 8:06 p.m., Namhyung Kim wrote:
> Current ctx_resched() reschedules every event in all PMUs in the
> context even if it only needs to do it for a single event.  This is the
> case when it opens a new event or enables an existing one.  What we
> want is to reschedule events in the PMU only.  Also perf_pmu_resched()
> currently calls ctx_resched() without PMU information.
> 
> Let's add pmu argument to ctx_resched() to do the work for the given
> PMU only.  And change the __pmu_ctx_sched_in() to be symmetrical to the
> _sched_out() counterpart for its arguments so that it can be called
> easily in the __perf_pmu_resched().
> 
> Note that __perf_install_in_context() should call ctx_resched() for the
> very first event in the context in order to set ctx->is_active.  Later
> events can be handled by __perf_pmu_resched().
> 
> Care should be taken when it installs a task event for a PMU and
> there's no CPU event for the PMU.  __perf_pmu_resched() will ask the
> CPU PMU context to schedule any events in it according to the group
> info.  But as the PMU context was not activated, it didn't set the
> event context pointer.  So I added new NULL checks in the
> __pmu_ctx_sched_{in,out}.
> 
> With this change I can get 4x speed up (but actually it's proportional
> to the number of uncore PMU events) on a 2-socket Intel EMR machine in
> opening and closing a perf event for the core PMU in a loop while there
> are a bunch of uncore PMU events active on the CPU.  The test code
> (stress-pmu) follows below.
> 
> Before)
>   # ./stress-pmu
>   delta: 0.087068 sec (870 usec/op)
> 
> After)
>   # ./stress-pmu
>   delta: 0.021440 sec (214 usec/op)
> 
> Signed-off-by: Namhyung Kim <namhyung@...nel.org>
> ---
>   $ cat stress-pmu.c
>   #include <stdio.h>
>   #include <unistd.h>
>   #include <linux/perf_event.h>
>   #include <sys/syscall.h>
>   #include <sys/time.h>
> 
>   /* from uncore cpumask on EMR */
>   #define TARGET_CPU 60
> 
>   #define LOOP 100
>   #define US2S 1000000
> 
>   int open_perf_event(int type, int config)
>   {
>   	struct perf_event_attr attr = {
>   		.type = type,
>   		.config = config,
>   	};
>   	int fd;
> 
>   	fd = syscall(SYS_perf_event_open, &attr, /*pid=*/-1, TARGET_CPU,
>   			/*group_fd=*/-1, /*flags=*/0);
>   	if (fd < 0)
>   		printf("perf_event_open failed (type=%d, config=%d): %m\n", type, config);
>   	return fd;
>   }
> 
>   int main(int argc, char *argv[])
>   {
>   	struct timeval ts1, ts2;
>   	unsigned long long delta;
>   	int target_cpu = TARGET_CPU;
> 
>   	/* open random uncore PMU events */
>   	for (int i = 0; i < 100; i++)
>   		open_perf_event(/*type=*/i + 20, /*config=*/0);
> 
>   	gettimeofday(&ts1, NULL);
>   	for (int i = 0; i < LOOP; i++)
>   		close(open_perf_event(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES));
>   	gettimeofday(&ts2, NULL);
> 
>   	delta = ts2.tv_sec * US2S + ts2.tv_usec - (ts1.tv_sec * US2S + ts1.tv_usec);
>   	printf("delta: %llu.%06llu sec (%llu usec/op)\n",
>   		delta / US2S, delta % US2S, delta / LOOP);
>   	return 0;
>   }
> ---
> v2) add 'pmu' argument to ctx_resched() to reduce duplication
> 
>  kernel/events/core.c | 118 ++++++++++++++++++++++++++++++++++---------
>  1 file changed, 93 insertions(+), 25 deletions(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index f64c30e7d5da..41e2533859a4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -709,6 +709,10 @@ static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
>  
>  static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
>  static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
> +static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
> +				enum event_type_t event_type);
> +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> +			       enum event_type_t event_type);
>  
>  #ifdef CONFIG_CGROUP_PERF
>  
> @@ -2668,6 +2672,17 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
>  		 ctx_sched_in(ctx, EVENT_FLEXIBLE);
>  }
>  
> +static void perf_pmu_sched_in(struct perf_cpu_pmu_context *cpc,
> +			      struct perf_event_pmu_context *task_epc)
> +{
> +	__pmu_ctx_sched_in(&cpc->epc, EVENT_PINNED);
> +	if (task_epc)
> +		 __pmu_ctx_sched_in(task_epc, EVENT_PINNED);
> +	__pmu_ctx_sched_in(&cpc->epc, EVENT_FLEXIBLE);
> +	if (task_epc)
> +		 __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
> +}
> +
>  /*
>   * We want to maintain the following priority of scheduling:
>   *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
> @@ -2683,16 +2698,13 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
>   * event_type is a bit mask of the types of events involved. For CPU events,
>   * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
>   */
> -/*
> - * XXX: ctx_resched() reschedule entire perf_event_context while adding new
> - * event to the context or enabling existing event in the context. We can
> - * probably optimize it by rescheduling only affected pmu_ctx.
> - */
>  static void ctx_resched(struct perf_cpu_context *cpuctx,
>  			struct perf_event_context *task_ctx,
> -			enum event_type_t event_type)
> +			struct pmu *pmu, enum event_type_t event_type)
>  {
>  	bool cpu_event = !!(event_type & EVENT_CPU);
> +	struct perf_cpu_pmu_context *cpc = NULL;
> +	struct perf_event_pmu_context *epc = NULL;
>  
>  	/*
>  	 * If pinned groups are involved, flexible groups also need to be
> @@ -2703,10 +2715,24 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>  
>  	event_type &= EVENT_ALL;
>  
> -	perf_ctx_disable(&cpuctx->ctx, false);
> +	if (pmu) {
> +		cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> +		perf_pmu_disable(pmu);
> +	} else {
> +		perf_ctx_disable(&cpuctx->ctx, false);
> +	}
> +
>  	if (task_ctx) {
> -		perf_ctx_disable(task_ctx, false);
> -		task_ctx_sched_out(task_ctx, event_type);
> +		if (pmu) {
> +			if (WARN_ON_ONCE(!cpc->task_epc || cpc->task_epc->ctx != task_ctx))
> +				goto out;
> +
> +			epc = cpc->task_epc;
> +			__pmu_ctx_sched_out(epc, event_type);
> +		} else {
> +			perf_ctx_disable(task_ctx, false);
> +			task_ctx_sched_out(task_ctx, event_type);
> +		}
>  	}
>  
>  	/*
> @@ -2716,15 +2742,30 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>  	 *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
>  	 *  - otherwise, do nothing more.
>  	 */
> -	if (cpu_event)
> -		ctx_sched_out(&cpuctx->ctx, event_type);
> -	else if (event_type & EVENT_PINNED)
> -		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> +	if (cpu_event) {
> +		if (pmu)
> +			__pmu_ctx_sched_out(&cpc->epc, event_type);
> +		else
> +			ctx_sched_out(&cpuctx->ctx, event_type);
> +	} else if (event_type & EVENT_PINNED) {
> +		if (pmu)
> +			__pmu_ctx_sched_out(&cpc->epc, EVENT_FLEXIBLE);
> +		else
> +			ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> +	}
> +
> +	if (pmu)
> +		perf_pmu_sched_in(cpc, epc);
> +	else
> +		perf_event_sched_in(cpuctx, task_ctx);
>  
> -	perf_event_sched_in(cpuctx, task_ctx);
> +out:
> +	if (pmu)
> +		perf_pmu_enable(pmu);
> +	else
> +		perf_ctx_enable(&cpuctx->ctx, false);
>  
> -	perf_ctx_enable(&cpuctx->ctx, false);
> -	if (task_ctx)
> +	if (task_ctx && !pmu)
>  		perf_ctx_enable(task_ctx, false);
>  }
>  
> @@ -2734,7 +2775,7 @@ void perf_pmu_resched(struct pmu *pmu)
>  	struct perf_event_context *task_ctx = cpuctx->task_ctx;
>  
>  	perf_ctx_lock(cpuctx, task_ctx);
> -	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
> +	ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
>  	perf_ctx_unlock(cpuctx, task_ctx);
>  }
>  
> @@ -2792,7 +2833,14 @@ static int  __perf_install_in_context(void *info)
>  	if (reprogram) {
>  		ctx_sched_out(ctx, EVENT_TIME);
>  		add_event_to_ctx(event, ctx);
> -		ctx_resched(cpuctx, task_ctx, get_event_type(event));
> +		if (ctx->nr_events == 1) {
> +			/* The first event needs to set ctx->is_active. */
> +			ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> +		} else {
> +			ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> +				    get_event_type(event));
> +			ctx_sched_in(ctx, EVENT_TIME);

The changelog doesn't say much about the time difference. As I
understand it, the time is shared among the PMUs in the same ctx.
When perf does ctx_resched(), the time is deducted.
There is no problem with stopping and restarting the global time when
perf reschedules all PMUs.
But if only one PMU is rescheduled while the others are still running,
stopping and restarting the global time may be a problem: the other PMUs
will be impacted.

Thanks,
Kan
> +		}
>  	} else {
>  		add_event_to_ctx(event, ctx);
>  	}
> @@ -2962,7 +3010,8 @@ static void __perf_event_enable(struct perf_event *event,
>  	if (ctx->task)
>  		WARN_ON_ONCE(task_ctx != ctx);
>  
> -	ctx_resched(cpuctx, task_ctx, get_event_type(event));
> +	ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
> +	ctx_sched_in(ctx, EVENT_TIME);
>  }
>  
>  /*
> @@ -3230,6 +3279,13 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
>  	struct perf_event *event, *tmp;
>  	struct pmu *pmu = pmu_ctx->pmu;
>  
> +	/*
> +	 * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
> +	 * for task events and there's no cpu events.
> +	 */
> +	if (ctx == NULL)
> +		return;
> +
>  	if (ctx->task && !ctx->is_active) {
>  		struct perf_cpu_pmu_context *cpc;
>  
> @@ -3872,10 +3928,22 @@ static void ctx_groups_sched_in(struct perf_event_context *ctx,
>  	}
>  }
>  
> -static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
> -			       struct pmu *pmu)
> +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> +			       enum event_type_t event_type)
>  {
> -	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> +	struct perf_event_context *ctx = pmu_ctx->ctx;
> +
> +	/*
> +	 * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
> +	 * for task events and there's no cpu events.
> +	 */
> +	if (ctx == NULL)
> +		return;
> +
> +	if (event_type & EVENT_PINNED)
> +		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
> +	if (event_type & EVENT_FLEXIBLE)
> +		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
>  }
>  
>  static void
> @@ -4309,14 +4377,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
>  		update_context_time(&cpuctx->ctx);
>  		__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
>  		rotate_ctx(&cpuctx->ctx, cpu_event);
> -		__pmu_ctx_sched_in(&cpuctx->ctx, pmu);
> +		__pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
>  	}
>  
>  	if (task_event)
>  		rotate_ctx(task_epc->ctx, task_event);
>  
>  	if (task_event || (task_epc && cpu_event))
> -		__pmu_ctx_sched_in(task_epc->ctx, pmu);
> +		__pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
>  
>  	perf_pmu_enable(pmu);
>  	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> @@ -4394,7 +4462,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
>  	 */
>  	if (enabled) {
>  		clone_ctx = unclone_ctx(ctx);
> -		ctx_resched(cpuctx, ctx, event_type);
> +		ctx_resched(cpuctx, ctx, NULL, event_type);
>  	} else {
>  		ctx_sched_in(ctx, EVENT_TIME);
>  	}