[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <476e7cea-f987-432a-995b-f7d52a123c9d@linux.intel.com>
Date: Fri, 2 Aug 2024 14:30:19 -0400
From: "Liang, Kan" <kan.liang@...ux.intel.com>
To: Namhyung Kim <namhyung@...nel.org>, Peter Zijlstra
<peterz@...radead.org>, Ingo Molnar <mingo@...nel.org>
Cc: Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
LKML <linux-kernel@...r.kernel.org>, Ravi Bangoria <ravi.bangoria@....com>,
Stephane Eranian <eranian@...gle.com>, Ian Rogers <irogers@...gle.com>,
Mingwei Zhang <mizhang@...gle.com>
Subject: Re: [PATCH v2] perf/core: Optimize event reschedule for a PMU
On 2024-07-30 8:06 p.m., Namhyung Kim wrote:
> Current ctx_resched() reschedules every events in all PMUs in the
> context even if it only needs to do it for a single event. This is the
> case when it opens a new event or enables an existing one. What we
> want is to reschedule events in the PMU only. Also perf_pmu_resched()
> currently calls ctx_resched() without PMU information.
>
> Let's add pmu argument to ctx_resched() to do the work for the given
> PMU only. And change the __pmu_ctx_sched_in() to be symmetrical to the
> _sched_out() counterpart for its arguments so that it can be called
> easily in the __perf_pmu_resched().
>
> Note that __perf_install_in_context() should call ctx_resched() for the
> very first event in the context in order to set ctx->is_active. Later
> events can be handled by __perf_pmu_resched().
>
> Care should be taken when it installs a task event for a PMU and
> there's no CPU event for the PMU. __perf_pmu_resched() will ask the
> CPU PMU context to schedule any events in it according to the group
> info. But as the PMU context was not activated, it didn't set the
> event context pointer. So I added new NULL checks in the
> __pmu_ctx_sched_{in,out}.
>
> With this change I can get 4x speed up (but actually it's proportional
> to the number of uncore PMU events) on a 2-socket Intel EMR machine in
> opening and closing a perf event for the core PMU in a loop while there
> are a bunch of uncore PMU events active on the CPU. The test code
> (stress-pmu) follows below.
>
> Before)
> # ./stress-pmu
> delta: 0.087068 sec (870 usec/op)
>
> After)
> # ./stress-pmu
> delta: 0.021440 sec (214 usec/op)
>
> Signed-off-by: Namhyung Kim <namhyung@...nel.org>
> ---
> $ cat stress-pmu.c
> #include <stdio.h>
> #include <unistd.h>
> #include <linux/perf_event.h>
> #include <sys/syscall.h>
> #include <sys/time.h>
>
> /* from uncore cpumask on EMR */
> #define TARGET_CPU 60
>
> #define LOOP 100
> #define US2S 1000000
>
> int open_perf_event(int type, int config)
> {
> struct perf_event_attr attr = {
> .type = type,
> .config = config,
> };
> int fd;
>
> fd = syscall(SYS_perf_event_open, &attr, /*pid=*/-1, TARGET_CPU,
> /*group_fd=*/-1, /*flags=*/0);
> if (fd < 0)
> printf("perf_event_open failed (type=%d, config=%d): %m\n", type, config);
> return fd;
> }
>
> int main(int argc, char *argv[])
> {
> struct timeval ts1, ts2;
> unsigned long long delta;
> int target_cpu = TARGET_CPU;
>
> /* open random uncore PMU events */
> for (int i = 0; i < 100; i++)
> open_perf_event(/*type=*/i + 20, /*config=*/0);
>
> gettimeofday(&ts1, NULL);
> for (int i = 0; i < LOOP; i++)
> close(open_perf_event(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES));
> gettimeofday(&ts2, NULL);
>
> delta = ts2.tv_sec * US2S + ts2.tv_usec - (ts1.tv_sec * US2S + ts1.tv_usec);
> printf("delta: %llu.%06llu sec (%llu usec/op)\n",
> delta / US2S, delta % US2S, delta / LOOP);
> return 0;
> }
> ---
> v2) add 'pmu' argument to ctx_resched() to reduce duplication
>
> kernel/events/core.c | 118 ++++++++++++++++++++++++++++++++++---------
> 1 file changed, 93 insertions(+), 25 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index f64c30e7d5da..41e2533859a4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -709,6 +709,10 @@ static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
>
> static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
> static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
> +static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
> + enum event_type_t event_type);
> +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> + enum event_type_t event_type);
>
> #ifdef CONFIG_CGROUP_PERF
>
> @@ -2668,6 +2672,17 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
> ctx_sched_in(ctx, EVENT_FLEXIBLE);
> }
>
> +static void perf_pmu_sched_in(struct perf_cpu_pmu_context *cpc,
> + struct perf_event_pmu_context *task_epc)
> +{
> + __pmu_ctx_sched_in(&cpc->epc, EVENT_PINNED);
> + if (task_epc)
> + __pmu_ctx_sched_in(task_epc, EVENT_PINNED);
> + __pmu_ctx_sched_in(&cpc->epc, EVENT_FLEXIBLE);
> + if (task_epc)
> + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
> +}
> +
> /*
> * We want to maintain the following priority of scheduling:
> * - CPU pinned (EVENT_CPU | EVENT_PINNED)
> @@ -2683,16 +2698,13 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
> * event_type is a bit mask of the types of events involved. For CPU events,
> * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
> */
> -/*
> - * XXX: ctx_resched() reschedule entire perf_event_context while adding new
> - * event to the context or enabling existing event in the context. We can
> - * probably optimize it by rescheduling only affected pmu_ctx.
> - */
> static void ctx_resched(struct perf_cpu_context *cpuctx,
> struct perf_event_context *task_ctx,
> - enum event_type_t event_type)
> + struct pmu *pmu, enum event_type_t event_type)
> {
> bool cpu_event = !!(event_type & EVENT_CPU);
> + struct perf_cpu_pmu_context *cpc = NULL;
> + struct perf_event_pmu_context *epc = NULL;
>
> /*
> * If pinned groups are involved, flexible groups also need to be
> @@ -2703,10 +2715,24 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>
> event_type &= EVENT_ALL;
>
> - perf_ctx_disable(&cpuctx->ctx, false);
> + if (pmu) {
> + cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> + perf_pmu_disable(pmu);
> + } else {
> + perf_ctx_disable(&cpuctx->ctx, false);
> + }
> +
> if (task_ctx) {
> - perf_ctx_disable(task_ctx, false);
> - task_ctx_sched_out(task_ctx, event_type);
> + if (pmu) {
> + if (WARN_ON_ONCE(!cpc->task_epc || cpc->task_epc->ctx != task_ctx))
> + goto out;
> +
> + epc = cpc->task_epc;
> + __pmu_ctx_sched_out(epc, event_type);
> + } else {
> + perf_ctx_disable(task_ctx, false);
> + task_ctx_sched_out(task_ctx, event_type);
> + }
> }
>
> /*
> @@ -2716,15 +2742,30 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
> * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
> * - otherwise, do nothing more.
> */
> - if (cpu_event)
> - ctx_sched_out(&cpuctx->ctx, event_type);
> - else if (event_type & EVENT_PINNED)
> - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> + if (cpu_event) {
> + if (pmu)
> + __pmu_ctx_sched_out(&cpc->epc, event_type);
> + else
> + ctx_sched_out(&cpuctx->ctx, event_type);
> + } else if (event_type & EVENT_PINNED) {
> + if (pmu)
> + __pmu_ctx_sched_out(&cpc->epc, EVENT_FLEXIBLE);
> + else
> + ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> + }
> +
> + if (pmu)
> + perf_pmu_sched_in(cpc, epc);
> + else
> + perf_event_sched_in(cpuctx, task_ctx);
>
> - perf_event_sched_in(cpuctx, task_ctx);
> +out:
> + if (pmu)
> + perf_pmu_enable(pmu);
> + else
> + perf_ctx_enable(&cpuctx->ctx, false);
>
> - perf_ctx_enable(&cpuctx->ctx, false);
> - if (task_ctx)
> + if (task_ctx && !pmu)
> perf_ctx_enable(task_ctx, false);
> }
>
> @@ -2734,7 +2775,7 @@ void perf_pmu_resched(struct pmu *pmu)
> struct perf_event_context *task_ctx = cpuctx->task_ctx;
>
> perf_ctx_lock(cpuctx, task_ctx);
> - ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
> + ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
> perf_ctx_unlock(cpuctx, task_ctx);
> }
>
> @@ -2792,7 +2833,14 @@ static int __perf_install_in_context(void *info)
> if (reprogram) {
> ctx_sched_out(ctx, EVENT_TIME);
> add_event_to_ctx(event, ctx);
> - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> + if (ctx->nr_events == 1) {
> + /* The first event needs to set ctx->is_active. */
> + ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> + } else {
> + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> + get_event_type(event));
> + ctx_sched_in(ctx, EVENT_TIME);
The changelog doesn't mention the time difference much. As my
understanding, the time is shared among PMUs in the same ctx.
When perf does ctx_resched(), the time is deducted.
There is no problem to stop and restart the global time when perf
re-schedule all PMUs.
But if only one PMU is re-scheduled while others are still running, it
may be a problem to stop and restart the global time. Other PMUs will be
impacted.
Thanks,
Kan
> + }
> } else {
> add_event_to_ctx(event, ctx);
> }
> @@ -2962,7 +3010,8 @@ static void __perf_event_enable(struct perf_event *event,
> if (ctx->task)
> WARN_ON_ONCE(task_ctx != ctx);
>
> - ctx_resched(cpuctx, task_ctx, get_event_type(event));
> + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
> + ctx_sched_in(ctx, EVENT_TIME);
> }
>
> /*
> @@ -3230,6 +3279,13 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
> struct perf_event *event, *tmp;
> struct pmu *pmu = pmu_ctx->pmu;
>
> + /*
> + * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
> + * for task events and there's no cpu events.
> + */
> + if (ctx == NULL)
> + return;
> +
> if (ctx->task && !ctx->is_active) {
> struct perf_cpu_pmu_context *cpc;
>
> @@ -3872,10 +3928,22 @@ static void ctx_groups_sched_in(struct perf_event_context *ctx,
> }
> }
>
> -static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
> - struct pmu *pmu)
> +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> + enum event_type_t event_type)
> {
> - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> + struct perf_event_context *ctx = pmu_ctx->ctx;
> +
> + /*
> + * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
> + * for task events and there's no cpu events.
> + */
> + if (ctx == NULL)
> + return;
> +
> + if (event_type & EVENT_PINNED)
> + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
> + if (event_type & EVENT_FLEXIBLE)
> + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
> }
>
> static void
> @@ -4309,14 +4377,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
> update_context_time(&cpuctx->ctx);
> __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
> rotate_ctx(&cpuctx->ctx, cpu_event);
> - __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
> + __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
> }
>
> if (task_event)
> rotate_ctx(task_epc->ctx, task_event);
>
> if (task_event || (task_epc && cpu_event))
> - __pmu_ctx_sched_in(task_epc->ctx, pmu);
> + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
>
> perf_pmu_enable(pmu);
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> @@ -4394,7 +4462,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
> */
> if (enabled) {
> clone_ctx = unclone_ctx(ctx);
> - ctx_resched(cpuctx, ctx, event_type);
> + ctx_resched(cpuctx, ctx, NULL, event_type);
> } else {
> ctx_sched_in(ctx, EVENT_TIME);
> }
Powered by blists - more mailing lists