Message-ID: <CAM9d7cju4gCYkr7WvO3Z00P1H=9s-Um-pBhi2xcMHU6F37+meg@mail.gmail.com>
Date: Fri, 2 Aug 2024 10:39:58 -0700
From: Namhyung Kim <namhyung@...nel.org>
To: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...nel.org>
Cc: Mark Rutland <mark.rutland@....com>, 
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>, 
	Arnaldo Carvalho de Melo <acme@...nel.org>, LKML <linux-kernel@...r.kernel.org>, 
	Ravi Bangoria <ravi.bangoria@....com>, Kan Liang <kan.liang@...ux.intel.com>, 
	Stephane Eranian <eranian@...gle.com>, Ian Rogers <irogers@...gle.com>, 
	Mingwei Zhang <mizhang@...gle.com>
Subject: Re: [PATCH v2] perf/core: Optimize event reschedule for a PMU

Hi Peter,

On Tue, Jul 30, 2024 at 5:06 PM Namhyung Kim <namhyung@...nel.org> wrote:
>
> Currently ctx_resched() reschedules all events in every PMU in the
> context even when it only needs to do so for a single PMU.  This is
> the case when opening a new event or enabling an existing one.  What
> we want is to reschedule only the events in the affected PMU.  Also,
> perf_pmu_resched() currently calls ctx_resched() without any PMU
> information.
>
> Let's add a pmu argument to ctx_resched() so it can do the work for
> the given PMU only.  Also change __pmu_ctx_sched_in() to take the same
> arguments as its _sched_out() counterpart so that it can be called
> easily from __perf_pmu_resched().
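>
> To summarize the resulting calling convention (a sketch distilled from
> the diff below, not new code):
>
>   /* reschedule every PMU in the context, e.g. for the first event */
>   ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
>
>   /* reschedule only the PMU that the event belongs to */
>   ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));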
>
> Note that __perf_install_in_context() should call ctx_resched() for the
> very first event in the context in order to set ctx->is_active.  Later
> events can be handled by __perf_pmu_resched().
>
> Care should be taken when installing a task event for a PMU that has
> no CPU events.  __perf_pmu_resched() will ask the CPU PMU context to
> schedule any events in it according to the group info.  But as that
> PMU context was never activated, its event context pointer was never
> set.  So new NULL checks were added in __pmu_ctx_sched_{in,out}().
>
> With this change I get a 4x speedup (870 -> 214 usec/op; the gain is
> actually proportional to the number of active uncore PMU events) on a
> 2-socket Intel EMR machine when opening and closing a core PMU event
> in a loop while a bunch of uncore PMU events are active on the CPU.
> The test code (stress-pmu) follows below.
>
> Before)
>   # ./stress-pmu
>   delta: 0.087068 sec (870 usec/op)
>
> After)
>   # ./stress-pmu
>   delta: 0.021440 sec (214 usec/op)
>
> Signed-off-by: Namhyung Kim <namhyung@...nel.org>
> ---
>   $ cat stress-pmu.c
>   #include <stdio.h>
>   #include <unistd.h>
>   #include <linux/perf_event.h>
>   #include <sys/syscall.h>
>   #include <sys/time.h>
>
>   /* from uncore cpumask on EMR */
>   #define TARGET_CPU 60
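>   /* (the cpumask of each uncore PMU is exported in
>    * /sys/bus/event_source/devices/<pmu>/cpumask; hard-coded here) */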
>
>   #define LOOP 100
>   #define US2S 1000000
>
>   int open_perf_event(int type, int config)
>   {
>         struct perf_event_attr attr = {
>                 .type = type,
>                 .config = config,
>         };
>         int fd;
>
>         fd = syscall(SYS_perf_event_open, &attr, /*pid=*/-1, TARGET_CPU,
>                         /*group_fd=*/-1, /*flags=*/0);
>         if (fd < 0)
>                 printf("perf_event_open failed (type=%d, config=%d): %m\n", type, config);
>         return fd;
>   }
>
>   int main(int argc, char *argv[])
>   {
>         struct timeval ts1, ts2;
>         unsigned long long delta;
>
>         /* open random uncore PMU events */
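>         /* (dynamic PMU type numbers are assigned by the kernel and
>          * exported in /sys/bus/event_source/devices/<pmu>/type; types
>          * starting at 20 are assumed to land on uncore PMUs here) */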
>         for (int i = 0; i < 100; i++)
>                 open_perf_event(/*type=*/i + 20, /*config=*/0);
>
>         gettimeofday(&ts1, NULL);
>         for (int i = 0; i < LOOP; i++)
>                 close(open_perf_event(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES));
>         gettimeofday(&ts2, NULL);
>
>         delta = ts2.tv_sec * US2S + ts2.tv_usec - (ts1.tv_sec * US2S + ts1.tv_usec);
>         printf("delta: %llu.%06llu sec (%llu usec/op)\n",
>                 delta / US2S, delta % US2S, delta / LOOP);
>         return 0;
>   }
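>
>   (Not part of the patch: the above builds with a plain "gcc -o stress-pmu
>   stress-pmu.c" and needs root or a lowered perf_event_paranoid to open
>   events on another CPU.)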
> ---
> v2) add 'pmu' argument to ctx_resched() to reduce duplication

Are you ok with this?

Thanks,
Namhyung


>
>  kernel/events/core.c | 118 ++++++++++++++++++++++++++++++++++---------
>  1 file changed, 93 insertions(+), 25 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index f64c30e7d5da..41e2533859a4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -709,6 +709,10 @@ static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
>
>  static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
>  static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
> +static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
> +                               enum event_type_t event_type);
> +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> +                              enum event_type_t event_type);
>
>  #ifdef CONFIG_CGROUP_PERF
>
> @@ -2668,6 +2672,17 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
>                  ctx_sched_in(ctx, EVENT_FLEXIBLE);
>  }
>
> +static void perf_pmu_sched_in(struct perf_cpu_pmu_context *cpc,
> +                             struct perf_event_pmu_context *task_epc)
> +{
> +       __pmu_ctx_sched_in(&cpc->epc, EVENT_PINNED);
> +       if (task_epc)
> +                __pmu_ctx_sched_in(task_epc, EVENT_PINNED);
> +       __pmu_ctx_sched_in(&cpc->epc, EVENT_FLEXIBLE);
> +       if (task_epc)
> +                __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
> +}
> +
>  /*
>   * We want to maintain the following priority of scheduling:
>   *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
> @@ -2683,16 +2698,13 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
>   * event_type is a bit mask of the types of events involved. For CPU events,
>   * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
>   */
> -/*
> - * XXX: ctx_resched() reschedule entire perf_event_context while adding new
> - * event to the context or enabling existing event in the context. We can
> - * probably optimize it by rescheduling only affected pmu_ctx.
> - */
>  static void ctx_resched(struct perf_cpu_context *cpuctx,
>                         struct perf_event_context *task_ctx,
> -                       enum event_type_t event_type)
> +                       struct pmu *pmu, enum event_type_t event_type)
>  {
>         bool cpu_event = !!(event_type & EVENT_CPU);
> +       struct perf_cpu_pmu_context *cpc = NULL;
> +       struct perf_event_pmu_context *epc = NULL;
>
>         /*
>          * If pinned groups are involved, flexible groups also need to be
> @@ -2703,10 +2715,24 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>
>         event_type &= EVENT_ALL;
>
> -       perf_ctx_disable(&cpuctx->ctx, false);
> +       if (pmu) {
> +               cpc = this_cpu_ptr(pmu->cpu_pmu_context);
> +               perf_pmu_disable(pmu);
> +       } else {
> +               perf_ctx_disable(&cpuctx->ctx, false);
> +       }
> +
>         if (task_ctx) {
> -               perf_ctx_disable(task_ctx, false);
> -               task_ctx_sched_out(task_ctx, event_type);
> +               if (pmu) {
> +                       if (WARN_ON_ONCE(!cpc->task_epc || cpc->task_epc->ctx != task_ctx))
> +                               goto out;
> +
> +                       epc = cpc->task_epc;
> +                       __pmu_ctx_sched_out(epc, event_type);
> +               } else {
> +                       perf_ctx_disable(task_ctx, false);
> +                       task_ctx_sched_out(task_ctx, event_type);
> +               }
>         }
>
>         /*
> @@ -2716,15 +2742,30 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
>          *  - otherwise, do nothing more.
>          */
> -       if (cpu_event)
> -               ctx_sched_out(&cpuctx->ctx, event_type);
> -       else if (event_type & EVENT_PINNED)
> -               ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> +       if (cpu_event) {
> +               if (pmu)
> +                       __pmu_ctx_sched_out(&cpc->epc, event_type);
> +               else
> +                       ctx_sched_out(&cpuctx->ctx, event_type);
> +       } else if (event_type & EVENT_PINNED) {
> +               if (pmu)
> +                       __pmu_ctx_sched_out(&cpc->epc, EVENT_FLEXIBLE);
> +               else
> +                       ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> +       }
> +
> +       if (pmu)
> +               perf_pmu_sched_in(cpc, epc);
> +       else
> +               perf_event_sched_in(cpuctx, task_ctx);
>
> -       perf_event_sched_in(cpuctx, task_ctx);
> +out:
> +       if (pmu)
> +               perf_pmu_enable(pmu);
> +       else
> +               perf_ctx_enable(&cpuctx->ctx, false);
>
> -       perf_ctx_enable(&cpuctx->ctx, false);
> -       if (task_ctx)
> +       if (task_ctx && !pmu)
>                 perf_ctx_enable(task_ctx, false);
>  }
>
> @@ -2734,7 +2775,7 @@ void perf_pmu_resched(struct pmu *pmu)
>         struct perf_event_context *task_ctx = cpuctx->task_ctx;
>
>         perf_ctx_lock(cpuctx, task_ctx);
> -       ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
> +       ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
>         perf_ctx_unlock(cpuctx, task_ctx);
>  }
>
> @@ -2792,7 +2833,14 @@ static int  __perf_install_in_context(void *info)
>         if (reprogram) {
>                 ctx_sched_out(ctx, EVENT_TIME);
>                 add_event_to_ctx(event, ctx);
> -               ctx_resched(cpuctx, task_ctx, get_event_type(event));
> +               if (ctx->nr_events == 1) {
> +                       /* The first event needs to set ctx->is_active. */
> +                       ctx_resched(cpuctx, task_ctx, NULL, get_event_type(event));
> +               } else {
> +                       ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
> +                                   get_event_type(event));
> +                       ctx_sched_in(ctx, EVENT_TIME);
> +               }
>         } else {
>                 add_event_to_ctx(event, ctx);
>         }
> @@ -2962,7 +3010,8 @@ static void __perf_event_enable(struct perf_event *event,
>         if (ctx->task)
>                 WARN_ON_ONCE(task_ctx != ctx);
>
> -       ctx_resched(cpuctx, task_ctx, get_event_type(event));
> +       ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
> +       ctx_sched_in(ctx, EVENT_TIME);
>  }
>
>  /*
> @@ -3230,6 +3279,13 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
>         struct perf_event *event, *tmp;
>         struct pmu *pmu = pmu_ctx->pmu;
>
> +       /*
> +        * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
> +        * for task events and there are no cpu events.
> +        */
> +       if (ctx == NULL)
> +               return;
> +
>         if (ctx->task && !ctx->is_active) {
>                 struct perf_cpu_pmu_context *cpc;
>
> @@ -3872,10 +3928,22 @@ static void ctx_groups_sched_in(struct perf_event_context *ctx,
>         }
>  }
>
> -static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
> -                              struct pmu *pmu)
> +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
> +                              enum event_type_t event_type)
>  {
> -       pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> +       struct perf_event_context *ctx = pmu_ctx->ctx;
> +
> +       /*
> +        * CPU's pmu_ctx might not be active when __perf_pmu_resched() is called
> +        * for task events and there are no cpu events.
> +        */
> +       if (ctx == NULL)
> +               return;
> +
> +       if (event_type & EVENT_PINNED)
> +               pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
> +       if (event_type & EVENT_FLEXIBLE)
> +               pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
>  }
>
>  static void
> @@ -4309,14 +4377,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
>                 update_context_time(&cpuctx->ctx);
>                 __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
>                 rotate_ctx(&cpuctx->ctx, cpu_event);
> -               __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
> +               __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
>         }
>
>         if (task_event)
>                 rotate_ctx(task_epc->ctx, task_event);
>
>         if (task_event || (task_epc && cpu_event))
> -               __pmu_ctx_sched_in(task_epc->ctx, pmu);
> +               __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
>
>         perf_pmu_enable(pmu);
>         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> @@ -4394,7 +4462,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
>          */
>         if (enabled) {
>                 clone_ctx = unclone_ctx(ctx);
> -               ctx_resched(cpuctx, ctx, event_type);
> +               ctx_resched(cpuctx, ctx, NULL, event_type);
>         } else {
>                 ctx_sched_in(ctx, EVENT_TIME);
>         }
> --
> 2.46.0.rc1.232.g9752f9e123-goog
>
