linux-kernel - [PATCH v4 3/4] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <4887d42a-bd85-e769-5568-49923510e24c@linux.intel.com>
Date:   Wed, 21 Jun 2017 23:20:40 +0300
From:   Alexey Budankov <alexey.budankov@...ux.intel.com>
To:     Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Arnaldo Carvalho de Melo <acme@...nel.org>,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Cc:     Andi Kleen <ak@...ux.intel.com>, Kan Liang <kan.liang@...el.com>,
        Dmitri Prokhorov <Dmitry.Prohorov@...el.com>,
        Valery Cherepennikov <valery.cherepennikov@...el.com>,
        Mark Rutland <mark.rutland@....com>,
        David Carrillo-Cisneros <davidcc@...gle.com>,
        Stephane Eranian <eranian@...gle.com>,
        linux-kernel <linux-kernel@...r.kernel.org>
Subject: [PATCH v4 3/4] perf/core: addressing 4x slowdown during per-process
 profiling of STREAM benchmark on Intel Xeon Phi

perf/core: mux switch to skip to the current CPU's events list on mux 
interrupt

By default, the userspace perf tool opens per-cpu task-bound events
when sampling, so for N logical events requested by the user, the tool
will open N * NR_CPUS events.

In the kernel, we mux events with a hrtimer, periodically rotating the
flexible group list and trying to schedule each group in turn. We skip
groups whose cpu filter doesn't match. So when we get unlucky, we can
walk N * (NR_CPUS - 1) groups pointlessly for each hrtimer invocation.

This has been observed to result in significant overhead when running
the STREAM benchmark on 272 core Xeon Phi systems.

One way to avoid this is to place our events into an rb tree sorted by
CPU filter, so that our hrtimer can skip to the current CPU's
list and ignore everything else.

This patch implements a mux switch that triggers skipping to
the current CPU's events list only.

Signed-off-by: Alexey Budankov <alexey.budankov@...ux.intel.com>
---
  kernel/events/core.c | 137 ++++++++++++++++++++++++++++++++-------------------
  1 file changed, 86 insertions(+), 51 deletions(-)

1. added mux switch into:

    ctx_sched_out()
    cpu_ctx_sched_out()
    ctx_sched_in()
    cpu_ctx_sched_in()
    perf_event_sched_in()

2. The mux switch is set to 1 in perf_rotate_context() only.
    All the other functions set the switch to 0 prior to
    calling the API above.

3. implemented iteration through only the cpu == -1 and current CPU
    group lists when the mux switch is set, in ctx_sched_in() and
    ctx_sched_out()

4. adjusted the rest of the code to adapt to the changes in the API.

The patch was tested under perf_fuzzer and tests on Xeon Phi:
https://github.com/deater/perf_event_tests.
On 21.06.2017 21:31, Alexey Budankov wrote:
No new issues were found in comparison to the clean kernel.

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6eb1c3f..2dbc60d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -555,11 +555,11 @@ void perf_sample_event_took(u64 sample_len_ns)
  static atomic64_t perf_event_id;

  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			      enum event_type_t event_type);
+			      enum event_type_t event_type, int mux);

  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
  			     enum event_type_t event_type,
-			     struct task_struct *task);
+			     struct task_struct *task, int mux);

  static void update_context_time(struct perf_event_context *ctx);
  static u64 perf_event_time(struct perf_event *event);
@@ -701,6 +701,7 @@ static void perf_cgroup_switch(struct task_struct 
*task, int mode)
  	struct perf_cpu_context *cpuctx;
  	struct list_head *list;
  	unsigned long flags;
+	int mux = 0;

  	/*
  	 * Disable interrupts and preemption to avoid this CPU's
@@ -716,7 +717,7 @@ static void perf_cgroup_switch(struct task_struct 
*task, int mode)
  		perf_pmu_disable(cpuctx->ctx.pmu);

  		if (mode & PERF_CGROUP_SWOUT) {
-			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+			cpu_ctx_sched_out(cpuctx, EVENT_ALL, mux);
  			/*
  			 * must not be done before ctxswout due
  			 * to event_filter_match() in event_sched_out()
@@ -735,7 +736,7 @@ static void perf_cgroup_switch(struct task_struct 
*task, int mode)
  			 */
  			cpuctx->cgrp = perf_cgroup_from_task(task,
  							     &cpuctx->ctx);
-			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, mux);
  		}
  		perf_pmu_enable(cpuctx->ctx.pmu);
  		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -1605,12 +1606,6 @@ perf_event_groups_delete(struct perf_event_groups 
*groups,
  static void
  perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
  {
-	WARN_ON_ONCE(!groups);
-
-	list_rotate_left(&groups->list);
-
-	/* will replace rotation above with mux switch in patch v4-3/4
-
  	struct rb_node *node;
  	struct perf_event *node_event;

@@ -1630,8 +1625,6 @@ perf_event_groups_rotate(struct perf_event_groups 
*groups, int cpu)
  			return;
  		}
  	}
-
-	*/
  }

  /*
@@ -2520,36 +2513,38 @@ static void add_event_to_ctx(struct perf_event 
*event,

  static void ctx_sched_out(struct perf_event_context *ctx,
  			  struct perf_cpu_context *cpuctx,
-			  enum event_type_t event_type);
+			  enum event_type_t event_type, int mux);
  static void
  ctx_sched_in(struct perf_event_context *ctx,
  	     struct perf_cpu_context *cpuctx,
  	     enum event_type_t event_type,
-	     struct task_struct *task);
+	     struct task_struct *task, int mux);

  static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
  			       struct perf_event_context *ctx,
  			       enum event_type_t event_type)
  {
+	int mux = 0;
+
  	if (!cpuctx->task_ctx)
  		return;

  	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
  		return;

-	ctx_sched_out(ctx, cpuctx, event_type);
+	ctx_sched_out(ctx, cpuctx, event_type, mux);
  }

  static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
  				struct perf_event_context *ctx,
-				struct task_struct *task)
+				struct task_struct *task, int mux)
  {
-	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, mux);
  	if (ctx)
-		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, mux);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, mux);
  	if (ctx)
-		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, mux);
  }

  /*
@@ -2573,6 +2568,7 @@ static void ctx_resched(struct perf_cpu_context 
*cpuctx,
  {
  	enum event_type_t ctx_event_type = event_type & EVENT_ALL;
  	bool cpu_event = !!(event_type & EVENT_CPU);
+	int mux = 0;

  	/*
  	 * If pinned groups are involved, flexible groups also need to be
@@ -2593,11 +2589,11 @@ static void ctx_resched(struct perf_cpu_context 
*cpuctx,
  	 *  - otherwise, do nothing more.
  	 */
  	if (cpu_event)
-		cpu_ctx_sched_out(cpuctx, ctx_event_type);
+		cpu_ctx_sched_out(cpuctx, ctx_event_type, mux);
  	else if (ctx_event_type & EVENT_PINNED)
-		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);

-	perf_event_sched_in(cpuctx, task_ctx, current);
+	perf_event_sched_in(cpuctx, task_ctx, current, mux);
  	perf_pmu_enable(cpuctx->ctx.pmu);
  }

@@ -2615,6 +2611,7 @@ static int  __perf_install_in_context(void *info)
  	struct perf_event_context *task_ctx = cpuctx->task_ctx;
  	bool reprogram = true;
  	int ret = 0;
+	int mux = 0;

  	raw_spin_lock(&cpuctx->ctx.lock);
  	if (ctx->task) {
@@ -2641,7 +2638,7 @@ static int  __perf_install_in_context(void *info)
  	}

  	if (reprogram) {
-		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
  		add_event_to_ctx(event, ctx);
  		ctx_resched(cpuctx, task_ctx, get_event_type(event));
  	} else {
@@ -2777,13 +2774,14 @@ static void __perf_event_enable(struct 
perf_event *event,
  {
  	struct perf_event *leader = event->group_leader;
  	struct perf_event_context *task_ctx;
+	int mux = 0;

  	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
  	    event->state <= PERF_EVENT_STATE_ERROR)
  		return;

  	if (ctx->is_active)
-		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);

  	__perf_event_mark_enabled(event);

@@ -2793,7 +2791,7 @@ static void __perf_event_enable(struct perf_event 
*event,
  	if (!event_filter_match(event)) {
  		if (is_cgroup_event(event))
  			perf_cgroup_defer_enabled(event);
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
  		return;
  	}

@@ -2802,7 +2800,7 @@ static void __perf_event_enable(struct perf_event 
*event,
  	 * then don't put it on unless the group is on.
  	 */
  	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
  		return;
  	}

@@ -2998,10 +2996,9 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);

  static void ctx_sched_out(struct perf_event_context *ctx,
  			  struct perf_cpu_context *cpuctx,
-			  enum event_type_t event_type)
+			  enum event_type_t event_type, int mux)
  {
  	int is_active = ctx->is_active;
-	struct perf_event *event;
  	struct group_sched_params params = {
  			.cpuctx = cpuctx,
  			.ctx = ctx,
@@ -3054,13 +3051,27 @@ static void ctx_sched_out(struct 
perf_event_context *ctx,

  	perf_pmu_disable(ctx->pmu);
  	if (is_active & EVENT_PINNED) {
-		perf_event_groups_iterate(&ctx->pinned_groups,
-				group_sched_out_callback, &params);
+		if (mux) {
+			perf_event_groups_iterate_cpu(&ctx->pinned_groups, -1,
+					group_sched_out_callback, &params);
+			perf_event_groups_iterate_cpu(&ctx->pinned_groups, cpu,
+					group_sched_out_callback, &params);
+		} else {
+			perf_event_groups_iterate(&ctx->pinned_groups,
+					group_sched_out_callback, &params);
+		}
  	}

  	if (is_active & EVENT_FLEXIBLE) {
-		perf_event_groups_iterate(&ctx->flexible_groups,
-				group_sched_out_callback, &params);
+		if (mux) {
+			perf_event_groups_iterate_cpu(&ctx->flexible_groups, -1,
+					group_sched_out_callback, &params);
+			perf_event_groups_iterate_cpu(&ctx->flexible_groups, cpu,
+					group_sched_out_callback, &params);
+		} else {
+			perf_event_groups_iterate(&ctx->flexible_groups,
+					group_sched_out_callback, &params);
+		}
  	}
  	perf_pmu_enable(ctx->pmu);
  }
@@ -3349,16 +3360,16 @@ void __perf_event_task_sched_out(struct 
task_struct *task,
   * Called with IRQs disabled
   */
  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			      enum event_type_t event_type)
+			      enum event_type_t event_type, int mux)
  {
-	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, mux);
  }

  static void
  ctx_sched_in(struct perf_event_context *ctx,
  	     struct perf_cpu_context *cpuctx,
  	     enum event_type_t event_type,
-	     struct task_struct *task)
+	     struct task_struct *task, int mux)
  {
  	int is_active = ctx->is_active;
  	struct group_sched_params params = {
@@ -3394,31 +3405,54 @@ ctx_sched_in(struct perf_event_context *ctx,
  	 * in order to give them the best chance of going on.
  	 */
  	if (is_active & EVENT_PINNED) {
-		perf_event_groups_iterate(&ctx->pinned_groups,
-				ctx_pinned_sched_in_callback, &params);
+		if (mux) {
+			perf_event_groups_iterate_cpu(&ctx->pinned_groups,
+					-1, ctx_pinned_sched_in_callback,
+					&params);
+			perf_event_groups_iterate_cpu(&ctx->pinned_groups,
+					cpu, ctx_pinned_sched_in_callback,
+					&params);
+		} else {
+			perf_event_groups_iterate(&ctx->pinned_groups,
+					ctx_pinned_sched_in_callback,
+					&params);
+		}
  	}

  	/* Then walk through the lower prio flexible groups */
  	if (is_active & EVENT_FLEXIBLE) {
-		params.can_add_hw = 1;
-		perf_event_groups_iterate(&ctx->flexible_groups,
-				ctx_flexible_sched_in_callback, &params);
+		if (mux) {
+			params.can_add_hw = 1;
+			perf_event_groups_iterate_cpu(&ctx->flexible_groups,
+					-1, ctx_flexible_sched_in_callback,
+					&params);
+			params.can_add_hw = 1;
+			perf_event_groups_iterate_cpu(&ctx->flexible_groups,
+					cpu, ctx_flexible_sched_in_callback,
+					&params);
+		} else {
+			params.can_add_hw = 1;
+			perf_event_groups_iterate(&ctx->flexible_groups,
+					ctx_flexible_sched_in_callback,
+					&params);
+		}
  	}
  }

  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
  			     enum event_type_t event_type,
-			     struct task_struct *task)
+			     struct task_struct *task, int mux)
  {
  	struct perf_event_context *ctx = &cpuctx->ctx;

-	ctx_sched_in(ctx, cpuctx, event_type, task);
+	ctx_sched_in(ctx, cpuctx, event_type, task, mux);
  }

  static void perf_event_context_sched_in(struct perf_event_context *ctx,
  					struct task_struct *task)
  {
  	struct perf_cpu_context *cpuctx;
+	int mux = 0;

  	cpuctx = __get_cpu_context(ctx);
  	if (cpuctx->task_ctx == ctx)
@@ -3435,8 +3469,8 @@ static void perf_event_context_sched_in(struct 
perf_event_context *ctx,
  	 * events, no need to flip the cpuctx's events around.
  	 */
  	if (!perf_event_groups_empty(&ctx->pinned_groups))
-		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-	perf_event_sched_in(cpuctx, ctx, task);
+		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
+	perf_event_sched_in(cpuctx, ctx, task, mux);
  	perf_pmu_enable(ctx->pmu);
  	perf_ctx_unlock(cpuctx, ctx);
  }
@@ -3681,6 +3715,7 @@ static int perf_rotate_context(struct 
perf_cpu_context *cpuctx)
  {
  	struct perf_event_context *ctx = NULL;
  	int rotate = 0;
+	int mux = 1;

  	if (cpuctx->ctx.nr_events) {
  		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
@@ -3699,15 +3734,15 @@ static int perf_rotate_context(struct 
perf_cpu_context *cpuctx)
  	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
  	perf_pmu_disable(cpuctx->ctx.pmu);

-	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
  	if (ctx)
-		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE, mux);

  	rotate_ctx(&cpuctx->ctx);
  	if (ctx)
  		rotate_ctx(ctx);

-	perf_event_sched_in(cpuctx, ctx, current);
+	perf_event_sched_in(cpuctx, ctx, current, mux);

  	perf_pmu_enable(cpuctx->ctx.pmu);
  	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3759,6 +3794,7 @@ static void perf_event_enable_on_exec(int ctxn)
  	struct perf_event *event;
  	unsigned long flags;
  	int enabled = 0;
+	int mux = 0;

  	local_irq_save(flags);
  	ctx = current->perf_event_ctxp[ctxn];
@@ -3767,7 +3803,7 @@ static void perf_event_enable_on_exec(int ctxn)

  	cpuctx = __get_cpu_context(ctx);
  	perf_ctx_lock(cpuctx, ctx);
-	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+	ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
  	list_for_each_entry(event, &ctx->event_list, event_entry) {
  		enabled |= event_enable_on_exec(event, ctx);
  		event_type |= get_event_type(event);
@@ -3780,7 +3816,7 @@ static void perf_event_enable_on_exec(int ctxn)
  		clone_ctx = unclone_ctx(ctx);
  		ctx_resched(cpuctx, ctx, event_type);
  	} else {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
  	}
  	perf_ctx_unlock(cpuctx, ctx);

@@ -11101,7 +11137,6 @@ static int perf_event_init_context(struct 
task_struct *child, int ctxn)
  {
  	struct perf_event_context *child_ctx, *parent_ctx;
  	struct perf_event_context *cloned_ctx;
-	struct perf_event *event;
  	struct task_struct *parent = current;
  	unsigned long flags;
  	int ret = 0;