lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:   Fri, 30 Dec 2016 18:31:15 -0800
From:   David Carrillo-Cisneros <davidcc@...gle.com>
To:     linux-kernel@...r.kernel.org
Cc:     "x86@...nel.org" <x86@...nel.org>, Ingo Molnar <mingo@...hat.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Andi Kleen <ak@...ux.intel.com>,
        Kan Liang <kan.liang@...el.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Borislav Petkov <bp@...e.de>,
        Srinivas Pandruvada <srinivas.pandruvada@...ux.intel.com>,
        Dave Hansen <dave.hansen@...ux.intel.com>,
        Vikas Shivappa <vikas.shivappa@...ux.intel.com>,
        Paul Turner <pjt@...gle.com>,
        Stephane Eranian <eranian@...gle.com>,
        David Carrillo-Cisneros <davidcc@...gle.com>
Subject: [PATCH] perf/core: only check cpuctx with cgroup events during cgroup switch

Following conversation in CQM/CMT last series about speeding up the
context switch for cgroup events:

https://patchwork.kernel.org/patch/9478617/

This is a low-hanging fruit optimization. It replaces the iteration over
the "pmus" list in cgroup switch by an iteration over a new list that
contains only cpuctxs with at least one cgroup event.

The iteration over pmus has become increasingly costly as the number of
pmus have increased. It is also heavily affected by cache contention.

Below some instrumentation measurements with 10, 50 and 90 percentiles
of the total cost of context switch at distinct levels of cache
contenton for a simple array read/write microbenchark.

Contention
   Level    Nr events      Before (us)            After (us)       Median
 L2    L3     types      (10%, 50%, 90%)       (10%, 50%, 90%     Speedup
--------------------------------------------------------------------------
Low   Low       1       (1.72, 2.42, 5.85)    (1.35, 1.64, 5.46)     29%
High  Low       1       (2.08, 4.56, 19.8)    (1720, 2.20, 13.7)     51%
High  High      1       (2.86, 10.4, 12.7)    (2.54, 4.32, 12.1)     58%

Low   Low       2       (1.98, 3.20, 6.89)    (1.68, 2.41, 8.89)     24%
High  Low       2       (2.48, 5.28, 22.4)    (2150, 3.69, 14.6)     30%
High  High      2       (3.32, 8.09, 13.9)    (2.80, 5.15, 13.7)     36%

where:
 1 event type  = cycles
 2 event types = cycles,intel_cqm/llc_occupancy/

Contetion L2 Low: workset < L2 cache size.
             High:  "    >> L2   "     " .
Contetion L3 Low: workset of task on all sockets < L3 cache size.
             High:   "     "   "   "   "    "   >> L3   "     " .

Median Speed up is (50%ile Before - 50%ile After) /  50%ile Before

Signed-off-by: David Carrillo-Cisneros <davidcc@...gle.com>
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 96 +++++++++++++++++++++-------------------------
 2 files changed, 44 insertions(+), 53 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecd..10d2235 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -788,6 +788,7 @@ struct perf_cpu_context {
 	struct pmu			*unique_pmu;
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
+	struct list_head		cgrp_cpuctx_entry;
 #endif
 
 	struct list_head		sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index faf073d..35c4fbc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -678,6 +678,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	info->timestamp = ctx->timestamp;
 }
 
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
 #define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
 #define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
 
@@ -690,61 +692,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 static void perf_cgroup_switch(struct task_struct *task, int mode)
 {
 	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
+	struct list_head *list;
 	unsigned long flags;
 
 	/*
-	 * disable interrupts to avoid geting nr_cgroup
-	 * changes via __perf_event_disable(). Also
-	 * avoids preemption.
+	 * Disable interrupts and preemption to avoid this CPU's
+	 * cgrp_cpuctx_entry to change under us.
 	 */
 	local_irq_save(flags);
 
-	/*
-	 * we reschedule only in the presence of cgroup
-	 * constrained events.
-	 */
+	list = this_cpu_ptr(&cgrp_cpuctx_list);
+	list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			continue; /* ensure we process each cpuctx once */
+		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+		perf_pmu_disable(cpuctx->ctx.pmu);
 
-		/*
-		 * perf_cgroup_events says at least one
-		 * context on this CPU has cgroup events.
-		 *
-		 * ctx->nr_cgroups reports the number of cgroup
-		 * events for a context.
-		 */
-		if (cpuctx->ctx.nr_cgroups > 0) {
-			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-			perf_pmu_disable(cpuctx->ctx.pmu);
-
-			if (mode & PERF_CGROUP_SWOUT) {
-				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-				/*
-				 * must not be done before ctxswout due
-				 * to event_filter_match() in event_sched_out()
-				 */
-				cpuctx->cgrp = NULL;
-			}
+		if (mode & PERF_CGROUP_SWOUT) {
+			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+			/*
+			 * must not be done before ctxswout due
+			 * to event_filter_match() in event_sched_out()
+			 */
+			cpuctx->cgrp = NULL;
+		}
 
-			if (mode & PERF_CGROUP_SWIN) {
-				WARN_ON_ONCE(cpuctx->cgrp);
-				/*
-				 * set cgrp before ctxsw in to allow
-				 * event_filter_match() to not have to pass
-				 * task around
-				 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-				 * because cgorup events are only per-cpu
-				 */
-				cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
-				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
-			}
-			perf_pmu_enable(cpuctx->ctx.pmu);
-			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		if (mode & PERF_CGROUP_SWIN) {
+			WARN_ON_ONCE(cpuctx->cgrp);
+			/*
+			 * set cgrp before ctxsw in to allow
+			 * event_filter_match() to not have to pass
+			 * task around
+			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
+			 * because cgorup events are only per-cpu
+			 */
+			cpuctx->cgrp = perf_cgroup_from_task(task,
+							     &cpuctx->ctx);
+			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 		}
+		perf_pmu_enable(cpuctx->ctx.pmu);
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
 
 	local_irq_restore(flags);
@@ -889,6 +876,7 @@ list_update_cgroup_event(struct perf_event *event,
 			 struct perf_event_context *ctx, bool add)
 {
 	struct perf_cpu_context *cpuctx;
+	struct list_head *lentry;
 
 	if (!is_cgroup_event(event))
 		return;
@@ -902,15 +890,16 @@ list_update_cgroup_event(struct perf_event *event,
 	 * this will always be called from the right CPU.
 	 */
 	cpuctx = __get_cpu_context(ctx);
-
-	/*
-	 * cpuctx->cgrp is NULL until a cgroup event is sched in or
-	 * ctx->nr_cgroup == 0 .
-	 */
-	if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
-		cpuctx->cgrp = event->cgrp;
-	else if (!add)
+	lentry = &cpuctx->cgrp_cpuctx_entry;
+	/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
+	if (add) {
+		list_add(lentry, this_cpu_ptr(&cgrp_cpuctx_list));
+		if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+			cpuctx->cgrp = event->cgrp;
+	} else {
+		list_del(lentry);
 		cpuctx->cgrp = NULL;
+	}
 }
 
 #else /* !CONFIG_CGROUP_PERF */
@@ -10595,6 +10584,7 @@ static void __init perf_event_init_all_cpus(void)
 		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
 		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 
+		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
 		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
-- 
2.8.0.rc3.226.g39d4020

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ