Date:   Thu,  6 Jul 2017 10:49:48 +0100
From:   Dietmar Eggemann <dietmar.eggemann@....com>
To:     linux-kernel@...r.kernel.org
Cc:     linux-pm@...r.kernel.org, linux@....linux.org.uk,
        Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
        Russell King <rmk+kernel@...linux.org.uk>,
        Catalin Marinas <catalin.marinas@....com>,
        Will Deacon <will.deacon@....com>,
        Juri Lelli <juri.lelli@....com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Peter Zijlstra <peterz@...radead.org>,
        Morten Rasmussen <morten.rasmussen@....com>,
        Viresh Kumar <viresh.kumar@...aro.org>,
        "Rafael J . Wysocki" <rjw@...ysocki.net>
Subject: [PATCH v2 10/10] drivers base/arch_topology: inline cpu- and frequency-invariant accounting

To speed up the cpu- and frequency-invariant accounting of the task
scheduler, make sure that the CIE (topology_get_cpu_scale()) and FIE
(topology_get_freq_scale()) accessors are completely inlined into the
task scheduler consumer functions (e.g. __update_load_avg_se()).

This patch changes the interface for CIE and FIE from:

drivers/base/arch_topology.c:

 static DEFINE_PER_CPU(unsigned long, item);

 unsigned long topology_get_item_scale(...)
 {
 	return per_cpu(item, cpu);
 }

include/linux/arch_topology.h:

 unsigned long topology_get_item_scale(...);

to:

drivers/base/arch_topology.c:

 DEFINE_PER_CPU(unsigned long, item);

include/linux/arch_topology.h:

 DECLARE_PER_CPU(unsigned long, item);

 static inline
 unsigned long topology_get_item_scale(...)
 {
 	return per_cpu(item, cpu);
 }
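
For illustration only (not part of this patch): with the accessors now
static inline, an arch just has to alias the scheduler hooks to them in
its asm/topology.h, as the arm/arm64 wire-up patches earlier in this
series do:

 #include <linux/arch_topology.h>

 #define arch_scale_cpu_capacity topology_get_cpu_scale
 #define arch_scale_freq_capacity topology_get_freq_scale

so that e.g. accumulate_sum() ends up with two plain per-cpu loads
instead of two out-of-line calls into drivers/base/arch_topology.o.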

A performance uplift could be measured by running the kernel with the
following test patch applied on top (on JUNO R0 (arm64)):

@@ -2812,10 +2812,18 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
	unsigned long scale_freq, scale_cpu;
	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
	u64 periods;
+       u64 t1, t2;
+
+       t1 = sched_clock_cpu(cpu);

	scale_freq = arch_scale_freq_capacity(NULL, cpu);
	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);

+       t2 = sched_clock_cpu(cpu);
+
+       trace_printk("cpu=%d t1=%llu t2=%llu diff=%llu\n",
+                    cpu, t1, t2, t2 - t1);
+
	delta += sa->period_contrib;
	periods = delta / 1024; /* A period is 1024us (~1ms) */

The following test results (3 test runs each) were obtained by tracing
the diff=x value (in ns) of this trace_printk() for Cortex-A53 (LITTLE)
and Cortex-A57 (big) cpus w/ (inline) and w/o (non-inline) this patch.

         mean  max  min

A-57 inline:

	119.6  300   60
	 96.8  280   60
	110.2  660   60

A-57 non-inline:

	142.8  460   80
	157.6  680   80
	153.4  720   80

A-53 inline:

	141.6  360  100
	118.8  500  100
	148.6  380  100

A-53 non-inline:

	293    840  120
	253.2  840  120
	299.6 1060  140
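
One way (hypothetical helper, not part of this patch or the test setup)
to reduce the diff=x samples from the trace output to mean/max/min
values like the ones above is a small post-processing program along
these lines:

 /* reads trace lines on stdin, prints mean/max/min of the diff= field */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 int main(void)
 {
 	unsigned long long v, sum = 0, max = 0, min = -1ULL, n = 0;
 	char line[512];

 	while (fgets(line, sizeof(line), stdin)) {
 		char *p = strstr(line, "diff=");

 		if (!p)
 			continue;
 		v = strtoull(p + 5, NULL, 10);
 		sum += v;
 		if (v > max)
 			max = v;
 		if (v < min)
 			min = v;
 		n++;
 	}

 	if (n)
 		printf("mean=%.1f max=%llu min=%llu\n",
 		       (double)sum / n, max, min);

 	return 0;
 }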

Cc: Greg Kroah-Hartman <gregkh@...uxfoundation.org>
Cc: Juri Lelli <juri.lelli@....com>
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@....com>
---
 drivers/base/arch_topology.c  | 14 ++------------
 include/linux/arch_topology.h | 15 +++++++++++++--
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 63fb3f945d21..b4481cff14bf 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -22,12 +22,7 @@
 #include <linux/string.h>
 #include <linux/sched/topology.h>
 
-static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
-
-unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu)
-{
-	return per_cpu(freq_scale, cpu);
-}
+DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
 
 void topology_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
 			     unsigned long max_freq)
@@ -43,12 +38,7 @@ void topology_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
 
 
 static DEFINE_MUTEX(cpu_scale_mutex);
-static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
-
-unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu)
-{
-	return per_cpu(cpu_scale, cpu);
-}
+DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
 
 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
 {
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 168104d2d2cf..361e85a30151 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -11,12 +11,23 @@ void topology_normalize_cpu_scale(void);
 struct device_node;
 int topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu);
 
+DECLARE_PER_CPU(unsigned long, cpu_scale);
+DECLARE_PER_CPU(unsigned long, freq_scale);
+
 struct sched_domain;
-unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu);
+static inline
+unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu)
+{
+	return per_cpu(cpu_scale, cpu);
+}
 
 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
 
-unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu);
+static inline
+unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu)
+{
+	return per_cpu(freq_scale, cpu);
+}
 
 void topology_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
 			     unsigned long max_freq);
-- 
2.11.0
