Date:	Mon, 01 Aug 2016 01:38:34 +0200
From:	"Rafael J. Wysocki" <rjw@...ysocki.net>
To:	Linux PM list <linux-pm@...r.kernel.org>
Cc:	Peter Zijlstra <peterz@...radead.org>,
	Srinivas Pandruvada <srinivas.pandruvada@...ux.intel.com>,
	Viresh Kumar <viresh.kumar@...aro.org>,
	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
	Steve Muckle <steve.muckle@...aro.org>,
	Juri Lelli <juri.lelli@....com>, Ingo Molnar <mingo@...nel.org>
Subject: [RFC][PATCH 7/7] cpufreq: intel_pstate: Change P-state selection algorithm for Core

From: Rafael J. Wysocki <rafael.j.wysocki@...el.com>

The PID-based P-state selection algorithm used by intel_pstate for
Core processors rests on very weak foundations.  Namely, its
decisions are driven mostly by the values of the APERF and MPERF
feedback registers, and it only estimates the actual utilization to
check that it is not extremely low (in order to avoid getting stuck
in the highest P-state in that case).
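
For reference, the metric in question amounts roughly to the
following simplified sketch of what
get_target_pstate_use_performance() computes (an illustration under
that assumption, not the exact driver code):

#include <linux/math64.h>

/*
 * APERF counts cycles at the frequency actually delivered and MPERF
 * at the guaranteed (maximum non-turbo) frequency, both only while
 * the CPU is not idle, so their ratio reflects the performance
 * delivered rather than how busy the CPU was.
 */
static u64 old_core_busy_sketch(u64 aperf, u64 mperf,
				int max_pstate, int current_pstate)
{
	u64 core_busy = div64_u64(100 * aperf, mperf);

	/* Rescale against the P-state currently requested. */
	return div64_u64(core_busy * max_pstate, current_pstate);
}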

Since it generally causes the CPU P-state to ramp up quickly, it
leads to satisfactory performance, but the metric it relies on is
only really valid when the CPU changes P-states by itself (i.e. in
the turbo range) and when the P-state value set by the driver is
treated by the CPU as the upper limit on the turbo P-states it
selects.

As a result, the only case in which that algorithm reduces the
P-state is when the CPU has just come out of idle, but in that
particular case it would have been better to bump the P-state up
instead.  That causes some benchmarks to behave erratically, and
attempts to improve the situation lead to excessive energy
consumption, because they make the CPU stay in very high P-states
almost all the time.

Consequently, the only viable way to fix that is to replace the
erroneous algorithm entirely with a better one.

To that end, notice that setting the P-state proportional to the
actual CPU utilization (measured with the help of MPERF and TSC)
generally leads to reasonable behavior, but it does not reflect
the "performance boosting" nature of the current P-state
selection algorithm.  It may be made more similar to that
algorithm, though, by adding iowait boosting to it.
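
For illustration, with the formula used below and a hypothetical
turbo_pstate of 32, a measured utilization of 50% (busy_frac = 0.5)
yields a target of fp_toint((32 + 32/4) * 0.5) = 20, so the target
P-state is proportional to the utilization with 25% of headroom on
top of it, and full utilization maps slightly above the maximum
turbo P-state.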

Specifically, if the P-state is bumped up to the maximum after
the UUF_IO flag has been received via cpufreq_update_util(), tasks
that were previously waiting on I/O will get the full capacity of
the CPU when they are ready to process data again, and that should
lead to the desired performance increase overall without
sacrificing too much energy.
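
In the implementation below, the boost is a decaying floor on the
measured utilization: UUF_IO sets iowait_boost to the fixed-point
equivalent of 1 (i.e. 100% utilization), every subsequent sample
uses it as a lower bound on busy_frac and then halves it, and it is
cleared altogether if the CPU appears to have been idle for longer
than a tick.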

For this reason, use the above approach for Core processors in
intel_pstate.

Original-by: Srinivas Pandruvada <srinivas.pandruvada@...ux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@...el.com>
---
 drivers/cpufreq/intel_pstate.c |   43 +++++++++++++++++++++++++++++++++++++++--
 include/linux/sched.h          |    3 ++
 kernel/sched/sched.h           |    3 --
 3 files changed, 44 insertions(+), 5 deletions(-)

Index: linux-pm/drivers/cpufreq/intel_pstate.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/intel_pstate.c
+++ linux-pm/drivers/cpufreq/intel_pstate.c
@@ -181,6 +181,8 @@ struct _pid {
  * @cpu:		CPU number for this instance data
  * @update_util:	CPUFreq utility callback information
  * @update_util_set:	CPUFreq utility callback is set
+ * @iowait_boost:	iowait-related boost fraction
+ * @last_update:	Time of the last update.
  * @pstate:		Stores P state limits for this CPU
  * @vid:		Stores VID limits for this CPU
  * @pid:		Stores PID parameters for this CPU
@@ -206,6 +208,7 @@ struct cpudata {
 	struct vid_data vid;
 	struct _pid pid;
 
+	u64	last_update;
 	u64	last_sample_time;
 	u64	prev_aperf;
 	u64	prev_mperf;
@@ -216,6 +219,7 @@ struct cpudata {
 	struct acpi_processor_performance acpi_perf_data;
 	bool valid_pss_table;
 #endif
+	unsigned int iowait_boost;
 };
 
 static struct cpudata **all_cpu_data;
@@ -229,6 +233,7 @@ static struct cpudata **all_cpu_data;
  * @p_gain_pct:		PID proportional gain
  * @i_gain_pct:		PID integral gain
  * @d_gain_pct:		PID derivative gain
+ * @boost_iowait:	Whether or not to use iowait boosting.
  *
  * Stores per CPU model static PID configuration data.
  */
@@ -240,6 +245,7 @@ struct pstate_adjust_policy {
 	int p_gain_pct;
 	int d_gain_pct;
 	int i_gain_pct;
+	bool boost_iowait;
 };
 
 /**
@@ -277,6 +283,7 @@ struct cpu_defaults {
 	struct pstate_funcs funcs;
 };
 
+static inline int32_t get_target_pstate_default(struct cpudata *cpu);
 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu);
 static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu);
 
@@ -1017,6 +1024,7 @@ static struct cpu_defaults core_params =
 		.p_gain_pct = 20,
 		.d_gain_pct = 0,
 		.i_gain_pct = 0,
+		.boost_iowait = true,
 	},
 	.funcs = {
 		.get_max = core_get_max_pstate,
@@ -1025,7 +1033,7 @@ static struct cpu_defaults core_params =
 		.get_turbo = core_get_turbo_pstate,
 		.get_scaling = core_get_scaling,
 		.get_val = core_get_val,
-		.get_target_pstate = get_target_pstate_use_performance,
+		.get_target_pstate = get_target_pstate_default,
 	},
 };
 
@@ -1290,6 +1298,24 @@ static inline int32_t get_target_pstate_
 	return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled);
 }
 
+static inline int32_t get_target_pstate_default(struct cpudata *cpu)
+{
+	struct sample *sample = &cpu->sample;
+	int32_t busy_frac;
+	int pstate;
+
+	busy_frac = div_fp(sample->mperf, sample->tsc);
+	sample->busy_scaled = busy_frac * 100;
+
+	if (busy_frac < cpu->iowait_boost)
+		busy_frac = cpu->iowait_boost;
+
+	cpu->iowait_boost >>= 1;
+
+	pstate = cpu->pstate.turbo_pstate;
+	return fp_toint((pstate + (pstate >> 2)) * busy_frac);
+}
+
 static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
 {
 	int max_perf, min_perf;
@@ -1332,8 +1358,21 @@ static void intel_pstate_update_util(str
 				     unsigned int flags)
 {
 	struct cpudata *cpu = container_of(data, struct cpudata, update_util);
-	u64 delta_ns = time - cpu->sample.time;
+	u64 delta_ns;
+
+	if (pid_params.boost_iowait) {
+		if (flags & UUF_IO) {
+			cpu->iowait_boost = int_tofp(1);
+		} else if (cpu->iowait_boost) {
+			/* Clear iowait_boost if the CPU may have been idle. */
+			delta_ns = time - cpu->last_update;
+			if (delta_ns > TICK_NSEC)
+				cpu->iowait_boost = 0;
+		}
+		cpu->last_update = time;
+	}
 
+	delta_ns = time - cpu->sample.time;
 	if ((s64)delta_ns >= pid_params.sample_rate_ns) {
 		bool sample_taken = intel_pstate_sample(cpu, time);
 
