Message-Id: <20230112162426.217522-1-bristot@kernel.org>
Date:   Thu, 12 Jan 2023 17:24:26 +0100
From:   Daniel Bristot de Oliveira <bristot@...nel.org>
To:     linux-kernel@...r.kernel.org, Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>
Cc:     Daniel Bristot de Oliveira <bristot@...nel.org>,
        Juri Lelli <juri.lelli@...hat.com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        Valentin Schneider <vschneid@...hat.com>,
        Joe Mario <jmario@...hat.com>
Subject: [PATCH] sched/idle: Make idle poll dynamic per-cpu

idle=poll is frequently used on ultra-low-latency systems. Examples of
such systems are high-performance trading and 5G vRAN. The performance
gain comes from avoiding the idle driver machinery and from keeping the
CPU always in an active state - avoiding (odd) hardware heuristics that
are outside the control of the OS.

Currently, idle=poll is an all-or-nothing static option defined at
boot time. The motivation for making this option dynamic and per-cpu
is twofold:

  1) Reduce the power usage/heat by allowing only selected CPUs to
     do idle polling;
  2) Allow multi-tenant systems (e.g., Kubernetes) to enable idle
     polling only when ultra-low-latency applications are present
     on specific CPUs.

Joe Mario ran some experiments with this option enabled, and the results
were significant. For example, with dynamic idle polling enabled on
selected CPUs, cyclictest performance is as good as with idle=poll,
while CPU power consumption drops from 381 to 233 watts.

Also, limiting idle=poll to the set of CPUs that benefit from it
allows the other CPUs to benefit from frequency boosts. Joe also
showed that the round-trip improvement can be on the order of 80 nsec
when system-wide idle=poll is not used.

The user can enable idle polling with this command:
  # echo 1 > /sys/devices/system/cpu/cpu{CPU_ID}/idle_poll

And disable it via:
  # echo 0 > /sys/devices/system/cpu/cpu{CPU_ID}/idle_poll
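
For illustration, if the latency-sensitive workload were pinned to
CPUs 2-5 (the CPU list here is only an example), the knob could be
driven from a small shell loop and read back to confirm the setting:

  # for cpu in 2 3 4 5; do echo 1 > /sys/devices/system/cpu/cpu$cpu/idle_poll; done
  # cat /sys/devices/system/cpu/cpu2/idle_poll
  1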

By default, all CPUs have idle polling disabled (the current behavior).
A static key avoids the cpumask check overhead when idle polling is not
enabled on any CPU.
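
The default can be checked the same way; on an otherwise untouched
system every file reads 0 (output shown for an illustrative 4-CPU
machine):

  # grep . /sys/devices/system/cpu/cpu*/idle_poll
  /sys/devices/system/cpu/cpu0/idle_poll:0
  /sys/devices/system/cpu/cpu1/idle_poll:0
  /sys/devices/system/cpu/cpu2/idle_poll:0
  /sys/devices/system/cpu/cpu3/idle_poll:0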

Cc: Ingo Molnar <mingo@...hat.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Juri Lelli <juri.lelli@...hat.com>
Cc: Vincent Guittot <vincent.guittot@...aro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@....com>
Cc: Steven Rostedt <rostedt@...dmis.org>
Cc: Ben Segall <bsegall@...gle.com>
Cc: Mel Gorman <mgorman@...e.de>
Cc: Daniel Bristot de Oliveira <bristot@...hat.com>
Cc: Valentin Schneider <vschneid@...hat.com>
Cc: Joe Mario <jmario@...hat.com>
Signed-off-by: Daniel Bristot de Oliveira <bristot@...nel.org>
---
 kernel/sched/idle.c | 97 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 93 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f26ab2675f7d..c6ef1322d549 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -10,6 +10,91 @@
 /* Linker adds these: start and end of __cpuidle functions */
 extern char __cpuidle_text_start[], __cpuidle_text_end[];
 
+/*
+ * per-cpu idle polling selector.
+ */
+static struct cpumask cpu_poll_mask;
+DEFINE_STATIC_KEY_FALSE(cpu_poll_enabled);
+
+/*
+ * Protects the mask/static key relation.
+ */
+DEFINE_MUTEX(cpu_poll_mutex);
+
+static ssize_t idle_poll_store(struct device *dev, struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	int cpu = dev->id;
+	int retval, set;
+	bool val;
+
+	retval = kstrtobool(buf, &val);
+	if (retval)
+		return retval;
+
+	mutex_lock(&cpu_poll_mutex);
+
+	if (val) {
+		set = cpumask_test_and_set_cpu(cpu, &cpu_poll_mask);
+
+		/*
+		 * If the CPU was already on, do not increase the static key usage.
+		 */
+		if (!set)
+			static_branch_inc(&cpu_poll_enabled);
+	} else {
+		set = cpumask_test_and_clear_cpu(cpu, &cpu_poll_mask);
+
+		/*
+		 * If the CPU was already off, do not decrease the static key usage.
+		 */
+		if (set)
+			static_branch_dec(&cpu_poll_enabled);
+	}
+
+	mutex_unlock(&cpu_poll_mutex);
+
+	return count;
+}
+
+static ssize_t idle_poll_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", cpumask_test_cpu(dev->id, &cpu_poll_mask));
+}
+
+static DEVICE_ATTR_RW(idle_poll);
+
+static const struct attribute *idle_poll_attrs[] = {
+	&dev_attr_idle_poll.attr,
+	NULL
+};
+
+static int __init idle_poll_sysfs_init(void)
+{
+	int cpu, retval;
+
+	for_each_possible_cpu(cpu) {
+		struct device *dev = get_cpu_device(cpu);
+
+		if (!dev)
+			continue;
+		retval = sysfs_create_files(&dev->kobj, idle_poll_attrs);
+		if (retval)
+			return retval;
+	}
+
+	return 0;
+}
+device_initcall(idle_poll_sysfs_init);
+
+static int is_cpu_idle_poll(int cpu)
+{
+	if (static_branch_unlikely(&cpu_poll_enabled))
+		return cpumask_test_cpu(cpu, &cpu_poll_mask);
+
+	return 0;
+}
+
 /**
  * sched_idle_set_state - Record idle state for the current CPU.
  * @idle_state: State to record.
@@ -51,18 +136,21 @@ __setup("hlt", cpu_idle_nopoll_setup);
 
 static noinline int __cpuidle cpu_idle_poll(void)
 {
-	trace_cpu_idle(0, smp_processor_id());
+	int cpu = smp_processor_id();
+
+	trace_cpu_idle(0, cpu);
 	stop_critical_timings();
 	ct_idle_enter();
 	local_irq_enable();
 
 	while (!tif_need_resched() &&
-	       (cpu_idle_force_poll || tick_check_broadcast_expired()))
+	       (cpu_idle_force_poll || tick_check_broadcast_expired()
+		|| is_cpu_idle_poll(cpu)))
 		cpu_relax();
 
 	ct_idle_exit();
 	start_critical_timings();
-	trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+	trace_cpu_idle(PWR_EVENT_EXIT, cpu);
 
 	return 1;
 }
@@ -296,7 +384,8 @@ static void do_idle(void)
 		 * broadcast device expired for us, we don't want to go deep
 		 * idle as we know that the IPI is going to arrive right away.
 		 */
-		if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+		if (cpu_idle_force_poll || tick_check_broadcast_expired()
+		    || is_cpu_idle_poll(cpu)) {
 			tick_nohz_idle_restart_tick();
 			cpu_idle_poll();
 		} else {
-- 
2.38.1
