Message-ID: <20250621235745.3994-1-atomlin@atomlin.com>
Date: Sat, 21 Jun 2025 19:57:45 -0400
From: Aaron Tomlin <atomlin@...mlin.com>
To: tglx@...utronix.de,
mingo@...hat.com,
bp@...en8.de,
dave.hansen@...ux.intel.com,
x86@...nel.org,
peterz@...radead.org,
juri.lelli@...hat.com,
vincent.guittot@...aro.org
Cc: hpa@...or.com,
oleg@...hat.com,
atomlin@...mlin.com,
dietmar.eggemann@....com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
linux-kernel@...r.kernel.org
Subject: [RFC PATCH] sched: idle: Introduce CPU-specific idle=poll

Currently, the idle=poll kernel boot parameter applies globally, forcing
all CPUs into a shallow polling idle state to ensure ultra-low latency
responsiveness. While this is beneficial for extremely latency-sensitive
workloads, this global application lacks flexibility and can lead to
significant power inefficiency. This is particularly evident in systems
with a high CPU count, such as those utilising the
Full Dynticks/Adaptive Tick feature (i.e., nohz_full). In such
environments, only a subset of CPUs might genuinely require
sub-microsecond responsiveness, while others, though active, could
benefit from entering deeper idle states to conserve power.

This patch addresses this limitation by introducing the ability to
configure idle=poll on a per-CPU basis. This new feature allows
administrators to specifically designate which CPUs are permitted to
remain in the polling idle state.
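
For example (the CPU list below is purely illustrative), to keep only
CPUs 2-26 in the polling idle state while leaving all other CPUs free
to enter deeper idle states, the system would be booted with:

  idle=poll,2-26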

This provides a critical improvement in power efficiency by enabling a
more nuanced power management strategy. CPUs running workloads with
stringent ultra-low-latency requirements can continue to benefit from
idle=poll, while other CPUs that are in Full Dynticks mode, but not
constantly busy,
can dynamically enter deeper, power-saving idle states. This granular
control offers significantly enhanced flexibility and efficiency
compared to the previous system-wide limitation of idle=poll.

Consider a CPU configured in Full Dynticks mode with idle=poll. A
"perf report" from such a system, even when the CPU is largely idle,
frequently reveals the following dominant activity:

  99.70% swapper [kernel.kallsyms] [k] cpu_idle_poll.isra.0
   0.10% swapper [kernel.kallsyms] [k] sched_tick
   0.10% swapper [kernel.kallsyms] [k] native_read_msr
   0.10% swapper [kernel.kallsyms] [k] native_sched_clock

The high percentage of time spent in cpu_idle_poll() indicates the CPU is
spending virtually all its time busy-looping in a shallow idle state.
This behavior, while ensuring responsiveness, directly translates to
substantial, unnecessary power consumption for CPUs that are not
"actively" processing latency-critical workloads.

Now consider nohz_full=2-47 and idle=poll,2-26. This setup aims to
strike a balance between extreme performance for latency-critical
workloads and significant energy efficiency for the rest of the
system:
- Dedicated Responsiveness. Cores 2-26 provide unparalleled low
  latency for the most critical workloads by remaining in constant
  polling, consciously trading increased power consumption for
  absolute speed and predictability.
- Significant Power Savings. Cores 27-47 achieve substantial energy
  conservation because, when idle, the cpuidle governor is free to
  select deeper power-saving states, directly addressing the power
  waste observed in the perf report above (a rough way to confirm
  this is sketched after this list).
- Enhanced Flexibility. This approach avoids the previous
  "all-or-nothing" trade-off inherent in a global idle=poll setting. It
  gives administrators fine-grained control, enabling a precisely
  tuned power and performance profile for specific application needs
  and optimising resource utilisation across the entire 48-core
  system.
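
One rough way to confirm that CPUs outside the polling set (for
example CPU 30 in the scenario above) still reach deeper idle states
is to watch the cpuidle residency counters, assuming a cpuidle driver
is active on the system:

  # cd /sys/devices/system/cpu/cpu30/cpuidle
  # grep . state*/name state*/usage
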
Signed-off-by: Aaron Tomlin <atomlin@...mlin.com>
---
arch/x86/kernel/process.c | 27 +++++++++++++++++++++++----
include/linux/cpu.h | 1 +
kernel/sched/idle.c | 33 ++++++++++++++++++++-------------
3 files changed, 44 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c1d2dac72b9c..43d0cc2bed73 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -973,15 +973,34 @@ void __init arch_post_acpi_subsys_init(void)
pr_info("System has AMD C1E erratum E400. Workaround enabled.\n");
}
+cpumask_var_t idle_poll_mask;
+EXPORT_SYMBOL_GPL(idle_poll_mask);
+
+static int __init idle_poll_setup(char *str)
+{
+ int err = 0;
+
+ if (cpulist_parse(str, idle_poll_mask) < 0) {
+ pr_warn("idle poll: incorrect CPU range\n");
+ err = 1;
+ } else {
+ boot_option_idle_override = IDLE_POLL;
+ cpu_idle_poll_update(idle_poll_mask);
+ }
+
+ return err;
+}
+
static int __init idle_setup(char *str)
{
if (!str)
return -EINVAL;
- if (!strcmp(str, "poll")) {
- pr_info("using polling idle threads\n");
- boot_option_idle_override = IDLE_POLL;
- cpu_idle_poll_ctrl(true);
+ if (!strncmp(str, "poll,", 5)) {
+ str += 5;
+ idle_poll_setup(str);
+ } else if (!strcmp(str, "poll")) {
+ cpu_idle_poll_update(cpu_present_mask);
} else if (!strcmp(str, "halt")) {
/* 'idle=halt' HALT for idle. C-states are disabled. */
boot_option_idle_override = IDLE_HALT;
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index e6089abc28e2..ce909b1839c9 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -164,6 +164,7 @@ static inline void suspend_enable_secondary_cpus(void) { }
void __noreturn cpu_startup_entry(enum cpuhp_state state);
void cpu_idle_poll_ctrl(bool enable);
+void cpu_idle_poll_update(const struct cpumask *mask);
bool cpu_in_idle(unsigned long pc);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2c85c86b455f..86365bbbc111 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -19,22 +19,29 @@ void sched_idle_set_state(struct cpuidle_state *idle_state)
idle_set_state(this_rq(), idle_state);
}
-static int __read_mostly cpu_idle_force_poll;
+static DEFINE_PER_CPU(int, idle_force_poll);
void cpu_idle_poll_ctrl(bool enable)
{
if (enable) {
- cpu_idle_force_poll++;
- } else {
- cpu_idle_force_poll--;
- WARN_ON_ONCE(cpu_idle_force_poll < 0);
- }
+ this_cpu_inc(idle_force_poll);
+ } else
+ WARN_ON_ONCE(this_cpu_dec_return(idle_force_poll) < 0);
+}
+
+void cpu_idle_poll_update(const struct cpumask *mask)
+{
+ int cpu;
+
+ pr_info_once("using polling idle threads\n");
+ for_each_cpu(cpu, mask)
+ per_cpu(idle_force_poll, cpu) = 1;
}
#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
static int __init cpu_idle_poll_setup(char *__unused)
{
- cpu_idle_force_poll = 1;
+ cpu_idle_poll_update(cpu_present_mask);
return 1;
}
@@ -42,8 +49,6 @@ __setup("nohlt", cpu_idle_poll_setup);
static int __init cpu_idle_nopoll_setup(char *__unused)
{
- cpu_idle_force_poll = 0;
-
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
@@ -51,14 +56,16 @@ __setup("hlt", cpu_idle_nopoll_setup);
static noinline int __cpuidle cpu_idle_poll(void)
{
+ int cpu = smp_processor_id();
+
instrumentation_begin();
- trace_cpu_idle(0, smp_processor_id());
+ trace_cpu_idle(0, cpu);
stop_critical_timings();
ct_cpuidle_enter();
raw_local_irq_enable();
while (!tif_need_resched() &&
- (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ (per_cpu(idle_force_poll, cpu) || tick_check_broadcast_expired()))
cpu_relax();
raw_local_irq_disable();
@@ -78,7 +85,7 @@ void __weak arch_cpu_idle_exit(void) { }
void __weak __noreturn arch_cpu_idle_dead(void) { while (1); }
void __weak arch_cpu_idle(void)
{
- cpu_idle_force_poll = 1;
+ this_cpu_inc(idle_force_poll);
}
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE
@@ -318,7 +325,7 @@ static void do_idle(void)
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
*/
- if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+ if (__this_cpu_read(idle_force_poll) || tick_check_broadcast_expired()) {
tick_nohz_idle_restart_tick();
cpu_idle_poll();
} else {
--
2.49.0