Message-ID: <20250621235745.3994-1-atomlin@atomlin.com>
Date: Sat, 21 Jun 2025 19:57:45 -0400
From: Aaron Tomlin <atomlin@...mlin.com>
To: tglx@...utronix.de,
mingo@...hat.com,
bp@...en8.de,
dave.hansen@...ux.intel.com,
x86@...nel.org,
peterz@...radead.org,
juri.lelli@...hat.com,
vincent.guittot@...aro.org
Cc: hpa@...or.com,
oleg@...hat.com,
atomlin@...mlin.com,
dietmar.eggemann@....com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
linux-kernel@...r.kernel.org
Subject: [RFC PATCH] sched: idle: Introduce CPU-specific idle=poll

Currently, the idle=poll kernel boot parameter applies globally, forcing
all CPUs into a shallow polling idle state to ensure ultra-low latency
responsiveness. While this is beneficial for extremely latency-sensitive
workloads, this global application lacks flexibility and can lead to
significant power inefficiency. This is particularly evident in systems
with a high CPU count, such as those utilising the
Full Dynticks/Adaptive Tick feature (i.e., nohz_full). In such
environments, only a subset of CPUs might genuinely require
sub-microsecond responsiveness, while others, though active, could
benefit from entering deeper idle states to conserve power.

This patch addresses this limitation by introducing the ability to
configure idle=poll on a per-CPU basis. This new feature allows
administrators to specifically designate which CPUs are permitted to
remain in the polling idle state.
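
For example (the CPU list below is purely illustrative), to keep only
CPUs 2-26 in the polling idle state while leaving all other CPUs free
to enter deeper idle states, the system would be booted with:

  idle=poll,2-26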

This provides a critical improvement in power efficiency by enabling a
more nuanced power management strategy. CPUs running workloads with
stringent ultra-low-latency requirements can continue to benefit from
idle=poll, while other CPUs that are in Full Dynticks mode, but not
constantly busy,
can dynamically enter deeper, power-saving idle states. This granular
control offers significantly enhanced flexibility and efficiency
compared to the previous system-wide limitation of idle=poll.

Consider a CPU configured in Full Dynticks mode with idle=poll. A
"perf report" from such a system, even when the CPU is largely idle,
frequently reveals the following dominant activity:

  99.70% swapper [kernel.kallsyms] [k] cpu_idle_poll.isra.0
   0.10% swapper [kernel.kallsyms] [k] sched_tick
   0.10% swapper [kernel.kallsyms] [k] native_read_msr
   0.10% swapper [kernel.kallsyms] [k] native_sched_clock

The high percentage of time spent in cpu_idle_poll() indicates the CPU is
spending virtually all its time busy-looping in a shallow idle state.
This behavior, while ensuring responsiveness, directly translates to
substantial, unnecessary power consumption for CPUs that are not
"actively" processing latency-critical workloads.

Now consider nohz_full=2-47 and idle=poll,2-26. This setup aims to
strike a balance between extreme performance for latency-critical
workloads and significant energy efficiency for the rest of the
system:
- Dedicated Responsiveness. Cores 2-26 provide unparalleled low
  latency for the most critical workloads by remaining in constant
  polling, consciously trading increased power consumption for
  absolute speed and predictability.
- Significant Power Savings. Cores 27-47 achieve substantial energy
  conservation because, when idle, the cpuidle governor is free to
  select deeper power-saving states, directly addressing the power
  waste observed in the perf report above (a rough way to confirm
  this is sketched after this list).
- Enhanced Flexibility. This approach avoids the previous
  "all-or-nothing" trade-off inherent in a global idle=poll setting. It
  gives administrators fine-grained control, enabling a precisely
  tuned power and performance profile for specific application needs
  and optimising resource utilisation across the entire 48-core
  system.
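
One rough way to confirm that CPUs outside the polling set (for
example CPU 30 in the scenario above) still reach deeper idle states
is to watch the cpuidle residency counters, assuming a cpuidle driver
is active on the system:

  # cd /sys/devices/system/cpu/cpu30/cpuidle
  # grep . state*/name state*/usage
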
Signed-off-by: Aaron Tomlin <atomlin@...mlin.com>
---
arch/x86/kernel/process.c | 27 +++++++++++++++++++++++----
include/linux/cpu.h | 1 +
kernel/sched/idle.c | 33 ++++++++++++++++++++-------------
3 files changed, 44 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c1d2dac72b9c..43d0cc2bed73 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -973,15 +973,34 @@ void __init arch_post_acpi_subsys_init(void)
pr_info("System has AMD C1E erratum E400. Workaround enabled.\n");
}
+cpumask_var_t idle_poll_mask;
+EXPORT_SYMBOL_GPL(idle_poll_mask);
+
+static int __init idle_poll_setup(char *str)
+{
+ int err = 0;
+
+ if (cpulist_parse(str, idle_poll_mask) < 0) {
+ pr_warn("idle poll: incorrect CPU range\n");
+ err = 1;
+ } else {
+ boot_option_idle_override = IDLE_POLL;
+ cpu_idle_poll_update(idle_poll_mask);
+ }
+
+ return err;
+}
+
static int __init idle_setup(char *str)
{
if (!str)
return -EINVAL;
- if (!strcmp(str, "poll")) {
- pr_info("using polling idle threads\n");
- boot_option_idle_override = IDLE_POLL;
- cpu_idle_poll_ctrl(true);
+ if (!strncmp(str, "poll,", 5)) {
+ str += 5;
+ idle_poll_setup(str);
+ } else if (!strcmp(str, "poll")) {
+ cpu_idle_poll_update(cpu_present_mask);
} else if (!strcmp(str, "halt")) {
/* 'idle=halt' HALT for idle. C-states are disabled. */
boot_option_idle_override = IDLE_HALT;
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index e6089abc28e2..ce909b1839c9 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -164,6 +164,7 @@ static inline void suspend_enable_secondary_cpus(void) { }
void __noreturn cpu_startup_entry(enum cpuhp_state state);
void cpu_idle_poll_ctrl(bool enable);
+void cpu_idle_poll_update(const struct cpumask *mask);
bool cpu_in_idle(unsigned long pc);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2c85c86b455f..86365bbbc111 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -19,22 +19,29 @@ void sched_idle_set_state(struct cpuidle_state *idle_state)
idle_set_state(this_rq(), idle_state);
}
-static int __read_mostly cpu_idle_force_poll;
+static DEFINE_PER_CPU(int, idle_force_poll);
void cpu_idle_poll_ctrl(bool enable)
{
if (enable) {
- cpu_idle_force_poll++;
- } else {
- cpu_idle_force_poll--;
- WARN_ON_ONCE(cpu_idle_force_poll < 0);
- }
+ this_cpu_inc(idle_force_poll);
+ } else
+ WARN_ON_ONCE(this_cpu_dec_return(idle_force_poll) < 0);
+}
+
+void cpu_idle_poll_update(const struct cpumask *mask)
+{
+ int cpu;
+
+ pr_info_once("using polling idle threads\n");
+ for_each_cpu(cpu, mask)
+ per_cpu(idle_force_poll, cpu) = 1;
}
#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
static int __init cpu_idle_poll_setup(char *__unused)
{
- cpu_idle_force_poll = 1;
+ cpu_idle_poll_update(cpu_present_mask);
return 1;
}
@@ -42,8 +49,6 @@ __setup("nohlt", cpu_idle_poll_setup);
static int __init cpu_idle_nopoll_setup(char *__unused)
{
- cpu_idle_force_poll = 0;
-
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
@@ -51,14 +56,16 @@ __setup("hlt", cpu_idle_nopoll_setup);
static noinline int __cpuidle cpu_idle_poll(void)
{
+ int cpu = smp_processor_id();
+
instrumentation_begin();
- trace_cpu_idle(0, smp_processor_id());
+ trace_cpu_idle(0, cpu);
stop_critical_timings();
ct_cpuidle_enter();
raw_local_irq_enable();
while (!tif_need_resched() &&
- (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ (per_cpu(idle_force_poll, cpu) || tick_check_broadcast_expired()))
cpu_relax();
raw_local_irq_disable();
@@ -78,7 +85,7 @@ void __weak arch_cpu_idle_exit(void) { }
void __weak __noreturn arch_cpu_idle_dead(void) { while (1); }
void __weak arch_cpu_idle(void)
{
- cpu_idle_force_poll = 1;
+ this_cpu_inc(idle_force_poll);
}
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE
@@ -318,7 +325,7 @@ static void do_idle(void)
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
*/
- if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+ if (__this_cpu_read(idle_force_poll) || tick_check_broadcast_expired()) {
tick_nohz_idle_restart_tick();
cpu_idle_poll();
} else {
--
2.49.0