[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251110033232.12538-2-kernellwp@gmail.com>
Date: Mon, 10 Nov 2025 11:32:22 +0800
From: Wanpeng Li <kernellwp@...il.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Paolo Bonzini <pbonzini@...hat.com>,
Sean Christopherson <seanjc@...gle.com>
Cc: Steven Rostedt <rostedt@...dmis.org>,
Vincent Guittot <vincent.guittot@...aro.org>,
Juri Lelli <juri.lelli@...hat.com>,
linux-kernel@...r.kernel.org,
kvm@...r.kernel.org,
Wanpeng Li <wanpengli@...cent.com>
Subject: [PATCH 01/10] sched: Add vCPU debooster infrastructure
From: Wanpeng Li <wanpengli@...cent.com>
Introduce foundational infrastructure for the vCPU debooster mechanism
to improve yield_to() effectiveness in virtualization workloads.
Add per-rq tracking fields for rate limiting (yield_deboost_last_time_ns)
and debouncing (yield_deboost_last_src_pid, yield_deboost_last_dst_pid,
yield_deboost_last_pair_time_ns).
Introduce a global knob, sysctl_sched_vcpu_debooster_enabled, defaulting
to enabled, and expose it through debugfs (sched_vcpu_debooster_enabled,
mode 0644) for runtime control and observability. Initialize the per-rq
debounce fields in sched_init().
The infrastructure is inert at this stage as no deboost logic is
implemented yet, allowing independent verification that existing
behavior remains unchanged.
Signed-off-by: Wanpeng Li <wanpengli@...cent.com>
---
kernel/sched/core.c | 7 +++++--
kernel/sched/debug.c | 3 +++
kernel/sched/fair.c | 5 +++++
kernel/sched/sched.h | 9 +++++++++
4 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f754a60de848..03380790088b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8706,9 +8706,12 @@ void __init sched_init(void)
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
- struct rq *rq;
+ struct rq *rq = cpu_rq(i);
+ /* init per-rq debounce tracking */
+ rq->yield_deboost_last_src_pid = -1;
+ rq->yield_deboost_last_dst_pid = -1;
+ rq->yield_deboost_last_pair_time_ns = 0;
- rq = cpu_rq(i);
raw_spin_lock_init(&rq->__lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 02e16b70a790..905f303af752 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -508,6 +508,9 @@ static __init int sched_init_debug(void)
debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
+ debugfs_create_u32("sched_vcpu_debooster_enabled", 0644, debugfs_sched,
+ &sysctl_sched_vcpu_debooster_enabled);
+
sched_domains_mutex_lock();
update_sched_domain_debugfs();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5b752324270b..5b7fcc86ccff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -81,6 +81,11 @@ static unsigned int normalized_sysctl_sched_base_slice = 700000ULL;
__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL;
+/*
+ * vCPU debooster sysctl control
+ */
+unsigned int sysctl_sched_vcpu_debooster_enabled __read_mostly = 1;
+
static int __init setup_sched_thermal_decay_shift(char *str)
{
pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index adfb6e3409d7..e9b4be024f89 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1292,6 +1292,13 @@ struct rq {
unsigned int push_busy;
struct cpu_stop_work push_work;
+ /* vCPU debooster rate-limit */
+ u64 yield_deboost_last_time_ns;
+ /* per-rq debounce state to avoid cross-CPU races */
+ pid_t yield_deboost_last_src_pid;
+ pid_t yield_deboost_last_dst_pid;
+ u64 yield_deboost_last_pair_time_ns;
+
#ifdef CONFIG_SCHED_CORE
/* per rq */
struct rq *core;
@@ -2816,6 +2823,8 @@ extern int sysctl_resched_latency_warn_once;
extern unsigned int sysctl_sched_tunable_scaling;
+extern unsigned int sysctl_sched_vcpu_debooster_enabled;
+
extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
--
2.43.0
Powered by blists - more mailing lists