Message-ID: <20251110033232.12538-6-kernellwp@gmail.com>
Date: Mon, 10 Nov 2025 11:32:26 +0800
From: Wanpeng Li <kernellwp@...il.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Paolo Bonzini <pbonzini@...hat.com>,
Sean Christopherson <seanjc@...gle.com>
Cc: Steven Rostedt <rostedt@...dmis.org>,
Vincent Guittot <vincent.guittot@...aro.org>,
Juri Lelli <juri.lelli@...hat.com>,
linux-kernel@...r.kernel.org,
kvm@...r.kernel.org,
Wanpeng Li <wanpengli@...cent.com>
Subject: [PATCH 05/10] sched/fair: Wire up yield deboost in yield_to_task_fair()
From: Wanpeng Li <wanpengli@...cent.com>

Integrate the yield deboost mechanism into yield_to_task_fair() to
improve yield_to() effectiveness for virtualization workloads.
Add yield_to_deboost() as the main entry point: it validates the tasks,
finds the cgroup LCA, updates the rq clock and accounting, calculates
the penalty, and applies the EEVDF field adjustments.

The integration point, after set_next_buddy() and before
yield_task_fair(), works in concert with the existing buddy mechanism:
set_next_buddy() provides immediate preference, yield_to_deboost()
applies a bounded vruntime penalty for sustained advantage, and
yield_task_fair() completes the standard yield path.

This is particularly beneficial for vCPU workloads: lock-holder
detection triggers yield_to(), the holder needs sustained preference to
make progress, vCPUs may be organized in nested cgroups, high-frequency
yields require rate limiting, and ping-pong patterns need debouncing.

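For context, the sketch below is illustrative only and not part of this
patch (the helper name is made up and obtaining the holder's
task_struct is left out); it shows the kind of hypervisor-side caller
that reaches this code via yield_to():

#include <linux/sched.h>

/*
 * Illustrative sketch: a hypervisor-side caller that has identified a
 * likely lock-holder task and directs a yield to it.  yield_to() takes
 * both runqueue locks, sets the next buddy via yield_to_task_fair(),
 * and with this series also applies the bounded vruntime deboost to
 * the yielding task.
 */
static void example_direct_yield(struct task_struct *holder)
{
        /* preempt=true also kicks the target's runqueue when it is remote */
        yield_to(holder, true);
}

No other caller changes are required; the deboost is applied
transparently inside yield_to_task_fair().
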
Operation occurs under rq->lock with bounded penalties. The feature
can be disabled at runtime via
/sys/kernel/debug/sched/sched_vcpu_debooster_enabled.
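For example, "echo 0 > /sys/kernel/debug/sched/sched_vcpu_debooster_enabled"
disables it and "echo 1" re-enables it (assuming the knob behaves as a
standard debugfs boolean).
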
A dbench workload in a virtualized environment (16-pCPU host, 16 vCPUs
per VM, each VM running the dbench-16 benchmark) shows consistent gains:

  2 VMs: +14.4% throughput
  3 VMs: +9.8% throughput
  4 VMs: +6.7% throughput

Performance gains stem from more effective yield_to() behavior,
enabling lock holders to make faster progress and reducing contention
overhead in overcommitted scenarios.

Signed-off-by: Wanpeng Li <wanpengli@...cent.com>
---
kernel/sched/fair.c | 58 +++++++++++++++++++++++++++++++++++++++++----
1 file changed, 54 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4bad324f3662..619af60b7ce6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9017,7 +9017,7 @@ static bool yield_deboost_rate_limit(struct rq *rq, u64 now_ns)
* Returns false with appropriate debug logging if any validation fails,
* ensuring only safe and meaningful yield operations proceed.
*/
-static bool __maybe_unused yield_deboost_validate_tasks(struct rq *rq, struct task_struct *p_target,
+static bool yield_deboost_validate_tasks(struct rq *rq, struct task_struct *p_target,
struct task_struct **p_yielding_out,
struct sched_entity **se_y_out,
struct sched_entity **se_t_out)
@@ -9066,7 +9066,7 @@ static bool __maybe_unused yield_deboost_validate_tasks(struct rq *rq, struct ta
* the appropriate level for vruntime adjustments and EEVDF field updates
* (deadline, vlag) to maintain scheduler consistency.
*/
-static bool __maybe_unused yield_deboost_find_lca(struct sched_entity *se_y, struct sched_entity *se_t,
+static bool yield_deboost_find_lca(struct sched_entity *se_y, struct sched_entity *se_t,
struct sched_entity **se_y_lca_out,
struct sched_entity **se_t_lca_out,
struct cfs_rq **cfs_rq_common_out)
@@ -9162,7 +9162,7 @@ static u64 yield_deboost_apply_debounce(struct rq *rq, struct sched_entity *se_t
* and implements reverse-pair debounce (~300us) to reduce ping-pong effects.
* Returns 0 if no penalty needed, otherwise returns clamped penalty value.
*/
-static u64 __maybe_unused yield_deboost_calculate_penalty(struct rq *rq, struct sched_entity *se_y_lca,
+static u64 yield_deboost_calculate_penalty(struct rq *rq, struct sched_entity *se_y_lca,
struct sched_entity *se_t_lca, struct sched_entity *se_t,
int nr_queued)
{
@@ -9250,7 +9250,7 @@ static u64 __maybe_unused yield_deboost_calculate_penalty(struct rq *rq, struct
* scheduler state consistency. Returns true on successful application,
* false if penalty cannot be safely applied.
*/
-static void __maybe_unused yield_deboost_apply_penalty(struct rq *rq, struct sched_entity *se_y_lca,
+static void yield_deboost_apply_penalty(struct rq *rq, struct sched_entity *se_y_lca,
struct cfs_rq *cfs_rq_common, u64 penalty)
{
u64 new_vruntime;
@@ -9303,6 +9303,52 @@ static void yield_task_fair(struct rq *rq)
se->deadline += calc_delta_fair(se->slice, se);
}
+/*
+ * yield_to_deboost - deboost the yielding task to favor the target on the same rq
+ * @rq: runqueue containing both tasks; rq->lock must be held
+ * @p_target: task to favor in scheduling
+ *
+ * Cooperates with yield_to_task_fair(): buddy provides immediate preference;
+ * this routine applies a bounded vruntime penalty at the cgroup LCA so the
+ * target keeps advantage beyond the buddy effect. EEVDF fields are updated
+ * to keep scheduler state consistent.
+ *
+ * Only operates on tasks resident on the same rq; throttled hierarchies are
+ * rejected early. Penalty is bounded by granularity and queue-size caps.
+ *
+ * Intended primarily for virtualization workloads where a yielding vCPU
+ * should defer to a target vCPU within the same runqueue.
+ * Does not change runnable order directly; complements buddy selection with
+ * a bounded fairness adjustment.
+ */
+static void yield_to_deboost(struct rq *rq, struct task_struct *p_target)
+{
+ struct task_struct *p_yielding;
+ struct sched_entity *se_y, *se_t, *se_y_lca, *se_t_lca;
+ struct cfs_rq *cfs_rq_common;
+ u64 penalty;
+
+ /* Step 1: validate tasks and inputs */
+ if (!yield_deboost_validate_tasks(rq, p_target, &p_yielding, &se_y, &se_t))
+ return;
+
+ /* Step 2: find LCA in cgroup hierarchy */
+ if (!yield_deboost_find_lca(se_y, se_t, &se_y_lca, &se_t_lca, &cfs_rq_common))
+ return;
+
+ /* Step 3: update clock and current accounting */
+ update_rq_clock(rq);
+ if (se_y_lca != cfs_rq_common->curr)
+ update_curr(cfs_rq_common);
+
+ /* Step 4: calculate penalty (caps + debounce) */
+ penalty = yield_deboost_calculate_penalty(rq, se_y_lca, se_t_lca, se_t,
+ cfs_rq_common->nr_queued);
+
+ /* Step 5: apply penalty and update EEVDF fields */
+ yield_deboost_apply_penalty(rq, se_y_lca, cfs_rq_common, penalty);
+}
+
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -9314,6 +9360,10 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
/* Tell the scheduler that we'd really like se to run next. */
set_next_buddy(se);
+ /* Apply deboost under rq lock. */
+ yield_to_deboost(rq, p);
+
+ /* Complete the standard yield path. */
yield_task_fair(rq);
return true;
--
2.43.0