[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20251222095700.6598-1-tao.wangtao@honor.com>
Date: Mon, 22 Dec 2025 17:57:00 +0800
From: wangtao <tao.wangtao@...or.com>
To: <mingo@...hat.com>, <peterz@...radead.org>, <juri.lelli@...hat.com>,
<vincent.guittot@...aro.org>
CC: <dietmar.eggemann@....com>, <rostedt@...dmis.org>, <bsegall@...gle.com>,
<mgorman@...e.de>, <vschneid@...hat.com>, <linux-kernel@...r.kernel.org>,
<liulu.liu@...or.com>, <bintian.wang@...or.com>, <wangzicheng@...or.com>,
wangtao <tao.wangtao@...or.com>
Subject: [PATCH v5] sched/fair: Add fair placement lag
This patch introduces a fair placement lag mechanism that tracks join/leave
lag in sum_jlag and its weighted average J to keep the virtual time V
aligned with the EEVDF fluid schedule.
Assume the current virtual time V is V0 and that all weights are 1, and
three tasks join with lag values 0, 36 and -12. Consider the current
preserve_lag handling: vlag_i = (W + w_i) * vlag_i' / W.
The following example illustrates the problem:
event | vlag'| vlag | v | W | V
----------------------------------------
T1 join | 0 | 0 | V0 | 1 | V0
T2 join | 36 | 72 | V0-72| 2 | V0-36
T3 join | -12 | -18 | V0-18| 3 | V0-30
Because V moves backwards after T2 joins, even though lag_T3 < lag_T1 = 0,
we still have v_T2 < v_T3 < v_T1. Therefore the scheduling order is T2, T3,
T1, i.e. T3 with a negative lag is scheduled before T1 with zero lag.
A similar issue exists even without preserve_lag.
If the tasks are added in the order T3, T2, T1, the scheduling order
becomes T2, T1, T3, which shows instability.
We track the cumulative join/leave lag compensation (jlag) in sum_jlag, and
its weighted average over all runnable tasks in J. We choose jlag on joins
and leaves so that V + J remains invariant, improving stability and
fairness.
J can be shown to be bounded by a constant.
Example run with fair placement lag:
time | event | W | sum_jlag | J | V+J | v | V | lag
-------------------------------------------------------------------------
t | T1 join- | 0 | 0 | 0 | V0 | NA | V0 | 0
t | T1 join+ |> 1 |> 0 |> 0 | V0 |> V0 |> V0 |> 0
t | T2 join- | 1 | 0 | 0 | V0 | NA | V0 | 36
t | T2 join+ |> 2 |> 36 |> 18 | V0 |> V0-36 |> V0-18 |> 18
t | T3 join- | 2 | 36 | 0 | V0 | NA | V0 | -12
t | T3 join+ |> 3 |> 24 |> 8 | V0 |> V0+12 |> V0-8 |> -20
t | pick T2 | 3 | 24 | 8 | V0 | V0-36 | V0-8 |> 28
t+24 | T2 leave- | 3 | 24 | 8 |> V0+8 |> V0-12 |> V0 |> 12
t+24 | T2 leave+ |> 2 |> 4 |> 2 | V0+8 |> NA |> V0+6 | 12
t+24 | pick T1 | 2 | 4 | 2 | V0+8 | V0 | V0+6 |> 6
t+40 | T1 leave- | 2 | 4 | 2 |> V0+16 |> V0+16 |> V0+14 |> -2
t+40 | T1 leave+ |> 1 |> 4 |> 4 | V0+16 |> NA |> V0+12 | -2
t+40 | T2 rejoin- | 1 | 4 | 4 | V0+16 | NA | V0+12 | 12
t+40 | T2 rejoin+ |> 2 |> 16 |> 8 | V0+16 |> V0+4 |> V0+8 |> 4
t+40 | pick T2 | 2 | 16 | 8 | V0+16 | V0+4 | V0+8 |> 4
t+52 | T2 leave- | 2 | 16 | 8 |> V0+22 |> V0+16 |> V0+14 |> -2
t+52 | T2 leave+ |> 1 |> 10 |> 10 | V0+22 |> NA |> V0+12 | -2
t+52 | pick T3 | 1 | 10 | 10 | V0+22 | V0+12 | V0+12 |> 0
t+60 | T3 leave- | 1 | 10 | 10 |> V0+30 |> V0+20 |> V0+20 |> 0
t+60 | T3 leave+ |> 0 |> 0 |> 0*| V0+20*| NA |> V0+20 | 0
Notes:
'>' indicates that the value needs to be recomputed.
'NA' means the value is not defined for that event.
'*' the queue becomes empty, J = sum_jlag / W is undefined (W = 0),
and we set J = 0 by convention.
Hackbench tests show that this patch reduces execution time.
hackbench test base fair_place_lag change
----------------------------------------------------------
process 1 group: 0.119 0.107 -10.6%
process 10 group: 0.869 0.767 -11.7%
process 20 group: 1.840 1.579 -14.1%
thread 1 group: 0.089 0.074 -17.2%
thread 10 group: 0.555 0.480 -13.5%
thread 20 group: 1.099 1.022 -7.0%
pipe process 1 group: 0.126 0.084 -33.5%
pipe process 10 group: 0.810 0.673 -17.0%
pipe process 20 group: 1.625 1.314 -19.1%
pipe thread 1 group: 0.092 0.077 -16.7%
pipe thread 10 group: 0.503 0.465 -7.6%
pipe thread 20 group: 0.947 0.906 -4.4%
Signed-off-by: wangtao <tao.wangtao@...or.com>
---
kernel/sched/debug.c | 2 +
kernel/sched/fair.c | 194 +++++++++++++++++++++++++++++++++++++++-
kernel/sched/features.h | 5 ++
kernel/sched/sched.h | 2 +
4 files changed, 201 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..a4fd11c5ca2d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -830,6 +830,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(zero_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vjlag",
+ SPLIT_NS(avg_vjlag(cfs_rq)));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da46c3164537..a775b4361b1c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -593,7 +593,160 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* V +-= lag_i / W
*
- * Also see the comment in place_entity() that deals with this. ]]
+ * If a joining or leaving task j has non-zero lag, V will change. We define
+ * sum_jlag as the sum of lag compensations at task joins/leaves, and J as the
+ * weighted average:
+ *
+ * J = sum_jlag / W
+ *
+ * By construction, we choose jlag at joins/leaves so that V + J is invariant
+ * when there are tasks in the queue.
+ *
+ * --------------------------------------------------------------------
+ * 1) Task l leaving
+ *
+ * Before task l leaves we use the suffix "_l"; after it leaves we use a
+ * prime ('). Given v_l or lag_l, we can compute jlag_l.
+ *
+ * W' = W_l - w_l
+ * V' = (V_l * W_l - v_l * w_l) / W'
+ * = (V_l * W' + V_l * w_l - v_l * w_l) / W'
+ * = V_l + (V_l - v_l) * w_l / W'
+ * = V_l + lag_l / W' // lag_l = (V_l - v_l) * w_l
+ *
+ * For J:
+ *
+ * sum_jlag' = sum_jlag_l - jlag_l = J_l * W_l - jlag_l
+ * J' = sum_jlag' / W'
+ * = (J_l * W_l - jlag_l) / W'
+ * = (J_l * W' + J_l * w_l - jlag_l) / W'
+ * = J_l + (J_l * w_l - jlag_l) / W'
+ *
+ * Enforcing V' + J' = V_l + J_l gives:
+ *
+ * jlag_l = (J_l + V_l - v_l) * w_l = J_l * w_l + lag_l
+ *
+ * --------------------------------------------------------------------
+ * 2) Task j joining
+ *
+ * Before task j joins we use V, W, J; after joining we use the suffix "_j":
+ *
+ * W_j = W + w_j
+ * V_j = (V * W + v_j * w_j) / W_j
+ * = (V * W_j - V * w_j + v_j * w_j) / W_j
+ * = V - (V - v_j) * w_j / W_j
+ *
+ * For J:
+ *
+ * sum_jlag_j = sum_jlag + jlag_j = J * W + jlag_j
+ * J_j = sum_jlag_j / W_j
+ * = (J * W + jlag_j) / W_j
+ * = (J * W_j - J * w_j + jlag_j) / W_j
+ * = J - (J * w_j - jlag_j) / W_j
+ *
+ * Enforcing V_j + J_j = V + J gives:
+ *
+ * jlag_j = (J + V - v_j) * w_j
+ *
+ * When task j joins, v_j, lag_j and jlag_j are not yet defined.
+ * Given V, J and last_leave_lag_j for j, if we set
+ *
+ * jlag_j = last_leave_lag_j
+ *
+ * then
+ *
+ * v_j = J + V - jlag_j / w_j
+ * = V - (last_leave_lag_j / w_j - J)
+ *
+ * We only use jlag_j to compute v_j; it does not affect EEVDF's
+ * eligibility decision. Therefore we can also choose jlag_j to be
+ * last_leave_lag_j / 2, 2 * last_leave_lag_j, or any value with a
+ * uniform constant bound.
+ *
+ * --------------------------------------------------------------------
+ * 3) Meaning of sum_jlag and J
+ *
+ * sum_jlag tracks aggregate join/leave service lag ("jlag") relative to the
+ * EEVDF fluid schedule: service that future scheduling must preferentially
+ * repay or reclaim:
+ * - sum_jlag > 0: positive jlag (we owe service; joined behind fluid);
+ * - sum_jlag = 0: no outstanding jlag;
+ * - sum_jlag < 0: negative jlag (we gave extra service; joined ahead).
+ *
+ * sum_jlag is the outstanding join/leave lag repaid when tasks leave,
+ * keeping V + J aligned with the fluid schedule.
+ *
+ * --------------------------------------------------------------------
+ * 4) Recursive relations for sum_jlag
+ *
+ * sum_jlag(nj + 1, nl) = sum_jlag(nj, nl) + last_leave_lag_j
+ * sum_jlag(nj, nl + 1) = sum_jlag(nj, nl) * (1 - w_l / W) - lag_l
+ * J(nj, nl) = sum_jlag(nj, nl) / W
+ *
+ * Here nj is the number of joins and nl is the number of leaves, and
+ *
+ * nj >= nl
+ * sum_jlag(0, 0) = 0
+ *
+ * --------------------------------------------------------------------
+ * 5) Boundedness of J
+ *
+ * Let nr = nj - nl be the number of tasks currently in the queue. If nr = 1,
+ * the only task l leaves with lag_l = 0 and w_l = W, so sum_jlag becomes 0
+ * and J = 0.
+ *
+ * For nr > 1, assume for all tasks i in the queue:
+ *
+ * 1 < W_MIN <= w_i < W <= W_MAX
+ * nr * W_MIN <= W <= W_MAX
+ * 0 < W_MIN / W_MAX <= 1 - w_i / W <= a < 1.
+ *
+ * and lag bounds
+ *
+ * |last_leave_lag_j| <= q, |lag_l| <= q
+ *
+ * for some global constant q. Then nj-increase (task joins) gives
+ *
+ * |sum_jlag| <= nr * q, W >= nr * W_MIN => |J| <= q / W_MIN
+ *
+ * For nl-increase (task leaves), the recurrence
+ *
+ * sum_jlag(nj, nl + 1) = sum_jlag(nj, nl) * (1 - w_l / W) - lag_l
+ *
+ * has the standard form
+ *
+ * x_{k+1} = a_k * x_k + d_k
+ *
+ * with
+ *
+ * a_k = 1 - w_l / W in (0, 1), |d_k| <= q.
+ *
+ * In such a system with contraction factor |a_k| < 1 and bounded
+ * disturbance (BIBS/ISS-type), x_k and J = x_k / W remain uniformly
+ * bounded. In particular, there exists a constant C1, independent of
+ * nj, nl and nr, such that
+ *
+ * |sum_jlag(nj, nl)| = |sum_jlag(nl0 + nr, nl0 + k)|
+ * <= a^k * nr * q + (1 - a^k) * q / (1 - a)
+ *
+ * The number of remaining tasks is nr - k, so
+ *
+ * W >= (nr - k) * W_MIN
+ *
+ * Therefore
+ *
+ * |J| = |sum_jlag(nj, nl) / W|
+ * <= q * (nr * a^k + (1 - a^k) / (1 - a)) / ((nr - k) * W_MIN)
+ *
+ * Hence
+ *
+ * |J| <= (q / W_MIN) * C1
+ *
+ * where C1 can be chosen as
+ *
+ * C1 = 1 + 1 / (1 - a) + 1 / (e * ln(1 / a)).
+ *
+ * Thus J has a uniform constant upper bound. ]]
*
* However, since v_i is u64, and the multiplication could easily overflow
* transform it into a relative form that uses smaller quantities:
@@ -638,6 +791,36 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg_load -= weight;
}
+/* Average join/leave lag debt J = sum_jlag / W for this cfs_rq. */
+s64 avg_vjlag(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
+
+ return load ? div_s64(cfs_rq->sum_jlag, load) : 0;
+}
+
+/* Account the join lag contribution of a newly enqueued entity. */
+static void sum_jlag_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 jlag_join = se->vlag * weight; /* preserve vlag: vlag - J_j */
+
+ cfs_rq->sum_jlag += jlag_join;
+}
+
+/* Account the leave lag correction when an entity dequeues. */
+static void sum_jlag_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 jlag_leave = (se->vlag + avg_vjlag(cfs_rq)) * weight;
+
+ cfs_rq->sum_jlag -= jlag_leave;
+}
+
static inline
void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
{
@@ -5116,8 +5299,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* other tasks.
*
* EEVDF: placement strategy #1 / #2
+ *
+ * FAIR_PLACE_LAG uses avg_vjlag to keep placement fair and stable
+ * even when tasks that join or leave have non-zero lag.
*/
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
+ if (sched_feat(FAIR_PLACE_LAG))
+ lag = se->vlag - avg_vjlag(cfs_rq);
+ else if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;
@@ -5260,6 +5448,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
check_schedstat_required();
update_stats_enqueue_fair(cfs_rq, se, flags);
+ sum_jlag_add(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -5397,6 +5586,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->rel_deadline = 1;
}
+ sum_jlag_sub(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 980d92bab8ab..81c72b572630 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -5,6 +5,11 @@
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
SCHED_FEAT(PLACE_LAG, true)
+/*
+ * Adjust EEVDF placement strategy #1 using avg_vjlag to keep scheduling
+ * fair and stable even when tasks that join or leave have non-zero lag.
+ */
+SCHED_FEAT(FAIR_PLACE_LAG, false)
/*
* Give new tasks half a slice to ease into the competition.
*/
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d30cca6870f5..8f7eb75f9ab5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -680,6 +680,7 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;
+ s64 sum_jlag;
u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -3891,6 +3892,7 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern s64 avg_vjlag(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
static inline
void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
--
2.17.1
Powered by blists - more mailing lists