[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260130094608.304041157@infradead.org>
Date: Fri, 30 Jan 2026 10:34:43 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: mingo@...nel.org
Cc: peterz@...radead.org,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
dietmar.eggemann@....com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
linux-kernel@...r.kernel.org,
wangtao554@...wei.com,
quzicheng@...wei.com,
kprateek.nayak@....com,
wuyun.abel@...edance.com,
dsmythies@...us.net
Subject: [PATCH 4/4] sched/fair: Revert 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag")
Zicheng Qu reported that, because avg_vruntime() always includes
cfs_rq->curr, when ->on_rq, place_entity() doesn't work right.
Specifically, the lag scaling in place_entity() relies on
avg_vruntime() being the state *before* placement of the new entity.
However in this case avg_vruntime() will actually already include the
entity, which breaks things.
Also, Zicheng Qu argues that avg_vruntime should be invariant under
reweight. IOW commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity
placement bug causing scheduling lag") was wrong!
The issue reported in 6d71a9c61604 could possibly be explained by
rounding artifacts -- notably the extreme weight '2' is outside of the
range of avg_vruntime/sum_w_vruntime, since that uses
scale_load_down(). By scaling vruntime by the real weight, but
accounting it in vruntime with a factor 1024 more, the average moves
significantly.
Tested by reverting 66951e4860d3 ("sched/fair: Fix update_cfs_group()
vs DELAY_DEQUEUE") and tracing vruntime and vlag figures again.
Reported-by: Zicheng Qu <quzicheng@...wei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
---
kernel/sched/fair.c | 154 +++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 129 insertions(+), 25 deletions(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -782,16 +782,21 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
*
* XXX could add max_slice to the augmented data to track this.
*/
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(u64 avruntime, struct sched_entity *se)
{
s64 vlag, limit;
- WARN_ON_ONCE(!se->on_rq);
-
- vlag = avg_vruntime(cfs_rq) - se->vruntime;
+ vlag = avruntime - se->vruntime;
limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
- se->vlag = clamp(vlag, -limit, limit);
+ return clamp(vlag, -limit, limit);
+}
+
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ WARN_ON_ONCE(!se->on_rq);
+
+ se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
}
/*
@@ -3839,23 +3844,135 @@ dequeue_load_avg(struct cfs_rq *cfs_rq,
se_weight(se) * -se->avg.load_sum);
}
-static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
+static void rescale_entity(struct sched_entity *se, unsigned long weight,
+ u64 avruntime, bool rel_vprot)
+{
+ unsigned long old_weight = se->load.weight;
+
+ /*
+ * VRUNTIME
+ * --------
+ *
+ * COROLLARY #1: The virtual runtime of the entity needs to be
+ * adjusted if re-weight at !0-lag point.
+ *
+ * Proof: For contradiction assume this is not true, so we can
+ * re-weight without changing vruntime at !0-lag point.
+ *
+ * Weight VRuntime Avg-VRuntime
+ * before w v V
+ * after w' v' V'
+ *
+ * Since lag needs to be preserved through re-weight:
+ *
+ * lag = (V - v)*w = (V'- v')*w', where v = v'
+ * ==> V' = (V - v)*w/w' + v (1)
+ *
+ * Let W be the total weight of the entities before reweight,
+ * since V' is the new weighted average of entities:
+ *
+ * V' = (WV + w'v - wv) / (W + w' - w) (2)
+ *
+ * by using (1) & (2) we obtain:
+ *
+ * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+ * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+ * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+ * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+ *
+ * Since we are doing at !0-lag point which means V != v, we
+ * can simplify (3):
+ *
+ * ==> W / (W + w' - w) = w / w'
+ * ==> Ww' = Ww + ww' - ww
+ * ==> W * (w' - w) = w * (w' - w)
+ * ==> W = w (re-weight indicates w' != w)
+ *
+ * So the cfs_rq contains only one entity, hence vruntime of
+ * the entity @v should always equal to the cfs_rq's weighted
+ * average vruntime @V, which means we will always re-weight
+ * at 0-lag point, thus breach assumption. Proof completed.
+ *
+ *
+ * COROLLARY #2: Re-weight does NOT affect weighted average
+ * vruntime of all the entities.
+ *
+ * Proof: According to corollary #1, Eq. (1) should be:
+ *
+ * (V - v)*w = (V' - v')*w'
+ * ==> v' = V' - (V - v)*w/w' (4)
+ *
+ * According to the weighted average formula, we have:
+ *
+ * V' = (WV - wv + w'v') / (W - w + w')
+ * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+ * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+ * = (WV + w'V' - Vw) / (W - w + w')
+ *
+ * ==> V'*(W - w + w') = WV + w'V' - Vw
+ * ==> V' * (W - w) = (W - w) * V (5)
+ *
+ * If the entity is the only one in the cfs_rq, then reweight
+ * always occurs at 0-lag point, so V won't change. Or else
+ * there are other entities, hence W != w, then Eq. (5) turns
+ * into V' = V. So V won't change in either case, proof done.
+ *
+ *
+ * So according to corollary #1 & #2, the effect of re-weight
+ * on vruntime should be:
+ *
+ * v' = V' - (V - v) * w / w' (4)
+ * = V - (V - v) * w / w'
+ * = V - vl * w / w'
+ * = V - vl'
+ */
+ se->vlag = div_s64(se->vlag * old_weight, weight);
+ if (avruntime)
+ se->vruntime = avruntime - se->vlag;
+
+ /*
+ * DEADLINE
+ * --------
+ *
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ *
+ * d' = v' + (d - v)*w/w'
+ * = V' - (V - v)*w/w' + (d - v)*w/w'
+ * = V - (V - v)*w/w' + (d - v)*w/w'
+ * = V + (d - V)*w/w'
+ */
+ if (se->rel_deadline) {
+ se->deadline = div_s64(se->deadline * old_weight, weight);
+ if (avruntime) {
+ se->rel_deadline = 0;
+ se->deadline += avruntime;
+ }
+ }
+
+ if (rel_vprot) {
+ se->vprot = div_s64(se->vprot * old_weight, weight);
+ if (avruntime)
+ se->vprot += avruntime;
+ }
+}
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
bool rel_vprot = false;
- u64 vprot;
+ u64 avruntime = 0;
if (se->on_rq) {
/* commit outstanding execution time */
update_curr(cfs_rq);
- update_entity_lag(cfs_rq, se);
- se->deadline -= se->vruntime;
+ avruntime = avg_vruntime(cfs_rq);
+ se->vlag = entity_lag(avruntime, se);
+ se->deadline -= avruntime;
se->rel_deadline = 1;
if (curr && protect_slice(se)) {
- vprot = se->vprot - se->vruntime;
+ se->vprot -= avruntime;
rel_vprot = true;
}
@@ -3866,30 +3983,17 @@ static void reweight_entity(struct cfs_r
}
dequeue_load_avg(cfs_rq, se);
- /*
- * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
- * we need to scale se->vlag when w_i changes.
- */
- se->vlag = div_s64(se->vlag * se->load.weight, weight);
- if (se->rel_deadline)
- se->deadline = div_s64(se->deadline * se->load.weight, weight);
-
- if (rel_vprot)
- vprot = div_s64(vprot * se->load.weight, weight);
+ rescale_entity(se, weight, avruntime, rel_vprot);
update_load_set(&se->load, weight);
do {
u32 divider = get_pelt_divider(&se->avg);
-
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
- place_entity(cfs_rq, se, 0);
- if (rel_vprot)
- se->vprot = se->vruntime + vprot;
update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
@@ -5247,7 +5351,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
se->vruntime = vruntime - lag;
- if (se->rel_deadline) {
+ if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
se->deadline += se->vruntime;
se->rel_deadline = 0;
return;
Powered by blists - more mailing lists