lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <173642508376.399.1685643315759195867.tip-bot2@tip-bot2>
Date: Thu, 09 Jan 2025 12:18:03 -0000
From: "tip-bot2 for Peter Zijlstra" <tip-bot2@...utronix.de>
To: linux-tip-commits@...r.kernel.org
Cc: Doug Smythies <dsmythies@...us.net>, Ingo Molnar <mingo@...nel.org>,
 "Peter Zijlstra (Intel)" <peterz@...radead.org>, x86@...nel.org,
 linux-kernel@...r.kernel.org
Subject: [tip: sched/urgent] sched/fair: Fix EEVDF entity placement bug
 causing scheduling lag

The following commit has been merged into the sched/urgent branch of tip:

Commit-ID:     6d71a9c6160479899ee744d2c6d6602a191deb1f
Gitweb:        https://git.kernel.org/tip/6d71a9c6160479899ee744d2c6d6602a191deb1f
Author:        Peter Zijlstra <peterz@...radead.org>
AuthorDate:    Thu, 09 Jan 2025 11:59:59 +01:00
Committer:     Ingo Molnar <mingo@...nel.org>
CommitterDate: Thu, 09 Jan 2025 12:55:27 +01:00

sched/fair: Fix EEVDF entity placement bug causing scheduling lag

I noticed this in my traces today:

       turbostat-1222    [006] d..2.   311.935649: reweight_entity: (ffff888108f13e00-ffff88885ef38440-6)
                               { weight: 1048576 avg_vruntime: 3184159639071 vruntime: 3184159640194 (-1123) deadline: 3184162621107 } ->
                               { weight: 2 avg_vruntime: 3184177463330 vruntime: 3184748414495 (-570951165) deadline: 4747605329439 }
       turbostat-1222    [006] d..2.   311.935651: reweight_entity: (ffff888108f13e00-ffff88885ef38440-6)
                               { weight: 2 avg_vruntime: 3184177463330 vruntime: 3184748414495 (-570951165) deadline: 4747605329439 } ->
                               { weight: 1048576 avg_vruntime: 3184176414812 vruntime: 3184177464419 (-1049607) deadline: 3184180445332 }

Which is a weight transition: 1048576 -> 2 -> 1048576.

One would expect the lag to shoot out *AND* come back, notably:

  -1123*1048576/2 = -588775424
  -588775424*2/1048576 = -1123

Except the trace shows it is all off. Worse, subsequent cycles shoot it
out further and further.

This made me have a very hard look at reweight_entity(), and
specifically the ->on_rq case, which is more prominent with
DELAY_DEQUEUE.

And indeed, it is all sorts of broken. While the computation of the new
lag is correct, the computation for the new vruntime, using the new lag
is broken for it does not consider the logic set out in place_entity().

With the below patch, I now see things like:

    migration/12-55      [012] d..3.   309.006650: reweight_entity: (ffff8881e0e6f600-ffff88885f235f40-12)
                               { weight: 977582 avg_vruntime: 4860513347366 vruntime: 4860513347908 (-542) deadline: 4860516552475 } ->
                               { weight: 2 avg_vruntime: 4860528915984 vruntime: 4860793840706 (-264924722) deadline: 6427157349203 }
    migration/14-62      [014] d..3.   309.006698: reweight_entity: (ffff8881e0e6cc00-ffff88885f3b5f40-15)
                               { weight: 2 avg_vruntime: 4874472992283 vruntime: 4939833828823 (-65360836540) deadline: 6316614641111 } ->
                               { weight: 967149 avg_vruntime: 4874217684324 vruntime: 4874217688559 (-4235) deadline: 4874220535650 }

Which isn't perfect yet, but much closer.

Reported-by: Doug Smythies <dsmythies@...us.net>
Reported-by: Ingo Molnar <mingo@...nel.org>
Tested-by: Ingo Molnar <mingo@...nel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
Signed-off-by: Ingo Molnar <mingo@...nel.org>
Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight")
Link: https://lore.kernel.org/r/20250109105959.GA2981@noisy.programming.kicks-ass.net
---
 kernel/sched/fair.c | 145 +++++--------------------------------------
 1 file changed, 18 insertions(+), 127 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3e9ca38..eeed8e3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -689,21 +689,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
  *
  * XXX could add max_slice to the augmented data to track this.
  */
-static s64 entity_lag(u64 avruntime, struct sched_entity *se)
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	s64 vlag, limit;
 
-	vlag = avruntime - se->vruntime;
-	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
-
-	return clamp(vlag, -limit, limit);
-}
-
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
 	SCHED_WARN_ON(!se->on_rq);
 
-	se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
+	vlag = avg_vruntime(cfs_rq) - se->vruntime;
+	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+	se->vlag = clamp(vlag, -limit, limit);
 }
 
 /*
@@ -3774,137 +3769,32 @@ static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
-static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
-			   unsigned long weight)
-{
-	unsigned long old_weight = se->load.weight;
-	s64 vlag, vslice;
-
-	/*
-	 * VRUNTIME
-	 * --------
-	 *
-	 * COROLLARY #1: The virtual runtime of the entity needs to be
-	 * adjusted if re-weight at !0-lag point.
-	 *
-	 * Proof: For contradiction assume this is not true, so we can
-	 * re-weight without changing vruntime at !0-lag point.
-	 *
-	 *             Weight	VRuntime   Avg-VRuntime
-	 *     before    w          v            V
-	 *      after    w'         v'           V'
-	 *
-	 * Since lag needs to be preserved through re-weight:
-	 *
-	 *	lag = (V - v)*w = (V'- v')*w', where v = v'
-	 *	==>	V' = (V - v)*w/w' + v		(1)
-	 *
-	 * Let W be the total weight of the entities before reweight,
-	 * since V' is the new weighted average of entities:
-	 *
-	 *	V' = (WV + w'v - wv) / (W + w' - w)	(2)
-	 *
-	 * by using (1) & (2) we obtain:
-	 *
-	 *	(WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
-	 *	==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
-	 *	==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
-	 *	==>	(V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
-	 *
-	 * Since we are doing at !0-lag point which means V != v, we
-	 * can simplify (3):
-	 *
-	 *	==>	W / (W + w' - w) = w / w'
-	 *	==>	Ww' = Ww + ww' - ww
-	 *	==>	W * (w' - w) = w * (w' - w)
-	 *	==>	W = w	(re-weight indicates w' != w)
-	 *
-	 * So the cfs_rq contains only one entity, hence vruntime of
-	 * the entity @v should always equal to the cfs_rq's weighted
-	 * average vruntime @V, which means we will always re-weight
-	 * at 0-lag point, thus breach assumption. Proof completed.
-	 *
-	 *
-	 * COROLLARY #2: Re-weight does NOT affect weighted average
-	 * vruntime of all the entities.
-	 *
-	 * Proof: According to corollary #1, Eq. (1) should be:
-	 *
-	 *	(V - v)*w = (V' - v')*w'
-	 *	==>    v' = V' - (V - v)*w/w'		(4)
-	 *
-	 * According to the weighted average formula, we have:
-	 *
-	 *	V' = (WV - wv + w'v') / (W - w + w')
-	 *	   = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
-	 *	   = (WV - wv + w'V' - Vw + wv) / (W - w + w')
-	 *	   = (WV + w'V' - Vw) / (W - w + w')
-	 *
-	 *	==>  V'*(W - w + w') = WV + w'V' - Vw
-	 *	==>	V' * (W - w) = (W - w) * V	(5)
-	 *
-	 * If the entity is the only one in the cfs_rq, then reweight
-	 * always occurs at 0-lag point, so V won't change. Or else
-	 * there are other entities, hence W != w, then Eq. (5) turns
-	 * into V' = V. So V won't change in either case, proof done.
-	 *
-	 *
-	 * So according to corollary #1 & #2, the effect of re-weight
-	 * on vruntime should be:
-	 *
-	 *	v' = V' - (V - v) * w / w'		(4)
-	 *	   = V  - (V - v) * w / w'
-	 *	   = V  - vl * w / w'
-	 *	   = V  - vl'
-	 */
-	if (avruntime != se->vruntime) {
-		vlag = entity_lag(avruntime, se);
-		vlag = div_s64(vlag * old_weight, weight);
-		se->vruntime = avruntime - vlag;
-	}
-
-	/*
-	 * DEADLINE
-	 * --------
-	 *
-	 * When the weight changes, the virtual time slope changes and
-	 * we should adjust the relative virtual deadline accordingly.
-	 *
-	 *	d' = v' + (d - v)*w/w'
-	 *	   = V' - (V - v)*w/w' + (d - v)*w/w'
-	 *	   = V  - (V - v)*w/w' + (d - v)*w/w'
-	 *	   = V  + (d - V)*w/w'
-	 */
-	vslice = (s64)(se->deadline - avruntime);
-	vslice = div_s64(vslice * old_weight, weight);
-	se->deadline = avruntime + vslice;
-}
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
 
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
 	bool curr = cfs_rq->curr == se;
-	u64 avruntime;
 
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		update_curr(cfs_rq);
-		avruntime = avg_vruntime(cfs_rq);
+		update_entity_lag(cfs_rq, se);
+		se->deadline -= se->vruntime;
+		se->rel_deadline = 1;
 		if (!curr)
 			__dequeue_entity(cfs_rq, se);
 		update_load_sub(&cfs_rq->load, se->load.weight);
 	}
 	dequeue_load_avg(cfs_rq, se);
 
-	if (se->on_rq) {
-		reweight_eevdf(se, avruntime, weight);
-	} else {
-		/*
-		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
-		 * we need to scale se->vlag when w_i changes.
-		 */
-		se->vlag = div_s64(se->vlag * se->load.weight, weight);
-	}
+	/*
+	 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+	 * we need to scale se->vlag when w_i changes.
+	 */
+	se->vlag = div_s64(se->vlag * se->load.weight, weight);
+	if (se->rel_deadline)
+		se->deadline = div_s64(se->deadline * se->load.weight, weight);
 
 	update_load_set(&se->load, weight);
 
@@ -3919,6 +3809,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	enqueue_load_avg(cfs_rq, se);
 	if (se->on_rq) {
 		update_load_add(&cfs_rq->load, se->load.weight);
+		place_entity(cfs_rq, se, 0);
 		if (!curr)
 			__enqueue_entity(cfs_rq, se);
 
@@ -5359,7 +5250,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	se->vruntime = vruntime - lag;
 
-	if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
+	if (se->rel_deadline) {
 		se->deadline += se->vruntime;
 		se->rel_deadline = 0;
 		return;

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ