linux-kernel - Re: [PATCH 4/4] sched/fair: Revert 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag")

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <716f3b5a-8a82-88e1-b684-4723882a0d6b@huawei.com>
Date: Sat, 31 Jan 2026 09:47:07 +0800
From: Zhang Qiao <zhangqiao22@...wei.com>
To: Peter Zijlstra <peterz@...radead.org>, <mingo@...nel.org>
CC: <juri.lelli@...hat.com>, <vincent.guittot@...aro.org>,
	<dietmar.eggemann@....com>, <rostedt@...dmis.org>, <bsegall@...gle.com>,
	<mgorman@...e.de>, <vschneid@...hat.com>, <linux-kernel@...r.kernel.org>,
	<wangtao554@...wei.com>, <quzicheng@...wei.com>, <kprateek.nayak@....com>,
	<wuyun.abel@...edance.com>, <dsmythies@...us.net>, Hui Tang
	<tanghui20@...wei.com>
Subject: Re: [PATCH 4/4] sched/fair: Revert 6d71a9c61604 ("sched/fair: Fix
 EEVDF entity placement bug causing scheduling lag")

Hi Peter,

在 2026/1/30 17:34, Peter Zijlstra 写道:
> Zicheng Qu reported that, because avg_vruntime() always includes
> cfs_rq->curr, when ->on_rq, place_entity() doesn't work right.
> 
> Specifically, the lag scaling in place_entity() relies on
> avg_vruntime() being the state *before* placement of the new entity.
> However in this case avg_vruntime() will actually already include the
> entity, which breaks things.
> 
> Also, Zicheng Qu argues that avg_vruntime should be invariant under
> reweight. IOW commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity
> placement bug causing scheduling lag") was wrong!
> 
> The issue reported in 6d71a9c61604 could possibly be explained by
> rounding artifacts -- notably the extreme weight '2' is outside of the
> range of avg_vruntime/sum_w_vruntime, since that uses
> scale_load_down(). By scaling vruntime by the real weight, but
> accounting it in vruntime with a factor 1024 more, the average moves
> significantly.
> 
> Tested by reverting 66951e4860d3 ("sched/fair: Fix update_cfs_group()
> vs DELAY_DEQUEUE") and tracing vruntime and vlag figures again.
> 
> Reported-by: Zicheng Qu <quzicheng@...wei.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> ---
>  kernel/sched/fair.c |  154 +++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 129 insertions(+), 25 deletions(-)
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -782,16 +782,21 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
>   *
>   * XXX could add max_slice to the augmented data to track this.
>   */
> -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +static s64 entity_lag(u64 avruntime, struct sched_entity *se)
>  {
>  	s64 vlag, limit;
>  
> -	WARN_ON_ONCE(!se->on_rq);
> -
> -	vlag = avg_vruntime(cfs_rq) - se->vruntime;
> +	vlag = avruntime - se->vruntime;
>  	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
>  
> -	se->vlag = clamp(vlag, -limit, limit);
> +	return clamp(vlag, -limit, limit);
> +}
> +
> +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> +	WARN_ON_ONCE(!se->on_rq);
> +
> +	se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
>  }
>  
>  /*
> @@ -3839,23 +3844,135 @@ dequeue_load_avg(struct cfs_rq *cfs_rq,
>  		    se_weight(se) * -se->avg.load_sum);
>  }
>  
> -static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
> +static void rescale_entity(struct sched_entity *se, unsigned long weight,
> +			   u64 avruntime, bool rel_vprot)
> +{
> +	unsigned long old_weight = se->load.weight;
> +
> +	/*
> +	 * VRUNTIME
> +	 * --------
> +	 *
> +	 * COROLLARY #1: The virtual runtime of the entity needs to be
> +	 * adjusted if re-weight at !0-lag point.
> +	 *
> +	 * Proof: For contradiction assume this is not true, so we can
> +	 * re-weight without changing vruntime at !0-lag point.
> +	 *
> +	 *             Weight	VRuntime   Avg-VRuntime
> +	 *     before    w          v            V
> +	 *      after    w'         v'           V'
> +	 *
> +	 * Since lag needs to be preserved through re-weight:
> +	 *
> +	 *	lag = (V - v)*w = (V'- v')*w', where v = v'
> +	 *	==>	V' = (V - v)*w/w' + v		(1)
> +	 *
> +	 * Let W be the total weight of the entities before reweight,
> +	 * since V' is the new weighted average of entities:
> +	 *
> +	 *	V' = (WV + w'v - wv) / (W + w' - w)	(2)
> +	 *
> +	 * by using (1) & (2) we obtain:
> +	 *
> +	 *	(WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
> +	 *	==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
> +	 *	==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
> +	 *	==>	(V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
> +	 *
> +	 * Since we are doing at !0-lag point which means V != v, we
> +	 * can simplify (3):
> +	 *
> +	 *	==>	W / (W + w' - w) = w / w'
> +	 *	==>	Ww' = Ww + ww' - ww
> +	 *	==>	W * (w' - w) = w * (w' - w)
> +	 *	==>	W = w	(re-weight indicates w' != w)
> +	 *
> +	 * So the cfs_rq contains only one entity, hence vruntime of
> +	 * the entity @v should always equal to the cfs_rq's weighted
> +	 * average vruntime @V, which means we will always re-weight
> +	 * at 0-lag point, thus breach assumption. Proof completed.
> +	 *
> +	 *
> +	 * COROLLARY #2: Re-weight does NOT affect weighted average
> +	 * vruntime of all the entities.
> +	 *
> +	 * Proof: According to corollary #1, Eq. (1) should be:
> +	 *
> +	 *	(V - v)*w = (V' - v')*w'
> +	 *	==>    v' = V' - (V - v)*w/w'		(4)
> +	 *
> +	 * According to the weighted average formula, we have:
> +	 *
> +	 *	V' = (WV - wv + w'v') / (W - w + w')
> +	 *	   = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
> +	 *	   = (WV - wv + w'V' - Vw + wv) / (W - w + w')
> +	 *	   = (WV + w'V' - Vw) / (W - w + w')
> +	 *
> +	 *	==>  V'*(W - w + w') = WV + w'V' - Vw
> +	 *	==>	V' * (W - w) = (W - w) * V	(5)
> +	 *
> +	 * If the entity is the only one in the cfs_rq, then reweight
> +	 * always occurs at 0-lag point, so V won't change. Or else
> +	 * there are other entities, hence W != w, then Eq. (5) turns
> +	 * into V' = V. So V won't change in either case, proof done.
> +	 *
> +	 *
> +	 * So according to corollary #1 & #2, the effect of re-weight
> +	 * on vruntime should be:
> +	 *
> +	 *	v' = V' - (V - v) * w / w'		(4)
> +	 *	   = V  - (V - v) * w / w'
> +	 *	   = V  - vl * w / w'
> +	 *	   = V  - vl'
> +	 */
> +	se->vlag = div_s64(se->vlag * old_weight, weight);
> +	if (avruntime)
> +		se->vruntime = avruntime - se->vlag;
> +
> +	/*
> +	 * DEADLINE
> +	 * --------
> +	 *
> +	 * When the weight changes, the virtual time slope changes and
> +	 * we should adjust the relative virtual deadline accordingly.
> +	 *
> +	 *	d' = v' + (d - v)*w/w'
> +	 *	   = V' - (V - v)*w/w' + (d - v)*w/w'
> +	 *	   = V  - (V - v)*w/w' + (d - v)*w/w'
> +	 *	   = V  + (d - V)*w/w'
> +	 */
> +	if (se->rel_deadline) {
> +		se->deadline = div_s64(se->deadline * old_weight, weight);
> +		if (avruntime) {
> +			se->rel_deadline = 0;
> +			se->deadline += avruntime;
> +		}
> +	}
> +
> +	if (rel_vprot) {
> +		se->vprot = div_s64(se->vprot * old_weight, weight);
> +		if (avruntime)
> +			se->vprot += avruntime;
> +	}
> +}
>  
>  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
>  			    unsigned long weight)
>  {
>  	bool curr = cfs_rq->curr == se;
>  	bool rel_vprot = false;
> -	u64 vprot;
> +	u64 avruntime = 0;
>  
>  	if (se->on_rq) {
>  		/* commit outstanding execution time */
>  		update_curr(cfs_rq);
> -		update_entity_lag(cfs_rq, se);
> -		se->deadline -= se->vruntime;
> +		avruntime = avg_vruntime(cfs_rq);
> +		se->vlag = entity_lag(avruntime, se);


se->vlag is updated here. Since vlag and vprot share the same union, updating
vlag overwrites vprot. Is it correct to call protect_slice() (which uses vprot)
after this update?

> +		se->deadline -= avruntime;
>  		se->rel_deadline = 1;
>  		if (curr && protect_slice(se)) {
> -			vprot = se->vprot - se->vruntime;
> +			se->vprot -= avruntime;
>  			rel_vprot = true;
>  		}
>  
> @@ -3866,30 +3983,17 @@ static void reweight_entity(struct cfs_r
>  	}
>  	dequeue_load_avg(cfs_rq, se);
>  
> -	/*
> -	 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
> -	 * we need to scale se->vlag when w_i changes.
> -	 */
> -	se->vlag = div_s64(se->vlag * se->load.weight, weight);
> -	if (se->rel_deadline)
> -		se->deadline = div_s64(se->deadline * se->load.weight, weight);
> -
> -	if (rel_vprot)
> -		vprot = div_s64(vprot * se->load.weight, weight);
> +	rescale_entity(se, weight, avruntime, rel_vprot);
>  
>  	update_load_set(&se->load, weight);
>  
>  	do {
>  		u32 divider = get_pelt_divider(&se->avg);
> -
>  		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
>  	} while (0);
>  
>  	enqueue_load_avg(cfs_rq, se);
>  	if (se->on_rq) {
> -		place_entity(cfs_rq, se, 0);
> -		if (rel_vprot)
> -			se->vprot = se->vruntime + vprot;
>  		update_load_add(&cfs_rq->load, se->load.weight);
>  		if (!curr)
>  			__enqueue_entity(cfs_rq, se);
> @@ -5247,7 +5351,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
>  
>  	se->vruntime = vruntime - lag;
>  
> -	if (se->rel_deadline) {
> +	if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
>  		se->deadline += se->vruntime;
>  		se->rel_deadline = 0;
>  		return;
> 
> 
> 
> .
>