Message-ID: <20251205105403.13278-1-tao.wangtao@honor.com>
Date: Fri, 5 Dec 2025 18:54:03 +0800
From: wangtao <tao.wangtao@...or.com>
To: <mingo@...hat.com>, <peterz@...radead.org>, <juri.lelli@...hat.com>,
<vincent.guittot@...aro.org>
CC: <dietmar.eggemann@....com>, <rostedt@...dmis.org>, <bsegall@...gle.com>,
<mgorman@...e.de>, <vschneid@...hat.com>, <linux-kernel@...r.kernel.org>,
<liulu.liu@...or.com>, <bintian.wang@...or.com>, <wangzicheng@...or.com>,
wangtao <tao.wangtao@...or.com>
Subject: [PATCH v2] sched/fair: make task join/leave scheduling more stable

Suppose the current V = V0, all weights are 1, and four tasks join with
join_lag (jlag) values 0, 10, 80 and -12. With preserve-lag handling,
the stored lag is inflated as vlag_i = (W + w_i) * vlag_i' / W before
placement, so the jlag column below shows the inflated values:
-------------------------------------
 event  | jlag |    v   | W |    V
-------------------------------------
 add T1 |    0 | V0     | 1 | V0
 add T2 |   20 | V0-20  | 2 | V0-10
 add T3 |  120 | V0-130 | 3 | V0-50
 add T4 |  -16 | V0-34  | 4 | V0-46

Because V moves backwards after adding T2 and T3, we end up with
v_T3 < v_T4 < v_T2 < v_T1 even though lag_T4 < 0, so the schedule
order is T3, T4, T2, T1.
A similar issue exists even without preserve-lag: if the same tasks
are added in the order T2, T3, T1, T4, the schedule order becomes
T3, T1, T4, T2, i.e. the result depends on the insertion order.
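
To make the example easy to replay, here is a small stand-alone
user-space sketch (for illustration only, not part of the patch) that
reproduces the first table with weight-1 tasks, taking V0 as 0 and
comparing only vruntimes as the tables do (EEVDF additionally uses
eligibility and deadlines):

#include <stdio.h>

int main(void)
{
	double jlag[] = { 0, 10, 80, -12 };	/* requested lags of T1..T4 */
	double v[4], V = 0;			/* vruntimes and weighted average; V0 = 0 */
	int order[] = { 0, 1, 2, 3 }, n = 4, i, j;

	for (i = 0; i < n; i++) {
		double W = i;			/* weight already in the queue */
		double lag = jlag[i];

		if (W > 0)			/* preserve-lag: vlag_i = (W + w_i) * vlag_i' / W */
			lag = (W + 1) * lag / W;
		v[i] = V - lag;
		V = (V * W + v[i]) / (W + 1);	/* new weighted average */
	}

	for (i = 0; i < n; i++)			/* sort by vruntime: smallest runs first */
		for (j = i + 1; j < n; j++)
			if (v[order[j]] < v[order[i]]) {
				int t = order[i]; order[i] = order[j]; order[j] = t;
			}

	for (i = 0; i < n; i++)
		printf("T%d v = V0%+g\n", order[i] + 1, v[order[i]]);
	return 0;				/* prints T3, T4, T2, T1 */
}
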
Let A = V + (\Sum jlag_j - \Sum jlag_l) / W, where j indexes tasks that
have joined and l tasks that have left. Adding or removing a task does
not change A. Placing a joining task at v_j = A - vjlag_j (with
vjlag_j = jlag_j / w_j) makes scheduling stable across joins and
leaves; with J = A - V this is equivalent to preserving vlag = vjlag - J.
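
For reference, the join case of this invariance, written out in the
same notation (the leave case is analogous and is spelled out in the
comment added to fair.c below):

  A  = V + S / W,  S = \Sum jlag_j - \Sum jlag_l
  V' = (W * V + w_j * v_j) / (W + w_j)
  S' = S + w_j * vjlag_j
  A' = V' + S' / (W + w_j)
     = (W * V + S + w_j * (A - vjlag_j) + w_j * vjlag_j) / (W + w_j)
     = (W * A + w_j * A) / (W + w_j)
     = A
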
Using the same example with jlag values 0, 10, 80, -12:
---------------------------------------------
 event  | jlag |    v   | W |    V    |  J
---------------------------------------------
 add T1 |    0 | V0     | 1 | V0      |  0
 add T2 |   10 | V0-10  | 2 | V0-5    |  5
 add T3 |   80 | V0-80  | 3 | V0-30   | 30
 add T4 |  -12 | V0+12  | 4 | V0-19.5 | 19.5

No matter in which order the four tasks are added, once all four are
queued the schedule order is T3, T2, T1, T4.
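
Again purely for illustration (not part of the patch), the following
stand-alone sketch replays the second table, placing each weight-1
task at v = A - jlag (with weight 1, jlag equals vjlag) and checking
that A = V + sum_jlag / W stays at V0 after every join:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	double jlag[] = { 0, 10, 80, -12 };	/* T1..T4 */
	double v[4], V = 0, S = 0, A = 0;	/* V0 = 0, S = \Sum jlag */
	int i, n = 4;

	for (i = 0; i < n; i++) {
		double W = i;			/* weight already queued */

		v[i] = A - jlag[i];		/* proposed placement */
		V = (V * W + v[i]) / (W + 1);	/* new weighted average */
		S += jlag[i];
		A = V + S / (W + 1);
		assert(A > -1e-9 && A < 1e-9);	/* A is unchanged by the join */
		printf("add T%d: v = V0%+g  V = V0%+g  J = %g\n",
		       i + 1, v[i], V, A - V);
	}
	return 0;
}

Because A never moves, each v_j = A - jlag_j comes out the same no
matter in which order the jlag values are applied, which is why the
resulting schedule order does not depend on the insertion order.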
hackbench tests show that this patch reduces execution time due to fewer
task switches.

-------------------------------------------------
hackbench test             base     patch    delta
-------------------------------------------------
process 1 group:           0.119    0.107   -10.6%
process 10 group:          0.869    0.767   -11.7%
process 20 group:          1.840    1.579   -14.1%
thread 1 group:            0.089    0.074   -17.2%
thread 10 group:           0.555    0.480   -13.5%
thread 20 group:           1.099    1.022    -7.0%
pipe process 1 group:      0.126    0.084   -33.5%
pipe process 10 group:     0.810    0.673   -17.0%
pipe process 20 group:     1.625    1.314   -19.1%
pipe thread 1 group:       0.092    0.077   -16.7%
pipe thread 10 group:      0.503    0.465    -7.6%
pipe thread 20 group:      0.947    0.906    -4.4%

v1: https://lore.kernel.org/all/20251128081118.20025-1-tao.wangtao@honor.com/T/#u
Signed-off-by: wangtao <tao.wangtao@...or.com>
---
kernel/sched/debug.c | 2 +
kernel/sched/fair.c | 139 +++++++++++++++++++------------------------
kernel/sched/sched.h | 2 +
3 files changed, 66 insertions(+), 77 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..a4fd11c5ca2d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -830,6 +830,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(zero_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vjlag",
+ SPLIT_NS(avg_vjlag(cfs_rq)));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 769d7b7990df..c877a1f36c7e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -593,7 +593,38 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* V +-= lag_i / W
*
- * Also see the comment in place_entity() that deals with this. ]]
+ * Join/leave operations do not always have zero lag. To clarify the
+ * problem, distinguish join_lag (jlag), in-queue lag (lag) and
+ * leave_lag (llag), and use i, j and l to index queued, joining and
+ * leaving tasks respectively.
+ *
+ * For all queued tasks, \Sum lag_i is 0, but \Sum jlag_i is not always 0.
+ *
+ * We define A so that it does not change when tasks join or leave.
+ * A = V + sum_jlag / W
+ * sum_jlag = (\Sum jlag_j - \Sum jlag_l)
+ * J = A - V
+ *
+ * 1) A task j leaves and later re-joins; we know its previous llag.
+ * Set jlag_j = llag_j.
+ * Setting v_j = A - vjlag_j keeps A unchanged:
+ * A_j = (A * W + v_j * w_j + vjlag_j * w_j) / (W + w_j) = A
+ *
+ * 2) For any task i that stays in the queue, v_i and V change;
+ * vlag_i = V - v_i also changes, but vjlag_i does not.
+ *
+ * 3) When a task l leaves, we know its v_l.
+ * Set vllag_l = vlag_l = V - v_l.
+ * Setting vjlag_l = A - v_l keeps A unchanged:
+ * A_l = (A * W - v_l * w_l - vjlag_l * w_l) / (W - w_l) = A
+ * vjlag_l = vllag_l + A - V = vllag_l + J
+ * jlag_l = llag_l + J * w_l
+ *
+ * This means sum_jlag is reduced in proportion to the weight of the
+ * leaving task l. When all tasks leave, sum_jlag becomes the last llag.
+ *
+ * If n tasks have joined and m tasks have left, sum_jlag is at most
+ * q * (n - m), where q is the per-task lag bound. ]]
*
* However, since v_i is u64, and the multiplication could easily overflow
* transform it into a relative form that uses smaller quantities:
@@ -638,6 +669,31 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg_load -= weight;
}
+s64 avg_vjlag(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
+
+ return load ? div_s64(cfs_rq->sum_jlag, load) : 0;
+}
+
+static void sum_jlag_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+
+ cfs_rq->sum_jlag += se->vlag * weight;
+}
+
+static void sum_jlag_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+
+ cfs_rq->sum_jlag -= (se->vlag + avg_vjlag(cfs_rq)) * weight;
+}
+
static inline
void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
{
@@ -5106,82 +5162,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->slice = sysctl_sched_base_slice;
vslice = calc_delta_fair(se->slice, se);
- /*
- * Due to how V is constructed as the weighted average of entities,
- * adding tasks with positive lag, or removing tasks with negative lag
- * will move 'time' backwards, this can screw around with the lag of
- * other tasks.
- *
- * EEVDF: placement strategy #1 / #2
- */
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
- struct sched_entity *curr = cfs_rq->curr;
- unsigned long load;
-
- lag = se->vlag;
-
- /*
- * If we want to place a task and preserve lag, we have to
- * consider the effect of the new entity on the weighted
- * average and compensate for this, otherwise lag can quickly
- * evaporate.
- *
- * Lag is defined as:
- *
- * lag_i = S - s_i = w_i * (V - v_i)
- *
- * To avoid the 'w_i' term all over the place, we only track
- * the virtual lag:
- *
- * vl_i = V - v_i <=> v_i = V - vl_i
- *
- * And we take V to be the weighted average of all v:
- *
- * V = (\Sum w_j*v_j) / W
- *
- * Where W is: \Sum w_j
- *
- * Then, the weighted average after adding an entity with lag
- * vl_i is given by:
- *
- * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
- * = (W*V + w_i*(V - vl_i)) / (W + w_i)
- * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
- * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
- * = V - w_i*vl_i / (W + w_i)
- *
- * And the actual lag after adding an entity with vl_i is:
- *
- * vl'_i = V' - v_i
- * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
- * = vl_i - w_i*vl_i / (W + w_i)
- *
- * Which is strictly less than vl_i. So in order to preserve lag
- * we should inflate the lag before placement such that the
- * effective lag after placement comes out right.
- *
- * As such, invert the above relation for vl'_i to get the vl_i
- * we need to use such that the lag after placement is the lag
- * we computed before dequeue.
- *
- * vl'_i = vl_i - w_i*vl_i / (W + w_i)
- * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
- *
- * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
- * = W*vl_i
- *
- * vl_i = (W + w_i)*vl'_i / W
- */
- load = cfs_rq->avg_load;
- if (curr && curr->on_rq)
- load += scale_load_down(curr->load.weight);
-
- lag *= load + scale_load_down(se->load.weight);
- if (WARN_ON_ONCE(!load))
- load = 1;
- lag = div_s64(lag, load);
- }
-
+ /* preserve lag across leave/join: vlag = vllag - J, i.e. v_j = A - vjlag_j */
+ if (sched_feat(PLACE_LAG))
+ lag = se->vlag - avg_vjlag(cfs_rq);
se->vruntime = vruntime - lag;
if (se->rel_deadline) {
@@ -5257,6 +5240,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
check_schedstat_required();
update_stats_enqueue_fair(cfs_rq, se, flags);
+ sum_jlag_add(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -5394,6 +5378,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->rel_deadline = 1;
}
+ sum_jlag_sub(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b419a4d98461..9957bf817b37 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -680,6 +680,7 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;
+ s64 sum_jlag;
u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -3959,6 +3960,7 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern s64 avg_vjlag(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
static inline
void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
--
2.17.1