linux-kernel - [PATCH v4] sched/fair: make task join/leave scheduling more stable

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20251211082816.27956-1-tao.wangtao@honor.com>
Date: Thu, 11 Dec 2025 16:28:16 +0800
From: wangtao <tao.wangtao@...or.com>
To: <mingo@...hat.com>, <peterz@...radead.org>, <juri.lelli@...hat.com>,
	<vincent.guittot@...aro.org>
CC: <dietmar.eggemann@....com>, <rostedt@...dmis.org>, <bsegall@...gle.com>,
	<mgorman@...e.de>, <vschneid@...hat.com>, <linux-kernel@...r.kernel.org>,
	<liulu.liu@...or.com>, <bintian.wang@...or.com>, <wangzicheng@...or.com>,
	wangtao <tao.wangtao@...or.com>
Subject: [PATCH v4] sched/fair: make task join/leave scheduling more stable

Assume the current V is V0, all weights are 1, and three tasks join with
lag values 0, 36 and -12. Consider preserve_lag handling with
vlag_i = (W + w_i) * vlag_i' / W.

The following example illustrates the problem:

  event    | vlag' | vlag |    v   | W | V
  --------------------------------------------
  T1 joins |     0 |    0 |  V0    | 1 | V0
  T2 joins |    36 |   72 |  V0-72 | 2 | V0-36
  T3 joins |   -12 |  -18 |  V0-18 | 3 | V0-30

Because V moves backward after T2 joins, even though lag_T3 < 0 = lag_T1,
we still have v_T2 < v_T3 < v_T1. Therefore the schedule order is T2, T3,
T1, i.e. T3 with a negative lag is scheduled before T1 with zero lag.
A similar issue exists even without preserve_lag.

If the tasks are added in the order T3, T2, T1, the schedule order
becomes T2, T1, T3, which shows instability.

We define sum_jlag as the sum of lag compensations at task joins/leaves,
and define J as the weighted average lag compensation of all tasks in
the queue. By construction, we choose the jlag values at joins/leaves so
that V + J is invariant, which makes scheduling more stable and fair.
We also show that J is bounded by a constant.

Using the same example with lag values 0, 36 and -12:

  time  | event      |  W | sumJ |  J  |  V+J   |    v   |    V   |  lag
  ----------------------------------------------------------------------
  t     | T1 join-   |  0 |    0 |   0 |  V0    |   NA   |  V0    |    0
  t     | T1 join+   |> 1 |>   0 |>  0 |  V0    |> V0    |> V0    |>   0
  t     | T2 join-   |  1 |    0 |   0 |  V0    |   NA   |  V0    |   36
  t     | T2 join+   |> 2 |>  36 |> 18 |  V0    |> V0-36 |> V0-18 |>  18
  t     | T3 join-   |  2 |   36 |   0 |  V0    |   NA   |  V0    |  -12
  t     | T3 join+   |> 3 |>  24 |>  8 |  V0    |> V0+12 |> V0-8  |> -20
  t     | pick T2    |  3 |   24 |   8 |  V0    |  V0-36 |  V0-8  |>  28
  t+24  | T2 leave-  |  3 |   24 |   8 |> V0+8  |> V0-12 |> V0    |>  12
  t+24  | T2 leave+  |> 2 |>   4 |>  2 |  V0+8  |>  NA   |> V0+6  |   12
  t+24  | pick T1    |  2 |    4 |   2 |  V0+8  |  V0    |  V0+6  |>   6
  t+40  | T1 leave-  |  2 |    4 |   2 |> V0+16 |> V0+16 |> V0+14 |>  -2
  t+40  | T1 leave+  |> 1 |>   4 |>  4 |  V0+16 |>  NA   |> V0+12 |   -2
  t+40  | T2 rejoin- |  1 |    4 |   4 |  V0+16 |   NA   |  V0+12 |   12
  t+40  | T2 rejoin+ |> 2 |>  16 |>  8 |  V0+16 |> V0+4  |> V0+8  |>   4
  t+40  | pick T2    |  2 |   16 |   8 |  V0+16 |  V0+4  |  V0+8  |>   4
  t+52  | T2 leave-  |  2 |   16 |   8 |> V0+22 |> V0+16 |> V0+14 |>  -2
  t+52  | T2 leave+  |> 1 |>  10 |> 10 |  V0+22 |>  NA   |> V0+12 |   -2
  t+52  | pick T3    |  1 |   10 |  10 |  V0+22 |  V0+12 |  V0+12 |>   0
  t+60  | T3 leave-  |  1 |   10 |  10 |> V0+30 |> V0+20 |> V0+20 |>   0
  t+60  | T3 leave+  |> 0 |>   0 |>  0*|  V0+20*|   NA   |> V0+20 |    0

Notes:
  '>' indicates that the value needs to be recomputed.
  '*' when the queue becomes empty, sumJ / 0 is undefined, and we set J = 0

Hackbench tests show that this patch reduces execution time due to
fewer task switches.

  hackbench test          base   patch  change
  ---------------------------------------------
  process 1 group:        0.119  0.107  -10.6%
  process 10 group:       0.869  0.767  -11.7%
  process 20 group:       1.840  1.579  -14.1%
  thread 1 group:         0.089  0.074  -17.2%
  thread 10 group:        0.555  0.480  -13.5%
  thread 20 group:        1.099  1.022   -7.0%
  pipe process 1 group:   0.126  0.084  -33.5%
  pipe process 10 group:  0.810  0.673  -17.0%
  pipe process 20 group:  1.625  1.314  -19.1%
  pipe thread 1 group:    0.092  0.077  -16.7%
  pipe thread 10 group:   0.503  0.465   -7.6%
  pipe thread 20 group:   0.947  0.906   -4.4%

Signed-off-by: wangtao <tao.wangtao@...or.com>
---
 kernel/sched/debug.c |   2 +
 kernel/sched/fair.c  | 254 ++++++++++++++++++++++++++++++-------------
 kernel/sched/sched.h |   2 +
 3 files changed, 181 insertions(+), 77 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..a4fd11c5ca2d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -830,6 +830,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			SPLIT_NS(zero_vruntime));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vruntime",
 			SPLIT_NS(avg_vruntime(cfs_rq)));
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vjlag",
+			SPLIT_NS(avg_vjlag(cfs_rq)));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "right_vruntime",
 			SPLIT_NS(right_vruntime));
 	spread = right_vruntime - left_vruntime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da46c3164537..6f55d0225cab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -593,7 +593,151 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
  *
  *	      V +-= lag_i / W
  *
- *	    Also see the comment in place_entity() that deals with this. ]]
+ * If a joining or leaving task j has non-zero lag, V will change. We define
+ * sum_jlag as the sum of lag compensations at task joins/leaves, and J as the
+ * weighted average:
+ *
+ *     J = sum_jlag / W
+ *
+ * By construction, we choose jlag at joins/leaves so that V + J is invariant
+ * when there are tasks in the queue.
+ *
+ * --------------------------------------------------------------------
+ * 1) Task l leaving
+ *
+ * Before task l leaves we use the suffix "_l"; after it leaves we use a
+ * prime ('). Given v_l or lag_l, we can compute jlag_l.
+ *
+ *   W' = W_l - w_l
+ *   V' = (V_l * W_l - v_l * w_l) / W'
+ *      = (V_l * W' + V_l * w_l - v_l * w_l) / W'
+ *      = V_l + (V_l - v_l) * w_l / W'
+ *      = V_l + lag_l / W'                  // lag_l = (V_l - v_l) * w_l
+ *
+ * For J:
+ *
+ *   sum_jlag' = sum_jlag_l - jlag_l = J_l * W_l - jlag_l
+ *   J' = sum_jlag' / W'
+ *      = (J_l * W_l - jlag_l) / W'
+ *      = (J_l * W' + J_l * w_l - jlag_l) / W'
+ *      = J_l + (J_l * w_l - jlag_l) / W'
+ *
+ * Enforcing V' + J' = V_l + J_l gives:
+ *
+ *   jlag_l = (J_l + V_l - v_l) * w_l = J_l * w_l + lag_l
+ *
+ * --------------------------------------------------------------------
+ * 2) Task j joining
+ *
+ * Before task j joins we use V, W, J; after joining we use the suffix "_j":
+ *
+ *   W_j = W + w_j
+ *   V_j = (V * W + v_j * w_j) / W_j
+ *       = (V * W_j - V * w_j + v_j * w_j) / W_j
+ *       = V - (V - v_j) * w_j / W_j
+ *
+ * For J:
+ *
+ *   sum_jlag_j = sum_jlag + jlag_j = J * W + jlag_j
+ *   J_j = sum_jlag_j / W_j
+ *       = (J * W + jlag_j) / W_j
+ *       = (J * W_j - J * w_j + jlag_j) / W_j
+ *       = J - (J * w_j - jlag_j) / W_j
+ *
+ * Enforcing V_j + J_j = V + J gives:
+ *
+ *   jlag_j = (J + V - v_j) * w_j
+ *
+ * When task j joins, v_j, lag_j and jlag_j are not yet defined.
+ * Given V, J and j's last_leave_lag_j, if we set
+ *
+ *   jlag_j = last_leave_lag_j
+ *
+ * then
+ *
+ *   v_j = J + V - jlag_j / w_j
+ *       = V - (last_leave_lag_j / w_j - J)
+ *
+ * We only use jlag_j to compute v_j; it does not affect EEVDF's
+ * eligibility decision. Therefore we can also choose jlag_j to be
+ * last_leave_lag_j / 2, 2 * last_leave_lag_j, or any value with a
+ * uniform constant bound.
+ *
+ * --------------------------------------------------------------------
+ * 3) From the above construction we get:
+ *
+ *   sum_jlag(nj + 1, nl) = sum_jlag(nj, nl) + last_leave_lag_j
+ *   sum_jlag(nj, nl + 1) = sum_jlag(nj, nl) * (1 - w_l / W) - lag_l
+ *   J(nj, nl) = sum_jlag(nj, nl) / W
+ *
+ * Here nj is the number of joins and nl is the number of leaves, and
+ *
+ *   sum_jlag(0, 0) = 0
+ *   nj >= nl
+ *
+ * --------------------------------------------------------------------
+ * 4) Boundedness of J
+ *
+ * Let nr = nj - nl be the number of tasks currently in the queue. If nr = 1,
+ * the only task l leaves with lag_l = 0 and w_l = W, so sum_jlag becomes 0
+ * and J = 0.
+ *
+ * For nr > 1, assume for all tasks i in the queue:
+ *
+ *   1 < W_MIN <= w_i < W <= W_MAX
+ *   nr * W_MIN <= W <= W_MAX
+ *
+ * and lag bounds
+ *
+ *   |last_leave_lag_j| <= q, |lag_l| <= q
+ *
+ * for some global constant q. Moreover, there exists a constant a with
+ *
+ *   0 < W_MIN / W_MAX <= 1 - w_i / W <= a < 1.
+ *
+ * Then nj-increase (joins) gives
+ *
+ *   |sum_jlag| <= nr * q,  W >= nr * W_MIN  =>  |J| <= q / W_MIN
+ *
+ * For nl-increase (leaves), the recurrence
+ *
+ *   sum_jlag(nj, nl + 1) =
+ *     sum_jlag(nj, nl) * (1 - w_l / W) - lag_l
+ *
+ * has the standard form
+ *
+ *   x_{k+1} = a_k * x_k + d_k
+ *
+ * with
+ *
+ *   a_k = 1 - w_l / W in (0, 1),  |d_k| <= q.
+ *
+ * This is a 1-D discrete-time linear system with |a_k| < 1 and bounded
+ * additive disturbance d_k. In such a bounded-disturbance (BIBS/ISS-type)
+ * model, x_k and J = x_k / W remain uniformly bounded. In particular,
+ * there exists a constant C1, independent of nj, nl and nr, such that
+ *
+ *     |sum_jlag(nj, nl)| = |sum_jlag(nl0 + nr, nl0 + k)|
+ *                        <= a^k * nr * q + (1 - a^k) * q / (1 - a)
+ *
+ *   The number of remaining tasks is nr - k, so
+ *
+ *     W >= (nr - k) * W_MIN
+ *
+ *   Therefore
+ *
+ *     |J| = |sum_jlag(nj, nl) / W|
+ *          <= q * (nr * a^k + (1 - a^k) / (1 - a)) / ((nr - k) * W_MIN)
+ *
+ *   Hence
+ *
+ *     |J| <= (q / W_MIN) * C1
+ *
+ * where C1 can be chosen as
+ *
+ *   C1 = 1 + 1 / (1 - a) + 1 / (e * ln(1 / a)).
+ *
+ * Thus J has a uniform constant upper bound. ]]
  *
  * However, since v_i is u64, and the multiplication could easily overflow
  * transform it into a relative form that uses smaller quantities:
@@ -638,6 +782,33 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->avg_load -= weight;
 }
 
+s64 avg_vjlag(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr = cfs_rq->curr;
+	long load = cfs_rq->avg_load;
+
+	if (curr && curr->on_rq)
+		load += scale_load_down(curr->load.weight);
+
+	return load ? div_s64(cfs_rq->sum_jlag, load) : 0;
+}
+
+static void sum_jlag_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = scale_load_down(se->load.weight);
+	s64 jlag_join = se->vlag * weight; /* preserve vlag: vlag - J_j */
+
+	cfs_rq->sum_jlag += jlag_join;
+}
+
+static void sum_jlag_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = scale_load_down(se->load.weight);
+	s64 jlag_leave = (se->vlag + avg_vjlag(cfs_rq)) * weight;
+
+	cfs_rq->sum_jlag -= jlag_leave;
+}
+
 static inline
 void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
 {
@@ -5109,82 +5280,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		se->slice = sysctl_sched_base_slice;
 	vslice = calc_delta_fair(se->slice, se);
 
-	/*
-	 * Due to how V is constructed as the weighted average of entities,
-	 * adding tasks with positive lag, or removing tasks with negative lag
-	 * will move 'time' backwards, this can screw around with the lag of
-	 * other tasks.
-	 *
-	 * EEVDF: placement strategy #1 / #2
-	 */
-	if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
-		struct sched_entity *curr = cfs_rq->curr;
-		unsigned long load;
-
-		lag = se->vlag;
-
-		/*
-		 * If we want to place a task and preserve lag, we have to
-		 * consider the effect of the new entity on the weighted
-		 * average and compensate for this, otherwise lag can quickly
-		 * evaporate.
-		 *
-		 * Lag is defined as:
-		 *
-		 *   lag_i = S - s_i = w_i * (V - v_i)
-		 *
-		 * To avoid the 'w_i' term all over the place, we only track
-		 * the virtual lag:
-		 *
-		 *   vl_i = V - v_i <=> v_i = V - vl_i
-		 *
-		 * And we take V to be the weighted average of all v:
-		 *
-		 *   V = (\Sum w_j*v_j) / W
-		 *
-		 * Where W is: \Sum w_j
-		 *
-		 * Then, the weighted average after adding an entity with lag
-		 * vl_i is given by:
-		 *
-		 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
-		 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
-		 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
-		 *      = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
-		 *      = V - w_i*vl_i / (W + w_i)
-		 *
-		 * And the actual lag after adding an entity with vl_i is:
-		 *
-		 *   vl'_i = V' - v_i
-		 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
-		 *         = vl_i - w_i*vl_i / (W + w_i)
-		 *
-		 * Which is strictly less than vl_i. So in order to preserve lag
-		 * we should inflate the lag before placement such that the
-		 * effective lag after placement comes out right.
-		 *
-		 * As such, invert the above relation for vl'_i to get the vl_i
-		 * we need to use such that the lag after placement is the lag
-		 * we computed before dequeue.
-		 *
-		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
-		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
-		 *
-		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
-		 *                   = W*vl_i
-		 *
-		 *   vl_i = (W + w_i)*vl'_i / W
-		 */
-		load = cfs_rq->avg_load;
-		if (curr && curr->on_rq)
-			load += scale_load_down(curr->load.weight);
-
-		lag *= load + scale_load_down(se->load.weight);
-		if (WARN_ON_ONCE(!load))
-			load = 1;
-		lag = div_s64(lag, load);
-	}
-
+	/* v_j: V - (vjlag_j - J) */
+	if (sched_feat(PLACE_LAG))
+		lag = se->vlag - avg_vjlag(cfs_rq);
 	se->vruntime = vruntime - lag;
 
 	if (se->rel_deadline) {
@@ -5260,6 +5358,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	check_schedstat_required();
 	update_stats_enqueue_fair(cfs_rq, se, flags);
+	sum_jlag_add(cfs_rq, se);
 	if (!curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
@@ -5397,6 +5496,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		se->rel_deadline = 1;
 	}
 
+	sum_jlag_sub(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d30cca6870f5..8f7eb75f9ab5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -680,6 +680,7 @@ struct cfs_rq {
 
 	s64			avg_vruntime;
 	u64			avg_load;
+	s64			sum_jlag;
 
 	u64			zero_vruntime;
 #ifdef CONFIG_SCHED_CORE
@@ -3891,6 +3892,7 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
 #endif /* !CONFIG_SCHED_MM_CID */
 
 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern s64 avg_vjlag(struct cfs_rq *cfs_rq);
 extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
 static inline
 void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
-- 
2.17.1