[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20251208091117.2727-1-tao.wangtao@honor.com>
Date: Mon, 8 Dec 2025 17:11:17 +0800
From: wangtao <tao.wangtao@...or.com>
To: <mingo@...hat.com>, <peterz@...radead.org>, <juri.lelli@...hat.com>,
<vincent.guittot@...aro.org>
CC: <dietmar.eggemann@....com>, <rostedt@...dmis.org>, <bsegall@...gle.com>,
<mgorman@...e.de>, <vschneid@...hat.com>, <linux-kernel@...r.kernel.org>,
<liulu.liu@...or.com>, <bintian.wang@...or.com>, <wangzicheng@...or.com>,
wangtao <tao.wangtao@...or.com>
Subject: [PATCH v3] sched/fair: make task join/leave scheduling more stable
Suppose the current V = V0, all weights are 1, and we add 3 tasks
with join_lag (jlag) values 0, 36 and -12. Considering preserve_lag
handling with vlag_i = (W + w_i) * vlag_i' / W:
----------------------------------
event |vlag'|vlag| v | W | V
----------------------------------
T1 join| 0 | 0 |V0 | 1 |V0
T2 join| 36 | 72 |V0-72| 2 |V0-36
T3 join| -12 |-18 |V0-18| 3 |V0-30
Because V moves backward after T2 joins, even though lag_T3 < 0 = lag_T1,
we still have v_T2 < v_T3 < v_T1. So the schedule order is T2, T3, T1.
A similar issue exists even without preserve_lag.
If tasks are added in the order T3, T2, T1, the schedule order
becomes T2, T1, T3, which shows instability.
We define the join_lag of each joining/leaving task j as jlag_j, and
let J = \Sum jlag_i / W be the average jlag of all tasks in the queue.
Setting the new task's v_j = V - (vjlag_j - J) keeps V+J unchanged
before and after task join/leave, making scheduling more stable and fair.
Using the same example with lag values 0, 36 and -12.
---------------------------------------------------------------------------
time| event | W | avgJ | sumJ | jlag | V+J | v | V | lag
---------------------------------------------------------------------------
t |T1 joins+ | 1 | 0 <| 0 <| 0 |> V0 |> V0 |> V0 |> 0
t |T2 joins+ | 2 | 18 <| 36 <| 36 |> V0 |> V0-36 |> V0-18|> 18
t |T3 joins+ | 3 | 8 <| 24 <| -12 |> V0 |> V0+12 |> V0-8 |> -20
t |pick T2 | 3 | 8 | 24 | 36 <| V0 | V0-36 | V0-8 |> 28
t+24|T2 leaves- | 3 | 8 | 24 | 20 <| V0+8 <| V0-12 |> V0 |> 12
t+24|T2 leaves+ | 2 | 2 <| 4 <| NA | V0+8 | NA |> V0+6 | NA
t+24|pick T1 | 2 | 2 | 4 | 8 <| V0+8 | V0 | V0+6 |> 6
t+40|T1 leaves- | 2 | 2 | 4 | 0 <| V0+16 <| V0+16 |> V0+14|> -2
t+40|T1 leaves+ | 1 | 4 <| 4 <| NA | V0+16 | NA |> V0+12| NA
t+40|T2 rejoins | 2 | 8 <| 16 <| 12 |> V0+16 |> V0+4 |> V0+8 |> 4
t+40|pick T2 | 2 | 8 | 16 | 12 <| V0+16 | V0+4 | V0+8 |> 4
t+52|T2 leaves- | 2 | 8 | 16 | 6 <| V0+22 <| V0+16 |> V0+14|> -2
t+52|T2 leaves+ | 1 | 10 | 10 | NA | V0+22 | NA |> V0+12| NA
t+52|pick T3 | 1 | 10 | 10 | 10 <| V0+22 | V0+12 | V0+12|> 0
t+60|T3 leaves- | 1 | 10 | 10 | 10 <| V0+30 <| V0+20 |> V0+20|> 0
t+60|T3 leaves+ | 0 | 0 | 0 | NA | NA | NA |> V0+20| NA
note: '<' and '>' indicate that the data needs to be recalculated.
hackbench tests show that this patch reduces execution time due to
fewer task switches.
-------------------------------------------------
hackbench test base patch opt
-------------------------------------------------
process 1 group: 0.119 0.107 -10.6%
process 10 group: 0.869 0.767 -11.7%
process 20 group: 1.840 1.579 -14.1%
thread 1 group: 0.089 0.074 -17.2%
thread 10 group: 0.555 0.480 -13.5%
thread 20 group: 1.099 1.022 -7.0%
pipe process 1 group: 0.126 0.084 -33.5%
pipe process 10 group: 0.810 0.673 -17.0%
pipe process 20 group: 1.625 1.314 -19.1%
pipe thread 1 group: 0.092 0.077 -16.7%
pipe thread 10 group: 0.503 0.465 -7.6%
pipe thread 20 group: 0.947 0.906 -4.4%
v2->v3: updat the example and derivation
v2: https://lore.kernel.org/all/20251205105403.13278-1-tao.wangtao@honor.com/
v1: https://lore.kernel.org/all/20251128081118.20025-1-tao.wangtao@honor.com/T/#u
Signed-off-by: wangtao <tao.wangtao@...or.com>
---
kernel/sched/debug.c | 2 +
kernel/sched/fair.c | 171 ++++++++++++++++++++++++-------------------
kernel/sched/sched.h | 2 +
3 files changed, 98 insertions(+), 77 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..a4fd11c5ca2d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -830,6 +830,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(zero_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vjlag",
+ SPLIT_NS(avg_vjlag(cfs_rq)));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 769d7b7990df..10f48d0de322 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -593,7 +593,68 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* V +-= lag_i / W
*
- * Also see the comment in place_entity() that deals with this. ]]
+ * If a joining/leaving task j has non-zero lag, V will change. Although
+ * v_i of other tasks remains unchanged, their vlag_i = V - v_i changes.
+ * We define join_lag of each task as jlag, the average jlag in the queue:
+ *
+ * J = \Sum jlag_i / W
+ *
+ * 1) Before and after task j joins, v_i and jlag_i of other tasks remain
+ * unchanged:
+ * W_j = W + w_j
+ * V_j = (V * W + v_j * w_j) / W_j
+ * V_j = (V * W_j - V * w_j + v_j * w_j) / W_j
+ * V_j = V - (V - v_j) * w_j / W_j
+ * and
+ * J_j = (J * W + jlag_j) / W_j
+ * J_j = (J * W_j - J * w_j + vjlag_j * w_j) / W_j
+ * J_j = J - (J - vjlag_j) * w_j / W_j
+ *
+ * Therefore:
+ * (V_j + J_j) - (V + J) = ((v_j + vjlag_j) - (V + J)) * w_j / W_j
+ *
+ * The v_j of the joining task j is determined based on vjlag and V. If we set
+ * v_j = V + J - vjlag_j
+ * then we have:
+ * (V_j + J_j) - (V + J) = 0
+ * meaning adding a task keeps V_j + J_j unchanged.
+ *
+ * Since
+ * v_j = V_j - vlag_j
+ * we get:
+ * v_j = V_j - vlag_j = V + J - vjlag_j
+ * vlag_j = vjlag_j - J_j
+ * so after adding task j, preserve vlag: vlag_last_leave - J_j.
+ *
+ * 2) When any task i runs, V changes with v_i, but J does not:
+ * from
+ * v_i = V - vlag_i = V + J - vjlag_i
+ * we get
+ * vlag_i = V - v_i
+ * vjlag_i = V + J - v_i = vlag_i + J
+ * vlag_i and vjlag_i change accordingly.
+ *
+ * 3) When task l leaves, v_i and jlag_i of other tasks remain unchanged:
+ * W' = W_l - w_l
+ * V' = (V_l * W_l - v_l * w_l) / W'
+ * V' = (V_l * W' + V_l * w_l - v_l * w_l) / W'
+ * V' = V_l + (V_l - v_l) * w_l / W'
+ * and
+ * J' = (J_l * W_l - jlag_l) / W'
+ * J' = (J_l * W' + J_l * w_l - vjlag_j * w_l) / W'
+ * J' = J_l + (J_l - vjlag_l) * w_l / W'
+ *
+ * Therefore:
+ * (V' + J') - (V_l + J_l) = ((V_l + J_l) - (v_l + vjlag_l)) * w_l / W' = 0
+ * so V'+J' also remains unchanged when removing a task.
+ *
+ * For n tasks in the queue, the upper bound of \Sum jlag_i is:
+ * \Sum jlag_i = \Sum (vjlag_i * w_i)
+ * = \Sum (vlag_i + J) * w_i
+ * = \Sum (vlag_i * w_i) + J * \Sum w_i
+ * = \Sum lag_i + J * W
+ * = 0 + J * W <= n * q
+ * where q is the upper bound of lag. ]]
*
* However, since v_i is u64, and the multiplication could easily overflow
* transform it into a relative form that uses smaller quantities:
@@ -638,6 +699,33 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg_load -= weight;
}
+s64 avg_vjlag(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
+
+ return load ? div_s64(cfs_rq->sum_jlag, load) : 0;
+}
+
+static void sum_jlag_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 jlag_join = se->vlag * weight; /* preserve vlag: vlag - J_j */
+
+ cfs_rq->sum_jlag += jlag_join;
+}
+
+static void sum_jlag_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 jlag_leave = (se->vlag + avg_vjlag(cfs_rq)) * weight;
+
+ cfs_rq->sum_jlag -= jlag_leave;
+}
+
static inline
void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
{
@@ -5106,82 +5194,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->slice = sysctl_sched_base_slice;
vslice = calc_delta_fair(se->slice, se);
- /*
- * Due to how V is constructed as the weighted average of entities,
- * adding tasks with positive lag, or removing tasks with negative lag
- * will move 'time' backwards, this can screw around with the lag of
- * other tasks.
- *
- * EEVDF: placement strategy #1 / #2
- */
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
- struct sched_entity *curr = cfs_rq->curr;
- unsigned long load;
-
- lag = se->vlag;
-
- /*
- * If we want to place a task and preserve lag, we have to
- * consider the effect of the new entity on the weighted
- * average and compensate for this, otherwise lag can quickly
- * evaporate.
- *
- * Lag is defined as:
- *
- * lag_i = S - s_i = w_i * (V - v_i)
- *
- * To avoid the 'w_i' term all over the place, we only track
- * the virtual lag:
- *
- * vl_i = V - v_i <=> v_i = V - vl_i
- *
- * And we take V to be the weighted average of all v:
- *
- * V = (\Sum w_j*v_j) / W
- *
- * Where W is: \Sum w_j
- *
- * Then, the weighted average after adding an entity with lag
- * vl_i is given by:
- *
- * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
- * = (W*V + w_i*(V - vl_i)) / (W + w_i)
- * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
- * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
- * = V - w_i*vl_i / (W + w_i)
- *
- * And the actual lag after adding an entity with vl_i is:
- *
- * vl'_i = V' - v_i
- * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
- * = vl_i - w_i*vl_i / (W + w_i)
- *
- * Which is strictly less than vl_i. So in order to preserve lag
- * we should inflate the lag before placement such that the
- * effective lag after placement comes out right.
- *
- * As such, invert the above relation for vl'_i to get the vl_i
- * we need to use such that the lag after placement is the lag
- * we computed before dequeue.
- *
- * vl'_i = vl_i - w_i*vl_i / (W + w_i)
- * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
- *
- * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
- * = W*vl_i
- *
- * vl_i = (W + w_i)*vl'_i / W
- */
- load = cfs_rq->avg_load;
- if (curr && curr->on_rq)
- load += scale_load_down(curr->load.weight);
-
- lag *= load + scale_load_down(se->load.weight);
- if (WARN_ON_ONCE(!load))
- load = 1;
- lag = div_s64(lag, load);
- }
-
+ /* v_j: V - (vjlag_j - J) */
+ if (sched_feat(PLACE_LAG))
+ lag = se->vlag - avg_vjlag(cfs_rq);
se->vruntime = vruntime - lag;
if (se->rel_deadline) {
@@ -5257,6 +5272,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
check_schedstat_required();
update_stats_enqueue_fair(cfs_rq, se, flags);
+ sum_jlag_add(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -5394,6 +5410,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->rel_deadline = 1;
}
+ sum_jlag_sub(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b419a4d98461..9957bf817b37 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -680,6 +680,7 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;
+ s64 sum_jlag;
u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -3959,6 +3960,7 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern s64 avg_vjlag(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
static inline
void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
--
2.17.1
Powered by blists - more mailing lists