Message-Id: <20250221085725.33943-1-15645113830zzh@gmail.com>
Date: Fri, 21 Feb 2025 16:57:26 +0800
From: zihan zhou <15645113830zzh@...il.com>
To: 15645113830zzh@...il.com
Cc: bsegall@...gle.com,
dietmar.eggemann@....com,
juri.lelli@...hat.com,
linux-kernel@...r.kernel.org,
mgorman@...e.de,
mingo@...hat.com,
peterz@...radead.org,
rostedt@...dmis.org,
vincent.guittot@...aro.org,
vschneid@...hat.com
Subject: [PATCH V1 4/4] sched: add feature PREDICT_NO_PREEMPT
Patch 4/4 is independent. It is an attempt to make use of the predicted load.
I observed that some tasks were preempted when they were almost finished,
and as a result took longer to complete.
Such a task can be identified by load prediction: its load at enqueue is
basically equal to its predicted load at dequeue.
If the se about to be preempted is such a task, and pse would otherwise
have preempted it, PREDICT_NO_PREEMPT prevents pse from preempting and
compensates pse with set_next_buddy().
This protects tasks that are expected to finish soon. If we later find
that the prediction was wrong, we resched the se.
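
To make the decision flow easier to follow, here is a compressed,
userspace-buildable sketch of the two predicates involved. struct fake_se,
the NO_PREDICT_LOAD value defined here and the numbers in main() are
illustrative stand-ins, not the kernel code; the real patch keeps this state
in se->pldp and works on normalized loads.

#include <stdbool.h>
#include <stdio.h>

#define NO_PREDICT_LOAD (~0UL)	/* stand-in value */

struct fake_se {
	unsigned long load_avg;		/* current load (se->avg.load_avg) */
	unsigned long predict_load;	/* predicted load at dequeue */
	unsigned long load_at_enqueue;	/* load seen at enqueue */
	bool in_predict_no_preempt;	/* a preemption was suppressed */
};

/* "Will end soon": the predicted dequeue load is no higher than the
 * enqueue load and has not been reached yet. */
static bool predict_se_will_end_soon(const struct fake_se *se)
{
	if (se->predict_load == NO_PREDICT_LOAD)
		return false;
	if (se->predict_load > se->load_at_enqueue)
		return false;
	return se->load_avg < se->predict_load;
}

/* Checked every tick: if a preemption was suppressed but the load has
 * already grown past the prediction, force a resched. */
static bool predict_error_should_resched(const struct fake_se *se)
{
	return se->in_predict_no_preempt && se->load_avg > se->predict_load;
}

int main(void)
{
	struct fake_se se = {
		.load_avg = 300, .predict_load = 350, .load_at_enqueue = 400,
	};

	if (predict_se_will_end_soon(&se))
		se.in_predict_no_preempt = true;	/* kernel: set_next_buddy(pse), skip preemption */

	se.load_avg = 420;	/* the prediction turned out to be wrong */
	printf("resched se: %d\n", predict_error_should_resched(&se));
	return 0;
}
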
In effect, this automatically adjusts such tasks toward SCHED_BATCH-like
behaviour. Hackbench performance improves a little:
./hackbench -g 8 -l 10000
  orig:                    2.063s
  with PREDICT_NO_PREEMPT: 1.833s
./hackbench -g 16 -l 10000
  orig:                    3.658s
  with PREDICT_NO_PREEMPT: 3.479s
The average latency reported by cyclictest (run together with hackbench)
increases, but the maximum latency is about the same:
orig:
I:1000 C: 181852 Min: 4 Act: 59 Avg: 212 Max: 21838
with PREDICT_NO_PREEMPT:
I:1000 C: 181564 Min: 8 Act: 80 Avg: 457 Max: 22989
I think this kind of scheduling protection cannot increase the scheduling
delay by more than one tick (about 1ms at HZ=1000), because every tick
checks whether the prediction is still correct. Since it improves overall
throughput, this seems acceptable.
Of course, this patch is still experimental, and suggestions are welcome.
(Perhaps predicting util instead would work better?)
In addition, I found that even with a high-load hackbench running in the
background, terminal interaction remained very smooth.
Signed-off-by: zihan zhou <15645113830zzh@...il.com>
---
kernel/sched/fair.c | 92 +++++++++++++++++++++++++++++++++++++++--
kernel/sched/features.h | 4 ++
2 files changed, 92 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d22d47419f79..21bf58a494ba 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1258,6 +1258,9 @@ static void update_curr(struct cfs_rq *cfs_rq)
curr->vruntime += calc_delta_fair(delta_exec, curr);
resched = update_deadline(cfs_rq, curr);
+#ifdef CONFIG_SCHED_PREDICT_LOAD
+ resched |= predict_error_should_resched(curr);
+#endif
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
@@ -8884,6 +8887,60 @@ static void set_next_buddy(struct sched_entity *se)
}
}
+#ifdef CONFIG_SCHED_PREDICT_LOAD
+static bool predict_se_will_end_soon(struct sched_entity *se)
+{
+ struct predict_load_data *pldp = se->pldp;
+
+ if (pldp == NULL)
+ return false;
+ if (pldp->predict_load_normalized == NO_PREDICT_LOAD)
+ return false;
+ if (pldp->predict_load_normalized > pldp->load_normalized_when_enqueue)
+ return false;
+ if (se->avg.load_avg >= get_predict_load(se))
+ return false;
+ return true;
+}
+
+void set_in_predict_no_preempt(struct sched_entity *se, bool in_predict_no_preempt)
+{
+ struct predict_load_data *pldp = se->pldp;
+
+ if (pldp == NULL)
+ return;
+ pldp->in_predict_no_preempt = in_predict_no_preempt;
+}
+
+static bool get_in_predict_no_preempt(struct sched_entity *se)
+{
+ struct predict_load_data *pldp = se->pldp;
+
+ if (pldp == NULL)
+ return false;
+ return pldp->in_predict_no_preempt;
+}
+
+static bool predict_right(struct sched_entity *se)
+{
+ struct predict_load_data *pldp = se->pldp;
+
+ if (pldp == NULL)
+ return false;
+ if (pldp->predict_load_normalized == NO_PREDICT_LOAD)
+ return false;
+ if (se->avg.load_avg <= get_predict_load(se))
+ return true;
+ return false;
+}
+
+bool predict_error_should_resched(struct sched_entity *se)
+{
+ return get_in_predict_no_preempt(se) && !predict_right(se);
+}
+
+#endif
+
/*
* Preempt the current task with a newly woken task if needed:
*/
@@ -8893,6 +8950,10 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
struct sched_entity *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
+ bool if_best_se;
+#ifdef CONFIG_SCHED_PREDICT_LOAD
+ bool predict_no_preempt = false;
+#endif
if (unlikely(se == pse))
return;
@@ -8954,6 +9015,21 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
if (unlikely(!normal_policy(p->policy)))
return;
+#ifdef CONFIG_SCHED_PREDICT_LOAD
+ /*
+ * If we predict that se will end soon, it's better not to preempt it,
+ * but wait for it to exit by itself. This is undoubtedly a grievance for
+ * pse, so if pse should preempt se, we will give it some compensation.
+ */
+ if (sched_feat(PREDICT_NO_PREEMPT)) {
+ if (predict_error_should_resched(se))
+ goto preempt;
+
+ if (predict_se_will_end_soon(se))
+ predict_no_preempt = true;
+ }
+#endif
+
cfs_rq = cfs_rq_of(se);
update_curr(cfs_rq);
/*
@@ -8966,10 +9042,18 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
if (do_preempt_short(cfs_rq, pse, se))
cancel_protect_slice(se);
- /*
- * If @p has become the most eligible task, force preemption.
- */
- if (pick_eevdf(cfs_rq) == pse)
+ if_best_se = (pick_eevdf(cfs_rq) == pse);
+
+#ifdef CONFIG_SCHED_PREDICT_LOAD
+ if (predict_no_preempt) {
+ if (if_best_se && !pse->sched_delayed) {
+ set_next_buddy(pse);
+ set_in_predict_no_preempt(se, true);
+ return;
+ }
+ }
+#endif
+ if (if_best_se)
goto preempt;
return;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 3c12d9f93331..8a78108af835 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -121,3 +121,7 @@ SCHED_FEAT(WA_BIAS, true)
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(LATENCY_WARN, false)
+
+#ifdef CONFIG_SCHED_PREDICT_LOAD
+SCHED_FEAT(PREDICT_NO_PREEMPT, true)
+#endif
--
2.33.0