lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <Ztj_h5c2LYsdXYbA@slm.duckdns.org>
Date: Wed, 4 Sep 2024 14:47:03 -1000
From: Tejun Heo <tj@...nel.org>
To: David Vernet <void@...ifault.com>
Cc: linux-kernel@...r.kernel.org, kernel-team@...a.com,
	Peter Zijlstra <peterz@...radead.org>
Subject: [PATCH sched_ext/for-6.12] sched_ext: Handle cases where
 pick_task_scx() is called without preceding balance_scx()

sched_ext dispatches tasks from the BPF scheduler from balance_scx() and
thus every pick_task_scx() call must be preceded by balance_scx(). While
this usually holds, there are rare cases where a higher sched class's
balance() returns true indicating that it has tasks to run on the CPU and
thus terminating balance() calls but fails to actually find the next task to
run when pick_task() is called. In such cases, pick_task_scx() can be called
without preceding balance_scx().

Detect this condition using SCX_RQ_BAL_PENDING flags. If detected, keep
running the previous task if possible and avoid stalling from entering idle
without balancing.

Signed-off-by: Tejun Heo <tj@...nel.org>
Cc: Peter Zijlstra <peterz@...radead.org>
---
 kernel/sched/core.c  |   13 ++++++++-----
 kernel/sched/ext.c   |   34 +++++++++++++++++++++++++++++-----
 kernel/sched/sched.h |    3 ++-
 3 files changed, 39 insertions(+), 11 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5901,12 +5901,15 @@ static void prev_balance(struct rq *rq,
 
 #ifdef CONFIG_SCHED_CLASS_EXT
 	/*
-	 * SCX requires a balance() call before every pick_next_task() including
-	 * when waking up from SCHED_IDLE. If @start_class is below SCX, start
-	 * from SCX instead.
+	 * SCX requires a balance() call before every pick_task() including when
+	 * waking up from SCHED_IDLE. If @start_class is below SCX, start from
+	 * SCX instead. Also, set a flag to detect missing balance() call.
 	 */
-	if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
-		start_class = &ext_sched_class;
+	if (scx_enabled()) {
+		rq->scx.flags |= SCX_RQ_BAL_PENDING;
+		if (sched_class_above(&ext_sched_class, start_class))
+			start_class = &ext_sched_class;
+	}
 #endif
 
 	/*
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2591,7 +2591,7 @@ static int balance_one(struct rq *rq, st
 
 	lockdep_assert_rq_held(rq);
 	rq->scx.flags |= SCX_RQ_IN_BALANCE;
-	rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
+	rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
 
 	if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
 	    unlikely(rq->scx.cpu_released)) {
@@ -2904,25 +2904,49 @@ static struct task_struct *pick_task_scx
 {
 	struct task_struct *prev = rq->curr;
 	struct task_struct *p;
+	bool prev_on_scx = prev->sched_class == &ext_sched_class;
+	bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+	bool kick_idle = false;
+
+	if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
+		/*
+		 * pick_task_scx() can be called without preceding balance_scx()
+		 * call if a higher class's balance() returned %true but its
+		 * pick_task() returned %NULL. Keep running @prev if possible
+		 * and avoid stalling from entering idle without balancing.
+		 */
+		if (prev_on_scx) {
+			keep_prev = true;
+		} else {
+			keep_prev = false;
+			kick_idle = true;
+		}
+	} else if (unlikely(keep_prev && !prev_on_scx)) {
+		/* only allowed during transitions */
+		WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+		keep_prev = false;
+	}
 
 	/*
 	 * If balance_scx() is telling us to keep running @prev, replenish slice
 	 * if necessary and keep running @prev. Otherwise, pop the first one
 	 * from the local DSQ.
 	 */
-	if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
-	    !WARN_ON_ONCE(prev->sched_class != &ext_sched_class)) {
+	if (keep_prev) {
 		p = prev;
 		if (!p->scx.slice)
 			p->scx.slice = SCX_SLICE_DFL;
 	} else {
 		p = first_local_task(rq);
-		if (!p)
+		if (!p) {
+			if (kick_idle)
+				scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
 			return NULL;
+		}
 
 		if (unlikely(!p->scx.slice)) {
 			if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
-				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
+				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_task_scx()\n",
 						p->comm, p->pid);
 				scx_warned_zero_slice = true;
 			}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -749,7 +749,8 @@ enum scx_rq_flags {
 	 */
 	SCX_RQ_ONLINE		= 1 << 0,
 	SCX_RQ_CAN_STOP_TICK	= 1 << 1,
-	SCX_RQ_BAL_KEEP		= 1 << 2, /* balance decided to keep current */
+	SCX_RQ_BAL_PENDING	= 1 << 2, /* balance hasn't run yet */
+	SCX_RQ_BAL_KEEP		= 1 << 3, /* balance decided to keep current */
 
 	SCX_RQ_IN_WAKEUP	= 1 << 16,
 	SCX_RQ_IN_BALANCE	= 1 << 17,

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ