[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250920005931.2753828-38-tj@kernel.org>
Date: Fri, 19 Sep 2025 14:59:00 -1000
From: Tejun Heo <tj@...nel.org>
To: void@...ifault.com,
arighi@...dia.com,
multics69@...il.com
Cc: linux-kernel@...r.kernel.org,
sched-ext@...ts.linux.dev,
memxor@...il.com,
bpf@...r.kernel.org,
Tejun Heo <tj@...nel.org>
Subject: [PATCH 37/46] sched_ext: Make watchdog sub-sched aware
Currently, the watchdog checks all tasks as if they are all on scx_root.
Move scx_watchdog_timeout inside scx_sched and make check_rq_for_timeouts()
use the timeout from the scx_sched associated with each task.
refresh_watchdog() is added, which sets the timer interval to half of the
shortest watchdog timeout among all scheds and arms or disarms the timer as
necessary. Every scx_sched instance gets equivalent or better detection
latency while sharing the same timer.
Signed-off-by: Tejun Heo <tj@...nel.org>
---
kernel/sched/ext.c | 74 ++++++++++++++++++++++++-------------
kernel/sched/ext_internal.h | 7 ++++
2 files changed, 56 insertions(+), 25 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3fcf6cd7fa00..4dc82afb7016 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -66,11 +66,10 @@ static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/*
- * The maximum amount of time in jiffies that a task may be runnable without
- * being scheduled on a CPU. If this timeout is exceeded, it will trigger
- * scx_error().
+ * Watchdog interval. All scx_sched's share a single watchdog timer and the
+ * interval is half of the shortest sch->watchdog_timeout.
*/
-static unsigned long scx_watchdog_timeout;
+static unsigned long scx_watchdog_interval;
/*
* The last time the delayed work was run. This delayed work relies on
@@ -2761,10 +2760,11 @@ static bool check_rq_for_timeouts(struct rq *rq)
goto out_unlock;
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
+ struct scx_sched *sch = scx_task_sched(p);
unsigned long last_runnable = p->scx.runnable_at;
if (unlikely(time_after(jiffies,
- last_runnable + scx_watchdog_timeout))) {
+ last_runnable + sch->watchdog_timeout))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -2781,6 +2781,7 @@ static bool check_rq_for_timeouts(struct rq *rq)
static void scx_watchdog_workfn(struct work_struct *work)
{
+ unsigned long intv;
int cpu;
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
@@ -2791,28 +2792,31 @@ static void scx_watchdog_workfn(struct work_struct *work)
cond_resched();
}
- queue_delayed_work(system_unbound_wq, to_delayed_work(work),
- scx_watchdog_timeout / 2);
+
+ intv = READ_ONCE(scx_watchdog_interval);
+ if (intv < ULONG_MAX)
+ queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+ intv);
}
void scx_tick(struct rq *rq)
{
- struct scx_sched *sch;
+ struct scx_sched *root;
unsigned long last_check;
if (!scx_enabled())
return;
- sch = rcu_dereference_bh(scx_root);
- if (unlikely(!sch))
+ root = rcu_dereference_bh(scx_root);
+ if (unlikely(!root))
return;
last_check = READ_ONCE(scx_watchdog_timestamp);
if (unlikely(time_after(jiffies,
- last_check + READ_ONCE(scx_watchdog_timeout)))) {
+ last_check + READ_ONCE(root->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
- scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ scx_exit(root, SCX_EXIT_ERROR_STALL, 0,
"watchdog failed to check in for %u.%03us",
dur_ms / 1000, dur_ms % 1000);
}
@@ -4108,6 +4112,26 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
}
}
+static void refresh_watchdog(void)
+{
+ struct scx_sched *sch;
+ unsigned long intv = ULONG_MAX;
+
+ /* take the shortest timeout and use its half for watchdog interval */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sch, &scx_sched_all, all)
+ intv = max(min(intv, sch->watchdog_timeout / 2), 1);
+ rcu_read_unlock();
+
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ WRITE_ONCE(scx_watchdog_interval, intv);
+
+ if (intv < ULONG_MAX)
+ mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
+ else
+ cancel_delayed_work_sync(&scx_watchdog_work);
+}
+
#ifdef CONFIG_EXT_SUB_SCHED
static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
@@ -4159,6 +4183,8 @@ static void scx_sub_disable(struct scx_sched *sch)
list_del_rcu(&sch->all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
mutex_unlock(&scx_enable_mutex);
/*
@@ -4316,12 +4342,12 @@ static void scx_root_disable(struct scx_sched *sch)
if (sch->ops.exit)
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
- cancel_delayed_work_sync(&scx_watchdog_work);
-
raw_spin_lock_irq(&scx_sched_lock);
list_del_rcu(&sch->all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
/*
* scx_root clearing must be inside cpus_read_lock(). See
* handle_hotplug().
@@ -4780,6 +4806,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
sch->ancestors[level] = sch;
sch->level = level;
+ if (ops->timeout_ms)
+ sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
+ else
+ sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
kthread_init_work(&sch->disable_work, scx_disable_workfn);
@@ -4899,7 +4930,6 @@ static int scx_root_enable(struct sched_ext_ops *ops, struct bpf_link *link)
struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
- unsigned long timeout;
int i, cpu, ret;
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
@@ -4953,6 +4983,8 @@ static int scx_root_enable(struct sched_ext_ops *ops, struct bpf_link *link)
list_add_tail_rcu(&sch->all, &scx_sched_all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
scx_idle_enable(ops);
if (sch->ops.init) {
@@ -4979,16 +5011,6 @@ static int scx_root_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (ret)
goto err_disable;
- if (ops->timeout_ms)
- timeout = msecs_to_jiffies(ops->timeout_ms);
- else
- timeout = SCX_WATCHDOG_MAX_TIMEOUT;
-
- WRITE_ONCE(scx_watchdog_timeout, timeout);
- WRITE_ONCE(scx_watchdog_timestamp, jiffies);
- queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
- scx_watchdog_timeout / 2);
-
/*
* Once __scx_enabled is set, %current can be switched to SCX anytime.
* This can lead to stalls as some BPF schedulers (e.g. userspace
@@ -5215,6 +5237,8 @@ static int scx_sub_enable(struct sched_ext_ops *ops, struct bpf_link *link)
list_add_tail_rcu(&sch->all, &scx_sched_all);
raw_spin_unlock_irq(&scx_sched_lock);
+ refresh_watchdog();
+
if (sch->level >= SCX_SUB_MAX_DEPTH) {
scx_error(sch, "max nesting depth %d violated",
SCX_SUB_MAX_DEPTH);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 8dbdae910564..4399c003c15f 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -974,6 +974,13 @@ struct scx_sched {
struct kset *sub_kset;
#endif /* CONFIG_EXT_SUB_SCHED */
+ /*
+ * The maximum amount of time in jiffies that a task may be runnable
+ * without being scheduled on a CPU. If this timeout is exceeded, it
+ * will trigger scx_error().
+ */
+ unsigned long watchdog_timeout;
+
atomic_t exit_kind;
struct scx_exit_info *exit_info;
--
2.51.0
Powered by blists - more mailing lists