Message-ID: <20250920005931.2753828-47-tj@kernel.org>
Date: Fri, 19 Sep 2025 14:59:09 -1000
From: Tejun Heo <tj@...nel.org>
To: void@...ifault.com,
arighi@...dia.com,
multics69@...il.com
Cc: linux-kernel@...r.kernel.org,
sched-ext@...ts.linux.dev,
memxor@...il.com,
bpf@...r.kernel.org,
Tejun Heo <tj@...nel.org>
Subject: [PATCH 46/46] sched_ext: Add basic building blocks for nested sub-scheduler dispatching
This is an early-stage partial implementation that demonstrates the core
building blocks for nested sub-scheduler dispatching. While significant
work remains in the enqueue path and other areas, this patch establishes
the fundamental mechanisms needed for hierarchical scheduler operation.
The key building blocks introduced include:
- Private stack support for ops.dispatch() to prevent stack overflow when
walking down nested schedulers during dispatch operations
- scx_bpf_sub_dispatch() kfunc that allows parent schedulers to trigger
dispatch operations on their direct child schedulers
- Proper parent-child relationship validation to ensure dispatch requests
are only made to legitimate child schedulers
- Updated scx_dispatch_sched() to handle both nested and non-nested
invocations with appropriate kf_mask handling
The qmap scheduler is updated to demonstrate the functionality by calling
scx_bpf_sub_dispatch() on registered child schedulers when it has no
tasks in its own queues.
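For illustration, a parent scheduler's ops.dispatch() might fan out to its
direct children along the following lines. This is only a minimal sketch
mirroring the qmap change below: the parent_dispatch name, MAX_CHILDREN,
child_cgroup_ids[] (assumed to be populated from ops.sub_attach(), as qmap
does) and SHARED_DSQ are illustrative and not part of this patch, and
scx_bpf_dsq_move_to_local() stands in for "the parent has its own work".

  #include <scx/common.bpf.h>

  char _license[] SEC("license") = "GPL";

  #define SHARED_DSQ	0	/* illustrative parent-owned DSQ, created in ops.init() */
  #define MAX_CHILDREN	8

  /* cgroup IDs of attached children, assumed filled in by ops.sub_attach() */
  u64 child_cgroup_ids[MAX_CHILDREN];

  void BPF_STRUCT_OPS(parent_dispatch, s32 cpu, struct task_struct *prev)
  {
  	int i;

  	/* run the parent's own work first */
  	if (scx_bpf_dsq_move_to_local(SHARED_DSQ))
  		return;

  	/*
  	 * Nothing local to run - give each registered direct child a
  	 * chance to dispatch. Following the qmap usage below, a true
  	 * return is treated as "the child produced something to run".
  	 */
  	for (i = 0; i < MAX_CHILDREN; i++) {
  		if (child_cgroup_ids[i] &&
  		    scx_bpf_sub_dispatch(child_cgroup_ids[i]))
  			return;
  	}
  }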
Signed-off-by: Tejun Heo <tj@...nel.org>
---
kernel/sched/ext.c | 116 ++++++++++++++++++++---
kernel/sched/sched.h | 3 +
tools/sched_ext/include/scx/common.bpf.h | 2 +
tools/sched_ext/scx_qmap.bpf.c | 37 +++++++-
4 files changed, 145 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 0d865e017115..99462d0da543 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2253,8 +2253,14 @@ static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
dspc->cursor = 0;
}
-static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
- struct task_struct *prev)
+/*
+ * One user of this function is scx_bpf_sub_dispatch(), which can be called
+ * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
+ * from the call frame.
+ */
+static __always_inline bool
+scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *prev, bool nested)
{
struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
int nr_loops = SCX_DSP_MAX_LOOPS;
@@ -2280,8 +2286,23 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
do {
dspc->nr_tasks = 0;
- SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq,
- cpu_of(rq), prev_on_sch ? prev : NULL);
+ if (nested) {
+ /*
+ * If nested, don't update kf_mask as the originating
+ * invocation would already have set it up.
+ */
+ SCX_CALL_OP(sch, 0, dispatch, rq,
+ cpu_of(rq), prev_on_sch ? prev : NULL);
+ } else {
+ /*
+ * If not nested, stash @prev so that nested invocations
+ * can access it.
+ */
+ rq->scx.sub_dispatch_prev = prev;
+ SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq,
+ cpu_of(rq), prev_on_sch ? prev : NULL);
+ rq->scx.sub_dispatch_prev = NULL;
+ }
flush_dispatch_buf(sch, rq);
@@ -2314,7 +2335,7 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
static int balance_one(struct rq *rq, struct task_struct *prev)
{
- struct scx_sched *sch = scx_root, *pos;
+ struct scx_sched *sch = scx_root;
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
@@ -2358,13 +2379,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (rq->scx.local_dsq.nr)
goto has_tasks;
- /*
- * TEMPORARY - Dispatch all scheds. This will be replaced by BPF-driven
- * hierarchical operation.
- */
- list_for_each_entry_rcu(pos, &scx_sched_all, all)
- if (scx_dispatch_sched(pos, rq, prev))
- goto has_tasks;
+ if (scx_dispatch_sched(sch, rq, prev, false))
+ goto has_tasks;
/*
* Didn't find another task to run. Keep running @prev unless
@@ -5883,6 +5899,20 @@ static int bpf_scx_init_member(const struct btf_type *t,
return 0;
}
+#ifdef CONFIG_EXT_SUB_SCHED
+static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = scx_prog_sched(prog->aux);
+ if (unlikely(!sch))
+ return;
+
+ scx_error(sch, "dispatch recursion detected");
+}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
static int bpf_scx_check_member(const struct btf_type *t,
const struct btf_member *member,
const struct bpf_prog *prog)
@@ -5908,6 +5938,22 @@ static int bpf_scx_check_member(const struct btf_type *t,
return -EINVAL;
}
+#ifdef CONFIG_EXT_SUB_SCHED
+ /*
+ * Enable private stack for operations that can nest along the
+ * hierarchy.
+ *
+ * XXX - Ideally, we should only do this for scheds that allow
+ * sub-scheds and sub-scheds themselves but I don't know how to access
+ * struct_ops from here.
+ */
+ switch (moff) {
+ case offsetof(struct sched_ext_ops, dispatch):
+ prog->aux->priv_stack_requested = true;
+ prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch;
+ }
+#endif /* CONFIG_EXT_SUB_SCHED */
+
return 0;
}
@@ -6799,6 +6845,49 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
+ * @cgroup_id: cgroup ID of the child scheduler to dispatch
+ * @aux__prog: magic BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Allows a parent scheduler to trigger dispatching on one of its direct
+ * child schedulers. The child scheduler runs its dispatch operation to
+ * move tasks from dispatch queues to the local runqueue.
+ *
+ * Return: %true on success, %false if @cgroup_id is invalid, not a direct
+ * child, or the caller lacks dispatch permission.
+ */
+__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id,
+ const struct bpf_prog_aux *aux__prog)
+{
+ struct rq *this_rq = this_rq();
+ struct scx_sched *parent, *child;
+
+ guard(rcu)();
+ parent = scx_prog_sched(aux__prog);
+ if (unlikely(!parent))
+ return false;
+
+ if (!scx_kf_allowed(parent, SCX_KF_DISPATCH))
+ return false;
+
+ child = scx_find_sub_sched(cgroup_id);
+
+ if (unlikely(!child))
+ return false;
+
+ if (unlikely(scx_parent(child) != parent)) {
+ scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
+ cgroup_id);
+ return false;
+ }
+
+ return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
+ true);
+}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
@@ -6809,6 +6898,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+#ifdef CONFIG_EXT_SUB_SCHED
+BTF_ID_FLAGS(func, scx_bpf_sub_dispatch)
+#endif
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cd6bdcdf9314..0aa0caa84308 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -777,6 +777,9 @@ struct scx_rq {
cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait;
unsigned long pnt_seq;
+
+ struct task_struct *sub_dispatch_prev;
+
struct balance_callback deferred_bal_cb;
struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work;
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 6bf75f970237..a1890790d58c 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -140,6 +140,8 @@ struct cgroup *scx_bpf_task_cgroup(struct task_struct *p, const struct bpf_prog_
#define scx_bpf_task_cgroup(p) scx_bpf_task_cgroup((p), NULL)
u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
+bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux__prog) __ksym __weak;
+#define scx_bpf_sub_dispatch(cgroup_id) scx_bpf_sub_dispatch((cgroup_id), NULL)
/*
* Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 15e15cb234dc..9927cd1064e7 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -48,6 +48,9 @@ const volatile bool suppress_dump;
u64 nr_highpri_queued;
u32 test_error_cnt;
+#define MAX_SUB_SCHEDS 8
+u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+
UEI_DEFINE(uei);
struct qmap {
@@ -452,6 +455,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 0;
}
+ for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+ if (sub_sched_cgroup_ids[i] &&
+ scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+ return;
+ }
+
/*
* No other tasks. @prev will keep running. Update its core_sched_seq as
* if the task were enqueued and dispatched immediately.
@@ -877,7 +886,32 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
{
- return 0;
+ int i;
+
+ for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+ if (!sub_sched_cgroup_ids[i]) {
+ sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
+ bpf_printk("attaching sub-sched[%d] on %s",
+ i, args->cgroup_path);
+ return 0;
+ }
+ }
+
+ return -ENOSPC;
+}
+
+void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
+{
+ int i;
+
+ for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+ if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+ sub_sched_cgroup_ids[i] = 0;
+ bpf_printk("detaching sub-sched[%d] on %s",
+ i, args->cgroup_path);
+ break;
+ }
+ }
}
SCX_OPS_DEFINE(qmap_ops,
@@ -896,6 +930,7 @@ SCX_OPS_DEFINE(qmap_ops,
.cgroup_set_weight = (void *)qmap_cgroup_set_weight,
.cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
.sub_attach = (void *)qmap_sub_attach,
+ .sub_detach = (void *)qmap_sub_detach,
.cpu_online = (void *)qmap_cpu_online,
.cpu_offline = (void *)qmap_cpu_offline,
.init = (void *)qmap_init,
--
2.51.0