Message-ID: <CABFh=a4-5+SfNGTcjBP1_5bsN5HqdzoWE4F9P1K1yrNQt=k2yw@mail.gmail.com>
Date: Sat, 25 Oct 2025 19:21:47 -0400
From: Emil Tsalapatis <linux-lists@...alapatis.com>
To: Tejun Heo <tj@...nel.org>
Cc: David Vernet <void@...ifault.com>, Andrea Righi <andrea.righi@...ux.dev>,
Changwoo Min <changwoo@...lia.com>, linux-kernel@...r.kernel.org,
sched-ext@...ts.linux.dev, Peter Zijlstra <peterz@...radead.org>,
Wen-Fang Liu <liuwenfang@...or.com>
Subject: Re: [PATCH 3/3] sched_ext: Allow scx_bpf_reenqueue_local() to be
called from anywhere
On Fri, Oct 24, 2025 at 8:18 PM Tejun Heo <tj@...nel.org> wrote:
>
> The ops.cpu_acquire/release() callbacks are broken - they miss events under
> multiple conditions and can't be fixed without adding global sched core hooks
> that sched maintainers don't want. They also aren't necessary as BPF schedulers
> can use generic BPF mechanisms like tracepoints to achieve the same goals.
>
> The main use case for cpu_release() was calling scx_bpf_reenqueue_local() when
> a CPU gets preempted by a higher priority scheduling class. However, the old
> scx_bpf_reenqueue_local() could only be called from cpu_release() context.
>
> Add a new version of scx_bpf_reenqueue_local() that can be called from any
> context by deferring the actual re-enqueue operation. This eliminates the need
> for cpu_acquire/release() ops entirely. Schedulers can now use standard BPF
> mechanisms like the sched_switch tracepoint to detect and handle CPU preemption.
>
> Update scx_qmap to demonstrate the new approach using sched_switch instead of
> cpu_release, with compat support for older kernels. Mark cpu_acquire/release()
> as deprecated. The old scx_bpf_reenqueue_local() variant will be removed in
> v6.23.
>
> Reported-by: Wen-Fang Liu <liuwenfang@...or.com>
> Link: https://lore.kernel.org/all/8d64c74118c6440f81bcf5a4ac6b9f00@honor.com/
> Cc: Peter Zijlstra <peterz@...radead.org>
> Signed-off-by: Tejun Heo <tj@...nel.org>
> ---
Reviewed-by: Emil Tsalapatis <emil@...alapatis.com>
> kernel/sched/ext.c | 31 ++++++++++++++++++++
> kernel/sched/sched.h | 1 +
> tools/sched_ext/include/scx/common.bpf.h | 1 -
> tools/sched_ext/include/scx/compat.bpf.h | 23 +++++++++++++++
> tools/sched_ext/scx_qmap.bpf.c | 38 +++++++++++++++++-------
> 5 files changed, 83 insertions(+), 11 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 222222222222..333333333333 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -147,6 +147,7 @@ static struct kset *scx_kset;
> #include <trace/events/sched_ext.h>
>
> static void process_ddsp_deferred_locals(struct rq *rq);
> +static u32 reenq_local(struct rq *rq);
> static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
> static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
> s64 exit_code, const char *fmt, va_list args);
> @@ -755,6 +756,11 @@ static int ops_sanitize_err(struct scx_sched *sch, s32 ret, s32 ops_err)
> static void run_deferred(struct rq *rq)
> {
> process_ddsp_deferred_locals(rq);
> +
> + if (local_read(&rq->scx.reenq_local_deferred)) {
> + local_set(&rq->scx.reenq_local_deferred, 0);
> + reenq_local(rq);
> + }
> }
>
> static void deferred_bal_cb_workfn(struct rq *rq)
> @@ -4569,6 +4575,9 @@ static int validate_ops(struct scx_sched *sch)
> if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
> pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
>
> + if (ops->cpu_acquire || ops->cpu_release)
> + pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");
> +
> return 0;
> }
>
> @@ -5931,6 +5940,9 @@ __bpf_kfunc_start_defs();
> * Iterate over all of the tasks currently enqueued on the local DSQ of the
> * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
> * processed tasks. Can only be called from ops.cpu_release().
> + *
> + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the
> + * void-returning variant that can be called from anywhere.
> */
> __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
> {
> @@ -6490,6 +6502,24 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *data__str, u32 data__sz)
> }
>
> /**
> + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
> + *
> + * Iterate over all of the tasks currently enqueued on the local DSQ of the
> + * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
> + * anywhere.
> + */
> +__bpf_kfunc void scx_bpf_reenqueue_local___v2(void)
> +{
> + struct rq *rq;
> +
> + guard(preempt)();
> +
> + rq = this_rq();
> + local_set(&rq->scx.reenq_local_deferred, 1);
> + schedule_deferred(rq);
> +}
> +
> +/**
> * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
> * @cpu: CPU of interest
> *
> @@ -6902,6 +6932,7 @@ BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTRUCTOR)
> BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
> BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
> BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
> +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
> BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
> BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
> BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 444444444444..555555555555 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -804,6 +804,7 @@ struct scx_rq {
> cpumask_var_t cpus_to_preempt;
> cpumask_var_t cpus_to_wait;
> unsigned long kick_sync;
> + local_t reenq_local_deferred;
> struct balance_callback deferred_bal_cb;
> struct irq_work deferred_irq_work;
> struct irq_work kick_cpus_irq_work;
> diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
> index 666666666666..777777777777 100644
> --- a/tools/sched_ext/include/scx/common.bpf.h
> +++ b/tools/sched_ext/include/scx/common.bpf.h
> @@ -70,7 +70,6 @@ void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) _
> void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
> bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
> bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
> -u32 scx_bpf_reenqueue_local(void) __ksym;
> void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
> s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
> void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
> diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
> index 888888888888..999999999999 100644
> --- a/tools/sched_ext/include/scx/compat.bpf.h
> +++ b/tools/sched_ext/include/scx/compat.bpf.h
> @@ -279,6 +279,29 @@ static inline void scx_bpf_task_set_dsq_weight(struct task_struct *p, u32 weigh
> }
>
> /*
> + * v6.19: The new void variant can be called from anywhere while the older v1
> + * variant can only be called from ops.cpu_release(). The ___ suffixes on the
> + * v2 variant need to be removed once libbpf is updated to ignore ___ suffixes
> + * on the kernel side. Drop the wrapper and move the decl to common.bpf.h
> + * after v6.22.
> + */
> +u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak;
> +void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak;
> +
> +static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void)
> +{
> + return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat);
> +}
> +
> +static inline void scx_bpf_reenqueue_local(void)
> +{
> + if (__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
> + scx_bpf_reenqueue_local___v2___compat();
> + else
> + scx_bpf_reenqueue_local___v1();
> +}
> +
> +/*
> * Define sched_ext_ops. This may be expanded to define multiple variants for
> * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
> */
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index aaaaaaaaaaaa..bbbbbbbbbbbb 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -202,6 +202,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> void *ring;
> s32 cpu;
>
> + if (enq_flags & SCX_ENQ_REENQ)
> + __sync_fetch_and_add(&nr_reenqueued, 1);
> +
> if (p->flags & PF_KTHREAD) {
> if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
> return;
> @@ -529,20 +532,35 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before, struct task_struct *a,
> return task_qdist(a) > task_qdist(b);
> }
>
> -void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
> +SEC("tp_btf/sched_switch")
> +int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
> + struct task_struct *next, unsigned long prev_state)
> {
> - u32 cnt;
> + if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
> + return 0;
>
> /*
> - * Called when @cpu is taken by a higher priority scheduling class. This
> - * makes @cpu no longer available for executing sched_ext tasks. As we
> - * don't want the tasks in @cpu's local dsq to sit there until @cpu
> - * becomes available again, re-enqueue them into the global dsq. See
> - * %SCX_ENQ_REENQ handling in qmap_enqueue().
> +	 * If the CPU is taken by a higher priority scheduling class, it is no
> +	 * longer available for executing sched_ext tasks. As we don't want the
> +	 * tasks in its local dsq to sit there until the CPU becomes available
> +	 * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ
> +	 * handling in qmap_enqueue().
> */
> - cnt = scx_bpf_reenqueue_local();
> - if (cnt)
> - __sync_fetch_and_add(&nr_reenqueued, cnt);
> + switch (next->policy) {
> + case 1: /* SCHED_FIFO */
> + case 2: /* SCHED_RR */
> + case 6: /* SCHED_DEADLINE */
> + scx_bpf_reenqueue_local();
> + }
> +
> + return 0;
> +}
> +
> +void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
> +{
> + /* see qmap_sched_switch() to learn how to do this on newer kernels */
> + if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
> + scx_bpf_reenqueue_local();
> }
>
> s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
> --
> 2.47.1
>
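FWIW, for anyone converting an out-of-tree scheduler off ops.cpu_release(),
the sched_switch side boils down to something like the untested sketch below.
It mirrors the scx_qmap hunk above; the prog name is arbitrary, the compat
helpers are the ones added to compat.bpf.h by this patch, and the numeric
policy values are the UAPI SCHED_FIFO/SCHED_RR/SCHED_DEADLINE constants.

	SEC("tp_btf/sched_switch")
	int BPF_PROG(handle_sched_switch, bool preempt, struct task_struct *prev,
		     struct task_struct *next, unsigned long prev_state)
	{
		/*
		 * Kernels without the v2 kfunc only allow re-enqueueing from
		 * ops.cpu_release(), so keep that callback as a fallback and
		 * do nothing here.
		 */
		if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
			return 0;

		/*
		 * An RT or deadline task took the CPU; push the tasks sitting
		 * in the local DSQ back to the BPF scheduler.
		 */
		if (next->policy == 1 /* SCHED_FIFO */ ||
		    next->policy == 2 /* SCHED_RR */ ||
		    next->policy == 6 /* SCHED_DEADLINE */)
			scx_bpf_reenqueue_local();

		return 0;
	}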