Message-ID: <CABFh=a4CQ0dJOKKDZPVEMPbFRtmY7z=0Bn00HY7JEke35GQy0g@mail.gmail.com>
Date: Tue, 7 Oct 2025 12:04:56 -0400
From: Emil Tsalapatis <linux-lists@...alapatis.com>
To: Tejun Heo <tj@...nel.org>
Cc: David Vernet <void@...ifault.com>, Andrea Righi <arighi@...dia.com>,
Changwoo Min <changwoo@...lia.com>, linux-kernel@...r.kernel.org,
sched-ext@...ts.linux.dev
Subject: Re: [PATCH 3/4] sched_ext: Wrap kfunc args in struct to prepare for aux__prog
On Mon, Oct 6, 2025 at 9:51 PM Tejun Heo <tj@...nel.org> wrote:
>
> scx_bpf_dsq_insert_vtime() and scx_bpf_select_cpu_and() currently have 5
> parameters. An upcoming change will add an aux__prog parameter, which will
> exceed BPF's 5-argument limit.
>
> Prepare by adding new kfuncs __scx_bpf_dsq_insert_vtime() and
> __scx_bpf_select_cpu_and() that take args structs. The existing kfuncs are
> kept as compatibility wrappers. BPF programs use inline wrappers that detect
> kernel API version via bpf_core_type_exists() and use the new struct-based
> kfuncs when available, falling back to compat kfuncs otherwise. This allows
> BPF programs to work with both old and new kernels.
>
> Signed-off-by: Tejun Heo <tj@...nel.org>
Reviewed-by: Emil Tsalapatis <emil@...alapatis.com>
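One thing worth calling out for other reviewers: scheduler call sites stay
source-identical, only the plumbing underneath changes. A minimal sketch of
an unchanged ops.enqueue() call site (example_enqueue and SHARED_DSQ are
made-up names for illustration):

	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
	{
		/*
		 * Resolves to the inline wrapper in compat.bpf.h: on kernels
		 * that provide __scx_bpf_dsq_insert_vtime(), the scalar args
		 * are packed into struct scx_bpf_dsq_insert_vtime_args;
		 * otherwise the 5-argument compat kfunc is called directly.
		 */
		scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, 0, p->scx.dsq_vtime,
					 enq_flags);
	}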
> ---
> kernel/sched/ext.c | 82 ++++++++++++++++++------
> kernel/sched/ext_idle.c | 43 +++++++++++--
> tools/sched_ext/include/scx/common.bpf.h | 6 +-
> tools/sched_ext/include/scx/compat.bpf.h | 72 +++++++++++++++++++++
> 4 files changed, 173 insertions(+), 30 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 6d76efaaa9b2..a34e731229de 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -5345,54 +5345,94 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
> scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
> }
>
> +static void scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p,
> + u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags)
> +{
> + if (!scx_dsq_insert_preamble(sch, p, enq_flags))
> + return;
> +
> + if (slice)
> + p->scx.slice = slice;
> + else
> + p->scx.slice = p->scx.slice ?: 1;
> +
The ?: 1 logic is carried over from the existing code, but to spell out the
rationale: passing slice == 0 keeps the task's current slice, and if that is
also 0 the task gets a 1ns slice so it is exhausted almost immediately and
the BPF scheduler gets a chance to refill it with the right value, which
makes sense.
> + p->scx.dsq_vtime = vtime;
> +
> + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
> +}
> +
> +struct scx_bpf_dsq_insert_vtime_args {
> + /* @p can't be packed together as KF_RCU is not transitive */
> + u64 dsq_id;
> + u64 slice;
> + u64 vtime;
> + u64 enq_flags;
> +};
> +
> /**
> - * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
> + * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion
> * @p: task_struct to insert
> - * @dsq_id: DSQ to insert into
> - * @slice: duration @p can run for in nsecs, 0 to keep the current value
> - * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
> - * @enq_flags: SCX_ENQ_*
> + * @args: struct containing the rest of the arguments
> + * @args->dsq_id: DSQ to insert into
> + * @args->slice: duration @p can run for in nsecs, 0 to keep the current value
> + * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
> + * @args->enq_flags: SCX_ENQ_*
> *
> - * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id.
> - * Tasks queued into the priority queue are ordered by @vtime. All other aspects
> - * are identical to scx_bpf_dsq_insert().
> + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
> + * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided
> + * as an inline wrapper in common.bpf.h.
> *
> - * @vtime ordering is according to time_before64() which considers wrapping. A
> - * numerically larger vtime may indicate an earlier position in the ordering and
> - * vice-versa.
> + * Insert @p into the vtime priority queue of the DSQ identified by
> + * @args->dsq_id. Tasks queued into the priority queue are ordered by
> + * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert().
> + *
> + * @args->vtime ordering is according to time_before64() which considers
> + * wrapping. A numerically larger vtime may indicate an earlier position in the
> + * ordering and vice-versa.
> *
> * A DSQ can only be used as a FIFO or priority queue at any given time and this
> * function must not be called on a DSQ which already has one or more FIFO tasks
> * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
> * SCX_DSQ_GLOBAL) cannot be used as priority queues.
> */
> -__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
> - u64 slice, u64 vtime, u64 enq_flags)
> +__bpf_kfunc void
> +__scx_bpf_dsq_insert_vtime(struct task_struct *p,
> + struct scx_bpf_dsq_insert_vtime_args *args)
> {
> struct scx_sched *sch;
>
> guard(rcu)();
> +
> sch = rcu_dereference(scx_root);
> if (unlikely(!sch))
> return;
>
> - if (!scx_dsq_insert_preamble(sch, p, enq_flags))
> - return;
> + scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, args->vtime,
> + args->enq_flags);
> +}
>
> - if (slice)
> - p->scx.slice = slice;
> - else
> - p->scx.slice = p->scx.slice ?: 1;
> +/*
> + * COMPAT: Will be removed in v6.23.
> + */
> +__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
> + u64 slice, u64 vtime, u64 enq_flags)
> +{
> + struct scx_sched *sch;
>
> - p->scx.dsq_vtime = vtime;
> + guard(rcu)();
>
> - scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
> + sch = rcu_dereference(scx_root);
> + if (unlikely(!sch))
> + return;
> +
> + scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags);
> }
>
> __bpf_kfunc_end_defs();
>
> BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
> BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
> +BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU)
> BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
> BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
>
> diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
> index d2434c954848..3d9d404d5cd2 100644
> --- a/kernel/sched/ext_idle.c
> +++ b/kernel/sched/ext_idle.c
> @@ -995,26 +995,56 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
> return prev_cpu;
> }
>
> +struct scx_bpf_select_cpu_and_args {
> + /* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */
> + s32 prev_cpu;
> + u64 wake_flags;
> + u64 flags;
> +};
> +
> /**
> - * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p,
> - * prioritizing those in @cpus_allowed
> + * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask
> * @p: task_struct to select a CPU for
> - * @prev_cpu: CPU @p was on previously
> - * @wake_flags: %SCX_WAKE_* flags
> * @cpus_allowed: cpumask of allowed CPUs
> - * @flags: %SCX_PICK_IDLE* flags
> + * @args: struct containing the rest of the arguments
> + * @args->prev_cpu: CPU @p was on previously
> + * @args->wake_flags: %SCX_WAKE_* flags
> + * @args->flags: %SCX_PICK_IDLE* flags
> + *
> + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
> + * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided
> + * as an inline wrapper in common.bpf.h.
> *
> * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
> * context such as a BPF test_run() call, as long as built-in CPU selection
> * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
> * is set.
> *
> - * @p, @prev_cpu and @wake_flags match ops.select_cpu().
> + * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu().
> *
> * Returns the selected idle CPU, which will be automatically awakened upon
> * returning from ops.select_cpu() and can be used for direct dispatch, or
> * a negative value if no idle CPU is available.
> */
> +__bpf_kfunc s32
> +__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
> + struct scx_bpf_select_cpu_and_args *args)
> +{
> + struct scx_sched *sch;
> +
> + guard(rcu)();
> +
> + sch = rcu_dereference(scx_root);
> + if (unlikely(!sch))
> + return -ENODEV;
> +
> + return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags,
> + cpus_allowed, args->flags);
> +}
> +
> +/*
> + * COMPAT: Will be removed in v6.23.
> + */
> __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
> const struct cpumask *cpus_allowed, u64 flags)
> {
> @@ -1383,6 +1413,7 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
> BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
> BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
> BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
> +BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU)
> BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
> BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
> BTF_KFUNCS_END(scx_kfunc_ids_idle)
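The same holds on the idle-selection side. A minimal sketch of an
ops.select_cpu() call site (example_select_cpu is a made-up name):

	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
			   s32 prev_cpu, u64 wake_flags)
	{
		s32 cpu;

		/*
		 * On new kernels the wrapper packs prev_cpu, wake_flags and
		 * the SCX_PICK_IDLE* flags into struct
		 * scx_bpf_select_cpu_and_args; @p and the cpumask stay
		 * direct parameters since KF_RCU is not transitive through
		 * the struct.
		 */
		cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags,
					     p->cpus_ptr, 0);

		return cpu >= 0 ? cpu : prev_cpu;
	}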
> diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
> index 505231b7b7ae..b1c2a0dde76e 100644
> --- a/tools/sched_ext/include/scx/common.bpf.h
> +++ b/tools/sched_ext/include/scx/common.bpf.h
> @@ -60,10 +60,10 @@ static inline void ___vmlinux_h_sanity_check___(void)
>
> s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
> s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
> -s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
> - const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
> +s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
> + struct scx_bpf_select_cpu_and_args *args) __ksym __weak;
> void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
> -void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
> +void __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak;
> u32 scx_bpf_dispatch_nr_slots(void) __ksym;
> void scx_bpf_dispatch_cancel(void) __ksym;
> bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak;
> diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
> index d979f16a3ae2..e172de696f99 100644
> --- a/tools/sched_ext/include/scx/compat.bpf.h
> +++ b/tools/sched_ext/include/scx/compat.bpf.h
> @@ -143,6 +143,78 @@ static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu)
> return rq ? rq->curr : NULL;
> }
>
> +/*
> + * v6.19: To work around BPF maximum parameter limit, the following kfuncs are
> + * replaced with variants that pack scalar arguments in a struct. Wrappers are
> + * provided to maintain source compatibility.
> + *
> + * The kernel will carry the compat variants until v6.23 to maintain binary
> + * compatibility. After v6.23 release, remove the compat handling and move the
> + * wrappers to common.bpf.h.
> + */
> +s32 scx_bpf_select_cpu_and___compat(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
> + const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
> +void scx_bpf_dsq_insert_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
> +
> +/**
> + * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p
> + * @p: task_struct to select a CPU for
> + * @prev_cpu: CPU @p was on previously
> + * @wake_flags: %SCX_WAKE_* flags
> + * @cpus_allowed: cpumask of allowed CPUs
> + * @flags: %SCX_PICK_IDLE* flags
> + *
> + * Inline wrapper that packs scalar arguments into a struct and calls
> + * __scx_bpf_select_cpu_and(). See __scx_bpf_select_cpu_and() for details.
> + */
> +static inline s32
> +scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
> + const struct cpumask *cpus_allowed, u64 flags)
> +{
> + if (bpf_core_type_exists(struct scx_bpf_select_cpu_and_args)) {
> + struct scx_bpf_select_cpu_and_args args = {
> + .prev_cpu = prev_cpu,
> + .wake_flags = wake_flags,
> + .flags = flags,
> + };
> +
> + return __scx_bpf_select_cpu_and(p, cpus_allowed, &args);
> + } else {
> + return scx_bpf_select_cpu_and___compat(p, prev_cpu, wake_flags,
> + cpus_allowed, flags);
> + }
> +}
> +
> +/**
> + * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
> + * @p: task_struct to insert
> + * @dsq_id: DSQ to insert into
> + * @slice: duration @p can run for in nsecs, 0 to keep the current value
> + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
> + * @enq_flags: SCX_ENQ_*
> + *
> + * Inline wrapper that packs scalar arguments into a struct and calls
> + * __scx_bpf_dsq_insert_vtime(). See __scx_bpf_dsq_insert_vtime() for details.
> + */
> +static inline void
> +scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime,
> + u64 enq_flags)
> +{
> + if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) {
> + struct scx_bpf_dsq_insert_vtime_args args = {
> + .dsq_id = dsq_id,
> + .slice = slice,
> + .vtime = vtime,
> + .enq_flags = enq_flags,
> + };
> +
> + __scx_bpf_dsq_insert_vtime(p, &args);
> + } else {
> + scx_bpf_dsq_insert_vtime___compat(p, dsq_id, slice, vtime,
> + enq_flags);
> + }
> +}
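For readers less familiar with CO-RE: as I understand it, this loads fine on
kernels that predate the new kfuncs because bpf_core_type_exists() is
resolved to a constant against the running kernel's BTF at load time, and
the verifier then eliminates the branch guarded by the false constant, so
the unresolved __weak kfunc is never actually reached. Roughly:

	if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) {
		/* new kernel: constant is true, this branch survives */
	} else {
		/* old kernel: constant is false, so the branch above
		 * (and its call to the missing __weak kfunc) is pruned */
	}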
> +
> /*
> * Define sched_ext_ops. This may be expanded to define multiple variants for
> * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
> --
> 2.51.0
>
>