[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240710192523.GF317151@maniforge>
Date: Wed, 10 Jul 2024 14:25:23 -0500
From: David Vernet <void@...ifault.com>
To: Tejun Heo <tj@...nel.org>
Cc: linux-kernel@...r.kernel.org, kernel-team@...a.com,
schatzberg.dan@...il.com, mingo@...hat.com, peterz@...radead.org,
changwoo@...lia.com, righi.andrea@...il.com
Subject: Re: [PATCH 6/6] sched_ext/scx_qmap: Pick idle CPU for direct
dispatch on !wakeup enqueues
On Tue, Jul 09, 2024 at 11:21:12AM -1000, Tejun Heo wrote:
> Because there was no way to directly dispatch to the local DSQ of a remote
> CPU from ops.enqueue(), scx_qmap skipped looking for an idle CPU on !wakeup
> enqueues. This restriction was removed and schbed_ext now allows
s/schbed_ext/sched_ext
> SCX_DSQ_LOCAL_ON verdicts for direct dispatches.
>
> Factor out pick_direct_dispatch_cpu() from ops.select_cpu() and use it to
> direct dispatch from ops.enqueue() on !wakeup enqueues.
>
> Signed-off-by: Tejun Heo <tj@...nel.org>
> Cc: David Vernet <void@...ifault.com>
> Cc: Dan Schatzberg <schatzberg.dan@...il.com>
> Cc: Changwoo Min <changwoo@...lia.com>
> Cc: Andrea Righi <righi.andrea@...il.com>
Hi Tejun,
This LGTM as-is, but I also left a comment below in case we want to tweak. Feel
free to just apply the tag if you'd rather not iterate given that this is just
an example scheduler.
Acked-by: David Vernet <void@...ifault.com>
> ---
> tools/sched_ext/scx_qmap.bpf.c | 39 ++++++++++++++++++++++++++--------
> tools/sched_ext/scx_qmap.c | 5 +++--
> 2 files changed, 33 insertions(+), 11 deletions(-)
>
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index 27e35066a602..892278f12dce 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -120,11 +120,26 @@ struct {
> } cpu_ctx_stor SEC(".maps");
>
> /* Statistics */
> -u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
> +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
> u64 nr_core_sched_execed;
> u32 cpuperf_min, cpuperf_avg, cpuperf_max;
> u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
>
> +static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
> +{
> + s32 cpu;
> +
> + if (p->nr_cpus_allowed == 1 ||
> + scx_bpf_test_and_clear_cpu_idle(prev_cpu))
> + return prev_cpu;
> +
> + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
> + if (cpu >= 0)
> + return cpu;
> +
> + return -1;
> +}
> +
> s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
> s32 prev_cpu, u64 wake_flags)
> {
> @@ -137,17 +152,14 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
> return -ESRCH;
> }
>
> - if (p->nr_cpus_allowed == 1 ||
> - scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
> + cpu = pick_direct_dispatch_cpu(p, prev_cpu);
> +
> + if (cpu >= 0) {
> tctx->force_local = true;
> + return cpu;
> + } else {
> return prev_cpu;
> }
> -
> - cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
> - if (cpu >= 0)
> - return cpu;
> -
> - return prev_cpu;
> }
>
> static int weight_to_idx(u32 weight)
> @@ -172,6 +184,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> u32 pid = p->pid;
> int idx = weight_to_idx(p->scx.weight);
> void *ring;
> + s32 cpu;
>
> if (p->flags & PF_KTHREAD) {
> if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
> @@ -207,6 +220,14 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> return;
> }
>
> + /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */
> + if (!(enq_flags & SCX_ENQ_WAKEUP) &&
> + (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
> + __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
> + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
> + return;
> + }
Hmm, will this be a typical pattern for how this is used? I'd expect
ops.select_cpu() and ops.enqueue() to quite often be nearly the same
implementation. Meaning you would e.g. try to find an idle core in both, and do
SCX_DSQ_LOCAL_ON, with the difference being that you'd just return the cpu and
save the extra lock juggling if you did it on the ops.select_cpu() path. Not a
huge deal given that it's just an example scheduler, but it might be a good
idea to try and mirror typical use cases for that reason as well so readers get
an idea of what a typical pattern would look like.
> +
> /*
> * If the task was re-enqueued due to the CPU being preempted by a
> * higher priority scheduling class, just re-enqueue the task directly
> diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
> index 304f0488a386..c9ca30d62b2b 100644
> --- a/tools/sched_ext/scx_qmap.c
> +++ b/tools/sched_ext/scx_qmap.c
> @@ -116,10 +116,11 @@ int main(int argc, char **argv)
> long nr_enqueued = skel->bss->nr_enqueued;
> long nr_dispatched = skel->bss->nr_dispatched;
>
> - printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64"\n",
> + printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
> nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
> skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
> - skel->bss->nr_core_sched_execed);
> + skel->bss->nr_core_sched_execed,
> + skel->bss->nr_ddsp_from_enq);
> if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
> printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
> skel->bss->cpuperf_min,
> --
> 2.45.2
>
Download attachment "signature.asc" of type "application/pgp-signature" (229 bytes)
Powered by blists - more mailing lists