[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240901005639.GE70166@maniforge>
Date: Sat, 31 Aug 2024 19:56:39 -0500
From: David Vernet <void@...ifault.com>
To: Tejun Heo <tj@...nel.org>
Cc: kernel-team@...a.com, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 04/11] sched_ext: Fix processs_ddsp_deferred_locals() by
unifying DTL_INVALID handling
On Sat, Aug 31, 2024 at 07:53:37PM -0500, David Vernet wrote:
> On Fri, Aug 30, 2024 at 07:44:46AM -1000, Tejun Heo wrote:
> > With the preceding update, the only return value which makes meaningful
> > difference is DTL_INVALID, for which one caller, finish_dispatch(), falls
> > back to the global DSQ and the other, process_ddsp_deferred_locals(),
> > doesn't do anything.
> >
> > It should always fallback to the global DSQ. Move the global DSQ fallback
> > into dispatch_to_local_dsq() and remove the return value.
> >
> > v2: Patch title and description updated to reflect the behavior fix for
> > process_ddsp_deferred_locals().
> >
> > Signed-off-by: Tejun Heo <tj@...nel.org>
> > Cc: David Vernet <void@...ifault.com>
>
> Acked-by: David Vernet <void@...ifault.com>
>
> FYI, I was able to trigger what I think is a benign pr_warn() by testing
> the failure path for DSQ_LOCAL_ON in the direct dispatch path. If I
> write a testcase that dispatches an instance of a ksoftirqd task to a
Sorry, should have been more clear: the testcase dispatched all tasks to
the wrong CPU, which is why it's a kworker in the print output below. I
believe that ksoftiqrd hit the same path as well and just wasn't printed
in the output because it lost the race to scx_bpf_error(). Let me know
if you want the testcase to repro and I can send it, or send a separate
patch to add it to selftests.
Thanks,
David
> CPU that it's not allowed to run on, I get the following:
>
> [ 14.799935] sched_ext: BPF scheduler "ddsp_local_on_invalid" enabled
> [ 14.813738] NOHZ tick-stop error: local softirq work is pending, handler #200!!!
> [ 14.829536] sched_ext: BPF scheduler "ddsp_local_on_invalid" disabled (runtime error)
> [ 14.829613] sched_ext: ddsp_local_on_invalid: SCX_DSQ_LOCAL[_ON] verdict target cpu 1 not allowed for kworker/0:2[113]
> [ 14.829707] dispatch_to_local_dsq+0x13b/0x1e0
> [ 14.829760] run_deferred+0x9d/0xe0
> [ 14.829797] ttwu_do_activate+0x10a/0x250
> [ 14.829834] try_to_wake_up+0x28c/0x530
> [ 14.829882] kick_pool+0xd6/0x160
> [ 14.829923] __queue_work+0x3f7/0x530
> [ 14.829962] call_timer_fn+0xcf/0x2a0
> [ 14.830001] __run_timer_base+0x227/0x250
> [ 14.830039] run_timer_softirq+0x45/0x60
> [ 14.830078] handle_softirqs+0x120/0x400
> [ 14.830121] __irq_exit_rcu+0x67/0xd0
> [ 14.830157] irq_exit_rcu+0xe/0x20
> [ 14.830188] sysvec_apic_timer_interrupt+0x76/0x90
> [ 14.830218] asm_sysvec_apic_timer_interrupt+0x1a/0x20
> [ 14.830254] default_idle+0x13/0x20
> [ 14.830282] default_idle_call+0x74/0xb0
> [ 14.830306] do_idle+0xef/0x280
> [ 14.830335] cpu_startup_entry+0x2a/0x30
> [ 14.830363] __pfx_kernel_init+0x0/0x10
> [ 14.830386] start_kernel+0x37c/0x3d0
> [ 14.830414] x86_64_start_reservations+0x24/0x30
> [ 14.830445] x86_64_start_kernel+0xb1/0xc0
> [ 14.830480] common_startup_64+0x13e/0x147
>
> This is from report_idle_softirq() in time/tick-sched.c, which has a
> baked-in assumption that if there's a pending softirq and we're not in
> an IRQ, then ksoftirqd must have been scheduled and we should therefore
> expect to see a need_resched() on the current core if we're on the
> can_stop_idle_tick() path.
>
> I think this is safe because we're going to be reverting back to fair.c
> at this point anyways, but we should figure out how to avoid confusing
> the idle tick path regardless.
>
> Thanks,
> David
>
> > ---
> > kernel/sched/ext.c | 41 ++++++++++-------------------------------
> > 1 file changed, 10 insertions(+), 31 deletions(-)
> >
> > --- a/kernel/sched/ext.c
> > +++ b/kernel/sched/ext.c
> > @@ -2301,12 +2301,6 @@ retry:
> > return false;
> > }
> >
> > -enum dispatch_to_local_dsq_ret {
> > - DTL_DISPATCHED, /* successfully dispatched */
> > - DTL_LOST, /* lost race to dequeue */
> > - DTL_INVALID, /* invalid local dsq_id */
> > -};
> > -
> > /**
> > * dispatch_to_local_dsq - Dispatch a task to a local dsq
> > * @rq: current rq which is locked
> > @@ -2321,9 +2315,8 @@ enum dispatch_to_local_dsq_ret {
> > * The caller must have exclusive ownership of @p (e.g. through
> > * %SCX_OPSS_DISPATCHING).
> > */
> > -static enum dispatch_to_local_dsq_ret
> > -dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
> > - struct task_struct *p, u64 enq_flags)
> > +static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
> > + struct task_struct *p, u64 enq_flags)
> > {
> > struct rq *src_rq = task_rq(p);
> > struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
> > @@ -2336,13 +2329,11 @@ dispatch_to_local_dsq(struct rq *rq, str
> > */
> > if (rq == src_rq && rq == dst_rq) {
> > dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
> > - return DTL_DISPATCHED;
> > + return;
> > }
> >
> > #ifdef CONFIG_SMP
> > if (likely(task_can_run_on_remote_rq(p, dst_rq, true))) {
> > - bool dsp;
> > -
> > /*
> > * @p is on a possibly remote @src_rq which we need to lock to
> > * move the task. If dequeue is in progress, it'd be locking
> > @@ -2367,10 +2358,8 @@ dispatch_to_local_dsq(struct rq *rq, str
> > }
> >
> > /* task_rq couldn't have changed if we're still the holding cpu */
> > - dsp = p->scx.holding_cpu == raw_smp_processor_id() &&
> > - !WARN_ON_ONCE(src_rq != task_rq(p));
> > -
> > - if (likely(dsp)) {
> > + if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
> > + !WARN_ON_ONCE(src_rq != task_rq(p))) {
> > /*
> > * If @p is staying on the same rq, there's no need to
> > * go through the full deactivate/activate cycle.
> > @@ -2398,11 +2387,11 @@ dispatch_to_local_dsq(struct rq *rq, str
> > raw_spin_rq_lock(rq);
> > }
> >
> > - return dsp ? DTL_DISPATCHED : DTL_LOST;
> > + return;
> > }
> > #endif /* CONFIG_SMP */
> >
> > - return DTL_INVALID;
> > + dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
> > }
> >
> > /**
> > @@ -2479,20 +2468,10 @@ retry:
> >
> > dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
> >
> > - if (dsq->id == SCX_DSQ_LOCAL) {
> > - switch (dispatch_to_local_dsq(rq, dsq, p, enq_flags)) {
> > - case DTL_DISPATCHED:
> > - break;
> > - case DTL_LOST:
> > - break;
> > - case DTL_INVALID:
> > - dispatch_enqueue(&scx_dsq_global, p,
> > - enq_flags | SCX_ENQ_CLEAR_OPSS);
> > - break;
> > - }
> > - } else {
> > + if (dsq->id == SCX_DSQ_LOCAL)
> > + dispatch_to_local_dsq(rq, dsq, p, enq_flags);
> > + else
> > dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
> > - }
> > }
> >
> > static void flush_dispatch_buf(struct rq *rq)
Download attachment "signature.asc" of type "application/pgp-signature" (229 bytes)
Powered by blists - more mailing lists