[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240802122034.GZ12673@noisy.programming.kicks-ass.net>
Date: Fri, 2 Aug 2024 14:20:34 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Tejun Heo <tj@...nel.org>
Cc: Linus Torvalds <torvalds@...ux-foundation.org>,
linux-kernel@...r.kernel.org, David Vernet <void@...ifault.com>,
Ingo Molnar <mingo@...hat.com>, Alexei Starovoitov <ast@...nel.org>,
Thomas Gleixner <tglx@...utronix.de>
Subject: Re: [GIT PULL] sched_ext: Initial pull request for v6.11
A few more..
> +static bool scx_switching_all;
> +DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
> + WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
> + if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
> + static_branch_enable(&__scx_switched_all);
> + static_branch_disable(&__scx_switched_all);
> + WRITE_ONCE(scx_switching_all, false);
FYI the static_key contains a variable you can read if you want, see
static_key_count()/static_key_enabled(). No need to mirror the state.
> +static struct task_struct *
> +scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
> +{
> + struct task_struct *p;
> +retry:
> + scx_task_iter_rq_unlock(iter);
> +
> + while ((p = scx_task_iter_next(iter))) {
> + /*
> + * is_idle_task() tests %PF_IDLE which may not be set for CPUs
> + * which haven't yet been onlined. Test sched_class directly.
> + */
> + if (p->sched_class != &idle_sched_class)
> + break;
This isn't quite the same; please look at play_idle_precise() in
drivers/powercap/idle_inject.c.
That is, there are PF_IDLE tasks that are not idle_sched_class.
> + }
> + if (!p)
> + return NULL;
> +
> + iter->rq = task_rq_lock(p, &iter->rf);
> + iter->locked = p;
> +
> + /*
> + * If we see %TASK_DEAD, @p already disabled preemption, is about to do
> + * the final __schedule(), won't ever need to be scheduled again and can
> + * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
> + * the final __schedule() while we're locking its rq and thus will stay
> + * alive until the rq is unlocked.
> + */
> + if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD)
> + goto retry;
> +
> + return p;
> +}
> +static void update_curr_scx(struct rq *rq)
> +{
> + struct task_struct *curr = rq->curr;
> + u64 now = rq_clock_task(rq);
> + u64 delta_exec;
> +
> + if (time_before_eq64(now, curr->se.exec_start))
> + return;
> +
> + delta_exec = now - curr->se.exec_start;
> + curr->se.exec_start = now;
> + curr->se.sum_exec_runtime += delta_exec;
> + account_group_exec_runtime(curr, delta_exec);
> + cgroup_account_cputime(curr, delta_exec);
Could you please use update_curr_common() here?
This helps keep the accounting in one place. For instance, see this
patch:
https://lkml.kernel.org/r/20240727105031.053611186@infradead.org
That adds a sum_exec_runtime variant that is scaled by DVFS and
capacity.
You should be able to make the function:
u64 delta_exec = update_curr_common(rq);
> + struct task_struct *curr = rq->curr;
> +
> + if (curr->scx.slice != SCX_SLICE_INF) {
> + curr->scx.slice -= min(curr->scx.slice, delta_exec);
> + if (!curr->scx.slice)
> + touch_core_sched(rq, curr);
> + }
> +}
> +static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p,
> + u64 enq_flags)
> +{
> + struct rq *task_rq;
> +
> + lockdep_assert_rq_held(rq);
> +
> + /*
> + * If dequeue got to @p while we were trying to lock both rq's, it'd
> + * have cleared @p->scx.holding_cpu to -1. While other cpus may have
> + * updated it to different values afterwards, as this operation can't be
> + * preempted or recurse, @p->scx.holding_cpu can never become
> + * raw_smp_processor_id() again before we're done. Thus, we can tell
> + * whether we lost to dequeue by testing whether @p->scx.holding_cpu is
> + * still raw_smp_processor_id().
> + *
> + * See dispatch_dequeue() for the counterpart.
> + */
> + if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
> + return false;
> +
> + /* @p->rq couldn't have changed if we're still the holding cpu */
> + task_rq = task_rq(p);
> + lockdep_assert_rq_held(task_rq);
> +
> + WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
> + deactivate_task(task_rq, p, 0);
> + set_task_cpu(p, cpu_of(rq));
> + p->scx.sticky_cpu = cpu_of(rq);
(this *could* live in ->migrate_task_rq(), but yeah, you only have this
one site, so meh)
> +
> + /*
> + * We want to pass scx-specific enq_flags but activate_task() will
> + * truncate the upper 32 bit. As we own @rq, we can pass them through
> + * @rq->scx.extra_enq_flags instead.
> + */
> + WARN_ON_ONCE(rq->scx.extra_enq_flags);
> + rq->scx.extra_enq_flags = enq_flags;
eeew.. it's not just activate_task(), it's the whole call chain having
'int' flags. That said, we should have plenty of free bits there, no?
> + activate_task(rq, p, 0);
> + rq->scx.extra_enq_flags = 0;
> +
> + return true;
> +}
> +static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq,
> + struct task_struct *p, struct rq *task_rq)
> +{
> + bool moved = false;
> +
> + lockdep_assert_held(&dsq->lock); /* released on return */
> +
> + /*
> + * @dsq is locked and @p is on a remote rq. @p is currently protected by
> + * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab
> + * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the
> + * rq lock or fail, do a little dancing from our side. See
> + * move_task_to_local_dsq().
> + */
> + WARN_ON_ONCE(p->scx.holding_cpu >= 0);
> + task_unlink_from_dsq(p, dsq);
> + dsq_mod_nr(dsq, -1);
> + p->scx.holding_cpu = raw_smp_processor_id();
> + raw_spin_unlock(&dsq->lock);
> +
> + double_lock_balance(rq, task_rq);
> +
> + moved = move_task_to_local_dsq(rq, p, 0);
> +
> + double_unlock_balance(rq, task_rq);
> +
> + return moved;
> +}
I've gotta ask, why are you using the double_lock_balance() pattern
instead of the one in move_queued_task() that does:
lock src
dequeue src, task
set_task_cpu task, dst
unlock src
lock dst
enqueue dst, task
unlock dst
> +/*
> + * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p
> + * can be pulled to @rq.
> + */
> +static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq)
> +{
> + int cpu = cpu_of(rq);
> +
> + if (!cpumask_test_cpu(cpu, p->cpus_ptr))
> + return false;
> + if (unlikely(is_migration_disabled(p)))
> + return false;
> + if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p)))
> + return false;
> + if (!scx_rq_online(rq))
> + return false;
> + return true;
> +}
I'm a little confused, is_cpu_allowed() is used for that same purpose
no?
Powered by blists - more mailing lists