linux-kernel - Re: [GIT PULL] sched_ext: Initial pull request for v6.11

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240802122034.GZ12673@noisy.programming.kicks-ass.net>
Date: Fri, 2 Aug 2024 14:20:34 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Tejun Heo <tj@...nel.org>
Cc: Linus Torvalds <torvalds@...ux-foundation.org>,
	linux-kernel@...r.kernel.org, David Vernet <void@...ifault.com>,
	Ingo Molnar <mingo@...hat.com>, Alexei Starovoitov <ast@...nel.org>,
	Thomas Gleixner <tglx@...utronix.de>
Subject: Re: [GIT PULL] sched_ext: Initial pull request for v6.11


A few more..

> +static bool scx_switching_all;
> +DEFINE_STATIC_KEY_FALSE(__scx_switched_all);

> +	WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
> +	if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
> +		static_branch_enable(&__scx_switched_all);

> +	static_branch_disable(&__scx_switched_all);
> +	WRITE_ONCE(scx_switching_all, false);

FYI, the static_key contains a variable you can read if you want; see
static_key_count()/static_key_enabled(). No need to mirror the state.


> +static struct task_struct *
> +scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
> +{
> +	struct task_struct *p;
> +retry:
> +	scx_task_iter_rq_unlock(iter);
> +
> +	while ((p = scx_task_iter_next(iter))) {
> +		/*
> +		 * is_idle_task() tests %PF_IDLE which may not be set for CPUs
> +		 * which haven't yet been onlined. Test sched_class directly.
> +		 */
> +		if (p->sched_class != &idle_sched_class)
> +			break;

This isn't quite the same; please look at play_idle_precise() in
drivers/powercap/idle_inject.c.

That is, there are PF_IDLE tasks that are not idle_sched_class.

> +	}
> +	if (!p)
> +		return NULL;
> +
> +	iter->rq = task_rq_lock(p, &iter->rf);
> +	iter->locked = p;
> +
> +	/*
> +	 * If we see %TASK_DEAD, @p already disabled preemption, is about to do
> +	 * the final __schedule(), won't ever need to be scheduled again and can
> +	 * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
> +	 * the final __schedle() while we're locking its rq and thus will stay
> +	 * alive until the rq is unlocked.
> +	 */
> +	if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD)
> +		goto retry;
> +
> +	return p;
> +}


> +static void update_curr_scx(struct rq *rq)
> +{
> +	struct task_struct *curr = rq->curr;
> +	u64 now = rq_clock_task(rq);
> +	u64 delta_exec;
> +
> +	if (time_before_eq64(now, curr->se.exec_start))
> +		return;
> +
> +	delta_exec = now - curr->se.exec_start;
> +	curr->se.exec_start = now;
> +	curr->se.sum_exec_runtime += delta_exec;
> +	account_group_exec_runtime(curr, delta_exec);
> +	cgroup_account_cputime(curr, delta_exec);

Could you please use update_curr_common() here?

This helps keep the accounting in one place. For instance, see this
patch:

  https://lkml.kernel.org/r/20240727105031.053611186@infradead.org  

That adds a sum_exec_runtime variant that is scaled by DVFS and
capacity.

You should be able to make the function:

	u64 delta_exec = update_curr_common(rq);
> +	struct task_struct *curr = rq->curr;
> +
> +	if (curr->scx.slice != SCX_SLICE_INF) {
> +		curr->scx.slice -= min(curr->scx.slice, delta_exec);
> +		if (!curr->scx.slice)
> +			touch_core_sched(rq, curr);
> +	}
> +}



> +static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p,
> +				   u64 enq_flags)
> +{
> +	struct rq *task_rq;
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	/*
> +	 * If dequeue got to @p while we were trying to lock both rq's, it'd
> +	 * have cleared @p->scx.holding_cpu to -1. While other cpus may have
> +	 * updated it to different values afterwards, as this operation can't be
> +	 * preempted or recurse, @p->scx.holding_cpu can never become
> +	 * raw_smp_processor_id() again before we're done. Thus, we can tell
> +	 * whether we lost to dequeue by testing whether @p->scx.holding_cpu is
> +	 * still raw_smp_processor_id().
> +	 *
> +	 * See dispatch_dequeue() for the counterpart.
> +	 */
> +	if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
> +		return false;
> +
> +	/* @p->rq couldn't have changed if we're still the holding cpu */
> +	task_rq = task_rq(p);
> +	lockdep_assert_rq_held(task_rq);
> +
> +	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
> +	deactivate_task(task_rq, p, 0);
> +	set_task_cpu(p, cpu_of(rq));
> +	p->scx.sticky_cpu = cpu_of(rq);

(this *could* live in ->migrate_task_rq(), but yeah, you only have this
one site, so meh)

> +
> +	/*
> +	 * We want to pass scx-specific enq_flags but activate_task() will
> +	 * truncate the upper 32 bit. As we own @rq, we can pass them through
> +	 * @rq->scx.extra_enq_flags instead.
> +	 */
> +	WARN_ON_ONCE(rq->scx.extra_enq_flags);
> +	rq->scx.extra_enq_flags = enq_flags;

eeew.. it's not just activate_task(), it's the whole callchain having
'int' flags. That said, we should be having plenty free bits there, no?

> +	activate_task(rq, p, 0);
> +	rq->scx.extra_enq_flags = 0;
> +
> +	return true;
> +}

> +static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq,
> +				struct task_struct *p, struct rq *task_rq)
> +{
> +	bool moved = false;
> +
> +	lockdep_assert_held(&dsq->lock);	/* released on return */
> +
> +	/*
> +	 * @dsq is locked and @p is on a remote rq. @p is currently protected by
> +	 * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab
> +	 * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the
> +	 * rq lock or fail, do a little dancing from our side. See
> +	 * move_task_to_local_dsq().
> +	 */
> +	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
> +	task_unlink_from_dsq(p, dsq);
> +	dsq_mod_nr(dsq, -1);
> +	p->scx.holding_cpu = raw_smp_processor_id();
> +	raw_spin_unlock(&dsq->lock);
> +
> +	double_lock_balance(rq, task_rq);
> +
> +	moved = move_task_to_local_dsq(rq, p, 0);
> +
> +	double_unlock_balance(rq, task_rq);
> +
> +	return moved;
> +}

I've gotta ask, why are you using the double_lock_balance() pattern
instead of the one in move_queued_task() that does:

  lock src
  dequeue src, task
  set_task_cpu task, dst
  unlock src

  lock dst
  enqueue dst, task
  unlock dst



> +/*
> + * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p
> + * can be pulled to @rq.
> + */
> +static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq)
> +{
> +	int cpu = cpu_of(rq);
> +
> +	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
> +		return false;
> +	if (unlikely(is_migration_disabled(p)))
> +		return false;
> +	if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p)))
> +		return false;
> +	if (!scx_rq_online(rq))
> +		return false;
> +	return true;
> +}

I'm a little confused; is_cpu_allowed() is used for that same purpose,
no?