Message-ID: <20251008133223.GT4067720@noisy.programming.kicks-ass.net>
Date: Wed, 8 Oct 2025 15:32:23 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: John Stultz <jstultz@...gle.com>
Cc: LKML <linux-kernel@...r.kernel.org>,
Joel Fernandes <joelagnelf@...dia.com>,
Qais Yousef <qyousef@...alina.io>, Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Valentin Schneider <vschneid@...hat.com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Zimuzo Ezeozue <zezeozue@...gle.com>, Mel Gorman <mgorman@...e.de>,
Will Deacon <will@...nel.org>, Waiman Long <longman@...hat.com>,
Boqun Feng <boqun.feng@...il.com>,
"Paul E. McKenney" <paulmck@...nel.org>,
Metin Kaya <Metin.Kaya@....com>,
Xuewen Yan <xuewen.yan94@...il.com>,
K Prateek Nayak <kprateek.nayak@....com>,
Thomas Gleixner <tglx@...utronix.de>,
Daniel Lezcano <daniel.lezcano@...aro.org>,
Suleiman Souhlal <suleiman@...gle.com>,
kuyo chang <kuyo.chang@...iatek.com>, hupu <hupu.gm@...il.com>,
kernel-team@...roid.com
Subject: Re: [PATCH v22 4/6] sched: Handle blocked-waiter migration (and
return migration)

On Fri, Sep 26, 2025 at 03:29:12AM +0000, John Stultz wrote:
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 7bba05c07a79d..d063d2c9bd5aa 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3157,6 +3157,14 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
>
> __do_set_cpus_allowed(p, ctx);
>
> + /*
> + * It might be that the p->wake_cpu is no longer
> + * allowed, so set it to the dest_cpu so return
> + * migration doesn't send it to an invalid cpu
> + */

This comment isn't quite right; ->wake_cpu is a mere preference and does
not have correctness concerns. That is, it is okay for ->wake_cpu to not
be inside cpus_allowed.
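
For reference, the wakeup path already sanitizes whatever CPU ends up
getting picked; the tail of select_task_rq() does something along these
lines (paraphrased sketch, not the exact code):

	/* if the preferred pick isn't allowed, fall back to a legal CPU */
	if (unlikely(!is_cpu_allowed(p, cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

So a stale ->wake_cpu at worst gives a sub-optimal starting point for
placement, never an illegal one.
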
> + if (!is_cpu_allowed(p, p->wake_cpu))
> + p->wake_cpu = dest_cpu;
> +
> return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
>
> out:
> @@ -3717,6 +3725,72 @@ static inline void ttwu_do_wakeup(struct task_struct *p)
> trace_sched_wakeup(p);
> }
>
> +#ifdef CONFIG_SCHED_PROXY_EXEC
> +static inline void proxy_set_task_cpu(struct task_struct *p, int cpu)
> +{
> + unsigned int wake_cpu;
> +
> + /*
> + * Since we are enqueuing a blocked task on a cpu it may
> + * not be able to run on, preserve wake_cpu when we
> + * __set_task_cpu so we can return the task to where it
> + * was previously runnable.
> + */
> + wake_cpu = p->wake_cpu;
> + __set_task_cpu(p, cpu);
> + p->wake_cpu = wake_cpu;

Humm, perhaps add an argument to __set_task_cpu() to not set ->wake_cpu
instead?
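
Something like the below is what that could look like (rough sketch only;
the keep_wake_cpu argument is hypothetical and the body is simplified from
the real helper):

	static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu,
					  bool keep_wake_cpu)
	{
		set_task_rq(p, cpu);
		/* order prior per-task updates before publishing the new ->cpu */
		smp_wmb();
		WRITE_ONCE(task_thread_info(p)->cpu, cpu);
		if (!keep_wake_cpu)
			p->wake_cpu = cpu;
	}

All existing callers would pass false, and proxy_set_task_cpu() collapses
into __set_task_cpu(p, cpu, true) instead of the save/restore dance above.
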
> +}
> +
> +static bool proxy_task_runnable_but_waking(struct task_struct *p)
> +{
> + if (!sched_proxy_exec())
> + return false;
> + return (READ_ONCE(p->__state) == TASK_RUNNING &&
> + READ_ONCE(p->blocked_on_state) == BO_WAKING);
> +}
> +
> +/*
> + * Checks to see if task p has been proxy-migrated to another rq
> + * and needs to be returned. If so, we deactivate the task here
> + * so that it can be properly woken up on the p->wake_cpu
> + * (or whichever cpu select_task_rq() picks at the bottom of
> + * try_to_wake_up()).
> + */
> +static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p)
> +{
> + bool ret = false;
> +
> + if (!sched_proxy_exec())
> + return false;
> +
> + raw_spin_lock(&p->blocked_lock);
> + if (__get_task_blocked_on(p) && p->blocked_on_state == BO_WAKING) {
> + if (!task_current(rq, p) && (p->wake_cpu != cpu_of(rq))) {
> + if (task_current_donor(rq, p)) {
> + put_prev_task(rq, p);
> + rq_set_donor(rq, rq->idle);
> + }
> + deactivate_task(rq, p, DEQUEUE_NOCLOCK);
> + ret = true;
> + }
> + __set_blocked_on_runnable(p);
> + resched_curr(rq);
> + }
> + raw_spin_unlock(&p->blocked_lock);
> + return ret;
> +}
> +#else /* !CONFIG_SCHED_PROXY_EXEC */
> +static bool proxy_task_runnable_but_waking(struct task_struct *p)
> +{
> + return false;
> +}
> +
> +static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p)
> +{
> + return false;
> +}
> +#endif /* CONFIG_SCHED_PROXY_EXEC */
> +
> static void
> ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
> struct rq_flags *rf)
> @@ -3802,6 +3876,8 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
> update_rq_clock(rq);
> if (p->se.sched_delayed)
> enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
> + if (proxy_needs_return(rq, p))
> + goto out;
> if (!task_on_cpu(rq, p)) {
> /*
> * When on_rq && !on_cpu the task is preempted, see if
> @@ -3812,6 +3888,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
> ttwu_do_wakeup(p);
> ret = 1;
> }
> +out:
> __task_rq_unlock(rq, &rf);
>
> return ret;
> @@ -4199,6 +4276,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
> * it disabling IRQs (this allows not taking ->pi_lock).
> */
> WARN_ON_ONCE(p->se.sched_delayed);
> + /* If current is waking up, we know we can run here, so set BO_RUNNABLE */
> + set_blocked_on_runnable(p);
> if (!ttwu_state_match(p, state, &success))
> goto out;
>
> @@ -4215,8 +4294,15 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
> */
> scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
> smp_mb__after_spinlock();
> - if (!ttwu_state_match(p, state, &success))
> - break;
> + if (!ttwu_state_match(p, state, &success)) {
> + /*
> + * If we're already TASK_RUNNING and BO_WAKING,
> + * continue on to the ttwu_runnable() check to
> + * force proxy_needs_return() evaluation.
> + */
> + if (!proxy_task_runnable_but_waking(p))
> + break;
> + }
>
> trace_sched_waking(p);

Oh gawd :-( why !?!?

So AFAICT this makes ttwu() do dequeue when it finds WAKING, and then it
falls through to do the normal wakeup. So why can't we do dequeue to
begin with -- instead of setting WAKING in the first place?
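
Spelling out the flow I read from the hunks above (an illustration of the
call chain only, not new code):

	try_to_wake_up(p)
	  ttwu_state_match() fails: p is already TASK_RUNNING
	  ... but blocked_on_state == BO_WAKING, so we don't break out
	  ttwu_runnable(p)
	    proxy_needs_return(rq, p)
	      deactivate_task(rq, p, DEQUEUE_NOCLOCK)	/* the dequeue */
	  ... then the regular select_task_rq() / ttwu_queue() placement

If that dequeue is the whole point, doing it where BO_WAKING currently gets
set would avoid bouncing the task through the wakeup path in this special
state.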