linux-kernel - Re: [RESEND][PATCH v21 4/6] sched: Handle blocked-waiter migration (and return migration)

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <befadd44-9549-4400-ab28-b8aef55cd73d@amd.com>
Date: Mon, 15 Sep 2025 15:36:50 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: John Stultz <jstultz@...gle.com>, LKML <linux-kernel@...r.kernel.org>
CC: Joel Fernandes <joelagnelf@...dia.com>, Qais Yousef <qyousef@...alina.io>,
	Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>, "Juri
 Lelli" <juri.lelli@...hat.com>, Vincent Guittot <vincent.guittot@...aro.org>,
	Dietmar Eggemann <dietmar.eggemann@....com>, Valentin Schneider
	<vschneid@...hat.com>, Steven Rostedt <rostedt@...dmis.org>, Ben Segall
	<bsegall@...gle.com>, Zimuzo Ezeozue <zezeozue@...gle.com>, Mel Gorman
	<mgorman@...e.de>, Will Deacon <will@...nel.org>, Waiman Long
	<longman@...hat.com>, Boqun Feng <boqun.feng@...il.com>, "Paul E. McKenney"
	<paulmck@...nel.org>, Metin Kaya <Metin.Kaya@....com>, Xuewen Yan
	<xuewen.yan94@...il.com>, Thomas Gleixner <tglx@...utronix.de>, "Daniel
 Lezcano" <daniel.lezcano@...aro.org>, Suleiman Souhlal <suleiman@...gle.com>,
	kuyo chang <kuyo.chang@...iatek.com>, hupu <hupu.gm@...il.com>,
	<kernel-team@...roid.com>
Subject: Re: [RESEND][PATCH v21 4/6] sched: Handle blocked-waiter migration
 (and return migration)

Hello John,

On 9/4/2025 5:51 AM, John Stultz wrote:
> @@ -6655,17 +6731,98 @@ static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
>  	return try_to_block_task(rq, donor, &state, true);
>  }
>  
> -static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
> +/*
> + * If the blocked-on relationship crosses CPUs, migrate @p to the
> + * owner's CPU.
> + *
> + * This is because we must respect the CPU affinity of execution
> + * contexts (owner) but we can ignore affinity for scheduling
> + * contexts (@p). So we have to move scheduling contexts towards
> + * potential execution contexts.
> + *
> + * Note: The owner can disappear, but simply migrate to @target_cpu
> + * and leave that CPU to sort things out.
> + */
> +static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
> +			       struct task_struct *p, int target_cpu)
>  {
> -	if (!__proxy_deactivate(rq, donor)) {
> -		/*
> -		 * XXX: For now, if deactivation failed, set donor
> -		 * as unblocked, as we aren't doing proxy-migrations
> -		 * yet (more logic will be needed then).
> -		 */
> -		force_blocked_on_runnable(donor);
> -	}
> -	return NULL;
> +	struct rq *target_rq = cpu_rq(target_cpu);
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	/*
> +	 * Since we're going to drop @rq, we have to put(@rq->donor) first,
> +	 * otherwise we have a reference that no longer belongs to us.
> +	 *
> +	 * Additionally, as we put_prev_task(prev) earlier, its possible that
> +	 * prev will migrate away as soon as we drop the rq lock, however we
> +	 * still have it marked as rq->curr, as we've not yet switched tasks.
> +	 *
> +	 * After the migration, we are going to pick_again in the __schedule
> +	 * logic, so backtrack a bit before we release the lock:
> +	 * Put rq->donor, and set rq->curr as rq->donor and set_next_task,
> +	 * so that we're close to the situation we had entering __schedule
> +	 * the first time.
> +	 *
> +	 * Then when we re-aquire the lock, we will re-put rq->curr then
> +	 * rq_set_donor(rq->idle) and set_next_task(rq->idle), before
> +	 * picking again.
> +	 */
> +	/* XXX - Added to address problems with changed dl_server semantics - double check */
> +	__put_prev_set_next_dl_server(rq, rq->donor, rq->curr);

Given we are tagging the rq->dl_server to the donor's context, should we
do:

    __put_prev_set_next_dl_server(rq, rq->donor, rq->idle);

... since we are setting rq->idle as next task and the donor?

On a side note, this can perhaps be simplified as:

    put_prev_set_next_task(rq, rq->donor, rq->idle);
    rq_set_donor(rq, rq->idle);

put_prev_set_next_task() will take care of the dl_server handoff too.

> +	put_prev_task(rq, rq->donor);
> +	rq_set_donor(rq, rq->idle);
> +	set_next_task(rq, rq->idle);
> +
> +	WARN_ON(p == rq->curr);
> +
> +	deactivate_task(rq, p, 0);
> +	proxy_set_task_cpu(p, target_cpu);
> +
> +	zap_balance_callbacks(rq);

Is this zap necessary? Given we return NULL from find_proxy_task() for
the migrate case, __schedule() would zap the callback soon. I don't see
any WARN_ON() for the balance callbacks in the unpin + repin path, so
this might not be necessary. Or am I mistaken?

> +	rq_unpin_lock(rq, rf);
> +	raw_spin_rq_unlock(rq);
> +	raw_spin_rq_lock(target_rq);
> +
> +	activate_task(target_rq, p, 0);
> +	wakeup_preempt(target_rq, p, 0);
> +
> +	raw_spin_rq_unlock(target_rq);
> +	raw_spin_rq_lock(rq);
> +	rq_repin_lock(rq, rf);
> +}
> +
> +static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
> +			       struct task_struct *p)
> +{
> +	lockdep_assert_rq_held(rq);
> +
> +	put_prev_task(rq, rq->donor);
> +	rq_set_donor(rq, rq->idle);
> +	set_next_task(rq, rq->idle);

Similar set of comments as above.

> +
> +	WARN_ON(p == rq->curr);
> +
> +	set_blocked_on_waking(p);
> +	get_task_struct(p);
> +	block_task(rq, p, 0);
> +
> +	zap_balance_callbacks(rq);
> +	rq_unpin_lock(rq, rf);
> +	raw_spin_rq_unlock(rq);
> +
> +	wake_up_process(p);
> +	put_task_struct(p);
> +
> +	raw_spin_rq_lock(rq);
> +	rq_repin_lock(rq, rf);
> +}
> +
> +static inline bool proxy_can_run_here(struct rq *rq, struct task_struct *p)
> +{
> +	if (p == rq->curr || p->wake_cpu == cpu_of(rq))
> +		return true;
> +	return false;
>  }
>  
>  /*
> @@ -6688,9 +6845,11 @@ static struct task_struct *
>  find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
>  {
>  	struct task_struct *owner = NULL;
> +	bool curr_in_chain = false;
>  	int this_cpu = cpu_of(rq);
>  	struct task_struct *p;
>  	struct mutex *mutex;
> +	int owner_cpu;
>  
>  	/* Follow blocked_on chain. */
>  	for (p = donor; task_is_blocked(p); p = owner) {
> @@ -6716,6 +6875,10 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
>  			return NULL;
>  		}
>  
> +		/* Double check blocked_on_state now we're holding the lock */
> +		if (p->blocked_on_state == BO_RUNNABLE)
> +			return p;
> +
>  		/*
>  		 * If a ww_mutex hits the die/wound case, it marks the task as
>  		 * BO_WAKING and calls try_to_wake_up(), so that the mutex
> @@ -6731,26 +6894,46 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
>  		 * try_to_wake_up from completing and doing the return
>  		 * migration.
>  		 *
> -		 * So when we hit a !BO_BLOCKED task briefly schedule idle
> -		 * so we release the rq and let the wakeup complete.
> +		 * So when we hit a BO_WAKING task try to wake it up ourselves.
>  		 */
> -		if (p->blocked_on_state != BO_BLOCKED)
> -			return proxy_resched_idle(rq);
> +		if (p->blocked_on_state == BO_WAKING) {
> +			if (task_current(rq, p)) {
> +				/* If its current just set it runnable */
> +				__force_blocked_on_runnable(p);
> +				return p;
> +			}
> +			goto needs_return;
> +		}
> +
> +		if (task_current(rq, p))
> +			curr_in_chain = true;
>  
>  		owner = __mutex_owner(mutex);
>  		if (!owner) {
> +			/* If the owner is null, we may have some work to do */
> +			if (!proxy_can_run_here(rq, p))
> +				goto needs_return;
> +
>  			__force_blocked_on_runnable(p);
>  			return p;
>  		}
>  
>  		if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
>  			/* XXX Don't handle blocked owners/delayed dequeue yet */
> +			if (curr_in_chain)
> +				return proxy_resched_idle(rq);
>  			goto deactivate_donor;
>  		}
>  
> -		if (task_cpu(owner) != this_cpu) {
> -			/* XXX Don't handle migrations yet */
> -			goto deactivate_donor;
> +		owner_cpu = task_cpu(owner);
> +		if (owner_cpu != this_cpu) {
> +			/*
> +			 * @owner can disappear, simply migrate to @owner_cpu
> +			 * and leave that CPU to sort things out.
> +			 */
> +			if (curr_in_chain)
> +				return proxy_resched_idle(rq);
> +			goto migrate;
>  		}
>  
>  		if (task_on_rq_migrating(owner)) {
> @@ -6817,8 +7000,18 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
>  	 * blocked_lock released, so we have to get out of the
>  	 * guard() scope.
>  	 */
> +migrate:
> +	proxy_migrate_task(rq, rf, p, owner_cpu);
> +	return NULL;
> +needs_return:
> +	proxy_force_return(rq, rf, p);
> +	return NULL;
>  deactivate_donor:
> -	return proxy_deactivate(rq, donor);
> +	if (!proxy_deactivate(rq, donor)) {
> +		p = donor;
> +		goto needs_return;
> +	}
> +	return NULL;
>  }
>  #else /* SCHED_PROXY_EXEC */
>  static struct task_struct *
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b173a059315c2..cc531eb939831 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8781,7 +8781,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
>  	se = &p->se;
>  
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> -	if (prev->sched_class != &fair_sched_class)
> +	if (prev->sched_class != &fair_sched_class ||
> +	    rq->curr != rq->donor)

Why is this special handling required?

>  		goto simple;
>  
>  	__put_prev_set_next_dl_server(rq, prev, p);

-- 
Thanks and Regards,
Prateek