Message-ID: <df4e667a-e322-4124-a9c1-57d6f58e6f0a@amd.com>
Date: Tue, 30 Dec 2025 11:03:54 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: John Stultz <jstultz@...gle.com>, LKML <linux-kernel@...r.kernel.org>
CC: Joel Fernandes <joelagnelf@...dia.com>, Qais Yousef <qyousef@...alina.io>,
Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>, "Juri
Lelli" <juri.lelli@...hat.com>, Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>, Valentin Schneider
<vschneid@...hat.com>, Steven Rostedt <rostedt@...dmis.org>, Ben Segall
<bsegall@...gle.com>, Zimuzo Ezeozue <zezeozue@...gle.com>, Mel Gorman
<mgorman@...e.de>, Will Deacon <will@...nel.org>, Waiman Long
<longman@...hat.com>, Boqun Feng <boqun.feng@...il.com>, "Paul E. McKenney"
<paulmck@...nel.org>, Metin Kaya <Metin.Kaya@....com>, Xuewen Yan
<xuewen.yan94@...il.com>, Thomas Gleixner <tglx@...utronix.de>, "Daniel
Lezcano" <daniel.lezcano@...aro.org>, Suleiman Souhlal <suleiman@...gle.com>,
kuyo chang <kuyo.chang@...iatek.com>, hupu <hupu.gm@...il.com>,
<kernel-team@...roid.com>
Subject: Re: [PATCH v24 06/11] sched: Handle blocked-waiter migration (and
return migration)
Hello John,
On 11/25/2025 4:00 AM, John Stultz wrote:
> -static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
> +/*
> + * If the blocked-on relationship crosses CPUs, migrate @p to the
> + * owner's CPU.
> + *
> + * This is because we must respect the CPU affinity of execution
> + * contexts (owner) but we can ignore affinity for scheduling
> + * contexts (@p). So we have to move scheduling contexts towards
> + * potential execution contexts.
> + *
> + * Note: The owner can disappear, but simply migrate to @target_cpu
> + * and leave that CPU to sort things out.
> + */
> +static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
> + struct task_struct *p, int target_cpu)
> {
> - if (!__proxy_deactivate(rq, donor)) {
> - /*
> - * XXX: For now, if deactivation failed, set donor
> - * as unblocked, as we aren't doing proxy-migrations
> - * yet (more logic will be needed then).
> - */
> - clear_task_blocked_on(donor, NULL);
> + struct rq *target_rq = cpu_rq(target_cpu);
> +
> + lockdep_assert_rq_held(rq);
> +
> + /*
> + * Since we're going to drop @rq, we have to put(@rq->donor) first,
> + * otherwise we have a reference that no longer belongs to us.
> + *
> + * Additionally, as we put_prev_task(prev) earlier, its possible that
> + * prev will migrate away as soon as we drop the rq lock, however we
> + * still have it marked as rq->curr, as we've not yet switched tasks.
> + *
> + * So call proxy_resched_idle() to let go of the references before
> + * we release the lock.
> + */
> + proxy_resched_idle(rq);
> +
> + WARN_ON(p == rq->curr);
> +
> + deactivate_task(rq, p, DEQUEUE_NOCLOCK);
> + proxy_set_task_cpu(p, target_cpu);
> +
> + /*
> + * We have to zap callbacks before unlocking the rq
> + * as another CPU may jump in and call sched_balance_rq
> + * which can trip the warning in rq_pin_lock() if we
> + * leave callbacks set.
> + */
> + zap_balance_callbacks(rq);
> + rq_unpin_lock(rq, rf);
> + raw_spin_rq_unlock(rq);
> +
> + raw_spin_rq_lock(target_rq);
> + activate_task(target_rq, p, 0);
> + wakeup_preempt(target_rq, p, 0);
> + raw_spin_rq_unlock(target_rq);
I'll leave my suggestion for this on Patch 11 but for now ...
> +
> + raw_spin_rq_lock(rq);
> + rq_repin_lock(rq, rf);
> + update_rq_clock(rq);
I'm tempted to suggest extracting this pattern into a guard. We always seem
to do the zap + unpin + unlock somewhere in the middle and the lock + repin +
update clock at the end of the function, so how about the following:
(Build and boot tested with a test-ww_mutex run; the open-coded pattern is
inspired by the one for perf_ctx_lock in kernel/events/core.c.)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4c5493b0ad21..52dfa63327fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6600,6 +6600,44 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
}
#ifdef CONFIG_SCHED_PROXY_EXEC
+typedef struct {
+ struct rq *rq;
+ struct rq_flags *rf;
+} class_proxy_drop_reacquire_rq_lock_t;
+
+static void __proxy_drop_rq_lock(struct rq *rq, struct rq_flags *rf)
+{
+ /*
+ * We have to zap callbacks before unlocking the rq
+ * as another CPU may jump in and call sched_balance_rq
+ * which can trip the warning in rq_pin_lock() if we
+ * leave callbacks set.
+ */
+ zap_balance_callbacks(rq);
+ rq_unpin_lock(rq, rf);
+ raw_spin_rq_unlock(rq);
+}
+
+static void __proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf)
+{
+ raw_spin_rq_lock(rq);
+ rq_repin_lock(rq, rf);
+ update_rq_clock(rq);
+}
+
+static inline void
+class_proxy_drop_reacquire_rq_lock_destructor(class_proxy_drop_reacquire_rq_lock_t *_T)
+{
+ __proxy_reacquire_rq_lock(_T->rq, _T->rf);
+}
+
+static inline class_proxy_drop_reacquire_rq_lock_t
+class_proxy_drop_reacquire_rq_lock_constructor(struct rq *rq, struct rq_flags *rf)
+{
+ __proxy_drop_rq_lock(rq, rf);
+ return (class_proxy_drop_reacquire_rq_lock_t){ rq, rf };
+}
+
static inline struct task_struct *proxy_resched_idle(struct rq *rq)
{
put_prev_set_next_task(rq, rq->donor, rq->idle);
@@ -6666,23 +6704,15 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
proxy_set_task_cpu(p, target_cpu);
/*
- * We have to zap callbacks before unlocking the rq
- * as another CPU may jump in and call sched_balance_rq
- * which can trip the warning in rq_pin_lock() if we
- * leave callbacks set.
+ * Zap balance callbacks and drop the rq_lock
+ * until the end of the scope.
*/
- zap_balance_callbacks(rq);
- rq_unpin_lock(rq, rf);
- raw_spin_rq_unlock(rq);
+ guard(proxy_drop_reacquire_rq_lock)(rq, rf);
raw_spin_rq_lock(target_rq);
activate_task(target_rq, p, 0);
wakeup_preempt(target_rq, p, 0);
raw_spin_rq_unlock(target_rq);
-
- raw_spin_rq_lock(rq);
- rq_repin_lock(rq, rf);
- update_rq_clock(rq);
}
static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
---
If the guard obfuscates the flow too much, I'm happy with the current
approach as well, but having the guard plus a comment on top of its usage
is a lot easier on the eyes IMHO :-)
It also simplifies proxy_force_return(), if my next set of comments isn't
too horrible. I'll leave it to you and Peter to decide which is best.
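FWIW, the open-coded constructor/destructor pair could probably also be
collapsed with DEFINE_CLASS() from include/linux/cleanup.h. Untested sketch
(the proxy_rq_guard_t name is purely illustrative):

typedef struct {
	struct rq	*rq;
	struct rq_flags	*rf;
} proxy_rq_guard_t;

DEFINE_CLASS(proxy_drop_reacquire_rq_lock, proxy_rq_guard_t,
	     /* destructor: re-take the rq lock, repin and update the clock */
	     __proxy_reacquire_rq_lock(_T.rq, _T.rf),
	     /* constructor: zap callbacks, unpin and drop the rq lock */
	     ({ __proxy_drop_rq_lock(rq, rf); (proxy_rq_guard_t){ rq, rf }; }),
	     struct rq *rq, struct rq_flags *rf)

guard(proxy_drop_reacquire_rq_lock)(rq, rf) should then work the same way as
with the open-coded helpers above.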
> +}
> +
> +static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
> + struct task_struct *p)
> +{
> + struct rq *this_rq, *target_rq;
> + struct rq_flags this_rf;
> + int cpu, wake_flag = 0;
Should we pretend this is a wakeup and pass WF_TTWU? There are some
affine-wakeup considerations that select_task_rq_fair() only applies with
WF_TTWU. Without it, we'll end up selecting the wake_cpu's LLC to search
for the wakeup target.
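Something like this at the top of proxy_force_return(), i.e. (just a sketch;
the full diff further down folds this in as well):

	int cpu, wake_flag = WF_TTWU;
	...
	cpu = select_task_rq(p, p->wake_cpu, &wake_flag);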
> +
> + lockdep_assert_rq_held(rq);
> + WARN_ON(p == rq->curr);
> +
> + get_task_struct(p);
Since we run with IRQs disabled (equivalent to an RCU read-side section),
I don't think we need to grab an extra reference here: the task_struct
cannot disappear from under us, and we take the necessary locks to confirm
the task's rq association, so we shouldn't race with anything either.
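If the get/put pair goes away, we could document the assumption right
there, e.g. (sketch only):

	/* IRQs off acts as an RCU read-side section: p cannot be freed under us */
	lockdep_assert_irqs_disabled();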
> +
> + /*
> + * We have to zap callbacks before unlocking the rq
> + * as another CPU may jump in and call sched_balance_rq
> + * which can trip the warning in rq_pin_lock() if we
> + * leave callbacks set.
> + */
> + zap_balance_callbacks(rq);
> + rq_unpin_lock(rq, rf);
> + raw_spin_rq_unlock(rq);
> +
> + /*
> + * We drop the rq lock, and re-grab task_rq_lock to get
> + * the pi_lock (needed for select_task_rq) as well.
> + */
> + this_rq = task_rq_lock(p, &this_rf);
So I'm failing to see why we need to drop the rq_lock and re-grab it via
task_rq_lock() when we are eventually going to do a deactivate_task().
Once we deactivate, the wakeup path will stall at ttwu_runnable() since it
cannot grab __task_rq_lock() while the task is task_on_rq_migrating(), so
we should be able to safely move the task around.
Maybe my naive eyes are missing something (and perhaps there are details
that arrive with the sleeping-owner stuff), but the following boots fine
and survives a sched-messaging and a test-ww_mutex run without any splat
on my machine at this point:
(It includes bits from the comments above to give those a spin.)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 52dfa63327fc..2a4e73c807e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6718,86 +6718,31 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
struct task_struct *p)
{
- struct rq *this_rq, *target_rq;
- struct rq_flags this_rf;
- int cpu, wake_flag = 0;
+ int target_cpu, wake_flag = WF_TTWU;
+ struct rq *target_rq;
lockdep_assert_rq_held(rq);
WARN_ON(p == rq->curr);
- get_task_struct(p);
-
- /*
- * We have to zap callbacks before unlocking the rq
- * as another CPU may jump in and call sched_balance_rq
- * which can trip the warning in rq_pin_lock() if we
- * leave callbacks set.
- */
- zap_balance_callbacks(rq);
- rq_unpin_lock(rq, rf);
- raw_spin_rq_unlock(rq);
-
- /*
- * We drop the rq lock, and re-grab task_rq_lock to get
- * the pi_lock (needed for select_task_rq) as well.
- */
- this_rq = task_rq_lock(p, &this_rf);
+ proxy_resched_idle(rq);
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
- /*
- * Since we let go of the rq lock, the task may have been
- * woken or migrated to another rq before we got the
- * task_rq_lock. So re-check we're on the same RQ. If
- * not, the task has already been migrated and that CPU
- * will handle any futher migrations.
- */
- if (this_rq != rq)
- goto err_out;
+ target_cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
+ target_rq = cpu_rq(target_cpu);
+ set_task_cpu(p, target_cpu);
- /* Similarly, if we've been dequeued, someone else will wake us */
- if (!task_on_rq_queued(p))
- goto err_out;
+ clear_task_blocked_on(p, NULL);
/*
- * Since we should only be calling here from __schedule()
- * -> find_proxy_task(), no one else should have
- * assigned current out from under us. But check and warn
- * if we see this, then bail.
+ * Zap balance callbacks and drop the rq_lock
+ * until the end of the scope.
*/
- if (task_current(this_rq, p) || task_on_cpu(this_rq, p)) {
- WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n",
- __func__, cpu_of(this_rq),
- p->comm, p->pid, p->on_cpu);
- goto err_out;
- }
-
- update_rq_clock(this_rq);
- proxy_resched_idle(this_rq);
- deactivate_task(this_rq, p, DEQUEUE_NOCLOCK);
- cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
- set_task_cpu(p, cpu);
- target_rq = cpu_rq(cpu);
- clear_task_blocked_on(p, NULL);
- task_rq_unlock(this_rq, p, &this_rf);
+ guard(proxy_drop_reacquire_rq_lock)(rq, rf);
- /* Drop this_rq and grab target_rq for activation */
raw_spin_rq_lock(target_rq);
activate_task(target_rq, p, 0);
wakeup_preempt(target_rq, p, 0);
- put_task_struct(p);
raw_spin_rq_unlock(target_rq);
-
- /* Finally, re-grab the origianl rq lock and return to pick-again */
- raw_spin_rq_lock(rq);
- rq_repin_lock(rq, rf);
- update_rq_clock(rq);
- return;
-
-err_out:
- task_rq_unlock(this_rq, p, &this_rf);
- put_task_struct(p);
- raw_spin_rq_lock(rq);
- rq_repin_lock(rq, rf);
- update_rq_clock(rq);
}
/*
---
Thoughts?
Also, we may want to just move the attach_one_task() helper to sched.h
and use it in both proxy_migrate_task() and proxy_force_return().
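If memory serves, attach_one_task() in kernel/sched/fair.c is just the
following (quoting from memory, so please double-check):

	static void attach_one_task(struct rq *rq, struct task_struct *p)
	{
		struct rq_flags rf;

		rq_lock(rq, &rf);
		update_rq_clock(rq);
		attach_task(rq, p);	/* activate_task() + wakeup_preempt() */
		rq_unlock(rq, &rf);
	}

so the lock + activate_task() + wakeup_preempt() + unlock tail in both
functions would collapse into a single attach_one_task(target_rq, p) call.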
> +
> + /*
> + * Since we let go of the rq lock, the task may have been
> + * woken or migrated to another rq before we got the
> + * task_rq_lock. So re-check we're on the same RQ. If
> + * not, the task has already been migrated and that CPU
> + * will handle any futher migrations.
> + */
> + if (this_rq != rq)
> + goto err_out;
> +
> + /* Similarly, if we've been dequeued, someone else will wake us */
> + if (!task_on_rq_queued(p))
> + goto err_out;
> +
> + /*
> + * Since we should only be calling here from __schedule()
> + * -> find_proxy_task(), no one else should have
> + * assigned current out from under us. But check and warn
> + * if we see this, then bail.
> + */
> + if (task_current(this_rq, p) || task_on_cpu(this_rq, p)) {
> + WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n",
> + __func__, cpu_of(this_rq),
> + p->comm, p->pid, p->on_cpu);
> + goto err_out;
> }
> - return NULL;
> +
> + update_rq_clock(this_rq);
> + proxy_resched_idle(this_rq);
> + deactivate_task(this_rq, p, DEQUEUE_NOCLOCK);
> + cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
> + set_task_cpu(p, cpu);
> + target_rq = cpu_rq(cpu);
> + clear_task_blocked_on(p, NULL);
> + task_rq_unlock(this_rq, p, &this_rf);
> +
> + /* Drop this_rq and grab target_rq for activation */
> + raw_spin_rq_lock(target_rq);
> + activate_task(target_rq, p, 0);
> + wakeup_preempt(target_rq, p, 0);
> + put_task_struct(p);
> + raw_spin_rq_unlock(target_rq);
> +
> + /* Finally, re-grab the origianl rq lock and return to pick-again */
> + raw_spin_rq_lock(rq);
> + rq_repin_lock(rq, rf);
> + update_rq_clock(rq);
> + return;
> +
> +err_out:
> + task_rq_unlock(this_rq, p, &this_rf);
> + put_task_struct(p);
> + raw_spin_rq_lock(rq);
> + rq_repin_lock(rq, rf);
> + update_rq_clock(rq);
> }
>
> /*
> @@ -6650,11 +6789,13 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d
> static struct task_struct *
> find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
> {
> - enum { FOUND, DEACTIVATE_DONOR } action = FOUND;
> + enum { FOUND, DEACTIVATE_DONOR, MIGRATE, NEEDS_RETURN } action = FOUND;
> struct task_struct *owner = NULL;
> + bool curr_in_chain = false;
> int this_cpu = cpu_of(rq);
> struct task_struct *p;
> struct mutex *mutex;
> + int owner_cpu;
>
> /* Follow blocked_on chain. */
> for (p = donor; task_is_blocked(p); p = owner) {
> @@ -6663,9 +6804,15 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
> if (!mutex)
> return NULL;
>
> - /* if its PROXY_WAKING, resched_idle so ttwu can complete */
> - if (mutex == PROXY_WAKING)
> - return proxy_resched_idle(rq);
> + /* if its PROXY_WAKING, do return migration or run if current */
> + if (mutex == PROXY_WAKING) {
> + if (task_current(rq, p)) {
> + clear_task_blocked_on(p, PROXY_WAKING);
> + return p;
> + }
> + action = NEEDS_RETURN;
> + break;
> + }
>
> /*
> * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
> @@ -6685,26 +6832,41 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
> return NULL;
> }
>
> + if (task_current(rq, p))
> + curr_in_chain = true;
> +
> owner = __mutex_owner(mutex);
> if (!owner) {
> /*
> - * If there is no owner, clear blocked_on
> - * and return p so it can run and try to
> - * acquire the lock
> + * If there is no owner, either clear blocked_on
> + * and return p (if it is current and safe to
> + * just run on this rq), or return-migrate the task.
> */
> - __clear_task_blocked_on(p, mutex);
> - return p;
> + if (task_current(rq, p)) {
> + __clear_task_blocked_on(p, NULL);
If my suggestion for proxy_force_return() isn't downright horrible, we
can unconditionally clear the p->blocked_on relation on both NEEDS_RETURN
paths, since we won't drop the rq_lock before the deactivate (the jury is
still out on that).
> + return p;
> + }
> + action = NEEDS_RETURN;
> + break;
> }
>
--
Thanks and Regards,
Prateek