[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <befadd44-9549-4400-ab28-b8aef55cd73d@amd.com>
Date: Mon, 15 Sep 2025 15:36:50 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: John Stultz <jstultz@...gle.com>, LKML <linux-kernel@...r.kernel.org>
CC: Joel Fernandes <joelagnelf@...dia.com>, Qais Yousef <qyousef@...alina.io>,
Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>, "Juri
Lelli" <juri.lelli@...hat.com>, Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>, Valentin Schneider
<vschneid@...hat.com>, Steven Rostedt <rostedt@...dmis.org>, Ben Segall
<bsegall@...gle.com>, Zimuzo Ezeozue <zezeozue@...gle.com>, Mel Gorman
<mgorman@...e.de>, Will Deacon <will@...nel.org>, Waiman Long
<longman@...hat.com>, Boqun Feng <boqun.feng@...il.com>, "Paul E. McKenney"
<paulmck@...nel.org>, Metin Kaya <Metin.Kaya@....com>, Xuewen Yan
<xuewen.yan94@...il.com>, Thomas Gleixner <tglx@...utronix.de>, "Daniel
Lezcano" <daniel.lezcano@...aro.org>, Suleiman Souhlal <suleiman@...gle.com>,
kuyo chang <kuyo.chang@...iatek.com>, hupu <hupu.gm@...il.com>,
<kernel-team@...roid.com>
Subject: Re: [RESEND][PATCH v21 4/6] sched: Handle blocked-waiter migration
(and return migration)
Hello John,
On 9/4/2025 5:51 AM, John Stultz wrote:
> @@ -6655,17 +6731,98 @@ static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
> return try_to_block_task(rq, donor, &state, true);
> }
>
> -static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
> +/*
> + * If the blocked-on relationship crosses CPUs, migrate @p to the
> + * owner's CPU.
> + *
> + * This is because we must respect the CPU affinity of execution
> + * contexts (owner) but we can ignore affinity for scheduling
> + * contexts (@p). So we have to move scheduling contexts towards
> + * potential execution contexts.
> + *
> + * Note: The owner can disappear, but simply migrate to @target_cpu
> + * and leave that CPU to sort things out.
> + */
> +static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
> + struct task_struct *p, int target_cpu)
> {
> - if (!__proxy_deactivate(rq, donor)) {
> - /*
> - * XXX: For now, if deactivation failed, set donor
> - * as unblocked, as we aren't doing proxy-migrations
> - * yet (more logic will be needed then).
> - */
> - force_blocked_on_runnable(donor);
> - }
> - return NULL;
> + struct rq *target_rq = cpu_rq(target_cpu);
> +
> + lockdep_assert_rq_held(rq);
> +
> + /*
> + * Since we're going to drop @rq, we have to put(@rq->donor) first,
> + * otherwise we have a reference that no longer belongs to us.
> + *
> + * Additionally, as we put_prev_task(prev) earlier, its possible that
> + * prev will migrate away as soon as we drop the rq lock, however we
> + * still have it marked as rq->curr, as we've not yet switched tasks.
> + *
> + * After the migration, we are going to pick_again in the __schedule
> + * logic, so backtrack a bit before we release the lock:
> + * Put rq->donor, and set rq->curr as rq->donor and set_next_task,
> + * so that we're close to the situation we had entering __schedule
> + * the first time.
> + *
> + * Then when we re-aquire the lock, we will re-put rq->curr then
> + * rq_set_donor(rq->idle) and set_next_task(rq->idle), before
> + * picking again.
> + */
> + /* XXX - Added to address problems with changed dl_server semantics - double check */
> + __put_prev_set_next_dl_server(rq, rq->donor, rq->curr);
Given we are tagging the rq->dl_server to the donor's context, should we
do:
__put_prev_set_next_dl_server(rq, rq->donor, rq->idle);
... since we are setting rq->idle as next task and the donor?
On a side note, this can perhaps be simplified as:
put_prev_set_next_task(rq, rq->donor, rq->idle);
rq_set_donor(rq, rq->idle);
put_prev_set_next_task() will take are of the dl_server handoff too.
> + put_prev_task(rq, rq->donor);
> + rq_set_donor(rq, rq->idle);
> + set_next_task(rq, rq->idle);
> +
> + WARN_ON(p == rq->curr);
> +
> + deactivate_task(rq, p, 0);
> + proxy_set_task_cpu(p, target_cpu);
> +
> + zap_balance_callbacks(rq);
Is this zap necessary? Given we return NULL from find_proxy_task() for
migrate case, __schedule() would zap the callback soon. I don't see
any WARN_ON() for the balance callbacks in the unpin + repin path so
this might not be necessary or am I mistaken?
> + rq_unpin_lock(rq, rf);
> + raw_spin_rq_unlock(rq);
> + raw_spin_rq_lock(target_rq);
> +
> + activate_task(target_rq, p, 0);
> + wakeup_preempt(target_rq, p, 0);
> +
> + raw_spin_rq_unlock(target_rq);
> + raw_spin_rq_lock(rq);
> + rq_repin_lock(rq, rf);
> +}
> +
> +static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
> + struct task_struct *p)
> +{
> + lockdep_assert_rq_held(rq);
> +
> + put_prev_task(rq, rq->donor);
> + rq_set_donor(rq, rq->idle);
> + set_next_task(rq, rq->idle);
Similar set of comments as above.
> +
> + WARN_ON(p == rq->curr);
> +
> + set_blocked_on_waking(p);
> + get_task_struct(p);
> + block_task(rq, p, 0);
> +
> + zap_balance_callbacks(rq);
> + rq_unpin_lock(rq, rf);
> + raw_spin_rq_unlock(rq);
> +
> + wake_up_process(p);
> + put_task_struct(p);
> +
> + raw_spin_rq_lock(rq);
> + rq_repin_lock(rq, rf);
> +}
> +
> +static inline bool proxy_can_run_here(struct rq *rq, struct task_struct *p)
> +{
> + if (p == rq->curr || p->wake_cpu == cpu_of(rq))
> + return true;
> + return false;
> }
>
> /*
> @@ -6688,9 +6845,11 @@ static struct task_struct *
> find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
> {
> struct task_struct *owner = NULL;
> + bool curr_in_chain = false;
> int this_cpu = cpu_of(rq);
> struct task_struct *p;
> struct mutex *mutex;
> + int owner_cpu;
>
> /* Follow blocked_on chain. */
> for (p = donor; task_is_blocked(p); p = owner) {
> @@ -6716,6 +6875,10 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
> return NULL;
> }
>
> + /* Double check blocked_on_state now we're holding the lock */
> + if (p->blocked_on_state == BO_RUNNABLE)
> + return p;
> +
> /*
> * If a ww_mutex hits the die/wound case, it marks the task as
> * BO_WAKING and calls try_to_wake_up(), so that the mutex
> @@ -6731,26 +6894,46 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
> * try_to_wake_up from completing and doing the return
> * migration.
> *
> - * So when we hit a !BO_BLOCKED task briefly schedule idle
> - * so we release the rq and let the wakeup complete.
> + * So when we hit a BO_WAKING task try to wake it up ourselves.
> */
> - if (p->blocked_on_state != BO_BLOCKED)
> - return proxy_resched_idle(rq);
> + if (p->blocked_on_state == BO_WAKING) {
> + if (task_current(rq, p)) {
> + /* If its current just set it runnable */
> + __force_blocked_on_runnable(p);
> + return p;
> + }
> + goto needs_return;
> + }
> +
> + if (task_current(rq, p))
> + curr_in_chain = true;
>
> owner = __mutex_owner(mutex);
> if (!owner) {
> + /* If the owner is null, we may have some work to do */
> + if (!proxy_can_run_here(rq, p))
> + goto needs_return;
> +
> __force_blocked_on_runnable(p);
> return p;
> }
>
> if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
> /* XXX Don't handle blocked owners/delayed dequeue yet */
> + if (curr_in_chain)
> + return proxy_resched_idle(rq);
> goto deactivate_donor;
> }
>
> - if (task_cpu(owner) != this_cpu) {
> - /* XXX Don't handle migrations yet */
> - goto deactivate_donor;
> + owner_cpu = task_cpu(owner);
> + if (owner_cpu != this_cpu) {
> + /*
> + * @owner can disappear, simply migrate to @owner_cpu
> + * and leave that CPU to sort things out.
> + */
> + if (curr_in_chain)
> + return proxy_resched_idle(rq);
> + goto migrate;
> }
>
> if (task_on_rq_migrating(owner)) {
> @@ -6817,8 +7000,18 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
> * blocked_lock released, so we have to get out of the
> * guard() scope.
> */
> +migrate:
> + proxy_migrate_task(rq, rf, p, owner_cpu);
> + return NULL;
> +needs_return:
> + proxy_force_return(rq, rf, p);
> + return NULL;
> deactivate_donor:
> - return proxy_deactivate(rq, donor);
> + if (!proxy_deactivate(rq, donor)) {
> + p = donor;
> + goto needs_return;
> + }
> + return NULL;
> }
> #else /* SCHED_PROXY_EXEC */
> static struct task_struct *
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b173a059315c2..cc531eb939831 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8781,7 +8781,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
> se = &p->se;
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> - if (prev->sched_class != &fair_sched_class)
> + if (prev->sched_class != &fair_sched_class ||
> + rq->curr != rq->donor)
Why is this special handling required?
> goto simple;
>
> __put_prev_set_next_dl_server(rq, prev, p);
--
Thanks and Regards,
Prateek
Powered by blists - more mailing lists