Message-ID: <20250904002201.971268-3-jstultz@google.com>
Date: Thu, 4 Sep 2025 00:21:52 +0000
From: John Stultz <jstultz@...gle.com>
To: LKML <linux-kernel@...r.kernel.org>
Cc: John Stultz <jstultz@...gle.com>, Joel Fernandes <joelagnelf@...dia.com>,
Qais Yousef <qyousef@...alina.io>, Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>, Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>, Dietmar Eggemann <dietmar.eggemann@....com>,
Valentin Schneider <vschneid@...hat.com>, Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Zimuzo Ezeozue <zezeozue@...gle.com>, Mel Gorman <mgorman@...e.de>,
Will Deacon <will@...nel.org>, Waiman Long <longman@...hat.com>, Boqun Feng <boqun.feng@...il.com>,
"Paul E. McKenney" <paulmck@...nel.org>, Metin Kaya <Metin.Kaya@....com>,
Xuewen Yan <xuewen.yan94@...il.com>, K Prateek Nayak <kprateek.nayak@....com>,
Thomas Gleixner <tglx@...utronix.de>, Daniel Lezcano <daniel.lezcano@...aro.org>,
Suleiman Souhlal <suleiman@...gle.com>, kuyo chang <kuyo.chang@...iatek.com>, hupu <hupu.gm@...il.com>,
kernel-team@...roid.com
Subject: [RESEND][PATCH v21 2/6] sched/locking: Add blocked_on_state to
provide necessary tri-state for proxy return-migration
As we add functionality to proxy execution, we may migrate a
donor task to a runqueue where it can't run due to cpu affinity.
Thus, we must be careful to ensure we return-migrate the task
back to a cpu in its cpumask when it becomes unblocked. This
means we need more than just a binary concept of the task being
blocked on a mutex or not.

So add a blocked_on_state value to the task, which allows the
task to move through BO_RUNNABLE -> BO_BLOCKED -> BO_WAKING
and back to BO_RUNNABLE. This provides a guard state in
BO_WAKING, so we know the task is no longer blocked but we
don't want to run it until we have potentially done return
migration back to a usable cpu.
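To illustrate, a rough sketch of the intended state flow
(simplified; the helper names match those added below, and the
wakeup path elides the return-migration details):

    /*
     *  mutex_lock() blocking on a held mutex:
     *      __set_task_blocked_on(p, m);   // BO_RUNNABLE -> BO_BLOCKED
     *
     *  mutex_unlock() (or ww_mutex die/wound) waking the waiter:
     *      __set_blocked_on_waking(p);    // BO_BLOCKED -> BO_WAKING
     *      wake_q_add(&wake_q, p);
     *
     *  try_to_wake_up(), after any needed return migration back
     *  to a cpu in the task's cpumask:
     *      set_blocked_on_runnable(p);    // BO_WAKING -> BO_RUNNABLE
     */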
Signed-off-by: John Stultz <jstultz@...gle.com>
---
v15:
* Split blocked_on_state into its own patch later in the
series, as the tri-state isn't necessary until we deal
with proxy/return migrations
v16:
* Handle the case where a task in the chain is being set to
BO_WAKING by another cpu (usually via the ww_mutex die code).
Make sure we release the rq lock so the wakeup can
complete.
* Rework to use guard() in find_proxy_task() as suggested
by Peter
v18:
* Add initialization of blocked_on_state for init_task
v19:
* PREEMPT_RT build fixups and rework suggested by
K Prateek Nayak
v20:
* Simplify one of the blocked_on_state changes to avoid extra
PREEMPT_RT conditionals
v21:
* Slight reworks due to avoiding nested blocked_lock locking
* Be consistent in use of blocked_on_state helper functions
* Rework calls to proxy_deactivate() to do proper locking
around blocked_on_state changes that we were cheating on in
previous versions.
* Minor cleanups, some comment improvements
Cc: Joel Fernandes <joelagnelf@...dia.com>
Cc: Qais Yousef <qyousef@...alina.io>
Cc: Ingo Molnar <mingo@...hat.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Juri Lelli <juri.lelli@...hat.com>
Cc: Vincent Guittot <vincent.guittot@...aro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@....com>
Cc: Valentin Schneider <vschneid@...hat.com>
Cc: Steven Rostedt <rostedt@...dmis.org>
Cc: Ben Segall <bsegall@...gle.com>
Cc: Zimuzo Ezeozue <zezeozue@...gle.com>
Cc: Mel Gorman <mgorman@...e.de>
Cc: Will Deacon <will@...nel.org>
Cc: Waiman Long <longman@...hat.com>
Cc: Boqun Feng <boqun.feng@...il.com>
Cc: "Paul E. McKenney" <paulmck@...nel.org>
Cc: Metin Kaya <Metin.Kaya@....com>
Cc: Xuewen Yan <xuewen.yan94@...il.com>
Cc: K Prateek Nayak <kprateek.nayak@....com>
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: Daniel Lezcano <daniel.lezcano@...aro.org>
Cc: Suleiman Souhlal <suleiman@...gle.com>
Cc: kuyo chang <kuyo.chang@...iatek.com>
Cc: hupu <hupu.gm@...il.com>
Cc: kernel-team@...roid.com
---
include/linux/sched.h | 80 +++++++++++++++++++++++++++++----------
init/init_task.c | 1 +
kernel/fork.c | 1 +
kernel/locking/mutex.c | 15 ++++----
kernel/locking/ww_mutex.h | 20 ++++------
kernel/sched/core.c | 44 +++++++++++++++++++--
kernel/sched/sched.h | 2 +-
7 files changed, 120 insertions(+), 43 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3ec0ef0d91603..5801de1a44a79 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -815,6 +815,12 @@ struct kmap_ctrl {
#endif
};
+enum blocked_on_state {
+ BO_RUNNABLE,
+ BO_BLOCKED,
+ BO_WAKING,
+};
+
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
@@ -1234,6 +1240,7 @@ struct task_struct {
struct rt_mutex_waiter *pi_blocked_on;
#endif
+ enum blocked_on_state blocked_on_state;
struct mutex *blocked_on; /* lock we're blocked on */
raw_spinlock_t blocked_lock;
@@ -2141,7 +2148,52 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
__cond_resched_rwlock_write(lock); \
})
-#ifndef CONFIG_PREEMPT_RT
+static inline void __force_blocked_on_runnable(struct task_struct *p)
+{
+ lockdep_assert_held(&p->blocked_lock);
+ p->blocked_on_state = BO_RUNNABLE;
+}
+
+static inline void force_blocked_on_runnable(struct task_struct *p)
+{
+ guard(raw_spinlock_irqsave)(&p->blocked_lock);
+ __force_blocked_on_runnable(p);
+}
+
+static inline void __force_blocked_on_blocked(struct task_struct *p)
+{
+ lockdep_assert_held(&p->blocked_lock);
+ p->blocked_on_state = BO_BLOCKED;
+}
+
+static inline void __set_blocked_on_runnable(struct task_struct *p)
+{
+ lockdep_assert_held(&p->blocked_lock);
+ if (p->blocked_on_state == BO_WAKING)
+ p->blocked_on_state = BO_RUNNABLE;
+}
+
+static inline void set_blocked_on_runnable(struct task_struct *p)
+{
+ if (!sched_proxy_exec())
+ return;
+ guard(raw_spinlock_irqsave)(&p->blocked_lock);
+ __set_blocked_on_runnable(p);
+}
+
+static inline void __set_blocked_on_waking(struct task_struct *p)
+{
+ lockdep_assert_held(&p->blocked_lock);
+ if (p->blocked_on_state == BO_BLOCKED)
+ p->blocked_on_state = BO_WAKING;
+}
+
+static inline void set_blocked_on_waking(struct task_struct *p)
+{
+ guard(raw_spinlock_irqsave)(&p->blocked_lock);
+ __set_blocked_on_waking(p);
+}
+
static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
{
lockdep_assert_held_once(&p->blocked_lock);
@@ -2163,24 +2215,23 @@ static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
lockdep_assert_held_once(&p->blocked_lock);
/*
* Check ensure we don't overwrite existing mutex value
- * with a different mutex. Note, setting it to the same
- * lock repeatedly is ok.
+ * with a different mutex.
*/
- WARN_ON_ONCE(p->blocked_on && p->blocked_on != m);
+ WARN_ON_ONCE(p->blocked_on);
p->blocked_on = m;
+ p->blocked_on_state = BO_BLOCKED;
}
static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
{
+ /* The task should only be clearing itself */
+ WARN_ON_ONCE(p != current);
/* Currently we serialize blocked_on under the task::blocked_lock */
lockdep_assert_held_once(&p->blocked_lock);
- /*
- * There may be cases where we re-clear already cleared
- * blocked_on relationships, but make sure we are not
- * clearing the relationship with a different lock.
- */
- WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m);
+ /* Make sure we are clearing the relationship with the right lock */
+ WARN_ON_ONCE(m && p->blocked_on != m);
p->blocked_on = NULL;
+ p->blocked_on_state = BO_RUNNABLE;
}
static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
@@ -2188,15 +2239,6 @@ static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
guard(raw_spinlock_irqsave)(&p->blocked_lock);
__clear_task_blocked_on(p, m);
}
-#else
-static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
-{
-}
-
-static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
-{
-}
-#endif /* !CONFIG_PREEMPT_RT */
static __always_inline bool need_resched(void)
{
diff --git a/init/init_task.c b/init/init_task.c
index 7e29d86153d9f..6d72ec23410a6 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -174,6 +174,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,
&init_task.alloc_lock),
#endif
+ .blocked_on_state = BO_RUNNABLE,
#ifdef CONFIG_RT_MUTEXES
.pi_waiters = RB_ROOT_CACHED,
.pi_top_task = NULL,
diff --git a/kernel/fork.c b/kernel/fork.c
index db6d08946ec11..4bd0731995e86 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2129,6 +2129,7 @@ __latent_entropy struct task_struct *copy_process(
lockdep_init_task(p);
#endif
+ p->blocked_on_state = BO_RUNNABLE;
p->blocked_on = NULL; /* not blocked yet */
#ifdef CONFIG_BCACHE
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index fac40c456098e..42e4d2e6e4ad4 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -682,11 +682,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
raw_spin_lock_irqsave(&lock->wait_lock, flags);
raw_spin_lock(&current->blocked_lock);
/*
- * As we likely have been woken up by task
- * that has cleared our blocked_on state, re-set
- * it to the lock we are trying to acquire.
+ * Re-set blocked_on_state as unlock path set it to WAKING/RUNNABLE
*/
- __set_task_blocked_on(current, lock);
+ __force_blocked_on_blocked(current);
set_current_state(state);
/*
* Here we order against unlock; we must either see it change
@@ -705,14 +703,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
* and clear blocked on so we don't become unselectable
* to run.
*/
- __clear_task_blocked_on(current, lock);
+ __force_blocked_on_runnable(current);
raw_spin_unlock(&current->blocked_lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
opt_acquired = mutex_optimistic_spin(lock, ww_ctx, &waiter);
raw_spin_lock_irqsave(&lock->wait_lock, flags);
raw_spin_lock(&current->blocked_lock);
- __set_task_blocked_on(current, lock);
+ __force_blocked_on_blocked(current);
if (opt_acquired)
break;
trace_contention_begin(lock, LCB_F_MUTEX);
@@ -963,8 +961,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
next = waiter->task;
+ raw_spin_lock(&next->blocked_lock);
debug_mutex_wake_waiter(lock, waiter);
- clear_task_blocked_on(next, lock);
+ WARN_ON_ONCE(__get_task_blocked_on(next) != lock);
+ __set_blocked_on_waking(next);
+ raw_spin_unlock(&next->blocked_lock);
wake_q_add(&wake_q, next);
}
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index e4a81790ea7dd..f34363615eb34 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -285,11 +285,11 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
debug_mutex_wake_waiter(lock, waiter);
#endif
/*
- * When waking up the task to die, be sure to clear the
- * blocked_on pointer. Otherwise we can see circular
- * blocked_on relationships that can't resolve.
+ * When waking up the task to die, be sure to set the
+ * blocked_on_state to BO_WAKING. Otherwise we can see
+ * circular blocked_on relationships that can't resolve.
*/
- clear_task_blocked_on(waiter->task, lock);
+ set_blocked_on_waking(waiter->task);
wake_q_add(wake_q, waiter->task);
}
@@ -339,15 +339,11 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
*/
if (owner != current) {
/*
- * When waking up the task to wound, be sure to clear the
- * blocked_on pointer. Otherwise we can see circular
- * blocked_on relationships that can't resolve.
- *
- * NOTE: We pass NULL here instead of lock, because we
- * are waking the mutex owner, who may be currently
- * blocked on a different mutex.
+ * When waking up the task to wound, be sure to set the
+ * blocked_on_state to BO_WAKING. Otherwise we can see
+ * circular blocked_on relationships that can't resolve.
*/
- clear_task_blocked_on(owner, NULL);
+ set_blocked_on_waking(owner);
wake_q_add(wake_q, owner);
}
return true;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0180853dd48c5..e0007660161fa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4328,6 +4328,12 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
ttwu_queue(p, cpu, wake_flags);
}
out:
+ /*
+ * For now, if we've been woken up, set us as BO_RUNNABLE
+ * We will need to be more careful later when handling
+ * proxy migration
+ */
+ set_blocked_on_runnable(p);
if (success)
ttwu_stat(p, task_cpu(p), wake_flags);
@@ -6623,7 +6629,7 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d
* as unblocked, as we aren't doing proxy-migrations
* yet (more logic will be needed then).
*/
- donor->blocked_on = NULL;
+ force_blocked_on_runnable(donor);
}
return NULL;
}
@@ -6676,20 +6682,41 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
return NULL;
}
+ /*
+ * If a ww_mutex hits the die/wound case, it marks the task as
+ * BO_WAKING and calls try_to_wake_up(), so that the mutex
+ * cycle can be broken and we avoid a deadlock.
+ *
+ * However, if at that moment, we are here on the cpu which the
+ * die/wounded task is enqueued, we might loop on the cycle as
+ * BO_WAKING still causes task_is_blocked() to return true
+ * (since we want return migration to occur before we run the
+ * task).
+ *
+ * Unfortunately since we hold the rq lock, it will block
+ * try_to_wake_up from completing and doing the return
+ * migration.
+ *
+ * So when we hit a !BO_BLOCKED task briefly schedule idle
+ * so we release the rq and let the wakeup complete.
+ */
+ if (p->blocked_on_state != BO_BLOCKED)
+ return proxy_resched_idle(rq);
+
owner = __mutex_owner(mutex);
if (!owner) {
- __clear_task_blocked_on(p, mutex);
+ __force_blocked_on_runnable(p);
return p;
}
if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
/* XXX Don't handle blocked owners/delayed dequeue yet */
- return proxy_deactivate(rq, donor);
+ goto deactivate_donor;
}
if (task_cpu(owner) != this_cpu) {
/* XXX Don't handle migrations yet */
- return proxy_deactivate(rq, donor);
+ goto deactivate_donor;
}
if (task_on_rq_migrating(owner)) {
@@ -6749,6 +6776,15 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
WARN_ON_ONCE(owner && !owner->on_rq);
return owner;
+
+ /*
+ * NOTE: This logic is down here, because we need to call
+ * the functions with the mutex wait_lock and task
+ * blocked_lock released, so we have to get out of the
+ * guard() scope.
+ */
+deactivate_donor:
+ return proxy_deactivate(rq, donor);
}
#else /* SCHED_PROXY_EXEC */
static struct task_struct *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index be9745d104f75..845454ec81a22 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2264,7 +2264,7 @@ static inline bool task_is_blocked(struct task_struct *p)
if (!sched_proxy_exec())
return false;
- return !!p->blocked_on;
+ return !!p->blocked_on && p->blocked_on_state != BO_RUNNABLE;
}
static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
--
2.51.0.338.gd7d06c2dae-goog