Message-Id: <20260103002343.6599-7-joelagnelf@nvidia.com>
Date: Fri, 2 Jan 2026 19:23:35 -0500
From: Joel Fernandes <joelagnelf@...dia.com>
To: linux-kernel@...r.kernel.org
Cc: "Paul E . McKenney" <paulmck@...nel.org>,
Frederic Weisbecker <frederic@...nel.org>,
Neeraj Upadhyay <neeraj.upadhyay@...nel.org>,
Joel Fernandes <joelagnelf@...dia.com>,
Josh Triplett <josh@...htriplett.org>,
Boqun Feng <boqun.feng@...il.com>,
Steven Rostedt <rostedt@...dmis.org>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Lai Jiangshan <jiangshanlai@...il.com>,
Zqiang <qiang.zhang@...ux.dev>,
Uladzislau Rezki <urezki@...il.com>,
joel@...lfernandes.org,
rcu@...r.kernel.org
Subject: [PATCH RFC 06/14] rcu: Promote per-CPU blocked tasks before checking for blocked readers
When CONFIG_RCU_PER_CPU_BLOCKED_LISTS is enabled, tasks that block in
RCU read-side critical sections may be placed on per-CPU lists rather
than directly on the rcu_node's blkd_tasks list. A task can therefore
block just after rcu_gp_init()'s promotion scan completes, leaving it
only on the per-CPU list while a GP is active.
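
The window, schematically (illustrative only):

  GP kthread                           reader on CPU N
  ----------                           ---------------
  rcu_gp_init():
    promote per-CPU lists to rnp
    GP is now in progress
                                       rcu_read_lock();
                                       preempted: queued only on
                                       CPU N's per-CPU blocked list
  boost kthread walks rnp->gp_tasks    task absent from rnp->blkd_tasks,
                                       so it is never boosted
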
The RCU priority boosting mechanism only looks at rnp->gp_tasks and
rnp->exp_tasks, which point into rnp->blkd_tasks. Tasks on per-CPU
lists are invisible to the boost kthread and cannot be boosted.
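
For reference, the fields involved look roughly like this (abridged from
struct rcu_node in kernel/rcu/tree.h; unrelated fields omitted):

	struct rcu_node {
		...
		struct list_head blkd_tasks;	/* Tasks blocked in a read-side
						 * critical section; the only
						 * list the boost logic walks. */
		struct list_head *gp_tasks;	/* First task blocking the current
						 * GP; points into blkd_tasks. */
		struct list_head *exp_tasks;	/* Likewise for the expedited GP. */
		...
	};
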
Address this by adding a "promote" parameter to
rcu_preempt_blocked_readers_cgp(). When promote is true and the caller
holds the rnp lock, the function first promotes any tasks from per-CPU
blocked lists to the rcu_node's blkd_tasks list before checking whether
there are blocked readers. This ensures that late-arriving tasks are
visible for priority boosting and other operations.
Callers that hold the rnp lock pass promote=true to get an accurate answer
including late arrivals. Lockless callers (GP loop, FQS check) pass
promote=false for an approximate snapshot (TODO: need to check if we can
always just set "promote" to true and remove the parameter).
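
For illustration, the two conventions look roughly like this (sketch
only, not a hunk from this patch; "blkd" is a hypothetical local):

	/* Locked caller: promotion is safe; answer includes late arrivals. */
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	blkd = rcu_preempt_blocked_readers_cgp(rnp, true);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	/*
	 * Lockless caller (e.g. the FQS wait condition): rnp->lock is not
	 * held, so promotion must not be attempted; accept an approximate
	 * snapshot and recheck under the lock before acting on it.
	 */
	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp, false))
		return true;
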
Signed-off-by: Joel Fernandes <joelagnelf@...dia.com>
---
kernel/rcu/tree.c | 14 +++++++-------
kernel/rcu/tree.h | 2 +-
kernel/rcu/tree_plugin.h | 34 ++++++++++++++++++++++++++++------
kernel/rcu/tree_stall.h | 4 ++--
4 files changed, 38 insertions(+), 16 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5837e9923642..f8f43f94adbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2034,7 +2034,7 @@ static bool rcu_gp_fqs_check_wake(int *gfp)
 		return true;
 
 	// The current grace period has completed.
-	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
+	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp, false))
 		return true;
 
 	return false;
@@ -2125,7 +2125,7 @@ static noinline_for_stack void rcu_gp_fqs_loop(void)
 		 * the corresponding leaf nodes have passed through their quiescent state.
 		 */
 		if (!READ_ONCE(rnp->qsmask) &&
-		    !rcu_preempt_blocked_readers_cgp(rnp))
+		    !rcu_preempt_blocked_readers_cgp(rnp, false))
 			break;
 		/* If time for quiescent-state forcing, do it. */
 		if (!time_after(rcu_state.jiffies_force_qs, jiffies) ||
@@ -2207,7 +2207,7 @@ static noinline void rcu_gp_cleanup(void)
 	rcu_seq_end(&new_gp_seq);
 	rcu_for_each_node_breadth_first(rnp) {
 		raw_spin_lock_irq_rcu_node(rnp);
-		if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+		if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)))
 			dump_blkd_tasks(rnp, 10);
 		WARN_ON_ONCE(rnp->qsmask);
 		WRITE_ONCE(rnp->gp_seq, new_gp_seq);
@@ -2376,13 +2376,13 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
 		}
 		WARN_ON_ONCE(oldmask);	/* Any child must be all zeroed! */
 		WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
-			     rcu_preempt_blocked_readers_cgp(rnp));
+			     rcu_preempt_blocked_readers_cgp(rnp, true));
 		WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
 		trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
 						 mask, rnp->qsmask, rnp->level,
 						 rnp->grplo, rnp->grphi,
 						 !!rnp->gp_tasks);
-		if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+		if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp, true)) {
 
 			/* Other bits still set at this level, so done. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2428,7 +2428,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 
 	raw_lockdep_assert_held_rcu_node(rnp);
 	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
-	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
+	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)) ||
 	    rnp->qsmask != 0) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;  /* Still need more quiescent states! */
@@ -2763,7 +2763,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		rcu_state.cbovldnext |= !!rnp->cbovldmask;
 		if (rnp->qsmask == 0) {
-			if (rcu_preempt_blocked_readers_cgp(rnp)) {
+			if (rcu_preempt_blocked_readers_cgp(rnp, true)) {
 				/*
 				 * No point in scanning bits because they
 				 * are all zero. But we might need to
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b71c6c1de8d3..25eb9200e6ef 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -486,7 +486,7 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
 /* Forward declarations for tree_plugin.h */
 static void rcu_bootup_announce(void);
 static void rcu_qs(void);
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote);
 #ifdef CONFIG_HOTPLUG_CPU
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ad33fdd0efe8..6ed3815bb912 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -383,9 +383,28 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  * Check for preempted RCU readers blocking the current grace period
  * for the specified rcu_node structure. If the caller needs a reliable
  * answer, it must hold the rcu_node's ->lock.
+ *
+ * If @promote is true and CONFIG_RCU_PER_CPU_BLOCKED_LISTS is enabled,
+ * this function first promotes any tasks from per-CPU blocked lists to
+ * the rcu_node's blkd_tasks list before checking. This ensures that
+ * late-arriving tasks (blocked after GP init's promotion scan) are
+ * visible for priority boosting and other operations. When promoting,
+ * the caller must hold rnp->lock.
  */
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote)
 {
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+	if (promote && rcu_is_leaf_node(rnp)) {
+		int cpu;
+		struct rcu_data *rdp;
+
+		raw_lockdep_assert_held_rcu_node(rnp);
+		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
+			rdp = per_cpu_ptr(&rcu_data, cpu);
+			rcu_promote_blocked_tasks_rdp(rdp, rnp);
+		}
+	}
+#endif
 	return READ_ONCE(rnp->gp_tasks) != NULL;
 }
@@ -570,7 +589,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
 		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
 		WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
-		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
+		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp, true);
 		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
 			     (!empty_norm || rnp->qsmask));
 		empty_exp = sync_rcu_exp_done(rnp);
@@ -597,7 +616,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		 * so we must take a snapshot of the expedited state.
 		 */
 		empty_exp_now = sync_rcu_exp_done(rnp);
-		if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
+		if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp, true)) {
 			trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
 							 rnp->gp_seq,
 							 0, rnp->qsmask,
@@ -901,7 +920,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 
 	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
 	raw_lockdep_assert_held_rcu_node(rnp);
-	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)))
 		dump_blkd_tasks(rnp, 10);
 	if (rcu_preempt_has_tasks(rnp) &&
 	    (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
@@ -1127,7 +1146,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  * Because preemptible RCU does not exist, there are never any preempted
  * RCU readers.
  */
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote)
 {
 	return 0;
 }
@@ -1221,6 +1240,9 @@ static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
 
 static void rcu_promote_blocked_tasks(struct rcu_node *rnp) { }
 
+static void rcu_promote_blocked_tasks_rdp(struct rcu_data *rdp,
+					  struct rcu_node *rnp) { }
+
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 /*
@@ -1378,7 +1400,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 {
 	raw_lockdep_assert_held_rcu_node(rnp);
 	if (!rnp->boost_kthread_task ||
-	    (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
+	    (!rcu_preempt_blocked_readers_cgp(rnp, true) && !rnp->exp_tasks)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;
 	}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b67532cb8770..5aa65130ab5c 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -277,7 +277,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
 	struct task_struct *t;
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+	if (!rcu_preempt_blocked_readers_cgp(rnp, true)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;
 	}
@@ -331,7 +331,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
 	struct task_struct *ts[8];
 
 	lockdep_assert_irqs_disabled();
-	if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+	if (!rcu_preempt_blocked_readers_cgp(rnp, true)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return 0;
 	}
--
2.34.1