Message-Id: <20260103002343.6599-7-joelagnelf@nvidia.com>
Date: Fri,  2 Jan 2026 19:23:35 -0500
From: Joel Fernandes <joelagnelf@...dia.com>
To: linux-kernel@...r.kernel.org
Cc: "Paul E . McKenney" <paulmck@...nel.org>,
	Frederic Weisbecker <frederic@...nel.org>,
	Neeraj Upadhyay <neeraj.upadhyay@...nel.org>,
	Joel Fernandes <joelagnelf@...dia.com>,
	Josh Triplett <josh@...htriplett.org>,
	Boqun Feng <boqun.feng@...il.com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
	Lai Jiangshan <jiangshanlai@...il.com>,
	Zqiang <qiang.zhang@...ux.dev>,
	Uladzislau Rezki <urezki@...il.com>,
	joel@...lfernandes.org,
	rcu@...r.kernel.org
Subject: [PATCH RFC 06/14] rcu: Promote per-CPU blocked tasks before checking for blocked readers

When CONFIG_RCU_PER_CPU_BLOCKED_LISTS is enabled, tasks that block in
RCU read-side critical sections may be placed on per-CPU lists rather
than directly on the rcu_node's blkd_tasks list.  It is possible that a
task can block just after rcu_gp_init()'s promotion scan completes,
leaving it only on the per-CPU list while a GP is active.
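
For illustration, the window looks roughly like this (hypothetical
interleaving):

  GP kthread                          Reader task
  ----------                          -----------
  rcu_gp_init()
    promotion scan moves per-CPU
    blocked tasks to rnp->blkd_tasks
                                      rcu_read_lock()
                                      preempted; queued only on the
                                      per-CPU blocked list
  GP continues; the reader is not
  reachable via rnp->blkd_tasks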

The RCU priority boosting mechanism only looks at rnp->gp_tasks and
rnp->exp_tasks, which point into rnp->blkd_tasks.  Tasks on per-CPU
lists are invisible to the boost kthread and cannot be boosted.
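
For context, a simplified sketch of how the boost kthread picks a boost
target (paraphrased from rcu_boost(); not verbatim kernel code):

  struct list_head *tb;
  struct task_struct *t;

  if (rnp->exp_tasks)
          tb = rnp->exp_tasks;    /* expedited-GP blockers first */
  else
          tb = rnp->gp_tasks;     /* otherwise normal-GP blockers */
  t = container_of(tb, struct task_struct, rcu_node_entry);
  /* A task still sitting on a per-CPU blocked list is never reached
   * here, because ->gp_tasks/->exp_tasks point only into ->blkd_tasks. */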

Address this by adding a "promote" parameter to
rcu_preempt_blocked_readers_cgp().  When promote is true, the function
first promotes any tasks from per-CPU blocked lists to the rcu_node's
blkd_tasks list before checking whether there are blocked readers; in
this case the caller must hold the rcu_node's ->lock.  This ensures
that late-arriving tasks are visible for priority boosting and other
operations.

Callers that hold the rnp lock pass promote=true to get an accurate answer
including late arrivals. Lockless callers (GP loop, FQS check) pass
promote=false for an approximate snapshot (TODO: need to check if we can
always just set "promote" to true and remove the parameter).
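
For illustration, the two calling conventions (hypothetical surrounding
context; the function and lock helpers are as used in this patch):

  /* Locked caller: rnp->lock is held, so promotion is safe and the
   * answer accounts for late-arriving blocked tasks. */
  raw_spin_lock_irqsave_rcu_node(rnp, flags);
  if (rcu_preempt_blocked_readers_cgp(rnp, true)) {
          /* Blocked readers still hold up this grace period. */
  }
  raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

  /* Lockless caller (GP loop / FQS wake check): no rnp->lock, so
   * promotion is skipped and the result is only a snapshot. */
  if (!READ_ONCE(rnp->qsmask) &&
      !rcu_preempt_blocked_readers_cgp(rnp, false)) {
          /* Grace period appears complete. */
  }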

Signed-off-by: Joel Fernandes <joelagnelf@...dia.com>
---
 kernel/rcu/tree.c        | 14 +++++++-------
 kernel/rcu/tree.h        |  2 +-
 kernel/rcu/tree_plugin.h | 34 ++++++++++++++++++++++++++++------
 kernel/rcu/tree_stall.h  |  4 ++--
 4 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5837e9923642..f8f43f94adbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2034,7 +2034,7 @@ static bool rcu_gp_fqs_check_wake(int *gfp)
 		return true;
 
 	// The current grace period has completed.
-	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
+	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp, false))
 		return true;
 
 	return false;
@@ -2125,7 +2125,7 @@ static noinline_for_stack void rcu_gp_fqs_loop(void)
 		 * the corresponding leaf nodes have passed through their quiescent state.
 		 */
 		if (!READ_ONCE(rnp->qsmask) &&
-		    !rcu_preempt_blocked_readers_cgp(rnp))
+		    !rcu_preempt_blocked_readers_cgp(rnp, false))
 			break;
 		/* If time for quiescent-state forcing, do it. */
 		if (!time_after(rcu_state.jiffies_force_qs, jiffies) ||
@@ -2207,7 +2207,7 @@ static noinline void rcu_gp_cleanup(void)
 	rcu_seq_end(&new_gp_seq);
 	rcu_for_each_node_breadth_first(rnp) {
 		raw_spin_lock_irq_rcu_node(rnp);
-		if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+		if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)))
 			dump_blkd_tasks(rnp, 10);
 		WARN_ON_ONCE(rnp->qsmask);
 		WRITE_ONCE(rnp->gp_seq, new_gp_seq);
@@ -2376,13 +2376,13 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
 		}
 		WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
 		WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
-			     rcu_preempt_blocked_readers_cgp(rnp));
+			     rcu_preempt_blocked_readers_cgp(rnp, true));
 		WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
 		trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
 						 mask, rnp->qsmask, rnp->level,
 						 rnp->grplo, rnp->grphi,
 						 !!rnp->gp_tasks);
-		if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+		if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp, true)) {
 
 			/* Other bits still set at this level, so done. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2428,7 +2428,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 
 	raw_lockdep_assert_held_rcu_node(rnp);
 	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
-	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
+	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)) ||
 	    rnp->qsmask != 0) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;  /* Still need more quiescent states! */
@@ -2763,7 +2763,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		rcu_state.cbovldnext |= !!rnp->cbovldmask;
 		if (rnp->qsmask == 0) {
-			if (rcu_preempt_blocked_readers_cgp(rnp)) {
+			if (rcu_preempt_blocked_readers_cgp(rnp, true)) {
 				/*
 				 * No point in scanning bits because they
 				 * are all zero.  But we might need to
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b71c6c1de8d3..25eb9200e6ef 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -486,7 +486,7 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
 /* Forward declarations for tree_plugin.h */
 static void rcu_bootup_announce(void);
 static void rcu_qs(void);
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote);
 #ifdef CONFIG_HOTPLUG_CPU
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ad33fdd0efe8..6ed3815bb912 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -383,9 +383,28 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  * Check for preempted RCU readers blocking the current grace period
  * for the specified rcu_node structure.  If the caller needs a reliable
  * answer, it must hold the rcu_node's ->lock.
+ *
+ * If @promote is true and CONFIG_RCU_PER_CPU_BLOCKED_LISTS is enabled,
+ * this function first promotes any tasks from per-CPU blocked lists to
+ * the rcu_node's blkd_tasks list before checking.  This ensures that
+ * late-arriving tasks (blocked after GP init's promotion scan) are
+ * visible for priority boosting and other operations.  When promoting,
+ * the caller must hold rnp->lock.
  */
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote)
 {
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+	if (promote && rcu_is_leaf_node(rnp)) {
+		int cpu;
+		struct rcu_data *rdp;
+
+		raw_lockdep_assert_held_rcu_node(rnp);
+		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
+			rdp = per_cpu_ptr(&rcu_data, cpu);
+			rcu_promote_blocked_tasks_rdp(rdp, rnp);
+		}
+	}
+#endif
 	return READ_ONCE(rnp->gp_tasks) != NULL;
 }
 
@@ -570,7 +589,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
 		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
 		WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
-		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
+		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp, true);
 		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
 			     (!empty_norm || rnp->qsmask));
 		empty_exp = sync_rcu_exp_done(rnp);
@@ -597,7 +616,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		 * so we must take a snapshot of the expedited state.
 		 */
 		empty_exp_now = sync_rcu_exp_done(rnp);
-		if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
+		if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp, true)) {
 			trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
 							 rnp->gp_seq,
 							 0, rnp->qsmask,
@@ -901,7 +920,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 
 	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
 	raw_lockdep_assert_held_rcu_node(rnp);
-	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp, true)))
 		dump_blkd_tasks(rnp, 10);
 	if (rcu_preempt_has_tasks(rnp) &&
 	    (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
@@ -1127,7 +1146,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  * Because preemptible RCU does not exist, there are never any preempted
  * RCU readers.
  */
-static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp, bool promote)
 {
 	return 0;
 }
@@ -1221,6 +1240,9 @@ static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
 
 static void rcu_promote_blocked_tasks(struct rcu_node *rnp) { }
 
+static void rcu_promote_blocked_tasks_rdp(struct rcu_data *rdp,
+					  struct rcu_node *rnp) { }
+
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 /*
@@ -1378,7 +1400,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 {
 	raw_lockdep_assert_held_rcu_node(rnp);
 	if (!rnp->boost_kthread_task ||
-	    (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
+	    (!rcu_preempt_blocked_readers_cgp(rnp, true) && !rnp->exp_tasks)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;
 	}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b67532cb8770..5aa65130ab5c 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -277,7 +277,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
 	struct task_struct *t;
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+	if (!rcu_preempt_blocked_readers_cgp(rnp, true)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;
 	}
@@ -331,7 +331,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
 	struct task_struct *ts[8];
 
 	lockdep_assert_irqs_disabled();
-	if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+	if (!rcu_preempt_blocked_readers_cgp(rnp, true)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return 0;
 	}
-- 
2.34.1

