Message-Id: <20260103002343.6599-14-joelagnelf@nvidia.com>
Date: Fri, 2 Jan 2026 19:23:42 -0500
From: Joel Fernandes <joelagnelf@...dia.com>
To: linux-kernel@...r.kernel.org
Cc: "Paul E . McKenney" <paulmck@...nel.org>,
Frederic Weisbecker <frederic@...nel.org>,
Neeraj Upadhyay <neeraj.upadhyay@...nel.org>,
Joel Fernandes <joelagnelf@...dia.com>,
Josh Triplett <josh@...htriplett.org>,
Boqun Feng <boqun.feng@...il.com>,
Steven Rostedt <rostedt@...dmis.org>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Lai Jiangshan <jiangshanlai@...il.com>,
Zqiang <qiang.zhang@...ux.dev>,
Uladzislau Rezki <urezki@...il.com>,
joel@...lfernandes.org,
rcu@...r.kernel.org
Subject: [PATCH RFC 13/14] rcu: Skip rnp addition when no grace period waiting

This is the key optimization commit: it is what activates the per-CPU
blocked-task list promotion mechanism.

When a GP is waiting, add the preempted task directly to
rnp->blkd_tasks via rcu_preempt_ctxt_queue(), but NOT to the per-CPU
list.

However, when no GP is waiting on this CPU, skip adding the task to
rnp->blkd_tasks entirely and place it only on the per-CPU list. This
completely avoids acquiring rnp->lock on this path, which is the
optimization; such tasks are promoted to rnp->blkd_tasks later only if
a GP actually starts.
Signed-off-by: Joel Fernandes <joelagnelf@...dia.com>
---
kernel/rcu/tree_plugin.h | 64 ++++++++++++++++++++++++----------------
1 file changed, 38 insertions(+), 26 deletions(-)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index d43dd153c152..a0cd50f1e6c5 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -335,37 +335,43 @@ void rcu_note_context_switch(bool preempt)
 		/* Possibly blocking in an RCU read-side critical section. */
 		rnp = rdp->mynode;
-		raw_spin_lock_rcu_node(rnp);
 		t->rcu_read_unlock_special.b.blocked = true;
-		t->rcu_blocked_node = rnp;
 #ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
 		/*
-		 * If no GP is waiting on this CPU, add to per-CPU list as well
-		 * so promotion can find it if a GP starts later. If GP waiting,
-		 * skip per-CPU list - task goes only to rnp->blkd_tasks (same
-		 * behavior as before per-CPU lists were added).
+		 * Check if a GP is in progress.
 		 */
 		if (!rcu_gp_in_progress() && !rdp->cpu_no_qs.b.norm && !rdp->cpu_no_qs.b.exp) {
+			/*
+			 * No GP waiting on this CPU. Add to per-CPU list only,
+			 * skipping rnp->lock for better scalability.
+			 */
+			t->rcu_blocked_node = NULL;
 			t->rcu_blocked_cpu = rdp->cpu;
 			raw_spin_lock(&rdp->blkd_lock);
 			list_add(&t->rcu_rdp_entry, &rdp->blkd_list);
 			raw_spin_unlock(&rdp->blkd_lock);
-		}
+			trace_rcu_preempt_task(rcu_state.name, t->pid,
+					       rcu_seq_snap(&rnp->gp_seq));
+		} else
 #endif
+		/* GP waiting (or per-CPU lists disabled) - add to rnp. */
+		{
+			raw_spin_lock_rcu_node(rnp);
+			t->rcu_blocked_node = rnp;
-		/*
-		 * Verify the CPU's sanity, trace the preemption, and
-		 * then queue the task as required based on the states
-		 * of any ongoing and expedited grace periods.
-		 */
-		WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
-		WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
-		trace_rcu_preempt_task(rcu_state.name,
-				       t->pid,
-				       (rnp->qsmask & rdp->grpmask)
-				       ? rnp->gp_seq
-				       : rcu_seq_snap(&rnp->gp_seq));
-		rcu_preempt_ctxt_queue(rnp, rdp);
+			/*
+			 * Verify the CPU's sanity, trace the preemption, and
+			 * then queue the task as required based on the states
+			 * of any ongoing and expedited grace periods.
+			 */
+			WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
+			WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
+			trace_rcu_preempt_task(rcu_state.name, t->pid,
+					       (rnp->qsmask & rdp->grpmask)
+					       ? rnp->gp_seq
+					       : rcu_seq_snap(&rnp->gp_seq));
+			rcu_preempt_ctxt_queue(rnp, rdp);
+		}
 	} else {
 		rcu_preempt_deferred_qs(t);
 	}
@@ -568,13 +574,22 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		 */
 		rnp = t->rcu_blocked_node;
 #ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
-		/* Remove from per-CPU list if task was added to it. */
 		blocked_cpu = t->rcu_blocked_cpu;
 		if (blocked_cpu != -1) {
+			/*
+			 * Task is on per-CPU list. Remove it and check if
+			 * it was promoted to rnp->blkd_tasks.
+			 */
 			blocked_rdp = per_cpu_ptr(&rcu_data, blocked_cpu);
 			raw_spin_lock(&blocked_rdp->blkd_lock);
 			list_del_init(&t->rcu_rdp_entry);
 			t->rcu_blocked_cpu = -1;
+
+			/*
+			 * Read rcu_blocked_node while holding blkd_lock to
+			 * serialize with rcu_promote_blocked_tasks().
+			 */
+			rnp = t->rcu_blocked_node;
 			raw_spin_unlock(&blocked_rdp->blkd_lock);
 			/*
 			 * TODO: This should just be "WARN_ON_ONCE(rnp); return;" since after
@@ -584,15 +599,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 			 * from the rdp blocked list and early returning.
 			 */
 			if (!rnp) {
-				/*
-				 * Task was only on per-CPU list, not on rnp list.
-				 * This can happen in future when tasks are added
-				 * only to rdp initially and promoted to rnp later.
-				 */
+				/* Not promoted - no GP waiting for this task. */
 				local_irq_restore(flags);
 				return;
 			}
 		}
+		/* else: Task went directly to rnp->blkd_tasks. */
 #endif
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
 		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
--
2.34.1