Message-Id: <20260103002343.6599-14-joelagnelf@nvidia.com>
Date: Fri, 2 Jan 2026 19:23:42 -0500
From: Joel Fernandes <joelagnelf@...dia.com>
To: linux-kernel@...r.kernel.org
Cc: "Paul E . McKenney" <paulmck@...nel.org>,
Frederic Weisbecker <frederic@...nel.org>,
Neeraj Upadhyay <neeraj.upadhyay@...nel.org>,
Joel Fernandes <joelagnelf@...dia.com>,
Josh Triplett <josh@...htriplett.org>,
Boqun Feng <boqun.feng@...il.com>,
Steven Rostedt <rostedt@...dmis.org>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Lai Jiangshan <jiangshanlai@...il.com>,
Zqiang <qiang.zhang@...ux.dev>,
Uladzislau Rezki <urezki@...il.com>,
joel@...lfernandes.org,
rcu@...r.kernel.org
Subject: [PATCH RFC 13/14] rcu: Skip rnp addition when no grace period waiting

This is the key optimization commit: it is what activates the per-CPU
blocked-task list promotion mechanism.

When a GP is waiting, add the preempted task directly to
rnp->blkd_tasks via rcu_preempt_ctxt_queue(), but NOT to the per-CPU
list.

However, when no GP is waiting on this CPU, skip adding the task to
rnp->blkd_tasks entirely and place it only on the per-CPU list. This
completely avoids acquiring rnp->lock on this path, which is the
optimization; such tasks are promoted to rnp->blkd_tasks later only if
a GP actually starts.
Signed-off-by: Joel Fernandes <joelagnelf@...dia.com>
---
kernel/rcu/tree_plugin.h | 64 ++++++++++++++++++++++++----------------
1 file changed, 38 insertions(+), 26 deletions(-)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index d43dd153c152..a0cd50f1e6c5 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -335,37 +335,43 @@ void rcu_note_context_switch(bool preempt)
 		/* Possibly blocking in an RCU read-side critical section. */
 		rnp = rdp->mynode;
-		raw_spin_lock_rcu_node(rnp);
 		t->rcu_read_unlock_special.b.blocked = true;
-		t->rcu_blocked_node = rnp;
 #ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
 		/*
-		 * If no GP is waiting on this CPU, add to per-CPU list as well
-		 * so promotion can find it if a GP starts later. If GP waiting,
-		 * skip per-CPU list - task goes only to rnp->blkd_tasks (same
-		 * behavior as before per-CPU lists were added).
+		 * Check if a GP is in progress.
 		 */
 		if (!rcu_gp_in_progress() && !rdp->cpu_no_qs.b.norm && !rdp->cpu_no_qs.b.exp) {
+			/*
+			 * No GP waiting on this CPU. Add to per-CPU list only,
+			 * skipping rnp->lock for better scalability.
+			 */
+			t->rcu_blocked_node = NULL;
 			t->rcu_blocked_cpu = rdp->cpu;
 			raw_spin_lock(&rdp->blkd_lock);
 			list_add(&t->rcu_rdp_entry, &rdp->blkd_list);
 			raw_spin_unlock(&rdp->blkd_lock);
-		}
+			trace_rcu_preempt_task(rcu_state.name, t->pid,
+					       rcu_seq_snap(&rnp->gp_seq));
+		} else
 #endif
+		/* GP waiting (or per-CPU lists disabled) - add to rnp. */
+		{
+			raw_spin_lock_rcu_node(rnp);
+			t->rcu_blocked_node = rnp;
-		/*
-		 * Verify the CPU's sanity, trace the preemption, and
-		 * then queue the task as required based on the states
-		 * of any ongoing and expedited grace periods.
-		 */
-		WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
-		WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
-		trace_rcu_preempt_task(rcu_state.name,
-				       t->pid,
-				       (rnp->qsmask & rdp->grpmask)
-				       ? rnp->gp_seq
-				       : rcu_seq_snap(&rnp->gp_seq));
-		rcu_preempt_ctxt_queue(rnp, rdp);
+			/*
+			 * Verify the CPU's sanity, trace the preemption, and
+			 * then queue the task as required based on the states
+			 * of any ongoing and expedited grace periods.
+			 */
+			WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
+			WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
+			trace_rcu_preempt_task(rcu_state.name, t->pid,
+					       (rnp->qsmask & rdp->grpmask)
+					       ? rnp->gp_seq
+					       : rcu_seq_snap(&rnp->gp_seq));
+			rcu_preempt_ctxt_queue(rnp, rdp);
+		}
 	} else {
 		rcu_preempt_deferred_qs(t);
 	}
@@ -568,13 +574,22 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		 */
 		rnp = t->rcu_blocked_node;
 #ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
-		/* Remove from per-CPU list if task was added to it. */
 		blocked_cpu = t->rcu_blocked_cpu;
 		if (blocked_cpu != -1) {
+			/*
+			 * Task is on per-CPU list. Remove it and check if
+			 * it was promoted to rnp->blkd_tasks.
+			 */
 			blocked_rdp = per_cpu_ptr(&rcu_data, blocked_cpu);
 			raw_spin_lock(&blocked_rdp->blkd_lock);
 			list_del_init(&t->rcu_rdp_entry);
 			t->rcu_blocked_cpu = -1;
+
+			/*
+			 * Read rcu_blocked_node while holding blkd_lock to
+			 * serialize with rcu_promote_blocked_tasks().
+			 */
+			rnp = t->rcu_blocked_node;
 			raw_spin_unlock(&blocked_rdp->blkd_lock);
 			/*
 			 * TODO: This should just be "WARN_ON_ONCE(rnp); return;" since after
@@ -584,15 +599,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 			 * from the rdp blocked list and early returning.
 			 */
 			if (!rnp) {
-				/*
-				 * Task was only on per-CPU list, not on rnp list.
-				 * This can happen in future when tasks are added
-				 * only to rdp initially and promoted to rnp later.
-				 */
+				/* Not promoted - no GP waiting for this task. */
 				local_irq_restore(flags);
 				return;
 			}
 		}
+		/* else: Task went directly to rnp->blkd_tasks. */
 #endif
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
 		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
--
2.34.1