Message-Id: <20260103002343.6599-3-joelagnelf@nvidia.com>
Date: Fri, 2 Jan 2026 19:23:31 -0500
From: Joel Fernandes <joelagnelf@...dia.com>
To: linux-kernel@...r.kernel.org
Cc: "Paul E . McKenney" <paulmck@...nel.org>,
Frederic Weisbecker <frederic@...nel.org>,
Neeraj Upadhyay <neeraj.upadhyay@...nel.org>,
Joel Fernandes <joelagnelf@...dia.com>,
Josh Triplett <josh@...htriplett.org>,
Boqun Feng <boqun.feng@...il.com>,
Steven Rostedt <rostedt@...dmis.org>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Lai Jiangshan <jiangshanlai@...il.com>,
Zqiang <qiang.zhang@...ux.dev>,
Uladzislau Rezki <urezki@...il.com>,
joel@...lfernandes.org,
rcu@...r.kernel.org
Subject: [PATCH RFC 02/14] rcu: Add per-CPU blocked task lists for PREEMPT_RCU

Add per-CPU tracking of tasks blocked in RCU read-side critical
sections. Each rcu_data structure gets a blkd_list protected by a
blkd_lock, mirroring the rcu_node structure's blkd_tasks list at
per-CPU granularity. A task is added to its CPU's list when it is
preempted within a read-side critical section and removed again when
it reports its deferred quiescent state from rcu_read_unlock(). A
WARN_ON_ONCE() in rcu_gp_init() verifies that each leaf rcu_node's
blkd_tasks list holds at least as many tasks as the blkd_lists of
the CPUs that node covers.
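
For intuition, here is a minimal user-space model of the list
discipline this patch adds. This is a sketch only, not kernel code:
the names mirror the patch, but struct task and struct cpu_data are
hypothetical reductions to just the fields touched here, and a
pthread spinlock stands in for raw_spinlock_t.

#include <pthread.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

/* Insert n right after h, as the kernel's list_add() does. */
static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

/* Unlink n and reinitialize it, as list_del_init() does. */
static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	INIT_LIST_HEAD(n);
}

struct task {				/* reduced task_struct */
	struct list_head rcu_rdp_entry;
	int rcu_blocked_cpu;		/* -1 when on no per-CPU list */
};

struct cpu_data {			/* stands in for struct rcu_data */
	pthread_spinlock_t blkd_lock;	/* protects blkd_list */
	struct list_head blkd_list;	/* tasks blocked on this CPU */
};

/* Preemption path: record the CPU, then enqueue under blkd_lock. */
static void note_blocked(struct task *t, struct cpu_data *cpus, int cpu)
{
	t->rcu_blocked_cpu = cpu;
	pthread_spin_lock(&cpus[cpu].blkd_lock);
	list_add(&t->rcu_rdp_entry, &cpus[cpu].blkd_list);
	pthread_spin_unlock(&cpus[cpu].blkd_lock);
}

/* Unlock path: dequeue from the per-CPU list, clear the CPU marker. */
static void note_unblocked(struct task *t, struct cpu_data *cpus)
{
	int cpu = t->rcu_blocked_cpu;

	if (cpu == -1)
		return;
	pthread_spin_lock(&cpus[cpu].blkd_lock);
	list_del_init(&t->rcu_rdp_entry);
	t->rcu_blocked_cpu = -1;
	pthread_spin_unlock(&cpus[cpu].blkd_lock);
}

int main(void)
{
	struct cpu_data cpus[2];
	struct task t = { .rcu_blocked_cpu = -1 };
	int i;

	for (i = 0; i < 2; i++) {
		pthread_spin_init(&cpus[i].blkd_lock, PTHREAD_PROCESS_PRIVATE);
		INIT_LIST_HEAD(&cpus[i].blkd_list);
	}
	INIT_LIST_HEAD(&t.rcu_rdp_entry);
	note_blocked(&t, cpus, 0);
	printf("blocked on CPU %d\n", t.rcu_blocked_cpu);
	note_unblocked(&t, cpus);
	printf("unblocked, cpu marker now %d\n", t.rcu_blocked_cpu);
	return 0;
}

Taking blkd_lock only around the list operations keeps the critical
section short, and the rcu_blocked_cpu field lets the unlock path
find the right per-CPU lock even if the task has since migrated.
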
Signed-off-by: Joel Fernandes <joelagnelf@...dia.com>
---
include/linux/sched.h | 4 ++++
kernel/fork.c | 4 ++++
kernel/rcu/Kconfig | 12 ++++++++++++
kernel/rcu/tree.c | 32 ++++++++++++++++++++++++++++++++
kernel/rcu/tree.h | 6 ++++++
kernel/rcu/tree_plugin.h | 21 +++++++++++++++++++++
6 files changed, 79 insertions(+)
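
Not for application, reviewer notes only: below is a sketch of what
the new check in rcu_gp_init() computes, written against the
user-space model in the changelog above (compile it appended to that
model); count_list() and check_invariant() are hypothetical names.

/* Walk a list under the caller's locking and count its entries. */
static int count_list(struct list_head *head)
{
	struct list_head *p;
	int n = 0;

	for (p = head->next; p != head; p = p->next)
		n++;
	return n;
}

/*
 * rnp_blkd models rnp->blkd_tasks; [lo, hi] models rnp->grplo..grphi.
 * The unlock path dequeues from the rdp list before the rnp list, so
 * a task mid-unlock inflates only the rnp count; hence the expected
 * invariant is rnp_count >= rdp_total.
 */
static void check_invariant(struct list_head *rnp_blkd,
			    struct cpu_data *cpus, int lo, int hi)
{
	int rnp_count = count_list(rnp_blkd);
	int rdp_total = 0;
	int cpu;

	for (cpu = lo; cpu <= hi; cpu++) {
		pthread_spin_lock(&cpus[cpu].blkd_lock);
		rdp_total += count_list(&cpus[cpu].blkd_list);
		pthread_spin_unlock(&cpus[cpu].blkd_lock);
	}
	if (rnp_count < rdp_total)
		fprintf(stderr, "WARN: rnp_count %d < rdp_total %d\n",
			rnp_count, rdp_total);
}

In the kernel, the rnp side is already serialized by the rcu_node
lock held at this point in rcu_gp_init(), so only the per-CPU
blkd_locks need to be acquired while summing.
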
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d395f2810fac..90ce501a568e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -931,6 +931,10 @@ struct task_struct {
union rcu_special rcu_read_unlock_special;
struct list_head rcu_node_entry;
struct rcu_node *rcu_blocked_node;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+ struct list_head rcu_rdp_entry;
+ int rcu_blocked_cpu;
+#endif
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
diff --git a/kernel/fork.c b/kernel/fork.c
index b1f3915d5f8e..7a5ba2d2c1b5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1819,6 +1819,10 @@ static inline void rcu_copy_process(struct task_struct *p)
p->rcu_read_unlock_special.s = 0;
p->rcu_blocked_node = NULL;
INIT_LIST_HEAD(&p->rcu_node_entry);
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+ INIT_LIST_HEAD(&p->rcu_rdp_entry);
+ p->rcu_blocked_cpu = -1;
+#endif
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
p->rcu_tasks_holdout = false;
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 4d9b21f69eaa..4bb12f1fed09 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -248,6 +248,18 @@ config RCU_EXP_KTHREAD
Accept the default if unsure.
+config RCU_PER_CPU_BLOCKED_LISTS
+ bool "Use per-CPU blocked task lists in PREEMPT_RCU"
+ depends on PREEMPT_RCU
+ default n
+ help
+ Enable per-CPU tracking of tasks blocked in RCU read-side
+ critical sections. This allows the feature to be toggled quickly
+ during development. Eventually this option will be removed in
+ favor of always enabling the optimization.
+
+ Accept the default if unsure.
+
config RCU_NOCB_CPU
bool "Offload RCU callback processing from boot-selected CPUs"
depends on TREE_RCU
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 293bbd9ac3f4..e2b6a4579086 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1809,6 +1809,14 @@ static noinline_for_stack bool rcu_gp_init(void)
struct rcu_node *rnp = rcu_get_root();
bool start_new_poll;
unsigned long old_gp_seq;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+ struct task_struct *t_verify;
+ int cpu_verify;
+ int rnp_count;
+ int rdp_total;
+ struct rcu_data *rdp_cpu;
+ struct task_struct *t_rdp;
+#endif
WRITE_ONCE(rcu_state.gp_activity, jiffies);
raw_spin_lock_irq_rcu_node(rnp);
@@ -1891,6 +1899,26 @@ static noinline_for_stack bool rcu_gp_init(void)
*/
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_rcu_node(rnp);
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+ /*
+ * Verify that the rdp lists are consistent with the rnp list. The
+ * unlock path removes a task from its rdp list before the rnp list,
+ * so a task can be on the rnp list but not on any rdp list while
+ * mid-removal; rnp_count >= rdp_total is thus the expected invariant.
+ */
+ rnp_count = 0;
+ rdp_total = 0;
+ list_for_each_entry(t_verify, &rnp->blkd_tasks, rcu_node_entry)
+ rnp_count++;
+ for (cpu_verify = rnp->grplo; cpu_verify <= rnp->grphi; cpu_verify++) {
+ rdp_cpu = per_cpu_ptr(&rcu_data, cpu_verify);
+ raw_spin_lock(&rdp_cpu->blkd_lock);
+ list_for_each_entry(t_rdp, &rdp_cpu->blkd_list, rcu_rdp_entry)
+ rdp_total++;
+ raw_spin_unlock(&rdp_cpu->blkd_lock);
+ }
+ WARN_ON_ONCE(rnp_count < rdp_total);
+#endif
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
!rnp->wait_blkd_tasks) {
/* Nothing to do on this leaf rcu_node structure. */
@@ -4143,6 +4171,10 @@ rcu_boot_init_percpu_data(int cpu)
rdp->rcu_onl_gp_state = RCU_GP_CLEANED;
rdp->last_sched_clock = jiffies;
rdp->cpu = cpu;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+ raw_spin_lock_init(&rdp->blkd_lock);
+ INIT_LIST_HEAD(&rdp->blkd_list);
+#endif
rcu_boot_init_nocb_percpu_data(rdp);
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b8bbe7960cda..13d5649a80fb 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -294,6 +294,12 @@ struct rcu_data {
long lazy_len; /* Length of buffered lazy callbacks. */
int cpu;
+
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+ /* 8) Per-CPU blocked task tracking. */
+ raw_spinlock_t blkd_lock; /* Protects blkd_list. */
+ struct list_head blkd_list; /* Tasks blocked on this CPU. */
+#endif
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 73ba5f4a968d..5d2bde19131a 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -338,6 +338,12 @@ void rcu_note_context_switch(bool preempt)
raw_spin_lock_rcu_node(rnp);
t->rcu_read_unlock_special.b.blocked = true;
t->rcu_blocked_node = rnp;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+ t->rcu_blocked_cpu = rdp->cpu;
+ raw_spin_lock(&rdp->blkd_lock);
+ list_add(&t->rcu_rdp_entry, &rdp->blkd_list);
+ raw_spin_unlock(&rdp->blkd_lock);
+#endif
/*
* Verify the CPU's sanity, trace the preemption, and
@@ -485,6 +491,10 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
struct rcu_data *rdp;
struct rcu_node *rnp;
union rcu_special special;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+ int blocked_cpu;
+ struct rcu_data *blocked_rdp;
+#endif
rdp = this_cpu_ptr(&rcu_data);
if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
@@ -530,6 +540,17 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* to loop. Retain a WARN_ON_ONCE() out of sheer paranoia.
*/
rnp = t->rcu_blocked_node;
+#ifdef CONFIG_RCU_PER_CPU_BLOCKED_LISTS
+ /* Remove the task from the per-CPU list if it was added to one. */
+ blocked_cpu = t->rcu_blocked_cpu;
+ if (blocked_cpu != -1) {
+ blocked_rdp = per_cpu_ptr(&rcu_data, blocked_cpu);
+ raw_spin_lock(&blocked_rdp->blkd_lock);
+ list_del_init(&t->rcu_rdp_entry);
+ t->rcu_blocked_cpu = -1;
+ raw_spin_unlock(&blocked_rdp->blkd_lock);
+ }
+#endif
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
WARN_ON_ONCE(rnp != t->rcu_blocked_node);
WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
--
2.34.1