[<prev] [next>] [day] [month] [year] [list]
Message-Id: <20251225074451.2405584-1-joelagnelf@nvidia.com>
Date: Thu, 25 Dec 2025 02:44:50 -0500
From: Joel Fernandes <joelagnelf@...dia.com>
To: linux-kernel@...r.kernel.org,
"Paul E. McKenney" <paulmck@...nel.org>,
Frederic Weisbecker <frederic@...nel.org>,
Neeraj Upadhyay <neeraj.upadhyay@...nel.org>,
Joel Fernandes <joelagnelf@...dia.com>,
Josh Triplett <josh@...htriplett.org>,
Boqun Feng <boqun.feng@...il.com>,
Uladzislau Rezki <urezki@...il.com>,
Steven Rostedt <rostedt@...dmis.org>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Lai Jiangshan <jiangshanlai@...il.com>,
Zqiang <qiang.zhang@...ux.dev>
Cc: rcu@...r.kernel.org
Subject: [PATCH RFC] rcu/nocb: Remove unnecessary WakeOvfIsDeferred wake path
The WakeOvfIsDeferred code path in __call_rcu_nocb_wake() attempts to
wake rcuog when the callback count exceeds qhimark and callbacks aren't
done with their GP (newly queued or awaiting GP). However, a lot of
testing proves this wake is always redundant or useless.
In the flooding case, rcuog is always waiting for a GP to finish. So
waking up the rcuog thread is pointless. The timer wakeup adds overhead,
rcuog simply wakes up and goes back to sleep achieving nothing.
This path also adds a full memory barrier, and additional timer expiry
modifications unnecessarily.
The root cause is that WakeOvfIsDeferred fires when
!rcu_segcblist_ready_cbs() (GP not complete), but waking rcuog cannot
accelerate GP completion.
This commit therefore removes this path, which also adding some rdp
counters to ensure we don't have lost wake ups.
Tested with rcutorture scenarios: TREE01, TREE05, TREE08 (all NOCB
configurations) - all pass. Also stress tested using a kernel module
that floods call_rcu() to trigger the overload conditions and made the
observations confirming the findings.
Signed-off-by: Joel Fernandes <joelagnelf@...dia.com>
---
kernel/rcu/tree.h | 3 ++-
kernel/rcu/tree_nocb.h | 43 +++++++++++++++++-------------------------
2 files changed, 19 insertions(+), 27 deletions(-)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b8bbe7960cda..03ff62f28d00 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -257,6 +257,8 @@ struct rcu_data {
unsigned long nocb_gp_loops; /* # passes through wait code. */
struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
+ bool nocb_gp_wake_attempt; /* Was rcuog wake attempted? */
+ bool nocb_gp_serving; /* Is rcuog serving this rdp? */
struct task_struct *nocb_cb_kthread;
struct list_head nocb_head_rdp; /*
* Head of rcu_data list in wakeup chain,
@@ -301,7 +303,6 @@ struct rcu_data {
#define RCU_NOCB_WAKE_BYPASS 1
#define RCU_NOCB_WAKE_LAZY 2
#define RCU_NOCB_WAKE 3
-#define RCU_NOCB_WAKE_FORCE 4
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index e6cd56603cad..9f58589f08d7 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -518,10 +518,8 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
}
/*
- * Awaken the no-CBs grace-period kthread if needed, either due to it
- * legitimately being asleep or due to overload conditions.
- *
- * If warranted, also wake up the kthread servicing this CPUs queues.
+ * Awaken the no-CBs grace-period kthread if needed due to it legitimately
+ * being asleep.
*/
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
unsigned long flags)
@@ -533,7 +531,6 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
long lazy_len;
long len;
struct task_struct *t;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
// If we are being polled or there is no kthread, just leave.
t = READ_ONCE(rdp->nocb_gp_kthread);
@@ -549,24 +546,26 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
lazy_len = READ_ONCE(rdp->lazy_len);
if (was_alldone) {
rdp->qlen_last_fqs_check = len;
+ rdp->nocb_gp_wake_attempt = true;
+ rcu_nocb_unlock(rdp);
// Only lazy CBs in bypass list
if (lazy_len && bypass_len == lazy_len) {
- rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
TPS("WakeLazy"));
} else if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
- rcu_nocb_unlock(rdp);
wake_nocb_gp(rdp, false);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
TPS("WakeEmptyIsDeferred"));
}
+
+ return;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
- /* ... or if many callbacks queued. */
+ /* Callback overload condition. */
+ WARN_ON_ONCE(!rdp->nocb_gp_wake_attempt && !rdp->nocb_gp_serving);
rdp->qlen_last_fqs_check = len;
j = jiffies;
if (j != rdp->nocb_gp_adv_time &&
@@ -575,21 +574,10 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
rcu_advance_cbs_nowake(rdp->mynode, rdp);
rdp->nocb_gp_adv_time = j;
}
- smp_mb(); /* Enqueue before timer_pending(). */
- if ((rdp->nocb_cb_sleep ||
- !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp_gp->nocb_timer)) {
- rcu_nocb_unlock(rdp);
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
- } else {
- rcu_nocb_unlock(rdp);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
- }
- } else {
- rcu_nocb_unlock(rdp);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
+
+ rcu_nocb_unlock(rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
@@ -702,10 +690,13 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bypass_ncbs > 2 * qhimark)) {
flush_bypass = true;
} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rdp->nocb_gp_wake_attempt = false;
+ rdp->nocb_gp_serving = false;
rcu_nocb_unlock_irqrestore(rdp, flags);
continue; /* No callbacks here, try next. */
}
+ rdp->nocb_gp_serving = true;
if (flush_bypass) {
// Bypass full or old, so flush it.
(void)rcu_nocb_try_flush_bypass(rdp, j);
@@ -966,7 +957,6 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
unsigned long flags)
__releases(rdp_gp->nocb_gp_lock)
{
- int ndw;
int ret;
if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
@@ -974,8 +964,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
return false;
}
- ndw = rdp_gp->nocb_defer_wakeup;
- ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ ret = __wake_nocb_gp(rdp_gp, rdp, false, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
return ret;
@@ -1077,6 +1066,8 @@ static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+ rdp->nocb_gp_wake_attempt = false;
+ rdp->nocb_gp_serving = false;
rcu_nocb_unlock_irqrestore(rdp, flags);
wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
--
2.34.1
Powered by blists - more mailing lists