Message-ID: <36f40819-d2ac-4185-9f1c-1d01940b1d8a@paulmck-laptop>
Date: Thu, 25 Sep 2025 11:58:17 -0700
From: "Paul E. McKenney" <paulmck@...nel.org>
To: Andrii Nakryiko <andrii.nakryiko@...il.com>
Cc: rcu@...r.kernel.org, linux-kernel@...r.kernel.org, kernel-team@...a.com,
rostedt@...dmis.org, kernel test robot <oliver.sang@...el.com>,
Andrii Nakryiko <andrii@...nel.org>,
Alexei Starovoitov <ast@...nel.org>,
Peter Zijlstra <peterz@...radead.org>, bpf@...r.kernel.org
Subject: Re: [PATCH 01/34] rcu: Re-implement RCU Tasks Trace in terms of
SRCU-fast

On Thu, Sep 25, 2025 at 11:39:18AM -0700, Andrii Nakryiko wrote:
> On Tue, Sep 23, 2025 at 7:22 AM Paul E. McKenney <paulmck@...nel.org> wrote:
> >
> > This commit saves more than 500 lines of RCU code by re-implementing
> > RCU Tasks Trace in terms of SRCU-fast. Follow-up work will remove
> > more code that does not cause problems by its presence, but that is no
> > longer required.
> >
> > This variant places smp_mb() in rcu_read_{,un}lock_trace(), which will
> > be removed on common-case architectures in a later commit.
> >
> > [ paulmck: Apply kernel test robot, Boqun Feng, and Zqiang feedback. ]
> >
> > Signed-off-by: Paul E. McKenney <paulmck@...nel.org>
> > Tested-by: kernel test robot <oliver.sang@...el.com>
> > Cc: Andrii Nakryiko <andrii@...nel.org>
> > Cc: Alexei Starovoitov <ast@...nel.org>
> > Cc: Peter Zijlstra <peterz@...radead.org>
> > Cc: <bpf@...r.kernel.org>
> > ---
> > include/linux/rcupdate_trace.h | 107 ++++--
> > include/linux/sched.h | 1 +
> > kernel/rcu/srcutiny.c | 13 +-
> > kernel/rcu/tasks.h | 617 +--------------------------------
> > 4 files changed, 104 insertions(+), 634 deletions(-)
> >
>
> makes sense to me overall, but I had a few questions below
>
> > diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h
> > index e6c44eb428ab63..3f46cbe6700038 100644
> > --- a/include/linux/rcupdate_trace.h
> > +++ b/include/linux/rcupdate_trace.h
> > @@ -12,28 +12,28 @@
> > #include <linux/rcupdate.h>
> > #include <linux/cleanup.h>
> >
> > -extern struct lockdep_map rcu_trace_lock_map;
> > +#ifdef CONFIG_TASKS_TRACE_RCU
> > +extern struct srcu_struct rcu_tasks_trace_srcu_struct;
> > +#endif // #ifdef CONFIG_TASKS_TRACE_RCU
> >
> > -#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > +#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_TASKS_TRACE_RCU)
> >
> > static inline int rcu_read_lock_trace_held(void)
> > {
> > - return lock_is_held(&rcu_trace_lock_map);
> > + return srcu_read_lock_held(&rcu_tasks_trace_srcu_struct);
> > }
> >
> > -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
> > +#else // #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_TASKS_TRACE_RCU)
> >
> > static inline int rcu_read_lock_trace_held(void)
> > {
> > return 1;
> > }
> >
> > -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
> > +#endif // #else // #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_TASKS_TRACE_RCU)
>
> nit: // #else // #if... looks very unconventional

Perhaps so, but it is much easier to deal with.

> > #ifdef CONFIG_TASKS_TRACE_RCU
> >
> > -void rcu_read_unlock_trace_special(struct task_struct *t);
> > -
> > /**
> > * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section
> > *
> > @@ -50,12 +50,14 @@ static inline void rcu_read_lock_trace(void)
> > {
> > struct task_struct *t = current;
> >
> > - WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1);
> > - barrier();
> > - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
> > - t->trc_reader_special.b.need_mb)
> > - smp_mb(); // Pairs with update-side barriers
> > - rcu_lock_acquire(&rcu_trace_lock_map);
> > + if (t->trc_reader_nesting++) {
> > + // In case we interrupted a Tasks Trace RCU reader.
> > + rcu_try_lock_acquire(&rcu_tasks_trace_srcu_struct.dep_map);
>
> why is this a "try_lock" variant instead of a no-try "lock_acquire"
> one? Some lockdep special treatment for nested locking?

Because rcu_read_lock_trace() cannot participate in anywhere near the
variety of deadlocks that (say) read_lock() can. So we tell lockdep
that this is a try-lock variant so that it won't spew false-positive
deadlock diagnostics.
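
The distinction boils down to the trylock argument that the two helpers
pass to lockdep.  Roughly (a sketch of the helpers as they appear in
include/linux/rcupdate.h; the exact argument values are from memory, so
treat them as approximate):

	static inline void rcu_lock_acquire(struct lockdep_map *map)
	{
		lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
	}

	static inline void rcu_try_lock_acquire(struct lockdep_map *map)
	{
		/*
		 * trylock == 1: lockdep still records the acquisition
		 * for held-lock reporting, but a trylock cannot block,
		 * so it is not added to the dependency graph and thus
		 * cannot show up in a deadlock-cycle report.
		 */
		lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_);
	}

So a nested rcu_read_lock_trace() still appears in lockdep's held-lock
lists, but cannot by itself trigger a circular-dependency splat.
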
> > + return;
> > + }
> > + barrier(); // nesting before scp to protect against interrupt handler.
> > + t->trc_reader_scp = srcu_read_lock_fast(&rcu_tasks_trace_srcu_struct);
> > + smp_mb(); // Placeholder for more selective ordering
> > }
> >
> > /**
>
> [...]
>
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 2b272382673d62..89d3646155525f 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -939,6 +939,7 @@ struct task_struct {
> >
> > #ifdef CONFIG_TASKS_TRACE_RCU
> > int trc_reader_nesting;
> > + struct srcu_ctr __percpu *trc_reader_scp;
> > int trc_ipi_to_cpu;
> > union rcu_special trc_reader_special;
> > struct list_head trc_holdout_list;
> > diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
> > index e3b64a5e0ec7e1..3450c3751ef7ad 100644
> > --- a/kernel/rcu/srcutiny.c
> > +++ b/kernel/rcu/srcutiny.c
> > @@ -106,15 +106,15 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
> > newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
> > WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
> > preempt_enable();
> > - if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
> > + if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task() && !irqs_disabled())
>
> this seems like something that probably should be done in a separate
> patch with an explanation on why?
>
> > swake_up_one(&ssp->srcu_wq);
> > }
> > EXPORT_SYMBOL_GPL(__srcu_read_unlock);
> >
> > /*
> > * Workqueue handler to drive one grace period and invoke any callbacks
> > - * that become ready as a result. Single-CPU and !PREEMPTION operation
> > - * means that we get away with murder on synchronization. ;-)
> > + * that become ready as a result. Single-CPU operation and preemption
> > + * disabling mean that we get away with murder on synchronization. ;-)
> > */
> > void srcu_drive_gp(struct work_struct *wp)
> > {
> > @@ -141,7 +141,12 @@ void srcu_drive_gp(struct work_struct *wp)
> > WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
> > WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
> > preempt_enable();
> > - swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
> > + do {
> > + // Deadlock issues prevent __srcu_read_unlock() from
> > + // doing an unconditional wakeup, so polling is required.
> > + swait_event_timeout_exclusive(ssp->srcu_wq,
> > + !READ_ONCE(ssp->srcu_lock_nesting[idx]), HZ / 10);
> > + } while (READ_ONCE(ssp->srcu_lock_nesting[idx]));
>
> ditto, generic srcu change, driven by RCU Tasks Trace transformation,
> but probably worth calling it out separately?

Good point for both, will extract them to their own commits.

						Thanx, Paul

> > preempt_disable(); // Needed for PREEMPT_LAZY
> > WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
> > WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
>
> [...]