lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <960035c6-c5f3-4c31-bd0f-f57d79b040f4@paulmck-laptop>
Date: Sun, 6 Jul 2025 10:08:15 -0700
From: "Paul E. McKenney" <paulmck@...nel.org>
To: Joel Fernandes <joelagnelf@...dia.com>
Cc: linux-kernel@...r.kernel.org, Frederic Weisbecker <frederic@...nel.org>,
	Neeraj Upadhyay <neeraj.upadhyay@...nel.org>,
	Josh Triplett <josh@...htriplett.org>,
	Boqun Feng <boqun.feng@...il.com>,
	Uladzislau Rezki <urezki@...il.com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
	Lai Jiangshan <jiangshanlai@...il.com>,
	Zqiang <qiang.zhang@...ux.dev>, rcu@...r.kernel.org
Subject: Re: [PATCH RFC 1/3] rcu: Fix rcu_read_unlock() deadloop due to IRQ
 work

On Sat, Jul 05, 2025 at 04:39:15PM -0400, Joel Fernandes wrote:
> Signed-off-by: Joel Fernandes <joelagnelf@...dia.com>

Definitely headed in the right direction, though it does need just a
little bit more detail in the commit log.  ;-)

Also a few comments and questions interspersed below.

							Thanx, Paul

> ---
>  kernel/rcu/tree.h        | 11 ++++++++++-
>  kernel/rcu/tree_plugin.h | 29 ++++++++++++++++++++++-------
>  2 files changed, 32 insertions(+), 8 deletions(-)
> 
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index 3830c19cf2f6..f8f612269e6e 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -174,6 +174,15 @@ struct rcu_snap_record {
>  	unsigned long   jiffies;	/* Track jiffies value */
>  };
>  
> +/*
> + * The IRQ work (deferred_qs_iw) is used by RCU to get scheduler's attention.
> + * It can be in one of the following states:
> + * - DEFER_QS_IDLE: An IRQ work was never scheduled.
> + * - DEFER_QS_PENDING: An IRQ work was scheduler but never run.
> + */
> +#define DEFER_QS_IDLE		0
> +#define DEFER_QS_PENDING	1

Having names for the states is good!

> +
>  /* Per-CPU data for read-copy update. */
>  struct rcu_data {
>  	/* 1) quiescent-state and grace-period handling : */
> @@ -192,7 +201,7 @@ struct rcu_data {
>  					/*  during and after the last grace */
>  					/* period it is aware of. */
>  	struct irq_work defer_qs_iw;	/* Obtain later scheduler attention. */
> -	bool defer_qs_iw_pending;	/* Scheduler attention pending? */
> +	int defer_qs_iw_pending;	/* Scheduler attention pending? */
>  	struct work_struct strict_work;	/* Schedule readers for strict GPs. */
>  
>  	/* 2) batch handling */
> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> index dd1c156c1759..baf57745b42f 100644
> --- a/kernel/rcu/tree_plugin.h
> +++ b/kernel/rcu/tree_plugin.h
> @@ -486,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
>  	struct rcu_node *rnp;
>  	union rcu_special special;
>  
> +	rdp = this_cpu_ptr(&rcu_data);
> +	if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
> +		rdp->defer_qs_iw_pending = DEFER_QS_IDLE;

Good, this is where the request actually gets serviced.

> +
>  	/*
>  	 * If RCU core is waiting for this CPU to exit its critical section,
>  	 * report the fact that it has exited.  Because irqs are disabled,
>  	 * t->rcu_read_unlock_special cannot change.
>  	 */
>  	special = t->rcu_read_unlock_special;
> -	rdp = this_cpu_ptr(&rcu_data);
>  	if (!special.s && !rdp->cpu_no_qs.b.exp) {
>  		local_irq_restore(flags);
>  		return;
> @@ -623,12 +626,24 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t)
>   */
>  static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
>  {
> -	unsigned long flags;
> -	struct rcu_data *rdp;
> +	volatile unsigned long flags;

I don't understand why this wants to be volatile.

Unless maybe you want to make sure that gdb can see it, in
which case, is there an existing Kconfig option for that?  Maybe
CONFIG_DEBUG_INFO_NONE=n?

> +	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
>  
> -	rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
>  	local_irq_save(flags);
> -	rdp->defer_qs_iw_pending = false;
> +
> +	/*
> +	 * Requeue the IRQ work on next unlock in following situation:
> +	 * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING)
> +	 * 2. CPU enters new rcu_read_lock()
> +	 * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0
> +	 * 4. rcu_read_unlock() does not re-queue work (state still PENDING)
> +	 * 5. Deferred QS reporting does not happen.
> +	 */
> +	if (rcu_preempt_depth() > 0) {
> +		WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);

Shouldn't we have just this WRITE_ONCE() in this then-clause?

> +		local_irq_restore(flags);
> +		return;
> +	}
>  	local_irq_restore(flags);
>  }
>  
> @@ -675,7 +690,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
>  			set_tsk_need_resched(current);
>  			set_preempt_need_resched();
>  			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
> -			    expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
> +			    expboost && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && cpu_online(rdp->cpu)) {
>  				// Get scheduler to re-evaluate and call hooks.
>  				// If !IRQ_WORK, FQS scan will eventually IPI.
>  				if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
> @@ -685,7 +700,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
>  				else
>  					init_irq_work(&rdp->defer_qs_iw,
>  						      rcu_preempt_deferred_qs_handler);
> -				rdp->defer_qs_iw_pending = true;
> +				rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
>  				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
>  			}
>  		}
> -- 
> 2.43.0
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ