Date:	Sun, 10 Aug 2008 18:38:09 -0700
From:	"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>
To:	David Witbrodt <dawitbro@...global.net>
Cc:	Peter Zijlstra <peterz@...radead.org>,
	linux-kernel@...r.kernel.org, Yinghai Lu <yhlu.kernel@...il.com>,
	Ingo Molnar <mingo@...e.hu>,
	Thomas Gleixner <tglx@...utronix.de>,
	"H. Peter Anvin" <hpa@...or.com>, netdev <netdev@...r.kernel.org>
Subject: Re: [PATCH diagnostic] Re: HPET regression in 2.6.26 versus 2.6.25
	-- RCU problem

On Sun, Aug 10, 2008 at 06:35:38PM -0700, Paul E. McKenney wrote:
> On Sun, Aug 10, 2008 at 08:15:20AM -0700, Paul E. McKenney wrote:
> > I will see about putting together a diagnostic patch for Classic RCU.
> > The approach will be to record jiffies (or some such) at the beginning
> > of the grace period (in rcu_start_batch()), then have
> > rcu_check_callbacks() complain if:
> > 
> > 1.	it is running on a CPU that has been holding up grace periods for
> > 	a long time (say one second).  This will identify the culprit
> > 	assuming that the culprit has not disabled hardware irqs,
> > 	instruction execution, or some such.
> > 
> > 2.	it is running on a CPU that is not holding up grace periods,
> > 	but grace periods have been held up for an even longer time
> > 	(say two seconds).
> > 
> > In either case, some sort of exponential backoff would be needed to
> > avoid multi-gigabyte log files.  Of course, all of this assumes that
> > the machine remains healthy enough to actually get any such messages
> > somewhere that you can see them, but so it goes...
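
(For what it's worth, the exponential backoff mentioned above would amount
to something like the sketch below -- the names and the initial one-second
quiet period are purely illustrative, and as noted further down, the patch
simply settles for a fixed 30-second interval instead.)

/* Illustrative only: permit a stall report, then double the quiet
 * period that must elapse before the next report is permitted. */
static unsigned long stall_quiet_period = 1;	/* seconds */

static int stall_report_allowed(unsigned long *next_report)
{
	unsigned long now = get_seconds();

	if ((long)(now - *next_report) < 0)
		return 0;		/* still inside the quiet period */
	*next_report = now + stall_quiet_period;
	stall_quiet_period <<= 1;	/* exponential backoff */
	return 1;
}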

And attached is the kernel module I used to test the patch, for whatever
that might be worth.
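
In case anyone wants to reproduce this without the attachment, something
along the following lines should trip the detector.  (Rough sketch only,
not the attached module itself -- the names are made up, and the ten-second
spin is simply chosen to blow well past the three-second gp_check window.)

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>

static struct rcu_head stall_head;

static void stall_rcu_cb(struct rcu_head *head)
{
}

/*
 * Queue a dummy callback so that a grace period is needed, then spin
 * with preemption disabled so this CPU never passes through a
 * quiescent state, stalling that grace period long enough to trigger
 * the warnings added by the patch below.
 */
static int __init stall_init(void)
{
	printk(KERN_ALERT "stall: spinning with preemption disabled\n");
	call_rcu(&stall_head, stall_rcu_cb);
	preempt_disable();
	mdelay(10000);		/* busy-wait for roughly ten seconds */
	preempt_enable();
	printk(KERN_ALERT "stall: done\n");
	return 0;
}

static void __exit stall_exit(void)
{
	rcu_barrier();		/* let the dummy callback finish before unload */
}

module_init(stall_init);
module_exit(stall_exit);
MODULE_LICENSE("GPL");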

							Thanx, Paul

> And here is the patch.  It is still a bit raw, so the results should
> be viewed with some suspicion.  It adds a default-off kernel configuration
> option, CONFIG_RCU_CPU_STALL, which must be enabled.
> 
> Rather than exponential backoff, it backs off to once per 30 seconds.
> My feeling upon thinking on it was that if you have stalled RCU grace
> periods for that long, a few extra printk() messages are probably the
> least of your worries...
> 
> Signed-off-by: Paul E. McKenney <paulmck@...ux.vnet.ibm.com>
> ---
> 
>  include/linux/rcuclassic.h |    3 +
>  kernel/rcuclassic.c        |   80 +++++++++++++++++++++++++++++++++++++++++++++
>  lib/Kconfig.debug          |   13 +++++++
>  3 files changed, 96 insertions(+)
> 
> diff -urpNa -X dontdiff linux-2.6.27-rc1/include/linux/rcuclassic.h linux-2.6.27-rc1-cpustall/include/linux/rcuclassic.h
> --- linux-2.6.27-rc1/include/linux/rcuclassic.h	2008-07-30 08:48:16.000000000 -0700
> +++ linux-2.6.27-rc1-cpustall/include/linux/rcuclassic.h	2008-08-10 12:21:22.000000000 -0700
> @@ -46,6 +46,9 @@ struct rcu_ctrlblk {
>  	long	cur;		/* Current batch number.                      */
>  	long	completed;	/* Number of the last completed batch         */
>  	int	next_pending;	/* Is the next batch already waiting?         */
> +#ifdef CONFIG_RCU_CPU_STALL
> +	unsigned long gp_check;	/* Time grace period should end, in seconds.  */
> +#endif /* #ifdef CONFIG_RCU_CPU_STALL */
>  
>  	int	signaled;
>  
> diff -urpNa -X dontdiff linux-2.6.27-rc1/kernel/rcuclassic.c linux-2.6.27-rc1-cpustall/kernel/rcuclassic.c
> --- linux-2.6.27-rc1/kernel/rcuclassic.c	2008-07-30 08:48:17.000000000 -0700
> +++ linux-2.6.27-rc1-cpustall/kernel/rcuclassic.c	2008-08-10 17:51:32.000000000 -0700
> @@ -47,6 +47,7 @@
>  #include <linux/notifier.h>
>  #include <linux/cpu.h>
>  #include <linux/mutex.h>
> +#include <linux/time.h>
>  
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>  static struct lock_class_key rcu_lock_key;
> @@ -269,6 +270,81 @@ static void rcu_do_batch(struct rcu_data
>   *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
>   *   period (if necessary).
>   */
> +
> +#ifdef CONFIG_RCU_CPU_STALL
> +
> +static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
> +{
> +	rcp->gp_check = get_seconds() + 3;
> +}
> +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
> +{
> +	int cpu;
> +	long delta;
> +
> +	/* Only let one CPU complain about others per time interval. */
> +
> +	spin_lock(&rcp->lock);
> +	delta = get_seconds() - rcp->gp_check;
> +	if (delta < 2L ||
> +	    cpus_empty(rcp->cpumask)) {
> +		spin_unlock(&rcp->lock);
> +		return;
> +	}
> +	rcp->gp_check = get_seconds() + 30;
> +	spin_unlock(&rcp->lock);
> +
> +	/* OK, time to rat on our buddy... */
> +
> +	printk(KERN_ERR "RCU detected CPU stalls:");
> +	for_each_cpu_mask(cpu, rcp->cpumask)
> +		printk(" %d", cpu);
> +	printk(" (detected by %d, t=%lu/%lu)\n",
> +	       smp_processor_id(), get_seconds(), rcp->gp_check);
> +}
> +static void print_cpu_stall(struct rcu_ctrlblk *rcp)
> +{
> +	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
> +			smp_processor_id(), get_seconds(), rcp->gp_check);
> +	dump_stack();
> +	spin_lock(&rcp->lock);
> +	if ((long)(get_seconds() - rcp->gp_check) >= 0L)
> +		rcp->gp_check = get_seconds() + 30;
> +	spin_unlock(&rcp->lock);
> +}
> +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
> +				   struct rcu_data *rdp)
> +{
> +	long delta;
> +
> +	delta = get_seconds() - rcp->gp_check;
> +	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
> +
> +		/* We haven't checked in, so go dump stack. */
> +
> +		print_cpu_stall(rcp);
> +
> +	} else if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
> +
> +		/* They had two seconds to dump stack, so complain. */
> +
> +		print_other_cpu_stall(rcp);
> +
> +	}
> +}
> +
> +#else /* #ifdef CONFIG_RCU_CPU_STALL */
> +
> +static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
> +{
> +}
> +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
> +				   struct rcu_data *rdp)
> +{
> +}
> +
> +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL */
> +
>  /*
>   * Register a new batch of callbacks, and start it up if there is currently no
>   * active batch and the batch to be registered has not already occurred.
> @@ -285,6 +361,7 @@ static void rcu_start_batch(struct rcu_c
>  		 */
>  		smp_wmb();
>  		rcp->cur++;
> +		record_gp_check_time(rcp);
>  
>  		/*
>  		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
> @@ -468,6 +545,9 @@ static void rcu_process_callbacks(struct
>  
>  static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
>  {
> +	/* Check for CPU stalls, if enabled. */
> +	check_cpu_stall(rcp, rdp);
> +
>  	/* This cpu has pending rcu entries and the grace period
>  	 * for them has completed.
>  	 */
> diff -urpNa -X dontdiff linux-2.6.27-rc1/lib/Kconfig.debug linux-2.6.27-rc1-cpustall/lib/Kconfig.debug
> --- linux-2.6.27-rc1/lib/Kconfig.debug	2008-07-30 08:48:17.000000000 -0700
> +++ linux-2.6.27-rc1-cpustall/lib/Kconfig.debug	2008-08-10 12:14:18.000000000 -0700
> @@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE
>  	  Say N here if you want the RCU torture tests to start only
>  	  after being manually enabled via /proc.
>  
> +config RCU_CPU_STALL
> +	bool "Check for stalled CPUs delaying RCU grace periods"
> +	depends on CLASSIC_RCU
> +	default n
> +	help
> +	  This option causes RCU to printk information on which
> +	  CPUs are delaying the current grace period, but only when
> +	  the grace period extends for excessive time periods.
> +
> +	  Say Y if you want RCU to perform such checks.
> +
> +	  Say N if you are unsure.
> +
>  config KPROBES_SANITY_TEST
>  	bool "Kprobes sanity tests"
>  	depends on DEBUG_KERNEL

View attachment "stall.c" of type "text/x-csrc" (1720 bytes)
