linux-kernel - Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard lockup

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20150925144448.GA100922@redhat.com>
Date:	Fri, 25 Sep 2015 10:44:48 -0400
From:	Don Zickus <dzickus@...hat.com>
To:	Jiri Kosina <jikos@...nel.org>
Cc:	Andrew Morton <akpm@...ux-foundation.org>,
	Ulrich Obergfell <uobergfe@...hat.com>,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard
 lockup

On Fri, Sep 25, 2015 at 01:15:56PM +0200, Jiri Kosina wrote:
> From: Jiri Kosina <jkosina@...e.cz>
> 
> In many cases of hardlockup reports, it's actually not possible to know 
> why it triggered, because the CPU that got stuck is usually waiting on a 
> resource (with IRQs disabled) in posession of some other CPU is holding.
> 
> IOW, we are often looking at the stacktrace of the victim and not the 
> actual offender.
> 
> Introduce sysctl / cmdline parameter that makes it possible to have 
> hardlockup detector perform all-CPU backtrace.

Hi Jiri,

This seems to mimic the very similar softlockup_all_cpu_backtrace
functionality, so I am ok with this.  Let me try and run some tests on this
to make sure nothing regressed.

Cheers,
Don

> 
> Signed-off-by: Jiri Kosina <jkosina@...e.cz>
> ---
>  Documentation/kernel-parameters.txt |  5 +++++
>  Documentation/sysctl/kernel.txt     | 12 ++++++++++++
>  include/linux/nmi.h                 |  1 +
>  kernel/sysctl.c                     |  9 +++++++++
>  kernel/watchdog.c                   | 33 ++++++++++++++++++++++++++++-----
>  5 files changed, 55 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index 22a4b68..b4af96e 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1246,6 +1246,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
>  			Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
>  			Default: 1024
>  
> +	hardlockup_all_cpu_backtrace=
> +			[KNL] Should the hard-lockup detector generate
> +			backtraces on all cpus.
> +			Format: <integer>
> +
>  	hashdist=	[KNL,NUMA] Large hashes allocated during boot
>  			are distributed across NUMA nodes.  Defaults on
>  			for 64-bit NUMA, off otherwise.
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index 6fccb69..af70d15 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
>  - domainname
>  - hostname
>  - hotplug
> +- hardlockup_all_cpu_backtrace
>  - hung_task_panic
>  - hung_task_check_count
>  - hung_task_timeout_secs
> @@ -293,6 +294,17 @@ domain names are in general different. For a detailed discussion
>  see the hostname(1) man page.
>  
>  ==============================================================
> +hardlockup_all_cpu_backtrace:
> +
> +This value controls the hard lockup detector behavior when a hard
> +lockup condition is detected as to whether or not to gather further
> +debug information. If enabled, arch-specific all-CPU stack dumping
> +will be initiated.
> +
> +0: do nothing. This is the default behavior.
> +
> +1: on detection capture more debug information.
> +==============================================================
>  
>  hotplug:
>  
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 78488e0..7ec5b86 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -73,6 +73,7 @@ extern int watchdog_user_enabled;
>  extern int watchdog_thresh;
>  extern unsigned long *watchdog_cpumask_bits;
>  extern int sysctl_softlockup_all_cpu_backtrace;
> +extern int sysctl_hardlockup_all_cpu_backtrace;
>  struct ctl_table;
>  extern int proc_watchdog(struct ctl_table *, int ,
>  			 void __user *, size_t *, loff_t *);
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index e69201d..efb0370 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -897,6 +897,15 @@ static struct ctl_table kern_table[] = {
>  		.extra1		= &zero,
>  		.extra2		= &one,
>  	},
> +	{
> +		.procname	= "hardlockup_all_cpu_backtrace",
> +		.data		= &sysctl_hardlockup_all_cpu_backtrace,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &one,
> +	},
>  #endif /* CONFIG_SMP */
>  #endif
>  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 64ed1c3..202999c 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
>  
>  #ifdef CONFIG_SMP
>  int __read_mostly sysctl_softlockup_all_cpu_backtrace;
> +int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
>  #else
>  #define sysctl_softlockup_all_cpu_backtrace 0
> +#define sysctl_hardlockup_all_cpu_backtrace 0
>  #endif
>  static struct cpumask watchdog_cpumask __read_mostly;
>  unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
> @@ -112,6 +114,7 @@ static unsigned long soft_lockup_nmi_warn;
>  #ifdef CONFIG_HARDLOCKUP_DETECTOR
>  static int hardlockup_panic =
>  			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
> +static unsigned long hardlockup_allcpu_dumped;
>  /*
>   * We may not want to enable hard lockup detection by default in all cases,
>   * for example when running the kernel as a guest on a hypervisor. In these
> @@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
>  	return 1;
>  }
>  __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
> +static int __init hardlockup_all_cpu_backtrace_setup(char *str)
> +{
> +	sysctl_hardlockup_all_cpu_backtrace =
> +		!!simple_strtol(str, NULL, 0);
> +	return 1;
> +}
> +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
>  #endif
>  
>  /*
> @@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event,
>  	 */
>  	if (is_hardlockup()) {
>  		int this_cpu = smp_processor_id();
> +		struct pt_regs *regs = get_irq_regs();
>  
>  		/* only print hardlockups once */
>  		if (__this_cpu_read(hard_watchdog_warn) == true)
>  			return;
>  
> -		if (hardlockup_panic)
> -			panic("Watchdog detected hard LOCKUP on cpu %d",
> -			      this_cpu);
> +		pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
> +		print_modules();
> +		print_irqtrace_events(current);
> +		if (regs)
> +			show_regs(regs);
>  		else
> -			WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
> -			     this_cpu);
> +			dump_stack();
> +
> +		/*
> +		 * Perform all-CPU dump only once to avoid multiple hardlockups
> +		 * generating interleaving traces
> +		 */
> +		if (sysctl_hardlockup_all_cpu_backtrace &&
> +				!test_and_set_bit(0, &hardlockup_allcpu_dumped))
> +			trigger_allbutself_cpu_backtrace();
> +
> +		if (hardlockup_panic)
> +			panic("Hard LOCKUP");
>  
>  		__this_cpu_write(hard_watchdog_warn, true);
>  		return;
> -- 
> Jiri Kosina
> SUSE Labs
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/