Message-Id: <20161219115427.82712130f1e5f651f180e807@linux-foundation.org>
Date: Mon, 19 Dec 2016 11:54:27 -0800
From: Andrew Morton <akpm@...ux-foundation.org>
To: Don Zickus <dzickus@...hat.com>
Cc: LKML <linux-kernel@...r.kernel.org>,
Ulrich Obergfell <uobergfe@...hat.com>
Subject: Re: [PATCH] kernel/watchdog: Prevent false hardlockup on overloaded
system
On Tue, 6 Dec 2016 11:17:13 -0500 Don Zickus <dzickus@...hat.com> wrote:
> On an overloaded system, a change to the watchdog threshold can be
> delayed long enough to trigger a false positive.
>
> This can easily be reproduced by having one cpu spin indefinitely in a
> task while another cpu updates the watchdog threshold.
>
> What happens is that while the watchdog threads are being parked, the
> hrtimers on the other cpus fire and reprogram themselves with the new,
> slower watchdog threshold. Meanwhile, the nmi watchdog is still
> programmed with the old, faster threshold.
>
> Because the one cpu is blocked, it prevents the thread parking on the
> other cpus from completing, and that parking must complete before the
> nmi watchdog can be shut down and reprogrammed correctly. As a result,
> the nmi watchdog reports a false positive.
>
> Fix this by setting a park_in_progress flag that suppresses all lockup
> detection until the parking is complete.
>
> Fix provided by Ulrich Obergfell.
>
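The shape of the fix is easy to see outside the kernel.  Below is a
minimal userspace sketch of the same gating pattern -- an atomic flag
set around the reconfiguration window so the periodic checkers bail out
early instead of comparing against a half-updated threshold.  The names
(checker_tick, set_threshold) are made up for illustration; this is not
the kernel code:

/* gate.c: userspace analogue of the park_in_progress gate (sketch) */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int park_in_progress;	/* 0 = checks allowed */
static atomic_int threshold = 10;	/* stands in for watchdog_thresh */

/* Periodic check, analogous to watchdog_timer_fn()/the NMI callback. */
static void checker_tick(int observed)
{
	/*
	 * While a reconfiguration is in flight, old and new thresholds
	 * may be mixed across checkers, so report nothing.
	 */
	if (atomic_load(&park_in_progress) != 0)
		return;

	if (observed > atomic_load(&threshold))
		printf("lockup detected (observed=%d)\n", observed);
}

/* Reconfiguration path, analogous to watchdog_park_threads(). */
static void set_threshold(int new_thresh)
{
	atomic_store(&park_in_progress, 1);
	/* ... quiesce/park the checkers here ... */
	atomic_store(&threshold, new_thresh);
	atomic_store(&park_in_progress, 0);
}

int main(void)
{
	checker_tick(15);	/* reports: 15 > 10 */
	set_threshold(20);
	checker_tick(15);	/* silent: 15 <= 20 */
	return 0;
}

This compiles with any C11 compiler (e.g. gcc -std=c11 gate.c).  The
patch below is exactly this gate, with atomic_read()/atomic_set() in
watchdog_timer_fn(), watchdog_overflow_callback() and
watchdog_park_threads().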
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -111,6 +111,7 @@ static inline bool trigger_single_cpu_backtrace(int cpu)
>  extern unsigned long watchdog_enabled;
>  extern DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
>  extern unsigned long *watchdog_cpumask_bits;
> +extern atomic_t park_in_progress;
>  extern int sysctl_softlockup_all_cpu_backtrace;
>  extern int sysctl_hardlockup_all_cpu_backtrace;
>  struct ctl_table;
A kernel-wide identifier should really identify which subsystem it
belongs to...
--- a/include/linux/nmi.h~kernel-watchdog-prevent-false-hardlockup-on-overloaded-system-fix
+++ a/include/linux/nmi.h
@@ -110,7 +110,7 @@ extern int watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
 extern unsigned long *watchdog_cpumask_bits;
-extern atomic_t park_in_progress;
+extern atomic_t watchdog_park_in_progress;
 #ifdef CONFIG_SMP
 extern int sysctl_softlockup_all_cpu_backtrace;
 extern int sysctl_hardlockup_all_cpu_backtrace;
--- a/kernel/watchdog.c~kernel-watchdog-prevent-false-hardlockup-on-overloaded-system-fix
+++ a/kernel/watchdog.c
@@ -49,7 +49,7 @@ unsigned long *watchdog_cpumask_bits = c
 #define for_each_watchdog_cpu(cpu) \
 	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
-atomic_t park_in_progress = ATOMIC_INIT(0);
+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
 
 /*
  * The 'watchdog_running' variable is set to 1 when the watchdog threads
@@ -262,7 +262,7 @@ static enum hrtimer_restart watchdog_tim
 	int duration;
 	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
-	if (atomic_read(&park_in_progress) != 0)
+	if (atomic_read(&watchdog_park_in_progress) != 0)
 		return HRTIMER_NORESTART;
 
 	/* kick the hardlockup detector */
@@ -472,7 +472,7 @@ static int watchdog_park_threads(void)
 {
 	int cpu, ret = 0;
 
-	atomic_set(&park_in_progress, 1);
+	atomic_set(&watchdog_park_in_progress, 1);
 
 	for_each_watchdog_cpu(cpu) {
 		ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
@@ -480,7 +480,7 @@ static int watchdog_park_threads(void)
 			break;
 	}
 
-	atomic_set(&park_in_progress, 0);
+	atomic_set(&watchdog_park_in_progress, 0);
 
 	return ret;
 }
--- a/kernel/watchdog_hld.c~kernel-watchdog-prevent-false-hardlockup-on-overloaded-system-fix
+++ a/kernel/watchdog_hld.c
@@ -84,7 +84,7 @@ static void watchdog_overflow_callback(s
 	/* Ensure the watchdog never gets throttled */
 	event->hw.interrupts = 0;
 
-	if (atomic_read(&park_in_progress) != 0)
+	if (atomic_read(&watchdog_park_in_progress) != 0)
 		return;
 
 	if (__this_cpu_read(watchdog_nmi_touch) == true) {
_