[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180608135703.GD18941@redhat.com>
Date: Fri, 8 Jun 2018 15:57:04 +0200
From: Oleg Nesterov <oleg@...hat.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: mingo@...nel.org, gkohli@...eaurora.org, tglx@...utronix.de,
mpe@...erman.id.au, bigeasy@...utronix.de,
linux-kernel@...r.kernel.org, will.deacon@....com
Subject: Re: [PATCH 2/4] watchdog/softlockup: Replace "watchdog/%u" threads
with cpu_stop_work
On 06/07, Peter Zijlstra wrote:
>
> On Thu, Jun 07, 2018 at 02:33:12PM +0200, Peter Zijlstra wrote:
>
> > +static int softlockup_stop_fn(void *data)
> > {
> > + watchdog_disable(smp_processor_id());
> > + return 0;
> > }
> >
> > +static void softlockup_stop_all(void)
> > {
> > + int cpu;
> > +
> > + for_each_cpu(cpu, &watchdog_allowed_mask)
> > + stop_one_cpu(cpu, softlockup_stop_fn, NULL);
> > +
> > + cpumask_clear(&watchdog_allowed_mask);
> > }
>
> Bugger, that one doesn't quite work.. watchdog_disable() ends up calling
> a sleeping function. I forgot to enable all the debug cruft when
> testing..
And probably there is another problem. Both watchdog_disable(cpu) and
watchdog_nmi_disable(cpu) assume that cpu == smp_processor_id(), this arg
is simply ignored.
but lockup_detector_offline_cpu(cpu) is called by cpuhp_invoke_callback(),
so in this case watchdog_disable(dying_cpu) is simply wrong.
May be we can do something like below? Then softlockup_stop_all() can simply do
for_each_cpu(cpu, &watchdog_allowed_mask)
watchdog_disable(cpu);
watchdog_nmi_disable() is __weak, but at first glance arch/sparc/kernel/nmi.c
does everything correctly.
Oleg.
--- x/kernel/watchdog.c
+++ x/kernel/watchdog.c
@@ -108,13 +108,13 @@ __setup("hardlockup_all_cpu_backtrace=",
*/
int __weak watchdog_nmi_enable(unsigned int cpu)
{
- hardlockup_detector_perf_enable();
+ hardlockup_detector_perf_enable(cpu);
return 0;
}
void __weak watchdog_nmi_disable(unsigned int cpu)
{
- hardlockup_detector_perf_disable();
+ hardlockup_detector_perf_disable(cpu);
}
/* Return 0, if a NMI watchdog is available. Error code otherwise */
@@ -479,7 +479,7 @@ static void watchdog_enable(unsigned int
static void watchdog_disable(unsigned int cpu)
{
- struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = per_cpu_ptr(&watchdog_hrtimer, cpu);
watchdog_set_prio(SCHED_NORMAL, 0);
/*
--- x/kernel/watchdog_hld.c
+++ x/kernel/watchdog_hld.c
@@ -162,9 +162,8 @@ static void watchdog_overflow_callback(s
return;
}
-static int hardlockup_detector_event_create(void)
+static int hardlockup_detector_event_create(int cpu)
{
- unsigned int cpu = smp_processor_id();
struct perf_event_attr *wd_attr;
struct perf_event *evt;
@@ -179,37 +178,37 @@ static int hardlockup_detector_event_cre
PTR_ERR(evt));
return PTR_ERR(evt);
}
- this_cpu_write(watchdog_ev, evt);
+ raw_cpu_write(watchdog_ev, evt);
return 0;
}
/**
* hardlockup_detector_perf_enable - Enable the local event
*/
-void hardlockup_detector_perf_enable(void)
+void hardlockup_detector_perf_enable(int cpu)
{
- if (hardlockup_detector_event_create())
+ if (hardlockup_detector_event_create(cpu))
return;
/* use original value for check */
if (!atomic_fetch_inc(&watchdog_cpus))
pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
- perf_event_enable(this_cpu_read(watchdog_ev));
+ perf_event_enable(raw_cpu_read(watchdog_ev));
}
/**
* hardlockup_detector_perf_disable - Disable the local event
*/
-void hardlockup_detector_perf_disable(void)
+void hardlockup_detector_perf_disable(int cpu)
{
- struct perf_event *event = this_cpu_read(watchdog_ev);
+ struct perf_event *event = per_cpu_read(watchdog_ev, cpu);
if (event) {
perf_event_disable(event);
- this_cpu_write(watchdog_ev, NULL);
- this_cpu_write(dead_event, event);
- cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
+ raw_cpu_write(watchdog_ev, cpu, NULL);
+ raw_cpu_write(dead_event, cpu, event);
+ cpumask_set_cpu(cpu, &dead_events_mask);
atomic_dec(&watchdog_cpus);
}
}
Powered by blists - more mailing lists