[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20150612153505.GJ9409@pathway.suse.cz>
Date: Fri, 12 Jun 2015 17:35:05 +0200
From: Petr Mladek <pmladek@...e.cz>
To: Peter Zijlstra <peterz@...radead.org>
Cc: Steven Rostedt <rostedt@...dmis.org>, linux-kernel@...r.kernel.org,
jkosina@...e.cz, paulmck@...ux.vnet.ibm.com,
Ingo Molnar <mingo@...nel.org>,
Thomas Gleixner <tglx@...utronix.de>
Subject: Re: [RFC][PATCH] printk: Fixup the nmi printk mess
On Fri 2015-06-12 16:28:05, Petr Mladek wrote:
> On Fri 2015-06-12 14:00:37, Peter Zijlstra wrote:
> > On Thu, Jun 11, 2015 at 04:55:47PM +0200, Petr Mladek wrote:
> > > > + * 'Consume' this chunk, avoids concurrent callers printing the same
> > > > + * stuff.
> > > > + */
> > > > + if (atomic_cmpxchg(&s->read, i, len) != i)
> > > > + goto again;
> > >
> > > I think that this is racy:
> >
> > Indeed. I'll think a bit on that.
> >
> > > I think that ordering CPUs is not worth it. I would go back to the
> > > first solution, add the @lock there, and double check races with
> > > seq_buf().
> >
> > You mean, provide printk_nmi_flush() but completely screw concurrency?
> > And basically reserve it for a last ditch effort for getting crap out?
>
> This looks much easier to me. We could call this only when the system
> is going down.
>
> Another possibility would be to ignore this race because it is rather
> theoretical and it better to print some mess than nothing.
I have got another idea. We already use a lock to synchronize
readers. It might be used to avoid the race and keep the global
flush().
I played with it, see below. It is based on 2nd Peter's patch
where I modified only struct nmi_seq_buf, __printk_nmi_flush(),
and vprintk_nmi().
It forces the reader to flush everything or nothing. It is less
effective than the previous code but much more simple.
It seems to work but it is just a proof-of-concept. Also I think that
it would be possible to use the normal seq_buf in the end.
---
arch/Kconfig | 4 ++
arch/arm/Kconfig | 1 +
arch/avr32/Kconfig | 1 +
arch/blackfin/Kconfig | 1 +
arch/cris/Kconfig | 1 +
arch/powerpc/Kconfig | 1 +
arch/s390/Kconfig | 1 +
arch/sh/Kconfig | 1 +
arch/sparc/Kconfig | 1 +
arch/tile/Kconfig | 1 +
arch/x86/Kconfig | 1 +
arch/x86/kernel/apic/hw_nmi.c | 89 ++---------------------
include/linux/hardirq.h | 2 +
include/linux/percpu.h | 3 -
include/linux/printk.h | 12 +++-
init/Kconfig | 5 ++
init/main.c | 1 +
kernel/printk/printk.c | 159 +++++++++++++++++++++++++++++++++++++++---
18 files changed, 186 insertions(+), 99 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index a65eafb24997..ad8db3979e21 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -179,7 +179,11 @@ config HAVE_OPTPROBES
config HAVE_KPROBES_ON_FTRACE
bool
+config HAVE_NMI
+ bool
+
config HAVE_NMI_WATCHDOG
+ depends on HAVE_NMI
bool
#
# An arch should select this if it provides all these things:
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 45df48ba0b12..a26e83699df9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -61,6 +61,7 @@ config ARM
select HAVE_KRETPROBES if (HAVE_KPROBES)
select HAVE_MEMBLOCK
select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
+ select HAVE_NMI if (!CPU_V7)
select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
select HAVE_OPTPROBES if !THUMB2_KERNEL
select HAVE_PERF_EVENTS
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b6878eb64884..9dc3e2b1180b 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -17,6 +17,7 @@ config AVR32
select GENERIC_CLOCKEVENTS
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
+ select HAVE_NMI
help
AVR32 is a high-performance 32-bit RISC microprocessor core,
designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index af76634f8d98..47c0a55acafd 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -40,6 +40,7 @@ config BLACKFIN
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
+ select HAVE_NMI
config GENERIC_CSUM
def_bool y
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 0314e325a669..2e053f709c4b 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -58,6 +58,7 @@ config CRIS
select CLKSRC_MMIO if ETRAX_ARCH_V32
select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
+ select HAVE_NMI
config HZ
int
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 190cc48abc0c..079f4fec9586 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -153,6 +153,7 @@ config PPC
select NO_BOOTMEM
select HAVE_GENERIC_RCU_GUP
select HAVE_PERF_EVENTS_NMI if PPC64
+ select HAVE_NMI if PERF_EVENTS
config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b06dc3839268..1b84036437c0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -152,6 +152,7 @@ config S390
select TTY
select VIRT_CPU_ACCOUNTING
select VIRT_TO_BUS
+ select HAVE_NMI
config SCHED_OMIT_FRAME_POINTER
def_bool y
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..7803b6863c0e 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
select OLD_SIGSUSPEND
select OLD_SIGACTION
select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_NMI
help
The SuperH is a RISC processor targeted for use in embedded systems
and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e49502acbab4..198abcb6e510 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -80,6 +80,7 @@ config SPARC64
select NO_BOOTMEM
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
+ select HAVE_NMI
config ARCH_DEFCONFIG
string
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index a07e31b50d3f..1503ab5882cd 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -28,6 +28,7 @@ config TILE
select HAVE_DEBUG_STACKOVERFLOW
select ARCH_WANT_FRAME_POINTERS
select HAVE_CONTEXT_TRACKING
+ select HAVE_NMI if USE_PMC
# FIXME: investigate whether we need/want these options.
# select HAVE_IOREMAP_PROT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 226d5696e1d1..1d1a59e7eaac 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -38,6 +38,7 @@ config X86
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_IOREMAP_PROT
+ select HAVE_NMI
select HAVE_KPROBES
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6873ab925d00..6e633e020f55 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,7 +18,6 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/delay.h>
-#include <linux/seq_buf.h>
#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(int watchdog_thresh)
@@ -30,35 +29,14 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE 4096
-
-struct nmi_seq_buf {
- unsigned char buffer[NMI_BUF_SIZE];
- struct seq_buf seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
- const char *buf = s->buffer + start;
-
- printk("%.*s", (end - start) + 1, buf);
-}
-
void arch_trigger_all_cpu_backtrace(bool include_self)
{
- struct nmi_seq_buf *s;
- int len;
- int cpu;
- int i;
int this_cpu = get_cpu();
+ int i;
if (test_and_set_bit(0, &backtrace_flag)) {
/*
@@ -73,16 +51,6 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
if (!include_self)
cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
- cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
- /*
- * Set up per_cpu seq_buf buffers that the NMIs running on the other
- * CPUs will write to.
- */
- for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
- s = &per_cpu(nmi_print_seq, cpu);
- seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
- }
-
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
pr_info("sending NMI to %s CPUs:\n",
(include_self ? "all" : "other"));
@@ -98,57 +66,15 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
}
/*
- * Now that all the NMIs have triggered, we can dump out their
- * back traces safely to the console.
+ * Force flush any remote buffers that might be stuck in IRQ context
+ * and therefore could not run their irq_work.
*/
- for_each_cpu(cpu, &printtrace_mask) {
- int last_i = 0;
-
- s = &per_cpu(nmi_print_seq, cpu);
- len = seq_buf_used(&s->seq);
- if (!len)
- continue;
-
- /* Print line by line. */
- for (i = 0; i < len; i++) {
- if (s->buffer[i] == '\n') {
- print_seq_line(s, last_i, i);
- last_i = i + 1;
- }
- }
- /* Check if there was a partial line. */
- if (last_i < len) {
- print_seq_line(s, last_i, len - 1);
- pr_cont("\n");
- }
- }
+ printk_nmi_flush();
- clear_bit(0, &backtrace_flag);
- smp_mb__after_atomic();
+ clear_bit_unlock(0, &backtrace_flag);
put_cpu();
}
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- unsigned int len = seq_buf_used(&s->seq);
-
- seq_buf_vprintf(&s->seq, fmt, args);
- return seq_buf_used(&s->seq) - len;
-}
-
static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
@@ -157,13 +83,8 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- printk_func_t printk_func_save = this_cpu_read(printk_func);
-
- /* Replace printk to write into the NMI seq */
- this_cpu_write(printk_func, nmi_vprintk);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
show_regs(regs);
- this_cpu_write(printk_func, printk_func_save);
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return NMI_HANDLED;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index f4af03404b97..8c9040934ea0 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -67,10 +67,12 @@ extern void irq_exit(void);
preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
rcu_nmi_enter(); \
trace_hardirq_enter(); \
+ printk_nmi_enter(); \
} while (0)
#define nmi_exit() \
do { \
+ printk_nmi_exit(); \
trace_hardirq_exit(); \
rcu_nmi_exit(); \
BUG_ON(!in_nmi()); \
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index caebf2a758dc..04c68b9f56f8 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -135,7 +135,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
(typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
__alignof__(type))
-/* To avoid include hell, as printk can not declare this, we declare it here */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-
#endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9b30871c9149..05b7f6ed1663 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -120,7 +120,17 @@ static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif
-typedef int(*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK_NMI
+extern void printk_init(void);
+extern void printk_nmi_enter(void);
+extern void printk_nmi_exit(void);
+extern void printk_nmi_flush(void);
+#else
+static inline void printk_init(void) { }
+static inline void printk_nmi_enter(void) { }
+static inline void printk_nmi_exit(void) { }
+static inline void printk_nmi_flush(void) { }
+#endif /* PRINTK_NMI */
#ifdef CONFIG_PRINTK
asmlinkage __printf(5, 0)
diff --git a/init/Kconfig b/init/Kconfig
index dc24dec60232..acab5c67d2a9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1445,6 +1445,11 @@ config PRINTK
very difficult to diagnose system problems, saying N here is
strongly discouraged.
+config PRINTK_NMI
+ def_bool y
+ depends on PRINTK
+ depends on HAVE_NMI
+
config BUG
bool "BUG() support" if EXPERT
default y
diff --git a/init/main.c b/init/main.c
index 2115055faeac..4a28accaaa98 100644
--- a/init/main.c
+++ b/init/main.c
@@ -586,6 +586,7 @@ asmlinkage __visible void __init start_kernel(void)
timekeeping_init();
time_init();
sched_clock_postinit();
+ printk_init();
perf_event_init();
profile_init();
call_function_init();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c099b082cd02..640460d881e5 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1821,13 +1821,160 @@ int vprintk_default(const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(vprintk_default);
+#ifdef CONFIG_PRINTK_NMI
+
+typedef int(*printk_func_t)(const char *fmt, va_list args);
/*
* This allows printk to be diverted to another function per cpu.
+ *
* This is useful for calling printk functions from within NMI
* without worrying about race conditions that can lock up the
* box.
*/
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
+struct nmi_seq_buf {
+ atomic_t tail;
+ struct irq_work work;
+ unsigned char buffer[PAGE_SIZE -
+ 3*sizeof(atomic_t) -
+ sizeof(struct irq_work)];
+};
+
+/* Safe printing in NMI context */
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+ const char *buf = s->buffer + start;
+
+ printk("%.*s", (end - start) + 1, buf);
+}
+
+static void __printk_nmi_flush(struct irq_work *work)
+{
+ static raw_spinlock_t read_lock = __RAW_SPIN_LOCK_INITIALIZER(read_lock);
+ struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+ int len, size, i, last_i;
+
+ /*
+ * Serialize against the actual printing of the chunks
+ * to avoid interleaving lines and interleaving readers.
+ */
+ raw_spin_lock(&read_lock);
+
+ i = 0;
+more:
+ len = atomic_read(&s->tail);
+
+ /*
+ * we have the read lock; @tail must only increase if
+ * we printed something
+ */
+ WARN_ON(len && i >= len);
+
+ if (!len)
+ goto out; /* someone else already flushed the buffer */
+
+ /* make sure that data has ben written up to the @tail */
+ smp_rmb();
+
+ size = min_t(int, len, sizeof(s->buffer));
+ last_i = i;
+
+ /* Print line by line. */
+ for (; i < size; i++) {
+ if (s->buffer[i] == '\n') {
+ print_seq_line(s, last_i, i);
+ last_i = i + 1;
+ }
+ }
+ /* Check if there was a partial line. */
+ if (last_i < size) {
+ print_seq_line(s, last_i, len - 1);
+ pr_cont("\n");
+ }
+
+ /*
+ * Check nothing got added and truncate the buffer.
+ */
+ if (atomic_cmpxchg(&s->tail, len, 0) != len)
+ goto more;
+
+out:
+ raw_spin_unlock(&read_lock);
+}
+
+/*
+ * flush all per-cpu nmi buffers
+ */
+void printk_nmi_flush(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+void printk_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+ init_irq_work(&s->work, __printk_nmi_flush);
+ }
+}
+
+/*
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+ struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+ int add = 0, len;
+
+ irq_work_queue(&s->work);
+
+again:
+ len = atomic_read(&s->tail);
+
+ if (len < sizeof(s->buffer)) {
+ add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len,
+ fmt, args);
+ /* write the data before updating the used buffer size */
+ smp_wmb();
+ /* once again if someone flushed the buffer in the meantime */
+ if (atomic_cmpxchg(&s->tail, len, len + add) != len)
+ goto again;
+
+ }
+ return add;
+}
+
+void printk_nmi_enter(void)
+{
+ this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+ this_cpu_write(printk_func, vprintk_default);
+}
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+ return this_cpu_read(printk_func)(fmt, args);
+}
+
+#else
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+ return vprintk_default(fmt, args);
+}
+
+#endif /* PRINTK_NMI */
/**
* printk - print a kernel message
@@ -1852,21 +1999,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
- printk_func_t vprintk_func;
va_list args;
int r;
va_start(args, fmt);
-
- /*
- * If a caller overrides the per_cpu printk_func, then it needs
- * to disable preemption when calling printk(). Otherwise
- * the printk_func should be set to the default. No need to
- * disable preemption here.
- */
- vprintk_func = this_cpu_read(printk_func);
r = vprintk_func(fmt, args);
-
va_end(args);
return r;
--
1.8.5.6
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists