linux-kernel - Re: [printk] fbc14616f4: BUG:kernel_reboot-without-warning_in_test

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Wed, 12 Apr 2017 01:19:53 +0900
From:   Sergey Senozhatsky <sergey.senozhatsky@...il.com>
To:     Petr Mladek <pmladek@...e.com>, Pavel Machek <pavel@....cz>
Cc:     Jan Kara <jack@...e.cz>,
        "Eric W. Biederman" <ebiederm@...ssion.com>,
        Ye Xiaolong <xiaolong.ye@...el.com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Petr Mladek <pmladek@...e.com>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Peter Zijlstra <peterz@...radead.org>,
        "Rafael J . Wysocki" <rjw@...ysocki.net>,
        Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
        Jiri Slaby <jslaby@...e.com>, Len Brown <len.brown@...el.com>,
        linux-kernel@...r.kernel.org,
        Sergey Senozhatsky <sergey.senozhatsky.work@...il.com>,
        Sergey Senozhatsky <sergey.senozhatsky@...il.com>
Subject: Re: [printk]  fbc14616f4:
 BUG:kernel_reboot-without-warning_in_test_stage

On (04/11/17 10:46), Sergey Senozhatsky wrote:
> On (04/10/17 20:48), Pavel Machek wrote:
> [..]
> > > but, once again, I see your point.
> > 
> > Good. Does that mean that the next version of patches will work ok in
> > that case?
> 
> yes.

ok... so I'm looking at something like below right now.
not really tested yet.

I put some comments into the code.

it does offloading after X printed lines by the same process.
if we reschedule, then the counter resets. which is probably OK,
we don't really want any process, except for printk_kthread, to
stay in console_unlock() forever. "number of lines printed" is
probably easier to understand (easily converted to the number of
pageup/pagedown you need to press, terminal buffer history size,
etc.) than seconds we spent on printing (which doesn't even
correspond to messages' timestamps in general case).

when the limit of "number of lines printed" is 0, then no
offloading takes place.

it also has some simple mechanism to handle cases when
we try to wake_up() printk_kthread, but it never becomes alive.
it's a bit simple minded, probably.

wake_up is done from printk_safe context, so warnings/printks
from there should do no harm (in fact, we even do pr_crit()
error reporting, when we enforce printk_emergency mode).

I'll do more tests tomorrow, and will take a closer look.
this code is basically just compiles, boots and passes some
trivial tests. quite possible I've missed something important.

once verified, then the next question will be -- do we even
need printk_emergency_begin/end or we can leave without it.

// given that printk_emergency enforcement works properly


but, once again, the code might be stupid and wrong.
and I need some sleep.

---
 include/linux/console.h |   3 +
 kernel/printk/printk.c  | 200 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 198 insertions(+), 5 deletions(-)

diff --git a/include/linux/console.h b/include/linux/console.h
index 5949d1855589..f1a86944072e 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -187,6 +187,9 @@ extern bool console_suspend_enabled;
 extern void suspend_console(void);
 extern void resume_console(void);
 
+extern void printk_emergency_begin(void);
+extern void printk_emergency_end(void);
+
 int mda_console_init(void);
 void prom_con_init(void);
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 6cf756dbee39..c0075e8b3a09 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -48,6 +48,7 @@
 #include <linux/sched/clock.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task_stack.h>
+#include <linux/kthread.h>
 
 #include <linux/uaccess.h>
 #include <asm/sections.h>
@@ -402,7 +403,8 @@ DEFINE_RAW_SPINLOCK(logbuf_lock);
 	} while (0)
 
 /*
- * Delayed printk version, for scheduler-internal messages:
+ * Used both for deferred printk version (scheduler-internal messages)
+ * and printk_kthread control.
  */
 #define PRINTK_PENDING_WAKEUP	0x01
 #define PRINTK_PENDING_OUTPUT	0x02
@@ -445,6 +447,48 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
 static char *log_buf = __log_buf;
 static u32 log_buf_len = __LOG_BUF_LEN;
 
+static struct task_struct *printk_kthread __read_mostly;
+/*
+ * We can't call into the scheduler (wake_up() printk kthread) during
+ * suspend/kexec/etc. This temporarily switches printk to old behaviour.
+ */
+static atomic_t printk_emergency __read_mostly;
+/*
+ * Disable printk_kthread permanently. Unlike `oops_in_progress'
+ * it doesn't go back to 0.
+ */
+static bool printk_enforce_emergency __read_mostly;
+
+static unsigned int atomic_print_limit = 10000;
+
+module_param_named(atomic_print_limit, atomic_print_limit, uint, 0644);
+MODULE_PARM_DESC(atomic_print_limit,
+		 "max lines to print before offloading to printk kthread");
+
+static inline bool printk_kthread_enabled(void)
+{
+	return !printk_enforce_emergency &&
+		printk_kthread && atomic_read(&printk_emergency) == 0;
+}
+
+/*
+ * This disables printing offloading and instead attempts
+ * to do the usual console_trylock()->console_unlock().
+ *
+ * Note, this does not stop the printk_kthread if it's already
+ * printing logbuf messages.
+ */
+void printk_emergency_begin(void)
+{
+	atomic_inc(&printk_emergency);
+}
+
+/* This re-enables printk_kthread offloading. */
+void printk_emergency_end(void)
+{
+	atomic_dec(&printk_emergency);
+}
+
 /* Return log buffer address */
 char *log_buf_addr_get(void)
 {
@@ -1765,6 +1809,15 @@ asmlinkage int vprintk_emit(int facility, int level,
 
 	printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len);
 
+	/*
+	 * Emergency level indicates that the system is unstable and, thus,
+	 * we better stop relying on wake_up(printk_kthread) and try to do
+	 * a direct printing.
+	 */
+	if (level == LOGLEVEL_EMERG)
+		printk_enforce_emergency = true;
+
+	set_bit(PRINTK_PENDING_OUTPUT, &printk_pending);
 	logbuf_unlock_irqrestore(flags);
 
 	/* If called from the scheduler, we can not call up(). */
@@ -1882,6 +1935,9 @@ static size_t msg_print_text(const struct printk_log *msg,
 			     bool syslog, char *buf, size_t size) { return 0; }
 static bool suppress_message_printing(int level) { return false; }
 
+void printk_emergency_begin(void) {}
+void printk_emergency_end(void) {}
+
 #endif /* CONFIG_PRINTK */
 
 #ifdef CONFIG_EARLY_PRINTK
@@ -2141,6 +2197,89 @@ static inline int can_use_console(void)
 	return cpu_online(raw_smp_processor_id()) || have_callable_console();
 }
 
+/*
+ * Under heavy printing load/slow serial console/etc console_unlock() can
+ * stall CPUs, which can result in soft/hard-lockups, lost interrupts, RCU
+ * stalls, etc. Therefore we attempt to print the messages to console from
+ * a dedicated printk_kthread, which always runs in schedulable context.
+ *
+ * There are several possible scenarios:
+ *
+ * a) When we got a large number of pending messages to print.
+ * e.g.
+ *    vprintk_emit() or console_lock()
+ *     console_unlock()
+ *      <<massive dump>>
+ *
+ * b) When we printk() a large number of messages.
+ * e.g.
+ *   vprintk_emit()->console_unlock()   <<print 1 message>>
+ *   vprintk_emit()->console_unlock()   <<print 1 message>>
+ *   ...
+ *   vprintk_emit()->console_unlock()   <<print 1 message>>
+ *
+ * In all those cases we can be in atomic context, we need to offload
+ * printing at some point.
+ *
+ * This function must be called from 'printk_safe' context.
+ */
+static inline bool console_offload_printing(void)
+{
+	static struct task_struct *printing_task = NULL;
+	static unsigned long lines_printed = 0;
+
+	if (!atomic_print_limit || !printk_kthread_enabled())
+		return false;
+
+	/* We rescheduled - reset the counters. */
+	if (printing_task != current) {
+		lines_printed = 0;
+		printing_task = current;
+		return false;
+	}
+
+	if (current == printk_kthread)
+		return false;
+
+	/*
+	 * Don't reset the counter, let CPU overrun the limit.
+	 * The idea is that
+	 *
+	 *   a) woken up printk_kthread (if succeeded)
+	 * or
+	 *   b) concurrent printk from another CPU (if any)
+	 *
+	 * will change `printing_task' and reset the counter. This also
+	 * let us to introduce additional policies later, for instance,
+	 * if we can't wakeup printk_kthread for Y times, e.g.
+	 *
+	 *            lines_printed > 2 * atomic_print_limit
+	 *
+	 * then we can declare emergency and stop relying on printk_kthread.
+	 *
+	 * If neither a) nor b) happens - we continue printing from
+	 * current process. Which is bad and can be risky, but we can't
+	 * wake_up() printk_kthread, so things already don't look normal.
+	 */
+	lines_printed++;
+	if (lines_printed < atomic_print_limit)
+		return false;
+
+	/*
+	 * A trivial emergency enforcement.
+	 * Assumes that `atomic_print_limit' is large enough.
+	 */
+	if (lines_printed > 2 * (unsigned long)atomic_print_limit) {
+		printk_enforce_emergency = true;
+		pr_crit("Declaring printk emergency mode.\n");
+		return false;
+	}
+
+	/* Must be executed in 'printk_safe' context. */
+	wake_up_process(printk_kthread);
+	return true;
+}
+
 /**
  * console_unlock - unlock the console system
  *
@@ -2163,8 +2302,17 @@ void console_unlock(void)
 	unsigned long flags;
 	bool wake_klogd = false;
 	bool do_cond_resched, retry;
+	bool did_offload;
 
 	if (console_suspended) {
+		/*
+		 * Here and later, we need to clear the PENDING_OUTPUT bit
+		 * in order to avoid an infinite loop in printk_kthread
+		 * function when console_unlock() cannot flush messages
+		 * because we suspended consoles. Someone else will print
+		 * the messages from resume_console().
+		 */
+		clear_bit(PRINTK_PENDING_OUTPUT, &printk_pending);
 		up_console_sem();
 		return;
 	}
@@ -2186,6 +2334,7 @@ void console_unlock(void)
 	do_cond_resched = console_may_schedule;
 again:
 	console_may_schedule = 0;
+	did_offload = 0;
 
 	/*
 	 * We released the console_sem lock, so we need to recheck if
@@ -2193,6 +2342,7 @@ void console_unlock(void)
 	 * console.
 	 */
 	if (!can_use_console()) {
+		clear_bit(PRINTK_PENDING_OUTPUT, &printk_pending);
 		console_locked = 0;
 		up_console_sem();
 		return;
@@ -2221,7 +2371,7 @@ void console_unlock(void)
 			len = 0;
 		}
 skip:
-		if (console_seq == log_next_seq)
+		if (did_offload || console_seq == log_next_seq)
 			break;
 
 		msg = log_from_idx(console_idx);
@@ -2253,9 +2403,28 @@ void console_unlock(void)
 		stop_critical_timings();	/* don't trace print latency */
 		call_console_drivers(ext_text, ext_len, text, len);
 		start_critical_timings();
+
+		/*
+		 * Sometimes we may lock console_sem before printk_kthread.
+		 * In this case we will jump to `again' label (if there are
+		 * pending messages), print one more line from current
+		 * process, break out of printing loop (we don't reset the
+		 * counter of printed lines) and do up_console_sem() to
+		 * wakeup printk_kthread again.
+		 *
+		 * If printk_kthread never wakes up (which may indicate that
+		 * the system is unstable or something weird is going on),
+		 * then we will keep jumping to `again' label and printing
+		 * one message from the logbuf. This is a bit ugly, but at
+		 * least we will print out the logbuf.
+		 *
+		 * If such condition occurs, console_offload_printing() can
+		 * declare `printk_emergency' at some point.
+		 */
+		did_offload = console_offload_printing();
 		printk_safe_exit_irqrestore(flags);
 
-		if (do_cond_resched)
+		if (!did_offload && do_cond_resched)
 			cond_resched();
 	}
 	console_locked = 0;
@@ -2276,6 +2445,8 @@ void console_unlock(void)
 	 */
 	raw_spin_lock(&logbuf_lock);
 	retry = console_seq != log_next_seq;
+	if (!retry)
+		clear_bit(PRINTK_PENDING_OUTPUT, &printk_pending);
 	raw_spin_unlock(&logbuf_lock);
 	printk_safe_exit_irqrestore(flags);
 
@@ -2669,8 +2840,11 @@ late_initcall(printk_late_init);
 #if defined CONFIG_PRINTK
 static void wake_up_klogd_work_func(struct irq_work *irq_work)
 {
-	if (test_and_clear_bit(PRINTK_PENDING_OUTPUT, &printk_pending)) {
-		/* If trylock fails, someone else is doing the printing */
+	if (test_bit(PRINTK_PENDING_OUTPUT, &printk_pending)) {
+		/*
+		 * If trylock fails, someone else is doing the printing.
+		 * PRINTK_PENDING_OUTPUT bit is cleared by console_unlock().
+		 */
 		if (console_trylock())
 			console_unlock();
 	}
@@ -2684,6 +2858,22 @@ static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
 	.flags = IRQ_WORK_LAZY,
 };
 
+static int printk_kthread_func(void *data)
+{
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!test_bit(PRINTK_PENDING_OUTPUT, &printk_pending))
+			schedule();
+
+		__set_current_state(TASK_RUNNING);
+
+		console_lock();
+		console_unlock();
+	}
+
+	return 0;
+}
+
 void wake_up_klogd(void)
 {
 	preempt_disable();
-- 
2.12.2