linux-kernel - [PATCH 2/2] sched/debug: fix deadlock when enabling sched events

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <50a0f7f2a5f8d69e0440c3430f06be7ccbe59903.1465322027.git.jpoimboe@redhat.com>
Date:	Tue,  7 Jun 2016 14:43:17 -0500
From:	Josh Poimboeuf <jpoimboe@...hat.com>
To:	Ingo Molnar <mingo@...hat.com>,
	Peter Zijlstra <peterz@...radead.org>
Cc:	linux-kernel@...r.kernel.org,
	Mel Gorman <mgorman@...hsingularity.net>,
	Matt Fleming <matt@...eblueprint.co.uk>,
	Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
Subject: [PATCH 2/2] sched/debug: fix deadlock when enabling sched events

When enabling sched trace events via:

  echo 1 > /sys/kernel/debug/tracing/events/sched/enable

I see a hang, with the following BUG in the printk buffer:

  BUG: spinlock recursion on CPU#1, swapper/1/0
   lock: 0xffff88007d5d8c00, .magic: dead4ead, .owner: swapper/1/0, .owner_cpu: 1
  CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc2+ #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014
   0000000000000086 3a8e170cf69ad23f ffff88007d403898 ffffffff8143d663
   ffff88007c535480 ffff88007d5d8c00 ffff88007d4038b8 ffffffff81115948
   ffff88007d5d8c00 ffff88007d5d8c00 ffff88007d4038e8 ffffffff81115aea
  Call Trace:
   <IRQ>  [<ffffffff8143d663>] dump_stack+0x85/0xc2
   [<ffffffff81115948>] spin_dump+0x78/0xc0
   [<ffffffff81115aea>] do_raw_spin_lock+0x11a/0x150
   [<ffffffff81891471>] _raw_spin_lock+0x61/0x80
   [<ffffffff810e5466>] ? try_to_wake_up+0x256/0x4e0
   [<ffffffff810e5466>] try_to_wake_up+0x256/0x4e0
   [<ffffffff81891a0a>] ? _raw_spin_unlock_irqrestore+0x4a/0x80
   [<ffffffff810e5705>] wake_up_process+0x15/0x20
   [<ffffffff810cebb4>] insert_work+0x84/0xc0
   [<ffffffff810ced7f>] __queue_work+0x18f/0x660
   [<ffffffff810cf9a6>] queue_work_on+0x46/0x90
   [<ffffffffa00cd95b>] drm_fb_helper_dirty.isra.11+0xcb/0xe0 [drm_kms_helper]
   [<ffffffffa00cdac0>] drm_fb_helper_sys_imageblit+0x30/0x40 [drm_kms_helper]
   [<ffffffff814babcd>] soft_cursor+0x1ad/0x230
   [<ffffffff814ba379>] bit_cursor+0x649/0x680
   [<ffffffff814b9d30>] ? update_attr.isra.2+0x90/0x90
   [<ffffffff814b5e6a>] fbcon_cursor+0x14a/0x1c0
   [<ffffffff81555ef8>] hide_cursor+0x28/0x90
   [<ffffffff81558b6f>] vt_console_print+0x3bf/0x3f0
   [<ffffffff81122c63>] call_console_drivers.constprop.24+0x183/0x200
   [<ffffffff811241f4>] console_unlock+0x3d4/0x610
   [<ffffffff811247f5>] vprintk_emit+0x3c5/0x610
   [<ffffffff81124bc9>] vprintk_default+0x29/0x40
   [<ffffffff811e965b>] printk+0x57/0x73
   [<ffffffff810f7a9e>] enqueue_entity+0xc2e/0xc70
   [<ffffffff810f7b39>] enqueue_task_fair+0x59/0xab0
   [<ffffffff8106dcd9>] ? kvm_sched_clock_read+0x9/0x20
   [<ffffffff8103fb39>] ? sched_clock+0x9/0x10
   [<ffffffff810e3fcc>] activate_task+0x5c/0xa0
   [<ffffffff810e4514>] ttwu_do_activate+0x54/0xb0
   [<ffffffff810e5cea>] sched_ttwu_pending+0x7a/0xb0
   [<ffffffff810e5e51>] scheduler_ipi+0x61/0x170
   [<ffffffff81059e7f>] smp_trace_reschedule_interrupt+0x4f/0x2a0
   [<ffffffff81893ba6>] trace_reschedule_interrupt+0x96/0xa0
   <EOI>  [<ffffffff8106e0d6>] ? native_safe_halt+0x6/0x10
   [<ffffffff8110fb1d>] ? trace_hardirqs_on+0xd/0x10
   [<ffffffff81040ac0>] default_idle+0x20/0x1a0
   [<ffffffff8104147f>] arch_cpu_idle+0xf/0x20
   [<ffffffff81102f8f>] default_idle_call+0x2f/0x50
   [<ffffffff8110332e>] cpu_startup_entry+0x37e/0x450
   [<ffffffff8105af70>] start_secondary+0x160/0x1a0

The bug is caused by a deadlock where the task is trying to grab the rq
lock twice because printk()'s aren't safe in sched code.

The offending printk() is the pr_warn_once() in
check_schedstat_required() which warns the user that their
'trace_sched_stat_*' tracepoints aren't going to work because schedstats
is disabled:

	/* Force schedstat enabled if a dependent tracepoint is active */
	if (trace_sched_stat_wait_enabled()    ||
			trace_sched_stat_sleep_enabled()   ||
			trace_sched_stat_iowait_enabled()  ||
			trace_sched_stat_blocked_enabled() ||
			trace_sched_stat_runtime_enabled())  {
		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
			     "stat_blocked and stat_runtime require the "
			     "kernel parameter schedstats=enabled or "
			     "kernel.sched_schedstats=1\n");
	}

In addition to the deadlock, I think this code has other issues:

1. Instead of just warning and allowing the tracepoints to be broken,
   I'd argue that it would be better to make them work by forcing
   schedstats enabled and printing a warning about that, which is what's
   already being done in other similar cases (latencytop and profiling).
   Otherwise things like "perf sched record" won't have the intended
   effect.  In fact the comment in the above code snippet seems to agree
   with me, so maybe that was the original intent.

2. It's called in the scheduler hot path from enqueue_entity().

So change the warning to a call to force_schedstat_enabled(), and do it
only when the tracepoint gets enabled.

Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default")
Cc: stable@...r.kernel.org
Signed-off-by: Josh Poimboeuf <jpoimboe@...hat.com>
---
 include/trace/events/sched.h | 30 +++++++++++++++---------------
 kernel/sched/fair.c          | 29 +++++++----------------------
 2 files changed, 22 insertions(+), 37 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9b90c57..ce12e93 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -341,37 +341,36 @@ DECLARE_EVENT_CLASS(sched_stat_template,
 			(unsigned long long)__entry->delay)
 );
 
+extern void trace_sched_stat_register(void);
+
+#define DEFINE_SCHED_STAT_EVENT(name)					\
+	DEFINE_EVENT_FN(sched_stat_template, name,			\
+			TP_PROTO(struct task_struct *tsk, u64 delay),	\
+			TP_ARGS(tsk, delay),				\
+			trace_sched_stat_register, NULL)
 
 /*
  * Tracepoint for accounting wait time (time the task is runnable
  * but not actually running due to scheduler contention).
  */
-DEFINE_EVENT(sched_stat_template, sched_stat_wait,
-	     TP_PROTO(struct task_struct *tsk, u64 delay),
-	     TP_ARGS(tsk, delay));
+DEFINE_SCHED_STAT_EVENT(sched_stat_wait);
 
 /*
  * Tracepoint for accounting sleep time (time the task is not runnable,
  * including iowait, see below).
  */
-DEFINE_EVENT(sched_stat_template, sched_stat_sleep,
-	     TP_PROTO(struct task_struct *tsk, u64 delay),
-	     TP_ARGS(tsk, delay));
+DEFINE_SCHED_STAT_EVENT(sched_stat_sleep);
 
 /*
  * Tracepoint for accounting iowait time (time the task is not runnable
  * due to waiting on IO to complete).
  */
-DEFINE_EVENT(sched_stat_template, sched_stat_iowait,
-	     TP_PROTO(struct task_struct *tsk, u64 delay),
-	     TP_ARGS(tsk, delay));
+DEFINE_SCHED_STAT_EVENT(sched_stat_iowait);
 
 /*
  * Tracepoint for accounting blocked time (time the task is in uninterruptible).
  */
-DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
-	     TP_PROTO(struct task_struct *tsk, u64 delay),
-	     TP_ARGS(tsk, delay));
+DEFINE_SCHED_STAT_EVENT(sched_stat_blocked);
 
 /*
  * Tracepoint for accounting runtime (time the task is executing
@@ -403,9 +402,10 @@ DECLARE_EVENT_CLASS(sched_stat_runtime,
 			(unsigned long long)__entry->vruntime)
 );
 
-DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
-	     TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
-	     TP_ARGS(tsk, runtime, vruntime));
+DEFINE_EVENT_FN(sched_stat_runtime, sched_stat_runtime,
+		TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
+		TP_ARGS(tsk, runtime, vruntime),
+		trace_sched_stat_register, NULL);
 
 /*
  * Tracepoint for showing priority inheritance modifying a tasks
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c6dd8ba..b0cbcc0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -789,6 +789,13 @@ static void update_curr_fair(struct rq *rq)
 	update_curr(cfs_rq_of(&rq->curr->se));
 }
 
+void trace_sched_stat_register(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+	force_schedstat_enabled();
+#endif
+}
+
 #ifdef CONFIG_SCHEDSTATS
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -3208,27 +3215,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 
-static inline void check_schedstat_required(void)
-{
-#ifdef CONFIG_SCHEDSTATS
-	if (schedstat_enabled())
-		return;
-
-	/* Force schedstat enabled if a dependent tracepoint is active */
-	if (trace_sched_stat_wait_enabled()    ||
-			trace_sched_stat_sleep_enabled()   ||
-			trace_sched_stat_iowait_enabled()  ||
-			trace_sched_stat_blocked_enabled() ||
-			trace_sched_stat_runtime_enabled())  {
-		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
-			     "stat_blocked and stat_runtime require the "
-			     "kernel parameter schedstats=enabled or "
-			     "kernel.sched_schedstats=1\n");
-	}
-#endif
-}
-
-
 /*
  * MIGRATION
  *
@@ -3293,7 +3279,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 			enqueue_sleeper(cfs_rq, se);
 	}
 
-	check_schedstat_required();
 	if (schedstat_enabled()) {
 		update_stats_enqueue(cfs_rq, se);
 		check_spread(cfs_rq, se);
-- 
2.4.11