Date:	Mon, 14 Mar 2016 12:20:57 +0100
From:	Peter Zijlstra <peterz@...radead.org>
To:	Kazuki Yamaguchi <k@....jp>
Cc:	Tejun Heo <tj@...nel.org>, Niklas Cassel <niklas.cassel@...s.com>,
	linux-kernel@...r.kernel.org
Subject: Re: [BUG] sched: leaf_cfs_rq_list use after free

On Sat, Mar 12, 2016 at 06:42:57PM +0900, Kazuki Yamaguchi wrote:

> 2e91fa7 cgroup: keep zombies associated with their original cgroups

So the below hackery yields:

[  192.814857] ------------[ cut here ]------------
[  192.820025] WARNING: CPU: 38 PID: 3539 at ../kernel/sched/fair.c:288 enqueue_entity+0x90d/0xa10()
[  192.829930] Modules linked in:
[  192.833346] CPU: 38 PID: 3539 Comm: test Not tainted 4.5.0-rc7-01136-g89456cf-dirty #346
[  192.842377] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
[  192.853832]  0000000000000000 ffff880423d87b08 ffffffff81684a55 0000000000000000
[  192.862136]  ffffffff81f36e63 ffff880423d87b40 ffffffff810d7366 ffff880827432c00
[  192.870437]  ffff880827505b80 000000000000024a 0000000000000001 0000000000000000
[  192.878744] Call Trace:
[  192.881480]  [<ffffffff81684a55>] dump_stack+0x67/0x92
[  192.887224]  [<ffffffff810d7366>] warn_slowpath_common+0x86/0xc0
[  192.893930]  [<ffffffff810d745a>] warn_slowpath_null+0x1a/0x20
[  192.900431]  [<ffffffff8111bd6d>] enqueue_entity+0x90d/0xa10
[  192.906751]  [<ffffffff811168cd>] ? select_task_rq_fair+0x48d/0x790
[  192.913748]  [<ffffffff8111bec9>] enqueue_task_fair+0x59/0x8c0
[  192.920254]  [<ffffffff8112d83d>] ? __lock_is_held+0x4d/0x70
[  192.926572]  [<ffffffff8112d83d>] ? __lock_is_held+0x4d/0x70
[  192.932895]  [<ffffffff811081bc>] activate_task+0x5c/0xb0
[  192.938923]  [<ffffffff8110874e>] ttwu_do_activate.constprop.104+0x3e/0x90
[  192.946601]  [<ffffffff81109669>] try_to_wake_up+0x1f9/0x620
[  192.952919]  [<ffffffff81109b32>] default_wake_function+0x12/0x20
[  192.959717]  [<ffffffff810da021>] child_wait_callback+0x51/0x60
[  192.966326]  [<ffffffff81125c02>] __wake_up_common+0x52/0x90
[  192.972634]  [<ffffffff811261f5>] __wake_up_sync_key+0x45/0x60
[  192.979146]  [<ffffffff810dccd6>] __wake_up_parent+0x26/0x30
[  192.985468]  [<ffffffff810ea09b>] do_notify_parent+0x30b/0x550
[  192.991980]  [<ffffffff810e9edd>] ? do_notify_parent+0x14d/0x550
[  192.998684]  [<ffffffff810e476a>] ? __ptrace_unlink+0xba/0x110
[  193.005196]  [<ffffffff810e3a18>] __ptrace_detach.part.12+0x88/0xd0
[  193.012183]  [<ffffffff810e4897>] exit_ptrace+0x87/0xd0
[  193.018015]  [<ffffffff810dc9ab>] do_exit+0xabb/0xca0
[  193.023663]  [<ffffffff810dcc1e>] do_group_exit+0x4e/0xc0
[  193.029680]  [<ffffffff810dcca4>] SyS_exit_group+0x14/0x20
[  193.035823]  [<ffffffff81b1d965>] entry_SYSCALL_64_fastpath+0x18/0xa8
[  193.043013] ---[ end trace 8c92cd8599d0fd71 ]---


Which shows the patch in question is indeed full of fail. The
additional hackery below (the new cpu_cgroup_exit) does indeed fix the fail.

But given that this is true for cpu, it is also very much true for perf.

So I would suggest TJ revert that patch and queue the revert for stable.

It is clearly broken, because cgroup_exit() is called from preemptible
context, so _obviously_ we can (and clearly will) schedule after that,
which is somewhat hard if the cgroup we supposedly belong to has been
torn to shreds in the meantime.
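
As a userspace analogue (purely illustrative, not the kernel code), the
ordering boils down to the below; build with -fsanitize=address and the
write after free() gets reported, much like the WARN above:

/* Userspace analogue of the ordering bug; illustrative only. */
#include <stdlib.h>

struct group {
	int on_list;			/* stands in for cfs_rq->on_list */
};

int main(void)
{
	struct group *tg = malloc(sizeof(*tg));
	tg->on_list = 0;

	/*
	 * cgroup_exit(): the task drops its association and the group
	 * can be torn down...
	 */
	free(tg);

	/*
	 * ...but the task is still runnable and can be woken, at which
	 * point the enqueue pokes the group it still points at.
	 */
	tg->on_list = 1;		/* use after free */

	return 0;
}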

---
 kernel/sched/core.c | 17 +++++++++++++++++
 kernel/sched/fair.c | 13 +++++++++----
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1c82ded..3892a48 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7640,6 +7640,9 @@ void sched_move_task(struct task_struct *tsk)
 	tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
 			  struct task_group, css);
 	tg = autogroup_task_group(tsk, tg);
+	if (tsk->flags & PF_EXITING) /* we're dying, tg could be about to vanish */
+		tg = &root_task_group;
+
 	tsk->sched_task_group = tg;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8112,6 +8115,19 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
 		sched_move_task(task);
 }
 
+static void cpu_cgroup_exit(struct task_struct *task)
+{
+	/*
+	 * cgroup_exit() is called in the copy_process() failure path.
+	 * Ignore this case since the task hasn't run yet; this avoids
+	 * trying to poke a half-freed task state from generic code.
+	 */
+	if (!(task->flags & PF_EXITING))
+		return;
+
+	sched_move_task(task);
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
 				struct cftype *cftype, u64 shareval)
@@ -8443,6 +8459,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
+	.exit           = cpu_cgroup_exit,
 	.legacy_cftypes	= cpu_files,
 	.early_init	= 1,
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3313052..163b829 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -285,7 +285,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
-	if (!cfs_rq->on_list) {
+	WARN_ON(cfs_rq->on_list == -1);
+
+	if (cfs_rq->on_list == 0) {
 		/*
 		 * Ensure we either appear before our parent (if already
 		 * enqueued) or force our parent to appear after us when it is
@@ -302,15 +304,17 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 		}
 
 		cfs_rq->on_list = 1;
+		trace_printk("%p\n", cfs_rq);
 	}
 }
 
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
-	if (cfs_rq->on_list) {
+	if (cfs_rq->on_list == 1) {
 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
-		cfs_rq->on_list = 0;
+		trace_printk("%p\n", cfs_rq);
 	}
+	cfs_rq->on_list = -1;
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
@@ -8356,9 +8360,10 @@ void unregister_fair_sched_group(struct task_group *tg)
 		/*
 		 * Only empty task groups can be destroyed; so we can speculatively
 		 * check on_list without danger of it being re-added.
-		 */
+		 *
 		if (!tg->cfs_rq[cpu]->on_list)
 			continue;
+		 */
 
 		rq = cpu_rq(cpu);
 

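For completeness, the fair.c hunks above are pure instrumentation:
on_list grows a third state (-1) which list_del_leaf_cfs_rq() now
writes unconditionally, and the WARN_ON() in list_add_leaf_cfs_rq()
fires when a poisoned cfs_rq gets re-enqueued -- which is exactly what
the trace above caught. The same poison-flag trick in miniature
(userspace sketch, all names made up):

/* Minimal sketch of the poison-flag debug pattern; not kernel code. */
#include <assert.h>

enum { OFF_LIST = 0, ON_LIST = 1, POISONED = -1 };

struct node {
	int on_list;
};

static void node_add(struct node *n)
{
	assert(n->on_list != POISONED);	/* stands in for the WARN_ON() */
	if (n->on_list == OFF_LIST)
		n->on_list = ON_LIST;
}

static void node_del(struct node *n)
{
	/* poison unconditionally so any later add is caught */
	n->on_list = POISONED;
}

int main(void)
{
	struct node n = { .on_list = OFF_LIST };

	node_add(&n);
	node_del(&n);
	node_add(&n);	/* trips the assert, like the WARN above */

	return 0;
}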