lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Thu, 08 Mar 2012 16:45:13 +0800
From:	Li Zefan <lizf@...fujitsu.com>
To:	Tejun Heo <tj@...nel.org>
CC:	Frederic Weisbecker <fweisbec@...il.com>,
	LKML <linux-kernel@...r.kernel.org>,
	Cgroups <cgroups@...r.kernel.org>, Mel Gorman <mgorman@...e.de>,
	David Rientjes <rientjes@...gle.com>,
	缪 勰 <miaox@...fujitsu.com>,
	Andrew Morton <akpm@...ux-foundation.org>
Subject: [RFC][PATCH] cgroup: fix race between fork and cgroup freezing

There is a race between fork and cgroup freezing; a similar bug also exists in cpuset, and both are long-standing bugs.

As reported by Frederic:

> When a user freezes a cgroup, the freezer sets the subsystem state
> to CGROUP_FREEZING and then iterates over the tasks in the cgroup links.
> 
> But there is a possible race here, although unlikely, if a task
> forks and the parent is preempted between write_unlock(tasklist_lock)
> and cgroup_post_fork(). If we freeze the cgroup while the parent
> is sleeping and the parent wakes up thereafter, its child will
> be missing from the set of tasks to freeze because:
> 
> - The child was not yet linked to its css_set->tasks, as is done
> from cgroup_post_fork(). cgroup_iter_start() has thus missed it.
> 
> - The cgroup freezer's fork callback can handle that child but
> cgroup_fork_callbacks() has been called already.

This patch fixes the race by using a seqcount. We read the counter before
calling cgroup_fork_callbacks() and check it again after cgroup_post_fork().
If the sequence numbers don't match, we know the forking task's cgroup
has been (or is being) frozen, so we freeze the child task.

cpuset can be fixed accordingly.

Reported-by: Frederic Weisbecker <fweisbec@...il.com>
Signed-off-by: Li Zefan <lizf@...fujitsu.com>
---
 include/linux/cgroup.h    |    5 +++--
 include/linux/init_task.h |    8 ++++++++
 include/linux/sched.h     |    1 +
 kernel/cgroup.c           |   18 ++++++++++++++++--
 kernel/cgroup_freezer.c   |   22 ++++++++++++++++++++++
 kernel/fork.c             |    5 +++--
 6 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e9b6021..38c82b2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -32,8 +32,8 @@ extern int cgroup_lock_is_held(void);
 extern bool cgroup_lock_live_group(struct cgroup *cgrp);
 extern void cgroup_unlock(void);
 extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_fork_callbacks(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern void cgroup_fork_callbacks(struct task_struct *p, int *seq);
+extern void cgroup_post_fork(struct task_struct *p, int seq);
 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
 extern int cgroupstats_build(struct cgroupstats *stats,
 				struct dentry *dentry);
@@ -495,6 +495,7 @@ struct cgroup_subsys {
 	void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
 		       struct cgroup_taskset *tset);
 	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
+	void (*post_fork)(struct cgroup_subsys *ss, struct task_struct *task);
 	void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
 			struct cgroup *old_cgrp, struct task_struct *task);
 	int (*populate)(struct cgroup_subsys *ss,
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 9c66b1a..91665ce 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -125,6 +125,13 @@ extern struct cred init_cred;
 # define INIT_PERF_EVENTS(tsk)
 #endif
 
+#ifdef CONFIG_CGROUPS
+# define INIT_CGROUPS(tsk)						\
+	.fork_seq = SEQCNT_ZERO
+#else
+# define INIT_CGROUPS(tsk)
+#endif
+
 #define INIT_TASK_COMM "swapper"
 
 /*
@@ -192,6 +199,7 @@ extern struct cred init_cred;
 	INIT_FTRACE_GRAPH						\
 	INIT_TRACE_RECURSION						\
 	INIT_TASK_RCU_PREEMPT(tsk)					\
+	INIT_CGROUPS(tsk)						\
 }
 
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7d379a6..d487777 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1507,6 +1507,7 @@ struct task_struct {
 	struct css_set __rcu *cgroups;
 	/* cg_list protected by css_set_lock and tsk->alloc_lock */
 	struct list_head cg_list;
+	seqcount_t fork_seq;
 #endif
 #ifdef CONFIG_FUTEX
 	struct robust_list_head __user *robust_list;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b53..77435ec 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
 #include <linux/eventfd.h>
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/seqlock.h>
 
 #include <linux/atomic.h>
 
@@ -4558,6 +4559,7 @@ void cgroup_fork(struct task_struct *child)
 	child->cgroups = current->cgroups;
 	get_css_set(child->cgroups);
 	INIT_LIST_HEAD(&child->cg_list);
+	seqcount_init(&child->fork_seq);
 }
 
 /**
@@ -4568,8 +4570,10 @@ void cgroup_fork(struct task_struct *child)
  * tasklist. No need to take any locks since no-one can
  * be operating on this task.
  */
-void cgroup_fork_callbacks(struct task_struct *child)
+void cgroup_fork_callbacks(struct task_struct *child, int *seq)
 {
+	*seq = read_seqcount_begin(&current->fork_seq);
+
 	if (need_forkexit_callback) {
 		int i;
 		/*
@@ -4594,7 +4598,7 @@ void cgroup_fork_callbacks(struct task_struct *child)
  * with the first call to cgroup_iter_start() - to guarantee that the
  * new task ends up on its list.
  */
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child, int seq)
 {
 	if (use_task_css_set_links) {
 		write_lock(&css_set_lock);
@@ -4612,6 +4616,16 @@ void cgroup_post_fork(struct task_struct *child)
 			list_add(&child->cg_list, &child->cgroups->tasks);
 		}
 		write_unlock(&css_set_lock);
+
+		if (read_seqcount_retry(&current->fork_seq, seq)) {
+			int i;
+
+			for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+				struct cgroup_subsys *ss = subsys[i];
+				if (ss->post_fork)
+					ss->post_fork(ss, child);
+			}
+		}
 	}
 }
 /**
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fc0646b..88f210e 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -216,6 +216,25 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
 	spin_unlock_irq(&freezer->lock);
 }
 
+static void freezer_post_fork(struct cgroup_subsys *ss,
+			      struct task_struct *task)
+{
+	struct freezer *freezer;
+
+	cgroup_lock();
+
+	freezer = task_freezer(task);
+	if (!freezer->css.cgroup->parent)
+		goto out;
+
+	spin_lock_irq(&freezer->lock);
+	if (freezer->state != CGROUP_THAWED)
+		freeze_task(task);
+	spin_unlock_irq(&freezer->lock);
+out:
+	cgroup_unlock();
+}
+
 /*
  * caller must hold freezer->lock
  */
@@ -286,6 +305,8 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 			continue;
 		if (!freezing(task) && !freezer_should_skip(task))
 			num_cant_freeze_now++;
+
+		write_seqcount_barrier(&task->fork_seq);
 	}
 	cgroup_iter_end(cgroup, &it);
 
@@ -381,4 +402,5 @@ struct cgroup_subsys freezer_subsys = {
 	.subsys_id	= freezer_subsys_id,
 	.can_attach	= freezer_can_attach,
 	.fork		= freezer_fork,
+	.post_fork	= freezer_post_fork,
 };
diff --git a/kernel/fork.c b/kernel/fork.c
index e2cd3e2..73abc25 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1077,6 +1077,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	int retval;
 	struct task_struct *p;
 	int cgroup_callbacks_done = 0;
+	int seq;
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1331,7 +1332,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Now that the task is set up, run cgroup callbacks if
 	 * necessary. We need to run them before the task is visible
 	 * on the tasklist. */
-	cgroup_fork_callbacks(p);
+	cgroup_fork_callbacks(p, &seq);
 	cgroup_callbacks_done = 1;
 
 	/* Need tasklist lock for parent etc handling! */
@@ -1395,7 +1396,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
-	cgroup_post_fork(p);
+	cgroup_post_fork(p, seq);
 	if (clone_flags & CLONE_THREAD)
 		threadgroup_change_end(current);
 	perf_event_fork(p);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ