linux-kernel - Re: Attaching a process to cgroups

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20120723204159.GB6379@thinkpad>
Date:	Mon, 23 Jul 2012 22:41:59 +0200
From:	Andrea Righi <righi.andrea@...il.com>
To:	Mike Galbraith <efault@....de>
Cc:	Alexey Vlasov <renton@...ton.name>,
	"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
	linux-kernel@...r.kernel.org
Subject: Re: Attaching a process to cgroups

On Thu, Jun 21, 2012 at 10:23:02AM +0200, Mike Galbraith wrote:
> On Thu, 2012-06-21 at 11:54 +0400, Alexey Vlasov wrote: 
> > On Wed, Jun 20, 2012 at 02:28:18PM +0200, Mike Galbraith wrote:
> > > 
> > > kernel/cgroup.c::cgroup_attach_task()
> > > {
> > > ...
> > > 	synchronize_rcu();
> > > ...
> > > }
> > 
> > So nothing can be done here? (I mean if only I knew how to fix it I
> > wouldn't ask about it ;)
> 
> Sure, kill the obnoxious thing, it's sitting right in the middle of the
> userspace interface.
> 
> I banged on it a while back (wrt explosive android patches), extracted
> RCU from the userspace interface.  It seemed to work great, much faster,
> couldn't make it explode.  I wouldn't bet anything I wasn't willing to
> immediately part with that the result was really really safe though ;-)
> 
> -Mike

JFYI,

I'm testing the following patch in a bunch of hosts and I wasn't able to
make any of them to explode, even running a multi-threaded
cgroup-intensive workload, but probably I was just lucky (or unlucky,
depending on the point of view).

It is basically the same Not-signed-off-by work posted by Mike a while
ago: https://lkml.org/lkml/2011/4/12/599.

In addition, I totally removed the synchronize_rcu() call from
cgroup_attach_task() and added the call_rcu -> schedule_work removal
also for css_set. The latter looks unnecessary to me from a logical
point of view, or maybe I'm missing something, because I can't explain
why with it I can't trigger any BUG / oops.

Mike, did you make any progress from your old patch?

Thanks,
-Andrea

---
 include/linux/cgroup.h |    2 ++
 kernel/cgroup.c        |   91 +++++++++++++++++++++++++++++-------------------
 2 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d3f5fba..2bab2d6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -212,6 +212,7 @@ struct cgroup {
 
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
+	struct work_struct work;
 
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
@@ -260,6 +261,7 @@ struct css_set {
 
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
+	struct work_struct work;
 };
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b303dfc..aa7cc06 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -396,6 +396,21 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
+static void free_css_set_work(struct work_struct *work)
+{
+	struct css_set *cg = container_of(work, struct css_set, work);
+
+	kfree(cg);
+}
+
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+	struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+
+	INIT_WORK(&cg->work, free_css_set_work);
+	schedule_work(&cg->work);
+}
+
 static void __put_css_set(struct css_set *cg, int taskexit)
 {
 	struct cg_cgroup_link *link;
@@ -433,7 +448,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 	}
 
 	write_unlock(&css_set_lock);
-	kfree_rcu(cg, rcu_head);
+	call_rcu(&cg->rcu_head, free_css_set_rcu);
 }
 
 /*
@@ -875,44 +890,52 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 	return ret;
 }
 
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static void free_cgroup_work(struct work_struct *work)
 {
-	/* is dentry a directory ? if so, kfree() associated cgroup */
-	if (S_ISDIR(inode->i_mode)) {
-		struct cgroup *cgrp = dentry->d_fsdata;
-		struct cgroup_subsys *ss;
-		BUG_ON(!(cgroup_is_removed(cgrp)));
-		/* It's possible for external users to be holding css
-		 * reference counts on a cgroup; css_put() needs to
-		 * be able to access the cgroup after decrementing
-		 * the reference count in order to know if it needs to
-		 * queue the cgroup to be handled by the release
-		 * agent */
-		synchronize_rcu();
+	struct cgroup *cgrp = container_of(work, struct cgroup, work);
+	struct cgroup_subsys *ss;
 
-		mutex_lock(&cgroup_mutex);
-		/*
-		 * Release the subsystem state objects.
-		 */
-		for_each_subsys(cgrp->root, ss)
-			ss->destroy(cgrp);
+	mutex_lock(&cgroup_mutex);
+	/*
+	 * Release the subsystem state objects.
+	 */
+	for_each_subsys(cgrp->root, ss)
+		ss->destroy(cgrp);
 
-		cgrp->root->number_of_cgroups--;
-		mutex_unlock(&cgroup_mutex);
+	cgrp->root->number_of_cgroups--;
+	mutex_unlock(&cgroup_mutex);
 
-		/*
-		 * Drop the active superblock reference that we took when we
-		 * created the cgroup
-		 */
-		deactivate_super(cgrp->root->sb);
+	/*
+	 * Drop the active superblock reference that we took when we
+	 * created the cgroup
+	 */
+	deactivate_super(cgrp->root->sb);
 
-		/*
-		 * if we're getting rid of the cgroup, refcount should ensure
-		 * that there are no pidlists left.
-		 */
-		BUG_ON(!list_empty(&cgrp->pidlists));
+	/*
+	 * if we're getting rid of the cgroup, refcount should ensure
+	 * that there are no pidlists left.
+	 */
+	BUG_ON(!list_empty(&cgrp->pidlists));
+
+	kfree(cgrp);
+}
+
+static void free_cgroup_rcu(struct rcu_head *obj)
+{
+	struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
+
+	INIT_WORK(&cgrp->work, free_cgroup_work);
+	schedule_work(&cgrp->work);
+}
+
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+	/* is dentry a directory ? if so, kfree() associated cgroup */
+	if (S_ISDIR(inode->i_mode)) {
+		struct cgroup *cgrp = dentry->d_fsdata;
 
-		kfree_rcu(cgrp, rcu_head);
+		BUG_ON(!(cgroup_is_removed(cgrp)));
+		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
 		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -1990,8 +2013,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 			ss->attach(cgrp, &tset);
 	}
 
-	synchronize_rcu();
-
 	/*
 	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
 	 * is no longer empty.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/