linux-kernel - [PATCH 6/8] cgroup: mount cgroupns-root when inside non-init cgroupns

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-Id: <1451937294-22589-7-git-send-email-serge.hallyn@ubuntu.com>
Date:	Mon,  4 Jan 2016 13:54:51 -0600
From:	serge.hallyn@...ntu.com
To:	linux-kernel@...r.kernel.org
Cc:	adityakali@...gle.com, tj@...nel.org, linux-api@...r.kernel.org,
	containers@...ts.linux-foundation.org, cgroups@...r.kernel.org,
	lxc-devel@...ts.linuxcontainers.org, akpm@...ux-foundation.org,
	ebiederm@...ssion.com, gregkh@...uxfoundation.org,
	lizefan@...wei.com, hannes@...xchg.org,
	Serge Hallyn <serge.hallyn@...ntu.com>,
	Serge Hallyn <serge.hallyn@...onical.com>
Subject: [PATCH 6/8] cgroup: mount cgroupns-root when inside non-init cgroupns

From: Serge Hallyn <serge.hallyn@...ntu.com>

This patch enables cgroup mounting inside userns when a process
as appropriate privileges. The cgroup filesystem mounted is
rooted at the cgroupns-root. Thus, in a container-setup, only
the hierarchy under the cgroupns-root is exposed inside the container.
This allows container management tools to run inside the containers
without depending on any global state.

Signed-off-by: Serge Hallyn <serge.hallyn@...onical.com>
---
Changelog:
	20151116 - Don't allow user namespaces to bind new subsystems
	20151118 - postpone the FS_USERNS_MOUNT flag until the
	           last patch, until we can convince ourselves it
		   is safe.
	20151207 - Switch to walking up the kernfs path from kn root.
		 - Group initialized variables
		 - Explain the capable(CAP_SYS_ADMIN) check
		 - Style fixes
	20160104 - kernfs_node_dentry: lock inode for lookup_one_len()

Signed-off-by: Serge Hallyn <serge.hallyn@...ntu.com>
---
 fs/kernfs/mount.c |    2 ++
 kernel/cgroup.c   |   40 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 7224296..074bb8b 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -120,7 +120,9 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
 		kntmp = find_next_ancestor(kn, knparent);
 		if (WARN_ON(!kntmp))
 			return ERR_PTR(-EINVAL);
+		mutex_lock(&d_inode(dentry)->i_mutex);
 		dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name));
+		mutex_unlock(&d_inode(dentry)->i_mutex);
 		dput(dentry);
 		if (IS_ERR(dtmp))
 			return dtmp;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2bb58a1..d0bed8f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1983,6 +1983,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 {
 	bool is_v2 = fs_type == &cgroup2_fs_type;
 	struct super_block *pinned_sb = NULL;
+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct cgroup_subsys *ss;
 	struct cgroup_root *root;
 	struct cgroup_sb_opts opts;
@@ -1991,6 +1992,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	int i;
 	bool new_sb;
 
+	get_cgroup_ns(ns);
+
+	/* Check if the caller has permission to mount. */
+	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+		put_cgroup_ns(ns);
+		return ERR_PTR(-EPERM);
+	}
+
 	/*
 	 * The first time anyone tries to mount a cgroup, enable the list
 	 * linking each css_set to its tasks and fix up all existing tasks.
@@ -2106,6 +2115,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		goto out_unlock;
 	}
 
+	/*
+	 * We know this subsystem has not yet been bound.  Users in a non-init
+	 * user namespace may only mount hierarchies with no bound subsystems,
+	 * i.e. 'none,name=user1'
+	 */
+	if (!opts.none && !capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
 	root = kzalloc(sizeof(*root), GFP_KERNEL);
 	if (!root) {
 		ret = -ENOMEM;
@@ -2124,12 +2143,30 @@ out_free:
 	kfree(opts.release_agent);
 	kfree(opts.name);
 
-	if (ret)
+	if (ret) {
+		put_cgroup_ns(ns);
 		return ERR_PTR(ret);
+	}
 out_mount:
 	dentry = kernfs_mount(fs_type, flags, root->kf_root,
 			      is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
 			      &new_sb);
+
+	/*
+	 * In non-init cgroup namespace, instead of root cgroup's
+	 * dentry, we return the dentry corresponding to the
+	 * cgroupns->root_cgrp.
+	 */
+	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+		struct dentry *nsdentry;
+		struct cgroup *cgrp;
+
+		cgrp = cset_cgroup_from_root(ns->root_cset, root);
+		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
+		dput(dentry);
+		dentry = nsdentry;
+	}
+
 	if (IS_ERR(dentry) || !new_sb)
 		cgroup_put(&root->cgrp);
 
@@ -2142,6 +2179,7 @@ out_mount:
 		deactivate_super(pinned_sb);
 	}
 
+	put_cgroup_ns(ns);
 	return dentry;
 }
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/