linux-kernel - Re: [PATCH 1/1] containers: implement nsproxy containers subsystem

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4665346F.20200@openvz.org>
Date:	Tue, 05 Jun 2007 14:01:19 +0400
From:	Pavel Emelianov <xemul@...nvz.org>
To:	"Serge E. Hallyn" <serue@...ibm.com>
CC:	menage@...gle.com, Andrew Morton <akpm@...ux-foundation.org>,
	dev@...ru, vatsa@...ibm.com, ebiederm@...ssion.com,
	svaidy@...ux.vnet.ibm.com, balbir@...ibm.com, pj@....com,
	cpw@....com, ckrm-tech@...ts.sourceforge.net,
	linux-kernel@...r.kernel.org, containers@...ts.osdl.org,
	mbligh@...gle.com, rohitseth@...gle.com, devel@...nvz.org
Subject: Re: [PATCH 1/1] containers: implement nsproxy containers subsystem

Serge E. Hallyn wrote:
>>>From 190ea72d213393dd1440643b2b87b5b2128dff87 Mon Sep 17 00:00:00 2001
> From: Serge E. Hallyn <serue@...ibm.com>
> Date: Mon, 4 Jun 2007 14:18:52 -0400
> Subject: [PATCH 1/1] containers: implement nsproxy containers subsystem
> 
> When a task enters a new namespace via a clone() or unshare(), a new
> container is created and the task moves into it.  This enables

I have a design question.

How does the child that has a new namespace figure out what id
this namespace has in containers?

> tracking - and applying resource controls - to virtual servers,
> resource jobs, often logins (using per-user namespaces), and in
> general any application of per-process namespaces.
> 
> This version uses an ida to generate a unique id for auto-created
> containers, i.e. those created through container_clone().  After
> a container is created, it can be renamed to a more useful name if
> so desired.  The original id isn't released until the container is
> destroyed.  (This could be fixed at container_rename() if it was
> deemed desirable)
> 
> There is just one ida for all containers, because an ida per container
> to track ids used for child containers would take a great deal of
> space, and probably isn't very useful.
> 
> Changelog:
> 	Use an ida to generate unique ids for auto-created containers.
> 
> Signed-off-by: Serge E. Hallyn <serue@...ibm.com>
> ---
>  include/linux/container.h        |    2 +
>  include/linux/container_subsys.h |    6 ++
>  include/linux/nsproxy.h          |    7 +++
>  init/Kconfig                     |    9 ++++
>  kernel/Makefile                  |    1 +
>  kernel/container.c               |   52 ++++++++++++++++----
>  kernel/ns_container.c            |   99 ++++++++++++++++++++++++++++++++++++++
>  kernel/nsproxy.c                 |   16 ++++++
>  8 files changed, 183 insertions(+), 9 deletions(-)
>  create mode 100644 kernel/ns_container.c
> 
> diff --git a/include/linux/container.h b/include/linux/container.h
> index 37c0bdf..2f8ee47 100644
> --- a/include/linux/container.h
> +++ b/include/linux/container.h
> @@ -87,6 +87,8 @@ struct container {
>  	/* Private pointers for each registered subsystem */
>  	struct container_subsys_state *subsys[CONTAINER_SUBSYS_COUNT];
>  
> +	int unique_id;  /* usually -1, set to >=0  in container_clone() */
> +
>  	struct containerfs_root *root;
>  	struct container *top_container;
>  
> diff --git a/include/linux/container_subsys.h b/include/linux/container_subsys.h
> index 8fea7cf..9861751 100644
> --- a/include/linux/container_subsys.h
> +++ b/include/linux/container_subsys.h
> @@ -24,3 +24,9 @@ SUBSYS(debug)
>  #endif
>  
>  /* */
> +
> +#ifdef CONFIG_CONTAINER_NS
> +SUBSYS(ns)
> +#endif
> +
> +/* */
> diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
> index 189e0dc..8be975b 100644
> --- a/include/linux/nsproxy.h
> +++ b/include/linux/nsproxy.h
> @@ -54,4 +54,11 @@ static inline void exit_task_namespaces(struct task_struct *p)
>  		put_nsproxy(ns);
>  	}
>  }
> +
> +#ifdef CONFIG_CONTAINER_NS
> +int ns_container_clone(struct task_struct *tsk);
> +#else
> +static inline int ns_container_clone(struct task_struct *tsk) { return 0; }
> +#endif
> +
>  #endif
> diff --git a/init/Kconfig b/init/Kconfig
> index 5861ad9..d79c505 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -355,6 +355,15 @@ config CONTAINER_CPUACCT
>  	  Provides a simple Resource Controller for monitoring the
>  	  total CPU consumed by the tasks in a container
>  
> +config CONTAINER_NS
> +        bool "Namespace container subsystem"
> +        select CONTAINERS
> +        help
> +          Provides a simple namespace container subsystem to
> +          provide hierarchical naming of sets of namespaces,
> +          for instance virtual servers and checkpoint/restart
> +          jobs.
> +
>  config PROC_PID_CPUSET
>  	bool "Include legacy /proc/<pid>/cpuset file"
>  	depends on CPUSETS
> diff --git a/kernel/Makefile b/kernel/Makefile
> index f73b3d3..34f2345 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -40,6 +40,7 @@ obj-$(CONFIG_CONTAINERS) += container.o
>  obj-$(CONFIG_CONTAINER_DEBUG) += container_debug.o
>  obj-$(CONFIG_CPUSETS) += cpuset.o
>  obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o
> +obj-$(CONFIG_CONTAINER_NS) += ns_container.o
>  obj-$(CONFIG_IKCONFIG) += configs.o
>  obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
>  obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
> diff --git a/kernel/container.c b/kernel/container.c
> index 6f80487..0d46d66 100644
> --- a/kernel/container.c
> +++ b/kernel/container.c
> @@ -55,6 +55,7 @@
>  #include <linux/time.h>
>  #include <linux/backing-dev.h>
>  #include <linux/sort.h>
> +#include <linux/idr.h>
>  
>  #include <asm/uaccess.h>
>  #include <asm/atomic.h>
> @@ -64,6 +65,8 @@
>  
>  static DEFINE_MUTEX(container_mutex);
>  
> +static struct ida container_ida;
> +
>  /* Generate an array of container subsystem pointers */
>  #define SUBSYS(_x) &_x ## _subsys,
>  
> @@ -1818,6 +1821,7 @@ static long container_create(struct container *parent, struct dentry *dentry,
>  	INIT_LIST_HEAD(&cont->children);
>  	INIT_LIST_HEAD(&cont->css_groups);
>  	INIT_LIST_HEAD(&cont->release_list);
> +	cont->unique_id = -1;
>  
>  	cont->parent = parent;
>  	cont->root = parent->root;
> @@ -1941,6 +1945,9 @@ static int container_rmdir(struct inode *unused_dir, struct dentry *dentry)
>  	dput(d);
>  	root->number_of_containers--;
>  
> +	if (cont->unique_id != -1)
> +		ida_remove(&container_ida, cont->unique_id);
> +
>  	if (!list_empty(&cont->release_list))
>  		list_del(&cont->release_list);
>  	set_bit(CONT_RELEASABLE, &parent->flags);
> @@ -2040,6 +2047,8 @@ int __init container_init_early(void)
>  		if (ss->early_init)
>  			container_init_subsys(ss);
>  	}
> +
> +	ida_init(&container_ida);
>  	return 0;
>  }
>  
> @@ -2302,12 +2311,6 @@ void container_exit(struct task_struct *tsk, int run_callbacks)
>  		put_css_group_taskexit(cg);
>  }
>  
> -static atomic_t namecnt;
> -static void get_unused_name(char *buf)
> -{
> -	sprintf(buf, "node%d", atomic_inc_return(&namecnt));
> -}
> -
>  /**
>   * container_clone - duplicate the current container in the hierarchy
>   * that the given subsystem is attached to, and move this task into
> @@ -2322,6 +2325,7 @@ int container_clone(struct task_struct *tsk, struct container_subsys *subsys)
>  	struct inode *inode;
>  	struct css_group *cg;
>  	struct containerfs_root *root;
> +	int new_id;
>  
>  	/* We shouldn't be called by an unregistered subsystem */
>  	BUG_ON(!subsys->active);
> @@ -2340,6 +2344,19 @@ int container_clone(struct task_struct *tsk, struct container_subsys *subsys)
>  	}
>  	cg = tsk->containers;
>  	parent = task_container(tsk, subsys->subsys_id);
> +
> +	ret = ida_get_new(&container_ida, &new_id);
> +	if (ret == -EAGAIN) {
> +		if (!ida_pre_get(&container_ida, GFP_KERNEL))
> +			return -ENOMEM;
> +		ret = ida_get_new(&container_ida, &new_id);
> +	}
> +	if (ret) {
> +		ret = -EBUSY;
> +		goto out_no_name;
> +	}
> +	sprintf(nodename, "node%d", new_id);
> +
>  	/* Pin the hierarchy */
>  	atomic_inc(&parent->root->sb->s_active);
>  
> @@ -2347,8 +2364,8 @@ int container_clone(struct task_struct *tsk, struct container_subsys *subsys)
>  	get_css_group(cg);
>  	mutex_unlock(&container_mutex);
>  
> +
>  	/* Now do the VFS work to create a container */
> -	get_unused_name(nodename);
>  	inode = parent->dentry->d_inode;
>  
>  	/* Hold the parent directory mutex across this operation to
> @@ -2403,6 +2420,10 @@ int container_clone(struct task_struct *tsk, struct container_subsys *subsys)
>  
>  	/* All seems fine. Finish by moving the task into the new container */
>  	ret = attach_task(child, tsk);
> +
> +	if (!ret)
> +		child->unique_id = new_id;
> +
>  	mutex_unlock(&container_mutex);
>  
>   out_release:
> @@ -2410,19 +2431,32 @@ int container_clone(struct task_struct *tsk, struct container_subsys *subsys)
>  
>  	mutex_lock(&container_mutex);
>  	put_css_group(cg);
> +
> + out_no_name:
>  	mutex_unlock(&container_mutex);
>  	deactivate_super(parent->root->sb);
>  	return ret;
>  }
>  
> -/* See if "cont" is a descendant of the current task's container in
> - * the appropriate hierarchy */
> +/*
> + * See if "cont" is a descendant of the current task's container in
> + * the appropriate hierarchy
> + *
> + * If we are sending in dummytop, then presumably we are creating
> + * the top container in the subsystem.
> + *
> + * Called only by the ns (nsproxy) container.
> + */
>  
>  int container_is_descendant(const struct container *cont)
>  {
>  	int ret;
>  	struct container *target;
>  	int subsys_id;
> +
> +	if (cont == dummytop)
> +		return 1;
> +
>  	get_first_subsys(cont, NULL, &subsys_id);
>  	target = task_container(current, subsys_id);
>  	while (cont != target && cont!= cont->top_container) {
> diff --git a/kernel/ns_container.c b/kernel/ns_container.c
> new file mode 100644
> index 0000000..3465716
> --- /dev/null
> +++ b/kernel/ns_container.c
> @@ -0,0 +1,99 @@
> +/*
> + * ns_container.c - namespace container subsystem
> + *
> + * Copyright 2006, 2007 IBM Corp
> + */
> +
> +#include <linux/module.h>
> +#include <linux/container.h>
> +#include <linux/fs.h>
> +
> +struct ns_container {
> +	struct container_subsys_state css;
> +	spinlock_t lock;
> +};
> +
> +struct container_subsys ns_subsys;
> +
> +static inline struct ns_container *container_to_ns(
> +		struct container *container)
> +{
> +	return container_of(container_subsys_state(container, ns_subsys_id),
> +			    struct ns_container, css);
> +}
> +
> +int ns_container_clone(struct task_struct *task)
> +{
> +	return container_clone(task, &ns_subsys);
> +}
> +
> +/*
> + * Rules:
> + *   1. you can only enter a container which is a child of your current
> + *     container
> + *   2. you can only place another process into a container if
> + *     a. you have CAP_SYS_ADMIN
> + *     b. your container is an ancestor of task's destination container
> + *       (hence either you are in the same container as task, or in an
> + *        ancestor container thereof)
> + */
> +static int ns_can_attach(struct container_subsys *ss,
> +		struct container *new_container, struct task_struct *task)
> +{
> +	struct container *orig;
> +
> +	if (current != task) {
> +		if (!capable(CAP_SYS_ADMIN))
> +			return -EPERM;
> +
> +		if (!container_is_descendant(new_container))
> +			return -EPERM;
> +	}
> +
> +	if (atomic_read(&new_container->count) != 0)
> +		return -EPERM;
> +
> +	orig = task_container(task, ns_subsys_id);
> +	if (orig && orig != new_container->parent)
> +		return -EPERM;
> +
> +	return 0;
> +}
> +
> +/*
> + * Rules: you can only create a container if
> + *     1. you are capable(CAP_SYS_ADMIN)
> + *     2. the target container is a descendant of your own container
> + */
> +static int ns_create(struct container_subsys *ss, struct container *container)
> +{
> +	struct ns_container *ns_container;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +	if (!container_is_descendant(container))
> +		return -EPERM;
> +
> +	ns_container = kzalloc(sizeof(*ns_container), GFP_KERNEL);
> +	if (!ns_container) return -ENOMEM;
> +	spin_lock_init(&ns_container->lock);
> +	container->subsys[ns_subsys.subsys_id] = &ns_container->css;
> +	return 0;
> +}
> +
> +static void ns_destroy(struct container_subsys *ss,
> +			struct container *container)
> +{
> +	struct ns_container *ns_container;
> +	
> +	ns_container = container_to_ns(container);
> +	kfree(ns_container);
> +}
> +
> +struct container_subsys ns_subsys = {
> +	.name = "ns",
> +	.can_attach = ns_can_attach,
> +	.create = ns_create,
> +	.destroy  = ns_destroy,
> +	.subsys_id = ns_subsys_id,
> +};
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 1bc4b55..afce808 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -124,7 +124,14 @@ int copy_namespaces(int flags, struct task_struct *tsk)
>  		goto out;
>  	}
>  
> +	err = ns_container_clone(tsk);
> +	if (err) {
> +		put_nsproxy(new_ns);
> +		goto out;
> +	}
> +
>  	tsk->nsproxy = new_ns;
> +
>  out:
>  	put_nsproxy(old_ns);
>  	return err;
> @@ -177,6 +184,15 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
>  	if (IS_ERR(*new_nsp)) {
>  		err = PTR_ERR(*new_nsp);
>  		put_nsproxy(old_ns);
> +		goto out;
> +	}
> +
> +	err = ns_container_clone(current);
> +	if (err) {
> +		put_nsproxy(*new_nsp);
> +		put_nsproxy(old_ns);
>  	}
> +
> +out:
>  	return err;
>  }

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/