[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <m1pr3t2fvl.fsf_-_@fess.ebiederm.org>
Date: Thu, 25 Feb 2010 12:57:02 -0800
From: ebiederm@...ssion.com (Eric W. Biederman)
To: hadi@...erus.ca
Cc: Daniel Lezcano <dlezcano@...ibm.com>,
Patrick McHardy <kaber@...sh.net>,
Linux Netdev List <netdev@...r.kernel.org>,
containers@...ts.linux-foundation.org,
Netfilter Development Mailinglist
<netfilter-devel@...r.kernel.org>,
Ben Greear <greearb@...delatech.com>,
Serge Hallyn <serue@...ibm.com>,
Matt Helsley <matthltc@...ibm.com>
Subject: [RFC][PATCH] ns: Syscalls for better namespace sharing control.
Introduce two new system calls:
int nsfd(pid_t pid, unsigned long nstype);
int setns(unsigned long nstype, int fd);
These two new system calls address three specific problems that can
make namespaces hard to work with.
- Namespaces require a dedicated process to pin them in memory.
- It is not possible to use a namespace unless you are the
child of the original creator.
- Namespaces don't have names that userspace can use to talk
about them.
The nsfd() system call returns a file descriptor that can
be used to talk about a specific namespace, and to keep
the specified namespace alive.
The fd returned by nsfd() can be bind mounted as:
mount --bind /proc/self/fd/N /some/filesystem/path
to keep the namespace alive for as long as
it is mounted.
open works on the fd returned by nsfd() so another
process can get a hold of it and do interesting things.
Overall that allows for persistent naming of namespaces
according to userspace policy.
setns() allows changing the namespace of the current process
to a namespace that originates with nsfd().
Signed-off-by: Eric W. Biederman <ebiederm@...ssion.com>
---
This is just my first pass at this, and it is not yet compile tested.
I was pleasantly surprised at how easy all of this was to implement.
I have verified mount will let me bind mount /proc/self/fd/N so
there is nothing special needed for the mount case, except
getting the reference counting and lifetime rules correct for
my filesystem objects.
arch/x86/ia32/ia32entry.S | 2 +
arch/x86/include/asm/unistd_32.h | 4 +-
arch/x86/include/asm/unistd_64.h | 4 +
arch/x86/kernel/syscall_table_32.S | 2 +
fs/Makefile | 2 +-
fs/nsfd.c | 278 ++++++++++++++++++++++++++++++++++++
include/linux/magic.h | 1 +
include/linux/nsproxy.h | 1 +
include/linux/nstype.h | 6 +
kernel/nsproxy.c | 17 +++
10 files changed, 315 insertions(+), 2 deletions(-)
create mode 100644 fs/nsfd.c
create mode 100644 include/linux/nstype.h
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 53147ad..9fd33de 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -842,4 +842,6 @@ ia32_sys_call_table:
.quad compat_sys_rt_tgsigqueueinfo /* 335 */
.quad sys_perf_event_open
.quad compat_sys_recvmmsg
+ .quad sys_nsfd
+ .quad sys_setns
ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 3baf379..5b7833c 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,12 @@
#define __NR_rt_tgsigqueueinfo 335
#define __NR_perf_event_open 336
#define __NR_recvmmsg 337
+#define __NR_nsfd 338
+#define __NR_setns 339
#ifdef __KERNEL__
-#define NR_syscalls 338
+#define NR_syscalls 340
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 4843f7b..260d542 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,10 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
#define __NR_recvmmsg 299
__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_nsfd 300
+__SYSCALL(__NR_nsfd, sys_nsfd)
+#define __NR_setns 301
+__SYSCALL(__NR_setns, sys_setns)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 15228b5..e09a45b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,5 @@ ENTRY(sys_call_table)
.long sys_rt_tgsigqueueinfo /* 335 */
.long sys_perf_event_open
.long sys_recvmmsg
+ .long sys_nsfd
+ .long sys_setns
diff --git a/fs/Makefile b/fs/Makefile
index af6d047..74d5091 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
- stack.o fs_struct.o
+ stack.o fs_struct.o nsfd.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/nsfd.c b/fs/nsfd.c
new file mode 100644
index 0000000..71bcc55
--- /dev/null
+++ b/fs/nsfd.c
@@ -0,0 +1,278 @@
+#include <linux/nstype.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <net/net_namespace.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/cred.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/nsproxy.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+
+static struct vfsmount *nsfd_mnt __read_mostly;
+static struct inode *nsfd_inode;
+
+/*
+ * File operations shared by every nsfd file.  Only llseek is provided
+ * (no_llseek).  nsfd_fget() compares a file's f_op against this table to
+ * recognise files that belong to nsfd.
+ */
+static const struct file_operations nsfd_file_operations = {
+ .llseek = no_llseek,
+};
+
+
+/*
+ * get_sb callback for nsfd_fs_type.  Delegates to get_sb_pseudo() using
+ * the "nsfd:" name and NSFD_FS_MAGIC; dev_name, flags and data are unused.
+ */
+static int nsfd_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ return get_sb_pseudo(fs_type, "nsfd:", NULL, NSFD_FS_MAGIC, mnt);
+}
+
+/*
+ * nsfd_dname - d_dname callback producing the fixed name "nsfd"
+ * @dentry: dentry being named (unused, every nsfd dentry has the same name)
+ * @buffer: caller supplied scratch buffer
+ * @buflen: size of @buffer in bytes
+ *
+ * Returns a pointer to the NUL terminated name inside @buffer, or
+ * ERR_PTR(-ENAMETOOLONG) when @buffer cannot hold it.
+ *
+ * The name is written at the tail of @buffer, the same placement
+ * dynamic_dname() uses, because d_path() users may derive the string
+ * length from the distance between the returned pointer and the end of
+ * the buffer.
+ */
+static char *nsfd_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	static const char name[] = "nsfd";
+
+	/* Reject negative sizes before the unsigned comparison. */
+	if (buflen < 0 || sizeof(name) > (size_t)buflen)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	buffer += buflen - sizeof(name);
+	return memcpy(buffer, name, sizeof(name));
+}
+
+/*
+ * Dentry operations that nsfd_getfile() installs on a freshly allocated
+ * dentry.  Only d_dname is set; there is no d_release, so a dentry still
+ * using this table drops no namespace reference when it goes away.
+ */
+static const struct dentry_operations nsfd_dentry_operations = {
+ .d_dname = nsfd_dname,
+};
+
+/*
+ * Filesystem type behind nsfd files.  It is registered and kern_mount()ed
+ * by nsfd_init().
+ */
+static struct file_system_type nsfd_fs_type = {
+ .name = "nsfd",
+ .get_sb = nsfd_get_sb,
+ .kill_sb = kill_anon_super,
+
+};
+
+/*
+ * d_release callback for network namespace dentries: drops the struct net
+ * reference that the nsfd() syscall stored in d_fsdata, then clears the
+ * pointer.
+ */
+static void netns_dentry_release(struct dentry *dentry)
+{
+	struct net *net = dentry->d_fsdata;
+
+	put_net(net);
+	dentry->d_fsdata = NULL;
+}
+
+/*
+ * Dentry operations for NSTYPE_NET nsfd files: the common d_dname plus a
+ * d_release that puts the network namespace held in d_fsdata.
+ */
+static const struct dentry_operations netns_dentry_operations = {
+ .d_dname = nsfd_dname,
+ .d_release = netns_dentry_release,
+};
+
+/*
+ * Per namespace type dentry operations, indexed by NSTYPE_*.  Looked up
+ * through nstype_dops(), which bounds-checks the index.
+ */
+static const struct dentry_operations *nsfd_dops[] = {
+ [NSTYPE_NET] = &netns_dentry_operations,
+};
+
+/*
+ * nstype_dops - map a namespace type to its dentry operations
+ * @nstype: NSTYPE_* value supplied by userspace
+ *
+ * Returns the nsfd_dops[] entry for @nstype, or NULL when @nstype lies
+ * outside the table.
+ */
+static const struct dentry_operations *nstype_dops(unsigned long nstype)
+{
+	const unsigned long nr_types = sizeof(nsfd_dops)/sizeof(nsfd_dops[0]);
+
+	if (nstype >= nr_types)
+		return NULL;
+
+	return nsfd_dops[nstype];
+}
+
+/*
+ * nsfd_fget - look up @fd and verify it is an nsfd file of type @nstype
+ * @fd: file descriptor in the calling task
+ * @nstype: NSTYPE_* value the descriptor must match
+ *
+ * Returns the file with the reference obtained from fget() on success;
+ * the caller must fput() it.  Returns ERR_PTR(-EINVAL) for an unknown
+ * @nstype or a descriptor that is not an nsfd file of that type, and
+ * ERR_PTR(-EBADF) when @fd is not an open descriptor.
+ */
+static struct file *nsfd_fget(int fd, unsigned long nstype)
+{
+	const struct dentry_operations *wanted = nstype_dops(nstype);
+	struct file *filp;
+
+	if (!wanted)
+		return ERR_PTR(-EINVAL);
+
+	filp = fget(fd);
+	if (!filp)
+		return ERR_PTR(-EBADF);
+
+	/* Both the file ops and the per-type dentry ops have to match. */
+	if (filp->f_op == &nsfd_file_operations &&
+	    filp->f_path.dentry->d_op == wanted)
+		return filp;
+
+	fput(filp);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * nsfd_mkinode - allocate the inode used by nsfd files
+ *
+ * Allocates an inode on the nsfd mount's superblock and fills in its file
+ * operations, state, mode, ownership (the caller's fsuid/fsgid) and
+ * timestamps.  Returns the inode, or ERR_PTR(-ENOMEM) when new_inode()
+ * fails.
+ */
+static struct inode *nsfd_mkinode(void)
+{
+	struct inode *ino = new_inode(nsfd_mnt->mnt_sb);
+
+	if (!ino)
+		return ERR_PTR(-ENOMEM);
+
+	ino->i_fop = &nsfd_file_operations;
+
+	/*
+	 * Start out flagged dirty: mark_inode_dirty() will then believe the
+	 * inode already sits on the dirty list and never move it there.
+	 */
+	ino->i_state = I_DIRTY;
+
+	ino->i_mode = S_IRUSR | S_IWUSR;
+	ino->i_uid = current_fsuid();
+	ino->i_gid = current_fsgid();
+	ino->i_atime = ino->i_mtime = ino->i_ctime = CURRENT_TIME;
+
+	return ino;
+}
+
+
+/*
+ * nsfd_getfile - create a new, not yet installed, nsfd file
+ *
+ * Allocates an empty-named dentry under the nsfd mount's root, attaches
+ * the shared nsfd_inode to it and wraps the result in a read-only file.
+ * The dentry starts out with nsfd_dentry_operations; the nsfd() syscall
+ * later replaces d_op and fills in d_fsdata.
+ *
+ * Returns the file, ERR_PTR(-ENOMEM) if the dentry cannot be allocated,
+ * or ERR_PTR(-ENFILE) if alloc_file() fails.
+ */
+static struct file *nsfd_getfile(void)
+{
+ struct qstr name = { .name = "" };
+ struct path path;
+ struct file *file;
+
+ path.dentry = d_alloc(nsfd_mnt->mnt_sb->s_root, &name);
+ if (!path.dentry)
+ return ERR_PTR(-ENOMEM);
+
+ path.mnt = mntget(nsfd_mnt);
+
+ /*
+ * We know the nsfd_inode inode count is always greater than zero,
+ * so we can avoid doing an igrab() and we can use an open-coded
+ * atomic_inc().
+ */
+ atomic_inc(&nsfd_inode->i_count);
+ path.dentry->d_op = &nsfd_dentry_operations;
+ d_instantiate(path.dentry, nsfd_inode);
+
+ file = alloc_file(&path, FMODE_READ, &nsfd_file_operations);
+ if (!file) {
+ /* Releases both the dentry and the mount reference held in path. */
+ path_put(&path);
+ return ERR_PTR(-ENFILE);
+ }
+ file->f_mapping = nsfd_inode->i_mapping;
+
+ file->f_pos = 0;
+ file->f_flags = O_RDONLY;
+ file->f_version = 0;
+ file->private_data = NULL;
+
+ return file;
+}
+
+/*
+ * nsfd_getns - fetch a namespace of type @nstype from the task @pid
+ * @pid: pid of the target task as seen by the caller; 0 selects current
+ * @nstype: NSTYPE_* value choosing which namespace to return
+ *
+ * The task and nsproxy lookups run under rcu_read_lock().  Returns the
+ * result of get_net() for NSTYPE_NET, or an ERR_PTR:
+ *   -ESRCH  no such task, or the task has no nsproxy
+ *   -EPERM  the caller fails ptrace_may_access(PTRACE_MODE_ATTACH)
+ *   -EINVAL @nstype is not handled here
+ */
+static void *nsfd_getns(pid_t pid, unsigned long nstype)
+{
+	struct task_struct *target;
+	struct nsproxy *proxy;
+	void *result;
+
+	rcu_read_lock();
+
+	target = pid ? find_task_by_vpid(pid) : current;
+	if (!target) {
+		result = ERR_PTR(-ESRCH);
+		goto unlock;
+	}
+
+	if (!ptrace_may_access(target, PTRACE_MODE_ATTACH)) {
+		result = ERR_PTR(-EPERM);
+		goto unlock;
+	}
+
+	proxy = task_nsproxy(target);
+	if (!proxy) {
+		result = ERR_PTR(-ESRCH);
+		goto unlock;
+	}
+
+	switch (nstype) {
+	case NSTYPE_NET:
+		result = get_net(proxy->net_ns);
+		break;
+	default:
+		result = ERR_PTR(-EINVAL);
+		break;
+	}
+
+unlock:
+	rcu_read_unlock();
+	return result;
+}
+
+/*
+ * nsfd(pid, nstype) - return a file descriptor for a task's namespace
+ *
+ * Creates an nsfd file, stores the namespace of type @nstype belonging to
+ * task @pid in the file's dentry (d_fsdata) together with the matching
+ * per-type dentry operations, and installs the file in a new descriptor.
+ *
+ * Returns the new descriptor, -EINVAL for an unknown @nstype, or the
+ * error produced by nsfd_getfile(), nsfd_getns() or get_unused_fd().
+ */
+SYSCALL_DEFINE2(nsfd, pid_t, pid, unsigned long, nstype)
+{
+	const struct dentry_operations *ops = nstype_dops(nstype);
+	struct file *filp;
+	void *ns;
+	int fd;
+
+	if (!ops)
+		return -EINVAL;
+
+	filp = nsfd_getfile();
+	if (IS_ERR(filp))
+		return PTR_ERR(filp);
+
+	ns = nsfd_getns(pid, nstype);
+	if (IS_ERR(ns)) {
+		fd = PTR_ERR(ns);
+		goto out_fput;
+	}
+
+	/* From here on, dropping the file also releases the namespace. */
+	filp->f_dentry->d_fsdata = ns;
+	filp->f_dentry->d_op = ops;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		goto out_fput;
+
+	fd_install(fd, filp);
+	return fd;
+
+out_fput:
+	fput(filp);
+	return fd;
+}
+
+
+/*
+ * setns(nstype, fd) - move the calling task into the namespace behind @fd
+ *
+ * Requires CAP_SYS_ADMIN.  @fd must be an nsfd descriptor of type @nstype;
+ * the namespace stored in its dentry is handed to set_namespace().
+ *
+ * Returns 0 once set_namespace() has been called, -EPERM without the
+ * capability, or the error from nsfd_fget().  set_namespace() returns
+ * void, so nothing it does can be reflected in the return value.
+ */
+SYSCALL_DEFINE2(setns, unsigned long, nstype, int, fd)
+{
+	struct file *nsfile;
+	void *ns;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	nsfile = nsfd_fget(fd, nstype);
+	if (IS_ERR(nsfile))
+		return PTR_ERR(nsfile);
+
+	ns = nsfile->f_dentry->d_fsdata;
+	set_namespace(nstype, ns);
+
+	fput(nsfile);
+	return 0;
+}
+
+
+/*
+ * nsfd_init - register and mount the nsfd filesystem, create its inode
+ *
+ * Registers nsfd_fs_type, kern_mount()s it into nsfd_mnt and allocates the
+ * shared nsfd_inode.  Returns 0 on success.  On any failure the steps
+ * already taken are undone and the kernel panics, since the nsfd() and
+ * setns() syscalls rely on nsfd_mnt and nsfd_inode being set up.
+ */
+static int __init nsfd_init(void)
+{
+	int error;
+
+	error = register_filesystem(&nsfd_fs_type);
+	if (error)
+		goto err_exit;
+
+	nsfd_mnt = kern_mount(&nsfd_fs_type);
+	if (IS_ERR(nsfd_mnt)) {
+		error = PTR_ERR(nsfd_mnt);
+		goto err_unregister_filesystem;
+	}
+
+	nsfd_inode = nsfd_mkinode();
+	if (IS_ERR(nsfd_inode)) {
+		error = PTR_ERR(nsfd_inode);
+		goto err_mntput;
+	}
+
+	return 0;
+
+err_mntput:
+	mntput(nsfd_mnt);
+err_unregister_filesystem:
+	unregister_filesystem(&nsfd_fs_type);
+err_exit:
+	/*
+	 * No KERN_* level here: panic() emits its own prefix, so a level
+	 * marker inside the format would end up as literal text mid-line.
+	 */
+	panic("nsfd_init() failed (%d)\n", error);
+}
+
+fs_initcall(nsfd_init);
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 76285e0..a4fe6eb 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -26,6 +26,7 @@
#define ISOFS_SUPER_MAGIC 0x9660
#define JFFS2_SUPER_MAGIC 0x72b6
#define ANON_INODE_FS_MAGIC 0x09041934
+#define NSFD_FS_MAGIC 0x6e736664
#define MINIX_SUPER_MAGIC 0x137F /* original minix fs */
#define MINIX_SUPER_MAGIC2 0x138F /* minix fs, 30 char names */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 7b370c7..45f1e07 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -65,6 +65,7 @@ static inline struct nsproxy *task_nsproxy(struct task_struct *tsk)
int copy_namespaces(unsigned long flags, struct task_struct *tsk);
void exit_task_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
+void set_namespace(unsigned long nstype, void *ns);
void free_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
struct fs_struct *);
diff --git a/include/linux/nstype.h b/include/linux/nstype.h
new file mode 100644
index 0000000..3bdf856
--- /dev/null
+++ b/include/linux/nstype.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_NSTYPE_H
+#define _LINUX_NSTYPE_H
+
+/*
+ * Namespace type selectors passed as the nstype argument of the nsfd()
+ * and setns() system calls.  The values also index the nsfd_dops[] table
+ * in fs/nsfd.c.
+ */
+#define NSTYPE_NET 0
+
+#endif /* _LINUX_NSTYPE_H */
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9..574461c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -21,6 +21,7 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/nstype.h>
static struct kmem_cache *nsproxy_cachep;
@@ -221,6 +222,22 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
+/*
+ * set_namespace - switch the current task to namespace @ns of type @nstype
+ * @nstype: NSTYPE_* value selecting which namespace to replace
+ * @ns: the namespace object; for NSTYPE_NET it is used as a struct net
+ *
+ * Builds a fresh nsproxy for current with create_new_namespaces(), swaps
+ * @ns in for the selected type (taking a reference on @ns and dropping the
+ * one on the namespace it replaces) and installs the result with
+ * switch_task_namespaces().  An @nstype without a case below installs the
+ * new nsproxy unmodified.
+ *
+ * create_new_namespaces() reports failure through an ERR_PTR.  This
+ * function returns void, so in that case the task is simply left in its
+ * current namespaces and the failure is not visible to the caller.
+ */
+void set_namespace(unsigned long nstype, void *ns)
+{
+	struct task_struct *tsk = current;
+	struct nsproxy *new_nsproxy;
+
+	new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+	if (IS_ERR(new_nsproxy))
+		return;
+
+	switch (nstype) {
+	case NSTYPE_NET:
+		put_net(new_nsproxy->net_ns);
+		new_nsproxy->net_ns = get_net(ns);
+		break;
+	default:
+		break;
+	}
+
+	switch_task_namespaces(tsk, new_nsproxy);
+}
+
static int __init nsproxy_cache_init(void)
{
nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
--
1.6.5.2.143.g8cc62
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists