[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87a8s6a4zm.fsf@x220.int.ebiederm.org>
Date: Mon, 28 Sep 2015 11:22:21 -0500
From: ebiederm@...ssion.com (Eric W. Biederman)
To: Konstantin Khlebnikov <khlebnikov@...dex-team.ru>
Cc: linux-api@...r.kernel.org, containers@...ts.linux-foundation.org,
linux-kernel@...r.kernel.org,
Roman Gushchin <klamm@...dex-team.ru>,
Serge Hallyn <serge.hallyn@...ntu.com>,
Oleg Nesterov <oleg@...hat.com>,
Chen Fan <chen.fan.fnst@...fujitsu.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Stéphane Graber <stgraber@...ntu.com>
Subject: Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
Konstantin Khlebnikov <khlebnikov@...dex-team.ru> writes:
> pid_t getvpid(pid_t pid, int source, int target);
>
> This syscall converts pid from source pid-namespace into pid visible
> in target pid-namespace. If pid is unreachable from target namespace
> then getvpid() returns zero.
Two minor things.
Can we please call this translate_pid? getvpid does not really cover
what this syscall does.
Can you please split the syscall wiring-up into a separate patch? You
goofed it up this round, and it just adds noise when reviewing the core syscall.
> Namespaces are defined by file descriptors pointing to entries in
> proc (/proc/[pid]/ns/pid). If argument is negative then current pid
> namespace is used.
>
> If pid is negative then getvpid() returns the pid of the parent of task -pid.
>
> Possible error codes:
> ESRCH - task not found
> EBADF - closed file descriptor
> EINVAL - not pid-namespace file descriptor
>
> Such conversion is required for interaction between processes from
> different pid-namespaces. For example, a system service on the host
> that provides access to a restricted set of privileged operations for
> clients in containers has to convert pids back and forth.
>
> Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but
> this interface works in only one direction, and even that is non-trivial.
>
> Another option is passing pids with credentials via a unix socket, but
> this solution requires a lot of preparation and CAP_SYS_ADMIN for
> sending arbitrary pids.
>
> This syscall works in both directions, it's fast and simple.
>
> Examples:
> getvpid(pid, ns, -1) - get pid in our pid namespace
> getvpid(pid, -1, ns) - get pid in container
> getvpid(pid, -1, ns) > 0 - is pid reachable from container?
> getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2?
> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
> getvpid(1, ns, -1) - get init task of pid-namespace
> getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace
> getvpid(-pid, -1, -1) - get ppid by pid
>
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@...dex-team.ru>
>
> --
>
> v1: https://lkml.org/lkml/2015/9/15/411
> v2: https://lkml.org/lkml/2015/9/24/278
> v3:
> * use proc_ns_fdget()
> * update description
> * rebase to next-20150925
> * fix conflict with mlock2
> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> include/linux/syscalls.h | 1 +
> include/uapi/asm-generic/unistd.h | 4 ++-
> kernel/sys.c | 51 ++++++++++++++++++++++++++++++++
> 5 files changed, 57 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 143ef9f37932..c36c2c65d204 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -383,3 +383,4 @@
> 374 i386 userfaultfd sys_userfaultfd
> 375 i386 membarrier sys_membarrier
> 376 i386 mlock2 sys_mlock2
> +377 i386 getvpid sys_getvpid
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 314a90bfc09c..90bbbc7fdbe0 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -332,6 +332,7 @@
> 323 common userfaultfd sys_userfaultfd
> 324 common membarrier sys_membarrier
> 325 common mlock2 sys_mlock2
> +326 common getvpid sys_getvpid
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index a156b82dd14c..dbb5638260b5 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
> asmlinkage long sys_alarm(unsigned int seconds);
> asmlinkage long sys_getpid(void);
> asmlinkage long sys_getppid(void);
> +asmlinkage long sys_getvpid(pid_t pid, int source, int target);
> asmlinkage long sys_getuid(void);
> asmlinkage long sys_geteuid(void);
> asmlinkage long sys_getgid(void);
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 1324b0292ec2..2c1123130f90 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
> __SYSCALL(__NR_membarrier, sys_membarrier)
> #define __NR_mlock2 284
> __SYSCALL(__NR_mlock2, sys_mlock2)
> +#define __NR_mlock2 285
> +__SYSCALL(__NR_getvpid, sys_getvpid)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 285
> +#define __NR_syscalls 286
>
> /*
> * All syscalls below here should go away really,
> diff --git a/kernel/sys.c b/kernel/sys.c
> index fa2f2f671a5c..1e28a36b84fa 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -46,6 +46,7 @@
> #include <linux/syscalls.h>
> #include <linux/kprobes.h>
> #include <linux/user_namespace.h>
> +#include <linux/proc_ns.h>
> #include <linux/binfmts.h>
>
> #include <linux/sched.h>
> @@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid)
> return pid;
> }
>
> +SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target)
> +{
> + struct pid_namespace *source_ns, *target_ns;
> + struct fd source_fd = {}, target_fd = {};
> + struct pid *struct_pid;
> + struct ns_common *ns;
> + pid_t result;
> +
> + if (source >= 0) {
> + ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd);
> + result = PTR_ERR(ns);
> + if (IS_ERR(ns))
> + goto out;
> + source_ns = container_of(ns, struct pid_namespace, ns);
> + } else
> + source_ns = task_active_pid_ns(current);
> +
> + if (target >= 0) {
> + ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd);
> + result = PTR_ERR(ns);
> + if (IS_ERR(ns))
> + goto out;
> + target_ns = container_of(ns, struct pid_namespace, ns);
> + } else
> + target_ns = task_active_pid_ns(current);
> +
> + rcu_read_lock();
> + struct_pid = find_pid_ns(abs(pid), source_ns);
> +
> + if (struct_pid && pid < 0) {
> + struct task_struct *task;
> +
> + task = pid_task(struct_pid, PIDTYPE_PID);
> + if (task)
> + task = rcu_dereference(task->real_parent);
> + struct_pid = task ? task_pid(task) : NULL;
> + }
> +
> + if (struct_pid)
> + result = pid_nr_ns(struct_pid, target_ns);
> + else
> + result = -ESRCH;
> + rcu_read_unlock();
> +
> +out:
> + fdput(target_fd);
> + fdput(source_fd);
> + return result;
> +}
> +
> SYSCALL_DEFINE0(getuid)
> {
> /* Only we change this so SMP safe */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists