[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CANDihLH6cXC+qQ5qm8dm3FnF1Wxv=RF3LsQMaoXggsWH=hscDw@mail.gmail.com>
Date: Sat, 16 Jun 2018 23:20:09 -0700
From: Alistair Strachan <astrachan@...gle.com>
To: "Eric W. Biederman" <ebiederm@...ssion.com>
Cc: linux-fsdevel@...r.kernel.org,
Seth Forshee <seth.forshee@...onical.com>,
Djalal Harouni <tixxdz@...il.com>, kernel-team@...roid.com,
linux-kernel@...r.kernel.org, containers@...ts.linux-foundation.org
Subject: Re: [PATCH v2] proc: Simplify and fix proc by removing the kernel mount
Hi Eric,
Thanks a lot for looking into this problem.
On Sat, Jun 16, 2018 at 7:55 PM Eric W. Biederman <ebiederm@...ssion.com> wrote:
>
>
> Today there are three users of proc_mnt.
> - The legacy sysctl system call implementation.
> - The uml mconsole driver.
> - The process cleanup function proc_flush_task.
>
> The first two are slow path and essentially unused. I expect soon we
> will be able to remove the legacy sysctl system call entirely. To
> keep them working for now a new wrapper file_open_proc_is added to
> mount and unmount proc around file_open_root. Which nicely removes
> the need for a always mounted proc instance for these cases.
>
> Handling proc_flush_task which is regularly used requires a little more
> work. First I optimize proc_flush_task to do nothing where there is
> evidence that there are no entries in proc, by looking at pid->count.
> Then I carefully update proc_fill_super and proc_kill_sb to maintain a
> ns->proc_super pointer to the super block for proc. This allows
> proc_flush_task to find the appropriate instance of proc via rcu.
>
> Once the appropriate instance of proc is found in proc_flush_task
> atomic_inc_not_zero is used to increase the s_active count ensuring
> proc_kill_sb will not be called, until the superblock is deactivated.
> This makes it safe to inspect the instance of proc and invalidate any
> dentries that mention the exiting task.
>
> The two extra atomics operations in exit are not my favorite but given
> that exit is already almost completely serialized with the task lock I
> do not expect this change will be measurable.
>
> The benefit for all of this change is that one of the most error prone
> and tricky parts of the pid namespace implementation, maintaining
> kernel mounts of proc is removed.
>
> In addition removing the unnecessary complexity of the kernel mount
> fixes a regression that caused the proc mount options to be ignored.
> Now that the initial mount of proc comes from userspace, those mount
> options are again honored. This fixes Android's usage of the proc
> hidepid option.
>
> Reported-by: Alistair Strachan <astrachan@...gle.com>
> Cc: stable@...r.kernel.org
> Fixes: e94591d0d90c ("proc: Convert proc_mount to use mount_ns.")
> Signed-off-by: "Eric W. Biederman" <ebiederm@...ssion.com>
> ---
>
> Alistair if you can test and confirm this fixes your issue I will add
> your tested by and send the fix to Linus.
I tested v2 with both UML and qemu-system-x86_64 / ARCH=x86_64 against
4.18-rc1, 4.14 and 4.9 and I couldn't break it. The hidepid problem is
resolved, and the mount flags can now only be specified on the first
userspace mount for that pid namespace.
Tested-by: Alistair Strachan <astrachan@...gle.com>
> Since my earlier posting I have spot tested this. Fixed a few bugs that
> showed up and verified my changes work. So I think this is ready to go
> unless someone looks at this and in testing or code review spots a bug.
Agreed!
> Eric
>
> arch/um/drivers/mconsole_kern.c | 4 ++--
> fs/proc/base.c | 36 ++++++++++++++++++++++++++-------
> fs/proc/inode.c | 5 ++++-
> fs/proc/root.c | 28 ++++++++++---------------
> include/linux/pid_namespace.h | 3 +--
> include/linux/proc_ns.h | 7 ++-----
> kernel/pid.c | 8 --------
> kernel/pid_namespace.c | 7 -------
> kernel/sysctl_binary.c | 5 ++---
> 9 files changed, 51 insertions(+), 52 deletions(-)
>
> diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
> index d5f9a2d1da1b..36af0e02d56b 100644
> --- a/arch/um/drivers/mconsole_kern.c
> +++ b/arch/um/drivers/mconsole_kern.c
> @@ -27,6 +27,7 @@
> #include <linux/file.h>
> #include <linux/uaccess.h>
> #include <asm/switch_to.h>
> +#include <linux/proc_ns.h>
>
> #include <init.h>
> #include <irq_kern.h>
> @@ -124,7 +125,6 @@ void mconsole_log(struct mc_request *req)
>
> void mconsole_proc(struct mc_request *req)
> {
> - struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
> char *buf;
> int len;
> struct file *file;
> @@ -135,7 +135,7 @@ void mconsole_proc(struct mc_request *req)
> ptr += strlen("proc");
> ptr = skip_spaces(ptr);
>
> - file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0);
> + file = file_open_proc(ptr, O_RDONLY, 0);
> if (IS_ERR(file)) {
> mconsole_reply(req, "Failed to open file", 1, 0);
> printk(KERN_ERR "open /proc/%s: %ld\n", ptr, PTR_ERR(file));
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 1b2ede6abcdf..cd7b68a64ed1 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -3052,7 +3052,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
> .permission = proc_pid_permission,
> };
>
> -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> +static void proc_flush_task_root(struct dentry *proc_root, pid_t pid, pid_t tgid)
> {
> struct dentry *dentry, *leader, *dir;
> char buf[10 + 1];
> @@ -3061,7 +3061,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> name.name = buf;
> name.len = snprintf(buf, sizeof(buf), "%u", pid);
> /* no ->d_hash() rejects on procfs */
> - dentry = d_hash_and_lookup(mnt->mnt_root, &name);
> + dentry = d_hash_and_lookup(proc_root, &name);
> if (dentry) {
> d_invalidate(dentry);
> dput(dentry);
> @@ -3072,7 +3072,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>
> name.name = buf;
> name.len = snprintf(buf, sizeof(buf), "%u", tgid);
> - leader = d_hash_and_lookup(mnt->mnt_root, &name);
> + leader = d_hash_and_lookup(proc_root, &name);
> if (!leader)
> goto out;
>
> @@ -3102,8 +3102,8 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> * @task: task that should be flushed.
> *
> * When flushing dentries from proc, one needs to flush them from global
> - * proc (proc_mnt) and from all the namespaces' procs this task was seen
> - * in. This call is supposed to do all of this job.
> + * proc and from all the namespaces' procs this task was seen in. This call
> + * is supposed to do all of this job.
> *
> * Looks in the dcache for
> * /proc/@pid
> @@ -3127,15 +3127,37 @@ void proc_flush_task(struct task_struct *task)
> int i;
> struct pid *pid, *tgid;
> struct upid *upid;
> + int expected = 1;
>
> pid = task_pid(task);
> tgid = task_tgid(task);
> + if (thread_group_leader(task)) {
> + if (task_pgrp(task) == pid)
> + expected++;
> + if (task_session(task) == pid)
> + expected++;
> + }
> +
> + /* Nothing to do if proc inodes have not take a reference to pid */
> + if (atomic_read(&pid->count) == expected)
> + return;
>
> + rcu_read_lock();
> for (i = 0; i <= pid->level; i++) {
> + struct super_block *sb;
> upid = &pid->numbers[i];
> - proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> - tgid->numbers[i].nr);
> +
> + sb = rcu_dereference(upid->ns->proc_super);
> + if (!sb || !atomic_inc_not_zero(&sb->s_active))
> + continue;
> + rcu_read_unlock();
> +
> + proc_flush_task_root(sb->s_root, upid->nr, tgid->numbers[i].nr);
> + deactivate_super(sb);
> +
> + rcu_read_lock();
> }
> + rcu_read_unlock();
> }
>
> static int proc_pid_instantiate(struct inode *dir,
> diff --git a/fs/proc/inode.c b/fs/proc/inode.c
> index 2cf3b74391ca..1dd9514fa068 100644
> --- a/fs/proc/inode.c
> +++ b/fs/proc/inode.c
> @@ -532,5 +532,8 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
> if (ret) {
> return ret;
> }
> - return proc_setup_thread_self(s);
> + ret = proc_setup_thread_self(s);
> +
> + rcu_assign_pointer(ns->proc_super, s);
> + return ret;
> }
> diff --git a/fs/proc/root.c b/fs/proc/root.c
> index 61b7340b357a..59ca06c386a0 100644
> --- a/fs/proc/root.c
> +++ b/fs/proc/root.c
> @@ -89,14 +89,7 @@ int proc_remount(struct super_block *sb, int *flags, char *data)
> static struct dentry *proc_mount(struct file_system_type *fs_type,
> int flags, const char *dev_name, void *data)
> {
> - struct pid_namespace *ns;
> -
> - if (flags & SB_KERNMOUNT) {
> - ns = data;
> - data = NULL;
> - } else {
> - ns = task_active_pid_ns(current);
> - }
> + struct pid_namespace *ns = task_active_pid_ns(current);
>
> return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
> }
> @@ -106,6 +99,7 @@ static void proc_kill_sb(struct super_block *sb)
> struct pid_namespace *ns;
>
> ns = (struct pid_namespace *)sb->s_fs_info;
> + rcu_assign_pointer(ns->proc_super, NULL);
> if (ns->proc_self)
> dput(ns->proc_self);
> if (ns->proc_thread_self)
> @@ -208,19 +202,19 @@ struct proc_dir_entry proc_root = {
> .inline_name = "/proc",
> };
>
> -int pid_ns_prepare_proc(struct pid_namespace *ns)
> +#if defined(CONFIG_SYSCTL_SYSCALL) || defined(CONFIG_MCONSOLE)
> +struct file *file_open_proc(const char *pathname, int flags, umode_t mode)
> {
> struct vfsmount *mnt;
> + struct file *file;
>
> - mnt = kern_mount_data(&proc_fs_type, ns);
> + mnt = kern_mount(&proc_fs_type);
> if (IS_ERR(mnt))
> - return PTR_ERR(mnt);
> + return ERR_CAST(mnt);
>
> - ns->proc_mnt = mnt;
> - return 0;
> -}
> + file = file_open_root(mnt->mnt_root, mnt, pathname, flags, mode);
> + kern_unmount(mnt);
>
> -void pid_ns_release_proc(struct pid_namespace *ns)
> -{
> - kern_unmount(ns->proc_mnt);
> + return file;
> }
> +#endif
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index 49538b172483..dfa70858b19a 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -31,7 +31,7 @@ struct pid_namespace {
> unsigned int level;
> struct pid_namespace *parent;
> #ifdef CONFIG_PROC_FS
> - struct vfsmount *proc_mnt;
> + struct super_block __rcu *proc_super;
> struct dentry *proc_self;
> struct dentry *proc_thread_self;
> #endif
> @@ -40,7 +40,6 @@ struct pid_namespace {
> #endif
> struct user_namespace *user_ns;
> struct ucounts *ucounts;
> - struct work_struct proc_work;
> kgid_t pid_gid;
> int hide_pid;
> int reboot; /* group exit code if this pidns was rebooted */
> diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
> index d31cb6215905..8f1b9edf40ba 100644
> --- a/include/linux/proc_ns.h
> +++ b/include/linux/proc_ns.h
> @@ -47,16 +47,11 @@ enum {
>
> #ifdef CONFIG_PROC_FS
>
> -extern int pid_ns_prepare_proc(struct pid_namespace *ns);
> -extern void pid_ns_release_proc(struct pid_namespace *ns);
> extern int proc_alloc_inum(unsigned int *pino);
> extern void proc_free_inum(unsigned int inum);
>
> #else /* CONFIG_PROC_FS */
>
> -static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
> -static inline void pid_ns_release_proc(struct pid_namespace *ns) {}
> -
> static inline int proc_alloc_inum(unsigned int *inum)
> {
> *inum = 1;
> @@ -86,4 +81,6 @@ extern int ns_get_name(char *buf, size_t size, struct task_struct *task,
> const struct proc_ns_operations *ns_ops);
> extern void nsfs_init(void);
>
> +extern struct file *file_open_proc(const char *pathname, int flags, umode_t mode);
> +
> #endif /* _LINUX_PROC_NS_H */
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 157fe4b19971..7a1a4f39e527 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -143,9 +143,6 @@ void free_pid(struct pid *pid)
> /* Handle a fork failure of the first process */
> WARN_ON(ns->child_reaper);
> ns->pid_allocated = 0;
> - /* fall through */
> - case 0:
> - schedule_work(&ns->proc_work);
> break;
> }
>
> @@ -204,11 +201,6 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> tmp = tmp->parent;
> }
>
> - if (unlikely(is_child_reaper(pid))) {
> - if (pid_ns_prepare_proc(ns))
> - goto out_free;
> - }
> -
> get_pid_ns(ns);
> atomic_set(&pid->count, 1);
> for (type = 0; type < PIDTYPE_MAX; ++type)
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 2a2ac53d8b8b..3018cc18ac38 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -58,12 +58,6 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
> return READ_ONCE(*pkc);
> }
>
> -static void proc_cleanup_work(struct work_struct *work)
> -{
> - struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
> - pid_ns_release_proc(ns);
> -}
> -
> static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
> {
> return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
> @@ -115,7 +109,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
> ns->user_ns = get_user_ns(user_ns);
> ns->ucounts = ucounts;
> ns->pid_allocated = PIDNS_ADDING;
> - INIT_WORK(&ns->proc_work, proc_cleanup_work);
>
> return ns;
>
> diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
> index 07148b497451..b655410fa05a 100644
> --- a/kernel/sysctl_binary.c
> +++ b/kernel/sysctl_binary.c
> @@ -17,6 +17,7 @@
> #include <linux/uuid.h>
> #include <linux/slab.h>
> #include <linux/compat.h>
> +#include <linux/proc_ns.h>
>
> #ifdef CONFIG_SYSCTL_SYSCALL
>
> @@ -1278,7 +1279,6 @@ static ssize_t binary_sysctl(const int *name, int nlen,
> void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
> {
> const struct bin_table *table = NULL;
> - struct vfsmount *mnt;
> struct file *file;
> ssize_t result;
> char *pathname;
> @@ -1301,8 +1301,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
> goto out_putname;
> }
>
> - mnt = task_active_pid_ns(current)->proc_mnt;
> - file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0);
> + file = file_open_proc(pathname, flags, 0);
> result = PTR_ERR(file);
> if (IS_ERR(file))
> goto out_putname;
> --
> 2.17.1
Powered by blists - more mailing lists