[PATCH 2/2] pid_ns: refactor the buggy pid_ns_release_proc() logic pid_namespace holds ns->proc_mnt, while this vfsmount has a referene to the namespace via PROC_I(sb->s_root->d_inode)->pid. To break this circle /sbin/init does mntput() in pid_ns_release_proc(). See 6f4e6433. But we have the following problems: - Nobody does mntput() if copy_process() fails after pid_ns_prepare_proc(). - proc_flush_task() checks upid->nr == 1 to verify we are init, this is wrong if a multi-threaded init does exec. - As Louis pointed out, this namespace can have the detached EXIT_DEAD tasks which can use ns->proc_mnt after this mntput(). With this patch only pid_namespace has a reference to ns->proc_mnt, and mntput(ns->proc_mnt) is called by destroy_pid_namespace() paths when we know that this ns must not have any references (in particular, there are no pids in this namespace). Changes: - kill proc_flush_task()->pid_ns_release_proc() - change fs/proc/root.c so that we don't create the "artificial" references to the namespace or its pid==1. - change destroy_pid_namespace() to call pid_ns_release_proc(). - change pid_ns_release_proc() to clear s_root->d_inode->pid and sb->s_fs_info. The caller is destroy_pid_namespace(), both pid and ns must not have any reference. - change proc_self_readlink() and proc_pid_lookup() to check sb->s_fs_info != NULL to detect the case when the proc fs is kept mounted after pid_ns_release_proc(). Reported-by: Louis Rilling Signed-off-by: Oleg Nesterov --- kernel/pid_namespace.c | 2 ++ fs/proc/base.c | 20 ++++++++++++-------- fs/proc/root.c | 11 +++++++---- 3 files changed, 21 insertions(+), 12 deletions(-) --- 35-rc3/kernel/pid_namespace.c~PNS_6_BREAK_CIRCLE 2010-06-24 15:17:46.000000000 +0200 +++ 35-rc3/kernel/pid_namespace.c 2010-06-24 20:48:18.000000000 +0200 @@ -128,6 +128,8 @@ static void destroy_pid_namespace(struct { int i; + pid_ns_release_proc(ns); + for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); kmem_cache_free(pid_ns_cachep, ns); --- 35-rc3/fs/proc/base.c~PNS_6_BREAK_CIRCLE 2010-06-24 15:16:21.000000000 +0200 +++ 35-rc3/fs/proc/base.c 2010-06-24 20:48:18.000000000 +0200 @@ -2349,11 +2349,17 @@ static const struct file_operations proc /* * /proc/self: */ + +static inline pid_t self_tgid(struct dentry *dentry) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + return ns ? task_tgid_nr_ns(current, ns) : 0; +} + static int proc_self_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); + pid_t tgid = self_tgid(dentry); char tmp[PROC_NUMBUF]; if (!tgid) return -ENOENT; @@ -2363,8 +2369,7 @@ static int proc_self_readlink(struct den static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); + pid_t tgid = self_tgid(dentry); char *name = ERR_PTR(-ENOENT); if (tgid) { name = __getname(); @@ -2745,10 +2750,6 @@ void proc_flush_task(struct task_struct proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, @@ -2796,6 +2797,9 @@ struct dentry *proc_pid_lookup(struct in goto out; ns = dentry->d_sb->s_fs_info; + if (!ns) + goto out; + rcu_read_lock(); task = find_task_by_pid_ns(tgid, ns); if (task) --- 35-rc3/fs/proc/root.c~PNS_6_BREAK_CIRCLE 2010-06-23 22:06:01.000000000 +0200 +++ 35-rc3/fs/proc/root.c 2010-06-24 20:48:18.000000000 +0200 @@ -31,7 +31,7 @@ static int proc_set_super(struct super_b struct pid_namespace *ns; ns = (struct pid_namespace *)data; - sb->s_fs_info = get_pid_ns(ns); + sb->s_fs_info = ns; return set_anon_super(sb, NULL); } @@ -68,7 +68,7 @@ static int proc_get_sb(struct file_syste struct proc_inode *ei = PROC_I(sb->s_root->d_inode); if (!ei->pid) { rcu_read_lock(); - ei->pid = get_pid(find_pid_ns(1, ns)); + ei->pid = find_pid_ns(1, ns); rcu_read_unlock(); } } @@ -83,7 +83,6 @@ static void proc_kill_sb(struct super_bl ns = (struct pid_namespace *)sb->s_fs_info; kill_anon_super(sb); - put_pid_ns(ns); } static struct file_system_type proc_fs_type = { @@ -209,5 +208,9 @@ int pid_ns_prepare_proc(struct pid_names void pid_ns_release_proc(struct pid_namespace *ns) { - mntput(ns->proc_mnt); + if (ns->proc_mnt) { + ns->proc_mnt->mnt_sb->s_fs_info = NULL; + PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL; + mntput(ns->proc_mnt); + } }