[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20100620180637.GD17120@redhat.com>
Date: Sun, 20 Jun 2010 20:06:37 +0200
From: Oleg Nesterov <oleg@...hat.com>
To: "Eric W. Biederman" <ebiederm@...ssion.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
Louis Rilling <louis.rilling@...labs.com>,
Pavel Emelyanov <xemul@...nvz.org>,
Linux Containers <containers@...ts.osdl.org>,
linux-kernel@...r.kernel.org, Daniel Lezcano <dlezcano@...ibm.com>
Subject: [PATCH 2/2] pid_ns: refactor the buggy pid_ns_release_proc() logic
pid_namespace holds ns->proc_mnt, while this vfsmount has a referene to
the namespace via PROC_I(sb->s_root->d_inode)->pid. To break this circle
/sbin/init does mntput() in pid_ns_release_proc(). See 6f4e6433.
But we have the following problems:
- Nobody does mntput() if copy_process() fails after
pid_ns_prepare_proc().
- proc_flush_task() checks upid->nr == 1 to verify we are init,
this is wrong if a multi-threaded init does exec.
- As Louis pointed out, this namespace can have the detached
EXIT_DEAD tasks which can use ns->proc_mnt after this mntput().
With this patch only pid_namespace has a reference to ns->proc_mnt, and
mntput(ns->proc_mnt) is called by destroy_pid_namespace() paths when we
know that this ns must not have any references (in particular, there are
no pids in this namespace).
Changes:
- kill proc_flush_task()->pid_ns_release_proc()
- change fs/proc/root.c so that we don't create the "artificial"
references to the namespace or its pid==1.
- change destroy_pid_namespace() to call pid_ns_release_proc().
- change pid_ns_release_proc() to clear s_root->d_inode->pid.
The caller is destroy_pid_namespace(), this pid was already
freed.
Reported-by: Louis Rilling <louis.rilling@...labs.com>
Signed-off-by: Oleg Nesterov <oleg@...hat.com>
---
kernel/pid_namespace.c | 2 ++
fs/proc/base.c | 4 ----
fs/proc/root.c | 10 ++++++----
3 files changed, 8 insertions(+), 8 deletions(-)
--- 35-rc3/kernel/pid_namespace.c~PNS_6_BREAK_CIRCLE 2010-06-20 18:36:00.000000000 +0200
+++ 35-rc3/kernel/pid_namespace.c 2010-06-20 18:50:30.000000000 +0200
@@ -128,6 +128,8 @@ static void destroy_pid_namespace(struct
{
int i;
+ pid_ns_release_proc(ns);
+
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
kmem_cache_free(pid_ns_cachep, ns);
--- 35-rc3/fs/proc/base.c~PNS_6_BREAK_CIRCLE 2010-05-28 13:41:41.000000000 +0200
+++ 35-rc3/fs/proc/base.c 2010-06-20 18:51:14.000000000 +0200
@@ -2745,10 +2745,6 @@ void proc_flush_task(struct task_struct
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
-
- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
}
static struct dentry *proc_pid_instantiate(struct inode *dir,
--- 35-rc3/fs/proc/root.c~PNS_6_BREAK_CIRCLE 2010-06-19 20:11:03.000000000 +0200
+++ 35-rc3/fs/proc/root.c 2010-06-20 18:58:12.000000000 +0200
@@ -31,7 +31,7 @@ static int proc_set_super(struct super_b
struct pid_namespace *ns;
ns = (struct pid_namespace *)data;
- sb->s_fs_info = get_pid_ns(ns);
+ sb->s_fs_info = ns;
return set_anon_super(sb, NULL);
}
@@ -68,7 +68,7 @@ static int proc_get_sb(struct file_syste
struct proc_inode *ei = PROC_I(sb->s_root->d_inode);
if (!ei->pid) {
rcu_read_lock();
- ei->pid = get_pid(find_pid_ns(1, ns));
+ ei->pid = find_pid_ns(1, ns);
rcu_read_unlock();
}
}
@@ -83,7 +83,6 @@ static void proc_kill_sb(struct super_bl
ns = (struct pid_namespace *)sb->s_fs_info;
kill_anon_super(sb);
- put_pid_ns(ns);
}
static struct file_system_type proc_fs_type = {
@@ -209,5 +208,8 @@ int pid_ns_prepare_proc(struct pid_names
void pid_ns_release_proc(struct pid_namespace *ns)
{
- mntput(ns->proc_mnt);
+ if (ns->proc_mnt) {
+ PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
+ mntput(ns->proc_mnt);
+ }
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists