[<prev] [next>] [day] [month] [year] [list]
Message-ID: <CANn89iJm6nehnTobQi6n=FUL1A8Nmc9UZ_S0ELNziQmEtZuo1g@mail.gmail.com>
Date: Wed, 6 Feb 2019 12:13:35 -0800
From: Eric Dumazet <edumazet@...gle.com>
To: Salman Qazi <sqazi@...gle.com>
Cc: Alexander Viro <viro@...iv.linux.org.uk>,
Eric Biederman <ebiederm@...ssion.com>,
linux-fsdevel@...r.kernel.org, LKML <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC
On Wed, Feb 6, 2019 at 11:54 AM Salman Qazi <sqazi@...gle.com> wrote:
>
> Prior to this patch, the kernel can spend a lot of time in the
> following stack trace:
>
> [<ffffffffbe5491e3>] __wait_rcu_gp+0x93/0xe0
> [<ffffffffbe549418>] synchronize_sched+0x48/0x60
> [<ffffffffbe7ae5b3>] kern_unmount+0x3a/0x46
> [<ffffffffbe847c02>] mq_put_mnt+0x15/0x17
> [<ffffffffbe8481af>] put_ipc_ns+0x36/0x8b
>
> This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
> This is done by implementing an asynchronous version of kern_unmount.
>
> Since mntput() sleeps, it needs to be deferred to a work queue.
>
> Additionally, the callers of mq_put_mnt appear to be safe with
> it behaving asynchronously. In particular, put_ipc_ns calls
> mq_clear_sbinfo which renders the inode inaccessible for the purposes of
> mqueue_create by making s_fs_info NULL. This appears
> to be the thing that prevents access while free_ipc_ns is taking place.
> So, the unmount should be able to proceed lazily.
>
> Tested: Ran the following program:
>
> int main(void)
> {
> int pid;
> int status;
> int i;
>
> for (i = 0; i < 1000; i++) {
> pid = fork();
> if (!pid) {
> assert(!unshare(CLONE_NEWUSER|
> CLONE_NEWIPC|CLONE_NEWNS));
> return 0;
> }
>
> assert(waitpid(pid, &status, 0) == pid);
> }
> }
>
> Before:
>
> $ time ./unshare2
>
> real 0m9.784s
> user 0m0.428s
> sys 0m0.000s
>
> After:
>
> $ time ./unshare2
>
> real 0m0.368s
> user 0m0.226s
> sys 0m0.122s
>
> Signed-off-by: Salman Qazi <sqazi@...gle.com>
Reviewed-by: Eric Dumazet <edumazet@...gle.com>
> ---
> fs/namespace.c | 41 +++++++++++++++++++++++++++++++++++++++++
> include/linux/fs.h | 1 +
> ipc/mqueue.c | 2 +-
> 3 files changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index a677b59efd74..caa51ca81605 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -3323,6 +3323,47 @@ void kern_unmount(struct vfsmount *mnt)
> }
> EXPORT_SYMBOL(kern_unmount);
>
> +struct async_unmount_cb {
> + struct vfsmount *mnt;
> + struct work_struct work;
> + struct rcu_head rcu_head;
> +};
> +
> +static void kern_unmount_work(struct work_struct *work)
> +{
> + struct async_unmount_cb *cb = container_of(work,
> + struct async_unmount_cb, work);
> +
> + mntput(cb->mnt);
> + kfree(cb);
> +}
> +
> +static void kern_unmount_rcu_cb(struct rcu_head *rcu_head)
> +{
> + struct async_unmount_cb *cb = container_of(rcu_head,
> + struct async_unmount_cb, rcu_head);
> +
> + INIT_WORK(&cb->work, kern_unmount_work);
> + schedule_work(&cb->work);
> +
> +}
> +
> +void kern_unmount_async(struct vfsmount *mnt)
> +{
> + /* release long term mount so mount point can be released */
> + if (!IS_ERR_OR_NULL(mnt)) {
> + struct async_unmount_cb *cb = kmalloc(sizeof(*cb), GFP_KERNEL);
> +
> + if (cb) {
> + real_mount(mnt)->mnt_ns = NULL;
> + cb->mnt = mnt;
> + call_rcu(&cb->rcu_head, kern_unmount_rcu_cb);
> + } else {
> + kern_unmount(mnt);
> + }
> + }
> +}
> +
> bool our_mnt(struct vfsmount *mnt)
> {
> return check_mnt(real_mount(mnt));
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 29d8e2cfed0e..8865997a8722 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2274,6 +2274,7 @@ extern int register_filesystem(struct file_system_type *);
> extern int unregister_filesystem(struct file_system_type *);
> extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
> #define kern_mount(type) kern_mount_data(type, NULL)
> +extern void kern_unmount_async(struct vfsmount *mnt);
> extern void kern_unmount(struct vfsmount *mnt);
> extern int may_umount_tree(struct vfsmount *);
> extern int may_umount(struct vfsmount *);
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index c595bed7bfcb..a8c2465ac0cb 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -1554,7 +1554,7 @@ void mq_clear_sbinfo(struct ipc_namespace *ns)
>
> void mq_put_mnt(struct ipc_namespace *ns)
> {
> - kern_unmount(ns->mq_mnt);
> + kern_unmount_async(ns->mq_mnt);
> }
>
> static int __init init_mqueue_fs(void)
> --
> 2.20.1.611.gfbb209baf1-goog
>
Powered by blists - more mailing lists