[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <3wy3sqvmun6ohamngwuxcquantwmk4ytcflm4jojbck4zjps45@jwsujs62sz2w>
Date: Thu, 17 Apr 2025 18:33:40 -0400
From: Eric Chanudet <echanude@...hat.com>
To: Christian Brauner <brauner@...nel.org>
Cc: Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
Ian Kent <raven@...maw.net>, Mark Brown <broonie@...nel.org>,
Alexander Viro <viro@...iv.linux.org.uk>, Jan Kara <jack@...e.cz>, Clark Williams <clrkwllms@...nel.org>,
Steven Rostedt <rostedt@...dmis.org>, Ian Kent <ikent@...hat.com>, linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-rt-devel@...ts.linux.dev,
Alexander Larsson <alexl@...hat.com>, Lucas Karpinski <lkarpins@...hat.com>, Aishwarya.TCV@....com
Subject: Re: [PATCH v4] fs/namespace: defer RCU sync for MNT_DETACH umount
On Thu, Apr 17, 2025 at 06:28:20PM +0200, Christian Brauner wrote:
> On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
> > > > (1) The real issue is with the same process P1 doing stupid stuff that
> > > > just happened to work. For example if there's code out there that
> > > > does a MNT_DETACH on a filesystem that uses a loop device
> > > > immediately followed by the desire to reuse the loop device:
> > > >
> > > > It doesn't matter whether such code must in theory already be
> > > > prepared to handle the case of seeing EBUSY after the MNT_DETACH. If
> > > > this currently just happens to work because we guarantee that the
> > > > last mntput() and cleanup_mnt() will have been done when the caller
> > > > returns to userspace it's a uapi break plain and simple.
> > > >
> > > > This implicit guarantee isn't there anymore after your patch because
> > > > the final mntput() from is done from the system_unbound_wq which has
> > > > the consequence that the cleanup_mnt() is done from the
> > > > delayed_mntput_work workqeueue. And that leads to problem number
> > > > (2).
The following script runs fine on mainline, but consistently fails with
the version of this patch after discussions upstream:
#! /usr/bin/bash -eux
mntpath="/mnt"
umount -R "$mntpath" || true
fspath="/root/fs.ext4"
dd if=/dev/zero of="$fspath" bs=1M count=500
mkfs.ext4 "$fspath"
loopdev="$(losetup -f "$fspath" --show)"
for i in $(seq 0 99); do
mkfs.ext4 -Fq "$fspath"
mount "$loopdev" "$mntpath"
touch "$mntpath/f1"
umount -l "$mntpath"
done
losetup -d "$loopdev"
Failure looks like:
+ for i in $(seq 0 99)
+ mkfs.ext4 -Fq /root/fs.ext4
/root/fs.ext4 contains a ext4 file system
created on Thu Apr 17 20:42:04 2025
+ mount /dev/loop0 /mnt
+ touch /mnt/f1
+ umount -l /mnt
+ for i in $(seq 0 99)
+ mkfs.ext4 -Fq /root/fs.ext4
/root/fs.ext4 contains a ext4 file system
created on Thu Apr 17 20:42:04 2025
+ mount /dev/loop0 /mnt
mount: /mnt: mount(2) system call failed: Structure needs cleaning.
[ 9.352478] EXT4-fs (loop0): mounted filesystem 3c5c632e-24d1-4027-b378-f51e67972883 r/w with ordered data mode. Quota mode: none.
[ 9.449121] EXT4-fs (loop0): unmounting filesystem 3c5c632e-24d1-4027-b378-f51e67972883.
[ 9.462093] EXT4-fs (loop0): ext4_check_descriptors: Checksum for group 32 failed (10605!=64170)
[ 9.462099] EXT4-fs (loop0): group descriptors corrupted!
Seems worse than EBUSY :(.
> However, what about using polled grace periods?
>
> A first simple-minded thing to do would be to record the grace period
> after umount_tree() has finished and the check it in namespace_unlock():
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d9ca80dcc544..1e7ebcdd1ebc 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable __ro_after_init;
> static struct hlist_head *mountpoint_hashtable __ro_after_init;
> static struct kmem_cache *mnt_cache __ro_after_init;
> static DECLARE_RWSEM(namespace_sem);
> +static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
> static HLIST_HEAD(unmounted); /* protected by namespace_sem */
> static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> static DEFINE_SEQLOCK(mnt_ns_tree_lock);
> @@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
> struct hlist_head head;
> struct hlist_node *p;
> struct mount *m;
> + unsigned long unmount_seq = rcu_unmount_seq;
> LIST_HEAD(list);
>
> hlist_move_list(&unmounted, &head);
> @@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
> if (likely(hlist_empty(&head)))
> return;
>
> - synchronize_rcu_expedited();
> + cond_synchronize_rcu_expedited(unmount_seq);
>
> hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> hlist_del(&m->mnt_umount);
> @@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> */
> mnt_notify_add(p);
> }
> +
> + rcu_unmount_seq = get_state_synchronize_rcu();
> }
>
> static void shrink_submounts(struct mount *mnt);
>
>
> I'm not sure how much that would buy us. If it doesn't then it should be
> possible to play with the following possibly strange idea:
Did not improve the lazy unmount, no corruption running the script.
QEMU x86_64, 8cpus, PREEMP_RT:
# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs /mnt' -- umount /mnt
0.019498 +- 0.000944 seconds time elapsed ( +- 4.84% )
# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs /mnt' -- umount -l /mnt
0.020635 +- 0.000959 seconds time elapsed ( +- 4.65% )
> diff --git a/fs/mount.h b/fs/mount.h
> index 7aecf2a60472..51b86300dc50 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -61,6 +61,7 @@ struct mount {
> struct rb_node mnt_node; /* node in the ns->mounts rbtree */
> struct rcu_head mnt_rcu;
> struct llist_node mnt_llist;
> + unsigned long mnt_rcu_unmount_seq;
> };
> #ifdef CONFIG_SMP
> struct mnt_pcp __percpu *mnt_pcp;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d9ca80dcc544..aae9df75beed 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
> struct hlist_head head;
> struct hlist_node *p;
> struct mount *m;
> + bool needs_synchronize_rcu = false;
> LIST_HEAD(list);
>
> hlist_move_list(&unmounted, &head);
> @@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
> if (likely(hlist_empty(&head)))
> return;
>
> - synchronize_rcu_expedited();
> + hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> + if (!poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
> + continue;
> +
> + needs_synchronize_rcu = true;
> + break;
> + }
> +
> + if (needs_synchronize_rcu)
> + synchronize_rcu_expedited();
>
> hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> hlist_del(&m->mnt_umount);
> @@ -1923,8 +1933,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> }
> }
> change_mnt_propagation(p, MS_PRIVATE);
> - if (disconnect)
> + if (disconnect) {
> + p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
> hlist_add_head(&p->mnt_umount, &unmounted);
> + }
>
> /*
> * At this point p->mnt_ns is NULL, notification will be queued
>
> This would allow to elide synchronize rcu calls if they elapsed in the
> meantime since we moved that mount to the unmounted list.
Faster umount, lazy or not, no corruption running the script.
QEMU x86_64, 8cpus, PREEMP_RT:
# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount /mnt
0.001482 +- 0.000121 seconds time elapsed ( +- 8.17% )
# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l /mnt
0.002248 +- 0.000845 seconds time elapsed ( +- 37.58% )
The crun test from v1 without your patch:
# perf stat -r 10 --null -- crun run test
0.08166 +- 0.00476 seconds time elapsed ( +- 5.83% )
The crun test from v1 with your patch:
# perf stat -r 10 --null -- crun run test
0.01449 +- 0.00457 seconds time elapsed ( +- 31.55% )
I have not run the LTP fs tests with that last patch yet, but that looks
like quite an improvement.
Best,
--
Eric Chanudet
Powered by blists - more mailing lists