Message-ID: <20250418-armselig-kabel-4710dc466170@brauner>
Date: Fri, 18 Apr 2025 21:59:23 +0200
From: Christian Brauner <brauner@...nel.org>
To: Ian Kent <raven@...maw.net>
Cc: Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
Mark Brown <broonie@...nel.org>, Eric Chanudet <echanude@...hat.com>,
Alexander Viro <viro@...iv.linux.org.uk>, Jan Kara <jack@...e.cz>, Clark Williams <clrkwllms@...nel.org>,
Steven Rostedt <rostedt@...dmis.org>, Ian Kent <ikent@...hat.com>, linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-rt-devel@...ts.linux.dev,
Alexander Larsson <alexl@...hat.com>, Lucas Karpinski <lkarpins@...hat.com>, Aishwarya.TCV@....com
Subject: Re: [PATCH v4] fs/namespace: defer RCU sync for MNT_DETACH umount
On Fri, Apr 18, 2025 at 10:47:10AM +0200, Christian Brauner wrote:
> On Fri, Apr 18, 2025 at 09:20:52AM +0800, Ian Kent wrote:
> > On 18/4/25 09:13, Ian Kent wrote:
> > >
> > > On 18/4/25 00:28, Christian Brauner wrote:
> > > > On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior
> > > > wrote:
> > > > > On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
> > > > > > > So if there's some userspace process with a broken NFS server
> > > > > > > and it does umount(MNT_DETACH), it will end up hanging every
> > > > > > > other umount(MNT_DETACH) on the system because the
> > > > > > > delayed_mntput_work workqueue (to my understanding) cannot
> > > > > > > make progress.
> > > > > > Ok, "to my understanding" has been updated after going back
> > > > > > and reading
> > > > > > the delayed work code. Luckily it's not as bad as I thought it is
> > > > > > because it's queued on system_wq which is multi-threaded so it's at
> > > > > > least not causing everyone with MNT_DETACH to get stuck. I'm still
> > > > > > skeptical how safe this all is.
> > > > > I would (again) throw system_unbound_wq into the game because the
> > > > > former will remain on the CPU on which it has been enqueued (if
> > > > > speaking about multi-threading).
> > > > Yes, good point.
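> > > >
> > > > To make that concrete: the deferred mntput work is currently queued
> > > > via schedule_delayed_work(), i.e. on system_wq. The suggestion
> > > > amounts to something like the following (illustrative one-liner,
> > > > not a posted patch):
> > > >
> > > > 	queue_delayed_work(system_unbound_wq, &delayed_mntput_work, 1);
> > > >
> > > > so the work item isn't pinned to the CPU it was submitted from.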
> > > >
> > > > However, what about using polled grace periods?
> > > >
> > > > A first simple-minded thing to do would be to record the grace period
> > > > after umount_tree() has finished and then check it in namespace_unlock():
> > > >
> > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > index d9ca80dcc544..1e7ebcdd1ebc 100644
> > > > --- a/fs/namespace.c
> > > > +++ b/fs/namespace.c
> > > > @@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable __ro_after_init;
> > > > static struct hlist_head *mountpoint_hashtable __ro_after_init;
> > > > static struct kmem_cache *mnt_cache __ro_after_init;
> > > > static DECLARE_RWSEM(namespace_sem);
> > > > +static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
> > > > static HLIST_HEAD(unmounted); /* protected by namespace_sem */
> > > > static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> > > > static DEFINE_SEQLOCK(mnt_ns_tree_lock);
> > > > @@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
> > > > struct hlist_head head;
> > > > struct hlist_node *p;
> > > > struct mount *m;
> > > > + unsigned long unmount_seq = rcu_unmount_seq;
> > > > LIST_HEAD(list);
> > > >
> > > > hlist_move_list(&unmounted, &head);
> > > > @@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
> > > > if (likely(hlist_empty(&head)))
> > > > return;
> > > >
> > > > - synchronize_rcu_expedited();
> > > > + cond_synchronize_rcu_expedited(unmount_seq);
> > > >
> > > > hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> > > > hlist_del(&m->mnt_umount);
> > > > @@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> > > > */
> > > > mnt_notify_add(p);
> > > > }
> > > > +
> > > > + rcu_unmount_seq = get_state_synchronize_rcu();
> > > > }
> > > >
> > > > static void shrink_submounts(struct mount *mnt);
> > > >
> > > >
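> > > > For reference, the polled grace-period API used above works roughly
> > > > like this (a sketch of the contract, not code from the patch):
> > > >
> > > > 	unsigned long cookie;
> > > >
> > > > 	/* Snapshot the current RCU grace-period state. */
> > > > 	cookie = get_state_synchronize_rcu();
> > > >
> > > > 	/* ... unrelated work; a grace period may elapse meanwhile ... */
> > > >
> > > > 	/*
> > > > 	 * Returns immediately if a full grace period has elapsed since
> > > > 	 * the snapshot, otherwise waits for one (expedited).
> > > > 	 */
> > > > 	cond_synchronize_rcu_expedited(cookie);
> > > >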
> > > > I'm not sure how much that would buy us. If it doesn't then it should be
> > > > possible to play with the following possibly strange idea:
> > > >
> > > > diff --git a/fs/mount.h b/fs/mount.h
> > > > index 7aecf2a60472..51b86300dc50 100644
> > > > --- a/fs/mount.h
> > > > +++ b/fs/mount.h
> > > > @@ -61,6 +61,7 @@ struct mount {
> > > > struct rb_node mnt_node; /* node in the ns->mounts rbtree */
> > > > struct rcu_head mnt_rcu;
> > > > struct llist_node mnt_llist;
> > > > + unsigned long mnt_rcu_unmount_seq;
> > > > };
> > > > #ifdef CONFIG_SMP
> > > > struct mnt_pcp __percpu *mnt_pcp;
> > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > index d9ca80dcc544..aae9df75beed 100644
> > > > --- a/fs/namespace.c
> > > > +++ b/fs/namespace.c
> > > > @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
> > > > struct hlist_head head;
> > > > struct hlist_node *p;
> > > > struct mount *m;
> > > > + bool needs_synchronize_rcu = false;
> > > > LIST_HEAD(list);
> > > >
> > > > hlist_move_list(&unmounted, &head);
> > > > @@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
> > > > if (likely(hlist_empty(&head)))
> > > > return;
> > > >
> > > > - synchronize_rcu_expedited();
> > > > + hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> > > > + if (!poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
> > > > + continue;
>
> This has a bug. This needs to be:
>
> /* A grace period has already elapsed. */
> if (poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
> continue;
>
> /* Oh oh, we have to pay up. */
> needs_synchronize_rcu = true;
> break;
>
> which I'm pretty sure will eradicate most of the performance gain you've
> seen because fundamentally the two versions shouldn't be different.
> (Note, I drafted this while on my way out the door.)
>
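> (The hunk got truncated above; presumably the flag is then consumed
> after the loop along these lines:
>
> 	if (needs_synchronize_rcu)
> 		synchronize_rcu_expedited();
>
> i.e. the expedited grace period is only paid for if at least one mount
> in the batch hasn't been through one yet.)
>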
> I would test the following version where we pay the cost of the
> smp_mb() from poll_state_synchronize_rcu() exactly once:
>
> diff --git a/fs/mount.h b/fs/mount.h
> index 7aecf2a60472..51b86300dc50 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -61,6 +61,7 @@ struct mount {
> struct rb_node mnt_node; /* node in the ns->mounts rbtree */
> struct rcu_head mnt_rcu;
> struct llist_node mnt_llist;
> + unsigned long mnt_rcu_unmount_seq;
> };
> #ifdef CONFIG_SMP
> struct mnt_pcp __percpu *mnt_pcp;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d9ca80dcc544..dd367c54bc29 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
> struct hlist_head head;
> struct hlist_node *p;
> struct mount *m;
> + unsigned long mnt_rcu_unmount_seq = 0;
> LIST_HEAD(list);
>
> hlist_move_list(&unmounted, &head);
> @@ -1817,7 +1818,10 @@ static void namespace_unlock(void)
> if (likely(hlist_empty(&head)))
> return;
>
> - synchronize_rcu_expedited();
> + hlist_for_each_entry_safe(m, p, &head, mnt_umount)
> + mnt_rcu_unmount_seq = max(m->mnt_rcu_unmount_seq, mnt_rcu_unmount_seq);
> +
> + cond_synchronize_rcu_expedited(mnt_rcu_unmount_seq);
>
> hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> hlist_del(&m->mnt_umount);
> @@ -1923,8 +1927,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> }
> }
> change_mnt_propagation(p, MS_PRIVATE);
> - if (disconnect)
> + if (disconnect) {
> + p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
> hlist_add_head(&p->mnt_umount, &unmounted);
> + }
>
> /*
> * At this point p->mnt_ns is NULL, notification will be queued
I'm appending a patch that improves on the first version of this patch.
Instead of simply sampling the current rcu state and hoping that the rcu
grace period has elapsed by the time we get to put the mounts, we sample
the rcu state and kick off a new grace period at the end of
umount_tree(). That could even get us some performance improvement on
non-RT kernels. I have no clue how well this will fare on RT though.
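
In rough strokes the idea is the following (sketch only; the actual
patch is attached and the exact placement is assumed):

	/*
	 * umount_tree(): sample the rcu state *and* start a new grace
	 * period, so it can run while we finish tearing down the tree:
	 */
	p->mnt_rcu_unmount_seq = start_poll_synchronize_rcu();

	/*
	 * namespace_unlock(): by the time we get here the grace period
	 * started above may have already completed, making this a no-op:
	 */
	cond_synchronize_rcu_expedited(mnt_rcu_unmount_seq);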
[Attachment: "0001-UNTESTED-mount-sample-and-kick-of-grace-period-durin.patch" (text/x-diff, 1771 bytes)]