Message-ID: <20250409-egalisieren-halbbitter-23bc252d3a38@brauner>
Date: Wed, 9 Apr 2025 12:37:06 +0200
From: Christian Brauner <brauner@...nel.org>
To: Eric Chanudet <echanude@...hat.com>
Cc: Alexander Viro <viro@...iv.linux.org.uk>, Jan Kara <jack@...e.cz>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>, Clark Williams <clrkwllms@...nel.org>,
Steven Rostedt <rostedt@...dmis.org>, Ian Kent <ikent@...hat.com>, linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-rt-devel@...ts.linux.dev,
Alexander Larsson <alexl@...hat.com>, Lucas Karpinski <lkarpins@...hat.com>
Subject: Re: [PATCH v4] fs/namespace: defer RCU sync for MNT_DETACH umount
On Tue, Apr 08, 2025 at 04:58:34PM -0400, Eric Chanudet wrote:
> Defer releasing the detached file system in namespace_unlock() during
> a lazy umount so the syscall returns faster.
>
> When requesting MNT_DETACH, the caller does not expect the file system
> to be shut down before the syscall returns. Calling
> synchronize_rcu_expedited() has a significant cost on PREEMPT_RT
> kernels, which default to rcupdate.rcu_normal_after_boot=1 and thus
> demote expedited grace periods to normal ones. Queue the detached
> struct mounts on a separate list and put them on a workqueue to be
> released after the RCU grace period.
>
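For context, the rcu_work deferral pattern the patch relies on looks
roughly like this (a minimal sketch; the context struct and callback
names are illustrative, while INIT_RCU_WORK(), queue_rcu_work() and
to_rcu_work() are the real workqueue helpers the patch uses):

	struct deferred_ctx {
		struct rcu_work rwork;		/* work item + RCU grace period */
		struct hlist_head release_list;	/* mounts waiting for mntput() */
	};

	static void deferred_fn(struct work_struct *work)
	{
		/* Runs in workqueue context after a full RCU grace period. */
		struct deferred_ctx *ctx =
			container_of(to_rcu_work(work), struct deferred_ctx, rwork);

		/* ... mntput() everything on ctx->release_list ... */
		kfree(ctx);
	}

	/* Submit side: hand the list off instead of blocking the syscall. */
	INIT_RCU_WORK(&ctx->rwork, deferred_fn);
	queue_rcu_work(system_wq, &ctx->rwork);
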
> w/o patch, 6.15-rc1 PREEMPT_RT:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
> 0.02455 +- 0.00107 seconds time elapsed ( +- 4.36% )
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
> 0.02555 +- 0.00114 seconds time elapsed ( +- 4.46% )
>
> w/ patch, 6.15-rc1 PREEMPT_RT:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
> 0.026311 +- 0.000869 seconds time elapsed ( +- 3.30% )
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
> 0.003194 +- 0.000160 seconds time elapsed ( +- 5.01% )
>
> Signed-off-by: Alexander Larsson <alexl@...hat.com>
> Signed-off-by: Lucas Karpinski <lkarpins@...hat.com>
> Signed-off-by: Eric Chanudet <echanude@...hat.com>
> ---
>
> This re-spins the series based on the feedback received on v3, which
> pointed out the need to wait for the RCU grace period in
> namespace_unlock() before calling the deferred mntput().
I still hate this with a passion because it adds another special-sauce
path into the unlock path. I've folded the following diff into it so it
at least doesn't start passing that pointless boolean around and doesn't
introduce __namespace_unlock(). Just use a global variable and read the
value off it, just as we do with the lists. Testing this now:
diff --git a/fs/namespace.c b/fs/namespace.c
index e5b0b920dd97..25599428706c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -82,8 +82,9 @@ static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
-static HLIST_HEAD(unmounted); /* protected by namespace_sem */
-static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static bool unmounted_lazily; /* protected by namespace_sem */
+static HLIST_HEAD(unmounted); /* protected by namespace_sem */
+static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_SEQLOCK(mnt_ns_tree_lock);
#ifdef CONFIG_FSNOTIFY
@@ -1807,17 +1808,18 @@ static void free_mounts(struct hlist_head *mount_list)
static void defer_free_mounts(struct work_struct *work)
{
- struct deferred_free_mounts *d = container_of(
- to_rcu_work(work), struct deferred_free_mounts, rwork);
+ struct deferred_free_mounts *d;
+ d = container_of(to_rcu_work(work), struct deferred_free_mounts, rwork);
free_mounts(&d->release_list);
kfree(d);
}
-static void __namespace_unlock(bool lazy)
+static void namespace_unlock(void)
{
HLIST_HEAD(head);
LIST_HEAD(list);
+ bool defer = unmounted_lazily;
hlist_move_list(&unmounted, &head);
list_splice_init(&ex_mountpoints, &list);
@@ -1840,29 +1842,21 @@ static void __namespace_unlock(bool lazy)
if (likely(hlist_empty(&head)))
return;
- if (lazy) {
- struct deferred_free_mounts *d =
- kmalloc(sizeof(*d), GFP_KERNEL);
+ if (defer) {
+ struct deferred_free_mounts *d;
- if (unlikely(!d))
- goto out;
-
- hlist_move_list(&head, &d->release_list);
- INIT_RCU_WORK(&d->rwork, defer_free_mounts);
- queue_rcu_work(system_wq, &d->rwork);
- return;
+ d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL);
+ if (d) {
+ hlist_move_list(&head, &d->release_list);
+ INIT_RCU_WORK(&d->rwork, defer_free_mounts);
+ queue_rcu_work(system_wq, &d->rwork);
+ return;
+ }
}
-
-out:
synchronize_rcu_expedited();
free_mounts(&head);
}
-static inline void namespace_unlock(void)
-{
- __namespace_unlock(false);
-}
-
static inline void namespace_lock(void)
{
down_write(&namespace_sem);
@@ -2094,7 +2088,7 @@ static int do_umount(struct mount *mnt, int flags)
}
out:
unlock_mount_hash();
- __namespace_unlock(flags & MNT_DETACH);
+ namespace_unlock();
return retval;
}
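
Note the diff above only shows the consumer of unmounted_lazily; the
producer side isn't in these hunks. A minimal sketch of what the set
side could look like, assuming it sits under namespace_sem in the umount
path and mirrors the flags & MNT_DETACH value the v4 patch passed as a
boolean (hypothetical, not taken from the diff):

	/* Hypothetical: under namespace_sem, in the MNT_DETACH path. */
	if (flags & MNT_DETACH)
		unmounted_lazily = true;

Since namespace_unlock() samples the flag, it presumably also needs to
be cleared there under namespace_sem so one lazy umount doesn't leak the
deferral into later unlock calls.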
>
> v4:
> - Use queue_rcu_work() to defer free_mounts() for lazy umounts
> - Drop lazy_unlock global and refactor using a helper
> v3: https://lore.kernel.org/all/20240626201129.272750-2-lkarpins@redhat.com/
> - Removed unneeded code for the lazy umount case.
> - Don't block within interrupt context.
> v2: https://lore.kernel.org/all/20240426195429.28547-1-lkarpins@redhat.com/
> - Only defer releasing umount'ed filesystems for lazy umounts
> v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/
>
> fs/namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 45 insertions(+), 7 deletions(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 14935a0500a2..e5b0b920dd97 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
> static unsigned int mp_hash_mask __ro_after_init;
> static unsigned int mp_hash_shift __ro_after_init;
>
> +struct deferred_free_mounts {
> + struct rcu_work rwork;
> + struct hlist_head release_list;
> +};
> +
> static __initdata unsigned long mhash_entries;
> static int __init set_mhash_entries(char *str)
> {
> @@ -1789,11 +1794,29 @@ static bool need_notify_mnt_list(void)
> }
> #endif
>
> -static void namespace_unlock(void)
> +static void free_mounts(struct hlist_head *mount_list)
> {
> - struct hlist_head head;
> struct hlist_node *p;
> struct mount *m;
> +
> + hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
> + hlist_del(&m->mnt_umount);
> + mntput(&m->mnt);
> + }
> +}
> +
> +static void defer_free_mounts(struct work_struct *work)
> +{
> + struct deferred_free_mounts *d = container_of(
> + to_rcu_work(work), struct deferred_free_mounts, rwork);
> +
> + free_mounts(&d->release_list);
> + kfree(d);
> +}
> +
> +static void __namespace_unlock(bool lazy)
> +{
> + HLIST_HEAD(head);
> LIST_HEAD(list);
>
> hlist_move_list(&unmounted, &head);
> @@ -1817,12 +1840,27 @@ static void namespace_unlock(void)
> if (likely(hlist_empty(&head)))
> return;
>
> - synchronize_rcu_expedited();
> + if (lazy) {
> + struct deferred_free_mounts *d =
> + kmalloc(sizeof(*d), GFP_KERNEL);
>
> - hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> - hlist_del(&m->mnt_umount);
> - mntput(&m->mnt);
> + if (unlikely(!d))
> + goto out;
> +
> + hlist_move_list(&head, &d->release_list);
> + INIT_RCU_WORK(&d->rwork, defer_free_mounts);
> + queue_rcu_work(system_wq, &d->rwork);
> + return;
> }
> +
> +out:
> + synchronize_rcu_expedited();
> + free_mounts(&head);
> +}
> +
> +static inline void namespace_unlock(void)
> +{
> + __namespace_unlock(false);
> }
>
> static inline void namespace_lock(void)
> @@ -2056,7 +2094,7 @@ static int do_umount(struct mount *mnt, int flags)
> }
> out:
> unlock_mount_hash();
> - namespace_unlock();
> + __namespace_unlock(flags & MNT_DETACH);
> return retval;
> }
>
> --
> 2.49.0
>