From 558b37682baaf251b761c455d02606db4abfedd8 Mon Sep 17 00:00:00 2001
From: Eric Chanudet
Date: Tue, 8 Apr 2025 16:58:34 -0400
Subject: [PATCH] [UNTESTED] fs/namespace: defer RCU sync for MNT_DETACH umount

Defer releasing the detached filesystem when calling namespace_unlock()
during a lazy umount so that the syscall returns faster.

When requesting MNT_DETACH, the caller does not expect the filesystem
to be shut down by the time the syscall returns. Calling
synchronize_rcu_expedited() has a significant cost on PREEMPT_RT
kernels, which default to rcupdate.rcu_normal_after_boot=1 (so the
expedited request degrades to a normal, much slower grace period).
Instead, queue the detached struct mounts on a separate list and put
them on a workqueue that runs after the RCU grace period.

w/o patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
    0.02455 +- 0.00107 seconds time elapsed ( +- 4.36% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
    0.02555 +- 0.00114 seconds time elapsed ( +- 4.46% )

w/ patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
    0.026311 +- 0.000869 seconds time elapsed ( +- 3.30% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
    0.003194 +- 0.000160 seconds time elapsed ( +- 5.01% )

Signed-off-by: Alexander Larsson
Signed-off-by: Lucas Karpinski
Signed-off-by: Eric Chanudet
Link: https://lore.kernel.org/20250408210350.749901-12-echanude@redhat.com
Not-Tested-by: Christian Brauner
Massaged-With-Great-Shame-by: Christian Brauner
---
 fs/namespace.c | 78 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 62 insertions(+), 16 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index bc23c0e1fb9d..c36debbc5135 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
 static unsigned int mp_hash_mask __ro_after_init;
 static unsigned int mp_hash_shift __ro_after_init;
 
+struct deferred_free_mounts {
+	struct rcu_work rwork;
+	struct hlist_head release_list;
+};
+
 static __initdata unsigned long mhash_entries;
 static int __init set_mhash_entries(char *str)
 {
@@ -77,8 +82,9 @@ static struct hlist_head *mount_hashtable __ro_after_init;
 static struct hlist_head *mountpoint_hashtable __ro_after_init;
 static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
-static HLIST_HEAD(unmounted);	  /* protected by namespace_sem */
-static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static bool defer_unmount;	  /* protected by namespace_sem */
+static HLIST_HEAD(unmounted);	  /* protected by namespace_sem */
+static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static DEFINE_SEQLOCK(mnt_ns_tree_lock);
 
 #ifdef CONFIG_FSNOTIFY
@@ -1412,7 +1418,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	return ERR_PTR(err);
 }
 
-static void cleanup_mnt(struct mount *mnt)
+static void __mntput_no_expire(struct mount *mnt, bool cleanup_sync);
+
+static void cleanup_mnt(struct mount *mnt, bool cleanup_sync)
 {
 	struct hlist_node *p;
 	struct mount *m;
@@ -1428,7 +1436,9 @@ static void cleanup_mnt(struct mount *mnt)
 	mnt_pin_kill(mnt);
 	hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
 		hlist_del(&m->mnt_umount);
-		mntput(&m->mnt);
+		if (unlikely(m->mnt_expiry_mark))
+			WRITE_ONCE(m->mnt_expiry_mark, 0);
+		__mntput_no_expire(m, cleanup_sync);
 	}
 	fsnotify_vfsmount_delete(&mnt->mnt);
 	dput(mnt->mnt.mnt_root);
@@ -1439,7 +1449,7 @@
 
 static void __cleanup_mnt(struct rcu_head *head)
 {
-	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
+	cleanup_mnt(container_of(head, struct mount, mnt_rcu), false /* cleanup sync */);
 }
 
 static LLIST_HEAD(delayed_mntput_list);
@@ -1449,11 +1459,11 @@ static void delayed_mntput(struct work_struct *unused)
 {
 	struct mount *m, *t;
 
 	llist_for_each_entry_safe(m, t, node, mnt_llist)
-		cleanup_mnt(m);
+		cleanup_mnt(m, false /* cleanup sync */);
 }
 static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
-static void mntput_no_expire(struct mount *mnt)
+static void __mntput_no_expire(struct mount *mnt, bool cleanup_sync)
 {
 	LIST_HEAD(list);
 	int count;
@@ -1507,7 +1517,7 @@ static void mntput_no_expire(struct mount *mnt)
 	unlock_mount_hash();
 	shrink_dentry_list(&list);
 
-	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
+	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL) && !cleanup_sync)) {
 		struct task_struct *task = current;
 		if (likely(!(task->flags & PF_KTHREAD))) {
 			init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
@@ -1518,7 +1528,12 @@ static void mntput_no_expire(struct mount *mnt)
 			schedule_delayed_work(&delayed_mntput_work, 1);
 		return;
 	}
-	cleanup_mnt(mnt);
+	cleanup_mnt(mnt, cleanup_sync);
+}
+
+static inline void mntput_no_expire(struct mount *mnt)
+{
+	__mntput_no_expire(mnt, false);
 }
 
 void mntput(struct vfsmount *mnt)
@@ -1789,15 +1804,37 @@ static bool need_notify_mnt_list(void)
 }
 #endif
 
-static void namespace_unlock(void)
+static void free_mounts(struct hlist_head *mount_list, bool cleanup_sync)
 {
-	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+
+	hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		if (unlikely(m->mnt_expiry_mark))
+			WRITE_ONCE(m->mnt_expiry_mark, 0);
+		__mntput_no_expire(m, cleanup_sync);
+	}
+}
+
+static void defer_free_mounts(struct work_struct *work)
+{
+	struct deferred_free_mounts *d;
+
+	d = container_of(to_rcu_work(work), struct deferred_free_mounts, rwork);
+	free_mounts(&d->release_list, true /* cleanup_sync */);
+	kfree(d);
+}
+
+static void namespace_unlock(void)
+{
+	HLIST_HEAD(head);
 	LIST_HEAD(list);
+	bool defer = defer_unmount;
 
 	hlist_move_list(&unmounted, &head);
 	list_splice_init(&ex_mountpoints, &list);
+	defer_unmount = false;
 
 	if (need_notify_mnt_list()) {
 		/*
@@ -1817,12 +1854,19 @@ static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
+	if (defer) {
+		struct deferred_free_mounts *d;
 
-	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
-		hlist_del(&m->mnt_umount);
-		mntput(&m->mnt);
+		d = kmalloc(sizeof(struct deferred_free_mounts), GFP_KERNEL);
+		if (d) {
+			hlist_move_list(&head, &d->release_list);
+			INIT_RCU_WORK(&d->rwork, defer_free_mounts);
+			queue_rcu_work(system_unbound_wq, &d->rwork);
+			return;
+		}
 	}
+	synchronize_rcu_expedited();
+	free_mounts(&head, false /* cleanup_sync */);
 }
 static inline void namespace_lock(void)
 {
@@ -2044,8 +2088,10 @@ static int do_umount(struct mount *mnt, int flags)
 	event++;
 
 	if (flags & MNT_DETACH) {
-		if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
+		if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) {
 			umount_tree(mnt, UMOUNT_PROPAGATE);
+			defer_unmount = true;
+		}
 		retval = 0;
 	} else {
 		shrink_submounts(mnt);
-- 
2.47.2
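
Aside, for readers who have not used the rcu_work machinery this patch
builds on: below is a minimal, self-contained sketch of the same
deferral pattern as a toy module. It is illustrative only. demo_obj,
demo_wq and deferred_release are made-up names, not part of the patch,
and the patch itself queues on system_unbound_wq rather than a private
workqueue.

/*
 * Toy illustration of deferring object release past an RCU grace
 * period with queue_rcu_work(), mirroring defer_free_mounts() in the
 * patch above. All identifiers here are hypothetical.
 */
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_obj {
	struct rcu_work rwork;
	int payload;
};

static struct workqueue_struct *demo_wq;

/*
 * Runs on demo_wq only after an RCU grace period has elapsed, so any
 * lockless RCU readers that could still see the object are done.
 */
static void deferred_release(struct work_struct *work)
{
	struct demo_obj *obj =
		container_of(to_rcu_work(work), struct demo_obj, rwork);

	pr_info("demo: releasing payload %d after grace period\n",
		obj->payload);
	kfree(obj);
}

static int __init demo_init(void)
{
	struct demo_obj *obj;

	demo_wq = alloc_workqueue("demo_wq", WQ_UNBOUND, 0);
	if (!demo_wq)
		return -ENOMEM;

	obj = kmalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj) {
		destroy_workqueue(demo_wq);
		return -ENOMEM;
	}
	obj->payload = 42;

	/*
	 * The caller returns immediately instead of blocking in
	 * synchronize_rcu(); the RCU core queues the work item on
	 * demo_wq once the grace period ends.
	 */
	INIT_RCU_WORK(&obj->rwork, deferred_release);
	queue_rcu_work(demo_wq, &obj->rwork);
	return 0;
}

static void __exit demo_exit(void)
{
	/* Wait for the grace-period callback that queues the work... */
	rcu_barrier();
	/* ...then drain the queued work and free the workqueue. */
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("queue_rcu_work() deferral demo");

On the umount path this is exactly what removes the blocking
synchronize_rcu_expedited() from the MNT_DETACH case: the grace-period
wait still happens, but on a workqueue instead of in the syscall.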