lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250408210350.749901-12-echanude@redhat.com>
Date: Tue,  8 Apr 2025 16:58:34 -0400
From: Eric Chanudet <echanude@...hat.com>
To: Alexander Viro <viro@...iv.linux.org.uk>,
	Christian Brauner <brauner@...nel.org>,
	Jan Kara <jack@...e.cz>,
	Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
	Clark Williams <clrkwllms@...nel.org>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ian Kent <ikent@...hat.com>
Cc: linux-fsdevel@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	linux-rt-devel@...ts.linux.dev,
	Eric Chanudet <echanude@...hat.com>,
	Alexander Larsson <alexl@...hat.com>,
	Lucas Karpinski <lkarpins@...hat.com>
Subject: [PATCH v4] fs/namespace: defer RCU sync for MNT_DETACH umount

Defer releasing the detached file-system when calling namespace_unlock()
during a lazy umount to return faster.

When requesting MNT_DETACH, the caller does not expect the file-system
to be shut down upon returning from the syscall. Calling
synchronize_rcu_expedited() has a significant cost on RT kernel that
defaults to rcupdate.rcu_normal_after_boot=1. Queue the detached struct
mount in a separate list and put it on a workqueue to run post RCU
grace-period.

w/o patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
    0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
    0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )

w/ patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
    0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
    0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )

Signed-off-by: Alexander Larsson <alexl@...hat.com>
Signed-off-by: Lucas Karpinski <lkarpins@...hat.com>
Signed-off-by: Eric Chanudet <echanude@...hat.com>
---

Attempt to re-spin this series based on the feedback received in v3 that
pointed out the need to wait the grace-period in namespace_unlock()
before calling the deferred mntput().

v4:
- Use queue_rcu_work() to defer free_mounts() for lazy umounts
- Drop lazy_unlock global and refactor using a helper
v3: https://lore.kernel.org/all/20240626201129.272750-2-lkarpins@redhat.com/
- Removed unneeded code for lazy umount case.
- Don't block within interrupt context.
v2: https://lore.kernel.org/all/20240426195429.28547-1-lkarpins@redhat.com/
- Only defer releasing umount'ed filesystems for lazy umounts
v1: https://lore.kernel.org/all/20230119205521.497401-1-echanude@redhat.com/

 fs/namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 14935a0500a2..e5b0b920dd97 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
 static unsigned int mp_hash_mask __ro_after_init;
 static unsigned int mp_hash_shift __ro_after_init;
 
+struct deferred_free_mounts {
+	struct rcu_work rwork;
+	struct hlist_head release_list;
+};
+
 static __initdata unsigned long mhash_entries;
 static int __init set_mhash_entries(char *str)
 {
@@ -1789,11 +1794,29 @@ static bool need_notify_mnt_list(void)
 }
 #endif
 
-static void namespace_unlock(void)
+static void free_mounts(struct hlist_head *mount_list)
 {
-	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+
+	hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		mntput(&m->mnt);
+	}
+}
+
+static void defer_free_mounts(struct work_struct *work)
+{
+	struct deferred_free_mounts *d = container_of(
+		to_rcu_work(work), struct deferred_free_mounts, rwork);
+
+	free_mounts(&d->release_list);
+	kfree(d);
+}
+
+static void __namespace_unlock(bool lazy)
+{
+	HLIST_HEAD(head);
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
@@ -1817,12 +1840,27 @@ static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
+	if (lazy) {
+		struct deferred_free_mounts *d =
+			kmalloc(sizeof(*d), GFP_KERNEL);
 
-	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
-		hlist_del(&m->mnt_umount);
-		mntput(&m->mnt);
+		if (unlikely(!d))
+			goto out;
+
+		hlist_move_list(&head, &d->release_list);
+		INIT_RCU_WORK(&d->rwork, defer_free_mounts);
+		queue_rcu_work(system_wq, &d->rwork);
+		return;
 	}
+
+out:
+	synchronize_rcu_expedited();
+	free_mounts(&head);
+}
+
+static inline void namespace_unlock(void)
+{
+	__namespace_unlock(false);
 }
 
 static inline void namespace_lock(void)
@@ -2056,7 +2094,7 @@ static int do_umount(struct mount *mnt, int flags)
 	}
 out:
 	unlock_mount_hash();
-	namespace_unlock();
+	__namespace_unlock(flags & MNT_DETACH);
 	return retval;
 }
 
-- 
2.49.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ