lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240806144628.874350-1-mjguzik@gmail.com>
Date: Tue,  6 Aug 2024 16:46:28 +0200
From: Mateusz Guzik <mjguzik@...il.com>
To: brauner@...nel.org
Cc: viro@...iv.linux.org.uk,
	jack@...e.cz,
	linux-kernel@...r.kernel.org,
	linux-fsdevel@...r.kernel.org,
	Mateusz Guzik <mjguzik@...il.com>
Subject: [PATCH] vfs: avoid spurious dentry ref/unref cycle on open

Opening a file grabs a reference on the terminal dentry in
__legitimize_path(), then another one in do_dentry_open() and finally
drops the initial reference in terminate_walk().

That's 2 modifications which don't need to be there -- do_dentry_open()
can consume the already held reference instead.

In order to facilitate some debugging a dedicated namei state flag was
added to denote this happened.

When benchmarking on a 20-core vm using will-it-scale's open3_processes
("Same file open/close"), the results are (ops/s):
before:	3087010
after:	4173977 (+35%)

Signed-off-by: Mateusz Guzik <mjguzik@...il.com>
---

The flag thing is optional and can be dropped, but I think the general
direction should be to add *more* asserts and whatnot (even if they are
to land separately). A debug-only variant would not hurt.

The entire "always consume regardless of error" ordeal stems from the
corner case where open succeeds with a fully populated file object and
then returns an error anyway. While odd, it does mean error handling
does not get more complicated at least.

 fs/internal.h |  3 ++-
 fs/namei.c    | 30 +++++++++++++++++++++++-------
 fs/open.c     | 31 ++++++++++++++++++++++++++++---
 3 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index cdd73209eecb..9eeb7e03f81d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -193,7 +193,8 @@ int chmod_common(const struct path *path, umode_t mode);
 int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
 		int flag);
 int chown_common(const struct path *path, uid_t user, gid_t group);
-extern int vfs_open(const struct path *, struct file *);
+int vfs_open_consume(const struct path *, struct file *);
+int vfs_open(const struct path *, struct file *);
 
 /*
  * inode.c
diff --git a/fs/namei.c b/fs/namei.c
index 1bf081959066..20c5823d34dc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -595,9 +595,10 @@ struct nameidata {
 	umode_t		dir_mode;
 } __randomize_layout;
 
-#define ND_ROOT_PRESET 1
-#define ND_ROOT_GRABBED 2
-#define ND_JUMPED 4
+#define ND_ROOT_PRESET		0x00000001
+#define ND_ROOT_GRABBED 	0x00000002
+#define ND_JUMPED 		0x00000004
+#define ND_PATH_CONSUMED 	0x00000008
 
 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 {
@@ -697,6 +698,7 @@ static void terminate_walk(struct nameidata *nd)
 			nd->state &= ~ND_ROOT_GRABBED;
 		}
 	} else {
+		BUG_ON(nd->state & ND_PATH_CONSUMED);
 		leave_rcu(nd);
 	}
 	nd->depth = 0;
@@ -3683,6 +3685,7 @@ static const char *open_last_lookups(struct nameidata *nd,
 static int do_open(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op)
 {
+	struct vfsmount *mnt;
 	struct mnt_idmap *idmap;
 	int open_flag = op->open_flag;
 	bool do_truncate;
@@ -3720,11 +3723,22 @@ static int do_open(struct nameidata *nd,
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			return error;
+		/*
+		 * We grab an additional reference here because vfs_open_consume()
+		 * may error out and free the mount from under us, while we need
+		 * to undo write access below.
+		 */
+		mnt = mntget(nd->path.mnt);
 		do_truncate = true;
 	}
 	error = may_open(idmap, &nd->path, acc_mode, open_flag);
-	if (!error && !(file->f_mode & FMODE_OPENED))
-		error = vfs_open(&nd->path, file);
+	if (!error && !(file->f_mode & FMODE_OPENED)) {
+		BUG_ON(nd->state & ND_PATH_CONSUMED);
+		error = vfs_open_consume(&nd->path, file);
+		nd->state |= ND_PATH_CONSUMED;
+		nd->path.mnt = NULL;
+		nd->path.dentry = NULL;
+	}
 	if (!error)
 		error = security_file_post_open(file, op->acc_mode);
 	if (!error && do_truncate)
@@ -3733,8 +3747,10 @@ static int do_open(struct nameidata *nd,
 		WARN_ON(1);
 		error = -EINVAL;
 	}
-	if (do_truncate)
-		mnt_drop_write(nd->path.mnt);
+	if (do_truncate) {
+		mnt_drop_write(mnt);
+		mntput(mnt);
+	}
 	return error;
 }
 
diff --git a/fs/open.c b/fs/open.c
index 22adbef7ecc2..eb69af3676e3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -905,6 +905,15 @@ static inline int file_get_write_access(struct file *f)
 	return error;
 }
 
+/*
+ * Populate struct file
+ *
+ * NOTE: it assumes f_path is populated and consumes the caller's reference.
+ *
+ * The caller must not path_put on it regardless of the error code -- the
+ * routine will either clean it up on its own or rely on fput, which must
+ * be issued anyway.
+ */
 static int do_dentry_open(struct file *f,
 			  int (*open)(struct inode *, struct file *))
 {
@@ -912,7 +921,6 @@ static int do_dentry_open(struct file *f,
 	struct inode *inode = f->f_path.dentry->d_inode;
 	int error;
 
-	path_get(&f->f_path);
 	f->f_inode = inode;
 	f->f_mapping = inode->i_mapping;
 	f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
@@ -1045,6 +1053,7 @@ int finish_open(struct file *file, struct dentry *dentry,
 	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
 
 	file->f_path.dentry = dentry;
+	path_get(&file->f_path);
 	return do_dentry_open(file, open);
 }
 EXPORT_SYMBOL(finish_open);
@@ -1077,15 +1086,19 @@ char *file_path(struct file *filp, char *buf, int buflen)
 EXPORT_SYMBOL(file_path);
 
 /**
- * vfs_open - open the file at the given path
+ * vfs_open_consume - open the file at the given path and consume the reference
  * @path: path to open
  * @file: newly allocated file with f_flag initialized
  */
-int vfs_open(const struct path *path, struct file *file)
+int vfs_open_consume(const struct path *path, struct file *file)
 {
 	int ret;
 
 	file->f_path = *path;
+	/*
+	 * do_dentry_open() does the referene consuming regardless of its return
+	 * value
+	 */
 	ret = do_dentry_open(file, NULL);
 	if (!ret) {
 		/*
@@ -1098,6 +1111,17 @@ int vfs_open(const struct path *path, struct file *file)
 	return ret;
 }
 
+/**
+ * vfs_open - open the file at the given path
+ * @path: path to open
+ * @file: newly allocated file with f_flag initialized
+ */
+int vfs_open(const struct path *path, struct file *file)
+{
+	path_get(path);
+	return vfs_open_consume(path, file);
+}
+
 struct file *dentry_open(const struct path *path, int flags,
 			 const struct cred *cred)
 {
@@ -1183,6 +1207,7 @@ struct file *kernel_file_open(const struct path *path, int flags,
 		return f;
 
 	f->f_path = *path;
+	path_get(&f->f_path);
 	error = do_dentry_open(f, NULL);
 	if (error) {
 		fput(f);
-- 
2.43.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ