[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <176169819048.1431292.632116172556190002.stgit@frogsfrogsfrogs>
Date: Tue, 28 Oct 2025 18:18:52 -0700
From: "Darrick J. Wong" <djwong@...nel.org>
To: tytso@....edu
Cc: linux-fsdevel@...r.kernel.org, joannelkoong@...il.com, bernd@...ernd.com,
 neal@...pa.dev, miklos@...redi.hu, linux-ext4@...r.kernel.org
Subject: [PATCH 1/7] libext2fs: fix MMP code to work with unixfd IO manager
From: Darrick J. Wong <djwong@...nel.org>
The MMP code wants to be able to read and write the MMP block directly
to storage so that the pagecache does not get in the way.  This is
critical for correct operation of MMP, because it is guarding against
two cluster nodes trying to change the filesystem at the same time.
Unfortunately there's no convenient way to tell an IO manager to perform
a particular IO in directio mode, so the MMP code open()s the filesystem
source device a second time so that it can set O_DIRECT and maintain its
own file position independently of the IO channel.  This is a gross
layering violation.
For unprivileged containerized fuse4fs, we're going to have a privileged
mount helper pass us the fd to the block device, so we'll be using the
unixfd IO manager.  Unfortunately, if the unixfd IO manager is in use,
the filesystem "source" will be a string representation of the fd
number, and MMP breaks.
Fix this (sort of) by detecting the unixfd IO manager and duplicating
the open fd if it's in use.  This adds a requirement that the unixfd
originally be opened in O_DIRECT mode if the filesystem is on a block
device, but that's the best we can do here.
Signed-off-by: "Darrick J. Wong" <djwong@...nel.org>
---
 lib/ext2fs/ext2fs.h |    1 +
 lib/ext2fs/mmp.c    |   82 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 82 insertions(+), 1 deletion(-)
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index 38d6074fdbbc87..23b0695a32d150 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -229,6 +229,7 @@ typedef struct ext2_file *ext2_file_t;
  * Internal flags for use by the ext2fs library only
  */
 #define EXT2_FLAG2_USE_FAKE_TIME	0x000000001
+#define EXT2_FLAG2_MMP_USE_IOCHANNEL	0x000000002
 
 /*
  * Special flag in the ext2 inode i_flag field that means that this is
diff --git a/lib/ext2fs/mmp.c b/lib/ext2fs/mmp.c
index e2823732e2b6a2..5e7c0be5a48aeb 100644
--- a/lib/ext2fs/mmp.c
+++ b/lib/ext2fs/mmp.c
@@ -26,6 +26,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#include <limits.h>
 
 #include "ext2fs/ext2_fs.h"
 #include "ext2fs/ext2fs.h"
@@ -48,6 +49,74 @@ errcode_t ext2fs_mmp_get_mem(ext2_filsys fs, void **ptr)
 	return ext2fs_get_memalign(fs->blocksize, align, ptr);
 }
 
+static int possibly_unixfd(ext2_filsys fs)
+{
+	char *endptr = NULL;
+
+	if (fs->io->manager == unixfd_io_manager)
+		return 1;
+
+	/*
+	 * Due to the possibility of stacking IO managers, it's possible that
+	 * there's a unixfd IO manager under all of this.  We can guess the
+	 * presence of one if the device_name is a string representation of an
+	 * integer (fd) number.
+	 */
+	errno = 0;
+	strtol(fs->device_name, &endptr, 10);
+	return !errno && endptr == fs->device_name + strlen(fs->device_name);
+}
+
+static int ext2fs_mmp_open_device(ext2_filsys fs, int flags)
+{
+	struct stat st;
+	int maybe_fd = -1;
+	int new_fd;
+	int want_directio = 1;
+	int ret;
+	errcode_t retval = 0;
+
+	/*
+	 * If the unixfd IO manager is in use, extract the fd number from the
+	 * unixfd IO manager so we can reuse it below.
+	 *
+	 * If that fails, fall back to opening the filesystem device, which is
+	 * the preferred method.
+	 */
+	if (possibly_unixfd(fs))
+		retval = io_channel_get_fd(fs->io, &maybe_fd);
+	if (retval || maybe_fd < 0)
+		return open(fs->device_name, flags);
+
+	/*
+	 * We extracted the fd from the unixfd IO manager.  Skip directio if
+	 * this is a regular file, just ext2fs_mmp_read does.
+	 */
+	ret = fstat(maybe_fd, &st);
+	if (ret == 0 && S_ISREG(st.st_mode))
+		want_directio = 0;
+
+	/* Duplicate the fd so that the MMP code can close it later */
+	new_fd = dup(maybe_fd);
+	if (new_fd < 0)
+		return -1;
+
+	/* Make sure we actually got directio if that's required */
+	if (want_directio) {
+		ret = fcntl(new_fd, F_GETFL);
+		if (ret < 0 || !(ret & O_DIRECT))
+			return -1;
+	}
+
+	/*
+	 * The MMP fd is a duplicate of the io channel fd, so we must use that
+	 * for all MMP block accesses because the two fds share the same file
+	 * position and O_DIRECT state.
+	 */
+	fs->flags2 |= EXT2_FLAG2_MMP_USE_IOCHANNEL;
+	return new_fd;
+}
+
 errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 {
 #ifdef CONFIG_MMP
@@ -77,7 +146,7 @@ errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 		    S_ISREG(st.st_mode))
 			flags &= ~O_DIRECT;
 
-		fs->mmp_fd = open(fs->device_name, flags);
+		fs->mmp_fd = ext2fs_mmp_open_device(fs, flags);
 		if (fs->mmp_fd < 0) {
 			retval = EXT2_ET_MMP_OPEN_DIRECT;
 			goto out;
@@ -90,6 +159,15 @@ errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 			return retval;
 	}
 
+	if (fs->flags2 & EXT2_FLAG2_MMP_USE_IOCHANNEL) {
+		retval = io_channel_read_blk64(fs->io, mmp_blk, -fs->blocksize,
+					       fs->mmp_cmp);
+		if (retval)
+			return retval;
+
+		goto read_compare;
+	}
+
 	if ((blk64_t) ext2fs_llseek(fs->mmp_fd, mmp_blk * fs->blocksize,
 				    SEEK_SET) !=
 	    mmp_blk * fs->blocksize) {
@@ -102,6 +180,7 @@ errcode_t ext2fs_mmp_read(ext2_filsys fs, blk64_t mmp_blk, void *buf)
 		goto out;
 	}
 
+read_compare:
 	mmp_cmp = fs->mmp_cmp;
 
 	if (!(fs->flags & EXT2_FLAG_IGNORE_CSUM_ERRORS) &&
@@ -428,6 +507,7 @@ errcode_t ext2fs_mmp_stop(ext2_filsys fs)
 
 mmp_error:
 	if (fs->mmp_fd > 0) {
+		fs->flags2 &= ~EXT2_FLAG2_MMP_USE_IOCHANNEL;
 		close(fs->mmp_fd);
 		fs->mmp_fd = -1;
 	}
Powered by blists - more mailing lists
 
