linux-ext4 - [PATCH 10/10] fuse2fs: use fuseblk mode for mounting filesystems

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <174786678871.1385354.15276136136470748216.stgit@frogsfrogsfrogs>
Date: Wed, 21 May 2025 15:47:43 -0700
From: "Darrick J. Wong" <djwong@...nel.org>
To: tytso@....edu
Cc: linux-ext4@...r.kernel.org
Subject: [PATCH 10/10] fuse2fs: use fuseblk mode for mounting filesystems

From: Darrick J. Wong <djwong@...nel.org>

So this is awkward.  Up until now, fuse2fs has opened the block device
in exclusive mode so that it can do all the superblock feature parsing
in main() prior to initiating the fuse connection.

However, in running fstests on fuse2fs, I noticed a weird unmount race
where the umount() syscall can return before the op_destroy function
gets called by libfuse.  This is problematic because fstests (and
probably users too) make a lot of assumptions about the block device
being openable after umount() completes.  The op_destroy function can
take some time to flush dirty blocks out of its pagecache, call fsync,
etc.

I poked around the kernel and libfuse and discovered that the kernel
fuse driver has two modes: anonymous and block device mode.  In block
device mode the kernel will send a FUSE_DESTROY command to userspace and
wait for libfuse to call our op_destroy function.  In anonymous mode,
the kernel closes the fuse device and completes the unmount, which means
that libfuse calls op_destroy after the unmount has gone away.

This is the root cause of _scratch_cycle_mount sporadically complaining
about the block device being in use.  The solution is to use block
device mode, but this means we have to move the libext2fs initialization
to op_init and we can no longer be the exclusive owner of the block
device.

Signed-off-by: "Darrick J. Wong" <djwong@...nel.org>
---
 misc/fuse2fs.c |  131 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 118 insertions(+), 13 deletions(-)


diff --git a/misc/fuse2fs.c b/misc/fuse2fs.c
index 7f9f230f37ed2b..51f703267462b4 100644
--- a/misc/fuse2fs.c
+++ b/misc/fuse2fs.c
@@ -164,6 +164,7 @@ struct fuse2fs {
 
 	int blocklog;
 	unsigned int blockmask;
+	int retcode;
 	unsigned long offset;
 	unsigned int next_generation;
 	unsigned long long cache_size;
@@ -715,6 +716,31 @@ static errcode_t open_fs(struct fuse2fs *ff, int libext2_flags)
 	return 0;
 }
 
+static errcode_t fs_on_bdev(struct fuse2fs *ff, int *is_bdev)
+{
+	struct stat statbuf;
+	ext2_filsys fs = ff->fs;
+	int fd;
+	errcode_t err;
+	int ret;
+
+	err = io_channel_fd(fs->io, &fd);
+	if (err) {
+		err_printf(ff, "%s\n",
+			   _("Cannot determine if this is a block device.\n"));
+		return err;
+	}
+
+	ret = fstat(fd, &statbuf);
+	if (ret) {
+		err_printf(ff, "%s\n", strerror(errno));
+		return ret;
+	}
+
+	*is_bdev = S_ISBLK(statbuf.st_mode);
+	return 0;
+}
+
 static errcode_t config_fs_cache(struct fuse2fs *ff)
 {
 	char buf[128];
@@ -854,9 +880,17 @@ static void op_destroy(void *p EXT2FS_ATTR((unused)))
 	ext2_filsys fs;
 	errcode_t err;
 
+	/* Can be null if op_init is given an incorrect fuse2fs */
+	if (!ff)
+		return;
 
 	FUSE2FS_CHECK_CONTEXT_NORET(ff);
+
+	/* Can be null if opening the filesystem failed */
+	if (!ff->fs)
+		return;
 	fs = ff->fs;
+
 	dbg_printf(ff, "%s: dev=%s\n", __func__, fs->device_name);
 	if (fs->flags & EXT2_FLAG_RW) {
 		fs->super->s_state |= EXT2_VALID_FS;
@@ -904,12 +938,12 @@ static void *op_init(struct fuse_conn_info *conn
 {
 	struct fuse_context *ctxt = fuse_get_context();
 	struct fuse2fs *ff = (struct fuse2fs *)ctxt->private_data;
-	ext2_filsys fs;
+	ext2_filsys fs = ff->fs;
 	errcode_t err;
+	int ret;
 
 	FUSE2FS_CHECK_CONTEXT_NULL(ff);
-	fs = ff->fs;
-	dbg_printf(ff, "%s: dev=%s\n", __func__, fs->device_name);
+	dbg_printf(ff, "%s: dev=%s\n", __func__, ff->device);
 #ifdef FUSE_CAP_IOCTL_DIR
 	conn->want |= FUSE_CAP_IOCTL_DIR;
 #endif
@@ -925,6 +959,46 @@ static void *op_init(struct fuse_conn_info *conn
 	cfg->use_ino = 1;
 	cfg->nullpath_ok = 1;
 #endif
+
+	/*
+	 * If the ext2_filsys object is null, then we are operating in fuseblk
+	 * mode and must reopen the filesystem.  If any of these steps fail,
+	 * tough.
+	 */
+	if (!fs) {
+		err = open_fs(ff, 0);
+		if (err)
+			goto mount_fail;
+		fs = ff->fs;
+
+		if (ff->cache_size) {
+			err = config_fs_cache(ff);
+			if (err)
+				goto mount_fail;
+		}
+
+		err = check_fs_supported(ff);
+		if (err)
+			goto mount_fail;
+
+		if (ext2fs_has_feature_shared_blocks(fs->super)) {
+			log_printf(ff, "%s\n",
+ _("shared file blocks, mounting filesystem read-only."));
+			fs->flags &= ~EXT2_FLAG_RW;
+		}
+
+		if (ff->norecovery) {
+			ret = check_norecovery(ff);
+			if (ret)
+				goto mount_fail;
+		}
+
+		err = mount_fs(ff);
+		if (err)
+			goto mount_fail;
+	}
+
+	/* Clear the valid flag so that an unclean shutdown forces a fsck */
 	if (fs->flags & EXT2_FLAG_RW) {
 		fs->super->s_mnt_count++;
 		ext2fs_set_tstamp(fs->super, s_mtime, time(NULL));
@@ -943,7 +1017,13 @@ static void *op_init(struct fuse_conn_info *conn
 		uuid_unparse(fs->super->s_uuid, uuid);
 		log_printf(ff, "%s %s.\n", _("mounted filesystem"), uuid);
 	}
+out:
 	return ff;
+mount_fail:
+	ff->retcode = 32;
+	/* Tear down the mount immediately. */
+	fuse_exit(ctxt->fuse);
+	goto out;
 }
 
 static int stat_inode(ext2_filsys fs, ext2_ino_t ino, struct stat *statbuf)
@@ -4663,6 +4743,8 @@ int main(int argc, char *argv[])
 	FILE *orig_stderr = stderr;
 	char *logfile;
 	char extra_args[BUFSIZ];
+	unsigned int blksize;
+	int is_bdev;
 	int ret = 0;
 
 	memset(&fctx, 0, sizeof(fctx));
@@ -4717,6 +4799,10 @@ int main(int argc, char *argv[])
 		fctx.alloc_all_blocks = 1;
 	}
 
+	/*
+	 * ext4 can't do COW of shared blocks, so if the feature is enabled,
+	 * we must force ro mode.
+	 */
 	err = open_fs(&fctx, EXT2_FLAG_EXCLUSIVE);
 	if (err) {
 		ret = 32;
@@ -4725,13 +4811,6 @@ int main(int argc, char *argv[])
 
 	if (!fctx.cache_size)
 		fctx.cache_size = default_cache_size();
-	if (fctx.cache_size) {
-		err = config_fs_cache(&fctx);
-		if (err) {
-			ret = 32;
-			goto out;
-		}
-	}
 
 	err = check_fs_supported(&fctx);
 	if (err) {
@@ -4754,17 +4833,40 @@ int main(int argc, char *argv[])
 		goto out;
 	}
 
+	err = fs_on_bdev(&fctx, &is_bdev);
+	if (err) {
+		ret = 32;
+		goto out;
+	}
+
+	blksize = fctx.fs->blocksize;
+
+	/*
+	 * If this is a block device, we want to close the fd, open the fuse
+	 * driver in fuseblk mode (which will reopen the block device) so that
+	 * unmount will wait until op_destroy completes.  If this is not a
+	 * block device, we cannot use fuseblk mode and should leave the
+	 * filesystem open.
+	 */
+	if (is_bdev)
+		close_fs(&fctx);
+
 	/* Initialize generation counter */
 	get_random_bytes(&fctx.next_generation, sizeof(unsigned int));
 
 	/* Set up default fuse parameters */
 	snprintf(extra_args, BUFSIZ, "-okernel_cache,subtype=%s,"
-		 "fsname=%s,attr_timeout=0" FUSE_PLATFORM_OPTS,
-		 get_subtype(argv[0]),
-		 fctx.device);
+		 "attr_timeout=0" FUSE_PLATFORM_OPTS,
+		 get_subtype(argv[0]));
 	if (fctx.no_default_opts == 0)
 		fuse_opt_add_arg(&args, extra_args);
 
+	if (is_bdev) {
+		snprintf(extra_args, BUFSIZ, "-ofsname=%s,blkdev,blksize=%u",
+			 fctx.device, blksize);
+		fuse_opt_add_arg(&args, extra_args);
+	}
+
 	if (fctx.ro)
 		fuse_opt_add_arg(&args, "-oro");
 
@@ -4824,6 +4926,9 @@ int main(int argc, char *argv[])
 		ret = 0;
 		break;
 	}
+
+	/* mount might have failed */
+	ret |= fctx.retcode;
 out:
 	if (ret & 1) {
 		fprintf(orig_stderr, "%s\n",