[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAJnrk1ZOLNytBdVqvWiHbwA0rE0KCVt09SmHFZ3pp_tffg+iaQ@mail.gmail.com>
Date: Wed, 21 Jan 2026 11:34:24 -0800
From: Joanne Koong <joannelkoong@...il.com>
To: "Darrick J. Wong" <djwong@...nel.org>
Cc: miklos@...redi.hu, bernd@...ernd.com, neal@...pa.dev,
linux-ext4@...r.kernel.org, linux-fsdevel@...r.kernel.org
Subject: Re: [PATCH 01/31] fuse: implement the basic iomap mechanisms
On Tue, Oct 28, 2025 at 5:45 PM Darrick J. Wong <djwong@...nel.org> wrote:
>
> From: Darrick J. Wong <djwong@...nel.org>
>
> Implement functions to enable upcalling of iomap_begin and iomap_end to
> userspace fuse servers.
>
> Signed-off-by: "Darrick J. Wong" <djwong@...nel.org>
> ---
> fs/fuse/fuse_i.h | 22 ++
> fs/fuse/iomap_i.h | 36 ++++
> include/uapi/linux/fuse.h | 90 +++++++++
> fs/fuse/Kconfig | 32 +++
> fs/fuse/Makefile | 1
> fs/fuse/file_iomap.c | 434 +++++++++++++++++++++++++++++++++++++++++++++
> fs/fuse/inode.c | 8 +
> 7 files changed, 621 insertions(+), 2 deletions(-)
> create mode 100644 fs/fuse/iomap_i.h
> create mode 100644 fs/fuse/file_iomap.c
>
>
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 7c7d255d817f1e..45be59df7ae592 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -929,6 +929,9 @@ struct fuse_conn {
> /* Is synchronous FUSE_INIT allowed? */
> unsigned int sync_init:1;
>
> + /* Enable fs/iomap for file operations */
> + unsigned int iomap:1;
> +
> /* Use io_uring for communication */
> unsigned int io_uring;
>
> @@ -1053,12 +1056,17 @@ static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
> return sb->s_fs_info;
> }
>
> +static inline const struct fuse_mount *get_fuse_mount_super_c(const struct super_block *sb)
> +{
> + return sb->s_fs_info;
> +}
I'm not seeing this getting used anywhere - did you mean to remove this?
> +
> static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
> {
> return get_fuse_mount_super(sb)->fc;
> }
>
> -static inline struct fuse_mount *get_fuse_mount(struct inode *inode)
> +static inline struct fuse_mount *get_fuse_mount(const struct inode *inode)
> {
> return get_fuse_mount_super(inode->i_sb);
> }
> @@ -1683,4 +1691,16 @@ extern void fuse_sysctl_unregister(void);
> #define fuse_sysctl_unregister() do { } while (0)
> #endif /* CONFIG_SYSCTL */
>
> +#if IS_ENABLED(CONFIG_FUSE_IOMAP)
> +bool fuse_iomap_enabled(void);
> +
> +static inline bool fuse_has_iomap(const struct inode *inode)
> +{
> + return get_fuse_conn(inode)->iomap;
> +}
> +#else
> +# define fuse_iomap_enabled(...) (false)
> +# define fuse_has_iomap(...) (false)
> +#endif
> +
> #endif /* _FS_FUSE_I_H */
> diff --git a/fs/fuse/iomap_i.h b/fs/fuse/iomap_i.h
> new file mode 100644
> index 00000000000000..d773f728579d1d
> --- /dev/null
> +++ b/fs/fuse/iomap_i.h
> @@ -0,0 +1,36 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2025 Oracle. All Rights Reserved.
> + * Author: Darrick J. Wong <djwong@...nel.org>
> + */
> +#ifndef _FS_FUSE_IOMAP_I_H
> +#define _FS_FUSE_IOMAP_I_H
> +
> +#if IS_ENABLED(CONFIG_FUSE_IOMAP)
> +#if IS_ENABLED(CONFIG_FUSE_IOMAP_DEBUG)
> +# define ASSERT(condition) do { \
> + int __cond = !!(condition); \
> + WARN(!__cond, "Assertion failed: %s, func: %s, line: %d", #condition, __func__, __LINE__); \
> +} while (0)
> +# define BAD_DATA(condition) ({ \
> + int __cond = !!(condition); \
> + WARN(__cond, "Bad mapping: %s, func: %s, line: %d", #condition, __func__, __LINE__); \
> +})
> +#else
> +# define ASSERT(condition)
> +# define BAD_DATA(condition) ({ \
> + int __cond = !!(condition); \
> + unlikely(__cond); \
> +})
> +#endif /* CONFIG_FUSE_IOMAP_DEBUG */
> +
> +enum fuse_iomap_iodir {
> + READ_MAPPING,
> + WRITE_MAPPING,
> +};
> +
> +#define EFSCORRUPTED EUCLEAN
> +
> +#endif /* CONFIG_FUSE_IOMAP */
> +
> +#endif /* _FS_FUSE_IOMAP_I_H */
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 18713cfaf09171..7d709cf12b41a7 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -240,6 +240,9 @@
> * - add FUSE_COPY_FILE_RANGE_64
> * - add struct fuse_copy_file_range_out
> * - add FUSE_NOTIFY_PRUNE
> + *
> + * 7.99
Should this be changed to something like 7.46 now that this patch is
submitted for merging into the tree?
> + * - add FUSE_IOMAP and iomap_{begin,end,ioend} for regular file operations
> */
>
> #ifndef _LINUX_FUSE_H
> @@ -275,7 +278,7 @@
> #define FUSE_KERNEL_VERSION 7
>
> /** Minor version number of this interface */
> -#define FUSE_KERNEL_MINOR_VERSION 45
> +#define FUSE_KERNEL_MINOR_VERSION 99
Same question here
>
> /** The node ID of the root inode */
> #define FUSE_ROOT_ID 1
> @@ -448,6 +451,7 @@ struct fuse_file_lock {
> * FUSE_OVER_IO_URING: Indicate that client supports io-uring
> * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests.
> * init_out.request_timeout contains the timeout (in secs)
> + * FUSE_IOMAP: Client supports iomap for regular file operations.
> */
> #define FUSE_ASYNC_READ (1 << 0)
> #define FUSE_POSIX_LOCKS (1 << 1)
> @@ -495,6 +499,7 @@ struct fuse_file_lock {
> #define FUSE_ALLOW_IDMAP (1ULL << 40)
> #define FUSE_OVER_IO_URING (1ULL << 41)
> #define FUSE_REQUEST_TIMEOUT (1ULL << 42)
> +#define FUSE_IOMAP (1ULL << 43)
>
> /**
> * CUSE INIT request/reply flags
> @@ -664,6 +669,9 @@ enum fuse_opcode {
> FUSE_STATX = 52,
> FUSE_COPY_FILE_RANGE_64 = 53,
>
> + FUSE_IOMAP_BEGIN = 4094,
> + FUSE_IOMAP_END = 4095,
> +
> /* CUSE specific operations */
> CUSE_INIT = 4096,
>
> @@ -1314,4 +1322,84 @@ struct fuse_uring_cmd_req {
> uint8_t padding[6];
> };
>
> +/* mapping types; see corresponding IOMAP_TYPE_ */
> +#define FUSE_IOMAP_TYPE_HOLE (0)
> +#define FUSE_IOMAP_TYPE_DELALLOC (1)
> +#define FUSE_IOMAP_TYPE_MAPPED (2)
> +#define FUSE_IOMAP_TYPE_UNWRITTEN (3)
> +#define FUSE_IOMAP_TYPE_INLINE (4)
> +
> +/* fuse-specific mapping type indicating that writes use the read mapping */
> +#define FUSE_IOMAP_TYPE_PURE_OVERWRITE (255)
> +
> +#define FUSE_IOMAP_DEV_NULL (0U) /* null device cookie */
> +
> +/* mapping flags passed back from iomap_begin; see corresponding IOMAP_F_ */
> +#define FUSE_IOMAP_F_NEW (1U << 0)
> +#define FUSE_IOMAP_F_DIRTY (1U << 1)
> +#define FUSE_IOMAP_F_SHARED (1U << 2)
> +#define FUSE_IOMAP_F_MERGED (1U << 3)
> +#define FUSE_IOMAP_F_BOUNDARY (1U << 4)
> +#define FUSE_IOMAP_F_ANON_WRITE (1U << 5)
> +#define FUSE_IOMAP_F_ATOMIC_BIO (1U << 6)
Do you think it makes sense to have the fuse iomap constants mirror
the in-kernel iomap ones? Maybe I'm mistaken but it seems like the
fuse iomap capabilities won't diverge too much from fs/iomap ones? I
like that if they're mirrored, then it makes it simpler instead of
needing to convert back and forth.
> +
> +/* fuse-specific mapping flag asking for ->iomap_end call */
> +#define FUSE_IOMAP_F_WANT_IOMAP_END (1U << 7)
> +
> +/* mapping flags passed to iomap_end */
> +#define FUSE_IOMAP_F_SIZE_CHANGED (1U << 8)
> +#define FUSE_IOMAP_F_STALE (1U << 9)
> +
> +/* operation flags from iomap; see corresponding IOMAP_* */
> +#define FUSE_IOMAP_OP_WRITE (1U << 0)
> +#define FUSE_IOMAP_OP_ZERO (1U << 1)
> +#define FUSE_IOMAP_OP_REPORT (1U << 2)
> +#define FUSE_IOMAP_OP_FAULT (1U << 3)
> +#define FUSE_IOMAP_OP_DIRECT (1U << 4)
> +#define FUSE_IOMAP_OP_NOWAIT (1U << 5)
> +#define FUSE_IOMAP_OP_OVERWRITE_ONLY (1U << 6)
> +#define FUSE_IOMAP_OP_UNSHARE (1U << 7)
> +#define FUSE_IOMAP_OP_DAX (1U << 8)
> +#define FUSE_IOMAP_OP_ATOMIC (1U << 9)
> +#define FUSE_IOMAP_OP_DONTCACHE (1U << 10)
> +
> +#define FUSE_IOMAP_NULL_ADDR (-1ULL) /* addr is not valid */
> +
> +struct fuse_iomap_io {
> + uint64_t offset; /* file offset of mapping, bytes */
> + uint64_t length; /* length of mapping, bytes */
> + uint64_t addr; /* disk offset of mapping, bytes */
> + uint16_t type; /* FUSE_IOMAP_TYPE_* */
> + uint16_t flags; /* FUSE_IOMAP_F_* */
> + uint32_t dev; /* device cookie */
Do you think it's a good idea to add a reserved field here in case we
end up needing it in the future?
> +};
> +
> +struct fuse_iomap_begin_in {
> + uint32_t opflags; /* FUSE_IOMAP_OP_* */
> + uint32_t reserved; /* zero */
> + uint64_t attr_ino; /* matches fuse_attr:ino */
> + uint64_t pos; /* file position, in bytes */
> + uint64_t count; /* operation length, in bytes */
> +};
> +
> +struct fuse_iomap_begin_out {
> + /* read file data from here */
> + struct fuse_iomap_io read;
> +
> + /* write file data to here, if applicable */
> + struct fuse_iomap_io write;
Same question here
> +};
> +
> +struct fuse_iomap_end_in {
> + uint32_t opflags; /* FUSE_IOMAP_OP_* */
> + uint32_t reserved; /* zero */
> + uint64_t attr_ino; /* matches fuse_attr:ino */
> + uint64_t pos; /* file position, in bytes */
> + uint64_t count; /* operation length, in bytes */
> + int64_t written; /* bytes processed */
On the fs/iomap side, I see that written is passed through by
iomap_iter() to ->iomap_end through 'ssize_t advanced' but it's not
clear to me why advanced needs to be signed. I think it used to also
represent the error status, but it looks like now that's represented
through iter->status and 'advanced' strictly reflects the number of
bytes written. As such, do you think it makes sense to change
'advanced' to loff_t and have written be uint64_t instead?
> +
> + /* mapping that the kernel acted upon */
> + struct fuse_iomap_io map;
> +};
> +
> #endif /* _LINUX_FUSE_H */
> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
> index 290d1c09e0b924..934d48076a010c 100644
> --- a/fs/fuse/Kconfig
> +++ b/fs/fuse/Kconfig
> @@ -69,6 +69,38 @@ config FUSE_PASSTHROUGH
> config FUSE_BACKING
> bool
>
> +config FUSE_IOMAP
> + bool "FUSE file IO over iomap"
> + default y
> + depends on FUSE_FS
> + depends on BLOCK
> + select FS_IOMAP
> + help
> + Enable fuse servers to operate the regular file I/O path through
> + the fs-iomap library in the kernel. This enables higher performance
> + userspace filesystems by keeping the performance critical parts in
> + the kernel while delegating the difficult metadata parsing parts to
> + an easily-contained userspace program.
> +
> + This feature is considered EXPERIMENTAL. Use with caution!
> +
> + If unsure, say N.
> +
> +config FUSE_IOMAP_BY_DEFAULT
> + bool "FUSE file I/O over iomap by default"
> + default n
> + depends on FUSE_IOMAP
> + help
> + Enable sending FUSE file I/O over iomap by default.
I'm not really sure what the general linux preference is for adding
new configs, but assuming it errs towards less configs than more, imo
it seems easy enough to just set the enable_iomap module param to true
manually instead of needing this config for it, especially since the
param only needs to be set once.
> +
> +config FUSE_IOMAP_DEBUG
> + bool "Debug FUSE file IO over iomap"
> + default y
> + depends on FUSE_IOMAP
> + help
> + Enable debugging assertions for the fuse iomap code paths and logging
> + of bad iomap file mapping data being sent to the kernel.
I'm wondering if it makes sense to make this a general FUSE_DEBUG
config so we can reuse this more generally
> +
> config FUSE_IO_URING
> bool "FUSE communication over io-uring"
> default y
> diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
> index 46041228e5be2c..27be39317701d6 100644
> --- a/fs/fuse/Makefile
> +++ b/fs/fuse/Makefile
> @@ -18,5 +18,6 @@ fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
> fuse-$(CONFIG_FUSE_BACKING) += backing.o
> fuse-$(CONFIG_SYSCTL) += sysctl.o
> fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
> +fuse-$(CONFIG_FUSE_IOMAP) += file_iomap.o
>
> virtiofs-y := virtio_fs.o
> diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c
> new file mode 100644
> index 00000000000000..d564d60d0f1779
> --- /dev/null
> +++ b/fs/fuse/file_iomap.c
> @@ -0,0 +1,434 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2025 Oracle. All Rights Reserved.
> + * Author: Darrick J. Wong <djwong@...nel.org>
> + */
> +#include <linux/iomap.h>
> +#include "fuse_i.h"
> +#include "fuse_trace.h"
> +#include "iomap_i.h"
> +
> +static bool __read_mostly enable_iomap =
> +#if IS_ENABLED(CONFIG_FUSE_IOMAP_BY_DEFAULT)
> + true;
> +#else
> + false;
> +#endif
> +module_param(enable_iomap, bool, 0644);
> +MODULE_PARM_DESC(enable_iomap, "Enable file I/O through iomap");
> +
> +bool fuse_iomap_enabled(void)
> +{
> + /* Don't let anyone touch iomap until the end of the patchset. */
> + return false;
> +
> + /*
> + * There are fears that a fuse+iomap server could somehow DoS the
> + * system by doing things like going out to lunch during a writeback
> + * related iomap request. Only allow iomap access if the fuse server
> + * has rawio capabilities since those processes can mess things up
> + * quite well even without our help.
> + */
> + return enable_iomap && has_capability_noaudit(current, CAP_SYS_RAWIO);
> +}
> +
> +/* Convert IOMAP_* mapping types to FUSE_IOMAP_TYPE_* */
> +#define XMAP(word) \
> + case IOMAP_##word: \
> + return FUSE_IOMAP_TYPE_##word
> +static inline uint16_t fuse_iomap_type_to_server(uint16_t iomap_type)
> +{
> + switch (iomap_type) {
> + XMAP(HOLE);
> + XMAP(DELALLOC);
> + XMAP(MAPPED);
> + XMAP(UNWRITTEN);
> + XMAP(INLINE);
> + default:
> + ASSERT(0);
> + }
> + return 0;
> +}
> +#undef XMAP
> +
> +/* Convert FUSE_IOMAP_TYPE_* to IOMAP_* mapping types */
> +#define XMAP(word) \
> + case FUSE_IOMAP_TYPE_##word: \
> + return IOMAP_##word
> +static inline uint16_t fuse_iomap_type_from_server(uint16_t fuse_type)
> +{
> + switch (fuse_type) {
> + XMAP(HOLE);
> + XMAP(DELALLOC);
> + XMAP(MAPPED);
> + XMAP(UNWRITTEN);
> + XMAP(INLINE);
> + default:
> + ASSERT(0);
> + }
> + return 0;
> +}
> +#undef XMAP
> +
> +/* Validate FUSE_IOMAP_TYPE_* */
> +static inline bool fuse_iomap_check_type(uint16_t fuse_type)
> +{
> + switch (fuse_type) {
> + case FUSE_IOMAP_TYPE_HOLE:
> + case FUSE_IOMAP_TYPE_DELALLOC:
> + case FUSE_IOMAP_TYPE_MAPPED:
> + case FUSE_IOMAP_TYPE_UNWRITTEN:
> + case FUSE_IOMAP_TYPE_INLINE:
> + case FUSE_IOMAP_TYPE_PURE_OVERWRITE:
> + return true;
> + }
> +
> + return false;
> +}
> +
> +#define FUSE_IOMAP_F_ALL (FUSE_IOMAP_F_NEW | \
> + FUSE_IOMAP_F_DIRTY | \
> + FUSE_IOMAP_F_SHARED | \
> + FUSE_IOMAP_F_MERGED | \
> + FUSE_IOMAP_F_BOUNDARY | \
> + FUSE_IOMAP_F_ANON_WRITE | \
> + FUSE_IOMAP_F_ATOMIC_BIO | \
> + FUSE_IOMAP_F_WANT_IOMAP_END)
> +
> +static inline bool fuse_iomap_check_flags(uint16_t flags)
> +{
> + return (flags & ~FUSE_IOMAP_F_ALL) == 0;
> +}
> +
> +/* Convert IOMAP_F_* mapping state flags to FUSE_IOMAP_F_* */
> +#define XMAP(word) \
> + if (iomap_f_flags & IOMAP_F_##word) \
> + ret |= FUSE_IOMAP_F_##word
> +#define YMAP(iword, oword) \
> + if (iomap_f_flags & IOMAP_F_##iword) \
> + ret |= FUSE_IOMAP_F_##oword
> +static inline uint16_t fuse_iomap_flags_to_server(uint16_t iomap_f_flags)
> +{
> + uint16_t ret = 0;
> +
> + XMAP(NEW);
> + XMAP(DIRTY);
> + XMAP(SHARED);
> + XMAP(MERGED);
> + XMAP(BOUNDARY);
> + XMAP(ANON_WRITE);
> + XMAP(ATOMIC_BIO);
> + YMAP(PRIVATE, WANT_IOMAP_END);
> +
> + XMAP(SIZE_CHANGED);
> + XMAP(STALE);
> +
> + return ret;
> +}
> +#undef YMAP
> +#undef XMAP
> +
> +/* Convert FUSE_IOMAP_F_* to IOMAP_F_* mapping state flags */
> +#define XMAP(word) \
> + if (fuse_f_flags & FUSE_IOMAP_F_##word) \
> + ret |= IOMAP_F_##word
> +#define YMAP(iword, oword) \
> + if (fuse_f_flags & FUSE_IOMAP_F_##iword) \
> + ret |= IOMAP_F_##oword
> +static inline uint16_t fuse_iomap_flags_from_server(uint16_t fuse_f_flags)
> +{
> + uint16_t ret = 0;
> +
> + XMAP(NEW);
> + XMAP(DIRTY);
> + XMAP(SHARED);
> + XMAP(MERGED);
> + XMAP(BOUNDARY);
> + XMAP(ANON_WRITE);
> + XMAP(ATOMIC_BIO);
> + YMAP(WANT_IOMAP_END, PRIVATE);
> +
> + return ret;
> +}
> +#undef YMAP
> +#undef XMAP
> +
> +/* Convert IOMAP_* operation flags to FUSE_IOMAP_OP_* */
> +#define XMAP(word) \
> + if (iomap_op_flags & IOMAP_##word) \
> + ret |= FUSE_IOMAP_OP_##word
> +static inline uint32_t fuse_iomap_op_to_server(unsigned iomap_op_flags)
> +{
> + uint32_t ret = 0;
> +
> + XMAP(WRITE);
> + XMAP(ZERO);
> + XMAP(REPORT);
> + XMAP(FAULT);
> + XMAP(DIRECT);
> + XMAP(NOWAIT);
> + XMAP(OVERWRITE_ONLY);
> + XMAP(UNSHARE);
> + XMAP(DAX);
> + XMAP(ATOMIC);
> + XMAP(DONTCACHE);
> +
> + return ret;
> +}
> +#undef XMAP
> +
> +/* Validate an iomap mapping. */
> +static inline bool fuse_iomap_check_mapping(const struct inode *inode,
> + const struct fuse_iomap_io *map,
> + enum fuse_iomap_iodir iodir)
> +{
> + const unsigned int blocksize = i_blocksize(inode);
> + uint64_t end;
> +
> + /* Type and flags must be known */
> + if (BAD_DATA(!fuse_iomap_check_type(map->type)))
> + return false;
> + if (BAD_DATA(!fuse_iomap_check_flags(map->flags)))
> + return false;
> +
> + /* No zero-length mappings */
> + if (BAD_DATA(map->length == 0))
> + return false;
> +
> + /* File range must be aligned to blocksize */
> + if (BAD_DATA(!IS_ALIGNED(map->offset, blocksize)))
> + return false;
> + if (BAD_DATA(!IS_ALIGNED(map->length, blocksize)))
> + return false;
> +
> + /* No overflows in the file range */
> + if (BAD_DATA(check_add_overflow(map->offset, map->length, &end)))
> + return false;
> +
> + /* File range cannot start past maxbytes */
> + if (BAD_DATA(map->offset >= inode->i_sb->s_maxbytes))
> + return false;
> +
> + switch (map->type) {
> + case FUSE_IOMAP_TYPE_MAPPED:
> + case FUSE_IOMAP_TYPE_UNWRITTEN:
> + /* Mappings backed by space must have a device/addr */
> + if (BAD_DATA(map->dev == FUSE_IOMAP_DEV_NULL))
> + return false;
> + if (BAD_DATA(map->addr == FUSE_IOMAP_NULL_ADDR))
> + return false;
> + break;
> + case FUSE_IOMAP_TYPE_DELALLOC:
> + case FUSE_IOMAP_TYPE_HOLE:
> + case FUSE_IOMAP_TYPE_INLINE:
> + /* Mappings not backed by space cannot have a device addr. */
> + if (BAD_DATA(map->dev != FUSE_IOMAP_DEV_NULL))
> + return false;
> + if (BAD_DATA(map->addr != FUSE_IOMAP_NULL_ADDR))
> + return false;
> + break;
> + case FUSE_IOMAP_TYPE_PURE_OVERWRITE:
> + /* "Pure overwrite" only allowed for write mapping */
> + if (BAD_DATA(iodir != WRITE_MAPPING))
> + return false;
> + break;
> + default:
> + /* should have been caught already */
> + ASSERT(0);
> + return false;
> + }
> +
> + /* XXX: we don't support devices yet */
> + if (BAD_DATA(map->dev != FUSE_IOMAP_DEV_NULL))
> + return false;
> +
> + /* No overflows in the device range, if supplied */
> + if (map->addr != FUSE_IOMAP_NULL_ADDR &&
> + BAD_DATA(check_add_overflow(map->addr, map->length, &end)))
> + return false;
> +
> + return true;
> +}
> +
> +/* Convert a mapping from the server into something the kernel can use */
> +static inline void fuse_iomap_from_server(struct inode *inode,
Maybe worth adding a const in front of struct inode?
> + struct iomap *iomap,
> + const struct fuse_iomap_io *fmap)
> +{
> + iomap->addr = fmap->addr;
> + iomap->offset = fmap->offset;
> + iomap->length = fmap->length;
> + iomap->type = fuse_iomap_type_from_server(fmap->type);
> + iomap->flags = fuse_iomap_flags_from_server(fmap->flags);
> + iomap->bdev = inode->i_sb->s_bdev; /* XXX */
> +}
> +
> +/* Convert a mapping from the kernel into something the server can use */
> +static inline void fuse_iomap_to_server(struct fuse_iomap_io *fmap,
> + const struct iomap *iomap)
> +{
> + fmap->addr = FUSE_IOMAP_NULL_ADDR; /* XXX */
> + fmap->offset = iomap->offset;
> + fmap->length = iomap->length;
> + fmap->type = fuse_iomap_type_to_server(iomap->type);
> + fmap->flags = fuse_iomap_flags_to_server(iomap->flags);
> + fmap->dev = FUSE_IOMAP_DEV_NULL; /* XXX */
AFAICT, this only gets used for sending the FUSE_IOMAP_END request. Is
passing the iomap->addr to fmap->addr and inode->i_sb->s_bdev to
fmap->dev not useful to the server here?
Also, did you mean to leave in the /* XXX */ comments?
> +}
> +
> +/* Check the incoming _begin mappings to make sure they're not nonsense. */
> +static inline int
> +fuse_iomap_begin_validate(const struct inode *inode,
> + unsigned opflags, loff_t pos,
> + const struct fuse_iomap_begin_out *outarg)
> +{
> + /* Make sure the mappings aren't garbage */
> + if (!fuse_iomap_check_mapping(inode, &outarg->read, READ_MAPPING))
> + return -EFSCORRUPTED;
> +
> + if (!fuse_iomap_check_mapping(inode, &outarg->write, WRITE_MAPPING))
> + return -EFSCORRUPTED;
> +
> + /*
> + * Must have returned a mapping for at least the first byte in the
> + * range. The main mapping check already validated that the length
> + * is nonzero and there is no overflow in computing end.
> + */
> + if (BAD_DATA(outarg->read.offset > pos))
> + return -EFSCORRUPTED;
> + if (BAD_DATA(outarg->write.offset > pos))
> + return -EFSCORRUPTED;
> +
> + if (BAD_DATA(outarg->read.offset + outarg->read.length <= pos))
> + return -EFSCORRUPTED;
> + if (BAD_DATA(outarg->write.offset + outarg->write.length <= pos))
> + return -EFSCORRUPTED;
> +
> + return 0;
> +}
> +
> +static inline bool fuse_is_iomap_file_write(unsigned int opflags)
> +{
> + return opflags & (IOMAP_WRITE | IOMAP_ZERO | IOMAP_UNSHARE);
> +}
> +
> +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count,
> + unsigned opflags, struct iomap *iomap,
> + struct iomap *srcmap)
> +{
> + struct fuse_inode *fi = get_fuse_inode(inode);
> + struct fuse_iomap_begin_in inarg = {
> + .attr_ino = fi->orig_ino,
> + .opflags = fuse_iomap_op_to_server(opflags),
> + .pos = pos,
> + .count = count,
> + };
> + struct fuse_iomap_begin_out outarg = { };
> + struct fuse_mount *fm = get_fuse_mount(inode);
> + FUSE_ARGS(args);
> + int err;
> +
> + args.opcode = FUSE_IOMAP_BEGIN;
> + args.nodeid = get_node_id(inode);
> + args.in_numargs = 1;
> + args.in_args[0].size = sizeof(inarg);
> + args.in_args[0].value = &inarg;
> + args.out_numargs = 1;
> + args.out_args[0].size = sizeof(outarg);
> + args.out_args[0].value = &outarg;
> + err = fuse_simple_request(fm, &args);
> + if (err)
> + return err;
> +
> + err = fuse_iomap_begin_validate(inode, opflags, pos, &outarg);
> + if (err)
> + return err;
> +
> + if (fuse_is_iomap_file_write(opflags) &&
> + outarg.write.type != FUSE_IOMAP_TYPE_PURE_OVERWRITE) {
> + /*
> + * For an out of place write, we must supply the write mapping
> + * via @iomap, and the read mapping via @srcmap.
> + */
> + fuse_iomap_from_server(inode, iomap, &outarg.write);
> + fuse_iomap_from_server(inode, srcmap, &outarg.read);
> + } else {
> + /*
> + * For everything else (reads, reporting, and pure overwrites),
> + * we can return the sole mapping through @iomap and leave
> + * @srcmap unchanged from its default (HOLE).
> + */
> + fuse_iomap_from_server(inode, iomap, &outarg.read);
> + }
> +
> + return 0;
> +}
> +
> +/* Decide if we send FUSE_IOMAP_END to the fuse server */
> +static bool fuse_should_send_iomap_end(const struct iomap *iomap,
> + unsigned int opflags, loff_t count,
> + ssize_t written)
> +{
> + /* fuse server demanded an iomap_end call. */
> + if (iomap->flags & FUSE_IOMAP_F_WANT_IOMAP_END)
> + return true;
> +
> + /* Reads and reporting should never affect the filesystem metadata */
> + if (!fuse_is_iomap_file_write(opflags))
> + return false;
> +
> + /* Appending writes get an iomap_end call */
> + if (iomap->flags & IOMAP_F_SIZE_CHANGED)
> + return true;
> +
> + /* Short writes get an iomap_end call to clean up delalloc */
> + return written < count;
> +}
> +
> +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t count,
> + ssize_t written, unsigned opflags,
> + struct iomap *iomap)
> +{
> + struct fuse_inode *fi = get_fuse_inode(inode);
> + struct fuse_mount *fm = get_fuse_mount(inode);
> + int err = 0;
> +
> + if (fuse_should_send_iomap_end(iomap, opflags, count, written)) {
> + struct fuse_iomap_end_in inarg = {
> + .opflags = fuse_iomap_op_to_server(opflags),
> + .attr_ino = fi->orig_ino,
> + .pos = pos,
> + .count = count,
> + .written = written,
> + };
> + FUSE_ARGS(args);
> +
> + fuse_iomap_to_server(&inarg.map, iomap);
> +
> + args.opcode = FUSE_IOMAP_END;
> + args.nodeid = get_node_id(inode);
Just curious about this - does it make sense to set args.force here
for this opcode? It seems like it serves the same sort of purpose a
flush request (which sets args.force) does?
> + args.in_numargs = 1;
> + args.in_args[0].size = sizeof(inarg);
> + args.in_args[0].value = &inarg;
> + err = fuse_simple_request(fm, &args);
> + switch (err) {
> + case -ENOSYS:
> + /*
> + * libfuse returns ENOSYS for servers that don't
> + * implement iomap_end
> + */
> + err = 0;
> + break;
> + case 0:
> + break;
Is this case 0 needed separately from the default case?
Thanks,
Joanne
> + default:
> + break;
> + }
> + }
> +
> + return err;
> +}
> +
> +const struct iomap_ops fuse_iomap_ops = {
> + .iomap_begin = fuse_iomap_begin,
> + .iomap_end = fuse_iomap_end,
> +};
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 0cac7164afa298..1eea8dc6e723c6 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -1457,6 +1457,12 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
>
> if (flags & FUSE_REQUEST_TIMEOUT)
> timeout = arg->request_timeout;
> +
> + if ((flags & FUSE_IOMAP) && fuse_iomap_enabled()) {
> + fc->iomap = 1;
> + pr_warn(
> + "EXPERIMENTAL iomap feature enabled. Use at your own risk!");
> + }
> } else {
> ra_pages = fc->max_read / PAGE_SIZE;
> fc->no_lock = 1;
> @@ -1525,6 +1531,8 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm)
> */
> if (fuse_uring_enabled())
> flags |= FUSE_OVER_IO_URING;
> + if (fuse_iomap_enabled())
> + flags |= FUSE_IOMAP;
>
> ia->in.flags = flags;
> ia->in.flags2 = flags >> 32;
>
Powered by blists - more mailing lists