[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ca86fc29-b0fe-4e23-94b3-76015a95b64f@fastmail.fm>
Date: Fri, 5 Jul 2024 13:50:01 +0200
From: Bernd Schubert <bernd.schubert@...tmail.fm>
To: Jingbo Xu <jefflexu@...ux.alibaba.com>, miklos@...redi.hu,
linux-fsdevel@...r.kernel.org
Cc: linux-kernel@...r.kernel.org
Subject: Re: [PATCH] fuse: make foffset alignment opt-in for optimum backend
performance
On 7/5/24 12:04, Jingbo Xu wrote:
> Sometimes the file offset alignment needs to be opt-in to achieve the
> optimum performance at the backend store.
>
> For example when ErasureCode [1] is used at the backend store, the
> optimum write performance is achieved when the WRITE request is aligned
> with the stripe size of ErasureCode. Otherwise a non-aligned WRITE
> request needs to be split at the stripe size boundary. It is quite
> costly to handle these split partial requests, as firstly the whole
> stripe to which the split partial request belongs needs to be read out,
> then overwrite the read stripe buffer with the request, and finally write
> the whole stripe back to the persistent storage.
>
> Thus the backend store can suffer severe performance degradation when
> WRITE requests can not fit into one stripe exactly. The write performance
> can be 10x slower when the request is 256KB in size given 4MB stripe size.
> Also there can be 50% performance degradation in theory if the request
> is not stripe boundary aligned.
>
> Besides, the conveyed test indicates that, the non-alignment issue
> becomes more severe when decreasing fuse's max_ratio, maybe partly
> because the background writeback now is more likely to run parallelly
> with the dirtier.
>
> fuse's max_ratio ratio of aligned WRITE requests
> ---------------- -------------------------------
> 70 99.9%
> 40 74%
> 20 45%
> 10 20%
>
> With the patched version, which makes the alignment constraint opt-in
> when constructing WRITE requests, the ratio of aligned WRITE requests
> increases to 98% (previously 20%) when fuse's max_ratio is 10.
>
> [1] https://lore.kernel.org/linux-fsdevel/20240124070512.52207-1-jefflexu@linux.alibaba.com/T/#m9bce469998ea6e4f911555c6f7be1e077ce3d8b4
> Signed-off-by: Jingbo Xu <jefflexu@...ux.alibaba.com>
>
> Signed-off-by: Jingbo Xu <jefflexu@...ux.alibaba.com>
> ---
> fs/fuse/file.c | 4 ++++
> fs/fuse/fuse_i.h | 6 ++++++
> fs/fuse/inode.c | 9 +++++++++
> include/uapi/linux/fuse.h | 9 ++++++++-
> 4 files changed, 27 insertions(+), 1 deletion(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index f39456c65ed7..f9b477016c2e 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -2246,6 +2246,10 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
> if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
> return true;
>
> + /* Reached alignment */
> + if (fc->opt_alignment && !(page->index % fc->opt_alignment_pages))
> + return true;
I link the idea, but couldn't it just do that with
fc->max_pages? I'm asking because fc->opt_alignment_pages
takes another uint32_t in fuse_init_out and there is not so much
space left anymore.
> +
> return false;
> }
>
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index f23919610313..5963571b394c 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -860,6 +860,9 @@ struct fuse_conn {
> /** Passthrough support for read/write IO */
> unsigned int passthrough:1;
>
> + /* Foffset alignment required for optimum performance */
> + unsigned int opt_alignment:1;
> +
> /** Maximum stack depth for passthrough backing files */
> int max_stack_depth;
>
> @@ -917,6 +920,9 @@ struct fuse_conn {
> /** IDR for backing files ids */
> struct idr backing_files_map;
> #endif
> +
> + /* The foffset alignment in PAGE_SIZE */
> + unsigned int opt_alignment_pages;
> };
>
> /*
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 99e44ea7d875..9266b22cce8e 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -1331,6 +1331,15 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> }
> if (flags & FUSE_NO_EXPORT_SUPPORT)
> fm->sb->s_export_op = &fuse_export_fid_operations;
> +
> + /* fallback to default if opt_alignment <= PAGE_SHIFT */
> + if (flags & FUSE_OPT_ALIGNMENT) {
> + if (arg->opt_alignment > PAGE_SHIFT) {
> + fc->opt_alignment = 1;
> + fc->opt_alignment_pages = 1 <<
> + (arg->opt_alignment - PAGE_SHIFT);
opt_alignment is the number of bits required for alignment? Not
very user friendly, from my point of view would be better to have
this in a byte or kb unit.
Thanks,
Bernd
> + }
> + }
> } else {
> ra_pages = fc->max_read / PAGE_SIZE;
> fc->no_lock = 1;
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index d08b99d60f6f..2c6ad1577591 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -217,6 +217,9 @@
> * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag
> * - add FUSE_NO_EXPORT_SUPPORT init flag
> * - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag
> + *
> + * 7.41
> + * - add opt_alignment to fuse_init_out, add FUSE_OPT_ALIGNMENT init flag
> */
>
> #ifndef _LINUX_FUSE_H
> @@ -421,6 +424,8 @@ struct fuse_file_lock {
> * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support
> * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit
> * of the request ID indicates resend requests
> + * FUSE_OPT_ALIGNMENT: init_out.opt_alignment contains log2(byte alignment) for
> + * foffset alignment for optimum write performance
> */
> #define FUSE_ASYNC_READ (1 << 0)
> #define FUSE_POSIX_LOCKS (1 << 1)
> @@ -463,6 +468,7 @@ struct fuse_file_lock {
> #define FUSE_PASSTHROUGH (1ULL << 37)
> #define FUSE_NO_EXPORT_SUPPORT (1ULL << 38)
> #define FUSE_HAS_RESEND (1ULL << 39)
> +#define FUSE_OPT_ALIGNMENT (1ULL << 40)
>
> /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */
> #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP
> @@ -893,7 +899,8 @@ struct fuse_init_out {
> uint16_t map_alignment;
> uint32_t flags2;
> uint32_t max_stack_depth;
> - uint32_t unused[6];
> + uint32_t opt_alignment;
> + uint32_t unused[5];
> };
>
> #define CUSE_INIT_INFO_MAX 4096
Powered by blists - more mailing lists