Message-ID: <ZjH5Ia+dWGss5Duv@dread.disaster.area>
Date: Wed, 1 May 2024 18:11:13 +1000
From: Dave Chinner <david@...morbit.com>
To: Zhang Yi <yi.zhang@...weicloud.com>
Cc: linux-ext4@...r.kernel.org, linux-fsdevel@...r.kernel.org,
linux-mm@...ck.org, linux-kernel@...r.kernel.org, tytso@....edu,
adilger.kernel@...ger.ca, jack@...e.cz, ritesh.list@...il.com,
hch@...radead.org, djwong@...nel.org, willy@...radead.org,
zokeefe@...gle.com, yi.zhang@...wei.com, chengzhihao1@...wei.com,
yukuai3@...wei.com, wangkefeng.wang@...wei.com
Subject: Re: [RFC PATCH v4 24/34] ext4: implement buffered write iomap path

On Wed, Apr 10, 2024 at 10:29:38PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@...wei.com>
>
> Implement the buffered write iomap path: use ext4_da_map_blocks() to
> map delalloc extents, and add ext4_iomap_get_blocks() to allocate
> blocks if delalloc is disabled or free space is about to run out.
>
> Note that we always allocate unwritten extents for new blocks in the
> iomap write path, which means the allocation type is no longer
> controlled by the dioread_nolock mount option. As a result, we can
> postpone updating i_disksize to the writeback path and drop the
> journal handle in the buffered delalloc write path completely.
>
> Signed-off-by: Zhang Yi <yi.zhang@...wei.com>
> ---
> fs/ext4/ext4.h | 3 +
> fs/ext4/file.c | 19 +++++-
> fs/ext4/inode.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++--
> 3 files changed, 183 insertions(+), 7 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 05949a8136ae..2bd543c43341 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2970,6 +2970,7 @@ int ext4_walk_page_buffers(handle_t *handle,
> struct buffer_head *bh));
> int do_journal_get_write_access(handle_t *handle, struct inode *inode,
> struct buffer_head *bh);
> +int ext4_nonda_switch(struct super_block *sb);
> #define FALL_BACK_TO_NONDELALLOC 1
> #define CONVERT_INLINE_DATA 2
>
> @@ -3827,6 +3828,8 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
> extern const struct iomap_ops ext4_iomap_ops;
> extern const struct iomap_ops ext4_iomap_overwrite_ops;
> extern const struct iomap_ops ext4_iomap_report_ops;
> +extern const struct iomap_ops ext4_iomap_buffered_write_ops;
> +extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
>
> static inline int ext4_buffer_uptodate(struct buffer_head *bh)
> {
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 54d6ff22585c..52f37c49572a 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -282,6 +282,20 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
> return count;
> }
>
> +static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
> + struct iov_iter *from)
> +{
> + struct inode *inode = file_inode(iocb->ki_filp);
> + const struct iomap_ops *iomap_ops;
> +
> + if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
> + iomap_ops = &ext4_iomap_buffered_da_write_ops;
> + else
> + iomap_ops = &ext4_iomap_buffered_write_ops;
> +
> + return iomap_file_buffered_write(iocb, from, iomap_ops);
> +}
> +
> static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> struct iov_iter *from)
> {
> @@ -296,7 +310,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> if (ret <= 0)
> goto out;
>
> - ret = generic_perform_write(iocb, from);
> + if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
> + ret = ext4_iomap_buffered_write(iocb, from);
> + else
> + ret = generic_perform_write(iocb, from);
>
> out:
> inode_unlock(inode);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 20eb772f4f62..e825ed16fd60 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2857,7 +2857,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
> return ret;
> }
>
> -static int ext4_nonda_switch(struct super_block *sb)
> +int ext4_nonda_switch(struct super_block *sb)
> {
> s64 free_clusters, dirty_clusters;
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -3254,6 +3254,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
> return inode->i_state & I_DIRTY_DATASYNC;
> }
>
> +static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
> +{
> + return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
> +}
> +
> +static const struct iomap_folio_ops ext4_iomap_folio_ops = {
> + .iomap_valid = ext4_iomap_valid,
> +};
> +
> static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
> struct ext4_map_blocks *map, loff_t offset,
> loff_t length, unsigned int flags)
> @@ -3284,6 +3293,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
> !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> iomap->flags |= IOMAP_F_MERGED;
>
> + iomap->validity_cookie = READ_ONCE(EXT4_I(inode)->i_es_seq);
> + iomap->folio_ops = &ext4_iomap_folio_ops;
> +
> /*
> * Flags passed to ext4_map_blocks() for direct I/O writes can result
> * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
> @@ -3523,11 +3535,42 @@ const struct iomap_ops ext4_iomap_report_ops = {
> .iomap_begin = ext4_iomap_begin_report,
> };
>
> -static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
> +static int ext4_iomap_get_blocks(struct inode *inode,
> + struct ext4_map_blocks *map)
> +{
> + handle_t *handle;
> + int ret, needed_blocks;
> +
> +	/*
> +	 * Reserve one extra block for adding the inode to the orphan
> +	 * list in case we allocate blocks but the write fails for some
> +	 * reason.
> +	 */
> + needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
> + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
> + if (IS_ERR(handle))
> + return PTR_ERR(handle);
> +
> + ret = ext4_map_blocks(handle, inode, map,
> + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
> +	/*
> +	 * Have to stop the journal here since there is a potential
> +	 * deadlock caused by a later balance_dirty_pages(): it might
> +	 * wait on dirty pages to be written back, and the writeback
> +	 * might start another handle and wait for this handle to stop.
> +	 */
> + ext4_journal_stop(handle);
> +
> + return ret;
> +}
> +
> +#define IOMAP_F_EXT4_DELALLOC IOMAP_F_PRIVATE
> +
> +static int __ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
> loff_t length, unsigned int iomap_flags,
> - struct iomap *iomap, struct iomap *srcmap)
> + struct iomap *iomap, struct iomap *srcmap,
> + bool delalloc)
> {
> - int ret;
> + int ret, retries = 0;
> struct ext4_map_blocks map;
> u8 blkbits = inode->i_blkbits;
>
> @@ -3537,20 +3580,133 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
> return -EINVAL;
> if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
> return -ERANGE;
> -
> +retry:
> /* Calculate the first and last logical blocks respectively. */
> map.m_lblk = offset >> blkbits;
> map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
> EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
> + if (iomap_flags & IOMAP_WRITE) {
> + if (delalloc)
> + ret = ext4_da_map_blocks(inode, &map);
> + else
> + ret = ext4_iomap_get_blocks(inode, &map);
>
> - ret = ext4_map_blocks(NULL, inode, &map, 0);
> + if (ret == -ENOSPC &&
> + ext4_should_retry_alloc(inode->i_sb, &retries))
> + goto retry;
> + } else {
> + ret = ext4_map_blocks(NULL, inode, &map, 0);
> + }
> if (ret < 0)
> return ret;
>
> ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags);
> + if (delalloc)
> + iomap->flags |= IOMAP_F_EXT4_DELALLOC;
> +
> + return 0;
> +}
Why are you implementing both the read and write mapping paths in
the one function? The whole point of having separate ops vectors for
read and write is that it allows a clean separation of the read and
write mapping operations, i.e. there is no need for "if (write)
else {do read}" code constructs at all.

You can even have a different delalloc mapping function so you don't
need "if (delalloc) else {do nonda}" branches everywhere...
> +
> +static inline int ext4_iomap_buffered_io_begin(struct inode *inode,
> + loff_t offset, loff_t length, unsigned int flags,
> + struct iomap *iomap, struct iomap *srcmap)
> +{
> + return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
> + iomap, srcmap, false);
> +}
> +
> +static inline int ext4_iomap_buffered_da_write_begin(struct inode *inode,
> + loff_t offset, loff_t length, unsigned int flags,
> + struct iomap *iomap, struct iomap *srcmap)
> +{
> + return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
> + iomap, srcmap, true);
> +}
> +
> +/*
> + * Drop the stale delayed allocation range left by a write failure,
> + * including both the start and end blocks. Otherwise we could leave
> + * a range of delayed extents covered by a clean folio, which would
> + * lead to inaccurate space reservation.
> + */
> +static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
> + loff_t length)
> +{
> + ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
> + DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
> return 0;
> }
>
> +static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
> + loff_t length, ssize_t written,
> + unsigned int flags,
> + struct iomap *iomap)
> +{
> + handle_t *handle;
> + loff_t end;
> + int ret = 0, ret2;
> +
> + /* delalloc */
> + if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
> + ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
> + offset, length, written, ext4_iomap_punch_delalloc);
> + if (ret)
> + ext4_warning(inode->i_sb,
> + "Failed to clean up delalloc for inode %lu, %d",
> + inode->i_ino, ret);
> + return ret;
> + }
Why are you creating a delalloc extent for the write operation and
then immediately deleting it from the extent tree once the write
operation is done? Delayed allocation is a state that persists for
that file range until the data is written back, right? But this
appears to return the range to a hole in the extent state tree.

What happens when we do a second write to this same range? Will it
do another delayed allocation because there are no extents over this
range?
Also, why do you need IOMAP_F_EXT4_DELALLOC? Isn't a delalloc iomap
set up with iomap->type = IOMAP_DELALLOC? Why can't that be used?
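i.e. if the ->iomap_begin method already sets iomap->type to
IOMAP_DELALLOC for delalloc mappings, then the end method could key
off that directly - untested sketch:

	/*
	 * Only a delalloc mapping can leave a reservation behind that
	 * needs punching out after a short write; the mapping type set
	 * up by ->iomap_begin tells us that without a private flag.
	 */
	if (iomap->type == IOMAP_DELALLOC) {
		ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
				offset, length, written,
				ext4_iomap_punch_delalloc);
		...
	}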
-Dave.
--
Dave Chinner
david@...morbit.com