Message-Id: <20231123125121.4064694-14-yi.zhang@huaweicloud.com>
Date: Thu, 23 Nov 2023 20:51:15 +0800
From: Zhang Yi <yi.zhang@...weicloud.com>
To: linux-ext4@...r.kernel.org
Cc: linux-fsdevel@...r.kernel.org,
tytso@....edu,
adilger.kernel@...ger.ca,
jack@...e.cz,
ritesh.list@...il.com,
hch@...radead.org,
djwong@...nel.org,
yi.zhang@...wei.com,
yi.zhang@...weicloud.com,
chengzhihao1@...wei.com,
yukuai3@...wei.com
Subject: [RFC PATCH 13/18] ext4: implement writeback iomap path
From: Zhang Yi <yi.zhang@...wei.com>
Implement the iomap writeback path and the journal data writeback path
in data=ordered mode, including the .map_blocks() and .prepare_ioend()
callbacks of iomap_writeback_ops. Most of the logic is inherited from
ext4_writepages() and ext4_normal_submit_inode_data_buffers(), and
mpage_map_one_extent() is modified and reused to save some code. Since
we cannot switch buffered IO over to iomap all at once, also introduce
a new inode state flag, EXT4_STATE_BUFFERED_IOMAP, to indicate whether
an inode uses the traditional buffer_head path or the iomap path.
Signed-off-by: Zhang Yi <yi.zhang@...wei.com>
---
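Note for reviewers: the per-inode switch between the buffer_head and
iomap buffered IO paths is wired up elsewhere in this series; below is
only a minimal sketch of how the new EXT4_STATE_BUFFERED_IOMAP flag is
expected to drive aops selection. The ext4_iomap_aops table and the
helper name are assumptions for illustration, not part of this patch.

/*
 * Minimal sketch only, not part of this patch: select the buffered IO
 * aops based on EXT4_STATE_BUFFERED_IOMAP. The ext4_iomap_aops table
 * is assumed to be provided by another patch in this series.
 */
static void ext4_select_buffered_aops(struct inode *inode)
{
	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
		inode->i_mapping->a_ops = &ext4_iomap_aops;
	else
		inode->i_mapping->a_ops = &ext4_aops;
}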
fs/ext4/ext4.h | 5 +
fs/ext4/inode.c | 262 +++++++++++++++++++++++++++++++++++++++++-----
fs/ext4/page-io.c | 74 +++++++++++++
fs/ext4/super.c | 2 +
4 files changed, 318 insertions(+), 25 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b5026090ad6f..65373d53ba6a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1136,6 +1136,8 @@ struct ext4_inode_info {
*/
struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work;
+ struct list_head i_iomap_ioend_list;
+ struct work_struct i_iomap_ioend_work;
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -1900,6 +1902,7 @@ enum {
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
+ EXT4_STATE_BUFFERED_IOMAP, /* Inode use iomap for buffered IO */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -3743,6 +3746,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
size_t len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
+extern void ext4_iomap_end_io(struct work_struct *work);
+extern void ext4_iomap_end_bio(struct bio *bio);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9229297e1efc..f72864b9a6b3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -43,6 +43,7 @@
#include <linux/iversion.h>
#include "ext4_jbd2.h"
+#include "ext4_extents.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"
@@ -2172,10 +2173,10 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
return err;
}
-static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+static int mpage_map_one_extent(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_io_submit *io)
{
- struct inode *inode = mpd->inode;
- struct ext4_map_blocks *map = &mpd->map;
int get_blocks_flags;
int err, dioread_nolock;
@@ -2207,13 +2208,13 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
if (err < 0)
return err;
- if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
- if (!mpd->io_submit.io_end->handle &&
+ if (io && dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
+ if (!io->io_end->handle &&
ext4_handle_valid(handle)) {
- mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+ io->io_end->handle = handle->h_rsv_handle;
handle->h_rsv_handle = NULL;
}
- ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ ext4_set_io_unwritten_flag(inode, io->io_end);
}
BUG_ON(map->m_len == 0);
@@ -2257,7 +2258,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
return PTR_ERR(io_end_vec);
io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
do {
- err = mpage_map_one_extent(handle, mpd);
+ err = mpage_map_one_extent(handle, inode, map, &mpd->io_submit);
if (err < 0) {
struct super_block *sb = inode->i_sb;
@@ -2822,22 +2823,6 @@ static int ext4_writepages(struct address_space *mapping,
return ret;
}
-int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .range_start = jinode->i_dirty_start,
- .range_end = jinode->i_dirty_end,
- };
- struct mpage_da_data mpd = {
- .inode = jinode->i_vfs_inode,
- .wbc = &wbc,
- .can_map = 0,
- };
- return ext4_do_writepages(&mpd);
-}
-
static int ext4_dax_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -3773,10 +3758,237 @@ static void ext4_iomap_readahead(struct readahead_control *rac)
iomap_readahead(rac, &ext4_iomap_read_ops);
}
+struct ext4_writeback_ctx {
+ struct iomap_writepage_ctx ctx;
+ struct writeback_control *wbc;
+ unsigned int can_map:1; /* Can writepages call map blocks? */
+};
+
+static int ext4_iomap_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset)
+{
+ struct ext4_writeback_ctx *ewpc =
+ container_of(wpc, struct ext4_writeback_ctx, ctx);
+ struct super_block *sb = inode->i_sb;
+ struct journal_s *journal = EXT4_SB(sb)->s_journal;
+ int needed_blocks;
+ struct ext4_map_blocks map;
+ handle_t *handle = NULL;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int index = offset >> blkbits;
+ unsigned int end = ewpc->wbc->range_end >> blkbits;
+ unsigned int len = end - index + 1 ? : UINT_MAX;
+ loff_t new_disksize;
+ bool allocated = false;
+ int ret = 0;
+
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
+ /* Check validity of the cached writeback mapping. */
+ if (offset >= wpc->iomap.offset &&
+ offset < wpc->iomap.offset + wpc->iomap.length)
+ return 0;
+
+ needed_blocks = ext4_da_writepages_trans_blocks(inode);
+
+retry:
+ map.m_lblk = index;
+ map.m_len = min_t(unsigned int, EXT_UNWRITTEN_MAX_LEN, len);
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+
+ if (!ewpc->can_map &&
+ (map.m_len == 0 || map.m_flags != EXT4_MAP_MAPPED)) {
+ /*
+ * This should not happen when called from the journal commit path
+ * via journal_submit_data_buffers(); in that case we only write
+ * back already mapped blocks to keep the data=ordered guarantees.
+ */
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+
+ allocated = (map.m_flags & EXT4_MAP_MAPPED) ||
+ ((map.m_flags & EXT4_MAP_UNWRITTEN) &&
+ ext4_should_dioread_nolock(inode));
+ if (allocated) {
+ new_disksize = offset + (map.m_len << blkbits);
+ if (new_disksize <= READ_ONCE(EXT4_I(inode)->i_disksize))
+ goto out;
+ }
+
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
+
+ if (!allocated) {
+ ret = mpage_map_one_extent(handle, inode, &map, NULL);
+ if (ret < 0) {
+ if (ext4_forced_shutdown(sb))
+ goto out_journal;
+
+ /*
+ * Retry transient ENOSPC errors, if
+ * ext4_count_free_clusters() is non-zero, a commit
+ * should free up blocks.
+ */
+ if (ret == -ENOSPC && ext4_count_free_clusters(sb)) {
+ ext4_journal_stop(handle);
+ jbd2_journal_force_commit_nested(journal);
+ goto retry;
+ }
+
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with "
+ "max blocks %u with error %d",
+ inode->i_ino, (unsigned long long)map.m_lblk,
+ (unsigned int)map.m_len, -ret);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (ret == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ goto out_journal;
+ }
+ }
+
+ /*
+ * Update on-disk size after IO is submitted. Races with
+ * truncate are avoided by checking i_size under i_data_sem.
+ */
+ new_disksize = offset + (map.m_len << blkbits);
+ if (new_disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (new_disksize > i_size)
+ new_disksize = i_size;
+ if (new_disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = new_disksize;
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (ret)
+ EXT4_ERROR_INODE_ERR(inode, -ret,
+ "Failed to mark inode dirty");
+ }
+out_journal:
+ ext4_journal_stop(handle);
+out:
+ if (!ret)
+ ext4_set_iomap(inode, &wpc->iomap, &map, offset,
+ map.m_len << blkbits, 0);
+ return ret;
+}
+
+static int ext4_iomap_prepare_ioend(struct iomap_ioend *ioend, int status)
+{
+ handle_t *handle = NULL;
+ struct inode *inode = ioend->io_inode;
+ int rsv_blocks;
+ int ret;
+
+ if (ioend->io_type != IOMAP_UNWRITTEN)
+ return status;
+
+ ioend->io_bio->bi_end_io = ext4_iomap_end_bio;
+
+ /*
+ * Reserve enough transaction credits for the unwritten extent
+ * conversion processing in the end IO handler.
+ */
+ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+ ioend->io_size >> inode->i_blkbits);
+ handle = ext4_journal_start_with_reserve(inode,
+ EXT4_HT_WRITE_PAGE, 0, rsv_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_msg(inode->i_sb, KERN_CRIT,
+ "%s: jbd2_start: %ld blocks, ino %lu; err %d\n",
+ __func__, ioend->io_size >> inode->i_blkbits,
+ inode->i_ino, ret);
+ return status ? status : ret;
+ }
+ if (ext4_handle_valid(handle)) {
+ ioend->io_private = handle->h_rsv_handle;
+ handle->h_rsv_handle = NULL;
+ }
+ ext4_journal_stop(handle);
+
+ return status;
+}
+
+static const struct iomap_writeback_ops ext4_iomap_writeback_ops = {
+ .map_blocks = ext4_iomap_map_blocks,
+ .prepare_ioend = ext4_iomap_prepare_ioend,
+};
+
+static int ext4_iomap_do_writepages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ext4_writeback_ctx *ewpc)
+{
+ struct inode *inode = mapping->host;
+ long nr_to_write = wbc->nr_to_write;
+ int ret;
+
+ trace_ext4_writepages(inode, wbc);
+ ret = iomap_writepages(mapping, wbc, &ewpc->ctx,
+ &ext4_iomap_writeback_ops);
+ trace_ext4_writepages_result(inode, wbc, ret,
+ nr_to_write - wbc->nr_to_write);
+ return ret;
+}
+
static int ext4_iomap_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- return 0;
+ struct ext4_writeback_ctx ewpc = {
+ .wbc = wbc,
+ .can_map = 1,
+ };
+ struct super_block *sb = mapping->host->i_sb;
+ int alloc_ctx, ret;
+
+ if (unlikely(ext4_forced_shutdown(sb)))
+ return -EIO;
+
+ alloc_ctx = ext4_writepages_down_read(sb);
+ ret = ext4_iomap_do_writepages(mapping, wbc, &ewpc);
+ ext4_writepages_up_read(sb, alloc_ctx);
+
+ return ret;
+}
+
+int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct inode *inode = jinode->i_vfs_inode;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
+ };
+ struct mpage_da_data mpd = {
+ .inode = inode,
+ .wbc = &wbc,
+ .can_map = 0,
+ };
+ struct ext4_writeback_ctx ewpc = {
+ .wbc = &wbc,
+ .can_map = 0,
+ };
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ return ext4_iomap_do_writepages(inode->i_mapping,
+ &wbc, &ewpc);
+
+ return ext4_do_writepages(&mpd);
}
/*
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dfdd7e5cf038..f817fcf8df99 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -22,6 +22,7 @@
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
+#include <linux/iomap.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
@@ -565,3 +566,76 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
return 0;
}
+
+static int ext4_iomap_convert_unwritten_io_end(struct iomap_ioend *ioend)
+{
+ handle_t *handle = ioend->io_private;
+ struct inode *inode = ioend->io_inode;
+ int ret, err;
+
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ }
+
+ ret = ext4_convert_unwritten_extents(handle, ioend->io_inode,
+ ioend->io_offset, ioend->io_size);
+ if (handle) {
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ }
+out:
+ if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "failed to convert unwritten extents to "
+ "written extents -- potential data loss! "
+ "(inode %lu, error %d)", inode->i_ino, ret);
+ }
+ iomap_finish_ioends(ioend, ret);
+ return ret;
+}
+
+/*
+ * Work on completed IO from the buffered iomap path, to convert
+ * unwritten extents to written extents.
+ */
+void ext4_iomap_end_io(struct work_struct *work)
+{
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_iomap_ioend_work);
+ struct iomap_ioend *ioend;
+ struct list_head ioend_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ list_replace_init(&ei->i_iomap_ioend_list, &ioend_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ while (!list_empty(&ioend_list)) {
+ ioend = list_entry(ioend_list.next, struct iomap_ioend, io_list);
+ BUG_ON(ioend->io_type != IOMAP_UNWRITTEN);
+ list_del_init(&ioend->io_list);
+ ext4_iomap_convert_unwritten_io_end(ioend);
+ }
+}
+
+void ext4_iomap_end_bio(struct bio *bio)
+{
+ struct iomap_ioend *ioend = bio->bi_private;
+ struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
+ struct ext4_sb_info *sbi = EXT4_SB(ioend->io_inode->i_sb);
+ unsigned long flags;
+
+ /* Only reserved conversions from writeback should enter here */
+ WARN_ON(ioend->io_type != IOMAP_UNWRITTEN);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ if (list_empty(&ei->i_iomap_ioend_list))
+ queue_work(sbi->rsv_conversion_wq, &ei->i_iomap_ioend_work);
+ list_add_tail(&ioend->io_list, &ei->i_iomap_ioend_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dbebd8b3127e..08a39f364d78 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1422,11 +1422,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
#endif
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+ INIT_LIST_HEAD(&ei->i_iomap_ioend_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ INIT_WORK(&ei->i_iomap_ioend_work, ext4_iomap_end_io);
ext4_fc_init_inode(&ei->vfs_inode);
mutex_init(&ei->i_fc_lock);
return &ei->vfs_inode;
--
2.39.2