lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20231123125121.4064694-14-yi.zhang@huaweicloud.com>
Date: Thu, 23 Nov 2023 20:51:15 +0800
From: Zhang Yi <yi.zhang@...weicloud.com>
To: linux-ext4@...r.kernel.org
Cc: linux-fsdevel@...r.kernel.org,
	tytso@....edu,
	adilger.kernel@...ger.ca,
	jack@...e.cz,
	ritesh.list@...il.com,
	hch@...radead.org,
	djwong@...nel.org,
	yi.zhang@...wei.com,
	yi.zhang@...weicloud.com,
	chengzhihao1@...wei.com,
	yukuai3@...wei.com
Subject: [RFC PATCH 13/18] ext4: impliment writeback iomap path

From: Zhang Yi <yi.zhang@...wei.com>

Impliment writeback iomap path and journal write data path in
data=order mode, includes .map_blocks() and .prepare_ioend() callbacks
in iomap_writeback_ops, most of them are inherited from
ext4_writepages() and ext4_normal_submit_inode_data_buffers(), modify
and reuse mpage_map_one_extent() to save some codes. At the same time,
we are not able to switch buffered IO to iomap at onece, so introduce a
flag EXT4_STATE_BUFFERED_IOMAP to indicate one inode use traditional
buffered_head path or iomap path.

Signed-off-by: Zhang Yi <yi.zhang@...wei.com>
---
 fs/ext4/ext4.h    |   5 +
 fs/ext4/inode.c   | 262 +++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/page-io.c |  74 +++++++++++++
 fs/ext4/super.c   |   2 +
 4 files changed, 318 insertions(+), 25 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b5026090ad6f..65373d53ba6a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1136,6 +1136,8 @@ struct ext4_inode_info {
 	 */
 	struct list_head i_rsv_conversion_list;
 	struct work_struct i_rsv_conversion_work;
+	struct list_head i_iomap_ioend_list;
+	struct work_struct i_iomap_ioend_work;
 	atomic_t i_unwritten; /* Nr. of inflight conversions pending */
 
 	spinlock_t i_block_reservation_lock;
@@ -1900,6 +1902,7 @@ enum {
 	EXT4_STATE_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
 	EXT4_STATE_FC_COMMITTING,	/* Fast commit ongoing */
 	EXT4_STATE_ORPHAN_FILE,		/* Inode orphaned in orphan file */
+	EXT4_STATE_BUFFERED_IOMAP,	/* Inode use iomap for buffered IO */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
@@ -3743,6 +3746,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
 		size_t len);
 extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
 extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
+extern void ext4_iomap_end_io(struct work_struct *work);
+extern void ext4_iomap_end_bio(struct bio *bio);
 
 /* mmp.c */
 extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9229297e1efc..f72864b9a6b3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -43,6 +43,7 @@
 #include <linux/iversion.h>
 
 #include "ext4_jbd2.h"
+#include "ext4_extents.h"
 #include "xattr.h"
 #include "acl.h"
 #include "truncate.h"
@@ -2172,10 +2173,10 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 	return err;
 }
 
-static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+static int mpage_map_one_extent(handle_t *handle, struct inode *inode,
+				struct ext4_map_blocks *map,
+				struct ext4_io_submit *io)
 {
-	struct inode *inode = mpd->inode;
-	struct ext4_map_blocks *map = &mpd->map;
 	int get_blocks_flags;
 	int err, dioread_nolock;
 
@@ -2207,13 +2208,13 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
 	err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
 	if (err < 0)
 		return err;
-	if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
-		if (!mpd->io_submit.io_end->handle &&
+	if (io && dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
+		if (!io->io_end->handle &&
 		    ext4_handle_valid(handle)) {
-			mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+			io->io_end->handle = handle->h_rsv_handle;
 			handle->h_rsv_handle = NULL;
 		}
-		ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+		ext4_set_io_unwritten_flag(inode, io->io_end);
 	}
 
 	BUG_ON(map->m_len == 0);
@@ -2257,7 +2258,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 		return PTR_ERR(io_end_vec);
 	io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
 	do {
-		err = mpage_map_one_extent(handle, mpd);
+		err = mpage_map_one_extent(handle, inode, map, &mpd->io_submit);
 		if (err < 0) {
 			struct super_block *sb = inode->i_sb;
 
@@ -2822,22 +2823,6 @@ static int ext4_writepages(struct address_space *mapping,
 	return ret;
 }
 
-int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
-{
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = LONG_MAX,
-		.range_start = jinode->i_dirty_start,
-		.range_end = jinode->i_dirty_end,
-	};
-	struct mpage_da_data mpd = {
-		.inode = jinode->i_vfs_inode,
-		.wbc = &wbc,
-		.can_map = 0,
-	};
-	return ext4_do_writepages(&mpd);
-}
-
 static int ext4_dax_writepages(struct address_space *mapping,
 			       struct writeback_control *wbc)
 {
@@ -3773,10 +3758,237 @@ static void ext4_iomap_readahead(struct readahead_control *rac)
 	iomap_readahead(rac, &ext4_iomap_read_ops);
 }
 
+struct ext4_writeback_ctx {
+	struct iomap_writepage_ctx ctx;
+	struct writeback_control *wbc;
+	unsigned int can_map:1;	/* Can writepages call map blocks? */
+};
+
+static int ext4_iomap_map_blocks(struct iomap_writepage_ctx *wpc,
+				 struct inode *inode, loff_t offset)
+{
+	struct ext4_writeback_ctx *ewpc =
+			container_of(wpc, struct ext4_writeback_ctx, ctx);
+	struct super_block *sb = inode->i_sb;
+	struct journal_s *journal = EXT4_SB(sb)->s_journal;
+	int needed_blocks;
+	struct ext4_map_blocks map;
+	handle_t *handle = NULL;
+	unsigned int blkbits = inode->i_blkbits;
+	unsigned int index = offset >> blkbits;
+	unsigned int end = ewpc->wbc->range_end >> blkbits;
+	unsigned int len = end - index + 1 ? : UINT_MAX;
+	loff_t new_disksize;
+	bool allocated = false;
+	int ret = 0;
+
+	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
+	/* Check validity of the cached writeback mapping. */
+	if (offset >= wpc->iomap.offset &&
+	    offset < wpc->iomap.offset + wpc->iomap.length)
+		return 0;
+
+	needed_blocks = ext4_da_writepages_trans_blocks(inode);
+
+retry:
+	map.m_lblk = index;
+	map.m_len = min_t(unsigned int, EXT_UNWRITTEN_MAX_LEN, len);
+	ret = ext4_map_blocks(NULL, inode, &map, 0);
+	if (ret < 0)
+		return ret;
+	ret = 0;
+
+	if (!ewpc->can_map &&
+	    (map.m_len == 0 || map.m_flags != EXT4_MAP_MAPPED)) {
+		/*
+		 * We cannot reach here when we do a journal commit via
+		 * journal_submit_data_buffers(), we must write mapped
+		 * blocks to achieve data=ordered mode guarantees.
+		 */
+		WARN_ON_ONCE(1);
+		return -EIO;
+	}
+
+	allocated = (map.m_flags & EXT4_MAP_MAPPED) ||
+		    ((map.m_flags & EXT4_MAP_UNWRITTEN) &&
+				ext4_should_dioread_nolock(inode));
+	if (allocated) {
+		new_disksize = offset + (map.m_len << blkbits);
+		if (new_disksize <= READ_ONCE(EXT4_I(inode)->i_disksize))
+			goto out;
+	}
+
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		return ret;
+	}
+
+	if (!allocated) {
+		ret = mpage_map_one_extent(handle, inode, &map, NULL);
+		if (ret < 0) {
+			if (ext4_forced_shutdown(sb))
+				goto out_journal;
+
+			/*
+			 * Retry transient ENOSPC errors, if
+			 * ext4_count_free_blocks() is non-zero, a commit
+			 * should free up blocks.
+			 */
+			if (ret == -ENOSPC && ext4_count_free_clusters(sb)) {
+				ext4_journal_stop(handle);
+				jbd2_journal_force_commit_nested(journal);
+				goto retry;
+			}
+
+			ext4_msg(sb, KERN_CRIT,
+				 "Delayed block allocation failed for "
+				 "inode %lu at logical offset %llu with "
+				 "max blocks %u with error %d",
+				 inode->i_ino, (unsigned long long)map.m_lblk,
+				 (unsigned int)map.m_len, -ret);
+			ext4_msg(sb, KERN_CRIT,
+				 "This should not happen!! Data will "
+				 "be lost\n");
+			if (ret == -ENOSPC)
+				ext4_print_free_blocks(inode);
+			goto out_journal;
+		}
+	}
+
+	/*
+	 * Update on-disk size after IO is submitted.  Races with
+	 * truncate are avoided by checking i_size under i_data_sem.
+	 */
+	new_disksize = offset + (map.m_len << blkbits);
+	if (new_disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
+		loff_t i_size;
+
+		down_write(&EXT4_I(inode)->i_data_sem);
+		i_size = i_size_read(inode);
+		if (new_disksize > i_size)
+			new_disksize = i_size;
+		if (new_disksize > EXT4_I(inode)->i_disksize)
+			EXT4_I(inode)->i_disksize = new_disksize;
+		up_write(&EXT4_I(inode)->i_data_sem);
+		ret = ext4_mark_inode_dirty(handle, inode);
+		if (ret)
+			EXT4_ERROR_INODE_ERR(inode, -ret,
+					     "Failed to mark inode dirty");
+	}
+out_journal:
+	ext4_journal_stop(handle);
+out:
+	if (!ret)
+		ext4_set_iomap(inode, &wpc->iomap, &map, offset,
+			       map.m_len << blkbits, 0);
+	return 0;
+}
+
+static int ext4_iomap_prepare_ioend(struct iomap_ioend *ioend, int status)
+{
+	handle_t *handle = NULL;
+	struct inode *inode = ioend->io_inode;
+	int rsv_blocks;
+	int ret;
+
+	if (ioend->io_type != IOMAP_UNWRITTEN)
+		return status;
+
+	ioend->io_bio->bi_end_io = ext4_iomap_end_bio;
+
+	/*
+	 * Reserve enough transaction credits for unwritten extent
+	 * convert processing in end IO.
+	 */
+	rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+				ioend->io_size >> inode->i_blkbits);
+	handle = ext4_journal_start_with_reserve(inode,
+				EXT4_HT_WRITE_PAGE, 0, rsv_blocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		ext4_msg(inode->i_sb, KERN_CRIT,
+			 "%s: jbd2_start: %ld blocks, ino %lu; err %d\n",
+			 __func__, ioend->io_size >> inode->i_blkbits,
+			 inode->i_ino, ret);
+		return status ? status : ret;
+	}
+	if (ext4_handle_valid(handle)) {
+		ioend->io_private = handle->h_rsv_handle;
+		handle->h_rsv_handle = NULL;
+	}
+	ext4_journal_stop(handle);
+
+	return status;
+}
+
+static const struct iomap_writeback_ops ext4_iomap_writeback_ops = {
+	.map_blocks = ext4_iomap_map_blocks,
+	.prepare_ioend = ext4_iomap_prepare_ioend,
+};
+
+static int ext4_iomap_do_writepages(struct address_space *mapping,
+				    struct writeback_control *wbc,
+				    struct ext4_writeback_ctx *ewpc)
+{
+	struct inode *inode = mapping->host;
+	long nr_to_write = wbc->nr_to_write;
+	int ret;
+
+	trace_ext4_writepages(inode, wbc);
+	ret = iomap_writepages(mapping, wbc, &ewpc->ctx,
+			       &ext4_iomap_writeback_ops);
+	trace_ext4_writepages_result(inode, wbc, ret,
+				     nr_to_write - wbc->nr_to_write);
+	return ret;
+}
+
 static int ext4_iomap_writepages(struct address_space *mapping,
 				 struct writeback_control *wbc)
 {
-	return 0;
+	struct ext4_writeback_ctx ewpc = {
+		.wbc = wbc,
+		.can_map = 1,
+	};
+	struct super_block *sb = mapping->host->i_sb;
+	int alloc_ctx, ret;
+
+	if (unlikely(ext4_forced_shutdown(sb)))
+		return -EIO;
+
+	alloc_ctx = ext4_writepages_down_read(sb);
+	ret = ext4_iomap_do_writepages(mapping, wbc, &ewpc);
+	ext4_writepages_up_read(sb, alloc_ctx);
+
+	return ret;
+}
+
+int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+	struct inode *inode = jinode->i_vfs_inode;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = LONG_MAX,
+		.range_start = jinode->i_dirty_start,
+		.range_end = jinode->i_dirty_end,
+	};
+	struct mpage_da_data mpd = {
+		.inode = inode,
+		.wbc = &wbc,
+		.can_map = 0,
+	};
+	struct ext4_writeback_ctx ewpc = {
+		.wbc = &wbc,
+		.can_map = 0,
+	};
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+		return ext4_iomap_do_writepages(jinode->i_vfs_inode->i_mapping,
+						&wbc, &ewpc);
+
+	return ext4_do_writepages(&mpd);
 }
 
 /*
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dfdd7e5cf038..f817fcf8df99 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -22,6 +22,7 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/kernel.h>
+#include <linux/iomap.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/sched/mm.h>
@@ -565,3 +566,76 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
 
 	return 0;
 }
+
+static int ext4_iomap_convert_unwritten_io_end(struct iomap_ioend *ioend)
+{
+	handle_t *handle = ioend->io_private;
+	struct inode *inode = ioend->io_inode;
+	int ret, err;
+
+	if (handle) {
+		handle = ext4_journal_start_reserved(handle,
+						     EXT4_HT_EXT_CONVERT);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+	}
+
+	ret = ext4_convert_unwritten_extents(handle, ioend->io_inode,
+					     ioend->io_offset, ioend->io_size);
+	if (handle) {
+		err = ext4_journal_stop(handle);
+		if (!ret)
+			ret = err;
+	}
+out:
+	if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
+		ext4_msg(inode->i_sb, KERN_EMERG,
+			 "failed to convert unwritten extents to "
+			 "written extents -- potential data loss!  "
+			 "(inode %lu, error %d)", inode->i_ino, ret);
+	}
+	iomap_finish_ioends(ioend, ret);
+	return ret;
+}
+
+/*
+ * Work on buffered iomap completed IO, to convert unwritten extents to
+ * mapped extents
+ */
+void ext4_iomap_end_io(struct work_struct *work)
+{
+	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+						  i_iomap_ioend_work);
+	struct iomap_ioend *ioend;
+	struct list_head ioend_list;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	list_replace_init(&ei->i_iomap_ioend_list, &ioend_list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+	while (!list_empty(&ioend_list)) {
+		ioend = list_entry(ioend_list.next, struct iomap_ioend, io_list);
+		BUG_ON(ioend->io_type != IOMAP_UNWRITTEN);
+		list_del_init(&ioend->io_list);
+		ext4_iomap_convert_unwritten_io_end(ioend);
+	}
+}
+
+void ext4_iomap_end_bio(struct bio *bio)
+{
+	struct iomap_ioend *ioend = bio->bi_private;
+	struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
+	struct ext4_sb_info *sbi = EXT4_SB(ioend->io_inode->i_sb);
+	unsigned long flags;
+
+	/* Only reserved conversions from writeback should enter here */
+	WARN_ON(ioend->io_type != IOMAP_UNWRITTEN);
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (list_empty(&ei->i_iomap_ioend_list))
+		queue_work(sbi->rsv_conversion_wq, &ei->i_iomap_ioend_work);
+	list_add_tail(&ioend->io_list, &ei->i_iomap_ioend_list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dbebd8b3127e..08a39f364d78 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1422,11 +1422,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 #endif
 	ei->jinode = NULL;
 	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+	INIT_LIST_HEAD(&ei->i_iomap_ioend_list);
 	spin_lock_init(&ei->i_completed_io_lock);
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
 	atomic_set(&ei->i_unwritten, 0);
 	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+	INIT_WORK(&ei->i_iomap_ioend_work, ext4_iomap_end_io);
 	ext4_fc_init_inode(&ei->vfs_inode);
 	mutex_init(&ei->i_fc_lock);
 	return &ei->vfs_inode;
-- 
2.39.2


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ