lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250122110533.4116662-6-libaokun@huaweicloud.com>
Date: Wed, 22 Jan 2025 19:05:29 +0800
From: libaokun@...weicloud.com
To: linux-ext4@...r.kernel.org
Cc: tytso@....edu,
	adilger.kernel@...ger.ca,
	jack@...e.cz,
	linux-kernel@...r.kernel.org,
	yi.zhang@...wei.com,
	yangerkun@...wei.com,
	libaokun@...weicloud.com,
	Baokun Li <libaokun1@...wei.com>
Subject: [PATCH v3 5/9] ext4: abort journal on data writeback failure if in data_err=abort mode

From: Baokun Li <libaokun1@...wei.com>

The data_err=abort was initially introduced to address users' worries
about data corruption spreading unnoticed. With direct writes, we can
rely on return values to confirm successful writes to disk. But with
buffered writes, a successful return only means the data has been written
to memory. Users have no way of knowing if the data has actually written
it to disk unless they use fsync (which impacts performance and can
sometimes miss errors).

The current data_err=abort implementation relies on the ordered data list,
but past changes have inadvertently altered its behavior. For example, if
an extent is unwritten, we do not add the inode to the ordered data list.
Therefore, jbd2 will not wait for the data write-back of that inode to
complete and check for errors in the inode mapping. Moreover, the checks
performed by jbd2 can also miss errors.

Now, all buffered writes eventually call ext4_end_bio(), where I/O errors
are checked. Therefore, we can check for the data_err=abort mode at this
point and abort the journal in a kworker (due to the interrupt context).

Therefore, when data_err=abort is enabled, the journal is aborted in
ext4_end_io_end() when an I/O error is detected in ext4_end_bio() to make
users who are concerned about the contents of the file happy.

Suggested-by: Jan Kara <jack@...e.cz>
Link: https://patch.msgid.link/c7ab26f3-85ad-4b31-b132-0afb0e07bf79@huawei.com
Signed-off-by: Baokun Li <libaokun1@...wei.com>
Reviewed-by: Zhang Yi <yi.zhang@...wei.com>
Reviewed-by: Jan Kara <jack@...e.cz>
---
 fs/ext4/ext4.h    |  2 ++
 fs/ext4/page-io.c | 48 +++++++++++++++++++++++++++++++++--------------
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9da0e32af02a..0fed71beb906 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -281,6 +281,8 @@ struct ext4_system_blocks {
 #define EXT4_IO_END_UNWRITTEN	0x0001
 #define EXT4_IO_END_FAILED	0x0002
 
+#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED)
+
 struct ext4_io_end_vec {
 	struct list_head list;		/* list of io_end_vec */
 	loff_t offset;			/* offset in the file */
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 6054ec27fb48..31d8963a3fd6 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -164,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
 }
 
 /*
- * Check a range of space and convert unwritten extents to written. Note that
+ * On successful IO, check a range of space and convert unwritten extents to
+ * written. On IO failure, check if journal abort is needed. Note that
  * we are protected from truncate touching same part of extent tree by the
  * fact that truncate code waits for all DIO to finish (thus exclusion from
  * direct IO is achieved) and also waits for PageWriteback bits. Thus we
@@ -175,6 +176,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
 {
 	struct inode *inode = io_end->inode;
 	handle_t *handle = io_end->handle;
+	struct super_block *sb = inode->i_sb;
 	int ret = 0;
 
 	ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
@@ -190,11 +192,15 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
 		ret = -EIO;
 		if (handle)
 			jbd2_journal_free_reserved(handle);
+
+		if (test_opt(sb, DATA_ERR_ABORT))
+			jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret);
 	} else {
 		ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
 	}
-	if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
-		ext4_msg(inode->i_sb, KERN_EMERG,
+	if (ret < 0 && !ext4_forced_shutdown(sb) &&
+	    io_end->flag & EXT4_IO_END_UNWRITTEN) {
+		ext4_msg(sb, KERN_EMERG,
 			 "failed to convert unwritten extents to written "
 			 "extents -- potential data loss!  "
 			 "(inode %lu, error %d)", inode->i_ino, ret);
@@ -228,6 +234,16 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
 #endif
 }
 
+static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end)
+{
+	if (io_end->flag & EXT4_IO_END_UNWRITTEN)
+		return true;
+	if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) &&
+	    io_end->flag & EXT4_IO_END_FAILED)
+		return true;
+	return false;
+}
+
 /* Add the io_end to per-inode completed end_io list. */
 static void ext4_add_complete_io(ext4_io_end_t *io_end)
 {
@@ -236,9 +252,11 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
 	struct workqueue_struct *wq;
 	unsigned long flags;
 
-	/* Only reserved conversions from writeback should enter here */
-	WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
-	WARN_ON(!io_end->handle && sbi->s_journal);
+	/* Only reserved conversions or pending IO errors will enter here. */
+	WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
+	WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN &&
+		!io_end->handle && sbi->s_journal);
+
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 	wq = sbi->rsv_conversion_wq;
 	if (list_empty(&ei->i_rsv_conversion_list))
@@ -263,7 +281,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
 
 	while (!list_empty(&unwritten)) {
 		io_end = list_entry(unwritten.next, ext4_io_end_t, list);
-		BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+		BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
 		list_del_init(&io_end->list);
 
 		err = ext4_end_io_end(io_end);
@@ -274,7 +292,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
 }
 
 /*
- * work on completed IO, to convert unwritten extents to extents
+ * Used to convert unwritten extents to written extents upon IO completion,
+ * or used to abort the journal upon IO errors.
  */
 void ext4_end_io_rsv_work(struct work_struct *work)
 {
@@ -299,19 +318,20 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 void ext4_put_io_end_defer(ext4_io_end_t *io_end)
 {
 	if (refcount_dec_and_test(&io_end->count)) {
-		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
-				list_empty(&io_end->list_vec)) {
-			ext4_release_io_end(io_end);
+		if (io_end->flag & EXT4_IO_END_FAILED ||
+		    (io_end->flag & EXT4_IO_END_UNWRITTEN &&
+		     !list_empty(&io_end->list_vec))) {
+			ext4_add_complete_io(io_end);
 			return;
 		}
-		ext4_add_complete_io(io_end);
+		ext4_release_io_end(io_end);
 	}
 }
 
 int ext4_put_io_end(ext4_io_end_t *io_end)
 {
 	if (refcount_dec_and_test(&io_end->count)) {
-		if (io_end->flag & EXT4_IO_END_UNWRITTEN)
+		if (ext4_io_end_defer_completion(io_end))
 			return ext4_end_io_end(io_end);
 
 		ext4_release_io_end(io_end);
@@ -355,7 +375,7 @@ static void ext4_end_bio(struct bio *bio)
 				blk_status_to_errno(bio->bi_status));
 	}
 
-	if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+	if (ext4_io_end_defer_completion(io_end)) {
 		/*
 		 * Link bio into list hanging from io_end. We have to do it
 		 * atomically as bio completions can be racing against each
-- 
2.39.2


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ