lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20191221202630.30718-2-mfo@canonical.com>
Date:   Sat, 21 Dec 2019 17:26:30 -0300
From:   mfo@...onical.com
To:     tytso@....edu
Cc:     dann.frazier@...onical.com, linux-ext4@...r.kernel.org,
        mauricio.foliveira@...il.com
Subject: [RFC 1/1] ext4: set page writeback on journalled writepage

From: Mauricio Faria de Oliveira <mfo@...onical.com>

Work in progress / request for comments to fix issue with
journal consistency errors on unclean shutdown with mmap()
on ext4 data=journal,journal_checksum mode.

Reference:
  https://lore.kernel.org/linux-ext4/20190830012236.GC10779@mit.edu/
---
 fs/ext4/ext4_jbd2.h | 11 ++++++
 fs/ext4/inline.c    | 13 +++++++
 fs/ext4/inode.c     | 82 ++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 102 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a6b9b66dbfad..8b51ca8231b7 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -144,6 +144,17 @@ struct ext4_journal_cb_entry {
 	/* user data goes here */
 };
 
+/**
+ * struct ext4_journal_cb_entry_simple - Simple structure for callback information.
+ *
+ * This struct is a 'simple' structure on top of the base/seed structure,
+ * which adds a private data pointer to be used by the callback function.
+ */
+struct ext4_journal_cb_entry_simple {
+	struct ext4_journal_cb_entry jce;
+	void *jce_private;
+};
+
 /**
  * ext4_journal_callback_add: add a function to call after transaction commit
  * @handle: active journal transaction handle to register callback on
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 2fec62d764fa..a168fe497d5d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -565,6 +565,9 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
 		ret = -ENOMEM;
 		goto out;
 	}
+	/* XXX(mfo): deadlock with journal? */
+	if (ext4_should_journal_data(inode))
+		wait_on_page_writeback(page);
 
 	ext4_write_lock_xattr(inode, &no_expand);
 	sem_held = 1;
@@ -693,6 +696,9 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
 		ret = -ENOMEM;
 		goto out;
 	}
+	/* XXX(mfo): deadlock with journal? */
+	if (ext4_should_journal_data(inode))
+		wait_on_page_writeback(page);
 
 	*pagep = page;
 	down_read(&EXT4_I(inode)->xattr_sem);
@@ -808,6 +814,10 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
 	page = grab_cache_page_write_begin(mapping, 0, flags);
 	if (!page)
 		return -ENOMEM;
+	/* XXX(mfo): should not deadlock with journal;
+	 * this is only called after ext4_journal_stop(). */
+	if (ext4_should_journal_data(inode))
+		wait_on_page_writeback(page);
 
 	down_read(&EXT4_I(inode)->xattr_sem);
 	if (!ext4_has_inline_data(inode)) {
@@ -911,6 +921,9 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 		ret = -ENOMEM;
 		goto out_journal;
 	}
+	/* XXX(mfo): deadlock with journal? */
+	if (ext4_should_journal_data(inode))
+		wait_on_page_writeback(page);
 
 	down_read(&EXT4_I(inode)->xattr_sem);
 	if (!ext4_has_inline_data(inode)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 28f28de0c1b6..ca31db5f81f0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -139,7 +139,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 
 static void ext4_invalidatepage(struct page *page, unsigned int offset,
 				unsigned int length);
-static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len, bool sync);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
 				  int pextents);
@@ -1155,6 +1155,15 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 	unlock_page(page);
 
 retry_journal:
+
+	/* XXX(mfo): deadlock with journal: fix attempt.
+	 * does just wait_on_page_writeback() need (un)lock_page() ? */
+	if (ext4_should_journal_data(inode)) {
+		lock_page(page);
+		wait_on_page_writeback(page);
+		unlock_page(page);
+	}
+
 	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
 	if (IS_ERR(handle)) {
 		put_page(page);
@@ -1170,6 +1179,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 		goto retry_grab;
 	}
 	/* In case writeback began while the page was unlocked */
+	/* XXX(mfo): this may call wait_on_page_writeback() and deadlock. */
 	wait_for_stable_page(page);
 
 #ifdef CONFIG_FS_ENCRYPTION
@@ -1841,8 +1851,21 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
+static void __ext4_jce_finish_page_writeback(struct super_block *sb,
+					     struct ext4_journal_cb_entry *jce,
+					     int error)
+{
+	struct ext4_journal_cb_entry_simple *jce_simple =
+		(struct ext4_journal_cb_entry_simple *) jce;
+	struct page *page = (struct page *) jce_simple->jce_private;
+
+	end_page_writeback(page);
+	kfree(jce);
+}
+
 static int __ext4_journalled_writepage(struct page *page,
-				       unsigned int len)
+				       unsigned int len,
+				       bool sync)
 {
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
@@ -1851,6 +1874,7 @@ static int __ext4_journalled_writepage(struct page *page,
 	int ret = 0, err = 0;
 	int inline_data = ext4_has_inline_data(inode);
 	struct buffer_head *inode_bh = NULL;
+	struct ext4_journal_cb_entry_simple *jce = NULL;
 
 	ClearPageChecked(page);
 
@@ -1875,13 +1899,25 @@ static int __ext4_journalled_writepage(struct page *page,
 	 * out from under us.
 	 */
 	get_page(page);
+	/* XXX(mfo): do NOT set_page_writeback() here; as the next lock_page()
+	 * could deadlock with write_cache_pages() (since it calls lock_page()
+	 * and then wait_on_page_writeback()).
+	 */
+	//set_page_writeback(page);
 	unlock_page(page);
 
+	if (!sync) {
+		jce = kzalloc(sizeof(*jce), GFP_NOFS);
+		if (!jce) {
+			ret = -ENOMEM;
+			goto out_no_pagelock;
+		}
+	}
+
 	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 				    ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
-		put_page(page);
 		goto out_no_pagelock;
 	}
 	BUG_ON(!ext4_handle_valid(handle));
@@ -1906,7 +1942,25 @@ static int __ext4_journalled_writepage(struct page *page,
 	}
 	if (ret == 0)
 		ret = err;
+
 	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+
+	/* XXX(mfo): only call set_page_writeback() and add callback to end_page_writeback()
+	 * on journal commit on the *a*sync case;  otherwise msync() blocks until periodic
+	 * journal commit happens, waiting on page writeback.
+	 * msync() -> ext4_sync_file() -> __filemap_fdatawait_range() -> wait_on_page_writeback()
+	 *
+	 * Confirm wether the *sync* case does *not* need ext4_handle_sync() ?
+	 * as msync() -> ext4_sync_file() already calls ext4_force_commit() for data=journal.
+	 */
+	if (!sync) {
+		/* Add journal callback entry to finish page writeback and free itself */
+		set_page_writeback(page);
+		jce->jce_private = page;
+		ext4_journal_callback_add(handle, __ext4_jce_finish_page_writeback,
+					  (struct ext4_journal_cb_entry *) jce);
+	}
+
 	err = ext4_journal_stop(handle);
 	if (!ret)
 		ret = err;
@@ -1918,6 +1972,15 @@ static int __ext4_journalled_writepage(struct page *page,
 out:
 	unlock_page(page);
 out_no_pagelock:
+	/*
+	 * Error leg for handle not yet initialized (jce allocation error)
+	 * or failed to. Either way kfree(jce) is ok (it's NULL or valid.)
+	 */
+	if (IS_ERR_OR_NULL(handle)) {
+		put_page(page);
+		kfree(jce);
+	}
+
 	brelse(inode_bh);
 	return ret;
 }
@@ -2029,7 +2092,8 @@ static int ext4_writepage(struct page *page,
 		 * It's mmapped pagecache.  Add buffers and journal it.  There
 		 * doesn't seem much point in redirtying the page here.
 		 */
-		return __ext4_journalled_writepage(page, len);
+		return __ext4_journalled_writepage(page, len,
+						   (wbc->sync_mode == WB_SYNC_ALL));
 
 	ext4_io_submit_init(&io_submit, wbc);
 	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
@@ -2974,6 +3038,14 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	 * of file which has an already mapped buffer.
 	 */
 retry_journal:
+
+	/* XXX(mfo): see comment in non-da ext4_write_begin(). */
+	if (ext4_should_journal_data(inode)) {
+		lock_page(page);
+		wait_on_page_writeback(page);
+		unlock_page(page);
+	}
+
 	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 				ext4_da_write_credits(inode, pos, len));
 	if (IS_ERR(handle)) {
@@ -5929,6 +6001,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 					    ext4_bh_unmapped)) {
 			/* Wait so that we don't change page under IO */
 			wait_for_stable_page(page);
+			if (ext4_should_journal_data(inode))
+				wait_on_page_writeback(page);
 			ret = VM_FAULT_LOCKED;
 			goto out;
 		}
-- 
2.17.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ