linux-kernel - Test patch to make afs use its own version of write_cache_pages()

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <522122.1677799976@warthog.procyon.org.uk>
Date:   Thu, 02 Mar 2023 23:32:56 +0000
From:   David Howells <dhowells@...hat.com>
To:     Linus Torvalds <torvalds@...ux-foundation.org>,
        Steve French <smfrench@...il.com>
Cc:     dhowells@...hat.com, Vishal Moola <vishal.moola@...il.com>,
        Shyam Prasad N <nspmangalore@...il.com>,
        Rohith Surabattula <rohiths.msft@...il.com>,
        Tom Talpey <tom@...pey.com>,
        Stefan Metzmacher <metze@...ba.org>,
        Paulo Alcantara <pc@....nz>, Jeff Layton <jlayton@...nel.org>,
        Matthew Wilcox <willy@...radead.org>,
        Marc Dionne <marc.dionne@...istor.com>,
        linux-afs@...ts.infradead.org, linux-cifs@...r.kernel.org,
        linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Test patch to make afs use its own version of write_cache_pages()

David Howells <dhowells@...hat.com> wrote:

> AFS firstly. ...
> 
>   Base + Page-dirty-region tracking removed + Own write_cache_pages()
> 	WRITE: bw=302MiB/s (316MB/s), 75.1MiB/s-76.1MiB/s (78.7MB/s-79.8MB/s)
> 	WRITE: bw=302MiB/s (316MB/s), 74.5MiB/s-76.1MiB/s (78.1MB/s-79.8MB/s)
> 	WRITE: bw=301MiB/s (316MB/s), 75.2MiB/s-75.5MiB/s (78.9MB/s-79.1MB/s)


This goes on top of "Test patch to remove per-page dirty region tracking from
afs" and "Test patch to make afs use write_cache_pages()".

David
---
 write.c |  141 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 138 insertions(+), 3 deletions(-)

diff --git a/fs/afs/write.c b/fs/afs/write.c
index 86b6e7cbe17c..d66c05acda8c 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -463,9 +463,9 @@ static int afs_writepages_submit(struct address_space *mapping,
  * Add a page to the set and flush when large enough.
  */
 static int afs_writepages_add_folio(struct folio *folio,
-				    struct writeback_control *wbc, void *data)
+				    struct writeback_control *wbc,
+				    struct afs_writepages_context *ctx)
 {
-	struct afs_writepages_context *ctx = data;
 	struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host);
 	int ret;
 
@@ -499,6 +499,141 @@ static int afs_writepages_add_folio(struct folio *folio,
 	}
 	return 0;
 }
+static int afs_write_cache_pages(struct address_space *mapping,
+				 struct writeback_control *wbc,
+				 struct afs_writepages_context *ctx)
+{
+	int ret = 0;
+	int done = 0;
+	int error;
+	struct folio_batch fbatch;
+	int nr_folios;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	pgoff_t done_index;
+	int range_whole = 0;
+	xa_mark_t tag;
+
+	folio_batch_init(&fbatch);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_SHIFT;
+		end = wbc->range_end >> PAGE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+	}
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
+		tag_pages_for_writeback(mapping, index, end);
+		tag = PAGECACHE_TAG_TOWRITE;
+	} else {
+		tag = PAGECACHE_TAG_DIRTY;
+	}
+	done_index = index;
+	while (!done && (index <= end)) {
+		int i;
+
+		nr_folios = filemap_get_folios_tag(mapping, &index, end,
+				tag, &fbatch);
+
+		if (nr_folios == 0)
+			break;
+
+		for (i = 0; i < nr_folios; i++) {
+			struct folio *folio = fbatch.folios[i];
+
+			done_index = folio->index;
+
+			folio_lock(folio);
+
+			/*
+			 * Page truncated or invalidated. We can freely skip it
+			 * then, even for data integrity operations: the page
+			 * has disappeared concurrently, so there could be no
+			 * real expectation of this data integrity operation
+			 * even if there is now a new, dirty page at the same
+			 * pagecache address.
+			 */
+			if (unlikely(folio->mapping != mapping)) {
+continue_unlock:
+				folio_unlock(folio);
+				continue;
+			}
+
+			if (!folio_test_dirty(folio)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
+			}
+
+			if (folio_test_writeback(folio)) {
+				if (wbc->sync_mode != WB_SYNC_NONE)
+					folio_wait_writeback(folio);
+				else
+					goto continue_unlock;
+			}
+
+			BUG_ON(folio_test_writeback(folio));
+			if (!folio_clear_dirty_for_io(folio))
+				goto continue_unlock;
+
+			//trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
+			error = afs_writepages_add_folio(folio, wbc, ctx);
+			if (unlikely(error)) {
+				/*
+				 * Handle errors according to the type of
+				 * writeback. There's no need to continue for
+				 * background writeback. Just push done_index
+				 * past this page so media errors won't choke
+				 * writeout for the entire file. For integrity
+				 * writeback, we must process the entire dirty
+				 * set regardless of errors because the fs may
+				 * still have state to clear for each page. In
+				 * that case we continue processing and return
+				 * the first error.
+				 */
+				if (error == AOP_WRITEPAGE_ACTIVATE) {
+					folio_unlock(folio);
+					error = 0;
+				} else if (wbc->sync_mode != WB_SYNC_ALL) {
+					ret = error;
+					done_index = folio->index +
+						folio_nr_pages(folio);
+					done = 1;
+					break;
+				}
+				if (!ret)
+					ret = error;
+			}
+
+			/*
+			 * We stop writing back only if we are not doing
+			 * integrity sync. In case of integrity sync we have to
+			 * keep going until we have written all the pages
+			 * we tagged for writeback prior to entering this loop.
+			 */
+			if (--wbc->nr_to_write <= 0 &&
+			    wbc->sync_mode == WB_SYNC_NONE) {
+				done = 1;
+				break;
+			}
+		}
+		folio_batch_release(&fbatch);
+		cond_resched();
+	}
+
+	/*
+	 * If we hit the last page and there is more work to be done: wrap
+	 * the index back to the start of the file for the next
+	 * time we are called.
+	 */
+	if (wbc->range_cyclic && !done)
+		done_index = 0;
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = done_index;
+
+	return ret;
+}
 
 /*
  * write some of the pending data back to the server
@@ -523,7 +658,7 @@ int afs_writepages(struct address_space *mapping,
 	else if (!down_read_trylock(&vnode->validate_lock))
 		return 0;
 
-	ret = write_cache_pages(mapping, wbc, afs_writepages_add_folio, &ctx);
+	ret = afs_write_cache_pages(mapping, wbc, &ctx);
 	if (ret >= 0 && ctx.begun)
 		ret = afs_writepages_submit(mapping, wbc, &ctx);