lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <162879975253.3306668.15630001599959638168.stgit@warthog.procyon.org.uk>
Date:   Thu, 12 Aug 2021 21:22:32 +0100
From:   David Howells <dhowells@...hat.com>
To:     willy@...radead.org
Cc:     Trond Myklebust <trond.myklebust@...merspace.com>,
        linux-nfs@...r.kernel.org, dhowells@...hat.com,
        dhowells@...hat.com, trond.myklebust@...marydata.com,
        darrick.wong@...cle.com, hch@....de, viro@...iv.linux.org.uk,
        jlayton@...nel.org, sfrench@...ba.org,
        torvalds@...ux-foundation.org, linux-nfs@...r.kernel.org,
        linux-mm@...ck.org, linux-fsdevel@...r.kernel.org,
        linux-kernel@...r.kernel.org
Subject: [RFC PATCH v2 4/5] mm: Make __swap_writepage() do async DIO if asked
 for it

Make __swap_writepage()'s DIO path do sync DIO if the writeback control's
sync mode is WB_SYNC_ALL and async DIO if not.

Note that this causes hanging processes in sunrpc if the swapfile is on
NFS.  I'm not sure whether it's due to mis-scheduling or something else.

Suggested-by: Matthew Wilcox (Oracle) <willy@...radead.org>
Signed-off-by: David Howells <dhowells@...hat.com>
cc: Trond Myklebust <trond.myklebust@...merspace.com>
cc: linux-nfs@...r.kernel.org
---

 mm/page_io.c |  145 +++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 102 insertions(+), 43 deletions(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index 92ec4a7b0545..dae7bbd7a842 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -300,6 +300,105 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 #define bio_associate_blkg_from_page(bio, page)		do { } while (0)
 #endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
 
+/*
+ * Common completion handling for a direct-I/O write of a swap page.
+ *
+ * @iocb: the kiocb used for the write; ->ki_swap_page names the page.
+ * @ret:  result of the write (bytes written or a negative error).
+ * @ret2: not used.
+ *
+ * A write of exactly thp_size(page) bytes is counted as a swap-out; any
+ * other result redirties the page and logs a rate-limited error.  In both
+ * cases writeback on the page is ended.
+ */
+static void __swapfile_write_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	struct page *page = iocb->ki_swap_page;
+
+	if (ret != thp_size(page)) {
+		/*
+		 * With swap-over-NFS the failure may only be temporary, for
+		 * instance when memory for transmit buffers is short.  So
+		 * redirty the page and clear PageReclaim (which avoids
+		 * folio_rotate_reclaimable), and rate-limit the message.
+		 * Unlike the normal direct-to-bio case, PageError is not
+		 * set because the condition could be transient.
+		 */
+		set_page_dirty(page);
+		ClearPageReclaim(page);
+		pr_err_ratelimited("Write error (%ld) on dio swapfile (%llu)\n",
+				   ret, page_file_offset(page));
+	} else {
+		count_vm_event(PSWPOUT);
+	}
+
+	end_page_writeback(page);
+}
+
+/*
+ * Completion handler for a swap write issued through a swapfile_kiocb:
+ * run the common page completion, then call swapfile_put_kiocb() on the
+ * swapfile_kiocb that embeds @iocb.
+ */
+static void swapfile_write_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	__swapfile_write_complete(iocb, ret, ret2);
+	swapfile_put_kiocb(container_of(iocb, struct swapfile_kiocb, iocb));
+}
+
+/*
+ * Write one swap page to the swapfile by direct I/O using an on-stack
+ * kiocb, and run the completion handling inline once direct_IO() returns.
+ *
+ * @sis:  swap area; only ->swap_file is used.
+ * @page: the page to write; it is marked under writeback and unlocked
+ *        before the I/O is issued.
+ * @wbc:  not used by this function.
+ *
+ * Returns 0 if direct_IO() returned a positive count, otherwise whatever
+ * direct_IO() returned.
+ */
+static int swapfile_write_sync(struct swap_info_struct *sis,
+			       struct page *page, struct writeback_control *wbc)
+{
+	struct file *swap_file = sis->swap_file;
+	struct bio_vec bvec = {
+		.bv_page	= page,
+		.bv_len		= thp_size(page),
+		.bv_offset	= 0
+	};
+	struct iov_iter iter;
+	struct kiocb kiocb;
+	int err;
+
+	/* Describe the write: whole page, at the page's file offset. */
+	init_sync_kiocb(&kiocb, swap_file);
+	kiocb.ki_flags		= IOCB_DIRECT | IOCB_WRITE | IOCB_SWAP;
+	kiocb.ki_pos		= page_file_offset(page);
+	kiocb.ki_swap_page	= page;
+	iov_iter_bvec(&iter, WRITE, &bvec, 1, thp_size(page));
+
+	/* Mark the page under writeback before unlocking it. */
+	set_page_writeback(page);
+	unlock_page(page);
+
+	err = swap_file->f_mapping->a_ops->direct_IO(&kiocb, &iter);
+	__swapfile_write_complete(&kiocb, err, 0);
+
+	if (err > 0)
+		return 0;
+	return err;
+}
+
+/*
+ * Write a swap page to a filesystem-backed swapfile using direct I/O.
+ *
+ * @sis:  swap area; only ->swap_file is used here.
+ * @page: the page to write.
+ * @wbc:  writeback control; WB_SYNC_ALL selects the synchronous path.
+ *
+ * For WB_SYNC_ALL the write is handed to swapfile_write_sync().  Otherwise
+ * a swapfile_kiocb is allocated and the write is issued with a completion
+ * callback.  The swapfile_kiocb starts with two references: one is put by
+ * swapfile_write_complete() and one is put here after submission.
+ *
+ * Returns 0 if the write completed with a positive count or was queued for
+ * asynchronous completion, otherwise the value returned by direct_IO().
+ */
+static int swapfile_write(struct swap_info_struct *sis,
+			  struct page *page, struct writeback_control *wbc)
+{
+	struct swapfile_kiocb *ki;
+	struct file *swap_file = sis->swap_file;
+	struct bio_vec bv = {
+		.bv_page	= page,
+		.bv_len		= thp_size(page),
+		.bv_offset	= 0
+	};
+	struct iov_iter from;
+	int ret;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		return swapfile_write_sync(sis, page, wbc);
+
+	/*
+	 * NOTE(review): GFP_KERNEL is kept from the original; confirm it is
+	 * appropriate for the contexts this is called from.
+	 */
+	ki = kzalloc(sizeof(*ki), GFP_KERNEL);
+	if (!ki) {
+		/*
+		 * At this point the page has been neither marked for
+		 * writeback nor unlocked, so returning -ENOMEM would hand it
+		 * back to the caller still locked.  Use the on-stack
+		 * synchronous path instead, which needs no allocation.
+		 */
+		return swapfile_write_sync(sis, page, wbc);
+	}
+
+	refcount_set(&ki->ki_refcnt, 2);
+	/*
+	 * The iterator length has to agree with bv.bv_len and with the size
+	 * __swapfile_write_complete() compares against, i.e. thp_size(page)
+	 * rather than PAGE_SIZE.
+	 */
+	iov_iter_bvec(&from, WRITE, &bv, 1, thp_size(page));
+	init_sync_kiocb(&ki->iocb, swap_file);
+	ki->iocb.ki_swap_page	= page;
+	ki->iocb.ki_pos		= page_file_offset(page);
+	ki->iocb.ki_flags	= IOCB_DIRECT | IOCB_WRITE | IOCB_SWAP;
+	ki->iocb.ki_complete	= swapfile_write_complete;
+	/*
+	 * Extra file reference for the in-flight kiocb; presumably released
+	 * by swapfile_put_kiocb() on the final put -- TODO confirm, that
+	 * helper is not visible here.
+	 */
+	get_file(swap_file);
+
+	set_page_writeback(page);
+	unlock_page(page);
+	ret = swap_file->f_mapping->a_ops->direct_IO(&ki->iocb, &from);
+
+	/* Not queued: nobody else will complete the write, so do it here. */
+	if (ret != -EIOCBQUEUED)
+		swapfile_write_complete(&ki->iocb, ret, 0);
+	swapfile_put_kiocb(ki);
+
+	/*
+	 * -EIOCBQUEUED only says the write will complete later through
+	 * ->ki_complete; it must not be reported to the caller as an error.
+	 */
+	if (ret > 0 || ret == -EIOCBQUEUED)
+		return 0;
+	return ret;
+}
+
 int __swap_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct bio *bio;
@@ -307,47 +406,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc)
 	struct swap_info_struct *sis = page_swap_info(page);
 
 	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
-	if (data_race(sis->flags & SWP_FS_OPS)) {
-		struct kiocb kiocb;
-		struct file *swap_file = sis->swap_file;
-		struct address_space *mapping = swap_file->f_mapping;
-		struct bio_vec bv = {
-			.bv_page = page,
-			.bv_len  = PAGE_SIZE,
-			.bv_offset = 0
-		};
-		struct iov_iter from;
-
-		iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
-		init_sync_kiocb(&kiocb, swap_file);
-		kiocb.ki_pos	= page_file_offset(page);
-		kiocb.ki_flags	= IOCB_DIRECT | IOCB_WRITE | IOCB_SWAP;
-
-		set_page_writeback(page);
-		unlock_page(page);
-		ret = mapping->a_ops->direct_IO(&kiocb, &from);
-		if (ret == PAGE_SIZE) {
-			count_vm_event(PSWPOUT);
-			ret = 0;
-		} else {
-			/*
-			 * In the case of swap-over-nfs, this can be a
-			 * temporary failure if the system has limited
-			 * memory for allocating transmit buffers.
-			 * Mark the page dirty and avoid
-			 * folio_rotate_reclaimable but rate-limit the
-			 * messages but do not flag PageError like
-			 * the normal direct-to-bio case as it could
-			 * be temporary.
-			 */
-			set_page_dirty(page);
-			ClearPageReclaim(page);
-			pr_err_ratelimited("Write error (%d) on dio swapfile (%llu)\n",
-					   ret, page_file_offset(page));
-		}
-		end_page_writeback(page);
-		return ret;
-	}
+	if (data_race(sis->flags & SWP_FS_OPS))
+		return swapfile_write(sis, page, wbc);
 
 	ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
 	if (!ret) {
@@ -410,7 +470,6 @@ static int swapfile_read_sync(struct swap_info_struct *sis, struct page *page)
 	init_sync_kiocb(&kiocb, swap_file);
 	kiocb.ki_swap_page	= page;
 	kiocb.ki_pos		= page_file_offset(page);
-	kiocb.ki_filp		= swap_file;
 	kiocb.ki_flags		= IOCB_DIRECT | IOCB_SWAP;
 	/* Should set IOCB_HIPRI too, but the box becomes unresponsive whilst
 	 * putting out occasional messages about the NFS sunrpc scheduling
@@ -449,8 +508,8 @@ static int swapfile_read(struct swap_info_struct *sis, struct page *page,
 	ki->iocb.ki_swap_page	= page;
 	ki->iocb.ki_flags	= IOCB_DIRECT | IOCB_SWAP;
 	ki->iocb.ki_pos		= page_file_offset(page);
-	ki->iocb.ki_filp	= get_file(swap_file);
 	ki->iocb.ki_complete	= swapfile_read_complete;
+	get_file(swap_file);
 
 	iov_iter_bvec(&to, READ, &bv, 1, thp_size(page));
 	ret = swap_file->f_mapping->a_ops->direct_IO(&ki->iocb, &to);


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ