lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <167391061117.2311931.16807283804788007499.stgit@warthog.procyon.org.uk>
Date:   Mon, 16 Jan 2023 23:10:11 +0000
From:   David Howells <dhowells@...hat.com>
To:     Al Viro <viro@...iv.linux.org.uk>
Cc:     Jens Axboe <axboe@...nel.dk>, Jan Kara <jack@...e.cz>,
        Christoph Hellwig <hch@....de>,
        Matthew Wilcox <willy@...radead.org>,
        Logan Gunthorpe <logang@...tatee.com>,
        linux-fsdevel@...r.kernel.org, linux-block@...r.kernel.org,
        dhowells@...hat.com, Christoph Hellwig <hch@...radead.org>,
        Matthew Wilcox <willy@...radead.org>,
        Jens Axboe <axboe@...nel.dk>, Jan Kara <jack@...e.cz>,
        Jeff Layton <jlayton@...nel.org>,
        Logan Gunthorpe <logang@...tatee.com>,
        linux-fsdevel@...r.kernel.org, linux-block@...r.kernel.org,
        linux-kernel@...r.kernel.org
Subject: [PATCH v6 18/34] dio: Pin pages rather than ref'ing if appropriate

Convert the generic direct-I/O code to use iov_iter_extract_pages() instead
of iov_iter_get_pages().  This will pin pages or leave them unaltered
rather than getting a ref on them as appropriate to the iterator.

The pages need to be pinned for DIO-read rather than having refs taken on
them to prevent VM copy-on-write from malfunctioning during a concurrent
fork() (the result of the I/O would otherwise end up only visible to the
child process and not the parent).

Signed-off-by: David Howells <dhowells@...hat.com>
cc: Al Viro <viro@...iv.linux.org.uk>
cc: Jens Axboe <axboe@...nel.dk>
cc: Jan Kara <jack@...e.cz>
cc: Christoph Hellwig <hch@....de>
cc: Matthew Wilcox <willy@...radead.org>
cc: Logan Gunthorpe <logang@...tatee.com>
cc: linux-fsdevel@...r.kernel.org
cc: linux-block@...r.kernel.org
---

 fs/direct-io.c |   57 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index b1e26a706e31..b4d2c9f85a5b 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -142,9 +142,11 @@ struct dio {
 
 	/*
 	 * pages[] (and any fields placed after it) are not zeroed out at
-	 * allocation time.  Don't add new fields after pages[] unless you
-	 * wish that they not be zeroed.
+	 * allocation time.  Don't add new fields after pages[] unless you wish
+	 * that they not be zeroed.  Pages may have a ref taken, a pin emplaced
+	 * or no retention measures.
 	 */
+	unsigned int cleanup_mode;	/* How pages should be cleaned up (0/FOLL_GET/PIN) */
 	union {
 		struct page *pages[DIO_PAGES];	/* page buffer */
 		struct work_struct complete_work;/* deferred AIO completion */
@@ -167,12 +169,13 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio)
 static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 {
 	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
+	unsigned int gup_flags =
+		op_is_write(dio_op) ? FOLL_SOURCE_BUF : FOLL_DEST_BUF;
+	struct page **pages = dio->pages;
 	ssize_t ret;
 
-	ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
-				 &sdio->from,
-				 op_is_write(dio_op) ?
-				 FOLL_SOURCE_BUF : FOLL_DEST_BUF);
+	ret = iov_iter_extract_pages(sdio->iter, &pages, LONG_MAX, DIO_PAGES,
+				     gup_flags, &sdio->from);
 
 	if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) {
 		struct page *page = ZERO_PAGE(0);
@@ -183,7 +186,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 		 */
 		if (dio->page_errors == 0)
 			dio->page_errors = ret;
-		get_page(page);
+		dio->cleanup_mode = 0;
 		dio->pages[0] = page;
 		sdio->head = 0;
 		sdio->tail = 1;
@@ -197,6 +200,8 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 		sdio->head = 0;
 		sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
 		sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
+		dio->cleanup_mode =
+			iov_iter_extract_mode(sdio->iter, gup_flags);
 		return 0;
 	}
 	return ret;	
@@ -400,6 +405,10 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 	 * we request a valid number of vectors.
 	 */
 	bio = bio_alloc(bdev, nr_vecs, dio->opf, GFP_KERNEL);
+	if (!(dio->cleanup_mode & FOLL_GET))
+		bio_clear_flag(bio, BIO_PAGE_REFFED);
+	if (dio->cleanup_mode & FOLL_PIN)
+		bio_set_flag(bio, BIO_PAGE_PINNED);
 	bio->bi_iter.bi_sector = first_sector;
 	if (dio->is_async)
 		bio->bi_end_io = dio_bio_end_aio;
@@ -443,13 +452,18 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	sdio->logical_offset_in_bio = 0;
 }
 
+static void dio_cleanup_page(struct dio *dio, struct page *page)
+{
+	page_put_unpin(page, dio->cleanup_mode);
+}
+
 /*
  * Release any resources in case of a failure
  */
 static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
 {
 	while (sdio->head < sdio->tail)
-		put_page(dio->pages[sdio->head++]);
+		dio_cleanup_page(dio, dio->pages[sdio->head++]);
 }
 
 /*
@@ -704,7 +718,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
  *
  * Return zero on success.  Non-zero means the caller needs to start a new BIO.
  */
-static inline int dio_bio_add_page(struct dio_submit *sdio)
+static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret;
 
@@ -771,11 +785,11 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
 			goto out;
 	}
 
-	if (dio_bio_add_page(sdio) != 0) {
+	if (dio_bio_add_page(dio, sdio) != 0) {
 		dio_bio_submit(dio, sdio);
 		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
 		if (ret == 0) {
-			ret = dio_bio_add_page(sdio);
+			ret = dio_bio_add_page(dio, sdio);
 			BUG_ON(ret != 0);
 		}
 	}
@@ -832,13 +846,16 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 	 */
 	if (sdio->cur_page) {
 		ret = dio_send_cur_page(dio, sdio, map_bh);
-		put_page(sdio->cur_page);
+		dio_cleanup_page(dio, sdio->cur_page);
 		sdio->cur_page = NULL;
 		if (ret)
 			return ret;
 	}
 
-	get_page(page);		/* It is in dio */
+	ret = try_grab_page(page, dio->cleanup_mode);		/* It is in dio */
+	if (ret < 0)
+		return ret;
+
 	sdio->cur_page = page;
 	sdio->cur_page_offset = offset;
 	sdio->cur_page_len = len;
@@ -853,7 +870,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 		ret = dio_send_cur_page(dio, sdio, map_bh);
 		if (sdio->bio)
 			dio_bio_submit(dio, sdio);
-		put_page(sdio->cur_page);
+		dio_cleanup_page(dio, sdio->cur_page);
 		sdio->cur_page = NULL;
 	}
 	return ret;
@@ -954,7 +971,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 
 				ret = get_more_blocks(dio, sdio, map_bh);
 				if (ret) {
-					put_page(page);
+					dio_cleanup_page(dio, page);
 					goto out;
 				}
 				if (!buffer_mapped(map_bh))
@@ -999,7 +1016,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
 				if (dio_op == REQ_OP_WRITE) {
-					put_page(page);
+					dio_cleanup_page(dio, page);
 					return -ENOTBLK;
 				}
 
@@ -1012,7 +1029,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 				if (sdio->block_in_file >=
 						i_size_aligned >> blkbits) {
 					/* We hit eof */
-					put_page(page);
+					dio_cleanup_page(dio, page);
 					goto out;
 				}
 				zero_user(page, from, 1 << blkbits);
@@ -1052,7 +1069,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 						  sdio->next_block_for_io,
 						  map_bh);
 			if (ret) {
-				put_page(page);
+				dio_cleanup_page(dio, page);
 				goto out;
 			}
 			sdio->next_block_for_io += this_chunk_blocks;
@@ -1068,7 +1085,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 		}
 
 		/* Drop the ref which was taken in get_user_pages() */
-		put_page(page);
+		dio_cleanup_page(dio, page);
 	}
 out:
 	return ret;
@@ -1288,7 +1305,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
 		if (retval == 0)
 			retval = ret2;
-		put_page(sdio.cur_page);
+		dio_cleanup_page(dio, sdio.cur_page);
 		sdio.cur_page = NULL;
 	}
 	if (sdio.bio)


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ