linux-kernel - [RFC PATCH 20/22] ext4: add support for read_iter, write_iter, and direct_IO_bvec

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1330377576-3659-21-git-send-email-dave.kleikamp@oracle.com>
Date:	Mon, 27 Feb 2012 15:19:34 -0600
From:	Dave Kleikamp <dave.kleikamp@...cle.com>
To:	linux-fsdevel@...r.kernel.org
Cc:	linux-kernel@...r.kernel.org, Zach Brown <zab@...bo.net>,
	Dave Kleikamp <dave.kleikamp@...cle.com>,
	"Theodore Ts'o" <tytso@....edu>,
	Andreas Dilger <adilger.kernel@...ger.ca>,
	linux-ext4@...r.kernel.org
Subject: [RFC PATCH 20/22] ext4: add support for read_iter, write_iter, and direct_IO_bvec

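Wire up .read_iter and .write_iter in ext4_file_operations and add a
.direct_IO_bvec method to each of the ext4 address_space_operations.

Some helpers were broken out of ext4_ind_direct_IO() and
ext4_ext_direct_IO() in order to avoid code duplication in the new
bio_vec-based functions.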

Signed-off-by: Dave Kleikamp <dave.kleikamp@...cle.com>
Cc: Zach Brown <zab@...bo.net>
Cc: "Theodore Ts'o" <tytso@....edu>
Cc: Andreas Dilger <adilger.kernel@...ger.ca>
Cc: linux-ext4@...r.kernel.org
---
 fs/ext4/ext4.h     |    3 +
 fs/ext4/file.c     |    2 +
 fs/ext4/indirect.c |  169 +++++++++++++++++++++++++++++++-----------
 fs/ext4/inode.c    |  206 +++++++++++++++++++++++++++++++++++-----------------
 4 files changed, 268 insertions(+), 112 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 513004f..6426d43 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1905,6 +1905,9 @@ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 				const struct iovec *iov, loff_t offset,
 				unsigned long nr_segs);
+extern ssize_t ext4_ind_direct_IO_bvec(int rw, struct kiocb *iocb,
+				struct bio_vec *bvec, loff_t offset,
+				unsigned long bvec_len);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
 extern void ext4_ind_truncate(struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cb70f18..ce76745 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -234,6 +234,8 @@ const struct file_operations ext4_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= ext4_file_write,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
 	.unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext4_compat_ioctl,
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2..e8ca3b9 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -760,6 +760,72 @@ out:
 	return err;
 }
 
+static ssize_t ext4_journal_orphan_add(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	handle_t *handle;
+	ssize_t ret;
+
+	/* Credits for sb + inode write */
+	handle = ext4_journal_start(inode, 2);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+	ret = ext4_orphan_add(handle, inode);
+	if (ret) {
+		ext4_journal_stop(handle);
+		goto out;
+	}
+	ei->i_disksize = inode->i_size;
+	ext4_journal_stop(handle);
+out:
+	return ret;
+}
+
+static ssize_t ext4_journal_orphan_del(struct inode *inode, ssize_t ret,
+				       loff_t offset)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	handle_t *handle;
+	int err;
+
+	/* Credits for sb + inode write */
+	handle = ext4_journal_start(inode, 2);
+	if (IS_ERR(handle)) {
+		/* This is really bad luck. We've written the data
+		 * but cannot extend i_size. Bail out and pretend
+		 * the write failed... */
+		ret = PTR_ERR(handle);
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+
+		goto out;
+	}
+	if (inode->i_nlink)
+		ext4_orphan_del(handle, inode);
+	if (ret > 0) {
+		loff_t end = offset + ret;
+		if (end > inode->i_size) {
+			ei->i_disksize = end;
+			i_size_write(inode, end);
+			/*
+			 * We're going to return a positive `ret'
+			 * here due to non-zero-length I/O, so there's
+			 * no way of reporting error returns from
+			 * ext4_mark_inode_dirty() to userspace.  So
+			 * ignore it.
+			 */
+			ext4_mark_inode_dirty(handle, inode);
+		}
+	}
+	err = ext4_journal_stop(handle);
+	if (ret == 0)
+		ret = err;
+out:
+	return ret;
+}
+
 /*
  * O_DIRECT for ext3 (or indirect map) based files
  *
@@ -778,7 +844,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	handle_t *handle;
 	ssize_t ret;
 	int orphan = 0;
 	size_t count = iov_length(iov, nr_segs);
@@ -788,20 +853,10 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 		loff_t final_size = offset + count;
 
 		if (final_size > inode->i_size) {
-			/* Credits for sb + inode write */
-			handle = ext4_journal_start(inode, 2);
-			if (IS_ERR(handle)) {
-				ret = PTR_ERR(handle);
-				goto out;
-			}
-			ret = ext4_orphan_add(handle, inode);
-			if (ret) {
-				ext4_journal_stop(handle);
+			ret =  ext4_journal_orphan_add(inode);
+			if (ret)
 				goto out;
-			}
 			orphan = 1;
-			ei->i_disksize = inode->i_size;
-			ext4_journal_stop(handle);
 		}
 	}
 
@@ -831,42 +886,68 @@ retry:
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 
-	if (orphan) {
-		int err;
+	if (orphan)
+		ret = ext4_journal_orphan_del(inode, ret, offset);
+out:
+	return ret;
+}
 
-		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, 2);
-		if (IS_ERR(handle)) {
-			/* This is really bad luck. We've written the data
-			 * but cannot extend i_size. Bail out and pretend
-			 * the write failed... */
-			ret = PTR_ERR(handle);
-			if (inode->i_nlink)
-				ext4_orphan_del(NULL, inode);
+/*
+ * Like ext4_ind_direct_IO, but operates on bio_vec instead of iovec
+ */
+ssize_t ext4_ind_direct_IO_bvec(int rw, struct kiocb *iocb,
+				struct bio_vec *bvec, loff_t offset,
+				unsigned long bvec_len)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ssize_t ret;
+	int orphan = 0;
+	size_t count = bvec_length(bvec, bvec_len);
+	int retries = 0;
+
+	if (rw == WRITE) {
+		loff_t final_size = offset + count;
 
-			goto out;
+		if (final_size > inode->i_size) {
+			ret =  ext4_journal_orphan_add(inode);
+			if (ret)
+				goto out;
+			orphan = 1;
 		}
-		if (inode->i_nlink)
-			ext4_orphan_del(handle, inode);
-		if (ret > 0) {
-			loff_t end = offset + ret;
-			if (end > inode->i_size) {
-				ei->i_disksize = end;
-				i_size_write(inode, end);
-				/*
-				 * We're going to return a positive `ret'
-				 * here due to non-zero-length I/O, so there's
-				 * no way of reporting error returns from
-				 * ext4_mark_inode_dirty() to userspace.  So
-				 * ignore it.
-				 */
-				ext4_mark_inode_dirty(handle, inode);
-			}
+	}
+
+retry:
+	if (rw == READ && ext4_should_dioread_nolock(inode)) {
+		if (unlikely(!list_empty(&ei->i_completed_io_list))) {
+			mutex_lock(&inode->i_mutex);
+			ext4_flush_completed_IO(inode);
+			mutex_unlock(&inode->i_mutex);
+		}
+		ret = __blockdev_direct_IO_bvec(rw, iocb, inode,
+				 inode->i_sb->s_bdev, bvec,
+				 offset, bvec_len,
+				 ext4_get_block, NULL, NULL, 0);
+	} else {
+		ret = blockdev_direct_IO_bvec(rw, iocb, inode,
+				 inode->i_sb->s_bdev, bvec,
+				 offset, bvec_len,
+				 ext4_get_block, NULL);
+
+		if (unlikely((rw & WRITE) && ret < 0)) {
+			loff_t isize = i_size_read(inode);
+			loff_t end = offset + bvec_length(bvec, bvec_len);
+
+			if (end > isize)
+				ext4_truncate_failed_write(inode);
 		}
-		err = ext4_journal_stop(handle);
-		if (ret == 0)
-			ret = err;
 	}
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+	if (orphan)
+		ret = ext4_journal_orphan_del(inode, ret, offset);
 out:
 	return ret;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feaa82f..922b26f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2764,7 +2764,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	ext_debug("ext4_end_io_dio(): io_end 0x%p "
 		  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
- 		  iocb->private, io_end->inode->i_ino, iocb, offset,
+		  iocb->private, io_end->inode->i_ino, iocb, offset,
 		  size);
 
 	iocb->private = NULL;
@@ -2868,6 +2868,85 @@ retry:
 	return 0;
 }
 
+static ssize_t ext4_ext_direct_IO_pre_write(struct kiocb *iocb,
+					    struct inode *inode)
+{
+	/*
+	 * We could direct write to holes and fallocate.
+	 *
+	 * Allocated blocks to fill the hole are marked as uninitialized
+	 * to prevent parallel buffered read to expose the stale data
+	 * before DIO complete the data IO.
+	 *
+	 * As to previously fallocated extents, ext4 get_block
+	 * will just simply mark the buffer mapped but still
+	 * keep the extents uninitialized.
+	 *
+	 * for non AIO case, we will convert those unwritten extents
+	 * to written after return back from blockdev_direct_IO.
+	 *
+	 * for async DIO, the conversion needs to be deferred when
+	 * the IO is completed. The ext4 end_io callback function
+	 * will be called to take care of the conversion work.
+	 * Here for async case, we allocate an io_end structure to
+	 * hook to the iocb.
+	 */
+	iocb->private = NULL;
+	EXT4_I(inode)->cur_aio_dio = NULL;
+	if (!is_sync_kiocb(iocb)) {
+		iocb->private = ext4_init_io_end(inode, GFP_NOFS);
+		if (!iocb->private)
+			return -ENOMEM;
+		/*
+		 * we save the io structure for current async
+		 * direct IO, so that later ext4_map_blocks()
+		 * could flag the io structure whether there
+		 * is a unwritten extents needs to be converted
+		 * when IO is completed.
+		 */
+		EXT4_I(inode)->cur_aio_dio = iocb->private;
+	}
+	return 0;
+}
+
+static ssize_t ext4_ext_direct_IO_post_write(struct kiocb *iocb,
+					     struct inode *inode,
+					     loff_t offset, ssize_t ret)
+{
+	if (iocb->private)
+			EXT4_I(inode)->cur_aio_dio = NULL;
+	/*
+	 * The io_end structure takes a reference to the inode,
+	 * that structure needs to be destroyed and the
+	 * reference to the inode need to be dropped, when IO is
+	 * complete, even with 0 byte write, or failed.
+	 *
+	 * In the successful AIO DIO case, the io_end structure will be
+	 * destroyed and the reference to the inode will be dropped
+	 * after the end_io call back function is called.
+	 *
+	 * In the case there is 0 byte write, or error case, since
+	 * VFS direct IO won't invoke the end_io call back function,
+	 * we need to free the end_io structure here.
+	 */
+	if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+		ext4_free_io_end(iocb->private);
+		iocb->private = NULL;
+	} else if (ret > 0 &&
+		   ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) {
+		int err;
+		/*
+		 * for non AIO case, since the IO is already
+		 * completed, we could do the conversion right here
+		 */
+		err = ext4_convert_unwritten_extents(inode, offset, ret);
+		if (err < 0)
+			ret = err;
+		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+	}
+	return ret;
+}
+
 /*
  * For ext4 extent files, ext4 will do direct-io write to holes,
  * preallocated extents, and those write extend the file, no need to
@@ -2898,41 +2977,9 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 
 	loff_t final_size = offset + count;
 	if (rw == WRITE && final_size <= inode->i_size) {
-		/*
- 		 * We could direct write to holes and fallocate.
-		 *
- 		 * Allocated blocks to fill the hole are marked as uninitialized
- 		 * to prevent parallel buffered read to expose the stale data
- 		 * before DIO complete the data IO.
-		 *
- 		 * As to previously fallocated extents, ext4 get_block
- 		 * will just simply mark the buffer mapped but still
- 		 * keep the extents uninitialized.
- 		 *
-		 * for non AIO case, we will convert those unwritten extents
-		 * to written after return back from blockdev_direct_IO.
-		 *
-		 * for async DIO, the conversion needs to be defered when
-		 * the IO is completed. The ext4 end_io callback function
-		 * will be called to take care of the conversion work.
-		 * Here for async case, we allocate an io_end structure to
-		 * hook to the iocb.
- 		 */
-		iocb->private = NULL;
-		EXT4_I(inode)->cur_aio_dio = NULL;
-		if (!is_sync_kiocb(iocb)) {
-			iocb->private = ext4_init_io_end(inode, GFP_NOFS);
-			if (!iocb->private)
-				return -ENOMEM;
-			/*
-			 * we save the io structure for current async
-			 * direct IO, so that later ext4_map_blocks()
-			 * could flag the io structure whether there
-			 * is a unwritten extents needs to be converted
-			 * when IO is completed.
-			 */
-			EXT4_I(inode)->cur_aio_dio = iocb->private;
-		}
+		ret = ext4_ext_direct_IO_pre_write(iocb, inode);
+		if (ret)
+			return ret;
 
 		ret = __blockdev_direct_IO(rw, iocb, inode,
 					 inode->i_sb->s_bdev, iov,
@@ -2941,38 +2988,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 					 ext4_end_io_dio,
 					 NULL,
 					 DIO_LOCKING | DIO_SKIP_HOLES);
-		if (iocb->private)
-			EXT4_I(inode)->cur_aio_dio = NULL;
-		/*
-		 * The io_end structure takes a reference to the inode,
-		 * that structure needs to be destroyed and the
-		 * reference to the inode need to be dropped, when IO is
-		 * complete, even with 0 byte write, or failed.
-		 *
-		 * In the successful AIO DIO case, the io_end structure will be
-		 * desctroyed and the reference to the inode will be dropped
-		 * after the end_io call back function is called.
-		 *
-		 * In the case there is 0 byte write, or error case, since
-		 * VFS direct IO won't invoke the end_io call back function,
-		 * we need to free the end_io structure here.
-		 */
-		if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
-			ext4_free_io_end(iocb->private);
-			iocb->private = NULL;
-		} else if (ret > 0 && ext4_test_inode_state(inode,
-						EXT4_STATE_DIO_UNWRITTEN)) {
-			int err;
-			/*
-			 * for non AIO case, since the IO is already
-			 * completed, we could do the conversion right here
-			 */
-			err = ext4_convert_unwritten_extents(inode,
-							     offset, ret);
-			if (err < 0)
-				ret = err;
-			ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-		}
+		ret = ext4_ext_direct_IO_post_write(iocb, inode, offset, ret);
 		return ret;
 	}
 
@@ -2980,6 +2996,37 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
 }
 
+/*
+ * Like ext4_ext_direct_IO, but operates on a bio_vec rather than iovec.
+ */
+static ssize_t ext4_ext_direct_IO_bvec(int rw, struct kiocb *iocb,
+			      struct bio_vec *bvec, loff_t offset,
+			      unsigned long bvec_len)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+	size_t count = bvec_length(bvec, bvec_len);
+
+	loff_t final_size = offset + count;
+	if (rw == WRITE && final_size <= inode->i_size) {
+		ret = ext4_ext_direct_IO_pre_write(iocb, inode);
+		if (ret)
+			return ret;
+
+		ret = blockdev_direct_IO_bvec(rw, iocb, inode,
+					 inode->i_sb->s_bdev, bvec,
+					 offset, bvec_len,
+					 ext4_get_block_write,
+					 ext4_end_io_dio);
+		ret = ext4_ext_direct_IO_post_write(iocb, inode, offset, ret);
+		return ret;
+	}
+
+	/* for writes past the end of file, we fall back to the old way */
+	return ext4_ind_direct_IO_bvec(rw, iocb, bvec, offset, bvec_len);
+}
+
 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 			      const struct iovec *iov, loff_t offset,
 			      unsigned long nr_segs)
@@ -3004,6 +3051,25 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 	return ret;
 }
 
+static ssize_t ext4_direct_IO_bvec(int rw, struct kiocb *iocb,
+			      struct bio_vec *bvec, loff_t offset,
+			      unsigned long bvec_len)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+
+	trace_ext4_direct_IO_enter(inode, offset, bvec_length(bvec, bvec_len),
+				   rw);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ret = ext4_ext_direct_IO_bvec(rw, iocb, bvec, offset, bvec_len);
+	else
+		ret = ext4_ind_direct_IO_bvec(rw, iocb, bvec, offset, bvec_len);
+	trace_ext4_direct_IO_exit(inode, offset, bvec_length(bvec, bvec_len),
+				  rw, ret);
+	return ret;
+}
+
 /*
  * Pages can be marked dirty completely asynchronously from ext4's journalling
  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
@@ -3033,6 +3099,7 @@ static const struct address_space_operations ext4_ordered_aops = {
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
 	.direct_IO		= ext4_direct_IO,
+	.direct_IO_bvec		= ext4_direct_IO_bvec,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
@@ -3048,6 +3115,7 @@ static const struct address_space_operations ext4_writeback_aops = {
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
 	.direct_IO		= ext4_direct_IO,
+	.direct_IO_bvec		= ext4_direct_IO_bvec,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
@@ -3064,6 +3132,7 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
 	.direct_IO		= ext4_direct_IO,
+	.direct_IO_bvec		= ext4_direct_IO_bvec,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 };
@@ -3079,6 +3148,7 @@ static const struct address_space_operations ext4_da_aops = {
 	.invalidatepage		= ext4_da_invalidatepage,
 	.releasepage		= ext4_releasepage,
 	.direct_IO		= ext4_direct_IO,
+	.direct_IO_bvec		= ext4_direct_IO_bvec,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
-- 
1.7.9.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/