lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Wed, 15 Apr 2015 12:22:56 -0600
From:	Jens Axboe <axboe@...com>
To:	<linux-fsdevel@...r.kernel.org>
CC:	<hch@...radead.org>, <viro@...iv.linux.org.uk>,
	<akpm@...ux-foundation.org>, <linux-kernel@...r.kernel.org>
Subject: [PATCH] direct-io: only inc/dec inode->i_dio_count for file systems

Hi,

This is a reposting of a patch that was originally in the blk-mq series.
It has huge upside on shared access to a multiqueue device doing
O_DIRECT, it's basically the scaling block that ends up killing
performance. A quick test here reveals that we spend 30% of all system
time just incrementing and decremening inode->i_dio_count. For block
devices this isn't useful at all, as we don't need protection against
truncate. For that test case, performance increases about 3.6x (!!) by
getting rid of this inc/dec per IO.

I've cleaned it up a bit since last time, integrating the checks in
inode_dio_done() and adding a inode_dio_begin() so that callers don't
need to know about this.

We've been running a variant of this patch in the FB kernel for a while.
I'd like to finally get this upstream.

Signed-off-by: Jens Axboe <axboe@...com>

---

 fs/block_dev.c     |    2 +-
 fs/btrfs/inode.c   |    6 +++---
 fs/dax.c           |    4 ++--
 fs/direct-io.c     |    5 +++--
 fs/ext4/indirect.c |    6 +++---
 fs/ext4/inode.c    |    4 ++--
 fs/inode.c         |   21 +++++++++++++++++++--
 fs/nfs/direct.c    |   10 +++++-----
 include/linux/fs.h |    6 +++++-
 9 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 975266be67d3..9b290121301a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -155,7 +155,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 
 	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
 				    offset, blkdev_get_block,
-				    NULL, NULL, 0);
+				    NULL, NULL, DIO_IGNORE_TRUNCATE);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d2e732d7af52..6c66719179ba 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8129,7 +8129,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
 		return 0;
 
-	atomic_inc(&inode->i_dio_count);
+	inode_dio_begin(inode, flags);
 	smp_mb__after_atomic();
 
 	/*
@@ -8169,7 +8169,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 		current->journal_info = &outstanding_extents;
 	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 				     &BTRFS_I(inode)->runtime_flags)) {
-		inode_dio_done(inode);
+		inode_dio_done(inode, flags);
 		flags = DIO_LOCKING | DIO_SKIP_HOLES;
 		wakeup = false;
 	}
@@ -8188,7 +8188,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	}
 out:
 	if (wakeup)
-		inode_dio_done(inode);
+		inode_dio_done(inode, flags);
 	if (relock)
 		mutex_lock(&inode->i_mutex);
 
diff --git a/fs/dax.c b/fs/dax.c
index ed1619ec6537..0f55ee4ed2fe 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -210,7 +210,7 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
 	}
 
 	/* Protects against truncate */
-	atomic_inc(&inode->i_dio_count);
+	inode_dio_begin(inode, flags);
 
 	retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
 
@@ -220,7 +220,7 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
 	if ((retval > 0) && end_io)
 		end_io(iocb, pos, retval, bh.b_private);
 
-	inode_dio_done(inode);
+	inode_dio_done(inode, flags);
  out:
 	return retval;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b2e297..5f59709a8b80 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -254,7 +254,8 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
 	if (dio->end_io && dio->result)
 		dio->end_io(dio->iocb, offset, transferred, dio->private);
 
-	inode_dio_done(dio->inode);
+	inode_dio_done(dio->inode, dio->flags);
+
 	if (is_async) {
 		if (dio->rw & WRITE) {
 			int err;
@@ -1199,7 +1200,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	/*
 	 * Will be decremented at I/O completion time.
 	 */
-	atomic_inc(&inode->i_dio_count);
+	inode_dio_begin(inode, dio->flags);
 
 	retval = 0;
 	sdio.blkbits = blkbits;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 45fe924f82bc..853a8674b137 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -682,11 +682,11 @@ retry:
 		 * via ext4_inode_block_unlocked_dio(). Check inode's state
 		 * while holding extra i_dio_count ref.
 		 */
-		atomic_inc(&inode->i_dio_count);
+		inode_dio_begin(inode, 0);
 		smp_mb();
 		if (unlikely(ext4_test_inode_state(inode,
 						    EXT4_STATE_DIOREAD_LOCK))) {
-			inode_dio_done(inode);
+			inode_dio_done(inode, 0);
 			goto locked;
 		}
 		if (IS_DAX(inode))
@@ -696,7 +696,7 @@ retry:
 			ret = __blockdev_direct_IO(rw, iocb, inode,
 					inode->i_sb->s_bdev, iter, offset,
 					ext4_get_block, NULL, NULL, 0);
-		inode_dio_done(inode);
+		inode_dio_done(inode, 0);
 	} else {
 locked:
 		if (IS_DAX(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5cb9a212b86f..5310e31a4327 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2978,7 +2978,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
 	 */
 	if (rw == WRITE)
-		atomic_inc(&inode->i_dio_count);
+		inode_dio_begin(inode, dio_flags);
 
 	/* If we do a overwrite dio, i_mutex locking can be released */
 	overwrite = *((int *)iocb->private);
@@ -3080,7 +3080,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 
 retake_lock:
 	if (rw == WRITE)
-		inode_dio_done(inode);
+		inode_dio_done(inode, dio_flags);
 	/* take i_mutex locking again if we do a ovewrite dio */
 	if (overwrite) {
 		up_read(&EXT4_I(inode)->i_data_sem);
diff --git a/fs/inode.c b/fs/inode.c
index f00b16f45507..7a544bea1566 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1946,15 +1946,32 @@ void inode_dio_wait(struct inode *inode)
 EXPORT_SYMBOL(inode_dio_wait);
 
 /*
+ * inode_dio_begin - signal start of a direct I/O requests
+ * @inode: inode the direct I/O happens on
+ * @flags: DIO_* flags
+ *
+ * This is called once we've finished processing a direct I/O request,
+ * and is used to wake up callers waiting for direct I/O to be quiesced.
+ */
+void inode_dio_begin(struct inode *inode, unsigned int flags)
+{
+	if (!(flags & DIO_IGNORE_TRUNCATE))
+		atomic_inc(&inode->i_dio_count);
+}
+EXPORT_SYMBOL(inode_dio_begin);
+
+/*
  * inode_dio_done - signal finish of a direct I/O requests
  * @inode: inode the direct I/O happens on
+ * @flags: DIO_* flags
  *
  * This is called once we've finished processing a direct I/O request,
  * and is used to wake up callers waiting for direct I/O to be quiesced.
  */
-void inode_dio_done(struct inode *inode)
+void inode_dio_done(struct inode *inode, unsigned int flags)
 {
-	if (atomic_dec_and_test(&inode->i_dio_count))
+	if ((flags & DIO_IGNORE_TRUNCATE) ||
+	    atomic_dec_and_test(&inode->i_dio_count))
 		wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
 }
 EXPORT_SYMBOL(inode_dio_done);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e907c8cf732e..a5d764431f11 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -387,7 +387,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
 	if (write)
 		nfs_zap_mapping(inode, inode->i_mapping);
 
-	inode_dio_done(inode);
+	inode_dio_done(inode, 0);
 
 	if (dreq->iocb) {
 		long res = (long) dreq->error;
@@ -487,7 +487,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 			     &nfs_direct_read_completion_ops);
 	get_dreq(dreq);
 	desc.pg_dreq = dreq;
-	atomic_inc(&inode->i_dio_count);
+	inode_dio_begin(inode, 0);
 
 	while (iov_iter_count(iter)) {
 		struct page **pagevec;
@@ -539,7 +539,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 	 * generic layer handle the completion.
 	 */
 	if (requested_bytes == 0) {
-		inode_dio_done(inode);
+		inode_dio_done(inode, 0);
 		nfs_direct_req_release(dreq);
 		return result < 0 ? result : -EIO;
 	}
@@ -873,7 +873,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 			      &nfs_direct_write_completion_ops);
 	desc.pg_dreq = dreq;
 	get_dreq(dreq);
-	atomic_inc(&inode->i_dio_count);
+	inode_dio_begin(inode, 0);
 
 	NFS_I(inode)->write_io += iov_iter_count(iter);
 	while (iov_iter_count(iter)) {
@@ -929,7 +929,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 	 * generic layer handle the completion.
 	 */
 	if (requested_bytes == 0) {
-		inode_dio_done(inode);
+		inode_dio_done(inode, 0);
 		nfs_direct_req_release(dreq);
 		return result < 0 ? result : -EIO;
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 52cc4492cb3a..43aab9f588fa 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2613,6 +2613,9 @@ enum {
 
 	/* filesystem can handle aio writes beyond i_size */
 	DIO_ASYNC_EXTEND = 0x04,
+
+	/* inode/fs/bdev does not need truncate protection */
+	DIO_IGNORE_TRUNCATE = 0x08,
 };
 
 void dio_end_io(struct bio *bio, int error);
@@ -2633,7 +2636,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
 #endif
 
 void inode_dio_wait(struct inode *inode);
-void inode_dio_done(struct inode *inode);
+void inode_dio_begin(struct inode *inode, unsigned int flags);
+void inode_dio_done(struct inode *inode, unsigned int flags);
 
 extern void inode_set_flags(struct inode *inode, unsigned int flags,
 			    unsigned int mask);

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ