linux-kernel - [RFC-v2 2/2] ext4: Improve DIO writes locking sequence

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <20190923054407.8102-3-riteshh@linux.ibm.com>
Date:   Mon, 23 Sep 2019 11:14:07 +0530
From:   Ritesh Harjani <riteshh@...ux.ibm.com>
To:     jack@...e.cz, tytso@....edu, linux-ext4@...r.kernel.org,
        adilger@...ger.ca
Cc:     joseph.qi@...ux.alibaba.com, david@...morbit.com,
        hch@...radead.org, riteshh@...ux.ibm.com,
        mbobrowski@...browski.org, rgoldwyn@...e.de,
        aneesh.kumar@...ux.ibm.com, linux-kernel@...r.kernel.org
Subject: [RFC-v2 2/2] ext4: Improve DIO writes locking sequence

Earlier there was no shared lock in DIO read path.
But this patch (16c54688592ce: ext4: Allow parallel DIO reads)
simplified some of the locking mechanism while still allowing
for parallel DIO reads by adding shared lock in inode DIO
read path.

But this created problem with mixed read/write workload.
It is due to the fact that in DIO path, we first start with
exclusive lock and only when we determine that it is a ovewrite
IO, we downgrade the lock. This causes the problem, since
with above patch we have shared locking in DIO reads.

So, this patch tries to fix this issue by starting with
shared lock and then switching to exclusive lock only
when required based on ext4_dio_write_checks().

Other than that, it also simplifies below cases:-

1. Simplified ext4_unaligned_aio API to
ext4_unaligned_io.
Previous API was abused in the sense that it was
not really checking for AIO anywhere also it used to
check for extending writes.
So this API was renamed and simplified to ext4_unaligned_io()
which actully only checks if the IO is really unaligned.

This is because in all other cases inode_dio_wait()
will anyway become a no-op. So no need to over complicate
by checking for every condition here.

2. Added ext4_extending_io API. This checks if the IO
is extending the file.

Now we only check for
unaligned_io, extend, dioread_nolock & overwrite.

Signed-off-by: Ritesh Harjani <riteshh@...ux.ibm.com>
---
 fs/ext4/file.c | 213 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 159 insertions(+), 54 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ce1cecbae932..faa8dcf9c08d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -166,19 +166,25 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
  * threads are at work on the same unwritten block, they must be synchronized
  * or one thread will zero the other's data, causing corruption.
  */
-static int
-ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
+static bool
+ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
 {
 	struct super_block *sb = inode->i_sb;
-	int blockmask = sb->s_blocksize - 1;
-
-	if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
-		return 0;
+	unsigned long blockmask = sb->s_blocksize - 1;
 
 	if ((pos | iov_iter_alignment(from)) & blockmask)
-		return 1;
+		return true;
 
-	return 0;
+	return false;
+}
+
+static bool
+ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
+{
+	if (offset + len > i_size_read(inode) ||
+	    offset + len > EXT4_I(inode)->i_disksize)
+		return true;
+	return false;
 }
 
 /* Is IO overwriting allocated and initialized blocks? */
@@ -204,7 +210,9 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
 	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
 }
 
-static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+
+static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
+					 struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
@@ -216,10 +224,6 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	if (ret <= 0)
 		return ret;
 
-	ret = file_modified(iocb->ki_filp);
-	if (ret)
-		return 0;
-
 	/*
 	 * If we have encountered a bitmap-format file, the size limit
 	 * is smaller than s_maxbytes, which is for extent-mapped files.
@@ -231,9 +235,26 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 			return -EFBIG;
 		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
 	}
+
 	return iov_iter_count(from);
 }
 
+static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+	ssize_t ret;
+	ssize_t count;
+
+	count = ext4_generic_write_checks(iocb, from);
+	if (count <= 0)
+		return count;
+
+	ret = file_modified(iocb->ki_filp);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
 static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 					struct iov_iter *from)
 {
@@ -336,6 +357,83 @@ static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
 	return 0;
 }
 
+/*
+ * The intention here is to start with shared lock acquired
+ * (except in unaligned IO & extending writes case),
+ * then see if any condition requires an exclusive inode
+ * lock. If yes, then we restart the whole operation by
+ * releasing the shared lock and acquiring exclusive lock.
+ *
+ * - For unaligned_io we never take exclusive lock as it
+ *   may cause data corruption when two unaligned IO tries
+ *   to modify the same block.
+ *
+ * - For extending wirtes case we don't take
+ *   the exclusive lock, since it requires updating
+ *   inode i_disksize with exclusive lock.
+ *
+ * - shared locking will only be true mostly in case of
+ *   overwrites with dioread_nolock mode.
+ *   Otherwise we will switch to exclusive locking mode.
+ */
+static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
+				 unsigned int *iolock, bool *unaligned_io,
+				 bool *extend)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	loff_t offset = iocb->ki_pos;
+	loff_t final_size;
+	size_t count;
+	ssize_t ret;
+
+restart:
+	if (!ext4_dio_checks(inode)) {
+		ext4_iunlock(inode, *iolock);
+		return ext4_buffered_write_iter(iocb, from);
+	}
+
+	ret = ext4_generic_write_checks(iocb, from);
+	if (ret <= 0) {
+		ext4_iunlock(inode, *iolock);
+		return ret;
+	}
+
+	/* Recalculate since offset & count may change above. */
+	offset = iocb->ki_pos;
+	count = iov_iter_count(from);
+	final_size = offset + count;
+
+	if (ext4_unaligned_io(inode, from, offset))
+		*unaligned_io = true;
+
+	if (ext4_extending_io(inode, offset, count))
+		*extend = true;
+	/*
+	 * Determine whether the IO operation will overwrite allocated
+	 * and initialized blocks. If so, check to see whether it is
+	 * possible to take the dioread_nolock path.
+	 *
+	 * We need exclusive i_rwsem for changing security info
+	 * in file_modified().
+	 */
+	if (*iolock == EXT4_IOLOCK_SHARED &&
+	    (!IS_NOSEC(inode) || *unaligned_io || *extend ||
+	     !ext4_should_dioread_nolock(inode) ||
+	     !ext4_overwrite_io(inode, offset, count))) {
+		ext4_iunlock(inode, *iolock);
+		*iolock = EXT4_IOLOCK_EXCL;
+		ext4_ilock(inode, *iolock);
+		goto restart;
+	}
+
+	ret = file_modified(file);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
 /*
  * For a write that extends the inode size, ext4_dio_write_iter() will
  * wait for the write to complete. Consequently, operations performed
@@ -371,64 +469,71 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
 
 static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
+
 	ssize_t ret;
-	size_t count;
 	loff_t offset = iocb->ki_pos;
+	size_t count = iov_iter_count(from);
 	struct inode *inode = file_inode(iocb->ki_filp);
-	bool extend = false, overwrite = false, unaligned_aio = false;
-	unsigned int iolock = EXT4_IOLOCK_EXCL;
+	bool extend = false, unaligned_io = false;
+	unsigned int iolock = EXT4_IOLOCK_SHARED;
+
+	/*
+	 * We initially start with shared inode lock
+	 * unless it is unaligned IO which needs
+	 * exclusive lock anyways.
+	 */
+	if (ext4_unaligned_io(inode, from, offset)) {
+		unaligned_io = true;
+		iolock = EXT4_IOLOCK_EXCL;
+	}
+	/*
+	 * Extending writes need exclusive lock
+	 * to update
+	 */
+	if (ext4_extending_io(inode, offset, count)) {
+		extend = true;
+		iolock = EXT4_IOLOCK_EXCL;
+	}
+
+	if (iolock == EXT4_IOLOCK_SHARED && !ext4_should_dioread_nolock(inode))
+		iolock = EXT4_IOLOCK_EXCL;
 
 	if (iocb->ki_flags & IOCB_NOWAIT) {
+		/*
+		 * unaligned IO may anyway require wait
+		 * at other places, so bail out.
+		 */
+		if (unaligned_io)
+			return -EAGAIN;
 		if (!ext4_ilock_nowait(inode, iolock))
 			return -EAGAIN;
 	} else {
 		ext4_ilock(inode, iolock);
 	}
 
-	if (!ext4_dio_checks(inode)) {
-		ext4_iunlock(inode, iolock);
-		/*
-		 * Fallback to buffered IO if the operation on the
-		 * inode is not supported by direct IO.
-		 */
-		return ext4_buffered_write_iter(iocb, from);
-	}
-
-	ret = ext4_write_checks(iocb, from);
-	if (ret <= 0) {
-		ext4_iunlock(inode, iolock);
+	ret = ext4_dio_write_checks(iocb, from, &iolock, &unaligned_io,
+				    &extend);
+	if (ret <= 0)
 		return ret;
-	}
-
 	/*
-	 * Unaligned direct AIO must be serialized among each other as
+	 * Unaligned direct IO must be serialized among each other as
 	 * the zeroing of partial blocks of two competing unaligned
-	 * AIOs can result in data corruption.
+	 * IOs can result in data corruption. This can mainly
+	 * happen since we may start with shared locking and for
+	 * dioread_nolock and overwrite case we may continue to be
+	 * in shared locking mode. In that case two parallel unaligned
+	 * IO may cause data corruption.
+	 *
+	 * So we make sure we don't allow any unaligned IO in flight.
+	 * For IOs where we need not wait (like unaligned non-AIO DIO),
+	 * below dio_wait may anyway become a no-op,
+	 * since we start take exclusive locks.
 	 */
-	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
-	    !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
-		unaligned_aio = true;
+	if (unaligned_io)
 		inode_dio_wait(inode);
-	}
-
-	/*
-	 * Determine whether the IO operation will overwrite allocated
-	 * and initialized blocks. If so, check to see whether it is
-	 * possible to take the dioread_nolock path.
-	 */
-	count = iov_iter_count(from);
-	if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
-	    ext4_should_dioread_nolock(inode)) {
-		overwrite = true;
-		ext4_ilock_demote(inode, iolock);
-		iolock = EXT4_IOLOCK_SHARED;
-	}
 
-	if (offset + count > i_size_read(inode) ||
-	    offset + count > EXT4_I(inode)->i_disksize) {
+	if (extend)
 		ext4_update_i_disksize(inode, inode->i_size);
-		extend = true;
-	}
 
 	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, ext4_dio_write_end_io);
 
@@ -440,7 +545,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	 * routines in ext4_dio_write_end_io() are covered by the
 	 * inode_lock().
 	 */
-	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
+	if (ret == -EIOCBQUEUED && (unaligned_io || extend))
 		inode_dio_wait(inode);
 
 	ext4_iunlock(inode, iolock);
-- 
2.21.0