xfs: pull write IO locking up to protect S_DAX From: Dave Chinner Much more complex than the read side because of all the lock juggling we do for the different types of writes and all the various metadata updates a write might need prior to dispatching the data IO. Not really happy about the way the high level logic turned out; could probably be made cleaner and smarter with a bit more thought. Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 205 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 115 insertions(+), 90 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7d31a0e76b68..b7c2bb09b945 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -250,9 +250,9 @@ xfs_file_read_iter( /* * Common pre-write limit and setup checks. * - * Called with the iolocked held either shared and exclusive according to - * @iolock, and returns with it held. Might upgrade the iolock to exclusive - * if called for a direct write beyond i_size. + * Called without the iolocked held, but will lock it according to @iolock, and + * returns with it held on both success and error. Might upgrade the iolock to + * exclusive if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( @@ -268,6 +268,13 @@ xfs_file_aio_write_checks( bool drained_dio = false; loff_t isize; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, *iolock)) + return -EAGAIN; + } else { + xfs_ilock(ip, *iolock); + } + restart: error = generic_write_checks(iocb, from); if (error <= 0) @@ -325,7 +332,7 @@ xfs_file_aio_write_checks( drained_dio = true; goto restart; } - + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, NULL, &xfs_buffered_write_iomap_ops); @@ -449,19 +456,14 @@ static const struct iomap_dio_ops xfs_dio_write_ops = { * Returns with locks held indicated by @iolock and errors indicated by * negative return values. 
*/ -STATIC ssize_t -xfs_file_dio_aio_write( +static int +xfs_file_dio_write_lock_mode( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0; - int unaligned_io = 0; - int iolock; size_t count = iov_iter_count(from); struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -478,35 +480,37 @@ xfs_file_dio_aio_write( */ if ((iocb->ki_pos & mp->m_blockmask) || ((iocb->ki_pos + count) & mp->m_blockmask)) { - unaligned_io = 1; + if (iocb->ki_flags & IOCB_NOWAIT) { + /* unaligned dio always waits, bail */ + return -EAGAIN; + } /* * We can't properly handle unaligned direct I/O to reflink * files yet, as we can't unshare a partial block. */ if (xfs_is_cow_inode(ip)) { - trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); + trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, + count); return -EREMCHG; } - iolock = XFS_IOLOCK_EXCL; - } else { - iolock = XFS_IOLOCK_SHARED; - } - - if (iocb->ki_flags & IOCB_NOWAIT) { - /* unaligned dio always waits, bail */ - if (unaligned_io) - return -EAGAIN; - if (!xfs_ilock_nowait(ip, iolock)) - return -EAGAIN; - } else { - xfs_ilock(ip, iolock); + return XFS_IOLOCK_EXCL; } + return XFS_IOLOCK_SHARED; +} - ret = xfs_file_aio_write_checks(iocb, from, &iolock); - if (ret) - goto out; - count = iov_iter_count(from); +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + struct iov_iter *from, + int iolock) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + bool unaligned_io = 0; + size_t count = iov_iter_count(from); /* * If we are doing unaligned IO, we can't allow any other overlapping IO @@ -515,7 +519,9 @@ xfs_file_dio_aio_write( * iolock if we had to take the 
exclusive lock in * xfs_file_aio_write_checks() for other reasons. */ - if (unaligned_io) { + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + count) & mp->m_blockmask)) { + unaligned_io = true; inode_dio_wait(inode); } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); @@ -530,7 +536,6 @@ xfs_file_dio_aio_write( ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, is_sync_kiocb(iocb) || unaligned_io); -out: xfs_iunlock(ip, iolock); /* @@ -544,26 +549,15 @@ xfs_file_dio_aio_write( static noinline ssize_t xfs_file_dax_write( struct kiocb *iocb, - struct iov_iter *from) + struct iov_iter *from, + int iolock) { - struct inode *inode = iocb->ki_filp->f_mapping->host; + struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); - int iolock = XFS_IOLOCK_EXCL; ssize_t ret, error = 0; size_t count; loff_t pos; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!xfs_ilock_nowait(ip, iolock)) - return -EAGAIN; - } else { - xfs_ilock(ip, iolock); - } - - ret = xfs_file_aio_write_checks(iocb, from, &iolock); - if (ret) - goto out; - pos = iocb->ki_pos; count = iov_iter_count(from); @@ -573,7 +567,6 @@ xfs_file_dax_write( i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); } -out: xfs_iunlock(ip, iolock); if (error) return error; @@ -590,26 +583,13 @@ xfs_file_dax_write( STATIC ssize_t xfs_file_buffered_aio_write( struct kiocb *iocb, - struct iov_iter *from) + struct iov_iter *from, + int iolock, + bool flush_on_enospc) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - int enospc = 0; - int iolock; - - if (iocb->ki_flags & IOCB_NOWAIT) - return -EOPNOTSUPP; - -write_retry: - iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); - - ret = xfs_file_aio_write_checks(iocb, from, &iolock); - if (ret) - 
goto out; /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -617,8 +597,6 @@ xfs_file_buffered_aio_write( trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); ret = iomap_file_buffered_write(iocb, from, &xfs_buffered_write_iomap_ops); - if (likely(ret >= 0)) - iocb->ki_pos += ret; /* * If we hit a space limit, try to free up some lingering preallocated @@ -629,34 +607,35 @@ xfs_file_buffered_aio_write( * also behaves as a filter to prevent too many eofblocks scans from * running at the same time. */ - if (ret == -EDQUOT && !enospc) { + if (ret == -EDQUOT && flush_on_enospc) { + int made_progress; + xfs_iunlock(ip, iolock); - enospc = xfs_inode_free_quota_eofblocks(ip); - if (enospc) - goto write_retry; - enospc = xfs_inode_free_quota_cowblocks(ip); - if (enospc) - goto write_retry; - iolock = 0; - } else if (ret == -ENOSPC && !enospc) { + made_progress = xfs_inode_free_quota_eofblocks(ip); + if (made_progress) + return -EAGAIN; + made_progress = xfs_inode_free_quota_cowblocks(ip); + if (made_progress) + return -EAGAIN; + return ret; + } + if (ret == -ENOSPC && flush_on_enospc) { struct xfs_eofblocks eofb = {0}; - enospc = 1; xfs_flush_inodes(ip->i_mount); xfs_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; xfs_icache_free_eofblocks(ip->i_mount, &eofb); xfs_icache_free_cowblocks(ip->i_mount, &eofb); - goto write_retry; + return -EAGAIN; } current->backing_dev_info = NULL; -out: - if (iolock) - xfs_iunlock(ip, iolock); - + xfs_iunlock(ip, iolock); if (ret > 0) { + iocb->ki_pos += ret; + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); /* Handle various SYNC-type writes */ ret = generic_write_sync(iocb, ret); @@ -675,6 +654,8 @@ xfs_file_write_iter( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; size_t ocount = iov_iter_count(from); + int iolock; + bool flush_on_enospc = true; XFS_STATS_INC(ip->i_mount, xs_write_calls); @@ -684,22 +665,66 @@ xfs_file_write_iter( if 
(XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (IS_DAX(inode)) - return xfs_file_dax_write(iocb, from); - - if (iocb->ki_flags & IOCB_DIRECT) { + /* + * Set up the initial locking state - only direct IO uses shared locking + * for writes, and buffered IO does not support IOCB_NOWAIT. + */ +relock: + if (IS_DAX(inode)) { + iolock = XFS_IOLOCK_EXCL; + } else if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered * write *only* in the case that we're doing a reflink * CoW. In all other directio scenarios we do not * allow an operation to fall back to buffered mode. */ - ret = xfs_file_dio_aio_write(iocb, from); - if (ret != -EREMCHG) - return ret; + iolock = xfs_file_dio_write_lock_mode(iocb, from); + if (iolock == -EREMCHG) { + iocb->ki_flags &= ~IOCB_DIRECT; + iolock = XFS_IOLOCK_EXCL; + } + if (iolock < 0) + return iolock; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + /* buffered writes do not support IOCB_NOWAIT */ + return -EOPNOTSUPP; + } else { + iolock = XFS_IOLOCK_EXCL; } - return xfs_file_buffered_aio_write(iocb, from); + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) { + xfs_iunlock(ip, iolock); + return ret; + } + + /* + * If the IO mode switched while we were locking meaning we hold + * the wrong lock type right now, start again. + */ + if ((IS_DAX(inode) || !(iocb->ki_flags & IOCB_DIRECT)) && + iolock != XFS_IOLOCK_EXCL) { + xfs_iunlock(ip, iolock); + goto relock; + } + + if (IS_DAX(inode)) { + ret = xfs_file_dax_write(iocb, from, iolock); + } else if (iocb->ki_flags & IOCB_DIRECT) { + ret = xfs_file_dio_aio_write(iocb, from, iolock); + ASSERT(ret != -EREMCHG); + } else { + ret = xfs_file_buffered_aio_write(iocb, from, iolock, + flush_on_enospc); + if (ret == -EAGAIN && flush_on_enospc) { + /* inode already unlocked, but need another attempt */ + flush_on_enospc = false; + goto relock; + } + ASSERT(ret != -EAGAIN); + } + return ret; } static void