xfs: pull write IO locking up to protect S_DAX From: Dave Chinner Much more complex than the read side because of all the lock juggling we do for the different types of writes and all the various metadata updates a write might need prior to dispatching the data IO. Not really happy about the way the high level logic turned out; could probably be made cleaner and smarter with a bit more thought. Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 205 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 115 insertions(+), 90 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7d31a0e76b68..b7c2bb09b945 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -250,9 +250,9 @@ xfs_file_read_iter( /* * Common pre-write limit and setup checks. * - * Called with the iolocked held either shared and exclusive according to - * @iolock, and returns with it held. Might upgrade the iolock to exclusive - * if called for a direct write beyond i_size. + * Called without the iolocked held, but will lock it according to @iolock, and + * returns with it held on both success and error. Might upgrade the iolock to + * exclusive if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( @@ -268,6 +268,13 @@ xfs_file_aio_write_checks( bool drained_dio = false; loff_t isize; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, *iolock)) + return -EAGAIN; + } else { + xfs_ilock(ip, *iolock); + } + restart: error = generic_write_checks(iocb, from); if (error <= 0) @@ -325,7 +332,7 @@ xfs_file_aio_write_checks( drained_dio = true; goto restart; } - + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, NULL, &xfs_buffered_write_iomap_ops); @@ -449,19 +456,14 @@ static const struct iomap_dio_ops xfs_dio_write_ops = { * Returns with locks held indicated by @iolock and errors indicated by * negative return values. 
*/ -STATIC ssize_t -xfs_file_dio_aio_write( +static int +xfs_file_dio_write_lock_mode( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0; - int unaligned_io = 0; - int iolock; size_t count = iov_iter_count(from); struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -478,35 +480,37 @@ xfs_file_dio_aio_write( */ if ((iocb->ki_pos & mp->m_blockmask) || ((iocb->ki_pos + count) & mp->m_blockmask)) { - unaligned_io = 1; + if (iocb->ki_flags & IOCB_NOWAIT) { + /* unaligned dio always waits, bail */ + return -EAGAIN; + } /* * We can't properly handle unaligned direct I/O to reflink * files yet, as we can't unshare a partial block. */ if (xfs_is_cow_inode(ip)) { - trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); + trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, + count); return -EREMCHG; } - iolock = XFS_IOLOCK_EXCL; - } else { - iolock = XFS_IOLOCK_SHARED; - } - - if (iocb->ki_flags & IOCB_NOWAIT) { - /* unaligned dio always waits, bail */ - if (unaligned_io) - return -EAGAIN; - if (!xfs_ilock_nowait(ip, iolock)) - return -EAGAIN; - } else { - xfs_ilock(ip, iolock); + return XFS_IOLOCK_EXCL; } + return XFS_IOLOCK_SHARED; +} - ret = xfs_file_aio_write_checks(iocb, from, &iolock); - if (ret) - goto out; - count = iov_iter_count(from); +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + struct iov_iter *from, + int iolock) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + bool unaligned_io = 0; + size_t count = iov_iter_count(from); /* * If we are doing unaligned IO, we can't allow any other overlapping IO @@ -515,7 +519,9 @@ xfs_file_dio_aio_write( * iolock if we had to take the 
exclusive lock in * xfs_file_aio_write_checks() for other reasons. */ - if (unaligned_io) { + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + count) & mp->m_blockmask)) { + unaligned_io = true; inode_dio_wait(inode); } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); @@ -530,7 +536,6 @@ xfs_file_dio_aio_write( ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, is_sync_kiocb(iocb) || unaligned_io); -out: xfs_iunlock(ip, iolock); /* @@ -544,26 +549,15 @@ xfs_file_dio_aio_write( static noinline ssize_t xfs_file_dax_write( struct kiocb *iocb, - struct iov_iter *from) + struct iov_iter *from, + int iolock) { - struct inode *inode = iocb->ki_filp->f_mapping->host; + struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); - int iolock = XFS_IOLOCK_EXCL; ssize_t ret, error = 0; size_t count; loff_t pos; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!xfs_ilock_nowait(ip, iolock)) - return -EAGAIN; - } else { - xfs_ilock(ip, iolock); - } - - ret = xfs_file_aio_write_checks(iocb, from, &iolock); - if (ret) - goto out; - pos = iocb->ki_pos; count = iov_iter_count(from); @@ -573,7 +567,6 @@ xfs_file_dax_write( i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); } -out: xfs_iunlock(ip, iolock); if (error) return error; @@ -590,26 +583,13 @@ xfs_file_dax_write( STATIC ssize_t xfs_file_buffered_aio_write( struct kiocb *iocb, - struct iov_iter *from) + struct iov_iter *from, + int iolock, + bool flush_on_enospc) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - int enospc = 0; - int iolock; - - if (iocb->ki_flags & IOCB_NOWAIT) - return -EOPNOTSUPP; - -write_retry: - iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); - - ret = xfs_file_aio_write_checks(iocb, from, &iolock); - if (ret) - 
goto out; /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -617,8 +597,6 @@ xfs_file_buffered_aio_write( trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); ret = iomap_file_buffered_write(iocb, from, &xfs_buffered_write_iomap_ops); - if (likely(ret >= 0)) - iocb->ki_pos += ret; /* * If we hit a space limit, try to free up some lingering preallocated @@ -629,34 +607,35 @@ xfs_file_buffered_aio_write( * also behaves as a filter to prevent too many eofblocks scans from * running at the same time. */ - if (ret == -EDQUOT && !enospc) { + if (ret == -EDQUOT && flush_on_enospc) { + int made_progress; + xfs_iunlock(ip, iolock); - enospc = xfs_inode_free_quota_eofblocks(ip); - if (enospc) - goto write_retry; - enospc = xfs_inode_free_quota_cowblocks(ip); - if (enospc) - goto write_retry; - iolock = 0; - } else if (ret == -ENOSPC && !enospc) { + made_progress = xfs_inode_free_quota_eofblocks(ip); + if (made_progress) + return -EAGAIN; + made_progress = xfs_inode_free_quota_cowblocks(ip); + if (made_progress) + return -EAGAIN; + return ret; + } + if (ret == -ENOSPC && flush_on_enospc) { struct xfs_eofblocks eofb = {0}; - enospc = 1; xfs_flush_inodes(ip->i_mount); xfs_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; xfs_icache_free_eofblocks(ip->i_mount, &eofb); xfs_icache_free_cowblocks(ip->i_mount, &eofb); - goto write_retry; + return -EAGAIN; } current->backing_dev_info = NULL; -out: - if (iolock) - xfs_iunlock(ip, iolock); - + xfs_iunlock(ip, iolock); if (ret > 0) { + iocb->ki_pos += ret; + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); /* Handle various SYNC-type writes */ ret = generic_write_sync(iocb, ret); @@ -675,6 +654,8 @@ xfs_file_write_iter( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; size_t ocount = iov_iter_count(from); + int iolock; + bool flush_on_enospc = true; XFS_STATS_INC(ip->i_mount, xs_write_calls); @@ -684,22 +665,66 @@ xfs_file_write_iter( if 
(XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (IS_DAX(inode)) - return xfs_file_dax_write(iocb, from); - - if (iocb->ki_flags & IOCB_DIRECT) { + /* + * Set up the initial locking state - only direct IO uses shared locking + * for writes, and buffered IO does not support IOCB_NOWAIT. + */ +relock: + if (IS_DAX(inode)) { + iolock = XFS_IOLOCK_EXCL; + } else if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered * write *only* in the case that we're doing a reflink * CoW. In all other directio scenarios we do not * allow an operation to fall back to buffered mode. */ - ret = xfs_file_dio_aio_write(iocb, from); - if (ret != -EREMCHG) - return ret; + iolock = xfs_file_dio_write_lock_mode(iocb, from); + if (iolock == -EREMCHG) { + iocb->ki_flags &= ~IOCB_DIRECT; + iolock = XFS_IOLOCK_EXCL; + } + if (iolock < 0) + return iolock; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + /* buffered writes do not support IOCB_NOWAIT */ + return -EOPNOTSUPP; + } else { + iolock = XFS_IOLOCK_EXCL; } - return xfs_file_buffered_aio_write(iocb, from); + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) { + xfs_iunlock(ip, iolock); + return ret; + } + + /* + * If the IO mode switched while we were locking meaning we hold + * the wrong lock type right now, start again. + */ + if ((IS_DAX(inode) || !(iocb->ki_flags & IOCB_DIRECT)) && + iolock != XFS_IOLOCK_EXCL) { + xfs_iunlock(ip, iolock); + goto relock; + } + + if (IS_DAX(inode)) { + ret = xfs_file_dax_write(iocb, from, iolock); + } else if (iocb->ki_flags & IOCB_DIRECT) { + ret = xfs_file_dio_aio_write(iocb, from, iolock); + ASSERT(ret != -EREMCHG); + } else { + ret = xfs_file_buffered_aio_write(iocb, from, iolock, + flush_on_enospc); + if (ret == -EAGAIN && flush_on_enospc) { + /* inode already unlocked, but need another attempt */ + flush_on_enospc = false; + goto relock; + } + ASSERT(ret != -EAGAIN); + } + return ret; } static void