>From e659e5db649be01aec20515aef8ca48143e10c0b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Mar 2018 17:19:12 +0200 Subject: [PATCH] btrfs: Fix memory ordering of unlocked dio reads vs truncate Signed-off-by: Nikolay Borisov --- fs/btrfs/btrfs_inode.h | 17 ----------------- fs/btrfs/inode.c | 41 ++++++++++++++++++++++++++++++++++------- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4e12a477d32e..e84f58cca02e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -317,23 +317,6 @@ struct btrfs_dio_private { blk_status_t); }; -/* - * Disable DIO read nolock optimization, so new dio readers will be forced - * to grab i_mutex. It is used to avoid the endless truncate due to - * nonlocked dio read. - */ -static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode) -{ - set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); - smp_mb(); -} - -static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) -{ - smp_mb__before_atomic(); - clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); -} - /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6d2bb58d277a..d64600268c3a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4626,10 +4626,29 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the endless truncate */ - btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); + /* + * This code is very subtle. It is essentially a lock of its + * own type. BTRFS allows multiple DIO readers to race with + * writers so long as they don't read beyond EOF of an inode. + * However, if we have a pending truncate we'd like to signal + * DIO readers they should fall back to DIO_LOCKING semantics. + * This ensures that multiple aggressive DIO readers cannot + * starve the truncating thread. + * + * This semantics is achieved by the use of the below flag. If + * new readers come after the flag has been cleared then the + * state is still consistent, since the RELEASE semantics of + * clear_bit_unlock ensure the truncate inode size will be + * visible and DIO readers will bail out. + * + * The implied memory barrier by inode_dio_wait is paired with + * smp_mb__before_atomic in btrfs_direct_IO. + */ + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags); inode_dio_wait(inode); - btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); + clear_bit_unlock(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags); ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { @@ -8070,11 +8089,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.unsubmitted_oe_range_end = (u64)offset; current->journal_info = &dio_data; down_read(&BTRFS_I(inode)->dio_sem); - } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + } else { + /* + * This barrier is paired with the implied barrier in + * inode_dio_wait. It ensures that READDIO_NEED_LOCK is + * visible if we have a pending truncate. + */ + smp_mb__before_atomic(); + if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { - inode_dio_end(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - wakeup = false; + inode_dio_end(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; + } } ret = __blockdev_direct_IO(iocb, inode, -- 2.17.1