[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20210414115830.GC31323@quack2.suse.cz>
Date: Wed, 14 Apr 2021 13:58:30 +0200
From: Jan Kara <jack@...e.cz>
To: Ted Tso <tytso@....edu>
Cc: linux-ext4@...r.kernel.org, Eric Whitney <enwlinux@...il.com>,
Jan Kara <jack@...e.cz>, stable@...r.kernel.org
Subject: Re: [PATCH] ext4: Fix occasional generic/418 failure
On Wed 14-04-21 11:59:35, Jan Kara wrote:
> Eric has noticed that after pagecache read rework, generic/418 is
> occasionally failing for ext4 when blocksize < pagesize. In fact, the
> pagecache rework just made hard to hit race in ext4 more likely. The
> problem is that since ext4 conversion of direct IO writes to iomap
> framework (commit 378f32bab371), we update inode size after direct IO
> write only after invalidating page cache. Thus if buffered read sneaks
> at unfortunate moment like:
>
> CPU1 - write at offset 1k CPU2 - read from offset 0
> iomap_dio_rw(..., IOMAP_DIO_FORCE_WAIT);
> ext4_readpage();
> ext4_handle_inode_extension()
>
> the read will zero out tail of the page as it still sees smaller inode
> size and thus page cache becomes inconsistent with on-disk contents with
> all the consequences.
>
> Fix the problem by moving inode size update into end_io handler which
> gets called before the page cache is invalidated.
>
> Reported-by: Eric Whitney <enwlinux@...il.com>
> Fixes: 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure")
> CC: stable@...r.kernel.org
> Signed-off-by: Jan Kara <jack@...e.cz>
Dave just suggested a better alternative than this in a different thread.
I'll submit it once testing completes...
Honza
> ---
> fs/ext4/file.c | 194 +++++++++++++++++++++++++------------------------
> 1 file changed, 98 insertions(+), 96 deletions(-)
>
> Eric, can you please try whether this patch fixes the failures you are
> occasionally seeing?
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 194f5d00fa32..2b84fb48bde6 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -280,13 +280,66 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> return ret;
> }
>
> -static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
> - ssize_t written, size_t count)
> +static int ext4_prepare_dio_extend(struct inode *inode)
> +{
> + handle_t *handle;
> + int ret;
> +
> + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> + if (IS_ERR(handle))
> + return PTR_ERR(handle);
> +
> + ext4_fc_start_update(inode);
> + ret = ext4_orphan_add(handle, inode);
> + ext4_fc_stop_update(inode);
> + if (ret) {
> + ext4_journal_stop(handle);
> + return ret;
> + }
> + ext4_journal_stop(handle);
> + return 0;
> +}
> +
> +static void ext4_cleanup_dio_extend(struct inode *inode, loff_t offset,
> + ssize_t written, size_t count)
> {
> handle_t *handle;
> - bool truncate = false;
> u8 blkbits = inode->i_blkbits;
> ext4_lblk_t written_blk, end_blk;
> +
> + /* Error? Nothing was written... */
> + if (written < 0)
> + written = 0;
> +
> + written_blk = ALIGN(offset + written, 1 << blkbits);
> + end_blk = ALIGN(offset + count, 1 << blkbits);
> + if (written_blk < end_blk && ext4_can_truncate(inode)) {
> + ext4_truncate_failed_write(inode);
> + /*
> + * If the truncate operation failed early, then the inode may
> + * still be on the orphan list. In that case, we need to try
> + * remove the inode from the in-memory linked list.
> + */
> + if (inode->i_nlink)
> + ext4_orphan_del(NULL, inode);
> + return;
> + }
> +
> + if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
> + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> + if (IS_ERR(handle)) {
> + ext4_orphan_del(NULL, inode);
> + return;
> + }
> + ext4_orphan_del(handle, inode);
> + ext4_journal_stop(handle);
> + }
> +}
> +
> +static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
> + ssize_t written)
> +{
> + handle_t *handle;
> int ret;
>
> /*
> @@ -298,74 +351,23 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
> * as much as we intended.
> */
> WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
> - if (offset + count <= EXT4_I(inode)->i_disksize) {
> - /*
> - * We need to ensure that the inode is removed from the orphan
> - * list if it has been added prematurely, due to writeback of
> - * delalloc blocks.
> - */
> - if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
> - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> -
> - if (IS_ERR(handle)) {
> - ext4_orphan_del(NULL, inode);
> - return PTR_ERR(handle);
> - }
> -
> - ext4_orphan_del(handle, inode);
> - ext4_journal_stop(handle);
> - }
> -
> - return written;
> - }
> -
> - if (written < 0)
> - goto truncate;
> + if (offset + written <= EXT4_I(inode)->i_disksize)
> + return 0;
>
> handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> - if (IS_ERR(handle)) {
> - written = PTR_ERR(handle);
> - goto truncate;
> - }
> + if (IS_ERR(handle))
> + return PTR_ERR(handle);
>
> if (ext4_update_inode_size(inode, offset + written)) {
> ret = ext4_mark_inode_dirty(handle, inode);
> if (unlikely(ret)) {
> - written = ret;
> ext4_journal_stop(handle);
> - goto truncate;
> + return ret;
> }
> }
> -
> - /*
> - * We may need to truncate allocated but not written blocks beyond EOF.
> - */
> - written_blk = ALIGN(offset + written, 1 << blkbits);
> - end_blk = ALIGN(offset + count, 1 << blkbits);
> - if (written_blk < end_blk && ext4_can_truncate(inode))
> - truncate = true;
> -
> - /*
> - * Remove the inode from the orphan list if it has been extended and
> - * everything went OK.
> - */
> - if (!truncate && inode->i_nlink)
> - ext4_orphan_del(handle, inode);
> ext4_journal_stop(handle);
>
> - if (truncate) {
> -truncate:
> - ext4_truncate_failed_write(inode);
> - /*
> - * If the truncate operation failed early, then the inode may
> - * still be on the orphan list. In that case, we need to try
> - * remove the inode from the in-memory linked list.
> - */
> - if (inode->i_nlink)
> - ext4_orphan_del(NULL, inode);
> - }
> -
> - return written;
> + return 0;
> }
>
> static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
> @@ -380,14 +382,29 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
> if (size && flags & IOMAP_DIO_UNWRITTEN)
> return ext4_convert_unwritten_extents(NULL, inode,
> offset, size);
> -
> return 0;
> }
>
> +static int ext4_dio_extending_write_end_io(struct kiocb *iocb, ssize_t size,
> + int error, unsigned int flags)
> +{
> + struct inode *inode = file_inode(iocb->ki_filp);
> + int ret;
> +
> + ret = ext4_dio_write_end_io(iocb, size, error, flags);
> + if (ret < 0)
> + return ret;
> + return ext4_handle_inode_extension(inode, iocb->ki_pos, size);
> +}
> +
> static const struct iomap_dio_ops ext4_dio_write_ops = {
> .end_io = ext4_dio_write_end_io,
> };
>
> +static const struct iomap_dio_ops ext4_dio_extending_write_ops = {
> + .end_io = ext4_dio_extending_write_end_io,
> +};
> +
> /*
> * The intention here is to start with shared lock acquired then see if any
> * condition requires an exclusive inode lock. If yes, then we restart the
> @@ -454,11 +471,11 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
> static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> {
> ssize_t ret;
> - handle_t *handle;
> struct inode *inode = file_inode(iocb->ki_filp);
> loff_t offset = iocb->ki_pos;
> size_t count = iov_iter_count(from);
> const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
> + const struct iomap_dio_ops *dio_ops = &ext4_dio_write_ops;
> bool extend = false, unaligned_io = false;
> bool ilock_shared = true;
>
> @@ -529,33 +546,21 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> inode_dio_wait(inode);
>
> if (extend) {
> - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> - if (IS_ERR(handle)) {
> - ret = PTR_ERR(handle);
> - goto out;
> - }
> -
> - ext4_fc_start_update(inode);
> - ret = ext4_orphan_add(handle, inode);
> - ext4_fc_stop_update(inode);
> - if (ret) {
> - ext4_journal_stop(handle);
> + ret = ext4_prepare_dio_extend(inode);
> + if (ret)
> goto out;
> - }
> -
> - ext4_journal_stop(handle);
> + dio_ops = &ext4_dio_extending_write_ops;
> }
>
> if (ilock_shared)
> iomap_ops = &ext4_iomap_overwrite_ops;
> - ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
> - (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
> +
> + ret = iomap_dio_rw(iocb, from, iomap_ops, dio_ops,
> + (extend || unaligned_io) ? IOMAP_DIO_FORCE_WAIT : 0);
> if (ret == -ENOTBLK)
> ret = 0;
> -
> if (extend)
> - ret = ext4_handle_inode_extension(inode, offset, ret, count);
> -
> + ext4_cleanup_dio_extend(inode, offset, ret, count);
> out:
> if (ilock_shared)
> inode_unlock_shared(inode);
> @@ -598,7 +603,6 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> ssize_t ret;
> size_t count;
> loff_t offset;
> - handle_t *handle;
> bool extend = false;
> struct inode *inode = file_inode(iocb->ki_filp);
>
> @@ -617,26 +621,24 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> count = iov_iter_count(from);
>
> if (offset + count > EXT4_I(inode)->i_disksize) {
> - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> - if (IS_ERR(handle)) {
> - ret = PTR_ERR(handle);
> - goto out;
> - }
> -
> - ret = ext4_orphan_add(handle, inode);
> - if (ret) {
> - ext4_journal_stop(handle);
> + ret = ext4_prepare_dio_extend(inode);
> + if (ret < 0)
> goto out;
> - }
> -
> extend = true;
> - ext4_journal_stop(handle);
> }
>
> ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
>
> - if (extend)
> - ret = ext4_handle_inode_extension(inode, offset, ret, count);
> + if (extend) {
> + if (ret > 0) {
> + int ret2;
> +
> + ret2 = ext4_handle_inode_extension(inode, offset, ret);
> + if (ret2 < 0)
> + ret = ret2;
> + }
> + ext4_cleanup_dio_extend(inode, offset, ret, count);
> + }
> out:
> inode_unlock(inode);
> if (ret > 0)
> --
> 2.26.2
>
--
Jan Kara <jack@...e.com>
SUSE Labs, CR
Powered by blists - more mailing lists