linux-ext4 - Re: [PATCH] ext4: defer updating i

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230327092914.mzizhh52izbvjhhv@quack3>
Date:   Mon, 27 Mar 2023 11:29:14 +0200
From:   Jan Kara <jack@...e.cz>
To:     Chung-Chiang Cheng <cccheng@...ology.com>
Cc:     linux-ext4@...r.kernel.org, tytso@....edu,
        adilger.kernel@...ger.ca, jack@...e.cz, yi.zhang@...wei.com,
        shepjeng@...il.com, kernel@...heng.net,
        Robbie Ko <robbieko@...ology.com>
Subject: Re: [PATCH] ext4: defer updating i_disksize until endio

On Fri 24-03-23 17:29:07, Chung-Chiang Cheng wrote:
> During a buffer write, the on-disk inode size (i_disksize) gets updated
> at ext4_da_write_end() or mpage_map_and_submit_extent(). This update is
> then committed into the journal. However, at that point, the writeback
> process is not yet completed. If an interruption occurs, the content of
> the file may not be in a consistent state because the i_disksize is
> replayed by the journal, but the corresponding content may not have been
> actually submitted to the disk.
> 
>     $ mount -o commit=1 /dev/vdc /mnt
>     $ echo -n foo >> /mnt/test; sync
>     $ echo -n bar >> /mnt/test; sleep 3; echo b > /proc/sysrq-trigger
> 
>     $ xxd /mnt/test
>     00000000: 666f 6f00 0000                           foo...
> 
> After the above steps have been executed, there are padding zeros at the
> end of the file, which are obviously not part of the actual content.
> To fix this issue, we can defer updating i_disksize until the endio. The
> original ext4_end_io_rsv_work() was to convert unwritten extents to
> extents, but it now also updates the disk size.
> 
> Reviewed-by: Robbie Ko <robbieko@...ology.com>
> Signed-off-by: Chung-Chiang Cheng <cccheng@...ology.com>

As Zhang Yi already noted in his review, this is expected at least with
data=writeback mount option. With data=ordered this should not happen
though as the commit of the transaction with i_disksize update will wait
for page writeback to complete (this is exactly the reason why data=ordered
exists after all). Are you able to observe this problem with data=ordered
mount option?

								Honza

> ---
>  fs/ext4/inode.c   | 41 ++++--------------------------
>  fs/ext4/page-io.c | 64 +++++++++++++++++++++++++++++++++++++++--------
>  2 files changed, 58 insertions(+), 47 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 96517785a9f8..c3537cd603dc 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2515,6 +2515,8 @@ static int mpage_map_and_submit_extent(handle_t *handle,
>  			goto update_disksize;
>  	} while (map->m_len);
>  
> +	return err;
> +
>  update_disksize:
>  	/*
>  	 * Update on-disk size after IO is submitted.  Races with
> @@ -3105,36 +3107,12 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
>  	return ret;
>  }
>  
> -/*
> - * Check if we should update i_disksize
> - * when write to the end of file but not require block allocation
> - */
> -static int ext4_da_should_update_i_disksize(struct page *page,
> -					    unsigned long offset)
> -{
> -	struct buffer_head *bh;
> -	struct inode *inode = page->mapping->host;
> -	unsigned int idx;
> -	int i;
> -
> -	bh = page_buffers(page);
> -	idx = offset >> inode->i_blkbits;
> -
> -	for (i = 0; i < idx; i++)
> -		bh = bh->b_this_page;
> -
> -	if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
> -		return 0;
> -	return 1;
> -}
> -
>  static int ext4_da_write_end(struct file *file,
>  			     struct address_space *mapping,
>  			     loff_t pos, unsigned len, unsigned copied,
>  			     struct page *page, void *fsdata)
>  {
>  	struct inode *inode = mapping->host;
> -	loff_t new_i_size;
>  	unsigned long start, end;
>  	int write_mode = (int)(unsigned long)fsdata;
>  
> @@ -3155,22 +3133,13 @@ static int ext4_da_write_end(struct file *file,
>  	/*
>  	 * Since we are holding inode lock, we are sure i_disksize <=
>  	 * i_size. We also know that if i_disksize < i_size, there are
> -	 * delalloc writes pending in the range upto i_size. If the end of
> -	 * the current write is <= i_size, there's no need to touch
> -	 * i_disksize since writeback will push i_disksize upto i_size
> -	 * eventually. If the end of the current write is > i_size and
> -	 * inside an allocated block (ext4_da_should_update_i_disksize()
> -	 * check), we need to update i_disksize here as neither
> -	 * ext4_writepage() nor certain ext4_writepages() paths not
> -	 * allocating blocks update i_disksize.
> +	 * delalloc writes pending in the range upto i_size. There's no
> +	 * need to touch i_disksize since the endio of writeback will
> +	 * push i_disksize upto i_size eventually.
>  	 *
>  	 * Note that we defer inode dirtying to generic_write_end() /
>  	 * ext4_da_write_inline_data_end().
>  	 */
> -	new_i_size = pos + copied;
> -	if (copied && new_i_size > inode->i_size &&
> -	    ext4_da_should_update_i_disksize(page, end))
> -		ext4_update_i_disksize(inode, new_i_size);
>  
>  	return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
>  }
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 1e4db96a04e6..f893d26c4b88 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -182,6 +182,10 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
>  		   "list->prev 0x%p\n",
>  		   io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
>  
> +	/* Only reserved conversions from writeback should enter here */
> +	WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
> +	WARN_ON(!io_end->handle && EXT4_SB(inode->i_sb)->s_journal);
> +
>  	io_end->handle = NULL;	/* Following call will use up the handle */
>  	ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
>  	if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
> @@ -226,9 +230,6 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
>  	struct workqueue_struct *wq;
>  	unsigned long flags;
>  
> -	/* Only reserved conversions from writeback should enter here */
> -	WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
> -	WARN_ON(!io_end->handle && sbi->s_journal);
>  	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
>  	wq = sbi->rsv_conversion_wq;
>  	if (list_empty(&ei->i_rsv_conversion_list))
> @@ -237,6 +238,43 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
>  	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
>  }
>  
> +static int ext4_endio_update_disksize(ext4_io_end_t *io_end)
> +{
> +	int ret = 0;
> +	loff_t i_size, disksize = 0;
> +	handle_t *handle;
> +	struct ext4_io_end_vec *io_end_vec;
> +	struct inode *inode = io_end->inode;
> +	struct ext4_inode_info *ei = EXT4_I(inode);
> +
> +	list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
> +		if (disksize < io_end_vec->offset + io_end_vec->size)
> +			disksize = io_end_vec->offset + io_end_vec->size;
> +	}
> +
> +	if (disksize > ei->i_disksize) {
> +		down_write(&ei->i_data_sem);
> +		i_size = inode->i_size;
> +		if (disksize > i_size)
> +			disksize = i_size;
> +		if (disksize > ei->i_disksize)
> +			WRITE_ONCE(ei->i_disksize, i_size);
> +		up_write(&ei->i_data_sem);
> +
> +		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
> +		if (IS_ERR(handle))
> +			return PTR_ERR(handle);
> +
> +		ret = ext4_mark_inode_dirty(handle, inode);
> +		if (ret)
> +			ext4_error_err(inode->i_sb, ret, "Failed to mark inode %lu dirty",
> +				       inode->i_ino);
> +		ext4_journal_stop(handle);
> +	}
> +
> +	return ret;
> +}
> +
>  static int ext4_do_flush_completed_IO(struct inode *inode,
>  				      struct list_head *head)
>  {
> @@ -253,10 +291,16 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
>  
>  	while (!list_empty(&unwritten)) {
>  		io_end = list_entry(unwritten.next, ext4_io_end_t, list);
> -		BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
>  		list_del_init(&io_end->list);
>  
> -		err = ext4_end_io_end(io_end);
> +		err = ext4_endio_update_disksize(io_end);
> +		if (unlikely(!ret && err))
> +			ret = err;
> +
> +		if (io_end->flag & EXT4_IO_END_UNWRITTEN)
> +			err = ext4_end_io_end(io_end);
> +		else
> +			ext4_release_io_end(io_end);
>  		if (unlikely(!ret && err))
>  			ret = err;
>  	}
> @@ -264,7 +308,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
>  }
>  
>  /*
> - * work on completed IO, to convert unwritten extents to extents
> + * work on completed IO, to convert unwritten extents to extents and update disksize
>   */
>  void ext4_end_io_rsv_work(struct work_struct *work)
>  {
> @@ -289,12 +333,10 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
>  void ext4_put_io_end_defer(ext4_io_end_t *io_end)
>  {
>  	if (refcount_dec_and_test(&io_end->count)) {
> -		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
> -				list_empty(&io_end->list_vec)) {
> +		if (list_empty(&io_end->list_vec))
>  			ext4_release_io_end(io_end);
> -			return;
> -		}
> -		ext4_add_complete_io(io_end);
> +		else
> +			ext4_add_complete_io(io_end);
>  	}
>  }
>  
> -- 
> 2.34.1
> 
-- 
Jan Kara <jack@...e.com>
SUSE Labs, CR