Message-ID: <20240426154148.GL360919@frogsfrogsfrogs>
Date: Fri, 26 Apr 2024 08:41:48 -0700
From: "Darrick J. Wong" <djwong@...nel.org>
To: "Ritesh Harjani (IBM)" <ritesh.list@...il.com>
Cc: linux-ext4@...r.kernel.org, linux-xfs@...r.kernel.org,
linux-fsdevel@...r.kernel.org, Matthew Wilcox <willy@...radead.org>,
Ojaswin Mujoo <ojaswin@...ux.ibm.com>, Jan Kara <jack@...e.cz>
Subject: Re: [RFCv3 4/7] ext2: Implement seq counter for validating cached
iomap
On Thu, Apr 25, 2024 at 06:58:48PM +0530, Ritesh Harjani (IBM) wrote:
> There is a possibility of the following race with iomap during
> writeback -
>
> write_cache_pages()
> cache extent covering 0..1MB range
> write page at offset 0k
> truncate(file, 4k)
> drops all relevant pages
> frees fs blocks
> pwrite(file, 4k, 4k)
> creates dirty page in the page cache
> writes page at offset 4k to a stale block
>
> This race can happen because iomap_writepages() keeps a cached extent
> mapping within struct iomap (which can cover a large extent range).
> While write_cache_pages() is iterating over each folio, a parallel
> truncate of the next folio followed by a buffered write to the same
> offset within the file can change the logical-to-physical mapping of
> the cached iomap. That means the cached iomap has become stale.
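>
> The writeback fix tags the cached mapping with a validity cookie
> (sampled from i_blkseq) and re-checks it per folio. A minimal sketch
> of that pattern (the real version is ext2_imap_valid() in the diff
> below):
>
>	/*
>	 * Sketch: reuse the cached mapping only if the folio offset
>	 * still falls inside it and no block alloc/free has happened
>	 * since the cookie was sampled from the inode's change counter.
>	 */
>	static bool sketch_imap_valid(struct iomap_writepage_ctx *wpc,
>				      unsigned int cur_blkseq,
>				      loff_t offset)
>	{
>		if (offset < wpc->iomap.offset ||
>		    offset >= wpc->iomap.offset + wpc->iomap.length)
>			return false;
>		return wpc->iomap.validity_cookie == cur_blkseq;
>	}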
>
> This patch implements the seq counter approach to detect such stale
> cached iomap mappings. i_blkseq is incremented on every block
> allocation/free. Here is what we do -
>
> For ext2 buffered-writes, the block allocation happens at
> ->write_iter time itself. So at writeback time:
> 1. We first cache the i_blkseq.
> 2. Call ext2_get_blocks() with create = 0 to get the no. of blocks
> already allocated.
> 3. Call ext2_get_blocks() a second time, with the length equal to the
> no. of blocks we know were already allocated.
> 4. Up to this point the cached i_blkseq remains valid, as no block
> allocation has happened yet.
> This means that on the next ->map_blocks() call we can verify whether
> i_blkseq has raced with a truncate. If it hasn't, the cached mapping
> remains valid.
>
> In case of a hole (which can happen with mmapped writes), we only
> allocate one block at a time anyway. So even if the i_blkseq value
> changes right after, we still need to allocate the next block in the
> subsequent ->map_blocks() call.
>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@...il.com>
> ---
> fs/ext2/balloc.c | 1 +
> fs/ext2/ext2.h | 6 +++++
> fs/ext2/inode.c | 57 ++++++++++++++++++++++++++++++++++++++++++++----
> fs/ext2/super.c | 2 +-
> 4 files changed, 61 insertions(+), 5 deletions(-)
>
> diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
> index 1bfd6ab11038..047a8f41a6f5 100644
> --- a/fs/ext2/balloc.c
> +++ b/fs/ext2/balloc.c
> @@ -495,6 +495,7 @@ void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
> }
>
> ext2_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);
> + ext2_inc_i_blkseq(EXT2_I(inode));
>
> do_more:
> overflow = 0;
> diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
> index f38bdd46e4f7..67b1acb08eb2 100644
> --- a/fs/ext2/ext2.h
> +++ b/fs/ext2/ext2.h
> @@ -663,6 +663,7 @@ struct ext2_inode_info {
> struct rw_semaphore xattr_sem;
> #endif
> rwlock_t i_meta_lock;
> + unsigned int i_blkseq;
>
> /*
> * truncate_mutex is for serialising ext2_truncate() against
> @@ -698,6 +699,11 @@ static inline struct ext2_inode_info *EXT2_I(struct inode *inode)
> return container_of(inode, struct ext2_inode_info, vfs_inode);
> }
>
> +static inline void ext2_inc_i_blkseq(struct ext2_inode_info *ei)
> +{
> + WRITE_ONCE(ei->i_blkseq, READ_ONCE(ei->i_blkseq) + 1);
> +}
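Side note: the ONCE accessors only prevent load/store tearing; this
read-modify-write isn't atomic, so I assume every incrementing caller is
already serialized (truncate_mutex?). If so, a comment would help --
something like this sketch, say:

	/*
	 * Sketch only: not atomic, so callers must already be serialized
	 * (e.g. by truncate_mutex). WRITE_ONCE/READ_ONCE pair with the
	 * lockless READ_ONCE() of i_blkseq in the writeback path.
	 */
	static inline void ext2_inc_i_blkseq(struct ext2_inode_info *ei)
	{
		WRITE_ONCE(ei->i_blkseq, READ_ONCE(ei->i_blkseq) + 1);
	}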
> +
> /* balloc.c */
> extern int ext2_bg_has_super(struct super_block *sb, int group);
> extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group);
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index 2b62786130b5..946a614ddfc0 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -406,6 +406,8 @@ static int ext2_alloc_blocks(struct inode *inode,
> ext2_fsblk_t current_block = 0;
> int ret = 0;
>
> + ext2_inc_i_blkseq(EXT2_I(inode));
> +
> /*
> * Here we try to allocate the requested multiple blocks at once,
> * on a best-effort basis.
> @@ -966,15 +968,62 @@ ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
> return mpage_writepages(mapping, wbc, ext2_get_block);
> }
>
> +static bool ext2_imap_valid(struct iomap_writepage_ctx *wpc, struct inode *inode,
> + loff_t offset)
ext2_iomap_valid, to stay consistent with the ext4 conversion series?
> +{
> + if (offset < wpc->iomap.offset ||
> + offset >= wpc->iomap.offset + wpc->iomap.length)
> + return false;
> +
> + if (wpc->iomap.validity_cookie != READ_ONCE(EXT2_I(inode)->i_blkseq))
> + return false;
> +
> + return true;
> +}
> +
> static int ext2_write_map_blocks(struct iomap_writepage_ctx *wpc,
> struct inode *inode, loff_t offset,
> unsigned len)
> {
> - if (offset >= wpc->iomap.offset &&
> - offset < wpc->iomap.offset + wpc->iomap.length)
> + loff_t maxblocks = (loff_t)INT_MAX;
> + u8 blkbits = inode->i_blkbits;
> + u32 bno;
> + bool new, boundary;
> + int ret;
> +
> + if (ext2_imap_valid(wpc, inode, offset))
> return 0;
>
> - return ext2_iomap_begin(inode, offset, inode->i_sb->s_blocksize,
> + /*
> + * For ext2 buffered-writes, the block allocation happens at
> + * ->write_iter time itself. So at writeback time:
> + * 1. We first cache the i_blkseq.
> + * 2. Call ext2_get_blocks() with create = 0 to get the no. of
> + * blocks already allocated.
> + * 3. Call ext2_get_blocks() a second time, with the length equal
> + * to the no. of blocks we know were already allocated.
> + * 4. Up to this point the cached i_blkseq remains valid, as no
> + * block allocation has happened yet.
> + * This means that on the next ->map_blocks() call we can verify
> + * whether i_blkseq has raced with a truncate. If it hasn't, the
> + * cached mapping remains valid.
> + *
> + * In case of a hole (which can happen with mmapped writes), we
> + * only allocate one block at a time anyway. So even if i_blkseq
> + * changes, we still need to allocate the next block in the
> + * subsequent ->map_blocks() call.
You might want to leave a comment here that ext2 doesn't support
unwritten extents, so the validation cookie is needed only for
writeback, and not for pagecache writes themselves. I don't expect
anyone to port extents (and hence unwritten blocks) to ext2, so this is
a minor point.
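Something like this, say:

	/*
	 * ext2 does not support unwritten extents, so the validity
	 * cookie only guards the writeback path's cached mapping; the
	 * pagecache write path itself never needs revalidation.
	 */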
> + */
> + wpc->iomap.validity_cookie = READ_ONCE(EXT2_I(inode)->i_blkseq);
> +
> + ret = ext2_get_blocks(inode, offset >> blkbits, maxblocks << blkbits,
> + &bno, &new, &boundary, 0);
> + if (ret < 0)
> + return ret;
> + /*
> + * ret can be 0 in case of a hole which is possible for mmaped writes.
> + */
> + ret = ret ? ret : 1;
> + return ext2_iomap_begin(inode, offset, (loff_t)ret << blkbits,
> IOMAP_WRITE, &wpc->iomap, NULL);
> }
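To spell out the two-step lookup with made-up numbers (4k blocks,
writeback asking to map offset 64k while i_blkseq happens to be 7):

	cookie = READ_ONCE(i_blkseq);			/* 7 */
	ext2_get_blocks(iblock = 16, create = 0);	/* say, 16 blocks */
	ext2_iomap_begin(64k, 16 blocks, IOMAP_WRITE);	/* maps existing
							   blocks, no alloc,
							   i_blkseq stays 7 */

If a truncate + pwrite sneaks in between, i_blkseq moves past 7, the
cookie comparison fails on the next ->map_blocks() call, and we remap.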
>
> @@ -1000,7 +1049,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc
>
> const struct address_space_operations ext2_file_aops = {
> .dirty_folio = iomap_dirty_folio,
> - .release_folio = iomap_release_folio,
> + .release_folio = iomap_release_folio,
This fix should be in patch 2.
--D
> .invalidate_folio = iomap_invalidate_folio,
> .read_folio = ext2_file_read_folio,
> .readahead = ext2_file_readahead,
> diff --git a/fs/ext2/super.c b/fs/ext2/super.c
> index 37f7ce56adce..32f5386284d6 100644
> --- a/fs/ext2/super.c
> +++ b/fs/ext2/super.c
> @@ -188,7 +188,7 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
> #ifdef CONFIG_QUOTA
> memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
> #endif
> -
> + WRITE_ONCE(ei->i_blkseq, 0);
> return &ei->vfs_inode;
> }
>
> --
> 2.44.0
>
>