[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <009d08646b77e0d774b4ce248675b86564bca9ee.1714046808.git.ritesh.list@gmail.com>
Date: Thu, 25 Apr 2024 18:58:48 +0530
From: "Ritesh Harjani (IBM)" <ritesh.list@...il.com>
To: linux-ext4@...r.kernel.org,
linux-xfs@...r.kernel.org
Cc: linux-fsdevel@...r.kernel.org,
Matthew Wilcox <willy@...radead.org>,
"Darrick J . Wong" <djwong@...nel.org>,
Ojaswin Mujoo <ojaswin@...ux.ibm.com>,
Ritesh Harjani <ritesh.list@...il.com>,
Jan Kara <jack@...e.cz>
Subject: [RFCv3 4/7] ext2: Implement seq counter for validating cached iomap
There is a possibility of the following race with iomap during
writeback -
write_cache_pages()
cache extent covering 0..1MB range
write page at offset 0k
truncate(file, 4k)
drops all relevant pages
frees fs blocks
pwrite(file, 4k, 4k)
creates dirty page in the page cache
writes page at offset 4k to a stale block
This race can happen because iomap_writepages() keeps a cached extent mapping
within struct iomap (which can cover a large extent range). While
write_cache_pages() is going over each folio, if a truncate happens in
parallel on the next folio followed by a buffered write to the same offset
within the file, this can change the logical to physical offset of the cached
iomap mapping.
That means, the cached iomap has now become stale.
This patch implements the seq counter approach for revalidation of stale
iomap mappings. i_blkseq will get incremented for every block
allocation/free. Here is what we do -
For ext2 buffered-writes, the block allocation happens at the
->write_iter time itself. So at writeback time,
1. We first cache the i_blkseq.
2. Call ext2_get_blocks(, create = 0) to get the no. of blocks
already allocated.
3. Call ext2_get_blocks() the second time with length to be same as
the no. of blocks we know were already allocated.
4. Till now it means, the cached i_blkseq remains valid as no block
allocation has happened yet.
This means the next call to ->map_blocks(), we can verify whether the
i_blkseq has raced with truncate or not. If not, then i_blkseq will
remain valid.
In case of a hole (could happen with mmaped writes), we only allocate
1 block at a time anyways. So even if the i_blkseq value changes right
after, we anyway need to allocate the next block in subsequent
->map_blocks() call.
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@...il.com>
---
fs/ext2/balloc.c | 1 +
fs/ext2/ext2.h | 6 +++++
fs/ext2/inode.c | 57 ++++++++++++++++++++++++++++++++++++++++++++----
fs/ext2/super.c | 2 +-
4 files changed, 61 insertions(+), 5 deletions(-)
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 1bfd6ab11038..047a8f41a6f5 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -495,6 +495,7 @@ void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
}
ext2_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);
+ ext2_inc_i_blkseq(EXT2_I(inode));
do_more:
overflow = 0;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index f38bdd46e4f7..67b1acb08eb2 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -663,6 +663,7 @@ struct ext2_inode_info {
struct rw_semaphore xattr_sem;
#endif
rwlock_t i_meta_lock;
+ unsigned int i_blkseq;
/*
* truncate_mutex is for serialising ext2_truncate() against
@@ -698,6 +699,11 @@ static inline struct ext2_inode_info *EXT2_I(struct inode *inode)
return container_of(inode, struct ext2_inode_info, vfs_inode);
}
+static inline void ext2_inc_i_blkseq(struct ext2_inode_info *ei)
+{
+ WRITE_ONCE(ei->i_blkseq, READ_ONCE(ei->i_blkseq) + 1);
+}
+
/* balloc.c */
extern int ext2_bg_has_super(struct super_block *sb, int group);
extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 2b62786130b5..946a614ddfc0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -406,6 +406,8 @@ static int ext2_alloc_blocks(struct inode *inode,
ext2_fsblk_t current_block = 0;
int ret = 0;
+ ext2_inc_i_blkseq(EXT2_I(inode));
+
/*
* Here we try to allocate the requested multiple blocks at once,
* on a best-effort basis.
@@ -966,15 +968,62 @@ ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
return mpage_writepages(mapping, wbc, ext2_get_block);
}
+static bool ext2_imap_valid(struct iomap_writepage_ctx *wpc, struct inode *inode,
+ loff_t offset)
+{
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length)
+ return false;
+
+ if (wpc->iomap.validity_cookie != READ_ONCE(EXT2_I(inode)->i_blkseq))
+ return false;
+
+ return true;
+}
+
static int ext2_write_map_blocks(struct iomap_writepage_ctx *wpc,
struct inode *inode, loff_t offset,
unsigned len)
{
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
+ loff_t maxblocks = (loff_t)INT_MAX;
+ u8 blkbits = inode->i_blkbits;
+ u32 bno;
+ bool new, boundary;
+ int ret;
+
+ if (ext2_imap_valid(wpc, inode, offset))
return 0;
- return ext2_iomap_begin(inode, offset, inode->i_sb->s_blocksize,
+ /*
+ * For ext2 buffered-writes, the block allocation happens at the
+ * ->write_iter time itself. So at writeback time -
+ * 1. We first cache the i_blkseq.
+ * 2. Call ext2_get_blocks(, create = 0) to get the no. of blocks
+ * already allocated.
+ * 3. Call ext2_get_blocks() the second time with length to be same as
+ * the no. of blocks we know were already allocated.
+ * 4. Till now it means, the cached i_blkseq remains valid as no block
+ * allocation has happened yet.
+ * This means the next call to ->map_blocks(), we can verify whether the
+ * i_blkseq has raced with truncate or not. If not, then i_blkseq will
+ * remain valid.
+ *
+ * In case of a hole (could happen with mmaped writes), we only allocate
+ * 1 block at a time anyways. So even if the i_blkseq value changes, we
+ * anyway need to allocate the next block in subsequent ->map_blocks()
+ * call.
+ */
+ wpc->iomap.validity_cookie = READ_ONCE(EXT2_I(inode)->i_blkseq);
+
+ ret = ext2_get_blocks(inode, offset >> blkbits, maxblocks << blkbits,
+ &bno, &new, &boundary, 0);
+ if (ret < 0)
+ return ret;
+ /*
+ * ret can be 0 in case of a hole which is possible for mmaped writes.
+ */
+ ret = ret ? ret : 1;
+ return ext2_iomap_begin(inode, offset, (loff_t)ret << blkbits,
IOMAP_WRITE, &wpc->iomap, NULL);
}
@@ -1000,7 +1049,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc
const struct address_space_operations ext2_file_aops = {
.dirty_folio = iomap_dirty_folio,
- .release_folio = iomap_release_folio,
+ .release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.read_folio = ext2_file_read_folio,
.readahead = ext2_file_readahead,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 37f7ce56adce..32f5386284d6 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -188,7 +188,7 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
#ifdef CONFIG_QUOTA
memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
#endif
-
+ WRITE_ONCE(ei->i_blkseq, 0);
return &ei->vfs_inode;
}
--
2.44.0
Powered by blists - more mailing lists