Message-Id: <20240805013446.814357-1-chengzhihao@huaweicloud.com>
Date: Mon, 5 Aug 2024 09:34:46 +0800
From: Zhihao Cheng <chengzhihao@...weicloud.com>
To: viro@...iv.linux.org.uk,
brauner@...nel.org,
tahsin@...gle.com,
mjguzik@...il.com,
error27@...il.com,
tytso@....edu,
rydercoding@...mail.com,
jack@...e.cz,
hch@...radead.org,
andreas.dilger@...el.com,
richard@....at
Cc: linux-fsdevel@...r.kernel.org,
linux-ext4@...r.kernel.org,
linux-mtd@...ts.infradead.org,
linux-kernel@...r.kernel.org,
chengzhihao1@...wei.com,
yi.zhang@...wei.com,
yangerkun@...wei.com,
wangzhaolong1@...wei.com
Subject: [PATCH] vfs: Don't evict inode under the inode lru traversing context
From: Zhihao Cheng <chengzhihao1@...wei.com>
The inode reclaiming process (see prune_icache_sb()) first collects all
reclaimable inodes and marks them with the I_FREEING flag; from that
point, any other process that tries to get one of these inodes (see
find_inode_fast()) gets stuck until the reclaiming process destroys
them via dispose_list(). Some filesystems (e.g. ext4 with the ea_inode
feature, ubifs with xattrs) may do an inode lookup in their inode
evicting callback, and if that lookup runs under the inode LRU
traversing context, a deadlock can occur.
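
For reference, here is a condensed sketch of the lookup side, modeled on
find_inode_fast() in fs/inode.c (simplified and renamed, not the exact
kernel code):

	static struct inode *find_inode_fast_sketch(struct super_block *sb,
						    struct hlist_head *head,
						    unsigned long ino)
	{
		struct inode *inode;
	repeat:
		hlist_for_each_entry(inode, head, i_hash) {
			if (inode->i_ino != ino || inode->i_sb != sb)
				continue;
			spin_lock(&inode->i_lock);
			if (inode->i_state & (I_FREEING | I_WILL_FREE)) {
				/*
				 * Sleep until the reclaimer's dispose_list()
				 * issues wake_up_bit() on i_state. If that
				 * dispose_list() can never run, we sleep
				 * forever -- the deadlocks shown below.
				 */
				__wait_on_freeing_inode(inode);
				goto repeat;
			}
			__iget(inode);
			spin_unlock(&inode->i_lock);
			return inode;
		}
		return NULL;
	}
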
Case 1: In ext4_evict_inode(), an ea inode lookup can happen if the
        ea_inode feature is enabled, and the lookup process gets stuck
        under the evicting context like this:

 1. File A has inode i_reg and an ea inode i_ea
 2. getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea
 3. Then, the following two processes run like this:
    PA                                  PB
 echo 2 > /proc/sys/vm/drop_caches
  shrink_slab
   prune_dcache_sb
   // i_reg is added into lru, lru->i_ea->i_reg
   prune_icache_sb
    list_lru_walk_one
     inode_lru_isolate
      i_ea->i_state |= I_FREEING // set inode state
     inode_lru_isolate
      __iget(i_reg)
      spin_unlock(&i_reg->i_lock)
      spin_unlock(lru_lock)
                                     rm file A
                                      i_reg->nlink = 0
      iput(i_reg) // i_reg->nlink is 0, do evict
       ext4_evict_inode
        ext4_xattr_delete_inode
         ext4_xattr_inode_dec_ref_all
          ext4_xattr_inode_iget
           ext4_iget(i_ea->i_ino)
            iget_locked
             find_inode_fast
              __wait_on_freeing_inode(i_ea) ----→ AA deadlock
    dispose_list // cannot be executed by prune_icache_sb
     wake_up_bit(&i_ea->i_state)
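
The troublesome step above is the __iget()/iput() pair that
inode_lru_isolate() uses to pin i_reg while reclaiming its buffers and
pages (the lines removed by the diff below); condensed:

	/* Pre-patch logic in inode_lru_isolate(), condensed: */
	if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
		__iget(inode);		/* pin with a real reference */
		spin_unlock(&inode->i_lock);
		spin_unlock(lru_lock);
		if (remove_inode_buffers(inode)) {
			/* account the reclaimed pages */
		}
		/*
		 * If i_nlink dropped to 0 in the meantime (PB's "rm file
		 * A"), this iput() drops the last reference and runs
		 * evict() from inside the LRU walk. Case 2 below rides
		 * the same window.
		 */
		iput(inode);
		spin_lock(lru_lock);
		return LRU_RETRY;
	}
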
Case 2: In the deleted-inode writing function ubifs_jnl_write_inode(),
        the file deleting process holds BASEHD's wbuf->io_mutex while
        getting the xattr inode, which can race with the inode
        reclaiming process (the reclaiming process may try to lock
        BASEHD's wbuf->io_mutex in its inode evicting function), and an
        ABBA deadlock happens as follows:

 1. File A has inode ia and an xattr (with inode ixa), regular file B
    has inode ib and an xattr.
 2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa
 3. Then, the following three processes run like this:
    PA                   PB                   PC
 echo 2 > /proc/sys/vm/drop_caches
  shrink_slab
   prune_dcache_sb
   // ib and ia are added into lru, lru->ixa->ib->ia
   prune_icache_sb
    list_lru_walk_one
     inode_lru_isolate
      ixa->i_state |= I_FREEING // set inode state
     inode_lru_isolate
      __iget(ib)
      spin_unlock(&ib->i_lock)
      spin_unlock(lru_lock)
                      rm file B
                       ib->nlink = 0
                                           rm file A
                                            iput(ia)
                                             ubifs_evict_inode(ia)
                                              ubifs_jnl_delete_inode(ia)
                                               ubifs_jnl_write_inode(ia)
                                                make_reservation(BASEHD) // Lock wbuf->io_mutex
                                                ubifs_iget(ixa->i_ino)
                                                 iget_locked
                                                  find_inode_fast
                                                   __wait_on_freeing_inode(ixa)
                                                    |   iput(ib) // ib->nlink is 0, do evict
                                                    |    ubifs_evict_inode
                                                    |     ubifs_jnl_delete_inode(ib)
                                                    ↓      ubifs_jnl_write_inode
                                             ABBA deadlock ←-----make_reservation(BASEHD)
    dispose_list // cannot be executed by prune_icache_sb
     wake_up_bit(&ixa->i_state)
Fix it by forbidding inode evicting under the inode LRU traversing
context. In detail, we introduce a new inode state flag
'I_LRU_ISOLATING' to pin the inode, without holding i_count, while it
is handled under the inode LRU traversing context; the inode evicting
process waits until this flag is cleared from i_state.
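
Condensed, the pin/unpin protocol this adds looks as follows (locking
details elided; the real code is in the hunks below):

	/* LRU walker (inode_lru_isolate()), under inode->i_lock: */
	inode->i_state |= I_LRU_ISOLATING;	/* pin without i_count */
	/* ... unlock, reclaim buffers and pages, relock ... */
	inode->i_state &= ~I_LRU_ISOLATING;
	wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);

	/* Evicting process (evict()), before tearing the inode down: */
	while (inode->i_state & I_LRU_ISOLATING)
		wait_on_bit(&inode->i_state, __I_LRU_ISOLATING,
			    TASK_UNINTERRUPTIBLE);
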
Link: https://lore.kernel.org/all/37c29c42-7685-d1f0-067d-63582ffac405@huaweicloud.com/
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219022
Fixes: e50e5129f384 ("ext4: xattr-in-inode support")
Fixes: 7959cf3a7506 ("ubifs: journal: Handle xattrs like files")
Cc: stable@...r.kernel.org
Signed-off-by: Zhihao Cheng <chengzhihao1@...wei.com>
Suggested-by: Jan Kara <jack@...e.cz>
Suggested-by: Mateusz Guzik <mjguzik@...il.com>
---
fs/inode.c | 37 +++++++++++++++++++++++++++++++++++--
include/linux/fs.h | 5 +++++
2 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 86670941884b..f1c6e8072f39 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -488,6 +488,36 @@ static void inode_lru_list_del(struct inode *inode)
this_cpu_dec(nr_unused);
}
+static void inode_lru_isolating(struct inode *inode)
+{
+ BUG_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
+ inode->i_state |= I_LRU_ISOLATING;
+}
+
+static void inode_lru_finish_isolating(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ BUG_ON(!(inode->i_state & I_LRU_ISOLATING));
+ inode->i_state &= ~I_LRU_ISOLATING;
+ wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
+ spin_unlock(&inode->i_lock);
+}
+
+static void inode_wait_for_lru_isolating(struct inode *inode)
+{
+ DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
+ wait_queue_head_t *wqh;
+
+ spin_lock(&inode->i_lock);
+ wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
+ while (inode->i_state & I_LRU_ISOLATING) {
+ spin_unlock(&inode->i_lock);
+ __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
+ spin_lock(&inode->i_lock);
+ }
+ spin_unlock(&inode->i_lock);
+}
+
/**
* inode_sb_list_add - add inode to the superblock list of inodes
* @inode: inode to add
@@ -657,6 +687,9 @@ static void evict(struct inode *inode)
inode_sb_list_del(inode);
+ /* Wait for LRU isolating to finish. */
+ inode_wait_for_lru_isolating(inode);
+
/*
* Wait for flusher thread to be done with the inode so that filesystem
* does not start destroying it while writeback is still running. Since
@@ -855,7 +888,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
* be under pressure before the cache inside the highmem zone.
*/
if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
- __iget(inode);
+ inode_lru_isolating(inode);
spin_unlock(&inode->i_lock);
spin_unlock(lru_lock);
if (remove_inode_buffers(inode)) {
@@ -867,7 +900,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
__count_vm_events(PGINODESTEAL, reap);
mm_account_reclaimed_pages(reap);
}
- iput(inode);
+ inode_lru_finish_isolating(inode);
spin_lock(lru_lock);
return LRU_RETRY;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd34b5755c0b..fb0426f349fc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2392,6 +2392,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
*
* I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback.
*
+ * I_LRU_ISOLATING Inode is pinned while being isolated from the LRU
+ * without holding i_count.
+ *
* Q: What is the difference between I_WILL_FREE and I_FREEING?
*/
#define I_DIRTY_SYNC (1 << 0)
@@ -2415,6 +2418,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
#define I_DONTCACHE (1 << 16)
#define I_SYNC_QUEUED (1 << 17)
#define I_PINNING_NETFS_WB (1 << 18)
+#define __I_LRU_ISOLATING 19
+#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING)
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
--
2.39.2