[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <tzpuuk2lvbltpwxq7scjxdvuogztlptmvc7io23uwjwkokyqbs@kywrdx4jym7u>
Date: Tue, 6 Jan 2026 13:10:04 +0100
From: Jan Kara <jack@...e.cz>
To: Li Chen <me@...ux.beauty>
Cc: Jan Kara <jack@...e.cz>, Theodore Ts'o <tytso@....edu>,
Andreas Dilger <adilger.kernel@...ger.ca>, Harshad Shirwadkar <harshadshirwadkar@...il.com>,
linux-ext4@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] ext4: fast commit: make s_fc_lock reclaim-safe
On Tue 06-01-26 20:06:21, Li Chen wrote:
> s_fc_lock can be acquired from inode eviction and thus is
> reclaim unsafe. Since the fast commit path holds s_fc_lock while writing
> the commit log, allocations under the lock can enter reclaim and invert
> the lock order with fs_reclaim. Add ext4_fc_lock()/ext4_fc_unlock()
> helpers which acquire s_fc_lock under memalloc_nofs_save()/restore()
> context and use them everywhere so allocations under the lock cannot
> recurse into filesystem reclaim.
>
> Fixes: 6593714d67ba ("ext4: hold s_fc_lock while during fast commit")
> Signed-off-by: Li Chen <me@...ux.beauty>
Looks good. Feel free to add:
Reviewed-by: Jan Kara <jack@...e.cz>
Honza
> ---
>
> RFC->v1: create helper functions for acquiring / releasing the lock as suggested by Jan Kara.
>
> RFC: https://patchwork.ozlabs.org/project/linux-ext4/patch/20251223131342.287864-1-me@linux.beauty/
>
> fs/ext4/ext4.h | 16 ++++++++++++++
> fs/ext4/fast_commit.c | 51 ++++++++++++++++++++++++-------------------
> 2 files changed, 44 insertions(+), 23 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 57087da6c7be..933297251f66 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1771,6 +1771,10 @@ struct ext4_sb_info {
> * Main fast commit lock. This lock protects accesses to the
> * following fields:
> * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
> + *
> + * s_fc_lock can be taken from reclaim context (inode eviction) and is
> + * thus reclaim unsafe. Use ext4_fc_lock()/ext4_fc_unlock() helpers
> + * when acquiring / releasing the lock.
> */
> struct mutex s_fc_lock;
> struct buffer_head *s_fc_bh;
> @@ -1815,6 +1819,18 @@ static inline void ext4_writepages_up_write(struct super_block *sb, int ctx)
> percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem);
> }
>
> +static inline int ext4_fc_lock(struct super_block *sb)
> +{
> + mutex_lock(&EXT4_SB(sb)->s_fc_lock);
> + return memalloc_nofs_save();
> +}
> +
> +static inline void ext4_fc_unlock(struct super_block *sb, int ctx)
> +{
> + memalloc_nofs_restore(ctx);
> + mutex_unlock(&EXT4_SB(sb)->s_fc_lock);
> +}
> +
> static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
> {
> return ino == EXT4_ROOT_INO ||
> diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> index 5727ff4e9273..2f28a089fc7e 100644
> --- a/fs/ext4/fast_commit.c
> +++ b/fs/ext4/fast_commit.c
> @@ -231,16 +231,16 @@ static bool ext4_fc_disabled(struct super_block *sb)
> void ext4_fc_del(struct inode *inode)
> {
> struct ext4_inode_info *ei = EXT4_I(inode);
> - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> struct ext4_fc_dentry_update *fc_dentry;
> wait_queue_head_t *wq;
> + int alloc_ctx;
>
> if (ext4_fc_disabled(inode->i_sb))
> return;
>
> - mutex_lock(&sbi->s_fc_lock);
> + alloc_ctx = ext4_fc_lock(inode->i_sb);
> if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(inode->i_sb, alloc_ctx);
> return;
> }
>
> @@ -275,9 +275,9 @@ void ext4_fc_del(struct inode *inode)
> #endif
> prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
> if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(inode->i_sb, alloc_ctx);
> schedule();
> - mutex_lock(&sbi->s_fc_lock);
> + alloc_ctx = ext4_fc_lock(inode->i_sb);
> }
> finish_wait(wq, &wait.wq_entry);
> }
> @@ -288,7 +288,7 @@ void ext4_fc_del(struct inode *inode)
> * dentry create references, since it is not needed to log it anyways.
> */
> if (list_empty(&ei->i_fc_dilist)) {
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(inode->i_sb, alloc_ctx);
> return;
> }
>
> @@ -298,7 +298,7 @@ void ext4_fc_del(struct inode *inode)
> list_del_init(&fc_dentry->fcd_dilist);
>
> WARN_ON(!list_empty(&ei->i_fc_dilist));
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(inode->i_sb, alloc_ctx);
>
> release_dentry_name_snapshot(&fc_dentry->fcd_name);
> kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
> @@ -315,6 +315,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
> tid_t tid;
> bool has_transaction = true;
> bool is_ineligible;
> + int alloc_ctx;
>
> if (ext4_fc_disabled(sb))
> return;
> @@ -329,12 +330,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
> has_transaction = false;
> read_unlock(&sbi->s_journal->j_state_lock);
> }
> - mutex_lock(&sbi->s_fc_lock);
> + alloc_ctx = ext4_fc_lock(sb);
> is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
> if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
> sbi->s_fc_ineligible_tid = tid;
> ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(sb, alloc_ctx);
> WARN_ON(reason >= EXT4_FC_REASON_MAX);
> sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
> }
> @@ -358,6 +359,7 @@ static int ext4_fc_track_template(
> struct ext4_inode_info *ei = EXT4_I(inode);
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> tid_t tid = 0;
> + int alloc_ctx;
> int ret;
>
> tid = handle->h_transaction->t_tid;
> @@ -373,14 +375,14 @@ static int ext4_fc_track_template(
> if (!enqueue)
> return ret;
>
> - mutex_lock(&sbi->s_fc_lock);
> + alloc_ctx = ext4_fc_lock(inode->i_sb);
> if (list_empty(&EXT4_I(inode)->i_fc_list))
> list_add_tail(&EXT4_I(inode)->i_fc_list,
> (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
> sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
> &sbi->s_fc_q[FC_Q_STAGING] :
> &sbi->s_fc_q[FC_Q_MAIN]);
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(inode->i_sb, alloc_ctx);
>
> return ret;
> }
> @@ -402,6 +404,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
> struct inode *dir = dentry->d_parent->d_inode;
> struct super_block *sb = inode->i_sb;
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> + int alloc_ctx;
>
> spin_unlock(&ei->i_fc_lock);
>
> @@ -425,7 +428,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
> take_dentry_name_snapshot(&node->fcd_name, dentry);
> INIT_LIST_HEAD(&node->fcd_dilist);
> INIT_LIST_HEAD(&node->fcd_list);
> - mutex_lock(&sbi->s_fc_lock);
> + alloc_ctx = ext4_fc_lock(sb);
> if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
> sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
> list_add_tail(&node->fcd_list,
> @@ -446,7 +449,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
> WARN_ON(!list_empty(&ei->i_fc_dilist));
> list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
> }
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(sb, alloc_ctx);
> spin_lock(&ei->i_fc_lock);
>
> return 0;
> @@ -1051,18 +1054,19 @@ static int ext4_fc_perform_commit(journal_t *journal)
> struct blk_plug plug;
> int ret = 0;
> u32 crc = 0;
> + int alloc_ctx;
>
> /*
> * Step 1: Mark all inodes on s_fc_q[MAIN] with
> * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
> * freed until the data flush is over.
> */
> - mutex_lock(&sbi->s_fc_lock);
> + alloc_ctx = ext4_fc_lock(sb);
> list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
> ext4_set_inode_state(&iter->vfs_inode,
> EXT4_STATE_FC_FLUSHING_DATA);
> }
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(sb, alloc_ctx);
>
> /* Step 2: Flush data for all the eligible inodes. */
> ret = ext4_fc_flush_data(journal);
> @@ -1072,7 +1076,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
> * any error from step 2. This ensures that waiters waiting on
> * EXT4_STATE_FC_FLUSHING_DATA can resume.
> */
> - mutex_lock(&sbi->s_fc_lock);
> + alloc_ctx = ext4_fc_lock(sb);
> list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
> ext4_clear_inode_state(&iter->vfs_inode,
> EXT4_STATE_FC_FLUSHING_DATA);
> @@ -1089,7 +1093,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
> * prepare_to_wait() in ext4_fc_del().
> */
> smp_mb();
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(sb, alloc_ctx);
>
> /*
> * If we encountered error in Step 2, return it now after clearing
> @@ -1106,12 +1110,12 @@ static int ext4_fc_perform_commit(journal_t *journal)
> * previous handles are now drained. We now mark the inodes on the
> * commit queue as being committed.
> */
> - mutex_lock(&sbi->s_fc_lock);
> + alloc_ctx = ext4_fc_lock(sb);
> list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
> ext4_set_inode_state(&iter->vfs_inode,
> EXT4_STATE_FC_COMMITTING);
> }
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(sb, alloc_ctx);
> jbd2_journal_unlock_updates(journal);
>
> /*
> @@ -1122,6 +1126,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
> blkdev_issue_flush(journal->j_fs_dev);
>
> blk_start_plug(&plug);
> + alloc_ctx = ext4_fc_lock(sb);
> /* Step 6: Write fast commit blocks to disk. */
> if (sbi->s_fc_bytes == 0) {
> /*
> @@ -1139,7 +1144,6 @@ static int ext4_fc_perform_commit(journal_t *journal)
> }
>
> /* Step 6.2: Now write all the dentry updates. */
> - mutex_lock(&sbi->s_fc_lock);
> ret = ext4_fc_commit_dentry_updates(journal, &crc);
> if (ret)
> goto out;
> @@ -1161,7 +1165,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
> ret = ext4_fc_write_tail(sb, crc);
>
> out:
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(sb, alloc_ctx);
> blk_finish_plug(&plug);
> return ret;
> }
> @@ -1295,6 +1299,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> struct ext4_inode_info *ei;
> struct ext4_fc_dentry_update *fc_dentry;
> + int alloc_ctx;
>
> if (full && sbi->s_fc_bh)
> sbi->s_fc_bh = NULL;
> @@ -1302,7 +1307,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
> trace_ext4_fc_cleanup(journal, full, tid);
> jbd2_fc_release_bufs(journal);
>
> - mutex_lock(&sbi->s_fc_lock);
> + alloc_ctx = ext4_fc_lock(sb);
> while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
> ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
> struct ext4_inode_info,
> @@ -1361,7 +1366,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
>
> if (full)
> sbi->s_fc_bytes = 0;
> - mutex_unlock(&sbi->s_fc_lock);
> + ext4_fc_unlock(sb, alloc_ctx);
> trace_ext4_fc_stats(sb);
> }
>
> --
> 2.52.0
>
--
Jan Kara <jack@...e.com>
SUSE Labs, CR
Powered by blists - more mailing lists