[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20210610161221.GD23539@quack2.suse.cz>
Date: Thu, 10 Jun 2021 18:12:21 +0200
From: Jan Kara <jack@...e.cz>
To: Zhang Yi <yi.zhang@...wei.com>
Cc: linux-ext4@...r.kernel.org, linux-fsdevel@...r.kernel.org,
jack@...e.cz, tytso@....edu, adilger.kernel@...ger.ca,
david@...morbit.com, hch@...radead.org
Subject: Re: [RFC PATCH v4 5/8] jbd2,ext4: add a shrinker to release
checkpointed buffers
On Thu 10-06-21 19:24:37, Zhang Yi wrote:
> Current metadata buffer release logic in bdev_try_to_free_page() have
> a lot of use-after-free issues when umount filesystem concurrently, and
> it is difficult to fix directly because ext4 is the only user of
> s_op->bdev_try_to_free_page callback and we may have to add more special
> refcount or lock that is only used by ext4 into the common vfs layer,
> which is unacceptable.
>
> One better solution is remove the bdev_try_to_free_page callback, but
> the real problem is we cannot easily release journal_head on the
> checkpointed buffer, so try_to_free_buffers() cannot release buffers and
> page under memory pressure, which is more likely to trigger
> out-of-memory. So we cannot remove the callback directly before we find
> another way to release journal_head.
>
> This patch introduce a shrinker to free journal_head on the checkpointed
> transaction. After the journal_head got freed, try_to_free_buffers()
> could free buffer properly.
>
> Signed-off-by: Zhang Yi <yi.zhang@...wei.com>
> Suggested-by: Jan Kara <jack@...e.cz>
Looks good. Feel free to add:
Reviewed-by: Jan Kara <jack@...e.cz>
Honza
> ---
> fs/ext4/super.c | 8 ++
> fs/jbd2/checkpoint.c | 147 ++++++++++++++++++++++++++++++++++++
> fs/jbd2/journal.c | 87 +++++++++++++++++++++
> include/linux/jbd2.h | 26 +++++++
> include/trace/events/jbd2.h | 101 +++++++++++++++++++++++++
> 5 files changed, 369 insertions(+)
>
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index d29f6aa7d96e..80064e566f56 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1174,6 +1174,7 @@ static void ext4_put_super(struct super_block *sb)
> ext4_unregister_sysfs(sb);
>
> if (sbi->s_journal) {
> + jbd2_journal_unregister_shrinker(sbi->s_journal);
> aborted = is_journal_aborted(sbi->s_journal);
> err = jbd2_journal_destroy(sbi->s_journal);
> sbi->s_journal = NULL;
> @@ -5178,6 +5179,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> sbi->s_ea_block_cache = NULL;
>
> if (sbi->s_journal) {
> + jbd2_journal_unregister_shrinker(sbi->s_journal);
> jbd2_journal_destroy(sbi->s_journal);
> sbi->s_journal = NULL;
> }
> @@ -5504,6 +5506,12 @@ static int ext4_load_journal(struct super_block *sb,
> ext4_commit_super(sb);
> }
>
> + err = jbd2_journal_register_shrinker(journal);
> + if (err) {
> + EXT4_SB(sb)->s_journal = NULL;
> + goto err_out;
> + }
> +
> return 0;
>
> err_out:
> diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
> index 75a4f622afaf..1abdae44a3d8 100644
> --- a/fs/jbd2/checkpoint.c
> +++ b/fs/jbd2/checkpoint.c
> @@ -79,6 +79,18 @@ static inline void __buffer_relink_io(struct journal_head *jh)
> transaction->t_checkpoint_io_list = jh;
> }
>
> +/*
> + * Check a checkpoint buffer could be release or not.
> + *
> + * Requires j_list_lock
> + */
> +static inline bool __cp_buffer_busy(struct journal_head *jh)
> +{
> + struct buffer_head *bh = jh2bh(jh);
> +
> + return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh));
> +}
> +
> /*
> * Try to release a checkpointed buffer from its transaction.
> * Returns 1 if we released it and 2 if we also released the
> @@ -458,6 +470,137 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
> return 0;
> }
>
> +/*
> + * journal_shrink_one_cp_list
> + *
> + * Find 'nr_to_scan' written-back checkpoint buffers in the given list
> + * and try to release them. If the whole transaction is released, set
> + * the 'released' parameter. Return the number of released checkpointed
> + * buffers.
> + *
> + * Called with j_list_lock held.
> + */
> +static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
> + unsigned long *nr_to_scan,
> + bool *released)
> +{
> + struct journal_head *last_jh;
> + struct journal_head *next_jh = jh;
> + unsigned long nr_freed = 0;
> + int ret;
> +
> + if (!jh || *nr_to_scan == 0)
> + return 0;
> +
> + last_jh = jh->b_cpprev;
> + do {
> + jh = next_jh;
> + next_jh = jh->b_cpnext;
> +
> + (*nr_to_scan)--;
> + if (__cp_buffer_busy(jh))
> + continue;
> +
> + nr_freed++;
> + ret = __jbd2_journal_remove_checkpoint(jh);
> + if (ret) {
> + *released = true;
> + break;
> + }
> +
> + if (need_resched())
> + break;
> + } while (jh != last_jh && *nr_to_scan);
> +
> + return nr_freed;
> +}
> +
> +/*
> + * jbd2_journal_shrink_checkpoint_list
> + *
> + * Find 'nr_to_scan' written-back checkpoint buffers in the journal
> + * and try to release them. Return the number of released checkpointed
> + * buffers.
> + *
> + * Called with j_list_lock held.
> + */
> +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
> + unsigned long *nr_to_scan)
> +{
> + transaction_t *transaction, *last_transaction, *next_transaction;
> + bool released;
> + tid_t first_tid = 0, last_tid = 0, next_tid = 0;
> + tid_t tid = 0;
> + unsigned long nr_freed = 0;
> + unsigned long nr_scanned = *nr_to_scan;
> +
> +again:
> + spin_lock(&journal->j_list_lock);
> + if (!journal->j_checkpoint_transactions) {
> + spin_unlock(&journal->j_list_lock);
> + goto out;
> + }
> +
> + /*
> + * Get next shrink transaction, resume previous scan or start
> + * over again. If some others do checkpoint and drop transaction
> + * from the checkpoint list, we ignore saved j_shrink_transaction
> + * and start over unconditionally.
> + */
> + if (journal->j_shrink_transaction)
> + transaction = journal->j_shrink_transaction;
> + else
> + transaction = journal->j_checkpoint_transactions;
> +
> + if (!first_tid)
> + first_tid = transaction->t_tid;
> + last_transaction = journal->j_checkpoint_transactions->t_cpprev;
> + next_transaction = transaction;
> + last_tid = last_transaction->t_tid;
> + do {
> + transaction = next_transaction;
> + next_transaction = transaction->t_cpnext;
> + tid = transaction->t_tid;
> + released = false;
> +
> + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list,
> + nr_to_scan, &released);
> + if (*nr_to_scan == 0)
> + break;
> + if (need_resched() || spin_needbreak(&journal->j_list_lock))
> + break;
> + if (released)
> + continue;
> +
> + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list,
> + nr_to_scan, &released);
> + if (*nr_to_scan == 0)
> + break;
> + if (need_resched() || spin_needbreak(&journal->j_list_lock))
> + break;
> + } while (transaction != last_transaction);
> +
> + if (transaction != last_transaction) {
> + journal->j_shrink_transaction = next_transaction;
> + next_tid = next_transaction->t_tid;
> + } else {
> + journal->j_shrink_transaction = NULL;
> + next_tid = 0;
> + }
> +
> + spin_unlock(&journal->j_list_lock);
> + cond_resched();
> +
> + if (*nr_to_scan && next_tid)
> + goto again;
> +out:
> + nr_scanned -= *nr_to_scan;
> + trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid,
> + nr_freed, nr_scanned, next_tid);
> +
> + return nr_freed;
> +}
> +
> /*
> * journal_clean_checkpoint_list
> *
> @@ -580,6 +723,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
>
> __buffer_unlink(jh);
> jh->b_cp_transaction = NULL;
> + percpu_counter_dec(&journal->j_jh_shrink_count);
> jbd2_journal_put_journal_head(jh);
>
> /* Is this transaction empty? */
> @@ -642,6 +786,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
> jh->b_cpnext->b_cpprev = jh;
> }
> transaction->t_checkpoint_list = jh;
> + percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count);
> }
>
> /*
> @@ -657,6 +802,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
> void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
> {
> assert_spin_locked(&journal->j_list_lock);
> +
> + journal->j_shrink_transaction = NULL;
> if (transaction->t_cpnext) {
> transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
> transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 90146755941f..3746bb4fc431 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -1954,6 +1954,91 @@ int jbd2_journal_load(journal_t *journal)
> return -EIO;
> }
>
> +/**
> + * jbd2_journal_shrink_scan()
> + *
> + * Scan the checkpointed buffer on the checkpoint list and release the
> + * journal_head.
> + */
> +static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
> + struct shrink_control *sc)
> +{
> + journal_t *journal = container_of(shrink, journal_t, j_shrinker);
> + unsigned long nr_to_scan = sc->nr_to_scan;
> + unsigned long nr_shrunk;
> + unsigned long count;
> +
> + count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
> + trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count);
> +
> + nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan);
> +
> + count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
> + trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count);
> +
> + return nr_shrunk;
> +}
> +
> +/**
> + * jbd2_journal_shrink_count()
> + *
> + * Count the number of checkpoint buffers on the checkpoint list.
> + */
> +static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
> + struct shrink_control *sc)
> +{
> + journal_t *journal = container_of(shrink, journal_t, j_shrinker);
> + unsigned long count;
> +
> + count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
> + trace_jbd2_shrink_count(journal, sc->nr_to_scan, count);
> +
> + return count;
> +}
> +
> +/**
> + * jbd2_journal_register_shrinker()
> + * @journal: Journal to act on.
> + *
> + * Init a percpu counter to record the checkpointed buffers on the checkpoint
> + * list and register a shrinker to release their journal_head.
> + */
> +int jbd2_journal_register_shrinker(journal_t *journal)
> +{
> + int err;
> +
> + journal->j_shrink_transaction = NULL;
> +
> + err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL);
> + if (err)
> + return err;
> +
> + journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
> + journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
> + journal->j_shrinker.seeks = DEFAULT_SEEKS;
> + journal->j_shrinker.batch = journal->j_max_transaction_buffers;
> +
> + err = register_shrinker(&journal->j_shrinker);
> + if (err) {
> + percpu_counter_destroy(&journal->j_jh_shrink_count);
> + return err;
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * jbd2_journal_unregister_shrinker()
> + * @journal: Journal to act on.
> + *
> + * Unregister the checkpointed buffer shrinker and destroy the percpu counter.
> + */
> +void jbd2_journal_unregister_shrinker(journal_t *journal)
> +{
> + percpu_counter_destroy(&journal->j_jh_shrink_count);
> + unregister_shrinker(&journal->j_shrinker);
> +}
> +
> /**
> * jbd2_journal_destroy() - Release a journal_t structure.
> * @journal: Journal to act on.
> @@ -2026,6 +2111,8 @@ int jbd2_journal_destroy(journal_t *journal)
> brelse(journal->j_sb_buffer);
> }
>
> + jbd2_journal_unregister_shrinker(journal);
> +
> if (journal->j_proc_entry)
> jbd2_stats_proc_exit(journal);
> iput(journal->j_inode);
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index f9b5e657b8f3..23578506215f 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -909,6 +909,29 @@ struct journal_s
> */
> struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH];
>
> + /**
> + * @j_shrinker:
> + *
> + * Journal head shrinker, reclaim buffer's journal head which
> + * has been written back.
> + */
> + struct shrinker j_shrinker;
> +
> + /**
> + * @j_jh_shrink_count:
> + *
> + * Number of journal buffers on the checkpoint list. [j_list_lock]
> + */
> + struct percpu_counter j_jh_shrink_count;
> +
> + /**
> + * @j_shrink_transaction:
> + *
> + * Record next transaction will shrink on the checkpoint list.
> + * [j_list_lock]
> + */
> + transaction_t *j_shrink_transaction;
> +
> /**
> * @j_head:
> *
> @@ -1418,6 +1441,7 @@ extern void jbd2_journal_commit_transaction(journal_t *);
>
> /* Checkpoint list management */
> void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
> +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
> int __jbd2_journal_remove_checkpoint(struct journal_head *);
> void jbd2_journal_destroy_checkpoint(journal_t *journal);
> void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
> @@ -1528,6 +1552,8 @@ extern int jbd2_journal_set_features
> (journal_t *, unsigned long, unsigned long, unsigned long);
> extern void jbd2_journal_clear_features
> (journal_t *, unsigned long, unsigned long, unsigned long);
> +extern int jbd2_journal_register_shrinker(journal_t *journal);
> +extern void jbd2_journal_unregister_shrinker(journal_t *journal);
> extern int jbd2_journal_load (journal_t *journal);
> extern int jbd2_journal_destroy (journal_t *);
> extern int jbd2_journal_recover (journal_t *journal);
> diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
> index d16a32867f3a..a4dfe005983d 100644
> --- a/include/trace/events/jbd2.h
> +++ b/include/trace/events/jbd2.h
> @@ -394,6 +394,107 @@ TRACE_EVENT(jbd2_lock_buffer_stall,
> __entry->stall_ms)
> );
>
> +DECLARE_EVENT_CLASS(jbd2_journal_shrink,
> +
> + TP_PROTO(journal_t *journal, unsigned long nr_to_scan,
> + unsigned long count),
> +
> + TP_ARGS(journal, nr_to_scan, count),
> +
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(unsigned long, nr_to_scan)
> + __field(unsigned long, count)
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = journal->j_fs_dev->bd_dev;
> + __entry->nr_to_scan = nr_to_scan;
> + __entry->count = count;
> + ),
> +
> + TP_printk("dev %d,%d nr_to_scan %lu count %lu",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->nr_to_scan, __entry->count)
> +);
> +
> +DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count,
> +
> + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count),
> +
> + TP_ARGS(journal, nr_to_scan, count)
> +);
> +
> +DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter,
> +
> + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count),
> +
> + TP_ARGS(journal, nr_to_scan, count)
> +);
> +
> +TRACE_EVENT(jbd2_shrink_scan_exit,
> +
> + TP_PROTO(journal_t *journal, unsigned long nr_to_scan,
> + unsigned long nr_shrunk, unsigned long count),
> +
> + TP_ARGS(journal, nr_to_scan, nr_shrunk, count),
> +
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(unsigned long, nr_to_scan)
> + __field(unsigned long, nr_shrunk)
> + __field(unsigned long, count)
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = journal->j_fs_dev->bd_dev;
> + __entry->nr_to_scan = nr_to_scan;
> + __entry->nr_shrunk = nr_shrunk;
> + __entry->count = count;
> + ),
> +
> + TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->nr_to_scan, __entry->nr_shrunk,
> + __entry->count)
> +);
> +
> +TRACE_EVENT(jbd2_shrink_checkpoint_list,
> +
> + TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid,
> + unsigned long nr_freed, unsigned long nr_scanned,
> + tid_t next_tid),
> +
> + TP_ARGS(journal, first_tid, tid, last_tid, nr_freed,
> + nr_scanned, next_tid),
> +
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(tid_t, first_tid)
> + __field(tid_t, tid)
> + __field(tid_t, last_tid)
> + __field(unsigned long, nr_freed)
> + __field(unsigned long, nr_scanned)
> + __field(tid_t, next_tid)
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = journal->j_fs_dev->bd_dev;
> + __entry->first_tid = first_tid;
> + __entry->tid = tid;
> + __entry->last_tid = last_tid;
> + __entry->nr_freed = nr_freed;
> + __entry->nr_scanned = nr_scanned;
> + __entry->next_tid = next_tid;
> + ),
> +
> + TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu "
> + "scanned %lu next transaction %u",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->first_tid, __entry->tid, __entry->last_tid,
> + __entry->nr_freed, __entry->nr_scanned, __entry->next_tid)
> +);
> +
> #endif /* _TRACE_JBD2_H */
>
> /* This part must be outside protection */
> --
> 2.31.1
>
--
Jan Kara <jack@...e.com>
SUSE Labs, CR
Powered by blists - more mailing lists