[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230830132503.6xxgb4g7xi7n6lbr@quack3>
Date: Wed, 30 Aug 2023 15:25:03 +0200
From: Jan Kara <jack@...e.cz>
To: Zhang Yi <yi.zhang@...weicloud.com>
Cc: linux-ext4@...r.kernel.org, tytso@....edu,
adilger.kernel@...ger.ca, jack@...e.cz, yi.zhang@...wei.com,
chengzhihao1@...wei.com, yukuai3@...wei.com
Subject: Re: [RFC PATCH 02/16] ext4: make sure allocate pending entry not fail
On Thu 24-08-23 17:26:05, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@...wei.com>
>
> __insert_pending() allocate memory in atomic context, so the allocation
> could fail, but we are not handling that failure now. It could lead
> ext4_es_remove_extent() to get wrong reserved clusters, and the global
> data blocks reservation count will be incorrect. The same to
> extents_status entry preallocation, preallocate pending entry out of the
> i_es_lock with __GFP_NOFAIL, make sure __insert_pending() and
> __revise_pending() always succeeds.
>
> Signed-off-by: Zhang Yi <yi.zhang@...wei.com>
Looks sensible. Feel free to add:
Reviewed-by: Jan Kara <jack@...e.cz>
Honza
> ---
> fs/ext4/extents_status.c | 123 ++++++++++++++++++++++++++++-----------
> 1 file changed, 89 insertions(+), 34 deletions(-)
>
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index 5e625ea4545d..f4b50652f0cc 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -152,8 +152,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
> static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
> static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
> struct ext4_inode_info *locked_ei);
> -static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
> - ext4_lblk_t len);
> +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
> + ext4_lblk_t len,
> + struct pending_reservation **prealloc);
>
> int __init ext4_init_es(void)
> {
> @@ -448,6 +449,19 @@ static void ext4_es_list_del(struct inode *inode)
> spin_unlock(&sbi->s_es_lock);
> }
>
> +static inline struct pending_reservation *__alloc_pending(bool nofail)
> +{
> + if (!nofail)
> + return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
> +
> + return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL);
> +}
> +
> +static inline void __free_pending(struct pending_reservation *pr)
> +{
> + kmem_cache_free(ext4_pending_cachep, pr);
> +}
> +
> /*
> * Returns true if we cannot fail to allocate memory for this extent_status
> * entry and cannot reclaim it until its status changes.
> @@ -836,11 +850,12 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
> {
> struct extent_status newes;
> ext4_lblk_t end = lblk + len - 1;
> - int err1 = 0;
> - int err2 = 0;
> + int err1 = 0, err2 = 0, err3 = 0;
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> struct extent_status *es1 = NULL;
> struct extent_status *es2 = NULL;
> + struct pending_reservation *pr = NULL;
> + bool revise_pending = false;
>
> if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> return;
> @@ -868,11 +883,17 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>
> ext4_es_insert_extent_check(inode, &newes);
>
> + revise_pending = sbi->s_cluster_ratio > 1 &&
> + test_opt(inode->i_sb, DELALLOC) &&
> + (status & (EXTENT_STATUS_WRITTEN |
> + EXTENT_STATUS_UNWRITTEN));
> retry:
> if (err1 && !es1)
> es1 = __es_alloc_extent(true);
> if ((err1 || err2) && !es2)
> es2 = __es_alloc_extent(true);
> + if ((err1 || err2 || err3) && revise_pending && !pr)
> + pr = __alloc_pending(true);
> write_lock(&EXT4_I(inode)->i_es_lock);
>
> err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
> @@ -897,13 +918,18 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
> es2 = NULL;
> }
>
> - if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
> - (status & EXTENT_STATUS_WRITTEN ||
> - status & EXTENT_STATUS_UNWRITTEN))
> - __revise_pending(inode, lblk, len);
> + if (revise_pending) {
> + err3 = __revise_pending(inode, lblk, len, &pr);
> + if (err3 != 0)
> + goto error;
> + if (pr) {
> + __free_pending(pr);
> + pr = NULL;
> + }
> + }
> error:
> write_unlock(&EXT4_I(inode)->i_es_lock);
> - if (err1 || err2)
> + if (err1 || err2 || err3)
> goto retry;
>
> ext4_es_print_tree(inode);
> @@ -1311,7 +1337,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
> rc->ndelonly--;
> node = rb_next(&pr->rb_node);
> rb_erase(&pr->rb_node, &tree->root);
> - kmem_cache_free(ext4_pending_cachep, pr);
> + __free_pending(pr);
> if (!node)
> break;
> pr = rb_entry(node, struct pending_reservation,
> @@ -1907,11 +1933,13 @@ static struct pending_reservation *__get_pending(struct inode *inode,
> *
> * @inode - file containing the cluster
> * @lblk - logical block in the cluster to be added
> + * @prealloc - preallocated pending entry
> *
> * Returns 0 on successful insertion and -ENOMEM on failure. If the
> * pending reservation is already in the set, returns successfully.
> */
> -static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
> +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
> + struct pending_reservation **prealloc)
> {
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
> @@ -1937,10 +1965,15 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
> }
> }
>
> - pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
> - if (pr == NULL) {
> - ret = -ENOMEM;
> - goto out;
> + if (likely(*prealloc == NULL)) {
> + pr = __alloc_pending(false);
> + if (!pr) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + } else {
> + pr = *prealloc;
> + *prealloc = NULL;
> }
> pr->lclu = lclu;
>
> @@ -1970,7 +2003,7 @@ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
> if (pr != NULL) {
> tree = &EXT4_I(inode)->i_pending_tree;
> rb_erase(&pr->rb_node, &tree->root);
> - kmem_cache_free(ext4_pending_cachep, pr);
> + __free_pending(pr);
> }
> }
>
> @@ -2029,10 +2062,10 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
> bool allocated)
> {
> struct extent_status newes;
> - int err1 = 0;
> - int err2 = 0;
> + int err1 = 0, err2 = 0, err3 = 0;
> struct extent_status *es1 = NULL;
> struct extent_status *es2 = NULL;
> + struct pending_reservation *pr = NULL;
>
> if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> return;
> @@ -2052,6 +2085,8 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
> es1 = __es_alloc_extent(true);
> if ((err1 || err2) && !es2)
> es2 = __es_alloc_extent(true);
> + if ((err1 || err2 || err3) && allocated && !pr)
> + pr = __alloc_pending(true);
> write_lock(&EXT4_I(inode)->i_es_lock);
>
> err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
> @@ -2074,11 +2109,18 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
> es2 = NULL;
> }
>
> - if (allocated)
> - __insert_pending(inode, lblk);
> + if (allocated) {
> + err3 = __insert_pending(inode, lblk, &pr);
> + if (err3 != 0)
> + goto error;
> + if (pr) {
> + __free_pending(pr);
> + pr = NULL;
> + }
> + }
> error:
> write_unlock(&EXT4_I(inode)->i_es_lock);
> - if (err1 || err2)
> + if (err1 || err2 || err3)
> goto retry;
>
> ext4_es_print_tree(inode);
> @@ -2184,21 +2226,24 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
> * @inode - file containing the range
> * @lblk - logical block defining the start of range
> * @len - length of range in blocks
> + * @prealloc - preallocated pending entry
> *
> * Used after a newly allocated extent is added to the extents status tree.
> * Requires that the extents in the range have either written or unwritten
> * status. Must be called while holding i_es_lock.
> */
> -static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
> - ext4_lblk_t len)
> +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
> + ext4_lblk_t len,
> + struct pending_reservation **prealloc)
> {
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> ext4_lblk_t end = lblk + len - 1;
> ext4_lblk_t first, last;
> bool f_del = false, l_del = false;
> + int ret = 0;
>
> if (len == 0)
> - return;
> + return 0;
>
> /*
> * Two cases - block range within single cluster and block range
> @@ -2219,7 +2264,9 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
> f_del = __es_scan_range(inode, &ext4_es_is_delonly,
> first, lblk - 1);
> if (f_del) {
> - __insert_pending(inode, first);
> + ret = __insert_pending(inode, first, prealloc);
> + if (ret < 0)
> + goto out;
> } else {
> last = EXT4_LBLK_CMASK(sbi, end) +
> sbi->s_cluster_ratio - 1;
> @@ -2227,9 +2274,11 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
> l_del = __es_scan_range(inode,
> &ext4_es_is_delonly,
> end + 1, last);
> - if (l_del)
> - __insert_pending(inode, last);
> - else
> + if (l_del) {
> + ret = __insert_pending(inode, last, prealloc);
> + if (ret < 0)
> + goto out;
> + } else
> __remove_pending(inode, last);
> }
> } else {
> @@ -2237,18 +2286,24 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
> if (first != lblk)
> f_del = __es_scan_range(inode, &ext4_es_is_delonly,
> first, lblk - 1);
> - if (f_del)
> - __insert_pending(inode, first);
> - else
> + if (f_del) {
> + ret = __insert_pending(inode, first, prealloc);
> + if (ret < 0)
> + goto out;
> + } else
> __remove_pending(inode, first);
>
> last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
> if (last != end)
> l_del = __es_scan_range(inode, &ext4_es_is_delonly,
> end + 1, last);
> - if (l_del)
> - __insert_pending(inode, last);
> - else
> + if (l_del) {
> + ret = __insert_pending(inode, last, prealloc);
> + if (ret < 0)
> + goto out;
> + } else
> __remove_pending(inode, last);
> }
> +out:
> + return ret;
> }
> --
> 2.39.2
>
--
Jan Kara <jack@...e.com>
SUSE Labs, CR
Powered by blists - more mailing lists