lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230830132503.6xxgb4g7xi7n6lbr@quack3>
Date:   Wed, 30 Aug 2023 15:25:03 +0200
From:   Jan Kara <jack@...e.cz>
To:     Zhang Yi <yi.zhang@...weicloud.com>
Cc:     linux-ext4@...r.kernel.org, tytso@....edu,
        adilger.kernel@...ger.ca, jack@...e.cz, yi.zhang@...wei.com,
        chengzhihao1@...wei.com, yukuai3@...wei.com
Subject: Re: [RFC PATCH 02/16] ext4: make sure allocate pending entry not fail

On Thu 24-08-23 17:26:05, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@...wei.com>
> 
> __insert_pending() allocates memory in atomic context, so the allocation
> could fail, but we are not handling that failure now. It could lead
> ext4_es_remove_extent() to get the wrong number of reserved clusters, and
> the global data blocks reservation count will be incorrect. As with the
> extents_status entry preallocation, preallocate the pending entry outside
> of i_es_lock with __GFP_NOFAIL to make sure __insert_pending() and
> __revise_pending() always succeed.
> 
> Signed-off-by: Zhang Yi <yi.zhang@...wei.com>

Looks sensible. Feel free to add:

Reviewed-by: Jan Kara <jack@...e.cz>

								Honza

> ---
>  fs/ext4/extents_status.c | 123 ++++++++++++++++++++++++++++-----------
>  1 file changed, 89 insertions(+), 34 deletions(-)
> 
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index 5e625ea4545d..f4b50652f0cc 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -152,8 +152,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>  static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
>  static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
>  		       struct ext4_inode_info *locked_ei);
> -static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
> -			     ext4_lblk_t len);
> +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
> +			    ext4_lblk_t len,
> +			    struct pending_reservation **prealloc);
>  
>  int __init ext4_init_es(void)
>  {
> @@ -448,6 +449,19 @@ static void ext4_es_list_del(struct inode *inode)
>  	spin_unlock(&sbi->s_es_lock);
>  }
>  
> +static inline struct pending_reservation *__alloc_pending(bool nofail)
> +{
> +	if (!nofail)
> +		return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
> +
> +	return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL);
> +}
> +
> +static inline void __free_pending(struct pending_reservation *pr)
> +{
> +	kmem_cache_free(ext4_pending_cachep, pr);
> +}
> +
>  /*
>   * Returns true if we cannot fail to allocate memory for this extent_status
>   * entry and cannot reclaim it until its status changes.
> @@ -836,11 +850,12 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  {
>  	struct extent_status newes;
>  	ext4_lblk_t end = lblk + len - 1;
> -	int err1 = 0;
> -	int err2 = 0;
> +	int err1 = 0, err2 = 0, err3 = 0;
>  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>  	struct extent_status *es1 = NULL;
>  	struct extent_status *es2 = NULL;
> +	struct pending_reservation *pr = NULL;
> +	bool revise_pending = false;
>  
>  	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
>  		return;
> @@ -868,11 +883,17 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  
>  	ext4_es_insert_extent_check(inode, &newes);
>  
> +	revise_pending = sbi->s_cluster_ratio > 1 &&
> +			 test_opt(inode->i_sb, DELALLOC) &&
> +			 (status & (EXTENT_STATUS_WRITTEN |
> +				    EXTENT_STATUS_UNWRITTEN));
>  retry:
>  	if (err1 && !es1)
>  		es1 = __es_alloc_extent(true);
>  	if ((err1 || err2) && !es2)
>  		es2 = __es_alloc_extent(true);
> +	if ((err1 || err2 || err3) && revise_pending && !pr)
> +		pr = __alloc_pending(true);
>  	write_lock(&EXT4_I(inode)->i_es_lock);
>  
>  	err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
> @@ -897,13 +918,18 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  		es2 = NULL;
>  	}
>  
> -	if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
> -	    (status & EXTENT_STATUS_WRITTEN ||
> -	     status & EXTENT_STATUS_UNWRITTEN))
> -		__revise_pending(inode, lblk, len);
> +	if (revise_pending) {
> +		err3 = __revise_pending(inode, lblk, len, &pr);
> +		if (err3 != 0)
> +			goto error;
> +		if (pr) {
> +			__free_pending(pr);
> +			pr = NULL;
> +		}
> +	}
>  error:
>  	write_unlock(&EXT4_I(inode)->i_es_lock);
> -	if (err1 || err2)
> +	if (err1 || err2 || err3)
>  		goto retry;
>  
>  	ext4_es_print_tree(inode);
> @@ -1311,7 +1337,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
>  				rc->ndelonly--;
>  				node = rb_next(&pr->rb_node);
>  				rb_erase(&pr->rb_node, &tree->root);
> -				kmem_cache_free(ext4_pending_cachep, pr);
> +				__free_pending(pr);
>  				if (!node)
>  					break;
>  				pr = rb_entry(node, struct pending_reservation,
> @@ -1907,11 +1933,13 @@ static struct pending_reservation *__get_pending(struct inode *inode,
>   *
>   * @inode - file containing the cluster
>   * @lblk - logical block in the cluster to be added
> + * @prealloc - preallocated pending entry
>   *
>   * Returns 0 on successful insertion and -ENOMEM on failure.  If the
>   * pending reservation is already in the set, returns successfully.
>   */
> -static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
> +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
> +			    struct pending_reservation **prealloc)
>  {
>  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>  	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
> @@ -1937,10 +1965,15 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
>  		}
>  	}
>  
> -	pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
> -	if (pr == NULL) {
> -		ret = -ENOMEM;
> -		goto out;
> +	if (likely(*prealloc == NULL)) {
> +		pr = __alloc_pending(false);
> +		if (!pr) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +	} else {
> +		pr = *prealloc;
> +		*prealloc = NULL;
>  	}
>  	pr->lclu = lclu;
>  
> @@ -1970,7 +2003,7 @@ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
>  	if (pr != NULL) {
>  		tree = &EXT4_I(inode)->i_pending_tree;
>  		rb_erase(&pr->rb_node, &tree->root);
> -		kmem_cache_free(ext4_pending_cachep, pr);
> +		__free_pending(pr);
>  	}
>  }
>  
> @@ -2029,10 +2062,10 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
>  				  bool allocated)
>  {
>  	struct extent_status newes;
> -	int err1 = 0;
> -	int err2 = 0;
> +	int err1 = 0, err2 = 0, err3 = 0;
>  	struct extent_status *es1 = NULL;
>  	struct extent_status *es2 = NULL;
> +	struct pending_reservation *pr = NULL;
>  
>  	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
>  		return;
> @@ -2052,6 +2085,8 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
>  		es1 = __es_alloc_extent(true);
>  	if ((err1 || err2) && !es2)
>  		es2 = __es_alloc_extent(true);
> +	if ((err1 || err2 || err3) && allocated && !pr)
> +		pr = __alloc_pending(true);
>  	write_lock(&EXT4_I(inode)->i_es_lock);
>  
>  	err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
> @@ -2074,11 +2109,18 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
>  		es2 = NULL;
>  	}
>  
> -	if (allocated)
> -		__insert_pending(inode, lblk);
> +	if (allocated) {
> +		err3 = __insert_pending(inode, lblk, &pr);
> +		if (err3 != 0)
> +			goto error;
> +		if (pr) {
> +			__free_pending(pr);
> +			pr = NULL;
> +		}
> +	}
>  error:
>  	write_unlock(&EXT4_I(inode)->i_es_lock);
> -	if (err1 || err2)
> +	if (err1 || err2 || err3)
>  		goto retry;
>  
>  	ext4_es_print_tree(inode);
> @@ -2184,21 +2226,24 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
>   * @inode - file containing the range
>   * @lblk - logical block defining the start of range
>   * @len  - length of range in blocks
> + * @prealloc - preallocated pending entry
>   *
>   * Used after a newly allocated extent is added to the extents status tree.
>   * Requires that the extents in the range have either written or unwritten
>   * status.  Must be called while holding i_es_lock.
>   */
> -static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
> -			     ext4_lblk_t len)
> +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
> +			    ext4_lblk_t len,
> +			    struct pending_reservation **prealloc)
>  {
>  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>  	ext4_lblk_t end = lblk + len - 1;
>  	ext4_lblk_t first, last;
>  	bool f_del = false, l_del = false;
> +	int ret = 0;
>  
>  	if (len == 0)
> -		return;
> +		return 0;
>  
>  	/*
>  	 * Two cases - block range within single cluster and block range
> @@ -2219,7 +2264,9 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
>  			f_del = __es_scan_range(inode, &ext4_es_is_delonly,
>  						first, lblk - 1);
>  		if (f_del) {
> -			__insert_pending(inode, first);
> +			ret = __insert_pending(inode, first, prealloc);
> +			if (ret < 0)
> +				goto out;
>  		} else {
>  			last = EXT4_LBLK_CMASK(sbi, end) +
>  			       sbi->s_cluster_ratio - 1;
> @@ -2227,9 +2274,11 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
>  				l_del = __es_scan_range(inode,
>  							&ext4_es_is_delonly,
>  							end + 1, last);
> -			if (l_del)
> -				__insert_pending(inode, last);
> -			else
> +			if (l_del) {
> +				ret = __insert_pending(inode, last, prealloc);
> +				if (ret < 0)
> +					goto out;
> +			} else
>  				__remove_pending(inode, last);
>  		}
>  	} else {
> @@ -2237,18 +2286,24 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
>  		if (first != lblk)
>  			f_del = __es_scan_range(inode, &ext4_es_is_delonly,
>  						first, lblk - 1);
> -		if (f_del)
> -			__insert_pending(inode, first);
> -		else
> +		if (f_del) {
> +			ret = __insert_pending(inode, first, prealloc);
> +			if (ret < 0)
> +				goto out;
> +		} else
>  			__remove_pending(inode, first);
>  
>  		last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
>  		if (last != end)
>  			l_del = __es_scan_range(inode, &ext4_es_is_delonly,
>  						end + 1, last);
> -		if (l_del)
> -			__insert_pending(inode, last);
> -		else
> +		if (l_del) {
> +			ret = __insert_pending(inode, last, prealloc);
> +			if (ret < 0)
> +				goto out;
> +		} else
>  			__remove_pending(inode, last);
>  	}
> +out:
> +	return ret;
>  }
> -- 
> 2.39.2
> 
-- 
Jan Kara <jack@...e.com>
SUSE Labs, CR

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ