lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <gvy6graln2rfpkaa3ms7lx7wlzzpjruzvefagsnnf3vcdfoxq7@dlsvj2gw6huk>
Date: Fri, 27 Jun 2025 21:11:32 +0200
From: Jan Kara <jack@...e.cz>
To: Baokun Li <libaokun1@...wei.com>
Cc: linux-ext4@...r.kernel.org, tytso@....edu, jack@...e.cz, 
	adilger.kernel@...ger.ca, ojaswin@...ux.ibm.com, linux-kernel@...r.kernel.org, 
	yi.zhang@...wei.com, yangerkun@...wei.com
Subject: Re: [PATCH v2 08/16] ext4: merge freed extent with existing extents
 before insertion

On Mon 23-06-25 15:32:56, Baokun Li wrote:
> Attempt to merge ext4_free_data with already inserted free extents prior
> to adding new ones. This strategy drastically cuts down the number of
> times locks are held.
> 
> For example, if prev, new, and next extents are all mergeable, the existing
> code (before this patch) requires acquiring the s_md_lock three times:
> 
>   prev merge into new and free prev // hold lock
>   next merge into new and free next // hold lock
>   insert new // hold lock
> 
> After the patch, it only needs to be acquired once:
> 
>   new merge next and free new // no lock
>   next merge into prev and free prev // hold lock
> 
> Performance test data follows:
> 
> Test: Running will-it-scale/fallocate2 on CPU-bound containers.
> Observation: Average fallocate operations per container per second.
> 
>                    | Kunpeng 920 / 512GB -P80|  AMD 9654 / 1536GB -P96 |
>  Disk: 960GB SSD   |-------------------------|-------------------------|
>                    | base  |    patched      | base  |    patched      |
> -------------------|-------|-----------------|-------|-----------------|
> mb_optimize_scan=0 | 20982 | 21157 (+0.8%)   | 50629 | 50420 (-0.4%)   |
> mb_optimize_scan=1 | 10703 | 12896 (+20.4%)  | 14856 | 17273 (+16.2%)  |
> 
> Signed-off-by: Baokun Li <libaokun1@...wei.com>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@...e.cz>

								Honza

> ---
>  fs/ext4/mballoc.c | 113 +++++++++++++++++++++++++++++++---------------
>  1 file changed, 76 insertions(+), 37 deletions(-)
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 5410fb3688ee..94950b07a577 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -6298,28 +6298,63 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
>   * are contiguous, AND the extents were freed by the same transaction,
>   * AND the blocks are associated with the same group.
>   */
> -static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
> -					struct ext4_free_data *entry,
> -					struct ext4_free_data *new_entry,
> -					struct rb_root *entry_rb_root)
> +static inline bool
> +ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1,
> +				 struct ext4_free_data *entry2)
>  {
> -	if ((entry->efd_tid != new_entry->efd_tid) ||
> -	    (entry->efd_group != new_entry->efd_group))
> -		return;
> -	if (entry->efd_start_cluster + entry->efd_count ==
> -	    new_entry->efd_start_cluster) {
> -		new_entry->efd_start_cluster = entry->efd_start_cluster;
> -		new_entry->efd_count += entry->efd_count;
> -	} else if (new_entry->efd_start_cluster + new_entry->efd_count ==
> -		   entry->efd_start_cluster) {
> -		new_entry->efd_count += entry->efd_count;
> -	} else
> -		return;
> +	if (entry1->efd_tid != entry2->efd_tid)
> +		return false;
> +	if (entry1->efd_start_cluster + entry1->efd_count !=
> +	    entry2->efd_start_cluster)
> +		return false;
> +	if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group))
> +		return false;
> +	return true;
> +}
> +
> +static inline void
> +ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root,
> +			 struct ext4_free_data *entry1,
> +			 struct ext4_free_data *entry2)
> +{
> +	entry1->efd_count += entry2->efd_count;
>  	spin_lock(&sbi->s_md_lock);
> -	list_del(&entry->efd_list);
> +	list_del(&entry2->efd_list);
>  	spin_unlock(&sbi->s_md_lock);
> -	rb_erase(&entry->efd_node, entry_rb_root);
> -	kmem_cache_free(ext4_free_data_cachep, entry);
> +	rb_erase(&entry2->efd_node, root);
> +	kmem_cache_free(ext4_free_data_cachep, entry2);
> +}
> +
> +static inline void
> +ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root,
> +				 struct ext4_free_data *entry)
> +{
> +	struct ext4_free_data *prev;
> +	struct rb_node *node;
> +
> +	node = rb_prev(&entry->efd_node);
> +	if (!node)
> +		return;
> +
> +	prev = rb_entry(node, struct ext4_free_data, efd_node);
> +	if (ext4_freed_extents_can_be_merged(prev, entry))
> +		ext4_merge_freed_extents(sbi, root, prev, entry);
> +}
> +
> +static inline void
> +ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root,
> +				 struct ext4_free_data *entry)
> +{
> +	struct ext4_free_data *next;
> +	struct rb_node *node;
> +
> +	node = rb_next(&entry->efd_node);
> +	if (!node)
> +		return;
> +
> +	next = rb_entry(node, struct ext4_free_data, efd_node);
> +	if (ext4_freed_extents_can_be_merged(entry, next))
> +		ext4_merge_freed_extents(sbi, root, entry, next);
>  }
>  
>  static noinline_for_stack void
> @@ -6329,11 +6364,12 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  	ext4_group_t group = e4b->bd_group;
>  	ext4_grpblk_t cluster;
>  	ext4_grpblk_t clusters = new_entry->efd_count;
> -	struct ext4_free_data *entry;
> +	struct ext4_free_data *entry = NULL;
>  	struct ext4_group_info *db = e4b->bd_info;
>  	struct super_block *sb = e4b->bd_sb;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> -	struct rb_node **n = &db->bb_free_root.rb_node, *node;
> +	struct rb_root *root = &db->bb_free_root;
> +	struct rb_node **n = &root->rb_node;
>  	struct rb_node *parent = NULL, *new_node;
>  
>  	BUG_ON(!ext4_handle_valid(handle));
> @@ -6369,27 +6405,30 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  		}
>  	}
>  
> -	rb_link_node(new_node, parent, n);
> -	rb_insert_color(new_node, &db->bb_free_root);
> -
> -	/* Now try to see the extent can be merged to left and right */
> -	node = rb_prev(new_node);
> -	if (node) {
> -		entry = rb_entry(node, struct ext4_free_data, efd_node);
> -		ext4_try_merge_freed_extent(sbi, entry, new_entry,
> -					    &(db->bb_free_root));
> +	atomic_add(clusters, &sbi->s_mb_free_pending);
> +	if (!entry)
> +		goto insert;
> +
> +	/* Now try to see the extent can be merged to prev and next */
> +	if (ext4_freed_extents_can_be_merged(new_entry, entry)) {
> +		entry->efd_start_cluster = cluster;
> +		entry->efd_count += new_entry->efd_count;
> +		kmem_cache_free(ext4_free_data_cachep, new_entry);
> +		ext4_try_merge_freed_extent_prev(sbi, root, entry);
> +		return;
>  	}
> -
> -	node = rb_next(new_node);
> -	if (node) {
> -		entry = rb_entry(node, struct ext4_free_data, efd_node);
> -		ext4_try_merge_freed_extent(sbi, entry, new_entry,
> -					    &(db->bb_free_root));
> +	if (ext4_freed_extents_can_be_merged(entry, new_entry)) {
> +		entry->efd_count += new_entry->efd_count;
> +		kmem_cache_free(ext4_free_data_cachep, new_entry);
> +		ext4_try_merge_freed_extent_next(sbi, root, entry);
> +		return;
>  	}
> +insert:
> +	rb_link_node(new_node, parent, n);
> +	rb_insert_color(new_node, root);
>  
>  	spin_lock(&sbi->s_md_lock);
>  	list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
> -	atomic_add(clusters, &sbi->s_mb_free_pending);
>  	spin_unlock(&sbi->s_md_lock);
>  }
>  
> -- 
> 2.46.1
> 
-- 
Jan Kara <jack@...e.com>
SUSE Labs, CR

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ