lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <9BA9D25F-A282-4998-9B53-03EDCD0D7C25@dilger.ca>
Date: Thu, 29 May 2025 15:44:26 -0600
From: Andreas Dilger <adilger@...ger.ca>
To: David Wang <00107082@....com>
Cc: Theodore Ts'o <tytso@....edu>,
 Ext4 Developers List <linux-ext4@...r.kernel.org>,
 LKML <linux-kernel@...r.kernel.org>
Subject: Re: [RFC] ext4: use kmem_cache for short fname allocation in readdir

On May 29, 2025, at 8:42 AM, David Wang <00107082@....com> wrote:
> 
> When searching files, ext4_readdir would kzalloc() a fname
> object for each entry. It would be faster if a dedicated
> kmem_cache is used for fname.
> 
> But fnames are of variable length.
> 
> This patch suggests using kmem_cache for fname with short
> length, and resorting to kzalloc when fname needs larger buffer.
> Assuming long file names are not very common.

It may be reasonable to have a cache, but I suspect that 127 bytes
is too large for most common workloads.  Statistics that I've seen
show 99-percentile filename length is <= 48 bytes on most filesystems,
so allocating 128 bytes is probably sub-optimal (most is wasted).

That said, kmalloc() is mostly a wrapper for the kmalloc-* slabs, so
creating a 128-byte slab for this doesn't seem super useful?

> Profiling when searching files in kernel code base, with following
> command:
> 	# perf record -g -e cpu-clock --freq=max bash -c \
> 	"for i in {1..100}; do find ./linux -name notfoundatall > /dev/null; done"
> And using sample counts as indicator of performance improvement.
> (The faster, the less samples collected. And the tests are carried out
> when system is under no memory pressure.)
> 
> Before without the change:
> 	1232868--ext4_readdir
> 		 |
> 		 |--839085--ext4_htree_fill_tree
> 		 |          |
> 		 |           --829223--htree_dirblock_to_tree
> 		 |                     |
> 		 |                     |--365869--ext4_htree_store_dirent
> 		 |                     |          |
> 		 |                     |          |--43169--0xffffffffa7f8d094
> 		 |                     |          |
> 		 |                     |           --21947--0xffffffffa7f8d0f7
> 		 |                     |
> 		 |                     |--213124--ext4fs_dirhash
> 		 |                     |          |
> 		 |                     |           --86339--str2hashbuf_signed
> 		 |                     |
> 		 |                     |--145839--__ext4_read_dirblock
> 
> and with the change, ~3% less samples:
> 	1202922--ext4_readdir

> 		 |
> 		 |--805105--ext4_htree_fill_tree
> 		 |          |
> 		 |           --795055--htree_dirblock_to_tree
> 		 |                     |
> 		 |                     |--328876--ext4_htree_store_dirent
> 		 |                     |          |
> 		 |                     |          |--123207--kmem_cache_alloc_noprof
> 		 |                     |          |          |
> 		 |                     |          |          |--26453--__alloc_tagging_slab_alloc_hook
> 		 |                     |          |          |
> 		 |                     |          |           --20413--__slab_alloc.isra.0
> 		 |                     |          |
> 		 |                     |           --31566--rb_insert_color
> 		 |                     |
> 		 |                     |--212915--ext4fs_dirhash
> 		 |                     |          |
> 		 |                     |           --86004--str2hashbuf_signed
> 		 |                     |
> 		 |                     |--149146--__ext4_read_dirblock
> 
> readdir() would have sigfinicant improvement, but the overall
> improvements for searching files is only ~0.5%, might be more
> sigfinicant if the system is under some memory pressures.
> 
> The slab stats after the test:
> ext4_dir_fname      1242   1242    176   23    1 : tunables    0    0    0 : slabdata     54     54      0
> 
> Signed-off-by: David Wang <00107082@....com>
> ---
> fs/ext4/dir.c   | 47 ++++++++++++++++++++++++++++++++++++++++++++---
> fs/ext4/ext4.h  |  4 ++++
> fs/ext4/super.c |  3 +++
> 3 files changed, 51 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
> index d4164c507a90..3adfa0d038cd 100644
> --- a/fs/ext4/dir.c
> +++ b/fs/ext4/dir.c
> @@ -424,6 +424,48 @@ struct fname {
> 	char		name[] __counted_by(name_len);
> };
> 
> +#define EXT4_DIR_FNAME_SHORT_LENGTH 127
> +static struct kmem_cache *ext4_dir_fname_cachep;
> +
> +void __init ext4_init_dir(void)
> +{
> +	ext4_dir_fname_cachep = kmem_cache_create_usercopy("ext4_dir_fname",
> +			struct_size_t(struct fname, name,  EXT4_DIR_FNAME_SHORT_LENGTH + 1),
> +			0, SLAB_RECLAIM_ACCOUNT,
> +			offsetof(struct fname, name), EXT4_DIR_FNAME_SHORT_LENGTH + 1,
> +			NULL);
> +}
> +
> +void ext4_exit_dir(void)
> +{
> +	if (ext4_dir_fname_cachep)
> +		kmem_cache_destroy(ext4_dir_fname_cachep);
> +}
> +
> +static struct fname *rb_node_fname_zalloc(__u8 name_len)
> +{
> +	struct fname *p;
> +	if (ext4_dir_fname_cachep && name_len <= EXT4_DIR_FNAME_SHORT_LENGTH)
> +		p = kmem_cache_alloc(ext4_dir_fname_cachep, GFP_KERNEL);
> +	else
> +		p = kmalloc(struct_size(p, name, name_len + 1), GFP_KERNEL);
> +	if (p) {
> +		/* no need to fill name with zeroes*/
> +		memset(p, 0, offsetof(struct fname, name));
> +		p->name[name_len] = 0;
> +	}
> +	return p;
> +}
> +
> +static void rb_node_fname_free(struct fname *p) {
> +	if (!p)
> +		return;
> +	if (ext4_dir_fname_cachep && p->name_len <= EXT4_DIR_FNAME_SHORT_LENGTH)
> +		kmem_cache_free(ext4_dir_fname_cachep, p);
> +	else
> +		kfree(p);
> +}
> +
> /*
>  * This function implements a non-recursive way of freeing all of the
>  * nodes in the red-black tree.
> @@ -436,7 +478,7 @@ static void free_rb_tree_fname(struct rb_root *root)
> 		while (fname) {
> 			struct fname *old = fname;
> 			fname = fname->next;
> -			kfree(old);
> +			rb_node_fname_free(old);
> 		}
> 
> 	*root = RB_ROOT;
> @@ -479,8 +521,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
> 	p = &info->root.rb_node;
> 
> 	/* Create and allocate the fname structure */
> -	new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1),
> -			 GFP_KERNEL);
> +	new_fn = rb_node_fname_zalloc(ent_name->len);
> 	if (!new_fn)
> 		return -ENOMEM;
> 	new_fn->hash = hash;
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 5a20e9cd7184..33ab97143000 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3795,6 +3795,10 @@ extern void ext4_orphan_file_block_trigger(
> 				struct buffer_head *bh,
> 				void *data, size_t size);
> 
> +/* dir.c */
> +extern void __init ext4_init_dir(void);
> +extern void ext4_exit_dir(void);
> +
> /*
>  * Add new method to test whether block and inode bitmaps are properly
>  * initialized. With uninit_bg reading the block from disk is not enough
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 181934499624..21ce3d78912a 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -7457,6 +7457,8 @@ static int __init ext4_init_fs(void)
> 	if (err)
> 		goto out;
> 
> +	ext4_init_dir();
> +
> 	return 0;
> out:
> 	unregister_as_ext2();
> @@ -7497,6 +7499,7 @@ static void __exit ext4_exit_fs(void)
> 	ext4_exit_post_read_processing();
> 	ext4_exit_es();
> 	ext4_exit_pending();
> +	ext4_exit_dir();
> }
> 
> MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
> --
> 2.39.2
> 


Cheers, Andreas






Download attachment "signature.asc" of type "application/pgp-signature" (874 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ