lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20250620125744.GT4037@twin.jikos.cz>
Date: Fri, 20 Jun 2025 14:57:44 +0200
From: David Sterba <dsterba@...e.cz>
To: Daniel Vacek <neelx@...e.com>
Cc: Chris Mason <clm@...com>, Josef Bacik <josef@...icpanda.com>,
	David Sterba <dsterba@...e.com>, Qu Wenruo <wqu@...e.com>,
	linux-btrfs@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2] btrfs: index buffer_tree using node size

On Thu, Jun 12, 2025 at 10:47:23AM +0200, Daniel Vacek wrote:
> So far we are deriving the buffer tree index using the sector size. But each
> extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
> 
> For example the typical and quite common configuration uses sector size of 4KiB
> and node size of 16KiB. In this case it means the buffer tree is using up to
> the maximum of 25% of its slots. Or in other words at least 75% of the tree
> slots are wasted as never used.
> 
> We can score significant memory savings on the required tree nodes by indexing
> the tree using the node size instead. As a result far less slots are wasted
> and the tree can now use up to all 100% of its slots this way.
> 
> Note: This works even with unaligned tree blocks as we can still get unique
>       index by doing eb->start >> nodesize_shift.

Can we have at least some numbers? As we've talked about it and you
showed me the number of radix nodes or other internal xarray structures
before/after.

> Signed-off-by: Daniel Vacek <neelx@...e.com>
> Reviewed-by: Qu Wenruo <wqu@...e.com>
> ---
> v2 changes:
>  * Note that this is still correct even with unaligned tree blocks.
>  * Rename node_bits to nodesize_bits to stay consistent.
>  * Move the nodesize_bits member next to nodesize and make it u32.
> 
> ---
>  fs/btrfs/disk-io.c   |  1 +
>  fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
>  fs/btrfs/fs.h        |  3 ++-
>  3 files changed, 18 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 0d6ad7512f217..3d465258f15b7 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3396,6 +3396,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
>  	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
>  
>  	fs_info->nodesize = nodesize;
> +	fs_info->nodesize_bits = ilog2(nodesize);
>  	fs_info->sectorsize = sectorsize;
>  	fs_info->sectorsize_bits = ilog2(sectorsize);
>  	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index e9ba80a56172d..a55c7c7eb8990 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1774,7 +1774,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
>  	 */
>  	spin_lock(&eb->refs_lock);
>  	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
> -		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> +		XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
>  		unsigned long flags;
>  
>  		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
> @@ -1874,7 +1874,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
>  static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
>  {
>  	struct btrfs_fs_info *fs_info = eb->fs_info;
> -	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> +	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
>  	unsigned long flags;
>  
>  	xas_lock_irqsave(&xas, flags);
> @@ -1886,7 +1886,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
>  static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
>  {
>  	struct btrfs_fs_info *fs_info = eb->fs_info;
> -	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> +	XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
>  	unsigned long flags;
>  
>  	xas_lock_irqsave(&xas, flags);
> @@ -1986,7 +1986,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
>  	rcu_read_lock();
>  	while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
>  		if (!eb_batch_add(batch, eb)) {
> -			*start = ((eb->start + eb->len) >> fs_info->sectorsize_bits);
> +			*start = (eb->start + eb->len) >> fs_info->nodesize_bits;

In other places you drop the outer ( ) from the shifts; please keep it
if it's there (or add it if it's missing).

>  			goto out;
>  		}
>  	}
> @@ -2008,7 +2008,7 @@ static struct extent_buffer *find_extent_buffer_nolock(
>  		struct btrfs_fs_info *fs_info, u64 start)
>  {
>  	struct extent_buffer *eb;
> -	unsigned long index = (start >> fs_info->sectorsize_bits);
> +	unsigned long index = start >> fs_info->nodesize_bits;
>  
>  	rcu_read_lock();
>  	eb = xa_load(&fs_info->buffer_tree, index);
> @@ -2114,8 +2114,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
>  				      u64 end)
>  {
>  	struct eb_batch batch;
> -	unsigned long start_index = (start >> fs_info->sectorsize_bits);
> -	unsigned long end_index = (end >> fs_info->sectorsize_bits);
> +	unsigned long start_index = start >> fs_info->nodesize_bits;
> +	unsigned long end_index = end >> fs_info->nodesize_bits;
>  
>  	eb_batch_init(&batch);
>  	while (start_index <= end_index) {
> @@ -2151,7 +2151,7 @@ int btree_write_cache_pages(struct address_space *mapping,
>  
>  	eb_batch_init(&batch);
>  	if (wbc->range_cyclic) {
> -		index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits);
> +		index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits;
>  		end = -1;
>  
>  		/*
> @@ -2160,8 +2160,8 @@ int btree_write_cache_pages(struct address_space *mapping,
>  		 */
>  		scanned = (index == 0);
>  	} else {
> -		index = (wbc->range_start >> fs_info->sectorsize_bits);
> -		end = (wbc->range_end >> fs_info->sectorsize_bits);
> +		index = wbc->range_start >> fs_info->nodesize_bits;
> +		end = wbc->range_end >> fs_info->nodesize_bits;
>  
>  		scanned = 1;
>  	}
> @@ -3038,7 +3038,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
>  	eb->fs_info = fs_info;
>  again:
>  	xa_lock_irq(&fs_info->buffer_tree);
> -	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
> +	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
>  			      NULL, eb, GFP_NOFS);
>  	if (xa_is_err(exists)) {
>  		ret = xa_err(exists);
> @@ -3355,7 +3355,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  again:
>  	xa_lock_irq(&fs_info->buffer_tree);
>  	existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
> -				   start >> fs_info->sectorsize_bits, NULL, eb,
> +				   start >> fs_info->nodesize_bits, NULL, eb,
>  				   GFP_NOFS);
>  	if (xa_is_err(existing_eb)) {
>  		ret = xa_err(existing_eb);
> @@ -3458,7 +3458,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
>  		 * in this case.
>  		 */
>  		xa_cmpxchg_irq(&fs_info->buffer_tree,
> -			       eb->start >> fs_info->sectorsize_bits, eb, NULL,
> +			       eb->start >> fs_info->nodesize_bits, eb, NULL,
>  			       GFP_ATOMIC);
>  
>  		btrfs_leak_debug_del_eb(eb);
> @@ -4300,9 +4300,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
>  {
>  	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
>  	struct extent_buffer *eb;
> -	unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
> +	unsigned long start = folio_pos(folio) >> fs_info->nodesize_bits;
>  	unsigned long index = start;
> -	unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
> +	unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
>  	int ret;
>  
>  	xa_lock_irq(&fs_info->buffer_tree);
> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> index b239e4b8421cf..fd7cbbe3515d6 100644
> --- a/fs/btrfs/fs.h
> +++ b/fs/btrfs/fs.h
> @@ -781,7 +781,7 @@ struct btrfs_fs_info {
>  
>  	struct btrfs_delayed_root *delayed_root;
>  
> -	/* Entries are eb->start / sectorsize */
> +	/* Entries are eb->start >> nodesize_bits */
>  	struct xarray buffer_tree;
>  
>  	/* Next backup root to be overwritten */
> @@ -813,6 +813,7 @@ struct btrfs_fs_info {
>  
>  	/* Cached block sizes */
>  	u32 nodesize;
> +	u32 nodesize_bits;
>  	u32 sectorsize;
>  	/* ilog2 of sectorsize, use to avoid 64bit division */
>  	u32 sectorsize_bits;
> -- 
> 2.47.2
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ