[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4b29d4b8-c823-44d0-8647-a53a09e20b5f@gmx.com>
Date: Tue, 13 May 2025 08:14:03 +0930
From: Qu Wenruo <quwenruo.btrfs@....com>
To: Daniel Vacek <neelx@...e.com>, Chris Mason <clm@...com>,
Josef Bacik <josef@...icpanda.com>, David Sterba <dsterba@...e.com>
Cc: linux-btrfs@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] btrfs: index buffer_tree using node size
在 2025/5/13 02:53, Daniel Vacek 写道:
> So far we are deriving the buffer tree index using the sector size. But each
> extent buffer covers multiple sectors. This makes the buffer tree rather sparse.
>
> For example the typical and quite common configuration uses sector size of 4KiB
> and node size of 16KiB. In this case it means the buffer tree is using up to
> the maximum of 25% of its slots. Or in other words, at least 75% of the tree
> slots are wasted as they are never used.
>
> We can achieve significant memory savings on the required tree nodes by indexing
> the tree using the node size instead. As a result, far fewer slots are wasted
> and the tree can now use up to 100% of its slots this way.
>
> Signed-off-by: Daniel Vacek <neelx@...e.com>
I'm fine with this change, but I still believe we should address
unaligned tree blocks before doing this,
as this change requires all tree blocks to be nodesize aligned.
If we have some metadata chunks which start at a bytenr that is not
nodesize aligned, all tree blocks inside those chunks will not be
nodesize aligned, causing problems.
Thanks,
Qu
> ---
> fs/btrfs/disk-io.c | 1 +
> fs/btrfs/extent_io.c | 30 +++++++++++++++---------------
> fs/btrfs/fs.h | 3 ++-
> 3 files changed, 18 insertions(+), 16 deletions(-)
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 5bcf11246ba66..dcea5b0a2db50 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3395,6 +3395,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
> fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
>
> fs_info->nodesize = nodesize;
> + fs_info->node_bits = ilog2(nodesize);
> fs_info->sectorsize = sectorsize;
> fs_info->sectorsize_bits = ilog2(sectorsize);
> fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 4d3584790cf7f..80a8563a25add 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1774,7 +1774,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
> */
> spin_lock(&eb->refs_lock);
> if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
> - XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->node_bits);
> unsigned long flags;
>
> set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
> @@ -1874,7 +1874,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
> static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
> {
> struct btrfs_fs_info *fs_info = eb->fs_info;
> - XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->node_bits);
> unsigned long flags;
>
> xas_lock_irqsave(&xas, flags);
> @@ -1886,7 +1886,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
> static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
> {
> struct btrfs_fs_info *fs_info = eb->fs_info;
> - XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
> + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->node_bits);
> unsigned long flags;
>
> xas_lock_irqsave(&xas, flags);
> @@ -1986,7 +1986,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
> rcu_read_lock();
> while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
> if (!eb_batch_add(batch, eb)) {
> - *start = (eb->start + eb->len) >> fs_info->sectorsize_bits;
> + *start = (eb->start + eb->len) >> fs_info->node_bits;
> goto out;
> }
> }
> @@ -2008,7 +2008,7 @@ static struct extent_buffer *find_extent_buffer_nolock(
> struct btrfs_fs_info *fs_info, u64 start)
> {
> struct extent_buffer *eb;
> - unsigned long index = start >> fs_info->sectorsize_bits;
> + unsigned long index = start >> fs_info->node_bits;
>
> rcu_read_lock();
> eb = xa_load(&fs_info->buffer_tree, index);
> @@ -2114,8 +2114,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
> u64 end)
> {
> struct eb_batch batch;
> - unsigned long start_index = start >> fs_info->sectorsize_bits;
> - unsigned long end_index = end >> fs_info->sectorsize_bits;
> + unsigned long start_index = start >> fs_info->node_bits;
> + unsigned long end_index = end >> fs_info->node_bits;
>
> eb_batch_init(&batch);
> while (start_index <= end_index) {
> @@ -2151,7 +2151,7 @@ int btree_write_cache_pages(struct address_space *mapping,
>
> eb_batch_init(&batch);
> if (wbc->range_cyclic) {
> - index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits;
> + index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->node_bits;
> end = -1;
>
> /*
> @@ -2160,8 +2160,8 @@ int btree_write_cache_pages(struct address_space *mapping,
> */
> scanned = (index == 0);
> } else {
> - index = wbc->range_start >> fs_info->sectorsize_bits;
> - end = wbc->range_end >> fs_info->sectorsize_bits;
> + index = wbc->range_start >> fs_info->node_bits;
> + end = wbc->range_end >> fs_info->node_bits;
>
> scanned = 1;
> }
> @@ -3037,7 +3037,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
> eb->fs_info = fs_info;
> again:
> xa_lock_irq(&fs_info->buffer_tree);
> - exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
> + exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->node_bits,
> NULL, eb, GFP_NOFS);
> if (xa_is_err(exists)) {
> ret = xa_err(exists);
> @@ -3353,7 +3353,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
> again:
> xa_lock_irq(&fs_info->buffer_tree);
> existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
> - start >> fs_info->sectorsize_bits, NULL, eb,
> + start >> fs_info->node_bits, NULL, eb,
> GFP_NOFS);
> if (xa_is_err(existing_eb)) {
> ret = xa_err(existing_eb);
> @@ -3456,7 +3456,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
> * in this case.
> */
> xa_cmpxchg_irq(&fs_info->buffer_tree,
> - eb->start >> fs_info->sectorsize_bits, eb, NULL,
> + eb->start >> fs_info->node_bits, eb, NULL,
> GFP_ATOMIC);
>
> btrfs_leak_debug_del_eb(eb);
> @@ -4294,9 +4294,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
> {
> struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
> struct extent_buffer *eb;
> - unsigned long start = folio_pos(folio) >> fs_info->sectorsize_bits;
> + unsigned long start = folio_pos(folio) >> fs_info->node_bits;
> unsigned long index = start;
> - unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
> + unsigned long end = index + (PAGE_SIZE >> fs_info->node_bits) - 1;
> int ret;
>
> xa_lock_irq(&fs_info->buffer_tree);
> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> index cf805b4032af3..8c9113304fabe 100644
> --- a/fs/btrfs/fs.h
> +++ b/fs/btrfs/fs.h
> @@ -778,8 +778,9 @@ struct btrfs_fs_info {
>
> struct btrfs_delayed_root *delayed_root;
>
> - /* Entries are eb->start / sectorsize */
> + /* Entries are eb->start >> node_bits */
> struct xarray buffer_tree;
> + int node_bits;
>
> /* Next backup root to be overwritten */
> int backup_root_index;
Powered by blists - more mailing lists