[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20220907184110.wu2uqs7s3hggdtj2@riteshh-domain>
Date: Thu, 8 Sep 2022 00:11:10 +0530
From: "Ritesh Harjani (IBM)" <ritesh.list@...il.com>
To: Jan Kara <jack@...e.cz>
Cc: Ted Tso <tytso@....edu>, linux-ext4@...r.kernel.org,
Thorsten Leemhuis <regressions@...mhuis.info>,
Ojaswin Mujoo <ojaswin@...ux.ibm.com>,
Stefan Wahren <stefan.wahren@...e.com>,
Andreas Dilger <adilger.kernel@...ger.ca>
Subject: Re: [PATCH 5/5] ext4: Use buckets for cr 1 block scan instead of
rbtree
On 22/09/06 05:29PM, Jan Kara wrote:
> Using rbtree for sorting groups by average fragment size is relatively
> expensive (needs rbtree update on every block freeing or allocation) and
> leads to wide spreading of allocations because selection of block group
> is very sensitive both to changes in free space and amount of blocks
> allocated. Furthermore selecting group with the best matching average
> fragment size is not necessary anyway, even more so because the
> variability of fragment sizes within a group is likely large so average
> is not telling much. We just need a group with large enough average
> fragment size so that we have high probability of finding large enough
> free extent and we don't want average fragment size to be too big so
> that we are likely to find free extent only somewhat larger than what we
> need.
>
> So instead of maintaining rbtree of groups sorted by fragment size keep
> bins (lists) of groups where average fragment size is in the interval
> [2^i, 2^(i+1)). This structure requires less updates on block allocation
> / freeing, generally avoids chaotic spreading of allocations into block
> groups, and still is able to quickly (even faster than the rbtree)
> provide a block group which is likely to have a suitably sized free
> space extent.
This makes sense because we already maintain buddy bitmaps for MB_NUM_ORDERS
orders. Hence the data structure that keeps the lists of groups, keyed by
their average fragment size, can be bounded to MB_NUM_ORDERS lists.
This also gives amortized O(1) search time for finding the right group
in the CR1 search.
>
> This patch reduces number of block groups used when untarring archive
> with medium sized files (size somewhat above 64k which is default
> mballoc limit for avoiding locality group preallocation) to about half
> and thus improves write speeds for eMMC flash significantly.
>
Indeed a nice change. It is more in line with how we maintain the
sbi->s_mb_largest_free_orders lists.
As you already noted, there are a few minor checkpatch errors;
other than that, I have one small query below.
> Signed-off-by: Jan Kara <jack@...e.cz>
> ---
> fs/ext4/ext4.h | 10 +-
> fs/ext4/mballoc.c | 252 +++++++++++++++++++---------------------------
> fs/ext4/mballoc.h | 1 -
> 3 files changed, 110 insertions(+), 153 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 9bca5565547b..3bf9a6926798 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -167,8 +167,6 @@ enum SHIFT_DIRECTION {
> #define EXT4_MB_CR0_OPTIMIZED 0x8000
> /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
> #define EXT4_MB_CR1_OPTIMIZED 0x00010000
> -/* Perform linear traversal for one group */
> -#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000
> struct ext4_allocation_request {
> /* target inode for block we're allocating */
> struct inode *inode;
> @@ -1600,8 +1598,8 @@ struct ext4_sb_info {
> struct list_head s_discard_list;
> struct work_struct s_discard_work;
> atomic_t s_retry_alloc_pending;
> - struct rb_root s_mb_avg_fragment_size_root;
> - rwlock_t s_mb_rb_lock;
> + struct list_head *s_mb_avg_fragment_size;
> + rwlock_t *s_mb_avg_fragment_size_locks;
> struct list_head *s_mb_largest_free_orders;
> rwlock_t *s_mb_largest_free_orders_locks;
>
> @@ -3413,6 +3411,8 @@ struct ext4_group_info {
> ext4_grpblk_t bb_first_free; /* first free block */
> ext4_grpblk_t bb_free; /* total free blocks */
> ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
> + int bb_avg_fragment_size_order; /* order of average
> + fragment in BG */
> ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
> ext4_group_t bb_group; /* Group number */
> struct list_head bb_prealloc_list;
> @@ -3420,7 +3420,7 @@ struct ext4_group_info {
> void *bb_bitmap;
> #endif
> struct rw_semaphore alloc_sem;
> - struct rb_node bb_avg_fragment_size_rb;
> + struct list_head bb_avg_fragment_size_node;
> struct list_head bb_largest_free_order_node;
> ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
> * regions, index is order.
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index af1e49c3603f..213d2d0750dd 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -140,13 +140,15 @@
> * number of buddy bitmap orders possible) number of lists. Group-infos are
> * placed in appropriate lists.
> *
> - * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root)
> + * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
> *
> - * Locking: sbi->s_mb_rb_lock (rwlock)
> + * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
> *
> - * This is a red black tree consisting of group infos and the tree is sorted
> - * by average fragment sizes (which is calculated as ext4_group_info->bb_free
> - * / ext4_group_info->bb_fragments).
> + * This is an array of lists where in the i-th list there are groups with
> + * average fragment size >= 2^i and < 2^(i+1). The average fragment size
> + * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
> + * Note that we don't bother with a special list for completely empty groups
> + * so we only have MB_NUM_ORDERS(sb) lists.
> *
> * When "mb_optimize_scan" mount option is set, mballoc consults the above data
> * structures to decide the order in which groups are to be traversed for
> @@ -160,7 +162,8 @@
> *
> * At CR = 1, we only consider groups where average fragment size > request
> * size. So, we lookup a group which has average fragment size just above or
> - * equal to request size using our rb tree (data structure 2) in O(log N) time.
> + * equal to request size using our average fragment size group lists (data
> + * structure 2) in O(1) time.
> *
> * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
> * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
> @@ -802,65 +805,51 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
> }
> }
>
> -static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new,
> - int (*cmp)(struct rb_node *, struct rb_node *))
> +static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
> {
> - struct rb_node **iter = &root->rb_node, *parent = NULL;
> + int order;
>
> - while (*iter) {
> - parent = *iter;
> - if (cmp(new, *iter) > 0)
> - iter = &((*iter)->rb_left);
> - else
> - iter = &((*iter)->rb_right);
> - }
> -
> - rb_link_node(new, parent, iter);
> - rb_insert_color(new, root);
> -}
> -
> -static int
> -ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2)
> -{
> - struct ext4_group_info *grp1 = rb_entry(rb1,
> - struct ext4_group_info,
> - bb_avg_fragment_size_rb);
> - struct ext4_group_info *grp2 = rb_entry(rb2,
> - struct ext4_group_info,
> - bb_avg_fragment_size_rb);
> - int num_frags_1, num_frags_2;
> -
> - num_frags_1 = grp1->bb_fragments ?
> - grp1->bb_free / grp1->bb_fragments : 0;
> - num_frags_2 = grp2->bb_fragments ?
> - grp2->bb_free / grp2->bb_fragments : 0;
> -
> - return (num_frags_2 - num_frags_1);
> + /*
> + * We don't bother with a special lists groups with only 1 block free
> + * extents and for completely empty groups.
> + */
> + order = fls(len) - 2;
> + if (order < 0)
> + return 0;
> + if (order == MB_NUM_ORDERS(sb))
> + order--;
> + return order;
> }
>
> -/*
> - * Reinsert grpinfo into the avg_fragment_size tree with new average
> - * fragment size.
> - */
> +/* Move group to appropriate avg_fragment_size list */
> static void
> mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
> {
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> + int new_order;
>
> if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
> return;
>
> - write_lock(&sbi->s_mb_rb_lock);
> - if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) {
> - rb_erase(&grp->bb_avg_fragment_size_rb,
> - &sbi->s_mb_avg_fragment_size_root);
> - RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb);
> - }
> + new_order = mb_avg_fragment_size_order(sb,
> + grp->bb_free / grp->bb_fragments);
The previous rbtree code always checked whether grp->bb_fragments was 0
before dividing bb_free by it. Can grp->bb_fragments be 0 here? If so, this
would be a division by zero.
-ritesh
Powered by blists - more mailing lists