Message-Id: <C3DAF83A-CE88-4348-BCE2-237960F3CD9D@dilger.ca>
Date: Tue, 3 Feb 2026 20:53:07 -0700
From: Andreas Dilger <adilger@...ger.ca>
To: Mario Lohajner <mario_lohajner@...ketmail.com>
Cc: tytso@....edu,
linux-ext4@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH] ext4: add optional rotating block allocation policy
On Feb 3, 2026, at 20:31, Mario Lohajner <mario_lohajner@...ketmail.com> wrote:
>
> Add support for the rotalloc allocation policy as a new mount
> option. The policy rotates the starting block group for new allocations.
>
> Changes:
> - fs/ext4/ext4.h
> rotalloc policy declared, extend sb with cursor, vector & lock
>
> - fs/ext4/mballoc.h
> expose allocator functions for vectoring in super.c
>
> - fs/ext4/super.c
> parse rotalloc mount option, init cursor, lock and allocator vector
>
> - fs/ext4/mballoc.c
> add rotalloc allocator, vectored allocator call in new_blocks
>
> The policy is selected via a mount option and does not change the
> on-disk format or default allocation behavior. It preserves existing
> allocation heuristics within a block group while distributing
> allocations across block groups in a deterministic sequential manner.
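> 
> As an illustration (a simplified sketch of the mballoc.c changes
> below, not additional code), the policy boils down to a per-sb
> cursor that seeds the goal block group and is advanced to wherever
> the allocation actually landed:
> 
> 	/* seed the goal block group from the rotating cursor */
> 	spin_lock(&sbi->s_rotalloc_lock);
> 	ac->ac_g_ex.fe_group = sbi->s_rotalloc_cursor;
> 	spin_unlock(&sbi->s_rotalloc_lock);
> 
> 	/* ... regular criteria scan starting at that group ... */
> 
> 	/* on success, advance the cursor to the group we ended up in */
> 	spin_lock(&sbi->s_rotalloc_lock);
> 	sbi->s_rotalloc_cursor = ac->ac_b_ex.fe_group;
> 	spin_unlock(&sbi->s_rotalloc_lock);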
>
> The rotating allocator is implemented as a separate allocation path
> selected at mount time. This avoids conditional branches in the regular
> allocator and keeps allocation policies isolated.
> This also allows the rotating allocator to evolve independently in the
> future without increasing complexity in the regular allocator.
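> 
> Concretely (again only a sketch of the hunks below), the selection
> is a one-time assignment at mount time plus an indirect call in
> ext4_mb_new_blocks(), so the regular path gains no policy branch:
> 
> 	/* __ext4_fill_super(): choose the allocator once at mount */
> 	if (test_opt(sb, ROTALLOC))
> 		sbi->s_mb_new_blocks = ext4_mb_rotating_allocator;
> 	else
> 		sbi->s_mb_new_blocks = ext4_mb_regular_allocator;
> 
> 	/* ext4_mb_new_blocks(): call through the vector */
> 	*errp = sbi->s_mb_new_blocks(ac);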
>
> The policy was tested locally on v6.18.6 stable with the new mount
> option "rotalloc" enabled, and confirmed working as described.
Hi Mario,
Can you please provide some background/reasoning behind this allocator?
I suspect there are good reasons/workloads that could benefit from it
(e.g. flash wear leveling), but that should be stated in the commit
message, and preferably with some benchmarks/measurements that show
some benefit from adding this feature.
Cheers, Andreas
>
> Signed-off-by: Mario Lohajner <mario_lohajner@...ketmail.com>
> ---
> fs/ext4/ext4.h | 8 +++
> fs/ext4/mballoc.c | 152 ++++++++++++++++++++++++++++++++++++++++++++--
> fs/ext4/mballoc.h | 3 +
> fs/ext4/super.c | 18 +++++-
> 4 files changed, 175 insertions(+), 6 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 56112f201cac..cbbb7c05d7a2 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -229,6 +229,9 @@ struct ext4_allocation_request {
> unsigned int flags;
> };
>
> +/* forward declaration for the allocator vector argument type */
> +struct ext4_allocation_context;
> +
> /*
> * Logical to physical block mapping, used by ext4_map_blocks()
> *
> @@ -1230,6 +1233,7 @@ struct ext4_inode_info {
> * Mount flags set via mount options or defaults
> */
> #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
> +#define EXT4_MOUNT_ROTALLOC 0x00002 /* Use rotalloc policy/allocator */
> #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
> #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
> #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
> @@ -1559,6 +1563,10 @@ struct ext4_sb_info {
> unsigned long s_mount_flags;
> unsigned int s_def_mount_opt;
> unsigned int s_def_mount_opt2;
> + /* Rotalloc cursor, lock & new_blocks allocator vector */
> + unsigned int s_rotalloc_cursor;
> + spinlock_t s_rotalloc_lock;
> + int (*s_mb_new_blocks)(struct ext4_allocation_context *ac);
> ext4_fsblk_t s_sb_block;
> atomic64_t s_resv_clusters;
> kuid_t s_resuid;
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 56d50fd3310b..74f79652c674 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2314,11 +2314,11 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
> * stop the scan and use it immediately
> *
> * * If free extent found is smaller than goal, then keep retrying
> - * upto a max of sbi->s_mb_max_to_scan times (default 200). After
> + * up to a max of sbi->s_mb_max_to_scan times (default 200). After
> * that stop scanning and use whatever we have.
> *
> * * If free extent found is bigger than goal, then keep retrying
> - * upto a max of sbi->s_mb_min_to_scan times (default 10) before
> + * up to a max of sbi->s_mb_min_to_scan times (default 10) before
> * stopping the scan and using the extent.
> *
> *
> @@ -2981,7 +2981,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
> return ret;
> }
>
> -static noinline_for_stack int
> +noinline_for_stack int
> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> {
> ext4_group_t i;
> @@ -3012,7 +3012,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> * is greater than equal to the sbi_s_mb_order2_reqs
> * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
> * We also support searching for power-of-two requests only for
> - * requests upto maximum buddy size we have constructed.
> + * requests up to maximum buddy size we have constructed.
> */
> if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
> if (is_power_of_2(ac->ac_g_ex.fe_len))
> @@ -3101,6 +3101,144 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> return err;
> }
>
> +/* Rotating allocator (rotalloc mount option) */
> +noinline_for_stack int
> +ext4_mb_rotating_allocator(struct ext4_allocation_context *ac)
> +{
> + ext4_group_t i, goal;
> + int err = 0;
> + struct super_block *sb = ac->ac_sb;
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + struct ext4_buddy e4b;
> +
> + BUG_ON(ac->ac_status == AC_STATUS_FOUND);
> +
> + /* Set the goal from s_rotalloc_cursor */
> + spin_lock(&sbi->s_rotalloc_lock);
> + goal = sbi->s_rotalloc_cursor;
> + spin_unlock(&sbi->s_rotalloc_lock);
> + ac->ac_g_ex.fe_group = goal;
> +
> + /* first, try the goal */
> + err = ext4_mb_find_by_goal(ac, &e4b);
> + if (err || ac->ac_status == AC_STATUS_FOUND)
> + goto out;
> +
> + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
> + goto out;
> +
> + /*
> + * ac->ac_2order is set only if the fe_len is a power of 2
> + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
> + * so that we try exact allocation using buddy.
> + */
> + i = fls(ac->ac_g_ex.fe_len);
> + ac->ac_2order = 0;
> + /*
> + * We search using buddy data only if the order of the request
> + * is greater than or equal to sbi->s_mb_order2_reqs.
> + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
> + * We also support searching for power-of-two requests only for
> + * requests up to maximum buddy size we have constructed.
> + */
> + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
> + if (is_power_of_2(ac->ac_g_ex.fe_len))
> + ac->ac_2order = array_index_nospec(i - 1,
> + MB_NUM_ORDERS(sb));
> + }
> +
> + /* if stream allocation is enabled, use global goal */
> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
> + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
> +
> + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
> + ac->ac_g_ex.fe_start = -1;
> + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
> + }
> +
> + /*
> + * Let's just scan groups to find more or less suitable blocks. We
> + * start with CR_GOAL_LEN_FAST, unless it is power of 2
> + * aligned, in which case let's do that faster approach first.
> + */
> + ac->ac_criteria = CR_GOAL_LEN_FAST;
> + if (ac->ac_2order)
> + ac->ac_criteria = CR_POWER2_ALIGNED;
> +
> + ac->ac_e4b = &e4b;
> + ac->ac_prefetch_ios = 0;
> + ac->ac_first_err = 0;
> +
> + /* Be sure to start scanning with the goal from s_rotalloc_cursor! */
> + ac->ac_g_ex.fe_group = goal;
> +repeat:
> + while (ac->ac_criteria < EXT4_MB_NUM_CRS) {
> + err = ext4_mb_scan_groups(ac);
> + if (err)
> + goto out;
> +
> + if (ac->ac_status != AC_STATUS_CONTINUE)
> + break;
> + }
> +
> + if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
> + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
> + /*
> + * We've been searching too long. Let's try to allocate
> + * the best chunk we've found so far
> + */
> + ext4_mb_try_best_found(ac, &e4b);
> + if (ac->ac_status != AC_STATUS_FOUND) {
> + int lost;
> +
> + /*
> + * Someone more lucky has already allocated it.
> + * The only thing we can do is just take first
> + * found block(s)
> + */
> + lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
> + mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
> + ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
> + ac->ac_b_ex.fe_len, lost);
> +
> + ac->ac_b_ex.fe_group = 0;
> + ac->ac_b_ex.fe_start = 0;
> + ac->ac_b_ex.fe_len = 0;
> + ac->ac_status = AC_STATUS_CONTINUE;
> + ac->ac_flags |= EXT4_MB_HINT_FIRST;
> + ac->ac_criteria = CR_ANY_FREE;
> + goto repeat;
> + }
> + }
> +
> + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
> + atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
> + ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
> + atomic_inc(&sbi->s_bal_stream_goals);
> + }
> +out:
> + if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err)
> + err = ac->ac_first_err;
> +
> + mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
> + ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
> + ac->ac_flags, ac->ac_criteria, err);
> +
> + if (ac->ac_prefetch_nr)
> + ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr);
> +
> + if (!err) {
> + /* Finally, if no errors, set the cursor to the best group! */
> + goal = ac->ac_b_ex.fe_group;
> + spin_lock(&sbi->s_rotalloc_lock);
> + sbi->s_rotalloc_cursor = goal;
> + spin_unlock(&sbi->s_rotalloc_lock);
> + }
> +
> + return err;
> +}
> +
> static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
> {
> struct super_block *sb = pde_data(file_inode(seq->file));
> @@ -6314,7 +6452,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
> goto errout;
> repeat:
> /* allocate space in core */
> - *errp = ext4_mb_regular_allocator(ac);
> + /*
> + * Use the vectored allocator instead of the fixed
> + * ext4_mb_regular_allocator(ac) function
> + */
> + *errp = sbi->s_mb_new_blocks(ac);
> /*
> * pa allocated above is added to grp->bb_prealloc_list only
> * when we were able to allocate some block i.e. when
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index 15a049f05d04..309190ce05ae 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -270,4 +270,7 @@ ext4_mballoc_query_range(
> ext4_mballoc_query_range_fn formatter,
> void *priv);
>
> +/* Expose rotating & regular allocators for vectoring */
> +int ext4_mb_rotating_allocator(struct ext4_allocation_context *ac);
> +int ext4_mb_regular_allocator(struct ext4_allocation_context *ac);
> #endif
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 87205660c5d0..f53501bbfb4b 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1673,7 +1673,7 @@ enum {
> Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
> Opt_inode_readahead_blks, Opt_journal_ioprio,
> Opt_dioread_nolock, Opt_dioread_lock,
> - Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_rotalloc,
> Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
> Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
> Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
> @@ -1797,6 +1797,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
> fsparam_u32 ("init_itable", Opt_init_itable),
> fsparam_flag ("init_itable", Opt_init_itable),
> fsparam_flag ("noinit_itable", Opt_noinit_itable),
> + fsparam_flag ("rotalloc", Opt_rotalloc),
> #ifdef CONFIG_EXT4_DEBUG
> fsparam_flag ("fc_debug_force", Opt_fc_debug_force),
> fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay),
> @@ -1878,6 +1879,7 @@ static const struct mount_opts {
> {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
> {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
> {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
> + {Opt_rotalloc, EXT4_MOUNT_ROTALLOC, MOPT_SET},
> {Opt_dax_type, 0, MOPT_EXT4_ONLY},
> {Opt_journal_dev, 0, MOPT_NO_EXT2},
> {Opt_journal_path, 0, MOPT_NO_EXT2},
> @@ -2264,6 +2266,9 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
> ctx->s_li_wait_mult = result.uint_32;
> ctx->spec |= EXT4_SPEC_s_li_wait_mult;
> return 0;
> + case Opt_rotalloc:
> + ctx_set_mount_opt(ctx, EXT4_MOUNT_ROTALLOC);
> + return 0;
> case Opt_max_dir_size_kb:
> ctx->s_max_dir_size_kb = result.uint_32;
> ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
> @@ -5512,6 +5517,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
> }
> }
>
> + /*
> + * Initialize the rotalloc cursor and lock, and vector
> + * new_blocks to the rotating or regular allocator
> + */
> + sbi->s_rotalloc_cursor = 0;
> + spin_lock_init(&sbi->s_rotalloc_lock);
> + if (test_opt(sb, ROTALLOC))
> + sbi->s_mb_new_blocks = ext4_mb_rotating_allocator;
> + else
> + sbi->s_mb_new_blocks = ext4_mb_regular_allocator;
> +
> /*
> * Get the # of file system overhead blocks from the
> * superblock if present.
> --
> 2.52.0
>