[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <c6a3faa7-299a-4f10-981d-693cdf55b930@huawei.com>
Date: Wed, 4 Feb 2026 14:29:08 +0800
From: Baokun Li <libaokun1@...wei.com>
To: Mario Lohajner <mario_lohajner@...ketmail.com>, <tytso@....edu>
CC: <adilger.kernel@...ger.ca>, <linux-ext4@...r.kernel.org>,
<linux-kernel@...r.kernel.org>, Yang Erkun <yangerkun@...wei.com>,
<libaokun9@...il.com>, Baokun Li <libaokun1@...wei.com>
Subject: Re: [PATCH] ext4: add optional rotating block allocation policy
On 2026-02-04 11:31, Mario Lohajner wrote:
> Add support for the rotalloc allocation policy as a new mount
> option. The policy rotates the starting block group for new allocations.
>
> Changes:
> - fs/ext4/ext4.h
> rotalloc policy declared, extend sb with cursor, vector & lock
>
> - fs/ext4/mballoc.h
> expose allocator functions for vectoring in super.c
>
> - fs/ext4/super.c
> parse rotalloc mnt opt, init cursor, lock and allocator vector
>
> - fs/ext4/mballoc.c
> add rotalloc allocator, vectored allocator call in new_blocks
>
> The policy is selected via a mount option and does not change the
> on-disk format or default allocation behavior. It preserves existing
> allocation heuristics within a block group while distributing
> allocations across block groups in a deterministic sequential manner.
>
> The rotating allocator is implemented as a separate allocation path
> selected at mount time. This avoids conditional branches in the regular
> allocator and keeps allocation policies isolated.
> This also allows the rotating allocator to evolve independently in the
> future without increasing complexity in the regular allocator.
>
> The policy was tested using v6.18.6 stable locally with the new mount
> option "rotalloc" enabled, and confirmed working as described!
>
> Signed-off-by: Mario Lohajner <mario_lohajner@...ketmail.com>
> ---
> fs/ext4/ext4.h | 8 +++
> fs/ext4/mballoc.c | 152 ++++++++++++++++++++++++++++++++++++++++++++--
> fs/ext4/mballoc.h | 3 +
> fs/ext4/super.c | 18 +++++-
> 4 files changed, 175 insertions(+), 6 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 56112f201cac..cbbb7c05d7a2 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -229,6 +229,9 @@ struct ext4_allocation_request {
> unsigned int flags;
> };
>
> +/* expose rotalloc allocator argument pointer type */
> +struct ext4_allocation_context;
> +
> /*
> * Logical to physical block mapping, used by ext4_map_blocks()
> *
> @@ -1230,6 +1233,7 @@ struct ext4_inode_info {
> * Mount flags set via mount options or defaults
> */
> #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
> +#define EXT4_MOUNT_ROTALLOC 0x00002 /* Use rotalloc policy/allocator */
> #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
> #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
> #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
> @@ -1559,6 +1563,10 @@ struct ext4_sb_info {
> unsigned long s_mount_flags;
> unsigned int s_def_mount_opt;
> unsigned int s_def_mount_opt2;
> + /* Rotalloc cursor, lock & new_blocks allocator vector */
> + unsigned int s_rotalloc_cursor;
> + spinlock_t s_rotalloc_lock;
> + int (*s_mb_new_blocks)(struct ext4_allocation_context *ac);
> ext4_fsblk_t s_sb_block;
> atomic64_t s_resv_clusters;
> kuid_t s_resuid;
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 56d50fd3310b..74f79652c674 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2314,11 +2314,11 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
> * stop the scan and use it immediately
> *
> * * If free extent found is smaller than goal, then keep retrying
> - * upto a max of sbi->s_mb_max_to_scan times (default 200). After
> + * up to a max of sbi->s_mb_max_to_scan times (default 200). After
> * that stop scanning and use whatever we have.
> *
> * * If free extent found is bigger than goal, then keep retrying
> - * upto a max of sbi->s_mb_min_to_scan times (default 10) before
> + * up to a max of sbi->s_mb_min_to_scan times (default 10) before
> * stopping the scan and using the extent.
> *
> *
> @@ -2981,7 +2981,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
> return ret;
> }
>
> -static noinline_for_stack int
> +noinline_for_stack int
> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> {
> ext4_group_t i;
> @@ -3012,7 +3012,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> * is greater than equal to the sbi_s_mb_order2_reqs
> * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
> * We also support searching for power-of-two requests only for
> - * requests upto maximum buddy size we have constructed.
> + * requests up to maximum buddy size we have constructed.
> */
> if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
> if (is_power_of_2(ac->ac_g_ex.fe_len))
> @@ -3101,6 +3101,144 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> return err;
> }
>
> +/* Rotating allocator (rotalloc mount option) */
> +noinline_for_stack int
> +ext4_mb_rotating_allocator(struct ext4_allocation_context *ac)
> +{
> + ext4_group_t i, goal;
> + int err = 0;
> + struct super_block *sb = ac->ac_sb;
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + struct ext4_buddy e4b;
> +
> + BUG_ON(ac->ac_status == AC_STATUS_FOUND);
> +
> + /* Set the goal from s_rotalloc_cursor */
> + spin_lock(&sbi->s_rotalloc_lock);
> + goal = sbi->s_rotalloc_cursor;
> + spin_unlock(&sbi->s_rotalloc_lock);
> + ac->ac_g_ex.fe_group = goal;
> +
> + /* first, try the goal */
> + err = ext4_mb_find_by_goal(ac, &e4b);
> + if (err || ac->ac_status == AC_STATUS_FOUND)
> + goto out;
> +
> + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
> + goto out;
> +
> + /*
> + * ac->ac_2order is set only if the fe_len is a power of 2
> + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
> + * so that we try exact allocation using buddy.
> + */
> + i = fls(ac->ac_g_ex.fe_len);
> + ac->ac_2order = 0;
> + /*
> + * We search using buddy data only if the order of the request
> + * is greater than equal to the sbi_s_mb_order2_reqs
> + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
> + * We also support searching for power-of-two requests only for
> + * requests up to maximum buddy size we have constructed.
> + */
> + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
> + if (is_power_of_2(ac->ac_g_ex.fe_len))
> + ac->ac_2order = array_index_nospec(i - 1,
> + MB_NUM_ORDERS(sb));
> + }
> +
> + /* if stream allocation is enabled, use global goal */
> + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
> + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
> +
> + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
> + ac->ac_g_ex.fe_start = -1;
> + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
Rotating block allocation looks a lot like stream allocation—they both
pick up from where the last successful allocation left off.
I noticed that the stream allocation's global goal is now split up.
Is there an advantage to keeping it as a single goal?
Alternatively, do you see any downsides to this split in your use case?
Powered by blists - more mailing lists