[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1D048C17-B36D-497D-815E-54BBD9E9F769@dilger.ca>
Date: Wed, 22 Aug 2018 11:02:14 -0600
From: Andreas Dilger <adilger@...ger.ca>
To: Jaco Kroon <jaco@....co.za>
Cc: "Theodore Y. Ts'o" <tytso@....edu>,
linux-ext4 <linux-ext4@...r.kernel.org>
Subject: Re: [PATCH] Add block_high_watermark sysfs tunable.
On Aug 21, 2018, at 8:57 PM, Jaco Kroon <jaco@....co.za> wrote:
>
> Hi,
>
> The below is based on suggestion from Andreas. I opted for the sysfs
> tunable rather than an ioctl (which could also be used) because this
> enables to set the value without a special tool.
>
> Andreas mentioned a number of functions to adjust for allocating blocks,
> but only the two below seems to relate. I've also worked through all
> uses of ext4_blocks_count() and could not find further use cases that
> needs adjustment. Some of the uses is during mount only (cluster
> reservations), and since this change is per-mount and nt superblock
> persistent adjustments won't have any effect there. If this is
> desirable I'll try putting this in the superblock instead but this would
> require allocating feature bits and I'm not sure this change is worth a
> feature bit.
Haven't had a chance to look at the patch yet, but thanks for submitting.
One comment below...
> I'll attempt limiting inode allocation in ialloc based on this next,
> just wanted to get feedback on the below first.
>
> My big question is this: how do I build a test case for this code?
>
> As an aside, the resize2fs from 64T to 56T eventually finished some time
> during the night. So just under 17 days total. If we can get an online
> resize for the same to be double or even triple that in total it'll
> still be a massive win for me. Even if we need to for the final stages
> take the filesystem offline for two days - I know a fsck on this system
> takes ~18 hours (will probably be a bit less now), so a shrink without
> having to move data blocks will take at least that time in all
> likelihood, a debugfs ncheck took ~12 hours on 64TB (which is needed to
> migrate inodes), icheck was ~12 minutes, most of the time spent to open
> the filesystem.
Note that if opening the filesystem takes a long time, you can use
"debugfs -c" to skip loading the block and inode bitmaps, which can
speed things up significantly. That doesn't work for everything
(definitely not filesystem-modifying operations) but for many read-only
operations it is very useful.
Cheers, Andreas
> On 22/08/2018 04:21, Jaco Kroon wrote:
>> NOT READY FOR MERGE!!!!!!
>>
>> Limiting block allocations to a high watermark will eventually enable us
>> to perform online shrinks of an ext4 filesystem. As an immediate
>> benefit it'll prevent allocation of blocks in the high range, which if
>> performed as a precursor to an offline filesystem shrink will help to
>> reduce the overall time a filesystem needs to be taken offline in order
>> to shrink it.
>>
>> (possible) shortcomings:
>>
>> Currently this tunable does not get stored to the superblock, and thus
>> needs to be set again after each mount.
>>
>> The ext4_statfs function doesn't adjust the f_bavail value currently, as
>> such df will report incorrect results.
>>
>> The inode allocator hasn't been synced yet.
>> ---
>> fs/ext4/balloc.c | 2 +-
>> fs/ext4/ext4.h | 10 ++++++++++
>> fs/ext4/mballoc.c | 2 +-
>> fs/ext4/sysfs.c | 19 +++++++++++++++++++
>> 4 files changed, 31 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
>> index e5d6ee61ff48..4f723c7a9c88 100644
>> --- a/fs/ext4/balloc.c
>> +++ b/fs/ext4/balloc.c
>> @@ -883,7 +883,7 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
>> block_group++;
>> }
>> bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
>> - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
>> + last_block = ext4_blocks_max_allocatable(EXT4_SB(inode->i_sb)) - 1;
>>
>> /*
>> * If we are doing delayed allocation, we don't need take
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 0f0edd1cd0cd..dc30ea107c55 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1423,6 +1423,7 @@ struct ext4_sb_info {
>> unsigned int s_mb_order2_reqs;
>> unsigned int s_mb_group_prealloc;
>> unsigned int s_max_dir_size_kb;
>> + ext4_fsblk_t s_block_high_watermark; /* allocators must not allocate blocks above this */
>> /* where last allocation was done - for stream allocation */
>> unsigned long s_mb_last_group;
>> unsigned long s_mb_last_start;
>> @@ -2711,6 +2712,15 @@ static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
>> le32_to_cpu(es->s_blocks_count_lo);
>> }
>>
>> +static inline ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info *sbi)
>> +{
>> + ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es);
>> + if (sbi->s_block_high_watermark && sbi->s_block_high_watermark < blocks)
>> + return sbi->s_block_high_watermark;
>> + else
>> + return blocks;
>> +}
>> +
>> static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
>> {
>> return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index e29fce2fbf25..a158c2c9de10 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -4232,7 +4232,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
>> /* start searching from the goal */
>> goal = ar->goal;
>> if (goal < le32_to_cpu(es->s_first_data_block) ||
>> - goal >= ext4_blocks_count(es))
>> + goal >= ext4_blocks_max_allocatable(sbi))
>> goal = le32_to_cpu(es->s_first_data_block);
>> ext4_get_group_no_and_offset(sb, goal, &group, &block);
>>
>> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
>> index 9212a026a1f1..2a1a955c2c0b 100644
>> --- a/fs/ext4/sysfs.c
>> +++ b/fs/ext4/sysfs.c
>> @@ -30,6 +30,7 @@ typedef enum {
>> attr_feature,
>> attr_pointer_ui,
>> attr_pointer_atomic,
>> + attr_block_high_watermark,
>> } attr_id_t;
>>
>> typedef enum {
>> @@ -167,6 +168,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
>> EXT4_ATTR_FUNC(session_write_kbytes, 0444);
>> EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
>> EXT4_ATTR_FUNC(reserved_clusters, 0644);
>> +EXT4_ATTR_FUNC(block_high_watermark, 0600);
>>
>> EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
>> ext4_sb_info, s_inode_readahead_blks);
>> @@ -217,6 +219,7 @@ static struct attribute *ext4_attrs[] = {
>> ATTR_LIST(errors_count),
>> ATTR_LIST(first_error_time),
>> ATTR_LIST(last_error_time),
>> + ATTR_LIST(block_high_watermark),
>> NULL,
>> };
>>
>> @@ -304,6 +307,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
>> return print_tstamp(buf, sbi->s_es, s_first_error_time);
>> case attr_last_error_time:
>> return print_tstamp(buf, sbi->s_es, s_last_error_time);
>> + case attr_block_high_watermark:
>> + return snprintf(buf, PAGE_SIZE, "%llu\n",
>> + (s64) sbi->s_block_high_watermark);
>> }
>>
>> return 0;
>> @@ -318,6 +324,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
>> struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
>> void *ptr = calc_ptr(a, sbi);
>> unsigned long t;
>> + unsigned long long t2;
>> int ret;
>>
>> switch (a->attr_id) {
>> @@ -338,6 +345,18 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
>> return inode_readahead_blks_store(sbi, buf, len);
>> case attr_trigger_test_error:
>> return trigger_test_error(sbi, buf, len);
>> + case attr_block_high_watermark:
>> + if (!ptr)
>> + return 0;
>> + ret = kstrtoull(skip_spaces(buf), 0, &t2);
>> + if (ret)
>> + return ret;
>> + if (t2 > ext4_blocks_count(sbi->s_es))
>> + return -EINVAL;
>> + if (t2 && t2 < le32_to_cpu(sbi->s_es->s_first_data_block))
>> + return -EINVAL;
>> + sbi->s_block_high_watermark = t2;
>> + return len;
>> }
>> return 0;
>> }
>
Cheers, Andreas
Download attachment "signature.asc" of type "application/pgp-signature" (874 bytes)
Powered by blists - more mailing lists