Date:   Wed, 22 Aug 2018 04:21:10 +0200
From:   Jaco Kroon <jaco@....co.za>
To:     Andreas Dilger <adilger@...ger.ca>,
        "Theodore Y. Ts'o" <tytso@....edu>
Cc:     linux-ext4 <linux-ext4@...r.kernel.org>
Subject: [PATCH] Add block_high_watermark sysfs tunable.

NOT READY FOR MERGE!!!!!!

Limiting block allocations to a high watermark will eventually enable us
to perform online shrinks of an ext4 filesystem.  As an immediate
benefit it prevents allocation of blocks above the watermark; set before
an offline filesystem shrink, that reduces how much data the shrink has
to relocate and therefore how long the filesystem needs to be taken
offline in order to shrink it.
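
For illustration only (not part of the patch): the new attribute appears
as a regular file under the filesystem's ext4 sysfs directory, so an
administrator could cap allocations ahead of a planned shrink roughly as
in the sketch below.  The device name "sda1", the block count and the
error handling are placeholders, not taken from this patch.

/* Hypothetical userspace sketch: write a block-number watermark to the
 * new sysfs attribute before running an offline shrink.  "sda1" and the
 * value 26214400 are examples only. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/ext4/sda1/block_high_watermark";
	const char *val = "26214400\n";	/* intended new size, in fs blocks */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}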

(possible) shortcomings:

Currently this tunable is not persisted to the superblock, and thus
needs to be set again after each mount.

ext4_statfs() does not yet take the watermark into account when
reporting f_bavail, so df will over-report the space that can actually
be allocated (a possible adjustment is sketched below).

The inode allocator has not yet been updated to honour the watermark.
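
One possible way to address the f_bavail point above, sketched here for
discussion only: this fragment is not part of the patch and is untested;
sbi and buf are the locals ext4_statfs() already has, and the
interaction with cluster/overhead accounting is ignored.

	/* Untested sketch: treat every block above the watermark as
	 * unavailable, so df reflects only allocatable space.  This is
	 * pessimistic because it does not check how many of the hidden
	 * blocks are actually free. */
	ext4_fsblk_t limit = ext4_blocks_max_allocatable(sbi);
	ext4_fsblk_t total = ext4_blocks_count(sbi->s_es);

	if (limit < total) {
		ext4_fsblk_t hidden = total - limit;

		if (buf->f_bavail > hidden)
			buf->f_bavail -= hidden;
		else
			buf->f_bavail = 0;
	}
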
---
 fs/ext4/balloc.c  |  2 +-
 fs/ext4/ext4.h    | 10 ++++++++++
 fs/ext4/mballoc.c |  2 +-
 fs/ext4/sysfs.c   | 19 +++++++++++++++++++
 4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e5d6ee61ff48..4f723c7a9c88 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -883,7 +883,7 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
 			block_group++;
 	}
 	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
-	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+	last_block = ext4_blocks_max_allocatable(EXT4_SB(inode->i_sb)) - 1;
 
 	/*
 	 * If we are doing delayed allocation, we don't need take
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0f0edd1cd0cd..dc30ea107c55 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1423,6 +1423,7 @@ struct ext4_sb_info {
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
 	unsigned int s_max_dir_size_kb;
+	ext4_fsblk_t s_block_high_watermark; /* allocators must not allocate blocks above this */
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
@@ -2711,6 +2712,15 @@ static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 		le32_to_cpu(es->s_blocks_count_lo);
 }
 
+static inline ext4_fsblk_t ext4_blocks_max_allocatable(struct ext4_sb_info *sbi)
+{
+	ext4_fsblk_t blocks = ext4_blocks_count(sbi->s_es);
+	if (sbi->s_block_high_watermark && sbi->s_block_high_watermark < blocks)
+		return sbi->s_block_high_watermark;
+	else
+		return blocks;
+}
+
 static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
 {
 	return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e29fce2fbf25..a158c2c9de10 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4232,7 +4232,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	/* start searching from the goal */
 	goal = ar->goal;
 	if (goal < le32_to_cpu(es->s_first_data_block) ||
-			goal >= ext4_blocks_count(es))
+			goal >= ext4_blocks_max_allocatable(sbi))
 		goal = le32_to_cpu(es->s_first_data_block);
 	ext4_get_group_no_and_offset(sb, goal, &group, &block);
 
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9212a026a1f1..2a1a955c2c0b 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -30,6 +30,7 @@ typedef enum {
 	attr_feature,
 	attr_pointer_ui,
 	attr_pointer_atomic,
+	attr_block_high_watermark,
 } attr_id_t;
 
 typedef enum {
@@ -167,6 +168,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
 EXT4_ATTR_FUNC(session_write_kbytes, 0444);
 EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
 EXT4_ATTR_FUNC(reserved_clusters, 0644);
+EXT4_ATTR_FUNC(block_high_watermark, 0600);
 
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
 		 ext4_sb_info, s_inode_readahead_blks);
@@ -217,6 +219,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(errors_count),
 	ATTR_LIST(first_error_time),
 	ATTR_LIST(last_error_time),
+	ATTR_LIST(block_high_watermark),
 	NULL,
 };
 
@@ -304,6 +307,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
 		return print_tstamp(buf, sbi->s_es, s_first_error_time);
 	case attr_last_error_time:
 		return print_tstamp(buf, sbi->s_es, s_last_error_time);
+	case attr_block_high_watermark:
+		return snprintf(buf, PAGE_SIZE, "%llu\n",
+				(unsigned long long) sbi->s_block_high_watermark);
 	}
 
 	return 0;
@@ -318,6 +324,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
 	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
 	void *ptr = calc_ptr(a, sbi);
 	unsigned long t;
+	unsigned long long t2;
 	int ret;
 
 	switch (a->attr_id) {
@@ -338,6 +345,18 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
 		return inode_readahead_blks_store(sbi, buf, len);
 	case attr_trigger_test_error:
 		return trigger_test_error(sbi, buf, len);
+	case attr_block_high_watermark:
+		if (!ptr)
+			return 0;
+		ret = kstrtoull(skip_spaces(buf), 0, &t2);
+		if (ret)
+			return ret;
+		if (t2 > ext4_blocks_count(sbi->s_es))
+			return -EINVAL;
+		if (t2 && t2 < le32_to_cpu(sbi->s_es->s_first_data_block))
+			return -EINVAL;
+		sbi->s_block_high_watermark = t2;
+		return len;
 	}
 	return 0;
 }
-- 
2.18.0
