Message-Id: <20200821015523.1698374-6-harshads@google.com>
Date:   Thu, 20 Aug 2020 18:55:19 -0700
From:   Harshad Shirwadkar <harshadshirwadkar@...il.com>
To:     linux-ext4@...r.kernel.org
Cc:     tytso@....edu, lyx1209@...il.com,
        Harshad Shirwadkar <harshadshirwadkar@...il.com>
Subject: [RFC PATCH v2 5/9] ext4: add freespace tree optimizations

From: Harshad Shirwadkar <harshadshirwadkar@...il.com>

This patch adds an optimization on top of our freespace tree
allocator. We add a new meta-tree that tracks all freespace trees,
keyed by the length of their largest free extent. We use this
meta-tree to steer an allocation request so that it either immediately
hits a tree that can satisfy it or falls back to the next CR level. A
new mount option, frsp_cache_aggression=<0-3> (default 1), controls
how aggressively the optimizer caches uncached trees while searching.
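
To illustrate the idea, here is a rough user-space sketch of the
meta-tree descent (hypothetical names and a plain binary tree for
brevity; the actual code below uses rb_root_cached and struct
ext4_frsp_tree):

	#include <stdio.h>

	struct meta_node {
		int max_free_len;        /* largest free extent in this tree */
		int tree_idx;            /* flex_bg index of the tree */
		struct meta_node *left;  /* trees with larger max_free_len */
		struct meta_node *right; /* trees with smaller max_free_len */
	};

	/* Return the index of a tree that can satisfy goal_len, or -1. */
	static int meta_lookup(struct meta_node *node, int goal_len)
	{
		int best_idx = -1;

		while (node) {
			if (node->max_free_len >= goal_len)
				best_idx = node->tree_idx;
			if (node->max_free_len > goal_len)
				node = node->right;  /* look for a tighter fit */
			else
				node = node->left;   /* need bigger extents */
		}
		return best_idx;
	}

	int main(void)
	{
		struct meta_node small = { 8, 2, NULL, NULL };
		struct meta_node large = { 512, 0, NULL, NULL };
		struct meta_node root = { 64, 1, &large, &small };

		/* prints 1: the tree whose largest extent (64) fits best */
		printf("tree for 32 blocks: %d\n", meta_lookup(&root, 32));
		return 0;
	}

The caching behavior can then be tuned at mount time, e.g. (device and
mount point are placeholders; freespace_tree is the option added
earlier in this series):

	mount -o freespace_tree,frsp_cache_aggression=2 /dev/vdb /mnt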

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@...il.com>
---
 fs/ext4/ext4.h    |  17 +++++
 fs/ext4/mballoc.c | 178 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/mballoc.h |   1 +
 fs/ext4/super.c   |   9 +++
 4 files changed, 205 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 513c8473f45f..15e6ce9f1afa 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -153,6 +153,8 @@ enum SHIFT_DIRECTION {
 #define EXT4_MB_USE_RESERVED		0x2000
 /* Do strict check for free blocks while retrying block allocation */
 #define EXT4_MB_STRICT_CHECK		0x4000
+/* Disable freespace optimizations on ac */
+#define EXT4_MB_FRSP_NO_OPTIMIZE	0x8000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -1444,12 +1446,22 @@ struct ext4_frsp_tree {
 	__u32 frsp_index;				/* Tree index (flex bg
 							 * number)
 							 */
+	struct list_head frsp_list_node;
+	struct rb_node frsp_len_node;
 };
 
 /* Freespace tree flags */
 
 /* Tree is loaded in memory */
 #define EXT4_MB_FRSP_FLAG_LOADED			0x0001
+/* Tree is inserted into meta tree */
+#define EXT4_MB_FRSP_FLAG_CACHED			0x0002
+
+/* Freespace tree cache aggression levels */
+#define EXT4_MB_FRSP_MIN_CACHE_AGGRESSION		0x0
+#define EXT4_MB_FRSP_DEFAULT_CACHE_AGGRESSION		0x1
+#define EXT4_MB_FRSP_MAX_CACHE_AGGRESSION		0x3
+
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1590,6 +1602,11 @@ struct ext4_sb_info {
 
 	/* freespace trees stuff */
 	int s_mb_num_frsp_trees;
+	struct rb_root_cached s_mb_frsp_meta_tree;
+	rwlock_t s_mb_frsp_lock;
+	atomic_t s_mb_num_frsp_trees_cached;
+	struct list_head s_mb_uncached_trees;
+	u32 s_mb_frsp_cache_aggression;
 
 	/* workqueue for reserved extent conversions (buffered io) */
 	struct workqueue_struct *rsv_conversion_wq;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 168e9708257f..1da63afdbb3d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -779,6 +779,13 @@ static inline int ext4_mb_frsp_len_cmp(struct ext4_frsp_node *arg1,
 	return (arg1->frsp_len > arg2->frsp_len);
 }
 
+/* Compare function for meta tree */
+static inline int ext4_mb_frsp_meta_cmp(struct ext4_frsp_tree *arg1,
+					struct ext4_frsp_tree *arg2)
+{
+	return (arg1->frsp_max_free_len > arg2->frsp_max_free_len);
+}
+
 /* insert to offset-indexed tree */
 static void ext4_mb_frsp_insert_off(struct ext4_frsp_tree *tree,
 				struct ext4_frsp_node *new_entry)
@@ -798,6 +805,35 @@ static void ext4_mb_frsp_insert_len(struct ext4_frsp_tree *tree,
 		ext4_mb_frsp_len_cmp);
 }
 
+/*
+ * Re-key @tree in the per-sb meta tree after its largest free extent may
+ * have changed: read the current maximum from the tree's length-indexed
+ * rb-tree and, if it differs from the cached frsp_max_free_len, re-insert
+ * the tree under s_mb_frsp_lock.
+ */
+void ext4_mb_frsp_meta_reinsert(struct super_block *sb,
+	struct ext4_frsp_tree *tree)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_frsp_node *node;
+	struct rb_node *first = rb_first_cached(&tree->frsp_len_root);
+	struct rb_root_cached *meta_root = &EXT4_SB(sb)->s_mb_frsp_meta_tree;
+	int expected_len = 0;
+
+	if (!(tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED))
+		return;
+
+	if (first) {
+		node = rb_entry(first, struct ext4_frsp_node, frsp_len_node);
+		expected_len = node->frsp_len;
+	}
+
+	if (tree->frsp_max_free_len == expected_len)
+		return;
+
+	write_lock(&sbi->s_mb_frsp_lock);
+	tree->frsp_max_free_len = expected_len;
+	/* Skip the erase if this tree was never inserted into the meta tree. */
+	if (!RB_EMPTY_NODE(&tree->frsp_len_node))
+		rb_erase_cached(&tree->frsp_len_node,
+				&sbi->s_mb_frsp_meta_tree);
+	RB_CLEAR_NODE(&tree->frsp_len_node);
+	ext4_mb_rb_insert(meta_root, tree, struct ext4_frsp_tree, frsp_len_node,
+		ext4_mb_frsp_meta_cmp);
+	write_unlock(&sbi->s_mb_frsp_lock);
+}
+
 #ifdef CONFIG_EXT4_DEBUG
 /* print freespace_tree in pre-order traversal */
 void ext4_mb_frsp_print_tree_len(struct super_block *sb,
@@ -966,6 +1002,7 @@ int ext4_mb_frsp_add_region(struct super_block *sb, struct ext4_frsp_tree *tree,
 		}
 	}
 	ext4_mb_frsp_insert_len(tree, new_entry);
+	ext4_mb_frsp_meta_reinsert(sb, tree);
 
 	return 0;
 }
@@ -1063,6 +1100,9 @@ int ext4_mb_frsp_load(struct ext4_buddy *e4b)
 		if (ret)
 			goto out;
 	}
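+	/* First load of this tree: account for it in the cached-trees count. */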
+	if (!(e4b->frsp_tree->frsp_flags & EXT4_MB_FRSP_FLAG_CACHED))
+		atomic_inc(&sbi->s_mb_num_frsp_trees_cached);
+	e4b->frsp_tree->frsp_flags |= EXT4_MB_FRSP_FLAG_CACHED;
 	e4b->frsp_tree->frsp_flags |= EXT4_MB_FRSP_FLAG_LOADED;
 out:
 	for (i = 0; i < ngroups; i++) {
@@ -1156,6 +1196,7 @@ static void ext4_mb_frsp_use_best_found(struct ext4_allocation_context *ac,
 	ext4_mb_load_allocator(ac->ac_sb, group_no, &e4b,
 		EXT4_ALLOCATOR_FRSP_NOLOAD);
 	mb_mark_used(&e4b, bex);
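+	/* Allocation may have shrunk the largest extent; re-key the tree. */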
+	ext4_mb_frsp_meta_reinsert(ac->ac_sb, e4b.frsp_tree);
 	ext4_mb_unload_allocator(&e4b);
 }
 /*
@@ -1286,6 +1327,124 @@ void ext4_mb_frsp_find_by_goal(struct ext4_allocation_context *ac)
 	ext4_mb_unload_allocator(&e4b);
 }
 
+/*
+ * Determine whether more trees should be cached (loaded into memory) for
+ * this allocation. Returns 1 if so, 0 otherwise. The decision depends on
+ * the frsp_cache_aggression mount option:
+ *   0 - never cache more trees from the optimizer
+ *   1 - cache on alternate optimization attempts, except at CR 0 (default)
+ *   2 - cache on alternate optimization attempts
+ *   3 - always cache while uncached trees remain
+ */
+static int ext4_mb_frsp_should_cache(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+
+	if (list_empty(&sbi->s_mb_uncached_trees))
+		return 0;
+
+	if (sbi->s_mb_frsp_cache_aggression >=
+		EXT4_MB_FRSP_MAX_CACHE_AGGRESSION)
+		return 1;
+
+	if (sbi->s_mb_frsp_cache_aggression ==
+		EXT4_MB_FRSP_MIN_CACHE_AGGRESSION)
+		return 0;
+
+	/* At cache aggression level 1, skip caching at CR 0 */
+	if (sbi->s_mb_frsp_cache_aggression == 1 && ac->ac_criteria == 0)
+		return 0;
+
+	/*
+	 * At cache aggression levels 1 (for CR > 0) and 2, perform caching
+	 * on every alternate optimization attempt.
+	 */
+	return (ac->ac_num_optimizations & 0x1);
+}
+
+/*
+ * Optimize an allocation request. This function _tries_ to look up the
+ * meta-tree and, if it can optimize the search in any way, does so.
+ * Returns 1 if an optimization was performed; in that case, the caller
+ * should restart the search from the tree indicated by *tree_idx.
+ */
+int ext4_mb_frsp_optimize(struct ext4_allocation_context *ac, int *tree_idx)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_frsp_tree *cur = NULL;
+	struct rb_node *node = NULL;
+	int found = 0, best = 0, cache_more_trees = 0, better_len = 0, ret = 0;
+
+	if (ac->ac_flags & EXT4_MB_HINT_FIRST ||
+		ac->ac_flags & EXT4_MB_FRSP_NO_OPTIMIZE ||
+		ac->ac_status != AC_STATUS_CONTINUE)
+		return 0;
+
+	ac->ac_num_optimizations++;
+	if (!read_trylock(&sbi->s_mb_frsp_lock))
+		return 0;
+
+	node = sbi->s_mb_frsp_meta_tree.rb_root.rb_node;
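+	/*
+	 * Walk the meta tree (larger frsp_max_free_len toward the left):
+	 * if the current tree's largest extent exceeds the goal, descend
+	 * right toward smaller trees for a tighter fit; otherwise descend
+	 * left toward trees with larger free extents.
+	 */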
+	while (node) {
+		cur = rb_entry(node, struct ext4_frsp_tree, frsp_len_node);
+		if (ext4_mb_frsp_is_better(ac, cur->frsp_max_free_len, &best)) {
+			/*
+			 * This tree definitely has a better node than the best
+			 * found so far.
+			 */
+			found = 1;
+			ret = 1;
+			*tree_idx = cur->frsp_index;
+			better_len = cur->frsp_max_free_len;
+			if (best)
+				break;
+		}
+		if (cur->frsp_max_free_len > ac->ac_g_ex.fe_len)
+			node = node->rb_right;
+		else
+			node = node->rb_left;
+	}
+
+	if (ac->ac_found == 0 && !found) {
+		/*
+		 * We haven't found a good match above, and no match had
+		 * been found before this call, which means we need to
+		 * loosen our criteria. Note that if something had been
+		 * found earlier, failing to find a better node here would
+		 * not imply that no better node exists.
+		 * TODO - in this case determine probabilistically which
+		 * tree may have a better node and direct our allocator
+		 * there.
+		 */
+		if (ext4_mb_frsp_should_cache(ac)) {
+			cache_more_trees = 1;
+		} else if (ac->ac_criteria < 2) {
+			ac->ac_criteria++;
+			ac->ac_groups_scanned = 0;
+			*tree_idx = 0;
+			ret = 1;
+		} else {
+			ac->ac_flags |= EXT4_MB_HINT_FIRST;
+		}
+	} else if (!best && !list_empty(&sbi->s_mb_uncached_trees)) {
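+		/*
+		 * We found a candidate but not a definitive best match;
+		 * consider caching one more tree to widen the search.
+		 */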
+		cache_more_trees = ext4_mb_frsp_should_cache(ac);
+	}
+
+	if (cache_more_trees) {
+		cur = list_first_entry(&sbi->s_mb_uncached_trees,
+				struct ext4_frsp_tree, frsp_list_node);
+		list_del_init(&cur->frsp_list_node);
+		*tree_idx = cur->frsp_index;
+		ret = 1;
+	}
+	read_unlock(&sbi->s_mb_frsp_lock);
+
+	/*
+	 * If we couldn't optimize now, it's unlikely that we'll be able to
+	 * optimize this request any further.
+	 */
+	if (!ret)
+		ac->ac_flags |= EXT4_MB_FRSP_NO_OPTIMIZE;
+	mb_debug(ac->ac_sb,
+		"Optimizer suggestion: found = %d, tree = %d, len = %d, cr = %d\n",
+		found, *tree_idx, better_len, ac->ac_criteria);
+	return ret;
+}
+
 static void ext4_mb_frsp_process(struct ext4_allocation_context *ac,
 				struct ext4_frsp_tree *tree)
 {
@@ -1324,6 +1483,7 @@ ext4_mb_tree_allocator(struct ext4_allocation_context *ac)
 	struct ext4_frsp_node *cur = NULL;
 	struct ext4_tree_extent *btx = NULL;
 	int ret = 0, start_idx = 0, tree_idx, j;
+	int optimize;
 
 	sb = ac->ac_sb;
 	btx = &ac->ac_b_tree_ex;
@@ -1341,6 +1501,8 @@ ext4_mb_tree_allocator(struct ext4_allocation_context *ac)
 
 	ac->ac_criteria = 0;
 	ac->ac_groups_scanned = 0;
+	ext4_mb_frsp_optimize(ac, &start_idx);
+
 repeat:
 
 	/* Loop through the rest of trees (flex_bg) */
@@ -1357,13 +1519,17 @@ ext4_mb_tree_allocator(struct ext4_allocation_context *ac)
 		mutex_lock(&e4b.frsp_tree->frsp_lock);
 		ext4_mb_frsp_process(ac, e4b.frsp_tree);
 		mutex_unlock(&e4b.frsp_tree->frsp_lock);
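+		/*
+		 * Re-run the optimizer after scanning this tree; if it
+		 * suggests a better starting tree, restart the scan there.
+		 */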
+		optimize = ext4_mb_frsp_optimize(ac, &start_idx);
 		ext4_mb_unload_allocator(&e4b);
+		if (optimize)
+			goto repeat;
 	}
 
 	if (ac->ac_status != AC_STATUS_FOUND) {
 		if (ac->ac_criteria < 2) {
 			ac->ac_criteria++;
 			ac->ac_groups_scanned = 0;
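+			/* Let the optimizer run again at the new CR level. */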
+			ac->ac_flags &= ~EXT4_MB_FRSP_NO_OPTIMIZE;
 			mb_debug(sb, "Falling back to CR=%d", ac->ac_criteria);
 			goto repeat;
 		}
@@ -1415,6 +1581,7 @@ static void ext4_mb_frsp_init_tree(struct ext4_frsp_tree *tree, int index)
 	mutex_init(&(tree->frsp_lock));
 	tree->frsp_flags = 0;
 	tree->frsp_index = index;
+	INIT_LIST_HEAD(&tree->frsp_list_node);
+	RB_CLEAR_NODE(&tree->frsp_len_node);
 }
 
 int ext4_mb_init_freespace_trees(struct super_block *sb)
@@ -1425,6 +1592,8 @@ int ext4_mb_init_freespace_trees(struct super_block *sb)
 
 	sbi->s_mb_num_frsp_trees =
 		ext4_num_grps_to_flexbg(sb, ext4_get_groups_count(sb));
+	sbi->s_mb_frsp_meta_tree = RB_ROOT_CACHED;
+	INIT_LIST_HEAD(&sbi->s_mb_uncached_trees);
 
 	for (i = 0; i < sbi->s_mb_num_frsp_trees; i++) {
 		fg = sbi_array_rcu_deref(sbi, s_flex_groups, i);
@@ -1433,7 +1602,11 @@ int ext4_mb_init_freespace_trees(struct super_block *sb)
 		if (!fg->frsp_tree)
 			return -ENOMEM;
 		ext4_mb_frsp_init_tree(fg->frsp_tree, i);
+		list_add(&fg->frsp_tree->frsp_list_node,
+				&sbi->s_mb_uncached_trees);
 	}
+	rwlock_init(&sbi->s_mb_frsp_lock);
+	atomic_set(&sbi->s_mb_num_frsp_trees_cached, 0);
 
 	return 0;
 }
@@ -1460,6 +1633,11 @@ int ext4_mb_add_frsp_trees(struct super_block *sb, int ngroups)
 		if (!fg->frsp_tree)
 			return -ENOMEM;
 		ext4_mb_frsp_init_tree(fg->frsp_tree, i);
+		write_lock(&sbi->s_mb_frsp_lock);
+		list_add(&fg->frsp_tree->frsp_list_node,
+				&sbi->s_mb_uncached_trees);
+		write_unlock(&sbi->s_mb_frsp_lock);
+		ext4_mb_frsp_meta_reinsert(sb, fg->frsp_tree);
 	}
 	sbi->s_mb_num_frsp_trees = flex_bg_count;
 
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index af61651258a3..1fcdd3e6f7d5 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -200,6 +200,7 @@ struct ext4_allocation_context {
 	__u8 ac_criteria;
 	__u8 ac_2order;		/* if request is to allocate 2^N blocks and
 				 * N > 0, the field stores N, otherwise 0 */
+	__u8 ac_num_optimizations;
 	__u8 ac_op;		/* operation, for history only */
 	struct page *ac_bitmap_page;
 	struct page *ac_buddy_page;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 55ff4e8be976..6a2d43d81b14 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1527,6 +1527,8 @@ enum {
 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
 	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
 	Opt_prefetch_block_bitmaps, Opt_freespace_tree,
+	Opt_frsp_cache_aggression,
 };
 
 static const match_table_t tokens = {
@@ -1620,6 +1622,7 @@ static const match_table_t tokens = {
 	{Opt_nombcache, "nombcache"},
 	{Opt_nombcache, "no_mbcache"},	/* for backward compatibility */
 	{Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
+	{Opt_frsp_cache_aggression, "frsp_cache_aggression=%u"},
 	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
 	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
 	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
@@ -1836,6 +1839,7 @@ static const struct mount_opts {
 	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
 	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
 	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
+	{Opt_frsp_cache_aggression, 0, MOPT_GTE0},
 	{Opt_test_dummy_encryption, 0, MOPT_STRING},
 	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
 	{Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
@@ -2043,6 +2047,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		sbi->s_li_wait_mult = arg;
 	} else if (token == Opt_max_dir_size_kb) {
 		sbi->s_max_dir_size_kb = arg;
+	} else if (token == Opt_frsp_cache_aggression) {
+		sbi->s_mb_frsp_cache_aggression =
+			clamp(arg, EXT4_MB_FRSP_MIN_CACHE_AGGRESSION,
+			      EXT4_MB_FRSP_MAX_CACHE_AGGRESSION);
 	} else if (token == Opt_stripe) {
 		sbi->s_stripe = arg;
 	} else if (token == Opt_resuid) {
@@ -3962,6 +3970,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+	sbi->s_mb_frsp_cache_aggression = EXT4_MB_FRSP_DEFAULT_CACHE_AGGRESSION;
 
 	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
 		set_opt(sb, BARRIER);
-- 
2.28.0.297.g1956fa8f8d-goog
