lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20200819073104.1141705-4-harshadshirwadkar@gmail.com>
Date:   Wed, 19 Aug 2020 00:30:58 -0700
From:   Harshad Shirwadkar <harshadshirwadkar@...il.com>
To:     linux-ext4@...r.kernel.org
Cc:     tytso@....edu, lyx1209@...il.com,
        Harshad Shirwadkar <harshadshirwadkar@...il.com>
Subject: [PATCH 3/9] ext4: add freespace tree allocator

From: Yuexin Li <lyx1209@...il.com>

This patch adds a new freespace tree allocator. The allocator
organizes free space regions in a couple of in-memory rbtrees, one
sorted by length and the other by offset. We use these per-flex-bg
level trees to quickly scan length sorted free space regions and
determine the best region for a given request. This feature can be
enabled by passing "-o freespace_tree" mount option.

Signed-off-by: Yuexin Li <lyx1209@...il.com>
Co-developed-by: Harshad Shirwadkar <harshadshirwadkar@...il.com>
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@...il.com>
---
 fs/ext4/ext4.h    |  45 +++
 fs/ext4/mballoc.c | 984 ++++++++++++++++++++++++++++++++++++++++++++--
 fs/ext4/mballoc.h |  61 ++-
 fs/ext4/resize.c  |   8 +
 fs/ext4/super.c   |  35 +-
 5 files changed, 1084 insertions(+), 49 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4df6f429de1a..3bb2675d4d40 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -363,6 +363,7 @@ struct flex_groups {
 	atomic64_t	free_clusters;
 	atomic_t	free_inodes;
 	atomic_t	used_dirs;
+	struct ext4_frsp_tree	*frsp_tree;
 };
 
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
@@ -1197,6 +1198,12 @@ struct ext4_inode_info {
 #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM	0x00000008 /* User explicitly
 						specified journal checksum */
 
+
+#define EXT4_MOUNT2_FREESPACE_TREE	0x00000020 /* Enable freespace tree
+						    * allocator (turns buddy
+						    * allocator off)
+						    */
+
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
 #define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
@@ -1402,6 +1409,30 @@ struct ext4_super_block {
 #define ext4_has_strict_mode(sbi) \
 	(sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL)
 
+/*
+ * Freespace tree structure
+ */
+struct ext4_frsp_tree {
+	struct rb_root_cached frsp_offset_root;		/* Root for offset
+							 * sorted tree
+							 */
+	struct rb_root_cached frsp_len_root;		/* Root for Length
+							 * sorted tree
+							 */
+	struct mutex frsp_lock;
+	__u8 frsp_flags;
+	__u32 frsp_max_free_len;			/* Max free extent in
+							 * this flex bg
+							 */
+	__u32 frsp_index;				/* Tree index (flex bg
+							 * number)
+							 */
+};
+
+/* Freespace tree flags */
+
+/* Tree is loaded in memory */
+#define EXT4_MB_FRSP_FLAG_LOADED			0x0001
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1539,6 +1570,9 @@ struct ext4_sb_info {
 	struct flex_groups * __rcu *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;
 
+	/* freespace trees stuff */
+	int s_mb_num_frsp_trees;
+
 	/* workqueue for reserved extent conversions (buffered io) */
 	struct workqueue_struct *rsv_conversion_wq;
 
@@ -2653,6 +2687,7 @@ extern int ext4_init_inode_table(struct super_block *sb,
 extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
 
 /* mballoc.c */
+extern int ext4_mb_add_frsp_trees(struct super_block *sb, int ngroups);
 extern const struct seq_operations ext4_mb_seq_groups_ops;
 extern long ext4_mb_stats;
 extern long ext4_mb_max_to_scan;
@@ -3084,6 +3119,16 @@ static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
 	return block_group >> sbi->s_log_groups_per_flex;
 }
 
+static inline struct ext4_frsp_tree *
+ext4_get_frsp_tree(struct super_block *sb, ext4_group_t flex_bg)
+{
+	struct flex_groups *fg;
+
+	fg = sbi_array_rcu_deref(EXT4_SB(sb), s_flex_groups, flex_bg);
+
+	return fg->frsp_tree;
+}
+
 static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
 {
 	return 1 << sbi->s_log_groups_per_flex;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 2d8d3d9c7918..74bdc2dcb49c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -333,6 +333,7 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_data_cachep;
+static struct kmem_cache *ext4_freespace_node_cachep;
 
 /* We create slab caches for groupinfo data structures based on the
  * superblock block size.  There will be one per mounted filesystem for
@@ -352,6 +353,11 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 						ext4_group_t group);
 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
 
+static int ext4_mb_load_allocator(struct super_block *sb, ext4_group_t group,
+			      struct ext4_buddy *e4b, int flags);
+static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex);
+static void ext4_mb_unload_allocator(struct ext4_buddy *e4b);
+
 /*
  * The algorithm using this percpu seq counter goes below:
  * 1. We sample the percpu discard_pa_seq counter before trying for block
@@ -395,6 +401,33 @@ static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 	return addr;
 }
 
+static inline int ext4_blkno_to_flex_offset(struct super_block *sb,
+			ext4_fsblk_t blkno)
+{
+	return blkno % (ext4_flex_bg_size(EXT4_SB(sb)) *
+				EXT4_SB(sb)->s_blocks_per_group);
+}
+
+static inline ext4_fsblk_t ext4_flex_offset_to_blkno(struct super_block *sb,
+			int flex_bg, int flex_offset)
+{
+	return flex_bg * ext4_flex_bg_size(EXT4_SB(sb)) *
+		EXT4_SB(sb)->s_blocks_per_group + flex_offset;
+}
+
+static inline int ext4_mb_frsp_on(struct super_block *sb)
+{
+	return test_opt2(sb, FREESPACE_TREE) &&
+			EXT4_SB(sb)->s_es->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_num_grps_to_flexbg(struct super_block *sb,
+							int ngroups)
+{
+	return (ngroups + ext4_flex_bg_size(EXT4_SB(sb)) - 1) >>
+			(EXT4_SB(sb)->s_es->s_log_groups_per_flex);
+}
+
 static inline int mb_test_bit(int bit, void *addr)
 {
 	/*
@@ -453,6 +486,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 {
 	char *bb;
 
+	WARN_ON(ext4_mb_frsp_on(e4b->bd_sb));
 	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
 	BUG_ON(max == NULL);
 
@@ -620,6 +654,9 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 	void *buddy;
 	void *buddy2;
 
+	if (ext4_mb_frsp_on(sb))
+		return 0;
+
 	{
 		static int mb_check_counter;
 		if (mb_check_counter++ % 100 != 0)
@@ -706,6 +743,729 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 #define mb_check_buddy(e4b)
 #endif
 
+/* Generic macro for inserting a new node in cached rbtree */
+#define ext4_mb_rb_insert(__root, __new, __node_t, __entry, __cmp)	do {	\
+	struct rb_node **iter = &((__root)->rb_root.rb_node), *parent = NULL;	\
+	bool leftmost = true;							\
+	__node_t *this = NULL;							\
+										\
+	while (*iter) {								\
+		this = rb_entry(*iter, __node_t, __entry);			\
+		parent = *iter;							\
+		if (__cmp((__new), this)) {					\
+			iter = &((*iter)->rb_left);				\
+		} else {							\
+			iter = &((*iter)->rb_right);				\
+			leftmost = false;					\
+		}								\
+	}									\
+	rb_link_node(&(__new)->__entry, parent, iter);				\
+	rb_insert_color_cached(&(__new)->__entry, __root, leftmost);		\
+} while (0)
+
+static inline int ext4_mb_frsp_offset_cmp(struct ext4_frsp_node *arg1,
+					  struct ext4_frsp_node *arg2)
+{
+	return (arg1->frsp_offset < arg2->frsp_offset);
+}
+
+/*
+ * Free space extents length sorting function, the nodes are sorted in
+ * decreasing order of length
+ */
+static inline int ext4_mb_frsp_len_cmp(struct ext4_frsp_node *arg1,
+				       struct ext4_frsp_node *arg2)
+{
+	return (arg1->frsp_len > arg2->frsp_len);
+}
+
+/* insert to offset-indexed tree */
+static void ext4_mb_frsp_insert_off(struct ext4_frsp_tree *tree,
+				struct ext4_frsp_node *new_entry)
+{
+	memset(&new_entry->frsp_node, 0, sizeof(new_entry->frsp_node));
+	ext4_mb_rb_insert(&tree->frsp_offset_root, new_entry,
+		struct ext4_frsp_node, frsp_node, ext4_mb_frsp_offset_cmp);
+}
+
+/* insert to tree sorted by length */
+static void ext4_mb_frsp_insert_len(struct ext4_frsp_tree *tree,
+				struct ext4_frsp_node *new_entry)
+{
+	memset(&new_entry->frsp_len_node, 0, sizeof(new_entry->frsp_len_node));
+	ext4_mb_rb_insert(&tree->frsp_len_root, new_entry,
+		struct ext4_frsp_node, frsp_len_node,
+		ext4_mb_frsp_len_cmp);
+}
+
+#ifdef CONFIG_EXT4_DEBUG
+/* print freespace_tree in pre-order traversal */
+void ext4_mb_frsp_print_tree_len(struct super_block *sb,
+					struct ext4_frsp_tree *tree)
+{
+	unsigned int count = 0;
+	ext4_fsblk_t blk = 0, blk_end = 0;
+	ext4_group_t group = 0, group_end = 0;
+	struct ext4_frsp_node *entry = NULL;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct rb_node *cur;
+
+	cur = rb_first_cached(&tree->frsp_len_root);
+	mb_debug(sb, "\nTree\tNode\tlength\tblock\tgroup\n");
+
+	while (cur) {
+		entry = rb_entry(cur, struct ext4_frsp_node, frsp_len_node);
+		count++;
+		blk = ext4_flex_offset_to_blkno(sb, tree->frsp_index,
+			entry->frsp_offset);
+		blk_end = ext4_flex_offset_to_blkno(sb, tree->frsp_index,
+			entry->frsp_offset + entry->frsp_len - 1);
+		group = blk / sbi->s_blocks_per_group;
+		group_end = (blk_end-1) / sbi->s_blocks_per_group;
+		mb_debug(sb, "%u\t%u\t%u\t%u(%lld)--%llu\t%u--%u\n",
+			tree->frsp_index, count, entry->frsp_len,
+			entry->frsp_offset, blk, blk_end, group, group_end);
+		cur = rb_next(cur);
+	}
+}
+#endif
+
+static struct ext4_frsp_node *ext4_mb_frsp_alloc_node(struct super_block *sb)
+{
+	struct ext4_frsp_node *node;
+
+	node = kmem_cache_alloc(ext4_freespace_node_cachep, GFP_NOFS);
+	if (!node)
+		return NULL;
+
+	RB_CLEAR_NODE(&node->frsp_node);
+	RB_CLEAR_NODE(&node->frsp_len_node);
+
+	return node;
+}
+
+static void ext4_mb_frsp_free_node(struct super_block *sb,
+		struct ext4_frsp_node *node)
+{
+	kmem_cache_free(ext4_freespace_node_cachep, node);
+}
+
+/* Evict a tree from memory */
+void ext4_mb_frsp_free_tree(struct super_block *sb, struct ext4_frsp_tree *tree)
+{
+	struct ext4_frsp_node *frsp_node;
+	struct rb_node *node;
+
+	mb_debug(sb, "Evicting tree %d\n", tree->frsp_index);
+	mutex_lock(&tree->frsp_lock);
+	if (!(tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED))
+		goto out;
+
+	node = rb_first_cached(&tree->frsp_offset_root);
+	while (node) {
+		frsp_node = rb_entry(node, struct ext4_frsp_node, frsp_node);
+		rb_erase_cached(node, &tree->frsp_offset_root);
+		rb_erase_cached(&frsp_node->frsp_len_node,
+				&tree->frsp_len_root);
+		ext4_mb_frsp_free_node(sb, frsp_node);
+		node = rb_first_cached(&tree->frsp_offset_root);
+	}
+	tree->frsp_flags &= ~EXT4_MB_FRSP_FLAG_LOADED;
+	tree->frsp_offset_root = RB_ROOT_CACHED;
+	tree->frsp_len_root = RB_ROOT_CACHED;
+out:
+	mutex_unlock(&tree->frsp_lock);
+}
+
+/*
+ * Search tree by flexbg offset. Returns the node containing "target". If
+ * no such node is present, returns NULL. Must be called with tree mutex held.
+ */
+struct ext4_frsp_node *ext4_mb_frsp_search_by_off(struct super_block *sb,
+				struct ext4_frsp_tree *tree,
+				ext4_grpblk_t target)
+{
+	struct rb_root *root = &tree->frsp_offset_root.rb_root;
+	struct rb_node *node = root->rb_node;
+	struct ext4_frsp_node *this = NULL;
+
+	while (node) {
+		this = rb_entry(node, struct ext4_frsp_node, frsp_node);
+		if (this->frsp_offset == target)
+			return this;
+		else if (target < this->frsp_offset)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Check if two entries in freespace tree can be merged together. Nodes
+ * can merge together only if they are physically contiguous and belong
+ * to the same block group.
+ */
+int ext4_mb_frsp_node_can_merge(struct super_block *sb,
+	struct ext4_frsp_node *prev_entry, struct ext4_frsp_node *cur_entry)
+{
+	if (prev_entry->frsp_offset + prev_entry->frsp_len !=
+		cur_entry->frsp_offset)
+		return 0;
+	if (prev_entry->frsp_offset / EXT4_SB(sb)->s_blocks_per_group !=
+		cur_entry->frsp_offset / EXT4_SB(sb)->s_blocks_per_group)
+		return 0;
+	return 1;
+}
+
+/*
+ * Add new free space region to tree. We insert the new node in both, the
+ * length sorted and the flex-bg offset sorted trees. Must be called with
+ * tree mutex held.
+ */
+int ext4_mb_frsp_add_region(struct super_block *sb, struct ext4_frsp_tree *tree,
+				ext4_grpblk_t offset, ext4_grpblk_t len)
+{
+	struct ext4_frsp_node *new_entry = NULL, *next_entry = NULL,
+				*prev_entry = NULL;
+	struct rb_node *left = NULL, *right = NULL;
+
+	new_entry = ext4_mb_frsp_alloc_node(sb);
+	if (!new_entry)
+		return -ENOMEM;
+
+	new_entry->frsp_offset = offset;
+	new_entry->frsp_len = len;
+
+	ext4_mb_frsp_insert_off(tree, new_entry);
+	/* try merge to left and right */
+	/* left */
+	left = rb_prev(&new_entry->frsp_node);
+	if (left) {
+		prev_entry = rb_entry(left, struct ext4_frsp_node, frsp_node);
+		if (ext4_mb_frsp_node_can_merge(sb, prev_entry, new_entry)) {
+			new_entry->frsp_offset = prev_entry->frsp_offset;
+			new_entry->frsp_len += prev_entry->frsp_len;
+			rb_erase_cached(left, &tree->frsp_offset_root);
+			rb_erase_cached(&prev_entry->frsp_len_node,
+						&tree->frsp_len_root);
+			ext4_mb_frsp_free_node(sb, prev_entry);
+		}
+	}
+
+	/* right */
+	right = rb_next(&new_entry->frsp_node);
+	if (right) {
+		next_entry = rb_entry(right, struct ext4_frsp_node, frsp_node);
+		if (ext4_mb_frsp_node_can_merge(sb, new_entry, next_entry)) {
+			new_entry->frsp_len += next_entry->frsp_len;
+			rb_erase_cached(right, &tree->frsp_offset_root);
+			rb_erase_cached(&next_entry->frsp_len_node,
+						&tree->frsp_len_root);
+			ext4_mb_frsp_free_node(sb, next_entry);
+		}
+	}
+	ext4_mb_frsp_insert_len(tree, new_entry);
+
+	return 0;
+}
+
+/*
+ * Free length number of blocks starting at block number block in free space
+ * tree.
+ */
+int ext4_mb_frsp_free_blocks(struct super_block *sb, ext4_group_t group,
+				ext4_grpblk_t block, ext4_grpblk_t length)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_frsp_tree *tree =
+		ext4_get_frsp_tree(sb, ext4_flex_group(sbi, group));
+	int err = 0;
+
+	mutex_lock(&tree->frsp_lock);
+	err = ext4_mb_frsp_add_region(sb, tree,
+			ext4_blkno_to_flex_offset(sb, block), length);
+	mutex_unlock(&tree->frsp_lock);
+
+	return err;
+}
+
+/*
+ * Create freespace tree from on-disk bitmap. Must be called with tree mutex
+ * held. Returns 0 on success, error otherwise.
+ */
+int ext4_mb_frsp_bb_to_tree(struct ext4_buddy *e4b, ext4_group_t group,
+			struct buffer_head *bh)
+{
+	struct super_block *sb = e4b->bd_sb;
+	int length = 0, bit = 0, next;
+	int end = EXT4_SB(sb)->s_blocks_per_group;
+	ext4_fsblk_t block;
+	int ret = 0;
+
+	/* find all unused blocks in bitmap, convert them to new tree node */
+	while (bit < end) {
+		bit = mb_find_next_zero_bit(bh->b_data, end, bit);
+		if (bit >= end)
+			break;
+
+		next = mb_find_next_bit(bh->b_data, end, bit);
+		length = next - bit;
+		block = ext4_group_first_block_no(sb, group) + bit;
+
+		ret = ext4_mb_frsp_add_region(sb, e4b->frsp_tree,
+			ext4_blkno_to_flex_offset(sb, block), length);
+		if (ret)
+			break;
+		bit = next + 1;
+	}
+
+	return ret;
+}
+
+/*
+ * Load one freespace_tree from disk. Assume holding mutex lock on tree.
+ */
+int ext4_mb_frsp_load(struct ext4_buddy *e4b)
+{
+	ext4_group_t ngroups, group_start, group_end, grp;
+	struct ext4_sb_info *sbi = EXT4_SB(e4b->bd_sb);
+	int i, ret = 0;
+	struct buffer_head **bh = NULL;
+
+	if (e4b->frsp_tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED)
+		return 0;
+
+	group_start = e4b->frsp_tree->frsp_index * ext4_flex_bg_size(sbi);
+	group_end = min(group_start + ext4_flex_bg_size(sbi),
+			ext4_get_groups_count(e4b->bd_sb)) - 1;
+	ngroups = group_end - group_start + 1;
+
+	bh = kcalloc(ngroups, sizeof(*bh), GFP_KERNEL);
+	if (!bh)
+		return -ENOMEM;
+	for (i = 0; i < ngroups; i++) {
+		grp = i + group_start;
+		bh[i] = ext4_read_block_bitmap_nowait(e4b->bd_sb, grp, false);
+		if (IS_ERR(bh[i])) {
+			ret = PTR_ERR(bh[i]);
+			goto out;
+		}
+	}
+	for (i = 0; i < ngroups; i++) {
+		grp = i + group_start;
+		if (!bitmap_uptodate(bh[i])) {
+			ret = ext4_wait_block_bitmap(e4b->bd_sb, grp, bh[i]);
+			if (ret)
+				goto out;
+		}
+		ret = ext4_mb_frsp_bb_to_tree(e4b, grp, bh[i]);
+		if (ret)
+			goto out;
+	}
+	e4b->frsp_tree->frsp_flags |= EXT4_MB_FRSP_FLAG_LOADED;
+out:
+	for (i = 0; i < ngroups; i++) {
+		if (!IS_ERR_OR_NULL(bh[i]))
+			put_bh(bh[i]);
+		if (!ret && IS_ERR(bh[i]))
+			ret = PTR_ERR(bh[i]);
+	}
+	kfree(bh);
+	return ret;
+
+}
+
+/*
+ * Determine if node with length len is better than what we have found until
+ * now. Return 1 if that is the case, 0 otherwise. If len is exact match,
+ * set *best = 1.
+ */
+static int ext4_mb_frsp_is_better(struct ext4_allocation_context *ac,
+					int len, int *best)
+{
+	struct ext4_tree_extent *btx = &ac->ac_b_tree_ex;
+	struct ext4_free_extent *gex = &ac->ac_g_ex;
+
+	if (best)
+		*best = 0;
+	if (len == gex->fe_len) {
+		if (best)
+			*best = 1;
+		return 1;
+	}
+	if (ac->ac_criteria == 0 && len < gex->fe_len)
+		return 0;
+	/*
+	 * See if the new node is cloer to goal length than
+	 * the best extent found so far
+	 */
+	if (btx->te_len < gex->fe_len && len > btx->te_len)
+		return 1;
+	if (len > gex->fe_len && len < btx->te_len)
+		return 1;
+	return 0;
+}
+
+/*
+ * check if we have scanned sufficient freespace candidates
+ * stop scanning if reached/exceeded s_max_to_scan
+ */
+static void ext4_mb_frsp_check_limits(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+
+	if (ac->ac_status == AC_STATUS_FOUND)
+		return;
+
+	/*
+	 * Exceeded max number of nodes to scan
+	 */
+	if (ac->ac_found > sbi->s_mb_max_to_scan &&
+			!(ac->ac_flags & EXT4_MB_HINT_FIRST))
+		ac->ac_status = AC_STATUS_BREAK;
+}
+
+/*
+ * Mark free space in selected tree node as used and update the tree.
+ * This must be called with tree lock.
+ */
+static void ext4_mb_frsp_use_best_found(struct ext4_allocation_context *ac,
+			ext4_group_t flex, struct ext4_frsp_node *selected)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_tree_extent *btx = &ac->ac_b_tree_ex;
+	struct ext4_free_extent *bex;
+	unsigned int group_no;
+	struct ext4_buddy e4b;
+
+	WARN_ON(ac->ac_status == AC_STATUS_FOUND);
+	btx->te_len = min(btx->te_len, ac->ac_g_ex.fe_len);
+	ac->ac_status = AC_STATUS_FOUND;
+
+	/* update ac best-found-extent */
+	bex = &ac->ac_b_ex;
+	group_no = (btx->te_flex * ext4_flex_bg_size(sbi)) +
+			(btx->te_flex_start / sbi->s_blocks_per_group);
+	bex->fe_start = btx->te_flex_start % sbi->s_blocks_per_group;
+	bex->fe_group = group_no;
+	bex->fe_len = btx->te_len;
+	bex->fe_node = selected;
+
+	/* Add free blocks to our tree */
+	ext4_mb_load_allocator(ac->ac_sb, group_no, &e4b,
+		EXT4_ALLOCATOR_FRSP_NOLOAD);
+	mb_mark_used(&e4b, bex);
+	ext4_mb_unload_allocator(&e4b);
+}
+/*
+ * The routine checks whether found tree node is good enough. If it is,
+ * then the tree node gets marked used and flag is set to the context
+ * to stop scanning.
+ *
+ * Otherwise, the tree node is compared with the previous found extent and
+ * if new one is better, then it's stored in the context.
+ *
+ * Later, the best found tree node will be used, if mballoc can't find good
+ * enough extent.
+ */
+static int ext4_mb_frsp_measure_node(struct ext4_allocation_context *ac,
+				int tree_idx, struct ext4_frsp_node *cur,
+				int goal)
+{
+	struct ext4_tree_extent *btx = &ac->ac_b_tree_ex;
+	ext4_fsblk_t start;
+	int best_found = 0, max;
+
+	WARN_ON(btx->te_len < 0);
+	WARN_ON(ac->ac_status != AC_STATUS_CONTINUE);
+
+	if (goal) {
+		start = ext4_group_first_block_no(ac->ac_sb,
+					ac->ac_g_ex.fe_group) +
+					ac->ac_g_ex.fe_start;
+		if (cur->frsp_offset > ext4_blkno_to_flex_offset(ac->ac_sb,
+						start))
+			return 0;
+		max = cur->frsp_offset + cur->frsp_len -
+			ext4_blkno_to_flex_offset(ac->ac_sb, start);
+		if (max >= ac->ac_g_ex.fe_len &&
+			ac->ac_g_ex.fe_len == EXT4_SB(ac->ac_sb)->s_stripe) {
+			if (do_div(start, EXT4_SB(ac->ac_sb)->s_stripe) != 0)
+				return 0;
+			best_found = 1;
+		} else if (max >= ac->ac_g_ex.fe_len) {
+			best_found = 1;
+		} else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
+			/*
+			 * Sometimes, caller may want to merge even small
+			 * number of blocks to an existing extent
+			 */
+			best_found = 1;
+
+		} else {
+			return 0;
+		}
+		ac->ac_found++;
+		goto use_extent;
+	}
+	ac->ac_found++;
+
+	/* The special case - take what you catch first */
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+		best_found = 1;
+		goto use_extent;
+	}
+
+	/*
+	 * Prefer not allocating blocks in first group in flex bg for data
+	 * blocks.
+	 */
+	if (unlikely((ac->ac_criteria < 2) &&
+			(ac->ac_flags & EXT4_MB_HINT_DATA) &&
+			(cur->frsp_offset < EXT4_BLOCKS_PER_GROUP(ac->ac_sb))))
+		return 1;
+
+
+	/* If this is first found extent, just store it in the context */
+	if (btx->te_len == 0)
+		goto use_extent;
+
+	if (ext4_mb_frsp_is_better(ac, cur->frsp_len, &best_found))
+		goto use_extent;
+
+	ext4_mb_frsp_check_limits(ac);
+	return 0;
+
+use_extent:
+	btx->te_flex = tree_idx;
+	if (goal) {
+		btx->te_flex_start = ext4_blkno_to_flex_offset(ac->ac_sb,
+								start);
+		btx->te_len = max;
+	} else {
+		btx->te_flex_start = cur->frsp_offset;
+		btx->te_len = cur->frsp_len;
+	}
+	if (best_found)
+		ext4_mb_frsp_use_best_found(ac, tree_idx, cur);
+	ext4_mb_frsp_check_limits(ac);
+	return 1;
+}
+
+/* Search by goal */
+static noinline_for_stack
+void ext4_mb_frsp_find_by_goal(struct ext4_allocation_context *ac)
+{
+	struct ext4_frsp_node *cur = NULL;
+	unsigned int tree_blk;
+	ext4_fsblk_t blk;
+	struct ext4_buddy e4b;
+	int ret;
+
+	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
+		return;
+
+	/* compute start node offset in tree */
+	blk = ext4_group_first_block_no(ac->ac_sb, ac->ac_g_ex.fe_group) +
+			ac->ac_g_ex.fe_start;
+	tree_blk = ext4_blkno_to_flex_offset(ac->ac_sb, blk);
+
+	ret = ext4_mb_load_allocator(ac->ac_sb, ac->ac_g_ex.fe_group, &e4b, 0);
+	if (ret)
+		return;
+
+	/* try goal block and its freespace_tree first */
+	mutex_lock(&e4b.frsp_tree->frsp_lock);
+	cur = ext4_mb_frsp_search_by_off(ac->ac_sb, e4b.frsp_tree, tree_blk);
+	if (cur && tree_blk < cur->frsp_offset + cur->frsp_len - 1)
+		ext4_mb_frsp_measure_node(ac, e4b.frsp_tree->frsp_index, cur,
+						1 /* searching by goal */);
+
+	mutex_unlock(&e4b.frsp_tree->frsp_lock);
+	ext4_mb_unload_allocator(&e4b);
+}
+
+static void ext4_mb_frsp_process(struct ext4_allocation_context *ac,
+				struct ext4_frsp_tree *tree)
+{
+	struct ext4_frsp_node *cur = NULL;
+	struct rb_node *node = NULL;
+	int ret;
+
+	node = rb_first_cached(&tree->frsp_len_root);
+	mb_debug(ac->ac_sb, "Processing tree index %d, flags = %x\n",
+			tree->frsp_index, tree->frsp_flags);
+	/*
+	 * In order to serve the "No data blocks in first group in a flexbg"
+	 * requirement, we cannot do a O(Log N) search here. TODO: it's possible
+	 * to that at CR=2, where that requirement doesn't apply.
+	 */
+	while (node && ac->ac_status == AC_STATUS_CONTINUE) {
+		cur = rb_entry(node, struct ext4_frsp_node, frsp_len_node);
+		if (ac->ac_criteria == 0 && cur->frsp_len < ac->ac_g_ex.fe_len)
+			return;
+		ret = ext4_mb_frsp_measure_node(ac, tree->frsp_index, cur,
+						0 /* searching by len */);
+		if (ret == 0)
+			return;
+		node = rb_next(node);
+	}
+}
+
+/* allocator for freespace_tree */
+static noinline_for_stack int
+ext4_mb_tree_allocator(struct ext4_allocation_context *ac)
+{
+	struct ext4_buddy e4b;
+	struct ext4_sb_info *sbi;
+	struct super_block *sb;
+	struct ext4_frsp_tree *tree = NULL;
+	struct ext4_frsp_node *cur = NULL;
+	struct ext4_tree_extent *btx = NULL;
+	int ret = 0, start_idx = 0, tree_idx, j;
+
+	sb = ac->ac_sb;
+	btx = &ac->ac_b_tree_ex;
+	sbi = EXT4_SB(sb);
+
+	start_idx = ext4_flex_group(sbi, ac->ac_g_ex.fe_group);
+	mb_debug(sb, "requested size %d\n", ac->ac_g_ex.fe_len);
+	/* First try searching from goal blk in start-idx-th freespace tree */
+	ext4_mb_frsp_find_by_goal(ac);
+	if (ac->ac_status == AC_STATUS_FOUND)
+		goto out;
+
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+		goto out;
+
+	ac->ac_criteria = 0;
+	ac->ac_groups_scanned = 0;
+repeat:
+
+	/* Loop through the rest of trees (flex_bg) */
+	for (j = start_idx;
+		(ac->ac_groups_scanned < sbi->s_mb_num_frsp_trees) &&
+			ac->ac_status == AC_STATUS_CONTINUE; j++) {
+		ac->ac_groups_scanned++;
+		tree_idx = (j % sbi->s_mb_num_frsp_trees);
+
+		ret = ext4_mb_load_allocator(sb,
+				tree_idx * ext4_flex_bg_size(sbi), &e4b, 0);
+		if (ret)
+			goto out;
+		mutex_lock(&e4b.frsp_tree->frsp_lock);
+		ext4_mb_frsp_process(ac, e4b.frsp_tree);
+		mutex_unlock(&e4b.frsp_tree->frsp_lock);
+		ext4_mb_unload_allocator(&e4b);
+	}
+
+	if (ac->ac_status != AC_STATUS_FOUND) {
+		if (ac->ac_criteria < 2) {
+			ac->ac_criteria++;
+			ac->ac_groups_scanned = 0;
+			mb_debug(sb, "Falling back to CR=%d", ac->ac_criteria);
+			goto repeat;
+		}
+		if (btx->te_len > 0 && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+			e4b.frsp_flags = 0;
+			ret = ext4_mb_load_allocator(sb, btx->te_flex *
+					ext4_flex_bg_size(sbi), &e4b, 0);
+			if (ret)
+				goto out;
+			tree = e4b.frsp_tree;
+			mutex_lock(&tree->frsp_lock);
+			cur = ext4_mb_frsp_search_by_off(sb, tree,
+					btx->te_flex_start);
+			if (cur) {
+				ext4_mb_frsp_use_best_found(ac, btx->te_flex,
+								cur);
+				mutex_unlock(&tree->frsp_lock);
+				ext4_mb_unload_allocator(&e4b);
+
+			} else {
+				/*
+				 * Someone else took this freespace node before
+				 * us. Reset the best-found tree extent, and
+				 * turn on FIRST HINT (greedy).
+				 */
+				mutex_unlock(&tree->frsp_lock);
+				ac->ac_b_tree_ex.te_flex_start = 0;
+				ac->ac_b_tree_ex.te_flex = 0;
+				ac->ac_b_tree_ex.te_len = 0;
+				ac->ac_status = AC_STATUS_CONTINUE;
+				ac->ac_flags |= EXT4_MB_HINT_FIRST;
+				ac->ac_groups_scanned = 0;
+				ext4_mb_unload_allocator(&e4b);
+				goto repeat;
+			}
+		}
+	}
+
+	ret = btx->te_len ? 0 : -ENOSPC;
+
+out:
+	return ret;
+}
+
+static void ext4_mb_frsp_init_tree(struct ext4_frsp_tree *tree, int index)
+{
+	tree->frsp_offset_root = RB_ROOT_CACHED;
+	tree->frsp_len_root = RB_ROOT_CACHED;
+	mutex_init(&(tree->frsp_lock));
+	tree->frsp_flags = 0;
+	tree->frsp_index = index;
+}
+
+int ext4_mb_init_freespace_trees(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct flex_groups *fg;
+	int i;
+
+	sbi->s_mb_num_frsp_trees =
+		ext4_num_grps_to_flexbg(sb, ext4_get_groups_count(sb));
+
+	for (i = 0; i < sbi->s_mb_num_frsp_trees; i++) {
+		fg = sbi_array_rcu_deref(sbi, s_flex_groups, i);
+		fg->frsp_tree = kzalloc(sizeof(struct ext4_frsp_tree),
+				GFP_KERNEL);
+		if (!fg->frsp_tree)
+			return -ENOMEM;
+		ext4_mb_frsp_init_tree(fg->frsp_tree, i);
+	}
+
+	return 0;
+}
+
+int ext4_mb_add_frsp_trees(struct super_block *sb, int ngroups)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int flex_bg_count, old_trees_count = sbi->s_mb_num_frsp_trees;
+	int i;
+
+	if (!ext4_mb_frsp_on(sb))
+		return 0;
+
+	flex_bg_count = ext4_num_grps_to_flexbg(sb, ngroups);
+	if (old_trees_count > 0)
+		ext4_mb_frsp_free_tree(sb, ext4_get_frsp_tree(sb,
+						old_trees_count - 1));
+
+	for (i = old_trees_count; i < flex_bg_count; i++) {
+		struct flex_groups *fg;
+
+		fg = sbi_array_rcu_deref(sbi, s_flex_groups, i);
+		fg->frsp_tree = kzalloc(sizeof(*fg->frsp_tree), GFP_KERNEL);
+		if (!fg->frsp_tree)
+			return -ENOMEM;
+		ext4_mb_frsp_init_tree(fg->frsp_tree, i);
+	}
+	sbi->s_mb_num_frsp_trees = flex_bg_count;
+
+	return 0;
+}
+
 /*
  * Divide blocks started from @first with length @len into
  * smaller chunks with power of 2 blocks.
@@ -1042,6 +1802,9 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 	e4b->bd_buddy_page = NULL;
 	e4b->bd_bitmap_page = NULL;
 
+	if (ext4_mb_frsp_on(sb))
+		return 0;
+
 	blocks_per_page = PAGE_SIZE / sb->s_blocksize;
 	/*
 	 * the buddy cache inode stores the block bitmap
@@ -1099,6 +1862,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
 	struct page *page;
 	int ret = 0;
 
+	WARN_ON(ext4_mb_frsp_on(sb));
 	might_sleep();
 	mb_debug(sb, "init group %u\n", group);
 	this_grp = ext4_get_group_info(sb, group);
@@ -1156,6 +1920,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
  * block group lock of all groups for this page; do not hold the BG lock when
  * calling this routine!
+ *
+ * For freespace trees, do not hold tree mutex while calling this routine.
+ * It is okay to hold that mutex only if EXT4_ALLOCATOR_FRSP_NOLOAD flag is
+ * set in e4b->frsp_flags. If that flag is set, this function doesn't try
+ * to load an unloaded tree.
  */
 static noinline_for_stack int
 ext4_mb_load_allocator_gfp(struct super_block *sb, ext4_group_t group,
@@ -1166,7 +1935,7 @@ ext4_mb_load_allocator_gfp(struct super_block *sb, ext4_group_t group,
 	int pnum;
 	int poff;
 	struct page *page;
-	int ret;
+	int ret = 0;
 	struct ext4_group_info *grp;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct inode *inode = sbi->s_buddy_cache;
@@ -1183,6 +1952,22 @@ ext4_mb_load_allocator_gfp(struct super_block *sb, ext4_group_t group,
 	e4b->bd_group = group;
 	e4b->bd_buddy_page = NULL;
 	e4b->bd_bitmap_page = NULL;
+	if (ext4_mb_frsp_on(sb)) {
+		e4b->frsp_tree = ext4_get_frsp_tree(sb,
+					ext4_flex_group(sbi, group));
+		e4b->frsp_flags = flags;
+		if (flags & EXT4_ALLOCATOR_FRSP_NOLOAD)
+			return 0;
+
+		mutex_lock(&e4b->frsp_tree->frsp_lock);
+		if (e4b->frsp_tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED) {
+			mutex_unlock(&e4b->frsp_tree->frsp_lock);
+			return 0;
+		}
+		ret = ext4_mb_frsp_load(e4b);
+		mutex_unlock(&e4b->frsp_tree->frsp_lock);
+		return ret;
+	}
 
 	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
 		/*
@@ -1305,6 +2090,8 @@ static int ext4_mb_load_allocator(struct super_block *sb, ext4_group_t group,
 
 static void ext4_mb_unload_allocator(struct ext4_buddy *e4b)
 {
+	if (ext4_mb_frsp_on(e4b->bd_sb))
+		return;
 	if (e4b->bd_bitmap_page)
 		put_page(e4b->bd_bitmap_page);
 	if (e4b->bd_buddy_page)
@@ -1497,6 +2284,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 	if (first < e4b->bd_info->bb_first_free)
 		e4b->bd_info->bb_first_free = first;
 
+	if (ext4_mb_frsp_on(sb)) {
+		ext4_grpblk_t first_blk = EXT4_C2B(EXT4_SB(sb), first) +
+			ext4_group_first_block_no(sb, e4b->bd_group);
+
+		ext4_unlock_group(sb, e4b->bd_group);
+		ext4_mb_frsp_free_blocks(sb, e4b->bd_group, first_blk, count);
+		ext4_lock_group(sb, e4b->bd_group);
+		return;
+	}
+
 	/* access memory sequentially: check left neighbour,
 	 * clear range and then check right neighbour
 	 */
@@ -1563,6 +2360,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
 	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 	BUG_ON(ex == NULL);
 
+	if (ext4_mb_frsp_on(e4b->bd_sb))
+		return 0;
+
 	buddy = mb_find_buddy(e4b, 0, &max);
 	BUG_ON(buddy == NULL);
 	BUG_ON(block >= max);
@@ -1627,17 +2427,74 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 	unsigned ret = 0;
 	int len0 = len;
 	void *buddy;
+	ext4_grpblk_t flex_offset;
 
 	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
 	BUG_ON(e4b->bd_group != ex->fe_group);
+
+	e4b->bd_info->bb_free -= len;
+	if (e4b->bd_info->bb_first_free == start)
+		e4b->bd_info->bb_first_free += len;
+
+	if (ext4_mb_frsp_on(e4b->bd_sb)) {
+		struct ext4_frsp_node *new;
+
+		flex_offset = ext4_blkno_to_flex_offset(e4b->bd_sb,
+					ext4_group_first_block_no(e4b->bd_sb,
+						e4b->bd_group) + ex->fe_start);
+		if (!ex->fe_node) {
+			ex->fe_node = ext4_mb_frsp_search_by_off(e4b->bd_sb,
+					e4b->frsp_tree, flex_offset);
+			if (!ex->fe_node)
+				return 0;
+		}
+		/*
+		 * Remove the node from the trees before we modify these since
+		 * we will change the length and / or offset of this node.
+		 */
+		rb_erase_cached(&ex->fe_node->frsp_node,
+					&e4b->frsp_tree->frsp_offset_root);
+		rb_erase_cached(&ex->fe_node->frsp_len_node,
+					&e4b->frsp_tree->frsp_len_root);
+		RB_CLEAR_NODE(&ex->fe_node->frsp_node);
+		RB_CLEAR_NODE(&ex->fe_node->frsp_len_node);
+		if (flex_offset > ex->fe_node->frsp_offset) {
+			if (flex_offset + ex->fe_len !=
+				ex->fe_node->frsp_offset +
+				ex->fe_node->frsp_len) {
+				/* Need to split the node */
+				new = ext4_mb_frsp_alloc_node(e4b->bd_sb);
+				if (!new)
+					return -ENOMEM;
+				new->frsp_offset = flex_offset + ex->fe_len;
+				new->frsp_len = (ex->fe_node->frsp_offset +
+					ex->fe_node->frsp_len) -
+					new->frsp_offset;
+				ext4_mb_frsp_insert_len(e4b->frsp_tree, new);
+				ext4_mb_frsp_insert_off(e4b->frsp_tree, new);
+			}
+			ex->fe_node->frsp_len = flex_offset -
+						ex->fe_node->frsp_offset;
+		} else if (ex->fe_len < ex->fe_node->frsp_len) {
+			/* used only a part: update node */
+			ex->fe_node->frsp_offset += ex->fe_len;
+			ex->fe_node->frsp_len -= ex->fe_len;
+		} else {
+			ext4_mb_frsp_free_node(e4b->bd_sb, ex->fe_node);
+			return 0;
+		}
+
+		ext4_mb_frsp_insert_len(e4b->frsp_tree, ex->fe_node);
+		ext4_mb_frsp_insert_off(e4b->frsp_tree, ex->fe_node);
+
+		return 0;
+	}
+
 	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 	mb_check_buddy(e4b);
 	mb_mark_used_double(e4b, start, len);
 
 	this_cpu_inc(discard_pa_seq);
-	e4b->bd_info->bb_free -= len;
-	if (e4b->bd_info->bb_first_free == start)
-		e4b->bd_info->bb_first_free += len;
 
 	/* let's maintain fragments counter */
 	if (start != 0)
@@ -2778,6 +3635,13 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	rcu_read_lock();
 	kvfree(rcu_dereference(sbi->s_group_info));
 	rcu_read_unlock();
+	if (ext4_mb_frsp_on(sb)) {
+		for (i = 0; i < sbi->s_mb_num_frsp_trees; i++) {
+			struct ext4_frsp_tree *tree = ext4_get_frsp_tree(sb, i);
+
+			kfree(tree);
+		}
+	}
 	return -ENOMEM;
 }
 
@@ -2874,6 +3738,13 @@ int ext4_mb_init(struct super_block *sb)
 		i++;
 	} while (i <= sb->s_blocksize_bits + 1);
 
+	/* init for freespace trees */
+	if (ext4_mb_frsp_on(sb)) {
+		ret = ext4_mb_init_freespace_trees(sb);
+		if (ret)
+			goto out;
+	}
+
 	spin_lock_init(&sbi->s_md_lock);
 	spin_lock_init(&sbi->s_bal_lock);
 	sbi->s_mb_free_pending = 0;
@@ -2940,6 +3811,13 @@ int ext4_mb_init(struct super_block *sb)
 	sbi->s_mb_offsets = NULL;
 	kfree(sbi->s_mb_maxs);
 	sbi->s_mb_maxs = NULL;
+	if (ext4_mb_frsp_on(sb)) {
+		for (i = 0; i < sbi->s_mb_num_frsp_trees; i++) {
+			struct ext4_frsp_tree *tree = ext4_get_frsp_tree(sb, i);
+
+			kfree(tree);
+		}
+	}
 	return ret;
 }
 
@@ -3084,8 +3962,10 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 		/* No more items in the per group rb tree
 		 * balance refcounts from ext4_mb_free_metadata()
 		 */
-		put_page(e4b.bd_buddy_page);
-		put_page(e4b.bd_bitmap_page);
+		if (!ext4_mb_frsp_on(sb)) {
+			put_page(e4b.bd_buddy_page);
+			put_page(e4b.bd_bitmap_page);
+		}
 	}
 	ext4_unlock_group(sb, entry->efd_group);
 	kmem_cache_free(ext4_free_data_cachep, entry);
@@ -3163,9 +4043,14 @@ int __init ext4_init_mballoc(void)
 					   SLAB_RECLAIM_ACCOUNT);
 	if (ext4_free_data_cachep == NULL)
 		goto out_ac_free;
+	ext4_freespace_node_cachep = KMEM_CACHE(ext4_frsp_node,
+						SLAB_RECLAIM_ACCOUNT);
+	if (ext4_freespace_node_cachep == NULL)
+		goto out_frsp_free;
 
 	return 0;
-
+out_frsp_free:
+	kmem_cache_destroy(ext4_free_data_cachep);
 out_ac_free:
 	kmem_cache_destroy(ext4_ac_cachep);
 out_pa_free:
@@ -3184,6 +4069,7 @@ void ext4_exit_mballoc(void)
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_free_data_cachep);
+	kmem_cache_destroy(ext4_freespace_node_cachep);
 	ext4_groupinfo_destroy_slabs();
 }
 
@@ -3686,6 +4572,9 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
 		return false;
 
+	if (ext4_mb_frsp_on(ac->ac_sb))
+		return 0;
+
 	/* first, try per-file preallocation */
 	rcu_read_lock();
 	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
@@ -4362,6 +5251,8 @@ static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
 	struct ext4_prealloc_space *pa;
 
 	BUG_ON(ext4_pspace_cachep == NULL);
+	if (ext4_mb_frsp_on(ac->ac_sb))
+		return 0;
 	pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
 	if (!pa)
 		return -ENOMEM;
@@ -4374,6 +5265,8 @@ static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
 {
 	struct ext4_prealloc_space *pa = ac->ac_pa;
 
+	if (ext4_mb_frsp_on(ac->ac_sb))
+		return;
 	BUG_ON(!pa);
 	ac->ac_pa = NULL;
 	WARN_ON(!atomic_dec_and_test(&pa->pa_count));
@@ -4547,6 +5440,13 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	ac->ac_o_ex.fe_len = len;
 	ac->ac_g_ex = ac->ac_o_ex;
 	ac->ac_flags = ar->flags;
+	if (ext4_mb_frsp_on(sb))
+		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
+
+	/* set up best-found tree node */
+	ac->ac_b_tree_ex.te_flex_start = 0;
+	ac->ac_b_tree_ex.te_flex = 0;
+	ac->ac_b_tree_ex.te_len = 0;
 
 	/* we have to define context: we'll we work with a file or
 	 * locality group. this is a policy, actually */
@@ -4867,7 +5767,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			goto errout;
 repeat:
 		/* allocate space in core */
-		*errp = ext4_mb_regular_allocator(ac);
+		if (ext4_mb_frsp_on(sb))
+			*errp = ext4_mb_tree_allocator(ac);
+		else
+			*errp = ext4_mb_regular_allocator(ac);
+
 		/*
 		 * pa allocated above is added to grp->bb_prealloc_list only
 		 * when we were able to allocate some block i.e. when
@@ -4880,8 +5784,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			ext4_discard_allocated_blocks(ac);
 			goto errout;
 		}
-		if (ac->ac_status == AC_STATUS_FOUND &&
-			ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
+		/*
+		 * Freespace trees should never return more than what was asked
+		 * for.
+		 */
+		if (!ext4_mb_frsp_on(sb) &&
+			(ac->ac_status == AC_STATUS_FOUND &&
+			ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len))
 			ext4_mb_pa_free(ac);
 	}
 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
@@ -4972,13 +5881,12 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	struct rb_node *parent = NULL, *new_node;
 
 	BUG_ON(!ext4_handle_valid(handle));
-	BUG_ON(e4b->bd_bitmap_page == NULL);
-	BUG_ON(e4b->bd_buddy_page == NULL);
+	BUG_ON(!ext4_mb_frsp_on(sb) && e4b->bd_bitmap_page == NULL);
 
 	new_node = &new_entry->efd_node;
 	cluster = new_entry->efd_start_cluster;
 
-	if (!*n) {
+	if (!*n && !ext4_mb_frsp_on(sb)) {
 		/* first free block exent. We need to
 		   protect buddy cache from being freed,
 		 * otherwise we'll refresh it from
@@ -5457,6 +6365,7 @@ __acquires(bitlock)
 	ex.fe_start = start;
 	ex.fe_group = group;
 	ex.fe_len = count;
+	ex.fe_node = NULL;
 
 	/*
 	 * Mark blocks used, so no one can reuse them while
@@ -5496,6 +6405,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 	void *bitmap;
 	ext4_grpblk_t next, count = 0, free_count = 0;
 	struct ext4_buddy e4b;
+	struct buffer_head *bh = NULL;
 	int ret = 0;
 
 	trace_ext4_trim_all_free(sb, group, start, max);
@@ -5506,7 +6416,17 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 			     ret, group);
 		return ret;
 	}
-	bitmap = e4b.bd_bitmap;
+
+	if (ext4_mb_frsp_on(sb)) {
+		bh = ext4_read_block_bitmap(sb, group);
+		if (IS_ERR(bh)) {
+			ret = PTR_ERR(bh);
+			goto out;
+		}
+		bitmap = bh->b_data;
+	} else {
+		bitmap = e4b.bd_bitmap;
+	}
 
 	ext4_lock_group(sb, group);
 	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
@@ -5553,6 +6473,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
 		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
 	}
 out:
+	if (!IS_ERR_OR_NULL(bh))
+		brelse(bh);
 	ext4_unlock_group(sb, group);
 	ext4_mb_unload_allocator(&e4b);
 
@@ -5613,7 +6535,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	for (group = first_group; group <= last_group; group++) {
 		grp = ext4_get_group_info(sb, group);
 		/* We only do this if the grp has never been initialized */
-		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		if (unlikely(!ext4_mb_frsp_on(sb) && EXT4_MB_GRP_NEED_INIT(grp))) {
 			ret = ext4_mb_init_group(sb, group, GFP_NOFS);
 			if (ret)
 				break;
@@ -5667,16 +6589,25 @@ ext4_mballoc_query_range(
 	ext4_grpblk_t			next;
 	struct ext4_buddy		e4b;
 	int				error;
+	struct buffer_head		*bh = NULL;
 
-	error = ext4_mb_load_allocator(sb, group, &e4b, 0);
-	if (error)
-		return error;
-	bitmap = e4b.bd_bitmap;
-
+	if (ext4_mb_frsp_on(sb)) {
+		bh = ext4_read_block_bitmap(sb, group);
+		if (IS_ERR(bh)) {
+			error = PTR_ERR(bh);
+			goto out_unload;
+		}
+		bitmap = bh->b_data;
+	} else {
+		error = ext4_mb_load_allocator(sb, group, &e4b, 0);
+		if (error)
+			return error;
+		bitmap = e4b.bd_bitmap;
+	}
 	ext4_lock_group(sb, group);
-
-	start = (e4b.bd_info->bb_first_free > start) ?
-		e4b.bd_info->bb_first_free : start;
+	if (!ext4_mb_frsp_on(sb))
+		start = (e4b.bd_info->bb_first_free > start) ?
+			e4b.bd_info->bb_first_free : start;
 	if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
 		end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
 
@@ -5685,19 +6616,20 @@ ext4_mballoc_query_range(
 		if (start > end)
 			break;
 		next = mb_find_next_bit(bitmap, end + 1, start);
-
 		ext4_unlock_group(sb, group);
 		error = formatter(sb, group, start, next - start, priv);
 		if (error)
 			goto out_unload;
 		ext4_lock_group(sb, group);
-
 		start = next + 1;
 	}
 
 	ext4_unlock_group(sb, group);
 out_unload:
-	ext4_mb_unload_allocator(&e4b);
+	if (!IS_ERR_OR_NULL(bh))
+		brelse(bh);
+	if (!ext4_mb_frsp_on(sb))
+		ext4_mb_unload_allocator(&e4b);
 
 	return error;
 }
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 6b4d17c2935d..32b9ee452de7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -74,6 +74,20 @@
 #define MB_DEFAULT_GROUP_PREALLOC	512
 
 
+/*
+ * Struct for tree node in freespace_tree
+ */
+struct ext4_frsp_node {
+	ext4_grpblk_t frsp_offset;	/* Start block offset inside
+					 * current flexible group
+					 */
+	ext4_grpblk_t frsp_len;		/*
+					 * Length of the free space in
+					 * number of clusters
+					 */
+	struct rb_node frsp_node;
+	struct rb_node frsp_len_node;
+};
 struct ext4_free_data {
 	/* this links the free block information from sb_info */
 	struct list_head		efd_list;
@@ -121,6 +135,7 @@ struct ext4_free_extent {
 	ext4_grpblk_t fe_start;	/* In cluster units */
 	ext4_group_t fe_group;
 	ext4_grpblk_t fe_len;	/* In cluster units */
+	struct ext4_frsp_node *fe_node;
 };
 
 /*
@@ -141,7 +156,14 @@ struct ext4_locality_group {
 	spinlock_t		lg_prealloc_lock;
 };
 
+struct ext4_tree_extent {
+	ext4_group_t te_flex;		/* flex_bg index (tree index) */
+	ext4_grpblk_t te_flex_start;	/* block offset w.r.t flex bg */
+	ext4_grpblk_t te_len;		/* length */
+};
+
 struct ext4_allocation_context {
+	__u32	ac_id;
 	struct inode *ac_inode;
 	struct super_block *ac_sb;
 
@@ -154,8 +176,16 @@ struct ext4_allocation_context {
 	/* the best found extent */
 	struct ext4_free_extent ac_b_ex;
 
-	/* copy of the best found extent taken before preallocation efforts */
-	struct ext4_free_extent ac_f_ex;
+	/* With freespace trees, we don't use preallocation anymore. */
+	union {
+		/*
+		 * copy of the best found extent taken before
+		 * preallocation efforts
+		 */
+		struct ext4_free_extent ac_f_ex;
+		/* the best found tree extent */
+		struct ext4_tree_extent ac_b_tree_ex;
+	};
 
 	__u16 ac_groups_scanned;
 	__u16 ac_found;
@@ -177,14 +207,31 @@ struct ext4_allocation_context {
 #define AC_STATUS_FOUND		2
 #define AC_STATUS_BREAK		3
 
+/*
+ * Freespace tree flags
+ */
+#define EXT4_ALLOCATOR_FRSP_NOLOAD		0x0001	/*
+							 * Don't load freespace
+							 * tree, if it's not
+							 * in memory.
+							 */
+
 struct ext4_buddy {
-	struct page *bd_buddy_page;
-	void *bd_buddy;
-	struct page *bd_bitmap_page;
-	void *bd_bitmap;
+	union {
+		struct {
+			struct page *bd_buddy_page;
+			void *bd_buddy;
+			struct page *bd_bitmap_page;
+			void *bd_bitmap;
+			__u16 bd_blkbits;
+		};
+		struct {
+			struct ext4_frsp_tree *frsp_tree;
+			__u32 frsp_flags;
+		};
+	};
 	struct ext4_group_info *bd_info;
 	struct super_block *bd_sb;
-	__u16 bd_blkbits;
 	ext4_group_t bd_group;
 };
 
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a50b51270ea9..6a0e1fc18e95 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1679,6 +1679,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	if (err)
 		goto out;
 
+	err = ext4_mb_add_frsp_trees(sb, input->group + 1);
+	if (err)
+		goto out;
+
 	flex_gd.count = 1;
 	flex_gd.groups = input;
 	flex_gd.bg_flags = &bg_flags;
@@ -2051,6 +2055,10 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
 	if (err)
 		goto out;
 
+	err = ext4_mb_add_frsp_trees(sb, n_group + 1);
+	if (err)
+		goto out;
+
 	flex_gd = alloc_flex_gd(flexbg_size);
 	if (flex_gd == NULL) {
 		err = -ENOMEM;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 98aa12602b68..2f4b7061365f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1521,7 +1521,7 @@ enum {
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
 	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
-	Opt_prefetch_block_bitmaps,
+	Opt_prefetch_block_bitmaps, Opt_freespace_tree,
 };
 
 static const match_table_t tokens = {
@@ -1608,6 +1608,7 @@ static const match_table_t tokens = {
 	{Opt_init_itable, "init_itable=%u"},
 	{Opt_init_itable, "init_itable"},
 	{Opt_noinit_itable, "noinit_itable"},
+	{Opt_freespace_tree, "freespace_tree"},
 	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
 	{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
 	{Opt_test_dummy_encryption, "test_dummy_encryption"},
@@ -1834,6 +1835,8 @@ static const struct mount_opts {
 	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
 	{Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
 	 MOPT_SET},
+	{Opt_freespace_tree, EXT4_MOUNT2_FREESPACE_TREE,
+	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
 	{Opt_err, 0, 0}
 };
 
@@ -4740,12 +4743,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount4a;
 	}
 
+	if (ext4_has_feature_flex_bg(sb))
+		if (!ext4_fill_flex_info(sb)) {
+			ext4_msg(sb, KERN_ERR,
+			       "unable to initialize flex_bg meta info!");
+			goto failed_mount5;
+		}
+
 	ext4_ext_init(sb);
 	err = ext4_mb_init(sb);
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
 			 err);
-		goto failed_mount5;
+		goto failed_mount6;
 	}
 
 	block = ext4_count_free_clusters(sb);
@@ -4775,14 +4785,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount6;
 	}
 
-	if (ext4_has_feature_flex_bg(sb))
-		if (!ext4_fill_flex_info(sb)) {
-			ext4_msg(sb, KERN_ERR,
-			       "unable to initialize "
-			       "flex_bg meta info!");
-			goto failed_mount6;
-		}
-
 	err = ext4_register_li_request(sb, first_not_zeroed);
 	if (err)
 		goto failed_mount6;
@@ -4856,7 +4858,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	ext4_unregister_li_request(sb);
 failed_mount6:
 	ext4_mb_release(sb);
+	percpu_counter_destroy(&sbi->s_freeclusters_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+	percpu_free_rwsem(&sbi->s_writepages_rwsem);
 	rcu_read_lock();
+
+failed_mount5:
 	flex_groups = rcu_dereference(sbi->s_flex_groups);
 	if (flex_groups) {
 		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
@@ -4864,12 +4873,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		kvfree(flex_groups);
 	}
 	rcu_read_unlock();
-	percpu_counter_destroy(&sbi->s_freeclusters_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
-	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
-	percpu_free_rwsem(&sbi->s_writepages_rwsem);
-failed_mount5:
 	ext4_ext_release(sb);
 	ext4_release_system_zone(sb);
 failed_mount4a:
-- 
2.28.0.220.ged08abb693-goog

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ