Message-Id: <20200821015523.1698374-8-harshads@google.com>
Date: Thu, 20 Aug 2020 18:55:21 -0700
From: Harshad Shirwadkar <harshadshirwadkar@...il.com>
To: linux-ext4@...r.kernel.org
Cc: tytso@....edu, lyx1209@...il.com,
Harshad Shirwadkar <harshadshirwadkar@...il.com>
Subject: [RFC PATCH v2 7/9] ext4: add LRU eviction for free space trees
From: Harshad Shirwadkar <harshadshirwadkar@...il.com>
This patch adds an LRU eviction policy for freespace trees. To avoid
contention on a single LRU lock, the LRU scheme is implemented as two
lists - an active list and an inactive list. Trees on the active list
are not moved within the list, which avoids the need for a lock on
that path. Trees on the inactive list are promoted to the active list
only if they are accessed twice within a short interval, which
prevents outliers from entering the active list.
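As a quick illustration of the promotion rule described above, here is a
small stand-alone userspace sketch (this is not the kernel code in the
patch below; the names and the fixed threshold are made up for the
example):

	/*
	 * Minimal sketch of the two-list idea: a tree on the inactive list
	 * is promoted only when the active list has room or when two
	 * accesses land within ACTIVE_THRESHOLD of each other; a tree that
	 * is already active is left alone, so that path needs no lock.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	#define ACTIVE_THRESHOLD 100	/* "jiffies" between hits needed to promote */

	struct tree {
		bool on_active_list;
		unsigned long last_access;	/* 0 means never accessed */
	};

	static bool should_promote(const struct tree *t, unsigned long now,
				   bool active_list_full)
	{
		if (t->on_active_list)
			return false;		/* already active, nothing to do */
		if (!t->last_access)
			return true;		/* first access */
		if (!active_list_full)
			return true;		/* room left on the active list */
		/* otherwise require two hits within the threshold */
		return now - t->last_access <= ACTIVE_THRESHOLD;
	}

	int main(void)
	{
		struct tree t = { .on_active_list = false, .last_access = 0 };

		printf("%d\n", should_promote(&t, 1000, true));	/* 1: first access */
		t.last_access = 1000;
		printf("%d\n", should_promote(&t, 2000, true));	/* 0: hit too late */
		t.last_access = 2000;
		printf("%d\n", should_promote(&t, 2050, true));	/* 1: second hit in time */
		return 0;
	}

In the patch itself the "active list is full" test is expressed in terms
of fragment counts (frsp_active_fragments vs.
EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS) rather than a simple flag.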
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@...il.com>
---
fs/ext4/ext4.h | 46 +++++++++++++
fs/ext4/mballoc.c | 164 ++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 197 insertions(+), 13 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 93bf2fe35cf1..64d0dbbcd517 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1454,6 +1454,10 @@ struct ext4_frsp_tree {
*/
struct list_head frsp_list_node;
struct rb_node frsp_len_node;
+ atomic_t frsp_fragments;
+ struct list_head frsp_lru_active_node;
+ unsigned long frsp_last_access;
+ atomic_t frsp_ref;
};
/* Freespace tree flags */
@@ -1468,6 +1472,47 @@ struct ext4_frsp_tree {
#define EXT4_MB_FRSP_DEFAULT_CACHE_AGGRESSION 0x1
#define EXT4_MB_FRSP_MAX_CACHE_AGGRESSION 0x3
+/* LRU management for free space trees */
+struct ext4_mb_frsp_lru {
+ rwlock_t frsp_lru_lock;
+ atomic_t frsp_active_fragments; /* Current #of fragments in
+ * the active queue
+ */
+ u32 frsp_max_active_fragments;
+ /*
+	 * List of active trees. Trees at the tail are the oldest in the active set.
+ */
+ struct list_head frsp_lru_active;
+ /*
+	 * List of inactive but still loaded trees.
+ */
+ struct list_head frsp_lru_inactive;
+ struct super_block *frsp_lru_sb;
+};
+
+/*
+ * Minimum memory needed for our allocator. The pathological worst case for
+ * freespace trees is when every other block is allocated. In this case,
+ * the allocator will end up storing an extent node of length 1 for each free
+ * block. We need to make sure that the minimum memory available for the
+	 * allocator is at least as much as is needed for one worst-case tree. This
+	 * ensures that we can keep at least one tree in memory and avoid thrashing.
+ */
+#define EXT4_MB_FRSP_MEM_MIN(sb) \
+ ((sizeof(struct ext4_frsp_node) * ext4_flex_bg_size(EXT4_SB(sb))) \
+ * (EXT4_BLOCKS_PER_GROUP(sb) / 2))
+
+/* Half of the total memory available is allowed for active list */
+#define EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb) \
+ ((EXT4_SB(sb)->s_mb_frsp_mem_limit / \
+ (2 * sizeof(struct ext4_frsp_node))))
+
+/*
+ * Maximum number of jiffies allowed between two successive hits on a freespace
+ * tree before we move it to the "active" queue.
+ */
+#define EXT4_MB_FRSP_ACTIVE_THRESHOLD_JIFFIES 100
+
/*
* fourth extended-fs super-block data in memory
*/
@@ -1615,6 +1660,7 @@ struct ext4_sb_info {
u32 s_mb_frsp_cache_aggression;
atomic_t s_mb_num_fragments;
u32 s_mb_frsp_mem_limit;
+ struct ext4_mb_frsp_lru s_mb_frsp_lru;
/* workqueue for reserved extent conversions (buffered io) */
struct workqueue_struct *rsv_conversion_wq;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b28b7fb0506e..aea0eb8d28da 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -866,10 +866,12 @@ void ext4_mb_frsp_print_tree_len(struct super_block *sb,
}
#endif
-static struct ext4_frsp_node *ext4_mb_frsp_alloc_node(struct super_block *sb)
+static struct ext4_frsp_node *ext4_mb_frsp_alloc_node(struct super_block *sb,
+ struct ext4_frsp_tree *tree)
{
struct ext4_frsp_node *node;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_mb_frsp_lru *lru = &EXT4_SB(sb)->s_mb_frsp_lru;
node = kmem_cache_alloc(ext4_freespace_node_cachep, GFP_NOFS);
if (!node)
@@ -879,6 +881,11 @@ static struct ext4_frsp_node *ext4_mb_frsp_alloc_node(struct super_block *sb)
RB_CLEAR_NODE(&node->frsp_len_node);
atomic_inc(&sbi->s_mb_num_fragments);
+ atomic_inc(&tree->frsp_fragments);
+ read_lock(&lru->frsp_lru_lock);
+ if (!list_empty(&tree->frsp_lru_active_node))
+ atomic_inc(&lru->frsp_active_fragments);
+ read_unlock(&lru->frsp_lru_lock);
if (sbi->s_mb_frsp_mem_limit &&
atomic_read(&sbi->s_mb_num_fragments) >
@@ -892,12 +899,18 @@ static struct ext4_frsp_node *ext4_mb_frsp_alloc_node(struct super_block *sb)
}
static void ext4_mb_frsp_free_node(struct super_block *sb,
- struct ext4_frsp_node *node)
+ struct ext4_frsp_tree *tree, struct ext4_frsp_node *node)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_mb_frsp_lru *lru = &EXT4_SB(sb)->s_mb_frsp_lru;
kmem_cache_free(ext4_freespace_node_cachep, node);
atomic_dec(&sbi->s_mb_num_fragments);
+ atomic_dec(&tree->frsp_fragments);
+ read_lock(&lru->frsp_lru_lock);
+ if (!list_empty(&tree->frsp_lru_active_node))
+ atomic_dec(&lru->frsp_active_fragments);
+ read_unlock(&lru->frsp_lru_lock);
if (!sbi->s_mb_frsp_mem_limit ||
atomic_read(&sbi->s_mb_num_fragments) <
@@ -915,6 +928,8 @@ void ext4_mb_frsp_free_tree(struct super_block *sb, struct ext4_frsp_tree *tree)
mutex_lock(&tree->frsp_lock);
if (!(tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED))
goto out;
+ if (atomic_read(&tree->frsp_ref))
+ goto out;
node = rb_first_cached(&tree->frsp_offset_root);
while (node) {
@@ -922,7 +937,7 @@ void ext4_mb_frsp_free_tree(struct super_block *sb, struct ext4_frsp_tree *tree)
rb_erase_cached(node, &tree->frsp_offset_root);
rb_erase_cached(&frsp_node->frsp_len_node,
&tree->frsp_len_root);
- ext4_mb_frsp_free_node(sb, frsp_node);
+ ext4_mb_frsp_free_node(sb, tree, frsp_node);
node = rb_first_cached(&tree->frsp_offset_root);
}
tree->frsp_flags &= ~EXT4_MB_FRSP_FLAG_LOADED;
@@ -985,7 +1000,7 @@ int ext4_mb_frsp_add_region(struct super_block *sb, struct ext4_frsp_tree *tree,
*prev_entry = NULL;
struct rb_node *left = NULL, *right = NULL;
- new_entry = ext4_mb_frsp_alloc_node(sb);
+ new_entry = ext4_mb_frsp_alloc_node(sb, tree);
if (!new_entry)
return -ENOMEM;
@@ -1004,7 +1019,7 @@ int ext4_mb_frsp_add_region(struct super_block *sb, struct ext4_frsp_tree *tree,
rb_erase_cached(left, &tree->frsp_offset_root);
rb_erase_cached(&prev_entry->frsp_len_node,
&tree->frsp_len_root);
- ext4_mb_frsp_free_node(sb, prev_entry);
+ ext4_mb_frsp_free_node(sb, tree, prev_entry);
}
}
@@ -1017,7 +1032,7 @@ int ext4_mb_frsp_add_region(struct super_block *sb, struct ext4_frsp_tree *tree,
rb_erase_cached(right, &tree->frsp_offset_root);
rb_erase_cached(&next_entry->frsp_len_node,
&tree->frsp_len_root);
- ext4_mb_frsp_free_node(sb, next_entry);
+ ext4_mb_frsp_free_node(sb, tree, next_entry);
}
}
ext4_mb_frsp_insert_len(tree, new_entry);
@@ -1601,6 +1616,10 @@ static void ext4_mb_frsp_init_tree(struct ext4_frsp_tree *tree, int index)
tree->frsp_flags = 0;
tree->frsp_index = index;
INIT_LIST_HEAD(&tree->frsp_list_node);
+ atomic_set(&tree->frsp_fragments, 0);
+ tree->frsp_last_access = 0;
+ INIT_LIST_HEAD(&tree->frsp_lru_active_node);
+ atomic_set(&tree->frsp_ref, 0);
}
int ext4_mb_init_freespace_trees(struct super_block *sb)
@@ -1627,6 +1646,15 @@ int ext4_mb_init_freespace_trees(struct super_block *sb)
rwlock_init(&sbi->s_mb_frsp_lock);
atomic_set(&sbi->s_mb_num_frsp_trees_cached, 0);
atomic_set(&sbi->s_mb_num_fragments, 0);
+ INIT_LIST_HEAD(&sbi->s_mb_frsp_lru.frsp_lru_active);
+ INIT_LIST_HEAD(&sbi->s_mb_frsp_lru.frsp_lru_inactive);
+ rwlock_init(&sbi->s_mb_frsp_lru.frsp_lru_lock);
+	/* Set the default hard limit to match the memory used by buddy bitmaps */
+ sbi->s_mb_frsp_mem_limit = ext4_get_groups_count(sb) <<
+ (sb->s_blocksize_bits + 1);
+ if (sbi->s_mb_frsp_mem_limit < EXT4_MB_FRSP_MEM_MIN(sb))
+ sbi->s_mb_frsp_mem_limit = EXT4_MB_FRSP_MEM_MIN(sb);
+ atomic_set(&sbi->s_mb_frsp_lru.frsp_active_fragments, 0);
return 0;
}
@@ -1664,6 +1692,99 @@ int ext4_mb_add_frsp_trees(struct super_block *sb, int ngroups)
return 0;
}
+/*
+ * Evict one tree from inactive list.
+ */
+static void ext4_frsp_evict(struct super_block *sb)
+{
+ struct ext4_mb_frsp_lru *lru = &EXT4_SB(sb)->s_mb_frsp_lru;
+ struct ext4_frsp_tree *tree = NULL;
+ bool found = false;
+
+ write_lock(&lru->frsp_lru_lock);
+ if (!list_empty(&lru->frsp_lru_inactive)) {
+ /* Evict from front, insert at tail */
+		found = false;
+ list_for_each_entry(tree, &lru->frsp_lru_inactive,
+ frsp_list_node) {
+ if (!atomic_read(&tree->frsp_ref)) {
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ list_del_init(&tree->frsp_list_node);
+ }
+ write_unlock(&lru->frsp_lru_lock);
+ if (found)
+ ext4_mb_frsp_free_tree(sb, tree);
+}
+
+/*
+ * This function maintains LRU lists. "tree" has just been accessed.
+ */
+static void ext4_mb_frsp_maintain_lru(struct super_block *sb,
+ struct ext4_frsp_tree *tree)
+{
+ struct ext4_mb_frsp_lru *lru = &EXT4_SB(sb)->s_mb_frsp_lru;
+ struct ext4_frsp_tree *last;
+ unsigned long current_jiffies = jiffies;
+
+ read_lock(&lru->frsp_lru_lock);
+ if (!list_empty(&tree->frsp_lru_active_node)) {
+ /* Already active, nothing to do */
+ read_unlock(&lru->frsp_lru_lock);
+ goto out;
+ }
+
+ /*
+	 * Check if this tree needs to be moved to the active list. We move it
+	 * there if one of the following conditions is true:
+	 * - This is the first access to this tree
+	 * - We haven't yet reached the max active fragments threshold, so
+	 *   there is space in the active list
+	 * - This tree was accessed twice within an
+	 *   EXT4_MB_FRSP_ACTIVE_THRESHOLD_JIFFIES interval
+ */
+ if (tree->frsp_last_access &&
+ EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb) &&
+ atomic_read(&lru->frsp_active_fragments) >
+ EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb) &&
+ current_jiffies - tree->frsp_last_access >
+ EXT4_MB_FRSP_ACTIVE_THRESHOLD_JIFFIES) {
+ read_unlock(&lru->frsp_lru_lock);
+ goto out;
+ }
+ read_unlock(&lru->frsp_lru_lock);
+
+ write_lock(&lru->frsp_lru_lock);
+	/* Re-check under the write lock; another task may have raced with us */
+ if (!list_empty(&tree->frsp_lru_active_node)) {
+ write_unlock(&lru->frsp_lru_lock);
+ goto out;
+ }
+ list_add(&tree->frsp_lru_active_node, &lru->frsp_lru_active);
+ list_del_init(&tree->frsp_list_node);
+ atomic_add(atomic_read(&tree->frsp_fragments),
+ &lru->frsp_active_fragments);
+ /* Remove trees from active queue until we are below the limit */
+ while (EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb) &&
+ atomic_read(&lru->frsp_active_fragments) >
+ EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb)) {
+ last = list_last_entry(&lru->frsp_lru_active,
+ struct ext4_frsp_tree, frsp_lru_active_node);
+ list_del_init(&last->frsp_lru_active_node);
+ /* Evict from front, insert at tail */
+ list_add_tail(&last->frsp_list_node, &lru->frsp_lru_inactive);
+ atomic_sub(atomic_read(&last->frsp_fragments),
+ &lru->frsp_active_fragments);
+ }
+ write_unlock(&lru->frsp_lru_lock);
+
+out:
+ tree->frsp_last_access = current_jiffies;
+}
+
/*
* Divide blocks started from @first with length @len into
* smaller chunks with power of 2 blocks.
@@ -2154,10 +2275,12 @@ ext4_mb_load_allocator_gfp(struct super_block *sb, ext4_group_t group,
e4b->frsp_tree = ext4_get_frsp_tree(sb,
ext4_flex_group(sbi, group));
e4b->frsp_flags = flags;
+ ext4_mb_frsp_maintain_lru(sb, e4b->frsp_tree);
if (flags & EXT4_ALLOCATOR_FRSP_NOLOAD)
return 0;
mutex_lock(&e4b->frsp_tree->frsp_lock);
+ atomic_inc(&e4b->frsp_tree->frsp_ref);
if (e4b->frsp_tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED) {
mutex_unlock(&e4b->frsp_tree->frsp_lock);
return 0;
@@ -2285,8 +2408,15 @@ static int ext4_mb_load_allocator(struct super_block *sb, ext4_group_t group,
static void ext4_mb_unload_allocator(struct ext4_buddy *e4b)
{
- if (ext4_mb_frsp_on(e4b->bd_sb))
+ if (ext4_mb_frsp_on(e4b->bd_sb)) {
+ if (!e4b->frsp_tree ||
+ e4b->frsp_flags & EXT4_ALLOCATOR_FRSP_NOLOAD)
+ return;
+ atomic_dec(&e4b->frsp_tree->frsp_ref);
+ if (test_opt2(e4b->bd_sb, FRSP_MEM_CRUNCH))
+ ext4_frsp_evict(e4b->bd_sb);
return;
+ }
if (e4b->bd_bitmap_page)
put_page(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
@@ -2658,7 +2788,8 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
ex->fe_node->frsp_offset +
ex->fe_node->frsp_len) {
/* Need to split the node */
- new = ext4_mb_frsp_alloc_node(e4b->bd_sb);
+ new = ext4_mb_frsp_alloc_node(e4b->bd_sb,
+ e4b->frsp_tree);
if (!new)
return -ENOMEM;
new->frsp_offset = flex_offset + ex->fe_len;
@@ -2675,7 +2806,8 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
ex->fe_node->frsp_offset += ex->fe_len;
ex->fe_node->frsp_len -= ex->fe_len;
} else {
- ext4_mb_frsp_free_node(e4b->bd_sb, ex->fe_node);
+ ext4_mb_frsp_free_node(e4b->bd_sb, e4b->frsp_tree,
+ ex->fe_node);
return 0;
}
@@ -4166,7 +4298,9 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
/* No more items in the per group rb tree
* balance refcounts from ext4_mb_free_metadata()
*/
- if (!ext4_mb_frsp_on(sb)) {
+ if (ext4_mb_frsp_on(sb)) {
+ atomic_dec(&e4b.frsp_tree->frsp_ref);
+ } else {
put_page(e4b.bd_buddy_page);
put_page(e4b.bd_bitmap_page);
}
@@ -6146,14 +6280,18 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
new_node = &new_entry->efd_node;
cluster = new_entry->efd_start_cluster;
- if (!*n && !ext4_mb_frsp_on(sb)) {
+ if (!*n) {
/* first free block exent. We need to
protect buddy cache from being freed,
* otherwise we'll refresh it from
* on-disk bitmap and lose not-yet-available
* blocks */
- get_page(e4b->bd_buddy_page);
- get_page(e4b->bd_bitmap_page);
+ if (ext4_mb_frsp_on(sb)) {
+ atomic_inc(&e4b->frsp_tree->frsp_ref);
+ } else {
+ get_page(e4b->bd_buddy_page);
+ get_page(e4b->bd_bitmap_page);
+ }
}
while (*n) {
parent = *n;
--
2.28.0.297.g1956fa8f8d-goog