lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1357901627-3068-8-git-send-email-wenqing.lz@taobao.com>
Date:	Fri, 11 Jan 2013 18:53:47 +0800
From:	Zheng Liu <gnehzuil.liu@...il.com>
To:	linux-ext4@...r.kernel.org
Cc:	Jan kara <jack@...e.cz>, "Theodore Ts'o" <tytso@....edu>,
	Zheng Liu <wenqing.lz@...bao.com>
Subject: [PATCH 7/7 v2] ext4: reclaim extents from extent status tree

From: Zheng Liu <wenqing.lz@...bao.com>

Although extent status is loaded on demand, we also need to reclaim extents
from the tree when we are under heavy memory pressure, because in some cases a
fragmented extent tree causes the status tree to consume too much memory.

The shrinker framework is used to do this work.  When the status tree of an
inode is accessed, the inode is inserted at (or moved to) the tail of the LRU
list.  The inode is removed from the LRU list when clear_inode is called.  When
the shrinker tries to reclaim memory, we try to reclaim written/unwritten
extents from the tree.  Delayed extents must not be reclaimed because they are
used by fiemap, bigalloc, and seek_data/hole.

CC: Jan Kara <jack@...e.cz>
CC: "Theodore Ts'o" <tytso@....edu>
Signed-off-by: Zheng Liu <wenqing.lz@...bao.com>
---
 fs/ext4/ext4.h           |   4 ++
 fs/ext4/extents_status.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/extents_status.h |  12 ++++++
 fs/ext4/super.c          |   6 +++
 4 files changed, 131 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 36145ef1..d31f254 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -893,6 +893,7 @@ struct ext4_inode_info {
 	/* extents status tree */
 	struct ext4_es_tree i_es_tree;
 	rwlock_t i_es_lock;
+	struct list_head i_es_lru;
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
@@ -1308,6 +1309,9 @@ struct ext4_sb_info {
 
 	/* Precomputed FS UUID checksum for seeding other checksums */
 	__u32 s_csum_seed;
+
+	/* Extent status tree shrinker */
+	struct ext4_es_shrinker s_es_shrinker;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ac9bb80..f9e7867 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -141,6 +141,7 @@ static int __es_insert_extent(struct ext4_es_tree *tree,
 			      struct extent_status *newes);
 static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk,
 			      ext4_lblk_t end);
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc);
 
 int __init ext4_init_es(void)
 {
@@ -275,6 +276,8 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
+	ext4_es_lru_add(inode);
+
 	trace_ext4_es_find_extent_exit(inode, es, ret);
 	return ret;
 }
@@ -449,6 +452,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 
+	ext4_es_lru_add(inode);
+
 	ext4_es_print_tree(inode);
 
 	return err;
@@ -509,6 +514,8 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
+	ext4_es_lru_add(inode);
+
 	trace_ext4_es_lookup_extent_exit(inode, es, found);
 	return found;
 }
@@ -621,6 +628,108 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 	write_lock(&EXT4_I(inode)->i_es_lock);
 	err = __es_remove_extent(tree, lblk, end);
 	write_unlock(&EXT4_I(inode)->i_es_lock);
+	ext4_es_lru_add(inode);
 	ext4_es_print_tree(inode);
 	return err;
 }
+
+void ext4_es_register_shrinker(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+	INIT_LIST_HEAD(&sbi->s_es_shrinker.es_lru);
+	spin_lock_init(&sbi->s_es_shrinker.es_lru_lock);
+	sbi->s_es_shrinker.es_shrinker.shrink = ext4_es_shrink;
+	sbi->s_es_shrinker.es_shrinker.seeks = DEFAULT_SEEKS;
+	register_shrinker(&sbi->s_es_shrinker.es_shrinker);
+}
+
+void ext4_es_unregister_shrinker(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+	unregister_shrinker(&sbi->s_es_shrinker.es_shrinker);
+}
+
+static int ext4_es_do_shrink(struct ext4_inode_info *ei)
+{
+	struct ext4_es_tree *tree;
+	struct rb_node *node;
+	struct extent_status *es;
+	int shrunk_nr = 0;
+
+	tree = &ei->i_es_tree;
+
+	write_lock(&ei->i_es_lock);
+	node = rb_first(&tree->root);
+	while (node != NULL) {
+		es = rb_entry(node, struct extent_status, rb_node);
+		node = rb_next(&es->rb_node);
+		/*
+		 * We can't reclaim delayed extent from status tree because
+		 * fiemap, bigalloc, and seek_data/hole need to use it.
+		 */
+		if (!ext4_es_is_delayed(es)) {
+			rb_erase(&es->rb_node, &tree->root);
+			ext4_es_free_extent(es);
+			shrunk_nr++;
+		}
+	}
+	tree->cache_es = NULL;
+	write_unlock(&ei->i_es_lock);
+	return shrunk_nr;
+}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct ext4_es_shrinker *es_shrinker = container_of(shrink,
+				struct ext4_es_shrinker, es_shrinker);
+	struct ext4_inode_info *ei;
+	int nr_to_scan = sc->nr_to_scan;
+	int ret, shrunk_nr = 0;
+
+	if (!nr_to_scan)
+		return shrunk_nr;
+
+	spin_lock(&es_shrinker->es_lru_lock);
+	while (!list_empty(&es_shrinker->es_lru)) {
+		if (nr_to_scan-- <= 0)
+			break;
+
+		ei = list_first_entry(&es_shrinker->es_lru,
+				      struct ext4_inode_info, i_es_lru);
+		ret = ext4_es_do_shrink(ei);
+		shrunk_nr += ret;
+		nr_to_scan -= ret;
+	}
+	spin_unlock(&es_shrinker->es_lru_lock);
+	return shrunk_nr;
+}
+
+void ext4_es_lru_add(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_es_shrinker *es_shrinker = &sbi->s_es_shrinker;
+
+	spin_lock(&es_shrinker->es_lru_lock);
+	if (list_empty(&ei->i_es_lru))
+		list_add_tail(&ei->i_es_lru, &es_shrinker->es_lru);
+	else
+		list_move_tail(&ei->i_es_lru, &es_shrinker->es_lru);
+	spin_unlock(&es_shrinker->es_lru_lock);
+}
+
+void ext4_es_lru_del(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_es_shrinker *es_shrinker = &sbi->s_es_shrinker;
+
+	spin_lock(&es_shrinker->es_lru_lock);
+	if (!list_empty(&ei->i_es_lru))
+		list_del_init(&ei->i_es_lru);
+	spin_unlock(&es_shrinker->es_lru_lock);
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 45e623e..422a11e 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -39,6 +39,13 @@ struct ext4_es_tree {
 	struct extent_status *cache_es;	/* recently accessed extent */
 };
 
+struct ext4_es_shrinker {
+	struct shrinker es_shrinker;
+	struct list_head es_lru;
+	spinlock_t es_lru_lock ____cacheline_aligned_in_smp;
+	unsigned int es_lru_nr;
+};
+
 extern int __init ext4_init_es(void);
 extern void ext4_exit_es(void);
 extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@@ -73,4 +80,9 @@ static inline ext4_fsblk_t ext4_es_get_pblock(struct extent_status *es,
 	return (ext4_es_is_delayed(es) ? ~0 : pb);
 }
 
+extern void ext4_es_register_shrinker(struct super_block *sb);
+extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_lru_add(struct inode *inode);
+extern void ext4_es_lru_del(struct inode *inode);
+
 #endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3d4fb81..baedc75 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -858,6 +858,7 @@ static void ext4_put_super(struct super_block *sb)
 			ext4_abort(sb, "Couldn't clean up the journal");
 	}
 
+	ext4_es_unregister_shrinker(sb);
 	del_timer(&sbi->s_err_report);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
@@ -944,6 +945,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	spin_lock_init(&ei->i_prealloc_lock);
 	ext4_es_init_tree(&ei->i_es_tree);
 	rwlock_init(&ei->i_es_lock);
+	INIT_LIST_HEAD(&ei->i_es_lru);
 	ei->i_reserved_data_blocks = 0;
 	ei->i_reserved_meta_blocks = 0;
 	ei->i_allocated_meta_blocks = 0;
@@ -1031,6 +1033,7 @@ void ext4_clear_inode(struct inode *inode)
 	dquot_drop(inode);
 	ext4_discard_preallocations(inode);
 	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+	ext4_es_lru_del(inode);
 	if (EXT4_I(inode)->jinode) {
 		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
 					       EXT4_I(inode)->jinode);
@@ -3324,6 +3327,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto cantfind_ext4;
 	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
 
+	/* Register extent status tree shrinker */
+	ext4_es_register_shrinker(sb);
+
 	/* Warn if metadata_csum and gdt_csum are both set. */
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
-- 
1.7.12.rc2.18.g61b472e

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists