lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1271674527-2977-3-git-send-email-lczerner@redhat.com>
Date:	Mon, 19 Apr 2010 12:55:27 +0200
From:	Lukas Czerner <lczerner@...hat.com>
To:	linux-ext4@...r.kernel.org
Cc:	Jeff Moyer <jmoyer@...hat.com>,
	Edward Shishkin <eshishki@...hat.com>,
	Eric Sandeen <esandeen@...hat.com>,
	Ric Wheeler <rwheeler@...hat.com>,
	Lukas Czerner <lczerner@...hat.com>
Subject: [PATCH 2/2] Add batched discard support for ext4.

Create an ioctl which walks through all the free extents in each
allocating group and discard those extents. As an addition to improve
its performance one can specify minimum free extent length, so ioctl
will not bother with shorter extents.

This of course means, that with each invocation the ioctl must walk
through whole file system, checking and discarding free extents, which
is not very efficient. The best way to avoid this is to keep track of
deleted (freed) blocks. Then the ioctl have to trim just those free
extents which were recently freed.

In order to implement this I have added new bitmap into ext4_group_info
(bb_bitmap_deleted) which stores recently freed blocks. The ioctl then
walk through bb_bitmap_deleted, compare deleted extents with free
extents trim them and then removes it from the bb_bitmap_deleted. 

But you may notice, that there is one problem. bb_bitmap_deleted does
not survive umount. To bypass the problem the first ioctl call have to
walk through whole file system trimming all free extents.

Signed-off-by: Lukas Czerner <lczerner@...hat.com>
---
 fs/ext4/ext4.h    |    4 +
 fs/ext4/mballoc.c |  207 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/ext4/super.c   |    1 +
 3 files changed, 202 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf..e25f672 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1437,6 +1437,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
+extern int ext4_trim_fs(unsigned int, struct super_block *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
@@ -1682,6 +1684,8 @@ struct ext4_group_info {
 #ifdef DOUBLE_CHECK
 	void            *bb_bitmap;
 #endif
+	void		*bb_bitmap_deleted;
+	ext4_grpblk_t	bb_deleted;
 	struct rw_semaphore alloc_sem;
 	ext4_grpblk_t	bb_counters[];	/* Nr of free power-of-two-block
 					 * regions, index is order.
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bde9d0b..fbc83fe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2255,6 +2255,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root = RB_ROOT;
+	meta_group_info[i]->bb_deleted = -1;
+
+
 
 #ifdef DOUBLE_CHECK
 	{
@@ -2469,6 +2472,7 @@ int ext4_mb_release(struct super_block *sb)
 #ifdef DOUBLE_CHECK
 			kfree(grinfo->bb_bitmap);
 #endif
+			kfree(grinfo->bb_bitmap_deleted);
 			ext4_lock_group(sb, i);
 			ext4_mb_cleanup_pa(grinfo);
 			ext4_unlock_group(sb, i);
@@ -2528,6 +2532,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 	int err, count = 0, count2 = 0;
 	struct ext4_free_data *entry;
 	struct list_head *l, *ltmp;
+	void *bitmap_deleted = NULL;
 
 	list_for_each_safe(l, ltmp, &txn->t_private_list) {
 		entry = list_entry(l, struct ext4_free_data, list);
@@ -2543,6 +2548,14 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		/* there are blocks to put in buddy to make them really free */
 		count += entry->count;
 		count2++;
+
+		if (test_opt(sb, DISCARD) &&
+			(db->bb_bitmap_deleted == NULL) &&
+			(db->bb_deleted >= 0)) {
+			bitmap_deleted =
+				kmalloc(sb->s_blocksize, GFP_KERNEL);
+		}
+
 		ext4_lock_group(sb, entry->group);
 		/* Take it out of per group rb tree */
 		rb_erase(&entry->node, &(db->bb_free_root));
@@ -2555,17 +2568,24 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			page_cache_release(e4b.bd_buddy_page);
 			page_cache_release(e4b.bd_bitmap_page);
 		}
-		ext4_unlock_group(sb, entry->group);
-		if (test_opt(sb, DISCARD)) {
-			ext4_fsblk_t discard_block;
-
-			discard_block = entry->start_blk +
-				ext4_group_first_block_no(sb, entry->group);
-			trace_ext4_discard_blocks(sb,
-					(unsigned long long)discard_block,
-					entry->count);
-			sb_issue_discard(sb, discard_block, entry->count);
+		if (test_opt(sb, DISCARD) && (db->bb_deleted >= 0)) {
+			if (db->bb_bitmap_deleted == NULL) {
+				db->bb_bitmap_deleted = bitmap_deleted;
+				BUG_ON(db->bb_bitmap_deleted == NULL);
+
+				bitmap_deleted = NULL;
+				mb_clear_bits(db->bb_bitmap_deleted,
+					0, EXT4_BLOCKS_PER_GROUP(sb));
+			}
+
+			db->bb_deleted += entry->count;
+			mb_set_bits(db->bb_bitmap_deleted, entry->start_blk,
+				 entry->count);
 		}
+		ext4_unlock_group(sb, entry->group);
+
+		kfree(bitmap_deleted);
+
 		kmem_cache_free(ext4_free_ext_cachep, entry);
 		ext4_mb_release_desc(&e4b);
 	}
@@ -4639,3 +4659,170 @@ error_return:
 		kmem_cache_free(ext4_ac_cachep, ac);
 	return;
 }
+
+/* Trim "count" blocks starting at "start" in "group"
+ * This must be called under group lock
+ */
+void ext4_trim_extent(struct super_block *sb, int start, int count,
+		ext4_group_t group, struct ext4_buddy *e4b)
+{
+	ext4_fsblk_t discard_block;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_free_extent ex;
+
+	assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+	ex.fe_start = start;
+	ex.fe_group = group;
+	ex.fe_len = count;
+
+	mb_mark_used(e4b, &ex);
+	ext4_unlock_group(sb, group);
+
+	discard_block = (ext4_fsblk_t)group *
+			EXT4_BLOCKS_PER_GROUP(sb)
+			+ start
+			+ le32_to_cpu(es->s_first_data_block);
+	trace_ext4_discard_blocks(sb,
+			(unsigned long long)discard_block,
+			count);
+	sb_issue_discard(sb, discard_block, count);
+
+	ext4_lock_group(sb, group);
+	mb_free_blocks(NULL, e4b, start, ex.fe_len);
+}
+
+/* Trim all free blocks, which have at least minlen length */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+		ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+	ext4_grpblk_t start, next, count = 0;
+	ext4_group_t group;
+
+	BUG_ON(e4b == NULL);
+
+	bitmap = e4b->bd_bitmap;
+	group = e4b->bd_group;
+	start = 0;
+	ext4_lock_group(sb, group);
+
+	while (start < max) {
+
+		start = mb_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = mb_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minblocks) {
+
+			count += next - start;
+			ext4_trim_extent(sb, start,
+				next - start, group, e4b);
+
+		}
+		start = next + 1;
+	}
+
+	e4b->bd_info->bb_deleted = 0;
+	ext4_unlock_group(sb, group);
+
+	return count;
+}
+
+/* Trim only blocks which is free and in bb_bitmap_deleted */
+ext4_grpblk_t ext4_trim_deleted(struct super_block *sb, struct ext4_buddy *e4b,
+		ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	struct ext4_group_info *grp;
+	ext4_group_t group;
+	ext4_grpblk_t max, next, count = 0, start = 0;
+
+	BUG_ON(e4b == NULL);
+
+	bitmap = e4b->bd_bitmap;
+	group = e4b->bd_group;
+	grp = ext4_get_group_info(sb, group);
+
+	if (grp->bb_deleted < minblocks)
+		return 0;
+
+	ext4_lock_group(sb, group);
+
+	while (start < EXT4_BLOCKS_PER_GROUP(sb)) {
+		start = mb_find_next_bit(grp->bb_bitmap_deleted,
+			EXT4_BLOCKS_PER_GROUP(sb), start);
+		max = mb_find_next_zero_bit(grp->bb_bitmap_deleted,
+			EXT4_BLOCKS_PER_GROUP(sb), start);
+
+		while (start < max) {
+			start = mb_find_next_zero_bit(bitmap, max, start);
+			if (start >= max)
+				break;
+			next = mb_find_next_bit(bitmap, max, start);
+			if (next > max)
+				next = max;
+
+			if ((next - start) >= minblocks) {
+				count += next - start;
+
+				ext4_trim_extent(sb, start,
+					next - start, group, e4b);
+
+				mb_clear_bits(grp->bb_bitmap_deleted,
+					start, next - start);
+			}
+			start = next + 1;
+		}
+	}
+	grp->bb_deleted -= count;
+
+	ext4_unlock_group(sb, group);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	return count;
+}
+
+int ext4_trim_fs(unsigned int minlen, struct super_block *sb)
+{
+	struct ext4_buddy e4b;
+	struct ext4_group_info *grp;
+	ext4_group_t group;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_grpblk_t minblocks;
+
+	if (!test_opt(sb, DISCARD))
+		return 0;
+
+	minblocks = DIV_ROUND_UP(minlen, sb->s_blocksize);
+	if (unlikely(minblocks > EXT4_BLOCKS_PER_GROUP(sb)))
+		return -EINVAL;
+
+	for (group = 0; group < ngroups; group++) {
+		int err;
+
+		err = ext4_mb_load_buddy(sb, group, &e4b);
+		if (err) {
+			ext4_error(sb, "Error in loading buddy "
+					"information for %u", group);
+			continue;
+		}
+		grp = ext4_get_group_info(sb, group);
+
+		if (grp->bb_deleted < 0) {
+			/* First run after mount */
+			ext4_trim_all_free(sb, &e4b, minblocks);
+		} else if (grp->bb_deleted >= minblocks) {
+			/* Trim only blocks deleted since first run */
+			ext4_trim_deleted(sb, &e4b, minblocks);
+		}
+
+		ext4_mb_release_desc(&e4b);
+	}
+
+	return 0;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c..253eb98 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1109,6 +1109,7 @@ static const struct super_operations ext4_sops = {
 	.quota_write	= ext4_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
+	.trim_fs	= ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
-- 
1.6.6.1

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ