lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-Id: <1429632229-9723-1-git-send-email-hcchang@vt.edu>
Date:	Tue, 21 Apr 2015 12:03:49 -0400
From:	Hung-Ching Chang <hcchang@...edu>
To:	"Theodore Ts'o" <tytso@....edu>,
	Andreas Dilger <adilger.kernel@...ger.ca>
Cc:	linux-ext4@...r.kernel.org, Hung-Ching Chang <hcchang@...edu>
Subject: [PATCH 1/1] ext4/jbd2: Force copy-outs during metadata committing

do_get_write_access() (fs/jbd2/transaction.c) may have to wait on a buffer that is being flushed out by the committing transaction and for which the current transaction has not yet made a copy-out. When this condition holds, do_get_write_access() has to wait until BH_Shadow is cleared, which can take a very long time because it is waiting on IO. In our research*, we observed that this situation becomes worse for parallel transactional workloads (Filebench** OLTP and IOzone*** write tests) on the ext4 filesystem. This patch adds a force copy-out option, enabled by default, which can be configured on the fly via /sys/fs/ext4/<device>/force_copyout. Our experimental results*, with force copy-out, showed ~25% speedup on Filebench OLTP and ~3X speedup on IOzone.

*Hung-Ching Chang, Bo Li, Godmar Back, Ali R. Butt, Kirk W. Cameron, "LUC: Limiting the Unintended Consequences of Power Scaling on Parallel Transaction-oriented Workloads," accepted in the Proceedings of the 29th IEEE International Parallel & Distributed Processing Symposium (IPDPS), Hyderabad, India, 2015.

**http://filebench.sourceforge.net/
***http://www.iozone.org

Signed-off-by: Hung-Ching Chang <hcchang@...edu>
---
 Documentation/filesystems/ext4.txt |   11 +++++++++++
 fs/ext4/ext4.h                     |    6 ++++++
 fs/ext4/super.c                    |   26 ++++++++++++++++++++++++++
 fs/jbd2/journal.c                  |    2 ++
 include/linux/jbd2.h               |    7 +++++++
 5 files changed, 52 insertions(+), 0 deletions(-)

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 6c0108e..8282959 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -510,6 +510,17 @@ Files in /sys/fs/ext4/<devname>
                               in the file system. If there is not enough space
                               for the reserved space when mounting the file
                               mount will _not_ fail.
+
+ force_copyout=<0|1(*)>       This enables/disables the force copyout in the
+                              jbd code. force_copyout=0 disables,
+                              force_copyout=1 enables. When enabled, the
+                              committing transaction makes copy-outs for the
+                              metadata for IO submission. This removes
+                              potential locks when the current transaction
+                              attempts to modify the metadata, which are also
+                              owned by the committing transaction and are
+                              being flushed out to non-volatile storage, but
+                              for which copy-outs have not yet been made.
 ..............................................................................
 
 Ioctls
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f63c3d5..585836f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1247,6 +1247,7 @@ struct ext4_sb_info {
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
+	u32 s_force_copyout;
 	struct block_device *journal_bdev;
 #ifdef CONFIG_QUOTA
 	char *s_qf_names[EXT4_MAXQUOTAS];	/* Names of quota files with journalled quota */
@@ -1620,6 +1621,11 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_DEF_MAX_BATCH_TIME	15000 /* 15ms */
 
 /*
+ * Force copyout is enabled by default
+ */
+#define EXT4_DEF_FORCE_COPYOUT 1
+
+/*
  * Minimum number of groups in a flexgroup before we separate out
  * directories into the first block group of a flexgroup
  */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e061e66..e09f7d2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2508,6 +2508,27 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 	return count;
 }
 
+static ssize_t force_copyout_store(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi,
+				   const char *buf, size_t count)
+{
+	journal_t *journal = sbi->s_journal;
+	unsigned long t;
+	int ret;
+
+	ret = kstrtoul(skip_spaces(buf), 0, &t);
+	if (ret)
+		return ret;
+
+	if (t >= 2)
+		return -EINVAL;
+
+	sbi->s_force_copyout = t;
+	if (journal)
+		journal->j_force_copyout = sbi->s_force_copyout;
+	return count;
+}
+
 static ssize_t sbi_ui_show(struct ext4_attr *a,
 			   struct ext4_sb_info *sbi, char *buf)
 {
@@ -2652,6 +2673,8 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
 EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
 EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
+EXT4_ATTR_OFFSET(force_copyout, 0644, sbi_ui_show,
+		 force_copyout_store, s_force_copyout);
 
 static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
@@ -2678,6 +2701,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(errors_count),
 	ATTR_LIST(first_error_time),
 	ATTR_LIST(last_error_time),
+	ATTR_LIST(force_copyout),
 	NULL,
 };
 
@@ -3444,6 +3468,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbi;
 	sbi->s_sb = sb;
 	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
+	sbi->s_force_copyout = EXT4_DEF_FORCE_COPYOUT;
 	sbi->s_sb_block = sb_block;
 	if (sb->s_bdev->bd_part)
 		sbi->s_sectors_written_start =
@@ -4321,6 +4346,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 	journal->j_commit_interval = sbi->s_commit_interval;
 	journal->j_min_batch_time = sbi->s_min_batch_time;
 	journal->j_max_batch_time = sbi->s_max_batch_time;
+	journal->j_force_copyout = sbi->s_force_copyout;
 
 	write_lock(&journal->j_state_lock);
 	if (test_opt(sb, BARRIER))
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b96bd80..2637234 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -396,6 +396,8 @@ repeat:
 		new_page = virt_to_page(jh_in->b_frozen_data);
 		new_offset = offset_in_page(jh_in->b_frozen_data);
 	} else {
+		if (journal->j_force_copyout == 1)
+			need_copy_out = 1;  /* pessimistically copy data out */
 		new_page = jh2bh(jh_in)->b_page;
 		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
 	}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 20e7f78..eb72346 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -968,6 +968,13 @@ struct journal_s
 	u32			j_min_batch_time;
 	u32			j_max_batch_time;
 
+	/*
+	 * When force_copyout is enabled (1), the committing
+	 * transaction makes copy-outs for the metadata and uses the copied
+	 * buffers for IO submission.
+	 */
+	u32                     j_force_copyout;
+
 	/* This function is called when a transaction is closed */
 	void			(*j_commit_callback)(journal_t *,
 						     transaction_t *);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ