lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <871f4cbf-f7bc-35d8-bd42-32721b44c458@linux.alibaba.com>
Date:   Tue, 27 Nov 2018 19:48:28 +0800
From:   Xiaoguang Wang <xiaoguang.wang@...ux.alibaba.com>
To:     linux-ext4@...r.kernel.org
Cc:     fisherthepooh@...tonmail.com, Jan Kara <jack@...e.cz>,
        tytso@....edu
Subject: jbd2 bh_shadow issue

hi,

I also meet this issue in our server, sometimes we got big latency for
buffered writes. Using fio, I have done some tests to reproduce this issue,
Please see test results in this mail:
https://www.spinics.net/lists/linux-ext4/msg62885.html

I agree that doing buffer copy-out for every journaled buffer wastes cpu
time, so here I wonder whether we can only do buffer copy-out when buffer
is BH_SHADOW state. The core idea is simple:
     In do_get_write_access(), if buffer has already been in BH_SHADOW state,
     we allocate a new page, make buffer's b_page point to this new page, and
     following journal dirty work can be done in this patch. When transaction
     commits, in jbd2_journal_write_metadata_buffer(), we copy new page's conten
     to primary page, make buffer point to primary page and free the temporary
     page.

I have written a test patch, the big long latency will disapper. Do you think
the method is useful? If you think so, I can submit a formal patch and do fstests.
Anyway I think this issue is not good, maybe we should fix it:)

Here I attach my test patch here:
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@...ux.alibaba.com>
---
  fs/jbd2/journal.c            | 26 ++++++++++++++++++++++++--
  fs/jbd2/transaction.c        | 31 +++++++++++++++++++++++++++----
  include/linux/buffer_head.h  |  1 +
  include/linux/jbd2.h         |  7 +++++++
  include/linux/journal-head.h |  4 ++++
  5 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8ef6b6daaa7a..bb0612cf5447 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -393,8 +393,30 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
  		new_page = virt_to_page(jh_in->b_frozen_data);
  		new_offset = offset_in_page(jh_in->b_frozen_data);
  	} else {
-		new_page = jh2bh(jh_in)->b_page;
-		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+		if (buffer_lege(bh_in)) {
+			char *primary, *temp_primary = jh_in->b_temp_data;
+			struct page *temp_page;
+			unsigned temp_offset;
+
+			new_page = jh_in->page;
+			new_offset = jh_in->page_offset;
+
+			temp_page = virt_to_page(temp_primary);
+			temp_offset = offset_in_page(temp_primary);
+			primary = kmap_atomic(new_page);
+			mapped_data = kmap_atomic(temp_page);
+
+			memcpy(primary + new_offset, mapped_data + temp_offset,
+			       bh_in->b_size);
+			set_bh_page(bh_in, new_page, new_offset);
+			kunmap_atomic(primary);
+			kunmap_atomic(temp_primary);
+			jbd2_free(temp_primary, bh_in->b_size);
+			clear_buffer_primarycopied(bh_in);
+		} else {
+			new_page = jh2bh(jh_in)->b_page;
+			new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+		}
  	}

  	mapped_data = kmap_atomic(new_page);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index c0b66a7a795b..6ad8ec94c7b8 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -827,6 +827,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
  	journal_t *journal;
  	int error;
  	char *frozen_buffer = NULL;
+	char *tmp_primary = NULL;
  	unsigned long start_lock, time_lock;

  	if (is_handle_aborted(handle))
@@ -956,10 +957,29 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
  	 * here.
  	 */
  	if (buffer_shadow(bh)) {
-		JBUFFER_TRACE(jh, "on shadow: sleep");
-		jbd_unlock_bh_state(bh);
-		wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
-		goto repeat;
+		char *mapped_data;
+		struct page *new_page;
+		unsigned int primary_offset, new_offset;
+
+		if (!tmp_primary) {
+			jbd_unlock_bh_state(bh);
+			tmp_primary = jbd2_alloc(bh->b_size, GFP_NOFS |
+						 __GFP_NOFAIL);
+			goto repeat;
+		}
+
+		primary_offset = offset_in_page(bh->b_data);
+		mapped_data = kmap_atomic(bh->b_page);
+		memcpy(tmp_primary, mapped_data + primary_offset, bh->b_size);
+		kunmap_atomic(mapped_data);
+
+		new_page = virt_to_page(tmp_primary);
+		new_offset = offset_in_page(tmp_primary);
+		jh->b_temp_data = tmp_primary;
+		jh->page = bh->b_page;
+		jh->page_offset = primary_offset;
+		set_bh_page(bh, new_page, new_offset);
+		set_buffer_primarycopied(bh);
  	}

  	/*
@@ -1009,6 +1029,9 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
  	if (unlikely(frozen_buffer))	/* It's usually NULL */
  		jbd2_free(frozen_buffer, bh->b_size);

+	if (tmp_primary)
+		jbd2_free(frozen_buffer, bh->b_size);
+
  	JBUFFER_TRACE(jh, "exit");
  	return error;
  }
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 96225a77c112..88fbe345d291 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -66,6 +66,7 @@ struct buffer_head {
  	struct page *b_page;		/* the page this bh is mapped to */

  	sector_t b_blocknr;		/* start block number */
+	sector_t orig_blocknr;		/* start block number */
  	size_t b_size;			/* size of mapping */
  	char *b_data;			/* pointer to data within the page */

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index b708e5169d1d..0928f9bea8f4 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -321,6 +321,12 @@ enum jbd_state_bits {
  	BH_Shadow,		/* IO on shadow buffer is running */
  	BH_Verified,		/* Metadata block has been verified ok */
  	BH_JBDPrivateStart,	/* First bit available for private use by FS */
+	/*
+	 * We can not dirty a bh while it's going to disk with BH_Shadow flag.
+	 * In this case, we can copy this bh and let later fs dirty operations
+	 * go to this new bh.
+	 */
+	BH_PrimaryCopied,
  };

  BUFFER_FNS(JBD, jbd)
@@ -334,6 +340,7 @@ TAS_BUFFER_FNS(RevokeValid, revokevalid)
  BUFFER_FNS(Freed, freed)
  BUFFER_FNS(Shadow, shadow)
  BUFFER_FNS(Verified, verified)
+BUFFER_FNS(PrimaryCopied, primarycopied)

  static inline struct buffer_head *jh2bh(struct journal_head *jh)
  {
diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h
index 9fb870524314..0fa317124d9a 100644
--- a/include/linux/journal-head.h
+++ b/include/linux/journal-head.h
@@ -51,6 +51,10 @@ struct journal_head {
  	 */
  	char *b_frozen_data;

+	struct page *page;
+	unsigned int page_offset;
+	char *b_temp_data;
+
  	/*
  	 * Pointer to a saved copy of the buffer containing no uncommitted
  	 * deallocation references, so that allocations can avoid overwriting
-- 
2.14.4.44.g2045bb6





Regards,
Xiaoguang Wang

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ