[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <871f4cbf-f7bc-35d8-bd42-32721b44c458@linux.alibaba.com>
Date: Tue, 27 Nov 2018 19:48:28 +0800
From: Xiaoguang Wang <xiaoguang.wang@...ux.alibaba.com>
To: linux-ext4@...r.kernel.org
Cc: fisherthepooh@...tonmail.com, Jan Kara <jack@...e.cz>,
tytso@....edu
Subject: jbd2 bh_shadow issue
hi,
I have also hit this issue on our servers: sometimes we see very high latency for
buffered writes. I have run some tests with fio to reproduce the issue;
please see the test results in this mail:
https://www.spinics.net/lists/linux-ext4/msg62885.html
I agree that doing a buffer copy-out for every journaled buffer wastes CPU
time, so I wonder whether we could do the copy-out only when the buffer
is in the BH_Shadow state. The core idea is simple:
In do_get_write_access(), if the buffer is already in the BH_Shadow state,
we allocate a new page and make the buffer's b_page point to this new page, so
that the following journal dirtying work can be done on the new page. When the
transaction commits, in jbd2_journal_write_metadata_buffer(), we copy the new
page's content to the primary page, make the buffer point back to the primary
page, and free the temporary page.
I have written a test patch, and with it the long latency disappears. Do you think
this method is useful? If so, I can submit a formal patch and run fstests.
In any case, I think this issue is a real problem and we should probably fix it :)
My test patch is attached below:
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@...ux.alibaba.com>
---
fs/jbd2/journal.c | 26 ++++++++++++++++++++++++--
fs/jbd2/transaction.c | 31 +++++++++++++++++++++++++++----
include/linux/buffer_head.h | 1 +
include/linux/jbd2.h | 7 +++++++
include/linux/journal-head.h | 4 ++++
5 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8ef6b6daaa7a..bb0612cf5447 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -393,8 +393,30 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
new_page = virt_to_page(jh_in->b_frozen_data);
new_offset = offset_in_page(jh_in->b_frozen_data);
} else {
- new_page = jh2bh(jh_in)->b_page;
- new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+ if (buffer_lege(bh_in)) {
+ char *primary, *temp_primary = jh_in->b_temp_data;
+ struct page *temp_page;
+ unsigned temp_offset;
+
+ new_page = jh_in->page;
+ new_offset = jh_in->page_offset;
+
+ temp_page = virt_to_page(temp_primary);
+ temp_offset = offset_in_page(temp_primary);
+ primary = kmap_atomic(new_page);
+ mapped_data = kmap_atomic(temp_page);
+
+ memcpy(primary + new_offset, mapped_data + temp_offset,
+ bh_in->b_size);
+ set_bh_page(bh_in, new_page, new_offset);
+ kunmap_atomic(primary);
+ kunmap_atomic(temp_primary);
+ jbd2_free(temp_primary, bh_in->b_size);
+ clear_buffer_primarycopied(bh_in);
+ } else {
+ new_page = jh2bh(jh_in)->b_page;
+ new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+ }
}
mapped_data = kmap_atomic(new_page);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index c0b66a7a795b..6ad8ec94c7b8 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -827,6 +827,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
journal_t *journal;
int error;
char *frozen_buffer = NULL;
+ char *tmp_primary = NULL;
unsigned long start_lock, time_lock;
if (is_handle_aborted(handle))
@@ -956,10 +957,29 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* here.
*/
if (buffer_shadow(bh)) {
- JBUFFER_TRACE(jh, "on shadow: sleep");
- jbd_unlock_bh_state(bh);
- wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
- goto repeat;
+ char *mapped_data;
+ struct page *new_page;
+ unsigned int primary_offset, new_offset;
+
+ if (!tmp_primary) {
+ jbd_unlock_bh_state(bh);
+ tmp_primary = jbd2_alloc(bh->b_size, GFP_NOFS |
+ __GFP_NOFAIL);
+ goto repeat;
+ }
+
+ primary_offset = offset_in_page(bh->b_data);
+ mapped_data = kmap_atomic(bh->b_page);
+ memcpy(tmp_primary, mapped_data + primary_offset, bh->b_size);
+ kunmap_atomic(mapped_data);
+
+ new_page = virt_to_page(tmp_primary);
+ new_offset = offset_in_page(tmp_primary);
+ jh->b_temp_data = tmp_primary;
+ jh->page = bh->b_page;
+ jh->page_offset = primary_offset;
+ set_bh_page(bh, new_page, new_offset);
+ set_buffer_primarycopied(bh);
}
/*
@@ -1009,6 +1029,9 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
if (unlikely(frozen_buffer)) /* It's usually NULL */
jbd2_free(frozen_buffer, bh->b_size);
+ if (tmp_primary)
+ jbd2_free(frozen_buffer, bh->b_size);
+
JBUFFER_TRACE(jh, "exit");
return error;
}
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 96225a77c112..88fbe345d291 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -66,6 +66,7 @@ struct buffer_head {
struct page *b_page; /* the page this bh is mapped to */
sector_t b_blocknr; /* start block number */
+ sector_t orig_blocknr; /* start block number */
size_t b_size; /* size of mapping */
char *b_data; /* pointer to data within the page */
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index b708e5169d1d..0928f9bea8f4 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -321,6 +321,12 @@ enum jbd_state_bits {
BH_Shadow, /* IO on shadow buffer is running */
BH_Verified, /* Metadata block has been verified ok */
BH_JBDPrivateStart, /* First bit available for private use by FS */
+ /*
+ * We can not dirty a bh while it's going to disk with BH_Shadow flag.
+ * In this case, we can copy this bh and let later fs dirty operations
+ * go to this new bh.
+ */
+ BH_PrimaryCopied,
};
BUFFER_FNS(JBD, jbd)
@@ -334,6 +340,7 @@ TAS_BUFFER_FNS(RevokeValid, revokevalid)
BUFFER_FNS(Freed, freed)
BUFFER_FNS(Shadow, shadow)
BUFFER_FNS(Verified, verified)
+BUFFER_FNS(PrimaryCopied, primarycopied)
static inline struct buffer_head *jh2bh(struct journal_head *jh)
{
diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h
index 9fb870524314..0fa317124d9a 100644
--- a/include/linux/journal-head.h
+++ b/include/linux/journal-head.h
@@ -51,6 +51,10 @@ struct journal_head {
*/
char *b_frozen_data;
+ struct page *page;
+ unsigned int page_offset;
+ char *b_temp_data;
+
/*
* Pointer to a saved copy of the buffer containing no uncommitted
* deallocation references, so that allocations can avoid overwriting
--
2.14.4.44.g2045bb6
Regards,
Xiaoguang Wang
Powered by blists - more mailing lists