[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Fri, 6 Jun 2008 23:54:52 +0530
From: "Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>
To: cmm@...ibm.com, tytso@....edu, sandeen@...hat.com
Cc: linux-ext4@...r.kernel.org,
"Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>,
Jan Kara <jack@...e.cz>
Subject: [PATCH] ext4: Fix delalloc sync hang with journal lock inversion
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@...ux.vnet.ibm.com>
Signed-off-by: Jan Kara <jack@...e.cz>
---
fs/ext4/inode.c | 107 ++++++++++++++++++++++++++++++++++++------------------
fs/mpage.c | 12 +++----
2 files changed, 76 insertions(+), 43 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0f8d071..b5bc627 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1480,50 +1480,74 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
up_write(&EXT4_I(inode)->i_data_sem);
if (EXT4_I(inode)->i_disksize == disksize) {
- if (handle == NULL)
- handle = ext4_journal_start(inode, 1);
- if (!IS_ERR(handle))
- ext4_mark_inode_dirty(handle, inode);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
}
}
-
ret = 0;
}
-
return ret;
}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh) || buffer_delay(bh);
+}
+
/* FIXME!! only support data=writeback mode */
-static int __ext4_da_writepage(struct page *page,
+/*
+ * get called vi ext4_da_writepages after taking page lock
+ * We may end up doing block allocation here in case
+ * mpage_da_map_blocks failed to allocate blocks.
+ */
+static int ext4_da_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
int ret = 0;
+ loff_t size;
+ unsigned long len;
+ handle_t *handle = NULL;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
handle = ext4_journal_current_handle();
+ if (!handle) {
+ /*
+ * This can happen when we aren't called via
+ * ext4_da_writepages() but directly (shrink_page_list).
+ * We cannot easily start a transaction here so we just skip
+ * writing the page in case we would have to do so.
+ */
+ size = i_size_read(inode);
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ page_bufs = page_buffers(page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
- if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
- EXT4_I(inode)->i_disksize = inode->i_size;
- ext4_mark_inode_dirty(handle, inode);
+ if (walk_page_buffers(NULL, page_bufs, 0,
+ len, NULL, ext4_bh_unmapped_or_delay)) {
+ /*
+ * We can't do block allocation under
+ * page lock without a handle . So redirty
+ * the page and return
+ */
+ BUG_ON(wbc->sync_mode != WB_SYNC_NONE);
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
}
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_da_get_block_write, wbc);
+ else
+ ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
+
return ret;
}
-static int ext4_da_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- if (!ext4_journal_current_handle())
- return __ext4_da_writepage(page, wbc);
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
-}
/*
* For now just follow the DIO way to estimate the max credits
@@ -1545,8 +1569,8 @@ static int ext4_da_writepages(struct address_space *mapping,
handle_t *handle = NULL;
int needed_blocks;
int ret = 0;
- unsigned range_cyclic;
long to_write;
+ loff_t range_start = 0;
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -1563,8 +1587,14 @@ static int ext4_da_writepages(struct address_space *mapping,
needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
to_write = wbc->nr_to_write;
- range_cyclic = wbc->range_cyclic;
- wbc->range_cyclic = 1;
+ if (!wbc->range_cyclic) {
+ /*
+ * If range_cyclic is not set force range_cont
+ * and save the old writeback_index
+ */
+ wbc->range_cont = 1;
+ range_start = wbc->range_start;
+ }
while (!ret && to_write) {
/* start a new transaction*/
@@ -1579,17 +1609,27 @@ static int ext4_da_writepages(struct address_space *mapping,
*/
if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
- to_write -= wbc->nr_to_write;
+ to_write -= wbc->nr_to_write;
ret = mpage_da_writepages(mapping, wbc,
ext4_da_get_block_write);
ext4_journal_stop(handle);
- to_write += wbc->nr_to_write;
+ if (wbc->nr_to_write) {
+ /*
+ * There is no more writeout needed
+ * or we requested for a noblocking writeout
+ * and we found the device congested
+ */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
}
out_writepages:
wbc->nr_to_write = to_write;
- wbc->range_cyclic = range_cyclic;
+ if (range_start)
+ wbc->range_start = range_start;
return ret;
}
@@ -1720,11 +1760,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
-{
- return !buffer_mapped(bh) || buffer_delay(bh);
-}
-
/*
* Note that we don't need to start a transaction unless we're journaling data
* because we should have holes filled from ext4_page_mkwrite(). We even don't
diff --git a/fs/mpage.c b/fs/mpage.c
index cde7f11..c4376ec 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -849,13 +849,11 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
do {
if (cur_logical >= logical + blocks)
break;
-
if (buffer_delay(bh)) {
bh->b_blocknr = pblock;
clear_buffer_delay(bh);
- } else if (buffer_mapped(bh)) {
+ } else if (buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
- }
cur_logical++;
pblock++;
@@ -930,10 +928,10 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
if (buffer_delay(lbh))
mpage_put_bnr_to_bhs(mpd, next, &new);
- /* go for the remaining blocks */
- next += new.b_size >> mpd->inode->i_blkbits;
- remain -= new.b_size;
- }
+ /* go for the remaining blocks */
+ next += new.b_size >> mpd->inode->i_blkbits;
+ remain -= new.b_size;
+ }
}
#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
--
1.5.5.1.357.g1af8b.dirty
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists