ext4: Fix delalloc sync hang with journal lock inversion From: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V Signed-off-by: Jan Kara --- fs/ext4/inode.c | 99 +++++++++++++++++++++++++++++++++++---------------- fs/mpage.c | 14 ++++--- mm/page-writeback.c | 7 +++- 3 files changed, 80 insertions(+), 40 deletions(-) Index: linux-2.6-linus/fs/ext4/inode.c =================================================================== --- linux-2.6-linus.orig/fs/ext4/inode.c +++ linux-2.6-linus/fs/ext4/inode.c @@ -1480,50 +1480,74 @@ static int ext4_da_get_block_write(struc up_write(&EXT4_I(inode)->i_data_sem); if (EXT4_I(inode)->i_disksize == disksize) { - if (handle == NULL) - handle = ext4_journal_start(inode, 1); - if (!IS_ERR(handle)) - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); + return ret; } } - ret = 0; } - return ret; } + +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh) || buffer_delay(bh); +} + /* FIXME!! only support data=writeback mode */ -static int __ext4_da_writepage(struct page *page, +/* + * get called via ext4_da_writepages after taking page lock + * We may end up doing block allocation here in case + * mpage_da_map_blocks failed to allocate blocks. + */ +static int ext4_da_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; - handle_t *handle = NULL; int ret = 0; + loff_t size; + unsigned long len; + handle_t *handle = NULL; + struct buffer_head *page_bufs; + struct inode *inode = page->mapping->host; handle = ext4_journal_current_handle(); + if (!handle) { + /* + * This can happen when we aren't called via + * ext4_da_writepages() but directly (shrink_page_list). + * We cannot easily start a transaction here so we just skip + * writing the page in case we would have to do so. 
+ */ + size = i_size_read(inode); + + page_bufs = page_buffers(page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (walk_page_buffers(NULL, page_bufs, 0, + len, NULL, ext4_bh_unmapped_or_delay)) { + /* + * We can't do block allocation under + * page lock without a handle. So redirty + * the page and return + */ + BUG_ON(wbc->sync_mode != WB_SYNC_NONE); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_get_block, wbc); + ret = nobh_writepage(page, ext4_da_get_block_write, wbc); else - ret = block_write_full_page(page, ext4_get_block, wbc); - - if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) { - EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); - } + ret = block_write_full_page(page, ext4_da_get_block_write, wbc); return ret; } -static int ext4_da_writepage(struct page *page, - struct writeback_control *wbc) -{ - if (!ext4_journal_current_handle()) - return __ext4_da_writepage(page, wbc); - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} /* * For now just follow the DIO way to estimate the max credits @@ -1547,6 +1571,7 @@ static int ext4_da_writepages(struct add int ret = 0; unsigned range_cyclic; long to_write; + pgoff_t index; /* * Estimate the worse case needed credits to write out @@ -1557,6 +1582,15 @@ static int ext4_da_writepages(struct add to_write = wbc->nr_to_write; range_cyclic = wbc->range_cyclic; wbc->range_cyclic = 1; + index = mapping->writeback_index; + if (!range_cyclic) { + /* + * We force cyclic write out of pages. If the + * caller didn't request range_cyclic, + * set the writeback_index to what the caller requested. 
+ */ + mapping->writeback_index = wbc->range_start >> PAGE_CACHE_SHIFT; + } while (!ret && to_write) { /* start a new transaction*/ @@ -1571,17 +1605,24 @@ static int ext4_da_writepages(struct add */ if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; - to_write -= wbc->nr_to_write; + to_write -= wbc->nr_to_write; ret = mpage_da_writepages(mapping, wbc, ext4_da_get_block_write); ext4_journal_stop(handle); - to_write += wbc->nr_to_write; + if (wbc->nr_to_write) { + /* There is no more writeout needed */ + to_write += wbc->nr_to_write; + break; + } + wbc->nr_to_write = to_write; } out_writepages: wbc->nr_to_write = to_write; wbc->range_cyclic = range_cyclic; + if (!range_cyclic) + mapping->writeback_index = index; return ret; } @@ -1712,11 +1753,6 @@ static int bput_one(handle_t *handle, st return 0; } -static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) -{ - return !buffer_mapped(bh) || buffer_delay(bh); -} - /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). 
We even don't Index: linux-2.6-linus/fs/mpage.c =================================================================== --- linux-2.6-linus.orig/fs/mpage.c +++ linux-2.6-linus/fs/mpage.c @@ -849,13 +849,12 @@ static void mpage_put_bnr_to_bhs(struct do { if (cur_logical >= logical + blocks) break; - if (buffer_delay(bh)) { bh->b_blocknr = pblock; clear_buffer_delay(bh); - } else if (buffer_mapped(bh)) { + set_buffer_mapped(bh); + } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); - } cur_logical++; pblock++; @@ -930,10 +929,10 @@ static void mpage_da_map_blocks(struct m if (buffer_delay(lbh)) mpage_put_bnr_to_bhs(mpd, next, &new); - /* go for the remaining blocks */ - next += new.b_size >> mpd->inode->i_blkbits; - remain -= new.b_size; - } + /* go for the remaining blocks */ + next += new.b_size >> mpd->inode->i_blkbits; + remain -= new.b_size; + } } #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) @@ -1052,6 +1051,7 @@ static int __mpage_da_writepage(struct p head = page_buffers(page); bh = head; do { + BUG_ON(buffer_locked(bh)); if (buffer_dirty(bh)) mpage_add_bh_to_extent(mpd, logical, bh); Index: linux-2.6-linus/mm/page-writeback.c =================================================================== --- linux-2.6-linus.orig/mm/page-writeback.c +++ linux-2.6-linus/mm/page-writeback.c @@ -881,7 +881,12 @@ int write_cache_pages(struct address_spa pagevec_init(&pvec, 0); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ - end = -1; + /* + * write only till the specified range_end even in cyclic mode + */ + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (!end) + end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT;