Inverse locking order of page_lock and transaction start. Signed-off-by: Jan Kara --- fs/ext4/ext4.h | 4 +- fs/ext4/extents.c | 15 +-- fs/ext4/inode.c | 327 +++++++++++++++++++++++++++++------------------------ 3 files changed, 183 insertions(+), 163 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fa25f20..b786a17 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1116,7 +1116,7 @@ extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); -extern int ext4_block_truncate_page(handle_t *handle, struct page *page, +extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); extern int ext4_get_block(struct inode *inode, sector_t iblock, @@ -1285,7 +1285,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create, int extend_disksize); -extern void ext4_ext_truncate(struct inode *, struct page *); +extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 64ab158..a7280c7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2882,7 +2882,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(struct inode * inode, struct page *page) +void ext4_ext_truncate(struct inode * inode) { struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; @@ -2895,18 +2895,11 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) */ err = ext4_writepage_trans_blocks(inode) + 3; handle = ext4_journal_start(inode, err); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; - } - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e9db7a7..d0fd1c6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1204,19 +1204,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, to = from + len; retry: - page = __grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; - handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { - unlock_page(page); - page_cache_release(page); ret = PTR_ERR(handle); goto out; } + page = __grab_cache_page(mapping, index); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, ext4_get_block); @@ -1226,8 +1227,8 @@ retry: } if (ret) { - ext4_journal_stop(handle); unlock_page(page); + ext4_journal_stop(handle); page_cache_release(page); } @@ -1256,29 +1257,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) } /* - * Generic write_end handler for ordered and writeback ext4 journal modes. - * We can't use generic_write_end, because that unlocks the page and we need to - * unlock the page after ext4_journal_stop, but ext4_journal_stop must run - * after block_write_end. - */ -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = file->f_mapping->host; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - return copied; -} - -/* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * @@ -1291,7 +1269,7 @@ static int ext4_ordered_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; unsigned from, to; int ret = 0, ret2; @@ -1312,7 +1290,7 @@ static int ext4_ordered_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - copied = ext4_generic_write_end(file, mapping, pos, len, copied, + copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata); if (copied < 0) ret = copied; @@ -1320,8 +1298,6 @@ static int ext4_ordered_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? ret : copied; } @@ -1332,7 +1308,7 @@ static int ext4_writeback_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; int ret = 0, ret2; loff_t new_i_size; @@ -1340,7 +1316,7 @@ static int ext4_writeback_write_end(struct file *file, if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - copied = ext4_generic_write_end(file, mapping, pos, len, copied, + copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata); if (copied < 0) ret = copied; @@ -1348,8 +1324,6 @@ static int ext4_writeback_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? ret : copied; } @@ -1388,10 +1362,10 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } + unlock_page(page); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); page_cache_release(page); return ret ? ret : copied; @@ -1663,11 +1637,10 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) } /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext4_get_block() should be journalled - * along with the data so we don't crash and then get metadata which - * refers to old data. + * Note that we don't need to start a transaction unless we're journaling + * data because we should have holes filled from ext4_page_mkwrite(). If + * we are journaling data, we cannot start transaction directly because + * transaction start ranks above page lock so we have to do some magic... * * In all journalling modes block_write_full_page() will start the I/O. * @@ -1711,10 +1684,8 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) * disastrous. Any write() or metadata operation will sync the fs for * us. * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ -static int ext4_ordered_writepage(struct page *page, +static int __ext4_ordered_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; @@ -1723,22 +1694,6 @@ static int ext4_ordered_writepage(struct page *page, int ret = 0; int err; - J_ASSERT(PageLocked(page)); - - /* - * We give up here if we're reentered, because it might be for a - * different filesystem. - */ - if (ext4_journal_current_handle()) - goto out_fail; - - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; - } - if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1762,114 +1717,139 @@ static int ext4_ordered_writepage(struct page *page, * and generally junk. */ if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + handle = ext4_journal_start(inode, + ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_put; + } + + ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, jbd2_journal_dirty_data_fn); + err = ext4_journal_stop(handle); if (!ret) ret = err; } - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; +out_put: + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, + bput_one); return ret; +} + +static int ext4_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + J_ASSERT(PageLocked(page)); + + /* + * We give up here if we're reentered, because it might be for a + * different filesystem. + */ + if (!ext4_journal_current_handle()) + return __ext4_ordered_writepage(page, wbc); -out_fail: redirty_page_for_writepage(wbc, page); unlock_page(page); - return ret; + return 0; } -static int ext4_writeback_writepage(struct page *page, +static int __ext4_writeback_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; + + if (test_opt(inode->i_sb, NOBH)) + return nobh_writepage(page, ext4_get_block, wbc); + else + return block_write_full_page(page, ext4_get_block, wbc); +} + + +static int ext4_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + if (!ext4_journal_current_handle()) + return __ext4_writeback_writepage(page, wbc); + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +static int __ext4_journalled_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; handle_t *handle = NULL; int ret = 0; int err; - if (ext4_journal_current_handle()) - goto out_fail; + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block); + if (ret != 0) + goto out_unlock; + + page_bufs = page_buffers(page); + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, + bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_fail; + goto out; } - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_get_block, wbc); - else - ret = block_write_full_page(page, ext4_get_block, wbc); + ret = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + err = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, write_end_fn); + if (ret == 0) + ret = err; err = ext4_journal_stop(handle); if (!ret) ret = err; - return ret; -out_fail: - redirty_page_for_writepage(wbc, page); + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + goto out; + +out_unlock: unlock_page(page); +out: return ret; } static int ext4_journalled_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; - if (ext4_journal_current_handle()) goto no_write; - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto no_write; - } - if (!page_has_buffers(page) || PageChecked(page)) { /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_get_block); - if (ret != 0) { - ext4_journal_stop(handle); - goto out_unlock; - } - ret = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); - - err = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - unlock_page(page); + return __ext4_journalled_writepage(page, wbc); } else { /* * It may be a page full of checkpoint-mode buffers. We don't * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. */ - ret = block_write_full_page(page, ext4_get_block, wbc); + return block_write_full_page(page, ext4_get_block, wbc); } - err = ext4_journal_stop(handle); - if (!ret) - ret = err; -out: - return ret; - no_write: redirty_page_for_writepage(wbc, page); -out_unlock: unlock_page(page); - goto out; + return 0; } static int ext4_readpage(struct file *file, struct page *page) @@ -2084,7 +2064,7 @@ void ext4_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -int ext4_block_truncate_page(handle_t *handle, struct page *page, +int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -2093,8 +2073,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; + struct page *page; int err = 0; + page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); + if (!page) + return -EINVAL; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -2558,7 +2543,6 @@ void ext4_truncate(struct inode *inode) int n; ext4_lblk_t last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -2568,41 +2552,21 @@ void ext4_truncate(struct inode *inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - /* - * We have to lock the EOF page here, because lock_page() nests - * outside jbd2_journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - return; - } - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - ext4_ext_truncate(inode, page); + ext4_ext_truncate(inode); return; } handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; /* AKPM: return what? */ - } last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) @@ -3718,13 +3682,76 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return err; } +static int ext4_bh_mapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) { + struct file *file = vma->vm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long len; + loff_t size; + int ret = -EINVAL, err; + handle_t *handle; + struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1 }; + + /* + * Get i_alloc_sem to stop truncates messing with the inode. We cannot + * get i_mutex because we are already holding mmap_sem. We could + * as well just lock the page but we need to start a transaction + * before locking the page and don't want to unnecessarity start + * the transaction if we don't need to write the page. + */ + down_read(&inode->i_alloc_sem); + size = i_size_read(inode); + if (page->mapping != mapping || size <= page_offset(page) + || !PageUptodate(page)) { + /* page got truncated from under us? */ + goto out_unlock; + } + ret = 0; + if (PageMappedToDisk(page)) + goto out_unlock; + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_mapped)) + goto out_unlock; + } + /* - * if ext4_get_block resulted in a split of an uninitialized extent, - * in file system full case, we will have to take the journal write - * access and zero out the page. The journal handle get initialized - * in ext4_get_block. + * OK, we need to fill the hole... Start a transaction, lock the + * page and do writepage */ - return block_page_mkwrite(vma, page, ext4_get_block); + + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_unlock; + } + lock_page(page); + wbc.range_start = page_offset(page); + wbc.range_end = page_offset(page) + PAGE_CACHE_SIZE; + if (ext4_should_writeback_data(inode)) + ret = __ext4_writeback_writepage(page, &wbc); + else if (ext4_should_order_data(inode)) + ret = __ext4_ordered_writepage(page, &wbc); + else + ret = __ext4_journalled_writepage(page, &wbc); + /* Page got unlocked in writepage */ + err = ext4_journal_stop(handle); + if (!ret) + ret = err; +out_unlock: + up_read(&inode->i_alloc_sem); + return ret; } -- 1.5.2.4