ext4: Add generic writepage callback From: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V --- fs/ext4/inode.c | 338 +++++++++++++++++-------------------------------------- 1 files changed, 104 insertions(+), 234 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1efb296..c1ddaaf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2326,7 +2326,7 @@ static int __mpage_da_writepage(struct page *page, * We need to try to allocate * unmapped blocks in the same page. * Otherwise we won't make progress - * with the page in ext4_da_writepage + * with the page in ext4_writepage */ if (ext4_bh_delay_or_unwritten(NULL, bh)) { mpage_add_bh_to_extent(mpd, logical, @@ -2452,14 +2452,102 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, return ret; } +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int __ext4_journalled_writepage(struct page *page, + struct writeback_control *wbc, + unsigned int len) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + page_bufs = page_buffers(page); + BUG_ON(!page_bufs); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); + + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = walk_page_buffers(handle, page_bufs, 0, len, + NULL, do_journal_get_write_access); + + err = walk_page_buffers(handle, page_bufs, 0, len, + NULL, write_end_fn); + if (ret == 0) + ret = err; + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + + walk_page_buffers(handle, page_bufs, 0, len, + NULL, bput_one); + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; +out: + return ret; +} + /* + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), noone guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. + * * This function can get called via... * - ext4_da_writepages after taking page lock (have journal handle) * - journal_submit_inode_data_buffers (no journal handle) * - shrink_page_list via pdflush (no journal handle) * - grab_page_cache when doing write_begin (have journal handle) + * + * We don't do any block allocation in this function. If we have page with + * multiple blocks we need to write those buffer_heads that are mapped. This + * is important for mmaped based write. So if we do with blocksize 1K + * truncate(f, 1024); + * a = mmap(f, 0, 4096); + * a[0] = 'a'; + * truncate(f, 4096); + * we have in the page first buffer_head mapped via page_mkwrite call back + * but other bufer_heads would be unmapped but dirty(dirty done via the + * do_wp_page). So writepage should write the first block. If we modify + * the mmap area beyond 1024 we will again get a page_fault and the + * page_mkwrite callback will do the block allocation and mark the + * buffer_heads mapped. + * + * We redirty the page if we have any buffer_heads that is either delay or + * unwritten in the page. + * + * We can get recursively called as show below. + * + * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext4_writepage() + * + * But since we don't do any block allocation we should not deadlock. + * Page also have the dirty flag cleared so we don't get recurive page_lock. */ -static int ext4_da_writepage(struct page *page, +static int ext4_writepage(struct page *page, struct writeback_control *wbc) { int ret = 0; @@ -2468,7 +2556,7 @@ static int ext4_da_writepage(struct page *page, struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; - trace_mark(ext4_da_writepage, + trace_mark(ext4_writepage, "dev %s ino %lu page_index %lu", inode->i_sb->s_id, inode->i_ino, page->index); size = i_size_read(inode); @@ -2532,6 +2620,15 @@ static int ext4_da_writepage(struct page *page, block_commit_write(page, 0, len); } + if (PageChecked(page) && ext4_should_journal_data(inode)) { + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + ClearPageChecked(page); + return __ext4_journalled_writepage(page, wbc, len); + } + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ret = nobh_writepage(page, noalloc_get_block_write, wbc); else @@ -3085,233 +3182,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, ext4_get_block); } -static int bget_one(handle_t *handle, struct buffer_head *bh) -{ - get_bh(bh); - return 0; -} - -static int bput_one(handle_t *handle, struct buffer_head *bh) -{ - put_bh(bh); - return 0; -} - -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), noone guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * In all journaling modes block_write_full_page() will start the I/O. - * - * Problem: - * - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext4_writepage() - * - * Similar for: - * - * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext4_get_block(). We will deadlock on various things like - * lock_journal and i_data_sem - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. - * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. - * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). - * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). - * We'll probably need that anyway for journalling writepage() output. - * - * We don't honour synchronous mounts for writepage(). That would be - * disastrous. Any write() or metadata operation will sync the fs for - * us. - * - */ -static int __ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - - if (test_opt(inode->i_sb, NOBH)) - return nobh_writepage(page, noalloc_get_block_write, wbc); - else - return block_write_full_page(page, noalloc_get_block_write, - wbc); -} - -static int ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - trace_mark(ext4_normal_writepage, - "dev %s ino %lu page_index %lu", - inode->i_sb->s_id, inode->i_ino, page->index); - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. - */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_delay_or_unwritten)); - } - - if (!ext4_journal_current_handle()) - return __ext4_normal_writepage(page, wbc); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - -static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) -{ - loff_t size; - unsigned int len; - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct buffer_head *page_bufs; - handle_t *handle = NULL; - int ret = 0; - int err; - - size = i_size_read(inode); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - ret = block_prepare_write(page, 0, len, - noalloc_get_block_write); - if (ret != 0) - goto out_unlock; - - page_bufs = page_buffers(page); - walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); - /* As soon as we unlock the page, it can go away, but we have - * references to buffers so we are safe */ - unlock_page(page); - - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - ret = walk_page_buffers(handle, page_bufs, 0, len, - NULL, do_journal_get_write_access); - - err = walk_page_buffers(handle, page_bufs, 0, len, - NULL, write_end_fn); - if (ret == 0) - ret = err; - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - - walk_page_buffers(handle, page_bufs, 0, len, - NULL, bput_one); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - goto out; - -out_unlock: - unlock_page(page); -out: - return ret; -} - -static int ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - trace_mark(ext4_journalled_writepage, - "dev %s ino %lu page_index %lu", - inode->i_sb->s_id, inode->i_ino, page->index); - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. - */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_delay_or_unwritten)); - } - - if (ext4_journal_current_handle()) - goto no_write; - - if (PageChecked(page)) { - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - ClearPageChecked(page); - return __ext4_journalled_writepage(page, wbc); - } else { - /* - * It may be a page full of checkpoint-mode buffers. We don't - * really know unless we go poke around in the buffer_heads. - * But block_write_full_page will do the right thing. - */ - return block_write_full_page(page, noalloc_get_block_write, - wbc); - } -no_write: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - static int ext4_readpage(struct file *file, struct page *page) { return mpage_readpage(page, ext4_get_block); @@ -3458,7 +3328,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) static const struct address_space_operations ext4_ordered_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_ordered_write_end, @@ -3474,7 +3344,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) static const struct address_space_operations ext4_writeback_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_writeback_write_end, @@ -3490,7 +3360,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_journalled_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, @@ -3505,7 +3375,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_da_writepage, + .writepage = ext4_writepage, .writepages = ext4_da_writepages, .sync_page = block_sync_page, .write_begin = ext4_da_write_begin,