[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4E0242D1.3030106@redhat.com>
Date: Wed, 22 Jun 2011 14:30:25 -0500
From: Eric Sandeen <sandeen@...hat.com>
To: Jan Kara <jack@...e.cz>
CC: Ted Tso <tytso@....edu>, Christoph Hellwig <hch@...radead.org>,
linux-ext4@...r.kernel.org
Subject: Re: [PATCH v3] ext4: Rewrite ext4_page_mkwrite() to use generic helpers
On 6/22/11 1:24 PM, Jan Kara wrote:
> Rewrite ext4_page_mkwrite() to use the __block_page_mkwrite() helper. This
> removes the need to use i_alloc_sem to avoid races with truncate, which
> seems to be the wrong locking order according to the lock ordering documented
> in mm/rmap.c. Also, calling ext4_da_write_begin() as the old code did seems to
> be problematic because we can decide to flush delay-allocated blocks, which
> will acquire the s_umount semaphore - again creating an unpleasant lock
> dependency, if not directly a deadlock.
I have a customer testcase which reliably locks up with freeze & mmap, and
with this patch in place (and the other 2 higher-level patches that made
it into 3.0-rcX) it passes, FWIW.
-Eric
> Also add a check for a frozen filesystem so that we don't busy-loop in the
> page fault path when the filesystem is frozen.
>
> Signed-off-by: Jan Kara <jack@...e.cz>
> ---
> fs/ext4/inode.c | 106 ++++++++++++++++++++++++++++--------------------------
> 1 files changed, 55 insertions(+), 51 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index e3126c0..bd30976 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5843,80 +5843,84 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> struct page *page = vmf->page;
> loff_t size;
> unsigned long len;
> - int ret = -EINVAL;
> - void *fsdata;
> + int ret;
> struct file *file = vma->vm_file;
> struct inode *inode = file->f_path.dentry->d_inode;
> struct address_space *mapping = inode->i_mapping;
> + handle_t *handle;
> + get_block_t *get_block;
> + int retries = 0;
>
> /*
> - * Get i_alloc_sem to stop truncates messing with the inode. We cannot
> - * get i_mutex because we are already holding mmap_sem.
> + * This check is racy but catches the common case. We rely on
> + * __block_page_mkwrite() to do a reliable check.
> */
> - down_read(&inode->i_alloc_sem);
> - size = i_size_read(inode);
> - if (page->mapping != mapping || size <= page_offset(page)
> - || !PageUptodate(page)) {
> - /* page got truncated from under us? */
> - goto out_unlock;
> + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
> + /* Delalloc case is easy... */
> + if (test_opt(inode->i_sb, DELALLOC) &&
> + !ext4_should_journal_data(inode) &&
> + !ext4_nonda_switch(inode->i_sb)) {
> + do {
> + ret = __block_page_mkwrite(vma, vmf,
> + ext4_da_get_block_prep);
> + } while (ret == -ENOSPC &&
> + ext4_should_retry_alloc(inode->i_sb, &retries));
> + goto out_ret;
> }
> - ret = 0;
>
> lock_page(page);
> - wait_on_page_writeback(page);
> - if (PageMappedToDisk(page)) {
> - up_read(&inode->i_alloc_sem);
> - return VM_FAULT_LOCKED;
> + size = i_size_read(inode);
> + /* Page got truncated from under us? */
> + if (page->mapping != mapping || page_offset(page) > size) {
> + unlock_page(page);
> + ret = VM_FAULT_NOPAGE;
> + goto out;
> }
>
> if (page->index == size >> PAGE_CACHE_SHIFT)
> len = size & ~PAGE_CACHE_MASK;
> else
> len = PAGE_CACHE_SIZE;
> -
> /*
> - * return if we have all the buffers mapped. This avoid
> - * the need to call write_begin/write_end which does a
> - * journal_start/journal_stop which can block and take
> - * long time
> + * Return if we have all the buffers mapped. This avoids the need to do
> + * journal_start/journal_stop which can block and take a long time
> */
> if (page_has_buffers(page)) {
> if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
> ext4_bh_unmapped)) {
> - up_read(&inode->i_alloc_sem);
> - return VM_FAULT_LOCKED;
> + /* Wait so that we don't change page under IO */
> + wait_on_page_writeback(page);
> + ret = VM_FAULT_LOCKED;
> + goto out;
> }
> }
> unlock_page(page);
> - /*
> - * OK, we need to fill the hole... Do write_begin write_end
> - * to do block allocation/reservation.We are not holding
> - * inode.i__mutex here. That allow * parallel write_begin,
> - * write_end call. lock_page prevent this from happening
> - * on the same page though
> - */
> - ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
> - len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
> - if (ret < 0)
> - goto out_unlock;
> - ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
> - len, len, page, fsdata);
> - if (ret < 0)
> - goto out_unlock;
> - ret = 0;
> -
> - /*
> - * write_begin/end might have created a dirty page and someone
> - * could wander in and start the IO. Make sure that hasn't
> - * happened.
> - */
> - lock_page(page);
> - wait_on_page_writeback(page);
> - up_read(&inode->i_alloc_sem);
> - return VM_FAULT_LOCKED;
> -out_unlock:
> - if (ret)
> + /* OK, we need to fill the hole... */
> + if (ext4_should_dioread_nolock(inode))
> + get_block = ext4_get_block_write;
> + else
> + get_block = ext4_get_block;
> +retry_alloc:
> + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
> + if (IS_ERR(handle)) {
> ret = VM_FAULT_SIGBUS;
> - up_read(&inode->i_alloc_sem);
> + goto out;
> + }
> + ret = __block_page_mkwrite(vma, vmf, get_block);
> + if (!ret && ext4_should_journal_data(inode)) {
> + if (walk_page_buffers(handle, page_buffers(page), 0,
> + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
> + unlock_page(page);
> + ret = VM_FAULT_SIGBUS;
> + goto out;
> + }
> + ext4_set_inode_state(inode, EXT4_STATE_JDATA);
> + }
> + ext4_journal_stop(handle);
> + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
> + goto retry_alloc;
> +out_ret:
> + ret = block_page_mkwrite_return(ret);
> +out:
> return ret;
> }
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists