linux-ext4 - Re: [PATCH 2/3] ext4: basic bug shield for move_extent_per

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <878vczdlof.fsf@openvz.org>
Date:	Tue, 28 Aug 2012 20:29:52 +0400
From:	Dmitry Monakhov <dmonakhov@...nvz.org>
To:	linux-ext4@...r.kernel.org
Cc:	tytso@....edu, a-fujita@...jp.nec.com
Subject: Re: [PATCH 2/3] ext4: basic bug shield for move_extent_per_page

On Tue, 28 Aug 2012 20:21:42 +0400, Dmitry Monakhov <dmonakhov@...nvz.org> wrote:
In order to test MOVE_EXT ioctl code i use run e4defrag and fsstress
in parallel:

View attachment "285" of type "text/plain" (2522 bytes)


> The move_extent_per_page function is just one big peace of crap.
> 
> Non-full list of bugs:
> 1) uninitialized extent optimization does not hold page's lock,
>    and simply replace brunches, so writeback code goes
>    crazy because block mapping changed under it's feets
>    kernel BUG at fs/ext4/inode.c:1434!
> 
> 2) uninitialized extent may became initialized right after we
>    drop i_data_sem, so extent state must be rechecked
> 
> 3) Locked pages goes up to date via following sequence:
>    ->readpage(page); lock_page(page); use_that_page(page)
>    But after readpage() one may invalidate it because it is
>    uptodate and unlocked (reclaimer does that)
>    As result kernel bug at include/linux/buffer_head.c:133!
> 4) We call write_begin() with already opened stansaction which
>    result in following deadlock:
> ->move_extent_per_page()
>   ->ext4_journal_start()-> hold journal transaction
>   ->write_begin()
>     ->ext4_da_write_begin()
>       ->ext4_nonda_switch()
>         ->writeback_inodes_sb_if_idle()  --> will wait for journal_stop()
> 
> 5) try_to_release_page() may fail and it does fail if one of page's bh was
>    pinned by journal
> 
> 6) If we about to change page's mapping we MUST hold it's lock during entire
>    remapping procedure, this is true for both pages(original and donor one)
> 
> Fixes:
> 
> - Avoid (1) and (2) simply by temproraly drop uninitialized extent handling
>   optimization, this will be reimplemented later.
> 
> - Fix (3) by manually forcing page to uptodate state w/o dropping it's lock
> 
> - Fix (4) by rearranging existing locking:
>   from: journal_start(); ->write_begin
>   to: write_begin(); journal_extend()
> - Fix (5) simply by checking retvalue
> - (6) requires full function's code rearrangement, will be done later.
> ---
>  fs/ext4/move_extent.c |   74 ++++++++++++++++++++++++-------------------------
>  1 files changed, 36 insertions(+), 38 deletions(-)
> 
> diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
> index c5826c6..c5ca317 100644
> --- a/fs/ext4/move_extent.c
> +++ b/fs/ext4/move_extent.c
> @@ -812,11 +812,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
>  	 * inode and donor_inode may change each different metadata blocks.
>  	 */
>  	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
> -	handle = ext4_journal_start(orig_inode, jblocks);
> -	if (IS_ERR(handle)) {
> -		*err = PTR_ERR(handle);
> -		return 0;
> -	}
>  
>  	if (segment_eq(get_fs(), KERNEL_DS))
>  		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
> @@ -824,19 +819,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
>  	orig_blk_offset = orig_page_offset * blocks_per_page +
>  		data_offset_in_page;
>  
> -	/*
> -	 * If orig extent is uninitialized one,
> -	 * it's not necessary force the page into memory
> -	 * and then force it to be written out again.
> -	 * Just swap data blocks between orig and donor.
> -	 */
> -	if (uninit) {
> -		replaced_count = mext_replace_branches(handle, orig_inode,
> -						donor_inode, orig_blk_offset,
> -						block_len_in_page, err);
> -		goto out2;
> -	}
> -
>  	offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
>  
>  	/* Calculate data_size */
> @@ -862,12 +844,28 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
>  				 &page, &fsdata);
>  	if (unlikely(*err < 0))
>  		goto out;
> +	handle = journal_current_handle();
>  
> +	if (!page_has_buffers(page))
> +		create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
> +
> +	/* Force page uptodate similar block_write_commit */
> +	page_zero_new_buffers(page, 0, PAGE_SIZE);
>  	if (!PageUptodate(page)) {
> -		mapping->a_ops->readpage(o_filp, page);
> -		lock_page(page);
> +		struct buffer_head *head, *bh;
> +		bh = head =  page_buffers(page);
> +		do {
> +			if (!bh_uptodate_or_lock(bh)) {
> +				if (bh_submit_read(bh) < 0) {
> +					put_bh(bh);
> +					err2  = -EIO;
> +					goto drop_page;
> +				}
> +			}
> +			bh = bh->b_this_page;
> +		} while (bh != head);
>  	}
> -
> +	SetPageUptodate(page);
>  	/*
>  	 * try_to_release_page() doesn't call releasepage in writeback mode.
>  	 * We should care about the order of writing to the same file
> @@ -876,9 +874,15 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
>  	 * writeback of the page.
>  	 */
>  	wait_on_page_writeback(page);
> -
> -	/* Release old bh and drop refs */
> -	try_to_release_page(page, 0);
> +	/* Finally page is fully uptodate and locked, it is time to drop
> +	 * old mapping info, function may fail other task hold reference
> +	 * on page's bh */
> +	if (!try_to_release_page(page, 0)) {
> +		replaced_size = 0;
> +		goto write_end;
> +	}
> +	if (ext4_journal_extend(handle, jblocks) < 0)
> +		goto write_end;
>  
>  	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
>  					orig_blk_offset, block_len_in_page,
> @@ -889,7 +893,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
>  			replaced_size =
>  				block_len_in_page << orig_inode->i_blkbits;
>  		} else
> -			goto out;
> +			goto drop_page;
>  	}
>  
>  	if (!page_has_buffers(page))
> @@ -903,30 +907,24 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
>  		*err = ext4_get_block(orig_inode,
>  				(sector_t)(orig_blk_offset + i), bh, 0);
>  		if (*err < 0)
> -			goto out;
> +			goto drop_page;
>  
>  		if (bh->b_this_page != NULL)
>  			bh = bh->b_this_page;
>  	}
> -
> +write_end:
>  	*err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
>  			       page, fsdata);
> -	page = NULL;
> -
>  out:
> -	if (unlikely(page)) {
> -		if (PageLocked(page))
> -			unlock_page(page);
> -		page_cache_release(page);
> -		ext4_journal_stop(handle);
> -	}
> -out2:
> -	ext4_journal_stop(handle);
> -
>  	if (err2)
>  		*err = err2;
>  
>  	return replaced_count;
> +drop_page:
> +	unlock_page(page);
> +	page_cache_release(page);
> +	ext4_journal_stop(handle);
> +	goto out;
>  }
>  
>  /**
> -- 
> 1.7.7.6
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html