lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20130308124940.GB18949@gmail.com>
Date:	Fri, 8 Mar 2013 20:49:40 +0800
From:	Zheng Liu <gnehzuil.liu@...il.com>
To:	Lukáš Czerner <lczerner@...hat.com>
Cc:	linux-ext4@...r.kernel.org
Subject: Re: [BUG][Bigalloc] applictions will be blocked for more than 120s
 when we run xfstests #083

On Thu, Mar 07, 2013 at 03:07:25PM +0100, Lukáš Czerner wrote:
[snip]
> 
> Yes, I can confirm that. The problem is that when we have delayed
> write into unwritten extent we do not reserve any space, which is ok
> because the data has already been allocated, however we might need
> metadata blocks to cover unwritten extent conversion which we do not
> have reserved.
> 
> Then in writeback time when the extent splic actually happen we
> might not have enough space to allocate metadata blocks hence
> ext4_map_blocks() in mpage_da_map_and_submit() will return -ENOSPC
> to the ext4_da_writepages() caller.
> 
> However we're in writeback and we do not expect allocation to fail
> because of ENOSPC at all because we should have reserved everything
> we need to complete successfully so in the loop we'll force the
> journal commit hoping that some blocks will be released and retry
> the allocation again...and we'll be stuck in this loop forever.
> 
> Now here is patch which fixes the problem for me, however it still
> needs some testing. Also we should probably do something about the
> infinite loop in the ext4_da_writepages() - at least warn the user
> if we try too many times so we at least know what's happening
> because it was not easy to find this out.
> 
> Hopefully I'll send the proper patch soon, but feel free to test the
> fix yourself.

I have seen that you have sent the patch series to the mailing list, and
I will take a close look at them.

For this patch, I can confirm that xfstests #083 never hang, and I only
see the warning from ext4_da_update_reserve_space() in #269.  I guess
that has been fixed by your patch series.  Thanks for fixing it.
Tested-by: Zheng Liu <wenqing.lz@...bao.com>

Regards,
                                                - Zheng

> 
> Thanks!
> -Lukas
> 
> ---
>  fs/ext4/ext4.h  |    1 +
>  fs/ext4/inode.c |   76 +++++++++++++++++++++++++++++++++++++++++++++++++-----
>  2 files changed, 70 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 6e16c18..c20efe2 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -581,6 +581,7 @@ enum {
>  #define EXT4_GET_BLOCKS_NO_LOCK			0x0100
>  	/* Do not put hole in extent cache */
>  #define EXT4_GET_BLOCKS_NO_PUT_HOLE		0x0200
> +#define EXT4_GET_BLOCKS_METADATA_RESERVED	0x0400
>  
>  /*
>   * Flags used by ext4_free_blocks
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 9ea0cde..46cc739 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -606,7 +606,8 @@ found:
>  	 * let the underlying get_block() function know to
>  	 * avoid double accounting
>  	 */
> -	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
> +	if ((flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ||
> +	    (flags & EXT4_GET_BLOCKS_METADATA_RESERVED))
>  		ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
>  	/*
>  	 * We need to check for EXT4 here because migrate
> @@ -636,7 +637,8 @@ found:
>  			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
>  			ext4_da_update_reserve_space(inode, retval, 1);
>  	}
> -	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
> +	if ((flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ||
> +	    (flags & EXT4_GET_BLOCKS_METADATA_RESERVED))
>  		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
>  
>  	if (retval > 0) {
> @@ -1215,6 +1217,56 @@ static int ext4_journalled_write_end(struct file *file,
>  	return ret ? ret : copied;
>  }
>  
> +
> +/*
> + * Reserve a metadata for a single block located at lblock
> + */
> +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
> +{
> +	int retries = 0;
> +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> +	struct ext4_inode_info *ei = EXT4_I(inode);
> +	unsigned int md_needed;
> +	ext4_lblk_t save_last_lblock;
> +	int save_len;
> +
> +	/*
> +	 * recalculate the amount of metadata blocks to reserve
> +	 * in order to allocate nrblocks
> +	 * worse case is one extent per block
> +	 */
> +repeat:
> +	spin_lock(&ei->i_block_reservation_lock);
> +	/*
> +	 * ext4_calc_metadata_amount() has side effects, which we have
> +	 * to be prepared undo if we fail to claim space.
> +	 */
> +	save_len = ei->i_da_metadata_calc_len;
> +	save_last_lblock = ei->i_da_metadata_calc_last_lblock;
> +	md_needed = EXT4_NUM_B2C(sbi,
> +				 ext4_calc_metadata_amount(inode, lblock));
> +	trace_ext4_da_reserve_space(inode, md_needed);
> +
> +	/*
> +	 * We do still charge estimated metadata to the sb though;
> +	 * we cannot afford to run out of free blocks.
> +	 */
> +	if (ext4_claim_free_clusters(sbi, md_needed, 0)) {
> +		ei->i_da_metadata_calc_len = save_len;
> +		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
> +		spin_unlock(&ei->i_block_reservation_lock);
> +		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
> +			yield();
> +			goto repeat;
> +		}
> +		return -ENOSPC;
> +	}
> +	ei->i_reserved_meta_blocks += md_needed;
> +	spin_unlock(&ei->i_block_reservation_lock);
> +
> +	return 0;       /* success */
> +}
> +
>  /*
>   * Reserve a single cluster located at lblock
>   */
> @@ -1601,7 +1653,8 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
>  	 */
>  	map.m_lblk = next;
>  	map.m_len = max_blocks;
> -	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
> +	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
> +			   EXT4_GET_BLOCKS_METADATA_RESERVED;
>  	if (ext4_should_dioread_nolock(mpd->inode))
>  		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
>  	if (mpd->b_state & (1 << BH_Delay))
> @@ -1766,7 +1819,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
>  			      struct buffer_head *bh)
>  {
>  	struct extent_status es;
> -	int retval;
> +	int retval, ret;
>  	sector_t invalid_block = ~((sector_t) 0xffff);
>  
>  	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
> @@ -1804,9 +1857,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
>  		map->m_len = retval;
>  		if (ext4_es_is_written(&es))
>  			map->m_flags |= EXT4_MAP_MAPPED;
> -		else if (ext4_es_is_unwritten(&es))
> +		else if (ext4_es_is_unwritten(&es)) {
> +			/*
> +			 * We have delalloc write into the unwritten extent
> +			 * which means that we have to reserve metadata
> +			 * potentially required for converting unwritten
> +			 * extent.
> +			 */
> +			ret = ext4_da_reserve_metadata(inode, iblock);
> +			if (ret)
> +				/* not enough space to reserve */
> +				retval = ret;
>  			map->m_flags |= EXT4_MAP_UNWRITTEN;
> -		else
> +		} else
>  			BUG_ON(1);
>  
>  		return retval;
> @@ -1838,7 +1901,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
>  
>  add_delayed:
>  	if (retval == 0) {
> -		int ret;
>  		/*
>  		 * XXX: __block_prepare_write() unmaps passed block,
>  		 * is it OK?
> -- 
> 1.7.7.6
> 
> 
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists