lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1219699564.6394.26.camel@mingming-laptop>
Date:	Mon, 25 Aug 2008 14:26:04 -0700
From:	Mingming Cao <cmm@...ibm.com>
To:	"Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>
Cc:	tytso@....edu, sandeen@...hat.com, linux-ext4@...r.kernel.org
Subject: Re: [RFC PATCH -v2] ext4: Add percpu dirty block accounting.


On Mon, 2008-08-25 at 16:50 +0530, Aneesh Kumar K.V wrote:
> This patch add dirty block accounting using percpu_counters.
> Delayed allocation block reservation is now done by updating
> dirty block counter. In the later patch we switch to non
> delalloc mode if the filesystem free blocks is < that
> 150 % of total filesystem  dirty blocks
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@...ux.vnet.ibm.com>
> ---
>  fs/ext4/balloc.c  |   64 ++++++++++++++++++++++++++++------------------------
>  fs/ext4/ext4_sb.h |    1 +
>  fs/ext4/inode.c   |   25 ++++++++++++--------
>  fs/ext4/mballoc.c |   17 ++-----------
>  fs/ext4/super.c   |    8 +++++-
>  5 files changed, 60 insertions(+), 55 deletions(-)
> 
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index b7d1347..4ebe3b6 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -1605,11 +1605,13 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
>  int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
>  						ext4_fsblk_t nblocks)
>  {
> -	s64 free_blocks;
> +	s64 free_blocks, dirty_blocks;
>  	ext4_fsblk_t root_blocks = 0;
>  	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
> +	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
> 
> -	free_blocks = percpu_counter_read(fbc);
> +	free_blocks = percpu_counter_read_positive(fbc);
> +	dirty_blocks = percpu_counter_read_positive(dbc);
> 
>  	if (!capable(CAP_SYS_RESOURCE) &&
>  		sbi->s_resuid != current->fsuid &&
> @@ -1620,26 +1622,27 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
>  	 * counters. So we need to make sure we have free blocks more
>  	 * than FBC_BATCH  * nr_cpu_ids. Also add a window of 4 times.
>  	 */
> -	if (free_blocks - (nblocks + root_blocks) <
> +	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
>  				(4 * (FBC_BATCH * nr_cpu_ids))) {
> -		/*
> -		 * We need to sum and claim under lock
> -		 * This is the slow patch which will be
> -		 * taken when we are very low on free blocks
> -		 */
> -		if (percpu_counter_sum_and_sub(fbc, nblocks + root_blocks))
> -			return -ENOSPC;
> -		/* add root_blocks back */
> -		percpu_counter_add(fbc, root_blocks);
> -		return 0;
> +
> +		free_blocks  = percpu_counter_sum(fbc);
> +		dirty_blocks = percpu_counter_sum(dbc);
> +		if (dirty_blocks < 0) {
> +			printk(KERN_CRIT "Dirty block accounting "
> +					"went wrong %lld\n",
> +					dirty_blocks);
> +		}
>  	}
>  #endif
> -	if (free_blocks < (root_blocks + nblocks))
> +	/* Check whether we have space after
> +	 * accounting for current dirty blocks
> +	 */
> +	if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks))
>  		/* we don't have free space */
>  		return -ENOSPC;
> 
> -	/* reduce fs free blocks counter */
> -	percpu_counter_sub(fbc, nblocks);
> +	/* Add the blocks to nblocks */
> +	percpu_counter_add(dbc, nblocks);
>  	return 0;
>  }
> 

I noticed that you dropped the code that updates the counter together with
the accurate percpu_counter_sum(). This opens a window in which two
allocation reservations could both pass through when the fs is almost
full/fully booked... Is there any reason for dropping that?

> @@ -1655,10 +1658,13 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
>  ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
>  						ext4_fsblk_t nblocks)
>  {
> -	ext4_fsblk_t free_blocks;
> +	ext4_fsblk_t free_blocks, dirty_blocks;
>  	ext4_fsblk_t root_blocks = 0;
> +	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
> +	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
> 
> -	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
> +	free_blocks  = percpu_counter_read_positive(fbc);
> +	dirty_blocks = percpu_counter_read_positive(dbc);
> 
>  	if (!capable(CAP_SYS_RESOURCE) &&
>  		sbi->s_resuid != current->fsuid &&
> @@ -1669,16 +1675,16 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
>  	 * counters. So we need to make sure we have free blocks more
>  	 * than FBC_BATCH  * nr_cpu_ids. Also add a window of 4 times.
>  	 */
> -	if (free_blocks - (nblocks + root_blocks) <
> +	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
>  				(4 * (FBC_BATCH * nr_cpu_ids))) {
> -		free_blocks =
> -			percpu_counter_sum(&sbi->s_freeblocks_counter);
> +		free_blocks = percpu_counter_sum_positive(fbc);
> +		dirty_blocks = percpu_counter_sum_positive(dbc);
>  	}
>  #endif
> -	if (free_blocks <= root_blocks)
> +	if (free_blocks <= (root_blocks + dirty_blocks))
>  		/* we don't have free space */
>  		return 0;
> -	if (free_blocks - root_blocks < nblocks)
> +	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
>  		return free_blocks - root_blocks;
>  	return nblocks;
>  }
> @@ -1965,13 +1971,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
>  	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
>  	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
>  	spin_unlock(sb_bgl_lock(sbi, group_no));
> -	if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
> -		/*
> -		 * we allocated less blocks than we
> -		 * claimed. Add the difference back.
> -		 */
> -		percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
> -	}
> +	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
> +	/*
> +	 * Now reduce the dirty block count also. Should not go negative
> +	 */
> +	percpu_counter_sub(&sbi->s_dirtyblocks_counter, num);

ah... I think this is a bug which causes the ENOSPC still.
You are updating s_dirtyblocks_counter here, taking away the block
reservation, regardless of whether this block allocation goes
through the delalloc mode or the non-delalloc mode.

For example, when the fs is relatively fully booked, a file1 comes in
and switches to the non-delalloc mode because the fs is relatively fully
booked. Then, after file1 has done its block allocation, it reduces
s_dirtyblocks_counter; this is wrong, and will cause a later ENOSPC.

>  	if (sbi->s_log_groups_per_flex) {
>  		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
>  		spin_lock(sb_bgl_lock(sbi, flex_group));
> diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
> index 6300226..0fa3762 100644
> --- a/fs/ext4/ext4_sb.h
> +++ b/fs/ext4/ext4_sb.h
> @@ -59,6 +59,7 @@ struct ext4_sb_info {
>  	struct percpu_counter s_freeblocks_counter;
>  	struct percpu_counter s_freeinodes_counter;
>  	struct percpu_counter s_dirs_counter;
> +	struct percpu_counter s_dirtyblocks_counter;
>  	struct blockgroup_lock s_blockgroup_lock;
> 
>  	/* root of the per fs reservation window tree */
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 72a4a71..3f3ecc0 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1030,19 +1030,25 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
>  	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
>  	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
> 
> -	/* Account for allocated meta_blocks */
> -	mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
> +	if (mdb_free) {
> +		/* Account for allocated meta_blocks */
> +		mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
> 
> -	/* update fs free blocks counter for truncate case */
> -	percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
> +		/* update fs dirty blocks counter */
> +		/*
> +		 * FIXME!! doing this get the free block count wrong
> +		 * But we need to take care of over allocated meta-data
> +		 * blocks
> +		 */
> +		//percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);

I think we should update the overall reserved metadata blocks. Turning it
off only hides the real problem.

> +		EXT4_I(inode)->i_allocated_meta_blocks = 0;
> +		EXT4_I(inode)->i_reserved_meta_blocks = mdb;
> +	}
> 
>  	/* update per-inode reservations */
>  	BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
>  	EXT4_I(inode)->i_reserved_data_blocks -= used;
> 
> -	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
> -	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
> -	EXT4_I(inode)->i_allocated_meta_blocks = 0;
>  	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
>  }
> 
> @@ -1588,8 +1594,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
> 
>  	release = to_free + mdb_free;
> 
> -	/* update fs free blocks counter for truncate case */
> -	percpu_counter_add(&sbi->s_freeblocks_counter, release);
> +	/* update fs dirty blocks counter for truncate case */
> +	percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
> 
>  	/* update per-inode reservations */
>  	BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
> @@ -2490,7 +2496,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
>  	index = pos >> PAGE_CACHE_SHIFT;
>  	from = pos & (PAGE_CACHE_SIZE - 1);
>  	to = from + len;
> -
>  retry:
>  	/*
>  	 * With delayed allocation, we don't log the i_disksize update
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 419009f..4da4b9a 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2971,22 +2971,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
>  	le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
>  	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
>  	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
> -
> +	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
>  	/*
> -	 * free blocks account has already be reduced/reserved
> -	 * at write_begin() time for delayed allocation
> -	 * do not double accounting
> +	 * Now reduce the dirty block count also. Should not go negative
>  	 */
> -	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
> -			ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
> -		/*
> -		 * we allocated less blocks than we calimed
> -		 * Add the difference back
> -		 */
> -		percpu_counter_add(&sbi->s_freeblocks_counter,
> -				ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
> -	}
> -
> +	percpu_counter_sub(&sbi->s_dirtyblocks_counter, ac->ac_b_ex.fe_len);

Same bug as before. We can't update s_dirtyblocks unconditionally. At the
very least, check whether this allocation request is coming from delalloc.

>  	if (sbi->s_log_groups_per_flex) {
>  		ext4_group_t flex_group = ext4_flex_group(sbi,
>  							  ac->ac_b_ex.fe_group);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index ed77786..7b9db51 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -520,6 +520,7 @@ static void ext4_put_super(struct super_block *sb)
>  	percpu_counter_destroy(&sbi->s_freeblocks_counter);
>  	percpu_counter_destroy(&sbi->s_freeinodes_counter);
>  	percpu_counter_destroy(&sbi->s_dirs_counter);
> +	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
>  	brelse(sbi->s_sbh);
>  #ifdef CONFIG_QUOTA
>  	for (i = 0; i < MAXQUOTAS; i++)
> @@ -2259,6 +2260,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		err = percpu_counter_init(&sbi->s_dirs_counter,
>  				ext4_count_dirs(sb));
>  	}
> +	if (!err) {
> +		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
> +	}
>  	if (err) {
>  		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
>  		goto failed_mount3;
> @@ -2491,6 +2495,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	percpu_counter_destroy(&sbi->s_freeblocks_counter);
>  	percpu_counter_destroy(&sbi->s_freeinodes_counter);
>  	percpu_counter_destroy(&sbi->s_dirs_counter);
> +	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
>  failed_mount2:
>  	for (i = 0; i < db_count; i++)
>  		brelse(sbi->s_group_desc[i]);
> @@ -3164,7 +3169,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
>  	buf->f_type = EXT4_SUPER_MAGIC;
>  	buf->f_bsize = sb->s_blocksize;
>  	buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
> -	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
> +	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
> +		       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);

In case the counter turns out negative, I think it would be better to add a
slow path here that uses the accurate (summed) version of
freeblocks - dirtyblocks.


>  	ext4_free_blocks_count_set(es, buf->f_bfree);
>  	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
>  	if (buf->f_bfree < ext4_r_blocks_count(es))

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ