lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <1218752165.6362.18.camel@mingming-laptop>
Date:	Thu, 14 Aug 2008 15:16:05 -0700
From:	Mingming Cao <cmm@...ibm.com>
To:	"Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>
Cc:	tytso@....edu, sandeen@...hat.com, linux-ext4@...r.kernel.org
Subject: Re: [PATCH] ext4: Fix small file fragmentation


在 2008-08-14四的 23:14 +0530,Aneesh Kumar K.V写道:
> mballoc small file block allocation use per cpu prealloc
> space. Use goal block when searching for the right prealloc
> space. Also make sure ext4_da_writepages tries to write
> all the pages for small files in single attempt
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@...ux.vnet.ibm.com>
> ---
>  fs/ext4/inode.c   |   21 +++++++++++++++------
>  fs/ext4/mballoc.c |   44 +++++++++++++++++++++++++++++++++++++-------
>  fs/inode.c        |    1 +
>  3 files changed, 53 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index e144896..0b34998 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2318,13 +2318,12 @@ static int ext4_writepages_trans_blocks(struct inode *inode)
>  static int ext4_da_writepages(struct address_space *mapping,
>                                  struct writeback_control *wbc)
>  {
> -	struct inode *inode = mapping->host;
>  	handle_t *handle = NULL;
> -	int needed_blocks;
> -	int ret = 0;
> -	long to_write;
>  	loff_t range_start = 0;
> -	long pages_skipped = 0;
> +	struct inode *inode = mapping->host;
> +	int needed_blocks, ret = 0, nr_to_writebump = 0;
> +	long to_write, pages_skipped = 0;
> +	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
> 
>  	/*
>  	 * No pages to write? This is mainly a kludge to avoid starting
> @@ -2333,6 +2332,16 @@ static int ext4_da_writepages(struct address_space *mapping,
>  	 */
>  	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
>  		return 0;
> +	/*
> +	 * Make sure nr_to_write is >= sbi->s_mb_stream_request
> +	 * This make sure small files blocks are allocated in
> +	 * single attempt. This ensure that small files
> +	 * get less fragmented.
> +	 */
> +	if (wbc->nr_to_write < sbi->s_mb_stream_request) {
> +		nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
> +		wbc->nr_to_write = sbi->s_mb_stream_request;
> +	}
> 

do_writepages() could be called with a wbc that specifies a range. Is it
okay to force da writepages to flush at least 16 pages
(sbi->s_mb_stream_request) every time, which may be more than what
the caller asked for?

I assume you are trying to address the fragmentation issue with small
requests to da_writepages() discussed in the previous email? (A little more
description would be helpful here :))

>  	if (!wbc->range_cyclic)
>  		/*
> @@ -2413,7 +2422,7 @@ static int ext4_da_writepages(struct address_space *mapping,
>  	}
> 
>  out_writepages:
> -	wbc->nr_to_write = to_write;
> +	wbc->nr_to_write = to_write - nr_to_writebump;
>  	wbc->range_start = range_start;
>  	return ret;
>  }
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index b14a7c7..1afcb11 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -3286,6 +3286,29 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
>  	mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
>  }
> 
> +static struct ext4_prealloc_space *
> +ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
> +			struct ext4_prealloc_space *pa,
> +			struct ext4_prealloc_space *cpa)
> +{
> +	ext4_fsblk_t cur_distance, new_distance;
> +
> +	if (cpa == NULL) {
> +		atomic_inc(&pa->pa_count);
> +		return pa;
> +	}
> +	cur_distance = abs(goal_block - cpa->pa_pstart);
> +	new_distance = abs(goal_block - pa->pa_pstart);
> +
> +	if (cur_distance < new_distance)
> +		return cpa;
> +
> +	/* drop the previous reference */
> +	atomic_dec(&cpa->pa_count);
> +	atomic_inc(&pa->pa_count);
> +	return pa;
> +}
> +
>  /*
>   * search goal blocks in preallocated space
>   */
> @@ -3295,7 +3318,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
>  	int order, i;
>  	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
>  	struct ext4_locality_group *lg;
> -	struct ext4_prealloc_space *pa;
> +	struct ext4_prealloc_space *pa, *cpa = NULL;
> +	ext4_fsblk_t goal_block;
> 
>  	/* only data can be preallocated */
>  	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
> @@ -3338,6 +3362,10 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
>  		/* The max size of hash table is PREALLOC_TB_SIZE */
>  		order = PREALLOC_TB_SIZE - 1;
> 
> +	goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
> +			ac->ac_g_ex.fe_start +
> +			le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
> +
>  	for (i = order; i < PREALLOC_TB_SIZE; i++) {
>  		rcu_read_lock();
>  		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
> @@ -3345,17 +3373,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
>  			spin_lock(&pa->pa_lock);
>  			if (pa->pa_deleted == 0 &&
>  					pa->pa_free >= ac->ac_o_ex.fe_len) {
> -				atomic_inc(&pa->pa_count);
> -				ext4_mb_use_group_pa(ac, pa);
> -				spin_unlock(&pa->pa_lock);
> -				ac->ac_criteria = 20;
> -				rcu_read_unlock();
> -				return 1;
> +
> +				cpa = ext4_mb_check_group_pa(goal_block,
> +								pa, cpa);
>  			}

cpa is initialized to NULL, and I could not see where cpa is set to any
pointer before calling ext4_mb_check_group_pa(). If I understand
correctly, the code above passes a NULL pointer to
ext4_mb_check_group_pa(), which will result in it just choosing the pa
pointer directly and bypassing the distance calculation guided by the
goal block. Did I miss anything?

>  			spin_unlock(&pa->pa_lock);
>  		}
>  		rcu_read_unlock();
>  	}
> +	if (cpa) {
> +		ext4_mb_use_group_pa(ac, cpa);
> +		ac->ac_criteria = 20;
> +		return 1;
> +	}
>  	return 0;
>  }
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index b6726f6..d77f0ee 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -163,6 +163,7 @@ static struct inode *alloc_inode(struct super_block *sb)
>  		mapping->a_ops = &empty_aops;
>   		mapping->host = inode;
>  		mapping->flags = 0;
> +		mapping->writeback_index = 0;
>  		mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
>  		mapping->assoc_mapping = NULL;
>  		mapping->backing_dev_info = &default_backing_dev_info;

Could you explain what this change is for?

Regards,
Mingming

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ