lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20130210084511.GB14075@gmail.com>
Date:	Sun, 10 Feb 2013 16:45:11 +0800
From:	Zheng Liu <gnehzuil.liu@...il.com>
To:	Theodore Ts'o <tytso@....edu>
Cc:	Zheng Liu <wenqing.lz@...bao.com>, Jan kara <jack@...e.cz>,
	linux-ext4@...r.kernel.org
Subject: Re: [PATCH 09/10 v5] ext4: convert unwritten extents from extent
 status tree in end_io

On Fri, Feb 08, 2013 at 04:44:05PM +0800, Zheng Liu wrote:
> From: Zheng Liu <wenqing.lz@...bao.com>
> 
> This commit tries to convert unwritten extents from extent status tree
> in end_io callback functions and ext4_ext_direct_IO.
> 
> Signed-off-by: Zheng Liu <wenqing.lz@...bao.com>
> Cc: "Theodore Ts'o" <tytso@....edu>
> Cc: Jan kara <jack@...e.cz>
> ---
>  fs/ext4/extents.c           |   6 +-
>  fs/ext4/extents_status.c    | 180 ++++++++++++++++++++++++++++++++++++++++----
>  fs/ext4/extents_status.h    |   2 +
>  fs/ext4/inode.c             |   5 ++
>  fs/ext4/page-io.c           |   8 +-
>  include/trace/events/ext4.h |  25 ++++++
>  6 files changed, 208 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 9f21430..a03cabf 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4443,8 +4443,10 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
>  			ret = PTR_ERR(handle);
>  			break;
>  		}
> -		ret = ext4_map_blocks(handle, inode, &map,
> -				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
> +		down_write(&EXT4_I(inode)->i_data_sem);
> +		ret = ext4_ext_map_blocks(handle, inode, &map,
> +					  EXT4_GET_BLOCKS_IO_CONVERT_EXT);
> +		up_write(&EXT4_I(inode)->i_data_sem);
>  		if (ret <= 0) {
>  			WARN_ON(ret <= 0);
>  			ext4_msg(inode->i_sb, KERN_ERR,

Hi Ted,

This chunk is missing in your 'dev' branch of ext4.  If we call
ext4_map_blocks here, unwritten extent in disk will never be converted
because ext4_map_blocks first tries to lookup extent status tree and the
unwriten extent in this tree has been converted in end_io callback
function.  It always looks up a written extent in cache, and return
immdiately.  Please check it again.

Thanks,
                                                - Zheng

> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index bac5286..eab8893 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -248,10 +248,11 @@ ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
>  	struct extent_status *es1 = NULL;
>  	struct rb_node *node;
>  	ext4_lblk_t ret = EXT_MAX_BLOCKS;
> +	unsigned long flags;
>  
>  	trace_ext4_es_find_delayed_extent_enter(inode, es->es_lblk);
>  
> -	read_lock(&EXT4_I(inode)->i_es_lock);
> +	read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
>  	tree = &EXT4_I(inode)->i_es_tree;
>  
>  	/* find extent in cache firstly */
> @@ -291,7 +292,7 @@ out:
>  		}
>  	}
>  
> -	read_unlock(&EXT4_I(inode)->i_es_lock);
> +	read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
>  
>  	ext4_es_lru_add(inode);
>  	trace_ext4_es_find_delayed_extent_exit(inode, es, ret);
> @@ -458,6 +459,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  {
>  	struct extent_status newes;
>  	ext4_lblk_t end = lblk + len - 1;
> +	unsigned long flags;
>  	int err = 0;
>  
>  	es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
> @@ -471,14 +473,14 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  	ext4_es_store_status(&newes, status);
>  	trace_ext4_es_insert_extent(inode, &newes);
>  
> -	write_lock(&EXT4_I(inode)->i_es_lock);
> +	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
>  	err = __es_remove_extent(inode, lblk, end);
>  	if (err != 0)
>  		goto error;
>  	err = __es_insert_extent(inode, &newes);
>  
>  error:
> -	write_unlock(&EXT4_I(inode)->i_es_lock);
> +	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
>  
>  	ext4_es_lru_add(inode);
>  	ext4_es_print_tree(inode);
> @@ -498,13 +500,14 @@ int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es)
>  	struct ext4_es_tree *tree;
>  	struct extent_status *es1 = NULL;
>  	struct rb_node *node;
> +	unsigned long flags;
>  	int found = 0;
>  
>  	trace_ext4_es_lookup_extent_enter(inode, es->es_lblk);
>  	es_debug("lookup extent in block %u\n", es->es_lblk);
>  
>  	tree = &EXT4_I(inode)->i_es_tree;
> -	read_lock(&EXT4_I(inode)->i_es_lock);
> +	read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
>  
>  	/* find extent in cache firstly */
>  	es->es_len = es->es_pblk = 0;
> @@ -539,7 +542,7 @@ out:
>  		es->es_pblk = es1->es_pblk;
>  	}
>  
> -	read_unlock(&EXT4_I(inode)->i_es_lock);
> +	read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
>  
>  	ext4_es_lru_add(inode);
>  	trace_ext4_es_lookup_extent_exit(inode, es, found);
> @@ -649,6 +652,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>  			  ext4_lblk_t len)
>  {
>  	ext4_lblk_t end;
> +	unsigned long flags;
>  	int err = 0;
>  
>  	trace_ext4_es_remove_extent(inode, lblk, len);
> @@ -658,9 +662,9 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>  	end = lblk + len - 1;
>  	BUG_ON(end < lblk);
>  
> -	write_lock(&EXT4_I(inode)->i_es_lock);
> +	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
>  	err = __es_remove_extent(inode, lblk, end);
> -	write_unlock(&EXT4_I(inode)->i_es_lock);
> +	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
>  	ext4_es_print_tree(inode);
>  	return err;
>  }
> @@ -671,6 +675,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
>  					struct ext4_sb_info, s_es_shrinker);
>  	struct ext4_inode_info *ei;
>  	struct list_head *cur, *tmp, scanned;
> +	unsigned long flags;
>  	int nr_to_scan = sc->nr_to_scan;
>  	int ret, nr_shrunk = 0;
>  
> @@ -687,16 +692,16 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
>  
>  		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
>  
> -		read_lock(&ei->i_es_lock);
> +		read_lock_irqsave(&ei->i_es_lock, flags);
>  		if (ei->i_es_lru_nr == 0) {
> -			read_unlock(&ei->i_es_lock);
> +			read_unlock_irqrestore(&ei->i_es_lock, flags);
>  			continue;
>  		}
> -		read_unlock(&ei->i_es_lock);
> +		read_unlock_irqrestore(&ei->i_es_lock, flags);
>  
> -		write_lock(&ei->i_es_lock);
> +		write_lock_irqsave(&ei->i_es_lock, flags);
>  		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
> -		write_unlock(&ei->i_es_lock);
> +		write_unlock_irqrestore(&ei->i_es_lock, flags);
>  
>  		nr_shrunk += ret;
>  		nr_to_scan -= ret;
> @@ -756,14 +761,15 @@ static int ext4_es_reclaim_extents_count(struct super_block *sb)
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
>  	struct ext4_inode_info *ei;
>  	struct list_head *cur;
> +	unsigned long flags;
>  	int nr_cached = 0;
>  
>  	spin_lock(&sbi->s_es_lru_lock);
>  	list_for_each(cur, &sbi->s_es_lru) {
>  		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
> -		read_lock(&ei->i_es_lock);
> +		read_lock_irqsave(&ei->i_es_lock, flags);
>  		nr_cached += ei->i_es_lru_nr;
> -		read_unlock(&ei->i_es_lock);
> +		read_unlock_irqrestore(&ei->i_es_lock, flags);
>  	}
>  	spin_unlock(&sbi->s_es_lru_lock);
>  	trace_ext4_es_reclaim_extents_count(sb, nr_cached);
> @@ -801,3 +807,147 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
>  	tree->cache_es = NULL;
>  	return nr_shrunk;
>  }
> +
> +int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
> +				      size_t size)
> +{
> +	struct ext4_es_tree *tree;
> +	struct rb_node *node;
> +	struct extent_status *es, orig_es, conv_es;
> +	ext4_lblk_t end, len1, len2;
> +	ext4_lblk_t lblk = 0, len = 0;
> +	ext4_fsblk_t block;
> +	unsigned long flags;
> +	unsigned int blkbits;
> +	int err = 0;
> +
> +	trace_ext4_es_convert_unwritten_extents(inode, offset, size);
> +	blkbits = inode->i_blkbits;
> +	lblk = offset >> blkbits;
> +	len = (EXT4_BLOCK_ALIGN(offset + size, blkbits) >> blkbits) - lblk;
> +
> +	end = lblk + len - 1;
> +	BUG_ON(end < lblk);
> +
> +	tree = &EXT4_I(inode)->i_es_tree;
> +
> +	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
> +	es = __es_tree_search(&tree->root, lblk);
> +	if (!es)
> +		goto out;
> +	if (es->es_lblk > end)
> +		goto out;
> +
> +	tree->cache_es = NULL;
> +
> +	orig_es.es_lblk = es->es_lblk;
> +	orig_es.es_len = es->es_len;
> +	orig_es.es_pblk = es->es_pblk;
> +
> +	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
> +	len2 = ext4_es_end(es) > end ?
> +	       ext4_es_end(es) - end : 0;
> +	if (len1 > 0)
> +		es->es_len = len1;
> +	if (len2 > 0) {
> +		if (len1 > 0) {
> +			struct extent_status newes;
> +
> +			newes.es_lblk = end + 1;
> +			newes.es_len = len2;
> +			block = ext4_es_pblock(&orig_es) +
> +				orig_es.es_len - len2;
> +			ext4_es_store_pblock(&newes, block);
> +			ext4_es_store_status(&newes, ext4_es_status(&orig_es));
> +			err = __es_insert_extent(inode, &newes);
> +			if (err) {
> +				es->es_lblk = orig_es.es_lblk;
> +				es->es_len = orig_es.es_len;
> +				es->es_pblk = orig_es.es_pblk;
> +				goto out;
> +			}
> +
> +			conv_es.es_lblk = orig_es.es_lblk + len1;
> +			conv_es.es_len = orig_es.es_len - len1 - len2;
> +			block = ext4_es_pblock(&orig_es) + len1;
> +			ext4_es_store_pblock(&conv_es, block);
> +			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> +			err = __es_insert_extent(inode, &conv_es);
> +			if (err) {
> +				int err2 = __es_remove_extent(inode,
> +							conv_es.es_lblk,
> +							ext4_es_end(&newes));
> +				if (err2)
> +					goto out;
> +				es->es_lblk = orig_es.es_lblk;
> +				es->es_len = orig_es.es_len;
> +				es->es_pblk = orig_es.es_pblk;
> +				goto out;
> +			}
> +		} else {
> +			es->es_lblk = end + 1;
> +			es->es_len = len2;
> +			block = ext4_es_pblock(&orig_es) +
> +				orig_es.es_len - len2;
> +			ext4_es_store_pblock(es, block);
> +
> +			conv_es.es_lblk = orig_es.es_lblk;
> +			conv_es.es_len = orig_es.es_len - len2;
> +			ext4_es_store_pblock(&conv_es,
> +					     ext4_es_pblock(&orig_es));
> +			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> +			err = __es_insert_extent(inode, &conv_es);
> +			if (err) {
> +				es->es_lblk = orig_es.es_lblk;
> +				es->es_len = orig_es.es_len;
> +				es->es_pblk = orig_es.es_pblk;
> +			}
> +		}
> +		goto out;
> +	}
> +
> +	if (len1 > 0) {
> +		node = rb_next(&es->rb_node);
> +		if (node)
> +			es = rb_entry(node, struct extent_status, rb_node);
> +		else
> +			es = NULL;
> +	}
> +
> +	while (es && ext4_es_end(es) <= end) {
> +		node = rb_next(&es->rb_node);
> +		ext4_es_store_status(es, EXTENT_STATUS_WRITTEN);
> +		if (!inode) {
> +			es = NULL;
> +			break;
> +		}
> +		es = rb_entry(node, struct extent_status, rb_node);
> +	}
> +
> +	if (es && es->es_lblk < end + 1) {
> +		ext4_lblk_t orig_len = es->es_len;
> +
> +		/*
> +		 * Here we first set conv_es just because of avoiding copy the
> +		 * value of es to a temporary variable.
> +		 */
> +		len1 = ext4_es_end(es) - end;
> +		conv_es.es_lblk = es->es_lblk;
> +		conv_es.es_len = es->es_len - len1;
> +		ext4_es_store_pblock(&conv_es, ext4_es_pblock(es));
> +		ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
> +
> +		es->es_lblk = end + 1;
> +		es->es_len = len1;
> +		block = ext4_es_pblock(es) + orig_len - len1;
> +		ext4_es_store_pblock(es, block);
> +
> +		err = __es_insert_extent(inode, &conv_es);
> +		if (err)
> +			goto out;
> +	}
> +
> +out:
> +	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
> +	return err;
> +}
> diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
> index 938ad2b..2849d74 100644
> --- a/fs/ext4/extents_status.h
> +++ b/fs/ext4/extents_status.h
> @@ -54,6 +54,8 @@ extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>  extern ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
>  					       struct extent_status *es);
>  extern int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es);
> +extern int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
> +					     size_t size);
>  
>  static inline int ext4_es_is_written(struct extent_status *es)
>  {
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 670779a..08cf720 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3063,6 +3063,7 @@ out:
>  		io_end->result = ret;
>  	}
>  
> +	ext4_es_convert_unwritten_extents(inode, offset, size);
>  	ext4_add_complete_io(io_end);
>  }
>  
> @@ -3088,6 +3089,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
>  	 */
>  	inode = io_end->inode;
>  	ext4_set_io_unwritten_flag(inode, io_end);
> +	ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size);
>  	ext4_add_complete_io(io_end);
>  out:
>  	bh->b_private = NULL;
> @@ -3246,6 +3248,9 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  	} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
>  						EXT4_STATE_DIO_UNWRITTEN)) {
>  		int err;
> +		err = ext4_es_convert_unwritten_extents(inode, offset, ret);
> +		if (err)
> +			ret = err;
>  		/*
>  		 * for non AIO case, since the IO is already
>  		 * completed, we could do the conversion right here
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 0016fbc..66ea30e 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -276,6 +276,13 @@ static void ext4_end_bio(struct bio *bio, int error)
>  		error = 0;
>  	bio_put(bio);
>  
> +	/*
> +	 * We need to convert unwrittne extents in extent status tree before
> +	 * end_page_writeback() is called.  Otherwise, when dioread_nolock is
> +	 * enabled, we will be likely to read stale data.
> +	 */
> +	inode = io_end->inode;
> +	ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size);
>  	for (i = 0; i < io_end->num_io_pages; i++) {
>  		struct page *page = io_end->pages[i]->p_page;
>  		struct buffer_head *bh, *head;
> @@ -305,7 +312,6 @@ static void ext4_end_bio(struct bio *bio, int error)
>  		put_io_page(io_end->pages[i]);
>  	}
>  	io_end->num_io_pages = 0;
> -	inode = io_end->inode;
>  
>  	if (error) {
>  		io_end->flag |= EXT4_IO_END_ERROR;
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index f0734b3..d32e3d5 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -2233,6 +2233,31 @@ TRACE_EVENT(ext4_es_lookup_extent_exit,
>  		  __entry->found ? __entry->status : 0)
>  );
>  
> +TRACE_EVENT(ext4_es_convert_unwritten_extents,
> +	TP_PROTO(struct inode *inode, loff_t offset, loff_t size),
> +
> +	TP_ARGS(inode, offset, size),
> +
> +	TP_STRUCT__entry(
> +		__field(	dev_t,	dev			)
> +		__field(	ino_t,	ino			)
> +		__field(	loff_t,	offset			)
> +		__field(	loff_t, size			)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev	= inode->i_sb->s_dev;
> +		__entry->ino	= inode->i_ino;
> +		__entry->offset	= offset;
> +		__entry->size	= size;
> +	),
> +
> +	TP_printk("dev %d,%d ino %lu convert unwritten extents [%llu/%llu",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  (unsigned long) __entry->ino,
> +		  __entry->offset, __entry->size)
> +);
> +
>  TRACE_EVENT(ext4_es_reclaim_extents_count,
>  	TP_PROTO(struct super_block *sb, int nr_cached),
>  
> -- 
> 1.7.12.rc2.18.g61b472e
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists