lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87mwt54ppc.fsf@openvz.org>
Date:	Thu, 11 Apr 2013 09:10:55 +0400
From:	Dmitry Monakhov <dmonakhov@...nvz.org>
To:	Jan Kara <jack@...e.cz>, Ted Tso <tytso@....edu>
Cc:	linux-ext4@...r.kernel.org
Subject: Re: [PATCH 02/29] ext4: Use io_end for multiple bios

On Mon,  8 Apr 2013 23:32:07 +0200, Jan Kara <jack@...e.cz> wrote:
> Change writeback path to create just one io_end structure for the extent
> to which we submit IO and share it among bios writing that extent. This
> prevents needless splitting and joining of unwritten extents when they
> cannot be submitted as a single bio.
Looks good.
Reviewed-by: Dmitry Monakhov <dmonakhov@...nvz.org>
> 
> Signed-off-by: Jan Kara <jack@...e.cz>
> ---
>  fs/ext4/ext4.h    |    8 +++-
>  fs/ext4/inode.c   |   85 ++++++++++++++++++++-----------------
>  fs/ext4/page-io.c |  121 +++++++++++++++++++++++++++++++++--------------------
>  3 files changed, 128 insertions(+), 86 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 3c70547..edf9b9e 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -207,6 +207,7 @@ typedef struct ext4_io_end {
>  	ssize_t			size;		/* size of the extent */
>  	struct kiocb		*iocb;		/* iocb struct for AIO */
>  	int			result;		/* error value for AIO */
> +	atomic_t		count;		/* reference counter */
>  } ext4_io_end_t;
>  
>  struct ext4_io_submit {
> @@ -2601,11 +2602,14 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
>  
>  /* page-io.c */
>  extern int __init ext4_init_pageio(void);
> -extern void ext4_add_complete_io(ext4_io_end_t *io_end);
>  extern void ext4_exit_pageio(void);
>  extern void ext4_ioend_wait(struct inode *);
> -extern void ext4_free_io_end(ext4_io_end_t *io);
>  extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
> +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
> +extern int ext4_put_io_end(ext4_io_end_t *io_end);
> +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
> +extern void ext4_io_submit_init(struct ext4_io_submit *io,
> +				struct writeback_control *wbc);
>  extern void ext4_end_io_work(struct work_struct *work);
>  extern void ext4_io_submit(struct ext4_io_submit *io);
>  extern int ext4_bio_write_page(struct ext4_io_submit *io,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 6b8ec2a..ba07412 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1409,7 +1409,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
>  	struct ext4_io_submit io_submit;
>  
>  	BUG_ON(mpd->next_page <= mpd->first_page);
> -	memset(&io_submit, 0, sizeof(io_submit));
> +	ext4_io_submit_init(&io_submit, mpd->wbc);
> +	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
> +	if (!io_submit.io_end)
> +		return -ENOMEM;
>  	/*
>  	 * We need to start from the first_page to the next_page - 1
>  	 * to make sure we also write the mapped dirty buffer_heads.
> @@ -1497,6 +1500,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
>  		pagevec_release(&pvec);
>  	}
>  	ext4_io_submit(&io_submit);
> +	/* Drop io_end reference we got from init */
> +	ext4_put_io_end_defer(io_submit.io_end);
>  	return ret;
>  }
>  
> @@ -2116,9 +2121,16 @@ static int ext4_writepage(struct page *page,
>  		 */
>  		return __ext4_journalled_writepage(page, len);
>  
> -	memset(&io_submit, 0, sizeof(io_submit));
> +	ext4_io_submit_init(&io_submit, wbc);
> +	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
> +	if (!io_submit.io_end) {
> +		redirty_page_for_writepage(wbc, page);
> +		return -ENOMEM;
> +	}
>  	ret = ext4_bio_write_page(&io_submit, page, len, wbc);
>  	ext4_io_submit(&io_submit);
> +	/* Drop io_end reference we got from init */
> +	ext4_put_io_end_defer(io_submit.io_end);
>  	return ret;
>  }
>  
> @@ -2957,9 +2969,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
>  	struct inode *inode = file_inode(iocb->ki_filp);
>          ext4_io_end_t *io_end = iocb->private;
>  
> -	/* if not async direct IO or dio with 0 bytes write, just return */
> -	if (!io_end || !size)
> -		goto out;
> +	/* if not async direct IO just return */
> +	if (!io_end) {
> +		inode_dio_done(inode);
> +		if (is_async)
> +			aio_complete(iocb, ret, 0);
> +		return;
> +	}
>  
>  	ext_debug("ext4_end_io_dio(): io_end 0x%p "
>  		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
> @@ -2967,25 +2983,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
>  		  size);
>  
>  	iocb->private = NULL;
> -
> -	/* if not aio dio with unwritten extents, just free io and return */
> -	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
> -		ext4_free_io_end(io_end);
> -out:
> -		inode_dio_done(inode);
> -		if (is_async)
> -			aio_complete(iocb, ret, 0);
> -		return;
> -	}
> -
>  	io_end->offset = offset;
>  	io_end->size = size;
>  	if (is_async) {
>  		io_end->iocb = iocb;
>  		io_end->result = ret;
>  	}
> -
> -	ext4_add_complete_io(io_end);
> +	ext4_put_io_end_defer(io_end);
>  }
>  
>  /*
> @@ -3019,6 +3023,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  	get_block_t *get_block_func = NULL;
>  	int dio_flags = 0;
>  	loff_t final_size = offset + count;
> +	ext4_io_end_t *io_end = NULL;
>  
>  	/* Use the old path for reads and writes beyond i_size. */
>  	if (rw != WRITE || final_size > inode->i_size)
> @@ -3057,13 +3062,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  	iocb->private = NULL;
>  	ext4_inode_aio_set(inode, NULL);
>  	if (!is_sync_kiocb(iocb)) {
> -		ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
> +		io_end = ext4_init_io_end(inode, GFP_NOFS);
>  		if (!io_end) {
>  			ret = -ENOMEM;
>  			goto retake_lock;
>  		}
>  		io_end->flag |= EXT4_IO_END_DIRECT;
> -		iocb->private = io_end;
> +		/*
> +		 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
> +		 */
> +		iocb->private = ext4_get_io_end(io_end);
>  		/*
>  		 * we save the io structure for current async direct
>  		 * IO, so that later ext4_map_blocks() could flag the
> @@ -3087,26 +3095,27 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  				   NULL,
>  				   dio_flags);
>  
> -	if (iocb->private)
> -		ext4_inode_aio_set(inode, NULL);
>  	/*
> -	 * The io_end structure takes a reference to the inode, that
> -	 * structure needs to be destroyed and the reference to the
> -	 * inode need to be dropped, when IO is complete, even with 0
> -	 * byte write, or failed.
> -	 *
> -	 * In the successful AIO DIO case, the io_end structure will
> -	 * be destroyed and the reference to the inode will be dropped
> -	 * after the end_io call back function is called.
> -	 *
> -	 * In the case there is 0 byte write, or error case, since VFS
> -	 * direct IO won't invoke the end_io call back function, we
> -	 * need to free the end_io structure here.
> +	 * Put our reference to io_end. This can free the io_end structure e.g.
> +	 * in sync IO case or in case of error. It can even perform extent
> +	 * conversion if all bios we submitted finished before we got here.
> +	 * Note that in that case iocb->private can be already set to NULL
> +	 * here.
>  	 */
> -	if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
> -		ext4_free_io_end(iocb->private);
> -		iocb->private = NULL;
> -	} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
> +	if (io_end) {
> +		ext4_inode_aio_set(inode, NULL);
> +		ext4_put_io_end(io_end);
> +		/*
> +		 * In case of error or no write ext4_end_io_dio() was not
> +		 * called so we have to put iocb's reference.
> +		 */
> +		if (ret <= 0 && ret != -EIOCBQUEUED) {
> +			WARN_ON(iocb->private != io_end);
> +			ext4_put_io_end(io_end);
> +			iocb->private = NULL;
> +		}
> +	}
> +	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
>  						EXT4_STATE_DIO_UNWRITTEN)) {
>  		int err;
>  		/*
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 73bc011..da8bddf 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -51,17 +51,28 @@ void ext4_ioend_wait(struct inode *inode)
>  	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
>  }
>  
> -void ext4_free_io_end(ext4_io_end_t *io)
> +static void ext4_release_io_end(ext4_io_end_t *io_end)
>  {
> -	int i;
> +	BUG_ON(!list_empty(&io_end->list));
> +	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
> +
> +	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
> +		wake_up_all(ext4_ioend_wq(io_end->inode));
> +	if (io_end->flag & EXT4_IO_END_DIRECT)
> +		inode_dio_done(io_end->inode);
> +	if (io_end->iocb)
> +		aio_complete(io_end->iocb, io_end->result, 0);
> +	kmem_cache_free(io_end_cachep, io_end);
> +}
>  
> -	BUG_ON(!io);
> -	BUG_ON(!list_empty(&io->list));
> -	BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
> +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
> +{
> +	struct inode *inode = io_end->inode;
>  
> -	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
> -		wake_up_all(ext4_ioend_wq(io->inode));
> -	kmem_cache_free(io_end_cachep, io);
> +	io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
> +	/* Wake up anyone waiting on unwritten extent conversion */
> +	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
> +		wake_up_all(ext4_ioend_wq(inode));
>  }
>  
>  /* check a range of space and convert unwritten extents to written. */
> @@ -84,13 +95,8 @@ static int ext4_end_io(ext4_io_end_t *io)
>  			 "(inode %lu, offset %llu, size %zd, error %d)",
>  			 inode->i_ino, offset, size, ret);
>  	}
> -	/* Wake up anyone waiting on unwritten extent conversion */
> -	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
> -		wake_up_all(ext4_ioend_wq(inode));
> -	if (io->flag & EXT4_IO_END_DIRECT)
> -		inode_dio_done(inode);
> -	if (io->iocb)
> -		aio_complete(io->iocb, io->result, 0);
> +	ext4_clear_io_unwritten_flag(io);
> +	ext4_release_io_end(io);
>  	return ret;
>  }
>  
> @@ -121,7 +127,7 @@ static void dump_completed_IO(struct inode *inode)
>  }
>  
>  /* Add the io_end to per-inode completed end_io list. */
> -void ext4_add_complete_io(ext4_io_end_t *io_end)
> +static void ext4_add_complete_io(ext4_io_end_t *io_end)
>  {
>  	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
>  	struct workqueue_struct *wq;
> @@ -158,8 +164,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
>  		err = ext4_end_io(io);
>  		if (unlikely(!ret && err))
>  			ret = err;
> -		io->flag &= ~EXT4_IO_END_UNWRITTEN;
> -		ext4_free_io_end(io);
>  	}
>  	return ret;
>  }
> @@ -191,10 +195,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
>  		atomic_inc(&EXT4_I(inode)->i_ioend_count);
>  		io->inode = inode;
>  		INIT_LIST_HEAD(&io->list);
> +		atomic_set(&io->count, 1);
>  	}
>  	return io;
>  }
>  
> +void ext4_put_io_end_defer(ext4_io_end_t *io_end)
> +{
> +	if (atomic_dec_and_test(&io_end->count)) {
> +		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
> +			ext4_release_io_end(io_end);
> +			return;
> +		}
> +		ext4_add_complete_io(io_end);
> +	}
> +}
> +
> +int ext4_put_io_end(ext4_io_end_t *io_end)
> +{
> +	int err = 0;
> +
> +	if (atomic_dec_and_test(&io_end->count)) {
> +		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
> +			err = ext4_convert_unwritten_extents(io_end->inode,
> +						io_end->offset, io_end->size);
> +			ext4_clear_io_unwritten_flag(io_end);
> +		}
> +		ext4_release_io_end(io_end);
> +	}
> +	return err;
> +}
> +
> +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
> +{
> +	atomic_inc(&io_end->count);
> +	return io_end;
> +}
> +
>  /*
>   * Print an buffer I/O error compatible with the fs/buffer.c.  This
>   * provides compatibility with dmesg scrapers that look for a specific
> @@ -277,12 +314,7 @@ static void ext4_end_bio(struct bio *bio, int error)
>  			     bi_sector >> (inode->i_blkbits - 9));
>  	}
>  
> -	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
> -		ext4_free_io_end(io_end);
> -		return;
> -	}
> -
> -	ext4_add_complete_io(io_end);
> +	ext4_put_io_end_defer(io_end);
>  }
>  
>  void ext4_io_submit(struct ext4_io_submit *io)
> @@ -296,40 +328,37 @@ void ext4_io_submit(struct ext4_io_submit *io)
>  		bio_put(io->io_bio);
>  	}
>  	io->io_bio = NULL;
> -	io->io_op = 0;
> +}
> +
> +void ext4_io_submit_init(struct ext4_io_submit *io,
> +			 struct writeback_control *wbc)
> +{
> +	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
> +	io->io_bio = NULL;
>  	io->io_end = NULL;
>  }
>  
> -static int io_submit_init(struct ext4_io_submit *io,
> -			  struct inode *inode,
> -			  struct writeback_control *wbc,
> -			  struct buffer_head *bh)
> +static int io_submit_init_bio(struct ext4_io_submit *io,
> +			      struct buffer_head *bh)
>  {
> -	ext4_io_end_t *io_end;
> -	struct page *page = bh->b_page;
>  	int nvecs = bio_get_nr_vecs(bh->b_bdev);
>  	struct bio *bio;
>  
> -	io_end = ext4_init_io_end(inode, GFP_NOFS);
> -	if (!io_end)
> -		return -ENOMEM;
>  	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
>  	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
>  	bio->bi_bdev = bh->b_bdev;
> -	bio->bi_private = io->io_end = io_end;
>  	bio->bi_end_io = ext4_end_bio;
> -
> -	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
> -
> +	bio->bi_private = ext4_get_io_end(io->io_end);
> +	if (!io->io_end->size)
> +		io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
> +				     + bh_offset(bh);
>  	io->io_bio = bio;
> -	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
>  	io->io_next_block = bh->b_blocknr;
>  	return 0;
>  }
>  
>  static int io_submit_add_bh(struct ext4_io_submit *io,
>  			    struct inode *inode,
> -			    struct writeback_control *wbc,
>  			    struct buffer_head *bh)
>  {
>  	ext4_io_end_t *io_end;
> @@ -340,18 +369,18 @@ submit_and_retry:
>  		ext4_io_submit(io);
>  	}
>  	if (io->io_bio == NULL) {
> -		ret = io_submit_init(io, inode, wbc, bh);
> +		ret = io_submit_init_bio(io, bh);
>  		if (ret)
>  			return ret;
>  	}
> +	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
> +	if (ret != bh->b_size)
> +		goto submit_and_retry;
>  	io_end = io->io_end;
>  	if (buffer_uninit(bh))
>  		ext4_set_io_unwritten_flag(inode, io_end);
> -	io->io_end->size += bh->b_size;
> +	io_end->size += bh->b_size;
>  	io->io_next_block++;
> -	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
> -	if (ret != bh->b_size)
> -		goto submit_and_retry;
>  	return 0;
>  }
>  
> @@ -423,7 +452,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
>  	do {
>  		if (!buffer_async_write(bh))
>  			continue;
> -		ret = io_submit_add_bh(io, inode, wbc, bh);
> +		ret = io_submit_add_bh(io, inode, bh);
>  		if (ret) {
>  			/*
>  			 * We only get here on ENOMEM.  Not much else
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ