[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <1202324887.4112.11.camel@localhost.localdomain>
Date: Wed, 06 Feb 2008 11:08:07 -0800
From: Mingming Cao <cmm@...ibm.com>
To: Jan Kara <jack@...e.cz>
Cc: linux-ext4@...r.kernel.org
Subject: Re: [PATCH] ext4: Fix DIO locking
On Wed, 2008-02-06 at 17:06 +0100, Jan Kara wrote:
> Hi,
>
> this patch fixes lock inversion in ext4 direct IO path. The similar patch
> has already been accepted for ext3. Mingming, will you put it into ext4
> patch queue? Thanks.
>
Thanks! Added to the ext4 patch queue.
Mingming
> Honza
> --
> Jan Kara <jack@...e.cz>
> SUSE Labs, CR
> ---
>
> We cannot start transaction in ext4_direct_IO() and just let it last during the
> whole write because dio_get_page() acquires mmap_sem which ranks above
> transaction start (e.g. because we have dependency chain
> mmap_sem->PageLock->journal_start, or because we update atime while holding
> mmap_sem) and thus deadlocks could happen. We solve the problem by starting
> a transaction separately for each ext4_get_block() call.
>
> We *could* have a problem that we allocate a block and before its data are
> written out the machine crashes and thus we expose stale data. But that
> does not happen because for hole-filling generic code falls back to buffered
> writes and for file extension, we add inode to orphan list and thus in
> case of crash, journal replay will truncate inode back to the original size.
>
> Signed-off-by: Jan Kara <jack@...e.cz>
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index bb717cb..1948b97 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -895,7 +895,16 @@ out:
> return err;
> }
>
> -#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
> +/* Maximum number of blocks we map for direct IO at once. */
> +#define DIO_MAX_BLOCKS 4096
> +/*
> + * Number of credits we need for writing DIO_MAX_BLOCKS:
> + * We need sb + group descriptor + bitmap + inode -> 4
> + * For B blocks with A block pointers per block we need:
> + * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
> + * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
> + */
> +#define DIO_CREDITS 25
>
> int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
> unsigned long max_blocks, struct buffer_head *bh,
> @@ -942,49 +951,31 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
> struct buffer_head *bh_result, int create)
> {
> handle_t *handle = ext4_journal_current_handle();
> - int ret = 0;
> + int ret = 0, started = 0;
> unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
>
> - if (!create)
> - goto get_block; /* A read */
> -
> - if (max_blocks == 1)
> - goto get_block; /* A single block get */
> -
> - if (handle->h_transaction->t_state == T_LOCKED) {
> - /*
> - * Huge direct-io writes can hold off commits for long
> - * periods of time. Let this commit run.
> - */
> - ext4_journal_stop(handle);
> - handle = ext4_journal_start(inode, DIO_CREDITS);
> - if (IS_ERR(handle))
> +
> + if (create && !handle) { /* Direct IO write... */
> + if (max_blocks > DIO_MAX_BLOCKS)
> + max_blocks = DIO_MAX_BLOCKS;
> + handle = ext4_journal_start(inode, DIO_CREDITS +
> + 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
> + if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> - goto get_block;
> - }
> -
> - if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
> - /*
> - * Getting low on buffer credits...
> - */
> - ret = ext4_journal_extend(handle, DIO_CREDITS);
> - if (ret > 0) {
> - /*
> - * Couldn't extend the transaction. Start a new one.
> - */
> - ret = ext4_journal_restart(handle, DIO_CREDITS);
> + goto out;
> }
> + started = 1;
> }
>
> -get_block:
> - if (ret == 0) {
> - ret = ext4_get_blocks_wrap(handle, inode, iblock,
> + ret = ext4_get_blocks_wrap(handle, inode, iblock,
> max_blocks, bh_result, create, 0);
> - if (ret > 0) {
> - bh_result->b_size = (ret << inode->i_blkbits);
> - ret = 0;
> - }
> + if (ret > 0) {
> + bh_result->b_size = (ret << inode->i_blkbits);
> + ret = 0;
> }
> + if (started)
> + ext4_journal_stop(handle);
> +out:
> return ret;
> }
>
> @@ -1674,7 +1665,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
> * if the machine crashes during the write.
> *
> * If the O_DIRECT write is intantiating holes inside i_size and the machine
> - * crashes then stale disk data _may_ be exposed inside the file.
> + * crashes then stale disk data _may_ be exposed inside the file. But current
> + * VFS code falls back into buffered path in that case so we are safe.
> */
> static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> const struct iovec *iov, loff_t offset,
> @@ -1683,7 +1675,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> struct file *file = iocb->ki_filp;
> struct inode *inode = file->f_mapping->host;
> struct ext4_inode_info *ei = EXT4_I(inode);
> - handle_t *handle = NULL;
> + handle_t *handle;
> ssize_t ret;
> int orphan = 0;
> size_t count = iov_length(iov, nr_segs);
> @@ -1691,17 +1683,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> if (rw == WRITE) {
> loff_t final_size = offset + count;
>
> - handle = ext4_journal_start(inode, DIO_CREDITS);
> - if (IS_ERR(handle)) {
> - ret = PTR_ERR(handle);
> - goto out;
> - }
> if (final_size > inode->i_size) {
> + /* Credits for sb + inode write */
> + handle = ext4_journal_start(inode, 2);
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + goto out;
> + }
> ret = ext4_orphan_add(handle, inode);
> - if (ret)
> - goto out_stop;
> + if (ret) {
> + ext4_journal_stop(handle);
> + goto out;
> + }
> orphan = 1;
> ei->i_disksize = inode->i_size;
> + ext4_journal_stop(handle);
> }
> }
>
> @@ -1709,18 +1705,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> offset, nr_segs,
> ext4_get_block, NULL);
>
> - /*
> - * Reacquire the handle: ext4_get_block() can restart the transaction
> - */
> - handle = ext4_journal_current_handle();
> -
> -out_stop:
> - if (handle) {
> + if (orphan) {
> int err;
>
> - if (orphan && inode->i_nlink)
> + /* Credits for sb + inode write */
> + handle = ext4_journal_start(inode, 2);
> + if (IS_ERR(handle)) {
> + /* This is really bad luck. We've written the data
> + * but cannot extend i_size. Bail out and pretend
> + * the write failed... */
> + ret = PTR_ERR(handle);
> + goto out;
> + }
> + if (inode->i_nlink)
> ext4_orphan_del(handle, inode);
> - if (orphan && ret > 0) {
> + if (ret > 0) {
> loff_t end = offset + ret;
> if (end > inode->i_size) {
> ei->i_disksize = end;
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
-
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists