[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6601abe90910200736i78fa50b0jb4db9fed8800c805@mail.gmail.com>
Date: Tue, 20 Oct 2009 07:36:55 -0700
From: Curt Wohlgemuth <curtw@...gle.com>
To: Jan Kara <jack@...e.cz>
Cc: linux-ext4@...r.kernel.org, tytso@....edu, chris.mason@...cle.com
Subject: Re: [PATCH 4/4] ext4: Wait for proper transaction commit on fsync
On Tue, Oct 20, 2009 at 12:24 AM, Jan Kara <jack@...e.cz> wrote:
> We cannot rely on buffer dirty bits during fsync because pdflush can come
> before fsync is called and clear dirty bits without forcing a transaction
> commit. What we do is that we track which transaction has last changed
> the inode and which transaction last changed allocation and force it to
> disk on fsync.
>
> Signed-off-by: Jan Kara <jack@...e.cz>
> ---
> fs/ext4/ext4.h | 7 +++++++
> fs/ext4/extents.c | 5 +++++
> fs/ext4/fsync.c | 40 +++++++++++++++++-----------------------
> fs/ext4/inode.c | 34 ++++++++++++++++++++++++++++++++++
> 4 files changed, 63 insertions(+), 23 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 984ca0c..5639f30 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -702,6 +702,13 @@ struct ext4_inode_info {
> struct list_head i_aio_dio_complete_list;
> /* current io_end structure for async DIO write*/
> ext4_io_end_t *cur_aio_dio;
> +
> + /*
> + * Transactions that contain inode's metadata needed to complete
> + * fsync and fdatasync, respectively.
> + */
> + atomic_t i_sync_tid;
> + atomic_t i_datasync_tid;
> };
>
> /*
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 10539e3..3e167f6 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -3315,6 +3315,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
> newblock = ext_pblock(&newex);
> allocated = ext4_ext_get_actual_len(&newex);
> set_buffer_new(bh_result);
> +
> + atomic_set(&EXT4_I(inode)->i_sync_tid, handle->h_transaction->t_tid);
> + atomic_set(&EXT4_I(inode)->i_datasync_tid,
> + handle->h_transaction->t_tid);
> + printk("Datasync tid %u\n", handle->h_transaction->t_tid);
Both here and in ext4_ind_get_blocks() below, I think you need to
guard the atomic_set() calls with ext4_handle_valid().
Curt
>
> /* Cache only when it is _not_ an uninitialized extent */
> if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
> diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
> index 2bf9413..03e0fe9 100644
> --- a/fs/ext4/fsync.c
> +++ b/fs/ext4/fsync.c
> @@ -51,25 +51,26 @@
> int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
> {
> struct inode *inode = dentry->d_inode;
> + struct ext4_inode_info *ei = EXT4_I(inode);
> journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
> - int err, ret = 0;
> + int ret = 0;
> + tid_t commit_tid;
>
> J_ASSERT(ext4_journal_current_handle() == NULL);
>
> trace_ext4_sync_file(file, dentry, datasync);
>
> + if (inode->i_sb->s_flags & MS_RDONLY)
> + goto out;
> +
> ret = flush_aio_dio_completed_IO(inode);
> if (ret < 0)
> goto out;
> /*
> - * data=writeback:
> + * data=writeback,ordered:
> * The caller's filemap_fdatawrite()/wait will sync the data.
> - * sync_inode() will sync the metadata
> - *
> - * data=ordered:
> - * The caller's filemap_fdatawrite() will write the data and
> - * sync_inode() will write the inode if it is dirty. Then the caller's
> - * filemap_fdatawait() will wait on the pages.
> + * Metadata is in the journal, we wait for proper transaction to
> + * commit here.
> *
> * data=journal:
> * filemap_fdatawrite won't do anything (the buffers are clean).
> @@ -87,23 +88,16 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
> if (!journal)
> ret = sync_mapping_buffers(inode->i_mapping);
>
> - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
> - goto flush;
> + if (datasync)
> + commit_tid = atomic_read(&ei->i_datasync_tid);
> + else
> + commit_tid = atomic_read(&ei->i_sync_tid);
>
> - /*
> - * The VFS has written the file data. If the inode is unaltered
> - * then we need not start a commit.
> - */
> - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
> - struct writeback_control wbc = {
> - .sync_mode = WB_SYNC_ALL,
> - .nr_to_write = 0, /* sys_fsync did this */
> - };
> - err = sync_inode(inode, &wbc);
> - if (ret == 0)
> - ret = err;
> + if (jbd2_log_start_commit(journal, commit_tid)) {
> + jbd2_log_wait_commit(journal, commit_tid);
> + goto out;
> }
> -flush:
> +
> if (journal && (journal->j_flags & JBD2_BARRIER))
> blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
> out:
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 9105f40..412de4e 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1024,6 +1024,10 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
> goto cleanup;
>
> set_buffer_new(bh_result);
> +
> + atomic_set(&EXT4_I(inode)->i_sync_tid, handle->h_transaction->t_tid);
> + atomic_set(&EXT4_I(inode)->i_datasync_tid,
> + handle->h_transaction->t_tid);
> got_it:
> map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
> if (count > blocks_to_boundary)
> @@ -4777,6 +4781,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> struct ext4_inode_info *ei;
> struct buffer_head *bh;
> struct inode *inode;
> + journal_t *journal = EXT4_SB(sb)->s_journal;
> long ret;
> int block;
>
> @@ -4842,6 +4847,34 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> ei->i_data[block] = raw_inode->i_block[block];
> INIT_LIST_HEAD(&ei->i_orphan);
>
> + /*
> + * Set transaction id's of transactions that have to be committed
> + * to finish f[data]sync. We set them to currently running transaction
> + * as we cannot be sure that the inode or some of its metadata isn't
> + * part of the transaction - the inode could have been reclaimed and
> + * now it is reread from disk.
> + */
> + if (journal) {
> + transaction_t *transaction;
> + tid_t tid;
> +
> + spin_lock(&journal->j_state_lock);
> + if (journal->j_running_transaction)
> + transaction = journal->j_running_transaction;
> + else
> + transaction = journal->j_committing_transaction;
> + if (transaction)
> + tid = transaction->t_tid;
> + else
> + tid = journal->j_commit_sequence;
> + spin_unlock(&journal->j_state_lock);
> + atomic_set(&ei->i_sync_tid, tid);
> + atomic_set(&ei->i_datasync_tid, tid);
> + } else {
> + atomic_set(&ei->i_sync_tid, 0);
> + atomic_set(&ei->i_datasync_tid, 0);
> + }
> +
> if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
> ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
> if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
> @@ -5102,6 +5135,7 @@ static int ext4_do_update_inode(handle_t *handle,
> err = rc;
> ei->i_state &= ~EXT4_STATE_NEW;
>
> + atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
> out_brelse:
> brelse(bh);
> ext4_std_error(inode->i_sb, err);
> --
> 1.6.0.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists