[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAEH94LjXRG755cTsuGN_R0V+J9aRvEJaS+0aQSAOkNWL1UbXTA@mail.gmail.com>
Date: Wed, 31 Jul 2013 12:07:32 +0800
From: Zhi Yong Wu <zwu.kernel@...il.com>
To: Dave Chinner <david@...morbit.com>
Cc: xfstests <xfs@....sgi.com>,
"linux-fsdevel@...r.kernel.org" <linux-fsdevel@...r.kernel.org>,
linux-kernel mlist <linux-kernel@...r.kernel.org>,
Zhi Yong Wu <wuzhy@...ux.vnet.ibm.com>
Subject: Re: [PATCH v2] xfs: introduce object readahead to log recovery
On Wed, Jul 31, 2013 at 7:11 AM, Dave Chinner <david@...morbit.com> wrote:
> On Tue, Jul 30, 2013 at 05:59:07PM +0800, zwu.kernel@...il.com wrote:
>> From: Zhi Yong Wu <wuzhy@...ux.vnet.ibm.com>
>>
>> It can take a long time to run log recovery operation because it is
>> single threaded and is bound by read latency. We can find that it took
>> most of the time to wait for the read IO to occur, so if one object
>> readahead is introduced to log recovery, it will obviously reduce the
>> log recovery time.
>>
>> Log recovery time stat:
>>
>> w/o this patch w/ this patch
>>
>> real: 0m15.023s 0m7.802s
>> user: 0m0.001s 0m0.001s
>> sys: 0m0.246s 0m0.107s
>>
>> Signed-off-by: Zhi Yong Wu <wuzhy@...ux.vnet.ibm.com>
>> ---
>> fs/xfs/xfs_log_recover.c | 162 +++++++++++++++++++++++++++++++++++++++++++++--
>> fs/xfs/xfs_log_recover.h | 2 +
>> 2 files changed, 159 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
>> index 7681b19..029826f 100644
>> --- a/fs/xfs/xfs_log_recover.c
>> +++ b/fs/xfs/xfs_log_recover.c
>> @@ -3116,6 +3116,111 @@ xlog_recover_free_trans(
>> kmem_free(trans);
>> }
>>
>> +STATIC void
>> +xlog_recover_buffer_ra_pass2(
>> + struct xlog *log,
>> + struct xlog_recover_item *item)
>> +{
>> + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
>> + xfs_mount_t *mp = log->l_mp;
>
> struct xfs_buf_log_format
> struct xfs_mount
Why? *_t is also used in a lot of other places.
>
>> +
>> + if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
>> + buf_f->blf_len, buf_f->blf_flags)) {
>> + return;
>> + }
>> +
>> + xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
>> + buf_f->blf_len, NULL);
>> +}
>> +
>> +STATIC void
>> +xlog_recover_inode_ra_pass2(
>> + struct xlog *log,
>> + struct xlog_recover_item *item)
>> +{
>> + xfs_inode_log_format_t in_buf, *in_f;
>> + xfs_mount_t *mp = log->l_mp;
>
> struct xfs_inode_log_format
> struct xfs_mount
>
> and a separate declaration for each variable.
>
> Also, in_buf and in_f are not very good names as tehy don't follow
> any commonly used XFs naming convention. The shorthand for a
> struct xfs_inode_log_format variable is "ilf" and a pointer to such
> an object is "ilfp". i.e:
>
> struct xfs_inode_log_format ilf_buf;
> struct xfs_inode_log_format *ilfp;
>
>> +xlog_recover_dquot_ra_pass2(
>> + struct xlog *log,
>> + struct xlog_recover_item *item)
>> +{
>> + xfs_mount_t *mp = log->l_mp;
>> + struct xfs_disk_dquot *recddq;
>> + int error;
>> + xfs_dq_logformat_t *dq_f;
>> + uint type;
>
> More typedefs.
>
>> +
>> +
>> + if (mp->m_qflags == 0)
>> + return;
>> +
>> + recddq = item->ri_buf[1].i_addr;
>> + if (recddq == NULL)
>> + return;
>> + if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t))
>> + return;
>> +
>> + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
>> + ASSERT(type);
>> + if (log->l_quotaoffs_flag & type)
>> + return;
>> +
>> + dq_f = item->ri_buf[0].i_addr;
>> + ASSERT(dq_f);
>> + error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
>> + "xlog_recover_dquot_ra_pass2 (log copy)");
>
> You don't need to do validation of the dquot in the transaction
> here - it's unlikely to be corrupt. Just do the readahead like for a
> normal buffer, and the validation can occur when recovering the
> item in the next pass.
Agreed, done.
>
>> + if (error)
>> + return;
>> + ASSERT(dq_f->qlf_len == 1);
>> +
>> + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
>> + dq_f->qlf_len, NULL);
>> +}
>> +
>> +STATIC void
>> +xlog_recover_ra_pass2(
>> + struct xlog *log,
>> + struct xlog_recover_item *item)
>> +{
>> + switch (ITEM_TYPE(item)) {
>> + case XFS_LI_BUF:
>> + xlog_recover_buffer_ra_pass2(log, item);
>> + break;
>> + case XFS_LI_INODE:
>> + xlog_recover_inode_ra_pass2(log, item);
>> + break;
>> + case XFS_LI_DQUOT:
>> + xlog_recover_dquot_ra_pass2(log, item);
>> + break;
>> + case XFS_LI_EFI:
>> + case XFS_LI_EFD:
>> + case XFS_LI_QUOTAOFF:
>> + default:
>> + break;
>> + }
>> +}
>> +
>> STATIC int
>> xlog_recover_commit_pass1(
>> struct xlog *log,
>> @@ -3177,6 +3282,26 @@ xlog_recover_commit_pass2(
>> }
>> }
>>
>> +STATIC int
>> +xlog_recover_items_pass2(
>> + struct xlog *log,
>> + struct xlog_recover *trans,
>> + struct list_head *buffer_list,
>> + struct list_head *ra_list)
>> +{
>> + int error = 0;
>> + xlog_recover_item_t *item;
>
> typedef.
>
>> +
>> + list_for_each_entry(item, ra_list, ri_list) {
>> + error = xlog_recover_commit_pass2(log, trans,
>> + buffer_list, item);
>> + if (error)
>> + return error;
>> + }
>> +
>> + return error;
>> +}
>> +
>> /*
>> * Perform the transaction.
>> *
>> @@ -3189,9 +3314,11 @@ xlog_recover_commit_trans(
>> struct xlog_recover *trans,
>> int pass)
>> {
>> - int error = 0, error2;
>> - xlog_recover_item_t *item;
>> + int error = 0, error2, ra_qdepth = 0;
>> + xlog_recover_item_t *item, *next;
>
> typedef, one variable per line.
>
>> LIST_HEAD (buffer_list);
>> + LIST_HEAD (ra_list);
>> + LIST_HEAD (all_ra_list);
>
> Isn't the second the "recovered" list?
>
> i.e. objects are moved to the ra_list when readhead is issued,
> then when they are committed they are moved to the "recovered"
> list?
>
>> hlist_del(&trans->r_list);
>>
>> @@ -3199,14 +3326,21 @@ xlog_recover_commit_trans(
>> if (error)
>> return error;
>>
>> - list_for_each_entry(item, &trans->r_itemq, ri_list) {
>> + list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
>> switch (pass) {
>> case XLOG_RECOVER_PASS1:
>> error = xlog_recover_commit_pass1(log, trans, item);
>> break;
>> case XLOG_RECOVER_PASS2:
>> - error = xlog_recover_commit_pass2(log, trans,
>> - &buffer_list, item);
>> + if (ra_qdepth++ >= XLOG_RECOVER_MAX_QDEPTH) {
>
> I'd define XLOG_RECOVER_MAX_QDEPTH inside this function with all the
> local variables. It has not scope outside this function.
>
> Also, "items_queued" is a better name than ra_qdepth - we are
> tracking how many items we've queued for recovery, not the number of
> readahead IOs we have in progress. Similar for
> XLOG_RECOVER_MAX_QDEPTH - XLOG_RECOVER_COMMIT_QUEUE_MAX might be
> better.
Applied all above.
>
>
>> + error = xlog_recover_items_pass2(log, trans,
>> + &buffer_list, &ra_list);
>> + list_splice_tail_init(&ra_list, &all_ra_list);
>> + ra_qdepth = 0;
>> + } else {
>> + xlog_recover_ra_pass2(log, item);
>> + list_move_tail(&item->ri_list, &ra_list);
>> + }
>> break;
>> default:
>> ASSERT(0);
>> @@ -3216,9 +3350,27 @@ xlog_recover_commit_trans(
>> goto out;
>> }
>>
>> + if (!list_empty(&ra_list)) {
>> + error = xlog_recover_items_pass2(log, trans,
>> + &buffer_list, &ra_list);
>> + if (error)
>> + goto out;
>> +
>> + list_splice_tail_init(&ra_list, &all_ra_list);
>> + }
>> +
>> + if (!list_empty(&all_ra_list))
>> + list_splice_init(&all_ra_list, &trans->r_itemq);
>> +
>> xlog_recover_free_trans(trans);
>>
>> out:
>> + if (!list_empty(&ra_list))
>> + list_splice_tail_init(&ra_list, &all_ra_list);
>> +
>> + if (!list_empty(&all_ra_list))
>> + list_splice_init(&all_ra_list, &trans->r_itemq);
>
> The error handling here is all wrong. xlog_recover_free_trans()
> frees everything on the trans->r_itemq, and then frees trans, so
> this is both leaky and a use after free. This is all you need to do
For normal path, the above two list_splice_* are not executed at all.
> here:
>
> @@ -3216,6 +3359,13 @@ xlog_recover_commit_trans(
> if (error)
> goto out;
> }
> + if (!list_empty(&ra_list)) {
> + error = recover_commit(log, trans, &buffer_list, &ra_list);
> + list_splice_init(&ra_list, &done_list);
> + }
> +
> + if (!list_empty(&done_list))
> + list_splice(&done_list, &trans->r_itemq);
>
> xlog_recover_free_trans(trans);
>
>
> i.e. run the recovery of the remainder of the ra_list, splice it
> to the done list, the splice the done list back to the trans and
> then free the trans. The error path falls through naturally and
> without needing any special handling....
For error path, do you not need to splice ra_list and done_list to the trans?
I thought that this transaction may be continued to recovery log later.
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@...morbit.com
--
Regards,
Zhi Yong Wu
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists