linux-kernel - Re: [PATCH] f2fs: fix long latency due to discard during umount

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20200330065335.GT20234@codeaurora.org>
Date:   Mon, 30 Mar 2020 12:23:35 +0530
From:   Sahitya Tummala <stummala@...eaurora.org>
To:     Chao Yu <yuchao0@...wei.com>
Cc:     Jaegeuk Kim <jaegeuk@...nel.org>,
        linux-f2fs-devel@...ts.sourceforge.net,
        linux-kernel@...r.kernel.org, stummala@...eaurora.org
Subject: Re: [PATCH] f2fs: fix long latency due to discard during umount

Hi Chao,

On Fri, Mar 27, 2020 at 08:35:42AM +0530, Sahitya Tummala wrote:
> On Fri, Mar 27, 2020 at 09:51:43AM +0800, Chao Yu wrote:
> > 
> > With this patch, most of xfstest cases cost 5 * n second longer than before.
> > 
> > E.g. generic/003, during umount(), we looped into retrying one bio
> > submission.
> > 
> > [61279.829724] F2FS-fs (zram1): Found nat_bits in checkpoint
> > [61279.885337] F2FS-fs (zram1): Mounted with checkpoint version = 5cf3cb8e
> > [61281.912832] submit discard bio start [23555,1]
> > [61281.912835] f2fs_submit_discard_endio [23555,1] err:-11
> > [61281.912836] submit discard bio end [23555,1]
> > [61281.912836] move dc to retry list [23555,1]
> > 
> > ...
> > 
> > [61286.881212] submit discard bio start [23555,1]
> > [61286.881217] f2fs_submit_discard_endio [23555,1] err:-11
> > [61286.881223] submit discard bio end [23555,1]
> > [61286.881224] move dc to retry list [23555,1]
> > [61286.905198] submit discard bio start [23555,1]
> > [61286.905203] f2fs_submit_discard_endio [23555,1] err:-11
> > [61286.905205] submit discard bio end [23555,1]
> > [61286.905206] move dc to retry list [23555,1]
> > [61286.929157] F2FS-fs (zram1): Issue discard(23555, 23555, 1) failed, ret: -11
> > 
> > Could you take a look at this issue?
> 
> Let me check and get back on this.

I found the issue. A dc with multiple bios gets requeued again and again
if any one of its bios fails with -EAGAIN. Even the bios that completed
successfully get requeued, which results in long latency.
I have fixed it by splitting the dc in that case, so that only the leftover
bios are requeued into a new dc and retried later within the 5 sec timeout.

Please review the v3 I have posted. If it looks good, could you also test
the earlier regression scenario with it and check the result again?

thanks,

> 
> Thanks,
> 
> > 
> > Thanks,
> > 
> > > 
> > > Thanks,
> > > 
> > >> Thanks,
> > >>
> > >>> +				break;
> > >>> +			}
> > >>> +		}
> > >>>  
> > >>>  		atomic_inc(&dcc->issued_discard);
> > >>>  
> > >>> @@ -1463,6 +1477,40 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi,
> > >>>  	return issued;
> > >>>  }
> > >>>  
> > >>> +static bool __should_discard_retry(struct f2fs_sb_info *sbi,
> > >>> +		struct discard_policy *dpolicy)
> > >>> +{
> > >>> +	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
> > >>> +	struct discard_cmd *dc, *tmp;
> > >>> +	bool retry = false;
> > >>> +	unsigned long flags;
> > >>> +
> > >>> +	if (dpolicy->type != DPOLICY_UMOUNT)
> > >>> +		f2fs_bug_on(sbi, 1);
> > >>> +
> > >>> +	mutex_lock(&dcc->cmd_lock);
> > >>> +	list_for_each_entry_safe(dc, tmp, &(dcc->retry_list), list) {
> > >>> +		if (dpolicy->timeout != 0 &&
> > >>> +			f2fs_time_over(sbi, dpolicy->timeout)) {
> > >>> +			retry = false;
> > >>> +			break;
> > >>> +		}
> > >>> +
> > >>> +		spin_lock_irqsave(&dc->lock, flags);
> > >>> +		if (!dc->bio_ref) {
> > >>> +			dc->state = D_PREP;
> > >>> +			dc->error = 0;
> > >>> +			reinit_completion(&dc->wait);
> > >>> +			__relocate_discard_cmd(dcc, dc);
> > >>> +			retry = true;
> > >>> +		}
> > >>> +		spin_unlock_irqrestore(&dc->lock, flags);
> > >>> +	}
> > >>> +	mutex_unlock(&dcc->cmd_lock);
> > >>> +
> > >>> +	return retry;
> > >>> +}
> > >>> +
> > >>>  static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
> > >>>  					struct discard_policy *dpolicy)
> > >>>  {
> > >>> @@ -1470,12 +1518,13 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
> > >>>  	struct list_head *pend_list;
> > >>>  	struct discard_cmd *dc, *tmp;
> > >>>  	struct blk_plug plug;
> > >>> -	int i, issued = 0;
> > >>> +	int i, err, issued = 0;
> > >>>  	bool io_interrupted = false;
> > >>>  
> > >>>  	if (dpolicy->timeout != 0)
> > >>>  		f2fs_update_time(sbi, dpolicy->timeout);
> > >>>  
> > >>> +retry:
> > >>>  	for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
> > >>>  		if (dpolicy->timeout != 0 &&
> > >>>  				f2fs_time_over(sbi, dpolicy->timeout))
> > >>> @@ -1509,7 +1558,10 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
> > >>>  				break;
> > >>>  			}
> > >>>  
> > >>> -			__submit_discard_cmd(sbi, dpolicy, dc, &issued);
> > >>> +			err = __submit_discard_cmd(sbi, dpolicy, dc, &issued);
> > >>> +			if (err == -EAGAIN)
> > >>> +				congestion_wait(BLK_RW_ASYNC,
> > >>> +						DEFAULT_IO_TIMEOUT);
> > >>>  
> > >>>  			if (issued >= dpolicy->max_requests)
> > >>>  				break;
> > >>> @@ -1522,6 +1574,10 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
> > >>>  			break;
> > >>>  	}
> > >>>  
> > >>> +	if (!list_empty(&dcc->retry_list) &&
> > >>> +		__should_discard_retry(sbi, dpolicy))
> > >>> +		goto retry;
> > >>> +
> > >>>  	if (!issued && io_interrupted)
> > >>>  		issued = -1;
> > >>>  
> > >>> @@ -1613,6 +1669,12 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi,
> > >>>  		goto next;
> > >>>  	}
> > >>>  
> > >>> +	if (dpolicy->type == DPOLICY_UMOUNT &&
> > >>> +		!list_empty(&dcc->retry_list)) {
> > >>> +		wait_list = &dcc->retry_list;
> > >>> +		goto next;
> > >>> +	}
> > >>> +
> > >>>  	return trimmed;
> > >>>  }
> > >>>  
> > >>> @@ -2051,6 +2113,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
> > >>>  	for (i = 0; i < MAX_PLIST_NUM; i++)
> > >>>  		INIT_LIST_HEAD(&dcc->pend_list[i]);
> > >>>  	INIT_LIST_HEAD(&dcc->wait_list);
> > >>> +	INIT_LIST_HEAD(&dcc->retry_list);
> > >>>  	INIT_LIST_HEAD(&dcc->fstrim_list);
> > >>>  	mutex_init(&dcc->cmd_lock);
> > >>>  	atomic_set(&dcc->issued_discard, 0);
> > >>>
> > > 
> 
> -- 
> --
> Sent by a consultant of the Qualcomm Innovation Center, Inc.
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum.

-- 
--
Sent by a consultant of the Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum.