[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <52F30B5E.8020202@hitachi.com>
Date: Thu, 06 Feb 2014 13:11:10 +0900
From: Eiichi Tsukata <eiichi.tsukata.xh@...achi.com>
To: James Bottomley <jbottomley@...allels.com>
Cc: "linux-scsi@...r.kernel.org" <linux-scsi@...r.kernel.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
yrl.pp-manager.tt@...achi.com
Subject: Re: [REVIEW PATCH] scsi: Add 'retry_timeout' to avoid infinite command
retry
(2014/02/06 1:55), James Bottomley wrote:
> On Wed, 2014-02-05 at 14:47 +0900, Eiichi Tsukata wrote:
>> Currently, scsi error handling in scsi_decide_disposition() tries to
>> unconditionally requeue scsi command when device keeps some error state.
>> This is because retryable errors are thought to be temporary and the scsi
>> device will soon recover from those errors. Normally, such retry policy is
>> appropriate because the device will soon recover from temporary error state.
> This description isn't very descriptive. I presume you're talking about
> the ADD_TO_MLQUEUE return from scsi_decide_disposition()?
>
Thanks for reviewing, James.
I was talking about ADD_TO_MLQUEUE and NEEDS_RETRY.
I'll fix the description.
>> But there is no guarantee that device is able to recover from error state
>> immediately. Some hardware error may prevent device from recovering.
>> Therefore hardware error can results in infinite command retry loop.
> If you're talking about ADD_TO_MLQUEUE, I don't believe this is correct:
> there's a test in scsi_softirq_done() that makes sure the maximum
> lifetime is retries*timeout and fails the command after that.
I see, threre's already timeout in scsi_softirq_done() so my patch is not correct
as you say.
What I really want to do is to prevent command from retrying indefinitely
in scsi_io_completion()
with action == ACTION_RETRY || action == ACTION_DELAYED_RETRY.
In v2 patch, I'll change the location of retry timeout check from scsi_softirq_done()
to scsi_io_completion().
Eiichi
(2014/02/06 1:55), James Bottomley wrote:
> On Wed, 2014-02-05 at 14:47 +0900, Eiichi Tsukata wrote:
>> Currently, scsi error handling in scsi_decide_disposition() tries to
>> unconditionally requeue scsi command when device keeps some error state.
>> This is because retryable errors are thought to be temporary and the scsi
>> device will soon recover from those errors. Normally, such retry policy is
>> appropriate because the device will soon recover from temporary error state.
> This description isn't very descriptive. I presume you're talking about
> the ADD_TO_MLQUEUE return from scsi_decide_disposition()?
>
>> But there is no guarantee that device is able to recover from error state
>> immediately. Some hardware error may prevent device from recovering.
>> Therefore hardware error can results in infinite command retry loop.
> If you're talking about ADD_TO_MLQUEUE, I don't believe this is correct:
> there's a test in scsi_softirq_done() that makes sure the maximum
> lifetime is retries*timeout and fails the command after that.
>
> James
>
>
>> This patchs adds 'retry_timeout' sysfs attribute which limits the retry time
>> of each scsi command. Once scsi command retry time is longer than this timeout,
>> the command is treated as failure. 'retry_timeout' is set to '0' by default
>> which means no timeout set.
>>
>> Signed-off-by: Eiichi Tsukata <eiichi.tsukata.xh@...achi.com>
>> Cc: "James E.J. Bottomley" <JBottomley@...allels.com>
>> Cc: linux-scsi@...r.kernel.org
>> Cc: linux-kernel@...r.kernel.org
>> ---
>> drivers/scsi/scsi_lib.c | 22 +++++++++++++++++++++-
>> drivers/scsi/scsi_scan.c | 1 +
>> drivers/scsi/scsi_sysfs.c | 32 ++++++++++++++++++++++++++++++++
>> include/scsi/scsi.h | 1 +
>> include/scsi/scsi_device.h | 1 +
>> 5 files changed, 56 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>> index 7bd7f0d..a9db69e 100644
>> --- a/drivers/scsi/scsi_lib.c
>> +++ b/drivers/scsi/scsi_lib.c
>> @@ -1492,6 +1492,23 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
>> blk_complete_request(req);
>> }
>>
>> +/*
>> + * Check if scsi command excessed retry timeout
>> + */
>> +static int scsi_retry_timed_out(struct scsi_cmnd *cmd)
>> +{
>> + unsigned int retry_timeout;
>> +
>> + retry_timeout = cmd->device->retry_timeout;
>> + if (retry_timeout &&
>> + time_before(cmd->jiffies_at_alloc + retry_timeout, jiffies)) {
>> + scmd_printk(KERN_INFO, cmd, "retry timeout\n");
>> + return 1;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> static void scsi_softirq_done(struct request *rq)
>> {
>> struct scsi_cmnd *cmd = rq->special;
>> @@ -1512,7 +1529,10 @@ static void scsi_softirq_done(struct request *rq)
>> wait_for/HZ);
>> disposition = SUCCESS;
>> }
>> -
>> +
>> + if (scsi_retry_timed_out(cmd))
>> + disposition = FAILED;
>> +
>> scsi_log_completion(cmd, disposition);
>>
>> switch (disposition) {
>> diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
>> index 307a811..4ab044a 100644
>> --- a/drivers/scsi/scsi_scan.c
>> +++ b/drivers/scsi/scsi_scan.c
>> @@ -925,6 +925,7 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
>> sdev->no_dif = 1;
>>
>> sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
>> + sdev->retry_timeout = SCSI_DEFAULT_RETRY_TIMEOUT;
>>
>> if (*bflags & BLIST_SKIP_VPD_PAGES)
>> sdev->skip_vpd_pages = 1;
>> diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
>> index 8ff62c2..eaa2118 100644
>> --- a/drivers/scsi/scsi_sysfs.c
>> +++ b/drivers/scsi/scsi_sysfs.c
>> @@ -627,6 +627,37 @@ sdev_store_eh_timeout(struct device *dev, struct device_attribute *attr,
>> static DEVICE_ATTR(eh_timeout, S_IRUGO | S_IWUSR, sdev_show_eh_timeout, sdev_store_eh_timeout);
>>
>> static ssize_t
>> +sdev_show_retry_timeout(struct device *dev,
>> + struct device_attribute *attr, char *buf)
>> +{
>> + struct scsi_device *sdev;
>> + sdev = to_scsi_device(dev);
>> + return snprintf(buf, 20, "%u\n", sdev->retry_timeout / HZ);
>> +}
>> +
>> +static ssize_t
>> +sdev_store_retry_timeout(struct device *dev, struct device_attribute *attr,
>> + const char *buf, size_t count)
>> +{
>> + struct scsi_device *sdev;
>> + unsigned int retry_timeout;
>> + int err;
>> +
>> + if (!capable(CAP_SYS_ADMIN))
>> + return -EACCES;
>> +
>> + sdev = to_scsi_device(dev);
>> + err = kstrtouint(buf, 10, &retry_timeout);
>> + if (err)
>> + return err;
>> + sdev->retry_timeout = retry_timeout * HZ;
>> +
>> + return count;
>> +}
>> +static DEVICE_ATTR(retry_timeout, S_IRUGO | S_IWUSR,
>> + sdev_show_retry_timeout, sdev_store_retry_timeout);
>> +
>> +static ssize_t
>> store_rescan_field (struct device *dev, struct device_attribute *attr,
>> const char *buf, size_t count)
>> {
>> @@ -797,6 +828,7 @@ static struct attribute *scsi_sdev_attrs[] = {
>> &dev_attr_state.attr,
>> &dev_attr_timeout.attr,
>> &dev_attr_eh_timeout.attr,
>> + &dev_attr_retry_timeout.attr,
>> &dev_attr_iocounterbits.attr,
>> &dev_attr_iorequest_cnt.attr,
>> &dev_attr_iodone_cnt.attr,
>> diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
>> index 66d42ed..545408d 100644
>> --- a/include/scsi/scsi.h
>> +++ b/include/scsi/scsi.h
>> @@ -16,6 +16,7 @@ struct scsi_cmnd;
>>
>> enum scsi_timeouts {
>> SCSI_DEFAULT_EH_TIMEOUT = 10 * HZ,
>> + SCSI_DEFAULT_RETRY_TIMEOUT = 0, /* disabled by default */
>> };
>>
>> /*
>> diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
>> index d65fbec..04fc5ee 100644
>> --- a/include/scsi/scsi_device.h
>> +++ b/include/scsi/scsi_device.h
>> @@ -121,6 +121,7 @@ struct scsi_device {
>> * pass settings from slave_alloc to scsi
>> * core. */
>> unsigned int eh_timeout; /* Error handling timeout */
>> + unsigned int retry_timeout; /* Command retry timeout */
>> unsigned writeable:1;
>> unsigned removable:1;
>> unsigned changed:1; /* Data invalid due to media change */
>
>
> .
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists