[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <88f1a3af-c6f4-42eb-bf63-504998028973@grimberg.me>
Date: Thu, 25 Dec 2025 15:00:14 +0200
From: Sagi Grimberg <sagi@...mberg.me>
To: Alex Tran <alex.t.tran@...il.com>, Keith Busch <kbusch@...nel.org>,
Jens Axboe <axboe@...nel.dk>, Christoph Hellwig <hch@....de>
Cc: linux-nvme@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/2] nvme/host: add delayed retries upon non-fatal error
during ns validation
On 21/12/2025 23:26, Alex Tran wrote:
> If a non-fatal error is received during nvme namespace validation, it
> should not be ignored and the namespace should not be removed immediately.
> Rather, delayed retries should be performed on the namespace validation
> process.
>
> This handles non-fatal issues more robustly, by retrying a few times before
> giving up and removing the namespace. The number of retries is set
> to 3 and the interval between retries is set to 3 seconds.
>
> Signed-off-by: Alex Tran <alex.t.tran@...il.com>
> ---
> drivers/nvme/host/core.c | 43 +++++++++++++++++++++++++++++++++++++++----
> drivers/nvme/host/nvme.h | 9 +++++++++
> 2 files changed, 48 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index fab321e79b7cdbb89d96d950c1cc8c1128906770..2e208d894b27f85f7f6358eb697be262ce45aed6 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -139,6 +139,7 @@ static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
> struct nvme_command *cmd);
> static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
> u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
> +static void nvme_validate_ns_work(struct work_struct *work);
>
> void nvme_queue_scan(struct nvme_ctrl *ctrl)
> {
> @@ -4118,6 +4119,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
> ns->ctrl = ctrl;
> kref_init(&ns->kref);
>
> + INIT_DELAYED_WORK(&ns->validate_work, nvme_validate_ns_work);
> +
> if (nvme_init_ns_head(ns, info))
> goto out_cleanup_disk;
>
> @@ -4215,6 +4218,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
> {
> bool last_path = false;
>
> + cancel_delayed_work_sync(&ns->validate_work);
> +
> if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
> return;
>
> @@ -4285,12 +4290,42 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
> out:
> /*
> * Only remove the namespace if we got a fatal error back from the
> - * device, otherwise ignore the error and just move on.
> - *
> - * TODO: we should probably schedule a delayed retry here.
> + * device, otherwise delayed retries are performed.
> */
> - if (ret > 0 && (ret & NVME_STATUS_DNR))
> + if (ret > 0 && (ret & NVME_STATUS_DNR)) {
> nvme_ns_remove(ns);
> + } else if (ret > 0) {
> + if (ns->validate_retries < NVME_NS_VALIDATION_MAX_RETRIES) {
> + ns->validate_retries++;
> +
> + if (!nvme_get_ns(ns))
> + return;
> +
> + dev_warn(
> + ns->ctrl->device,
> + "validation failed for nsid %d, retry %d/%d in %ds\n",
> + ns->head->ns_id, ns->validate_retries,
> + NVME_NS_VALIDATION_MAX_RETRIES,
> + NVME_NS_VALIDATION_RETRY_INTERVAL);
> + memcpy(&ns->pending_info, info, sizeof(*info));
> + schedule_delayed_work(
> + &ns->validate_work,
> + NVME_NS_VALIDATION_RETRY_INTERVAL * HZ);
Given that ns scanning is already async, wouldn't it be simpler to just
retry locally in a loop?
Powered by blists - more mailing lists