lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20251221-nvme_ns_validation-v1-2-9f7a385707af@gmail.com>
Date: Sun, 21 Dec 2025 13:26:11 -0800
From: Alex Tran <alex.t.tran@...il.com>
To: Keith Busch <kbusch@...nel.org>, Jens Axboe <axboe@...nel.dk>, 
 Christoph Hellwig <hch@....de>, Sagi Grimberg <sagi@...mberg.me>
Cc: linux-nvme@...ts.infradead.org, linux-kernel@...r.kernel.org, 
 Alex Tran <alex.t.tran@...il.com>
Subject: [PATCH 2/2] nvme/host: add delayed retries upon non-fatal error
 during ns validation

If a non-fatal error is received during nvme namespace validation, it
should not be ignored and the namespace should not be removed immediately.
Rather, delayed retries should be performed on the namespace validation
process.

This handles non-fatal issues more robustly, by retrying a few times before
giving up and removing the namespace. The number of retries is set
to 3 and the interval between retries is set to 3 seconds.

Signed-off-by: Alex Tran <alex.t.tran@...il.com>
---
 drivers/nvme/host/core.c | 43 +++++++++++++++++++++++++++++++++++++++----
 drivers/nvme/host/nvme.h |  9 +++++++++
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fab321e79b7cdbb89d96d950c1cc8c1128906770..2e208d894b27f85f7f6358eb697be262ce45aed6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -139,6 +139,7 @@ static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
 				   struct nvme_command *cmd);
 static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
 		u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
+static void nvme_validate_ns_work(struct work_struct *work);
 
 void nvme_queue_scan(struct nvme_ctrl *ctrl)
 {
@@ -4118,6 +4119,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 	ns->ctrl = ctrl;
 	kref_init(&ns->kref);
 
+	INIT_DELAYED_WORK(&ns->validate_work, nvme_validate_ns_work);
+
 	if (nvme_init_ns_head(ns, info))
 		goto out_cleanup_disk;
 
@@ -4215,6 +4218,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 {
 	bool last_path = false;
 
+	cancel_delayed_work_sync(&ns->validate_work);
+
 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
 		return;
 
@@ -4285,12 +4290,42 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
 out:
 	/*
 	 * Only remove the namespace if we got a fatal error back from the
-	 * device, otherwise ignore the error and just move on.
-	 *
-	 * TODO: we should probably schedule a delayed retry here.
+	 * device, otherwise delayed retries are performed.
 	 */
-	if (ret > 0 && (ret & NVME_STATUS_DNR))
+	if (ret > 0 && (ret & NVME_STATUS_DNR)) {
 		nvme_ns_remove(ns);
+	} else if (ret > 0) {
+		if (ns->validate_retries < NVME_NS_VALIDATION_MAX_RETRIES) {
+			ns->validate_retries++;
+
+			if (!nvme_get_ns(ns))
+				return;
+
+			dev_warn(
+				ns->ctrl->device,
+				"validation failed for nsid %d, retry %d/%d in %ds\n",
+				ns->head->ns_id, ns->validate_retries,
+				NVME_NS_VALIDATION_MAX_RETRIES,
+				NVME_NS_VALIDATION_RETRY_INTERVAL);
+			memcpy(&ns->pending_info, info, sizeof(*info));
+			schedule_delayed_work(
+				&ns->validate_work,
+				NVME_NS_VALIDATION_RETRY_INTERVAL * HZ);
+		} else {
+			dev_err(ns->ctrl->device,
+				"validation failed for nsid %d after %d retries\n",
+				ns->head->ns_id,
+				NVME_NS_VALIDATION_MAX_RETRIES);
+		}
+	}
+}
+
+static void nvme_validate_ns_work(struct work_struct *work)
+{
+	struct nvme_ns *ns = container_of(to_delayed_work(work), struct nvme_ns,
+					  validate_work);
+	nvme_validate_ns(ns, &ns->pending_info);
+	nvme_put_ns(ns);
 }
 
 static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ff4e7213131298a1a019eaa3822ca26f857b2443..17a4123e5e4da9828ef5662acca54e6aa9fd3cb9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -46,6 +46,12 @@ extern unsigned int admin_timeout;
 #define NVME_CTRL_PAGE_SHIFT	12
 #define NVME_CTRL_PAGE_SIZE	(1 << NVME_CTRL_PAGE_SHIFT)
 
+/*
+ * Default to 3 retries in intervals of 3 seconds for namespace validation
+ */
+#define NVME_NS_VALIDATION_MAX_RETRIES 3
+#define NVME_NS_VALIDATION_RETRY_INTERVAL 3
+
 extern struct workqueue_struct *nvme_wq;
 extern struct workqueue_struct *nvme_reset_wq;
 extern struct workqueue_struct *nvme_delete_wq;
@@ -565,6 +571,9 @@ struct nvme_ns {
 	struct device		cdev_device;
 
 	struct nvme_fault_inject fault_inject;
+	struct delayed_work validate_work;
+	struct nvme_ns_info pending_info;
+	unsigned int validate_retries;
 };
 
 /* NVMe ns supports metadata actions by the controller (generate/strip) */

-- 
2.51.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ