lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1654056747-40143-2-git-send-email-mikelley@microsoft.com>
Date:   Tue, 31 May 2022 21:12:27 -0700
From:   Michael Kelley <mikelley@...rosoft.com>
To:     kbusch@...nel.org, axboe@...com, hch@....de, sagi@...mberg.me,
        linux-nvme@...ts.infradead.org, linux-kernel@...r.kernel.org
Cc:     mikelley@...rosoft.com, caroline.subramoney@...rosoft.com,
        riwurd@...rosoft.com, nathan.obr@...rosoft.com
Subject: [PATCH 2/2] nvme-pci: handle persistent internal error AER from NVMe controller

In the NVM Express Revision 1.4 spec, Figure 145 describes possible
values for an AER with event type "Error" (value 000b). For a
Persistent Internal Error (value 03h), the host should perform a
controller reset.

Add support for this error using code that already exists for
doing a controller reset in response to a request timeout.

This new support was tested in a lab environment where we can
generate the persistent internal error on demand, and observe
both the Linux side and NVMe controller side to see that the
controller reset has been done.

Signed-off-by: Michael Kelley <mikelley@...rosoft.com>
---

 drivers/nvme/host/pci.c | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/nvme.h    |  4 ++++
 2 files changed, 41 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4dd87ac..b2140e9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -131,6 +131,7 @@ struct nvme_dev {
 	void __iomem *bar;
 	unsigned long bar_mapped_size;
 	struct work_struct remove_work;
+	struct work_struct persistent_err_work;
 	struct mutex shutdown_lock;
 	bool subsystem;
 	u64 cmb_size;
@@ -1119,6 +1120,39 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 			 csts, result);
 }
 
+static void nvme_persistent_err_work(struct work_struct *work)
+{
+	struct nvme_dev *dev = container_of(work, struct nvme_dev,
+						persistent_err_work);
+
+	nvme_dev_disable(dev, false);	/* disable first; meaning of 'false' is not visible here - TODO confirm */
+	nvme_reset_ctrl(&dev->ctrl);	/* then request the reset; return value is not checked */
+}
+
+static bool nvme_check_aen_error(struct nvme_dev *dev,
+			__le16 status, volatile union nvme_result *res)
+{
+	u32 result = le32_to_cpu(res->u32);
+	u32 csts;
+
+	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
+		return false;	/* not handled here; caller falls through to nvme_complete_async_event() */
+
+	/* Only handle event type "Error" (bits 2:0) with info Persistent Internal Error (bits 15:8) */
+	if ((result & 0x07) != NVME_AER_ERROR ||
+	    ((result & 0xff00) >> 8) != NVME_AER_ERROR_PERSIST_INT_ERR)
+		return false;
+
+	/* NVMe Spec 1.4 says to reset the controller; do so only if nvme_should_reset() agrees */
+	csts = readl(dev->bar + NVME_REG_CSTS);
+	if (!nvme_should_reset(dev, csts))
+		return false;
+
+	nvme_warn_reset(dev, csts);
+	queue_work(nvme_wq, &dev->persistent_err_work);	/* reset itself runs in nvme_persistent_err_work() */
+	return true;	/* handled: caller skips nvme_complete_async_event() */
+}
+
 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 				   struct io_comp_batch *iob, u16 idx)
 {
@@ -1133,6 +1167,8 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 	 * for them but rather special case them here.
 	 */
 	if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) {
+		if (nvme_check_aen_error(nvmeq->dev, cqe->status, &cqe->result))
+			return;
 		nvme_complete_async_event(&nvmeq->dev->ctrl,
 				cqe->status, &cqe->result);
 		return;
@@ -3085,6 +3121,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
+	INIT_WORK(&dev->persistent_err_work, nvme_persistent_err_work);
 	mutex_init(&dev->shutdown_lock);
 
 	result = nvme_setup_prp_pools(dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 29ec3e3..8ced243 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -712,6 +712,10 @@ enum {
 };
 
 enum {
+	NVME_AER_ERROR_PERSIST_INT_ERR	= 0x03,	/* "Error" event info 03h: Persistent Internal Error */
+};
+
+enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
 	NVME_AER_NOTICE_ANA		= 0x03,
-- 
1.8.3.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ