linux-kernel - [PATCH v9 12/16] cxl/pci: Introduce CXL Endpoint protocol error handlers

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250603172239.159260-13-terry.bowman@amd.com>
Date: Tue, 3 Jun 2025 12:22:35 -0500
From: Terry Bowman <terry.bowman@....com>
To: <PradeepVineshReddy.Kodamati@....com>, <dave@...olabs.net>,
	<jonathan.cameron@...wei.com>, <dave.jiang@...el.com>,
	<alison.schofield@...el.com>, <vishal.l.verma@...el.com>,
	<ira.weiny@...el.com>, <dan.j.williams@...el.com>, <bhelgaas@...gle.com>,
	<bp@...en8.de>, <ming.li@...omail.com>, <shiju.jose@...wei.com>,
	<dan.carpenter@...aro.org>, <Smita.KoralahalliChannabasappa@....com>,
	<kobayashi.da-06@...itsu.com>, <terry.bowman@....com>, <yanfei.xu@...el.com>,
	<rrichter@....com>, <peterz@...radead.org>, <colyli@...e.de>,
	<uaisheng.ye@...el.com>, <fabio.m.de.francesco@...ux.intel.com>,
	<ilpo.jarvinen@...ux.intel.com>, <yazen.ghannam@....com>,
	<linux-cxl@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
	<linux-pci@...r.kernel.org>
Subject: [PATCH v9 12/16] cxl/pci: Introduce CXL Endpoint protocol error handlers

CXL Endpoint protocol errors are currently handled using PCI error
handlers. The CXL Endpoint requires CXL specific handling in the case of
uncorrectable error handling not provided by the PCI handlers.

Add CXL specific handlers for CXL Endpoints. Rename the existing
cxl_error_handlers to be pci_error_handlers to more correctly indicate
the error type and follow naming consistency.

Keep the existing PCI Endpoint handlers. PCI handlers can be called if the
CXL device is not trained for alternate protocol (CXL). Update the CXL
Endpoint PCI handlers to call the CXL handler. If the CXL uncorrectable
handler returns PCI_ERS_RESULT_PANIC then the PCI handler invokes panic().

Signed-off-by: Terry Bowman <terry.bowman@....com>
---
 drivers/cxl/core/pci.c | 80 ++++++++++++++++++++++--------------------
 drivers/cxl/core/ras.c | 10 +++---
 drivers/cxl/cxl.h      |  4 +++
 drivers/cxl/cxlpci.h   |  6 ++--
 drivers/cxl/pci.c      |  8 ++---
 5 files changed, 58 insertions(+), 50 deletions(-)

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index f5f87c2c3fd5..e094ef518e0a 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -710,8 +710,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
  * Log the state of the RAS status registers and prepare them to log the
  * next error status. Return 1 if reset needed.
  */
-static bool __cxl_handle_ras(struct device *dev, u64 serial,
-			     void __iomem *ras_base)
+static pci_ers_result_t __cxl_handle_ras(struct device *dev, u64 serial,
+					 void __iomem *ras_base)
 {
 	u32 hl[CXL_HEADERLOG_SIZE_U32];
 	void __iomem *addr;
@@ -720,13 +720,13 @@ static bool __cxl_handle_ras(struct device *dev, u64 serial,
 
 	if (!ras_base) {
 		dev_warn_once(dev, "CXL RAS register block is not mapped");
-		return false;
+		return PCI_ERS_RESULT_NONE;
 	}
 
 	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
 	status = readl(addr);
 	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
-		return false;
+		return PCI_ERS_RESULT_NONE;
 
 	/* If multiple errors, log header points to first error from ctrl reg */
 	if (hweight32(status) > 1) {
@@ -743,12 +743,11 @@ static bool __cxl_handle_ras(struct device *dev, u64 serial,
 	trace_cxl_aer_uncorrectable_error(dev, serial, status, fe, hl);
 	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
 
-	return true;
+	return PCI_ERS_RESULT_PANIC;
 }
 
 static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
 {
-
 	return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
 }
 
@@ -844,14 +843,15 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
 static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
 #endif
 
-void cxl_cor_error_detected(struct pci_dev *pdev)
+void cxl_cor_error_detected(struct device *dev)
 {
+	struct pci_dev *pdev = to_pci_dev(dev);
 	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
-	struct device *dev = &cxlds->cxlmd->dev;
+	struct device *cxlmd_dev = &cxlds->cxlmd->dev;
 
-	scoped_guard(device, dev) {
-		if (!dev->driver) {
-			dev_warn(&pdev->dev,
+	scoped_guard(device, cxlmd_dev) {
+		if (!cxlmd_dev->driver) {
+			dev_warn(dev,
 				 "%s: memdev disabled, abort error handling\n",
 				 dev_name(dev));
 			return;
@@ -865,20 +865,28 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
 
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
-				    pci_channel_state_t state)
+void pci_cor_error_detected(struct pci_dev *pdev)
 {
 	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
-	struct cxl_memdev *cxlmd = cxlds->cxlmd;
-	struct device *dev = &cxlmd->dev;
-	bool ue;
+	struct device *cxlmd_dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
 
-	scoped_guard(device, dev) {
+	cxl_cor_error_detected(&pdev->dev);
+}
+EXPORT_SYMBOL_NS_GPL(pci_cor_error_detected, "CXL");
+
+pci_ers_result_t cxl_error_detected(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	struct device *cxlmd_dev = &cxlds->cxlmd->dev;
+	pci_ers_result_t ue;
+
+	scoped_guard(device, cxlmd_dev) {
 		if (!dev->driver) {
 			dev_warn(&pdev->dev,
 				 "%s: memdev disabled, abort error handling\n",
 				 dev_name(dev));
-			return PCI_ERS_RESULT_DISCONNECT;
+			return PCI_ERS_RESULT_PANIC;
 		}
 
 		if (cxlds->rcd)
@@ -892,29 +900,25 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 		ue = cxl_handle_endpoint_ras(cxlds);
 	}
 
-
-	switch (state) {
-	case pci_channel_io_normal:
-		if (ue) {
-			device_release_driver(dev);
-			return PCI_ERS_RESULT_NEED_RESET;
-		}
-		return PCI_ERS_RESULT_CAN_RECOVER;
-	case pci_channel_io_frozen:
-		dev_warn(&pdev->dev,
-			 "%s: frozen state error detected, disable CXL.mem\n",
-			 dev_name(dev));
-		device_release_driver(dev);
-		return PCI_ERS_RESULT_NEED_RESET;
-	case pci_channel_io_perm_failure:
-		dev_warn(&pdev->dev,
-			 "failure state error detected, request disconnect\n");
-		return PCI_ERS_RESULT_DISCONNECT;
-	}
-	return PCI_ERS_RESULT_NEED_RESET;
+	return ue;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
 
+pci_ers_result_t pci_error_detected(struct pci_dev *pdev,
+				    pci_channel_state_t error)
+{
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	struct device *cxlmd_dev __free(put_device) = &cxlds->cxlmd->dev;
+	pci_ers_result_t rc;
+
+	rc = cxl_error_detected(&pdev->dev);
+	if (rc == PCI_ERS_RESULT_PANIC)
+		panic("CXL cachemem error.");
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(pci_error_detected, "CXL");
+
 static int cxl_flit_size(struct pci_dev *pdev)
 {
 	if (cxl_pci_flit_256(pdev))
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 0ef8c2068c0c..664f532cc838 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -153,7 +153,7 @@ static int cxl_report_error_detected(struct pci_dev *pdev, void *data)
 	struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
 
 	device_lock(&pdev->dev);
-	vote = cxl_error_detected(pdev, pci_channel_io_frozen);
+	vote = cxl_error_detected(&pdev->dev);
 	*result = cxl_merge_result(*result, vote);
 	device_unlock(&pdev->dev);
 
@@ -223,7 +223,7 @@ static int cxl_rch_handle_error_iter(struct pci_dev *pdev, void *data)
 	struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
 
 	if (err_info->severity == AER_CORRECTABLE)
-		cxl_cor_error_detected(pdev);
+		cxl_cor_error_detected(dev);
 	else
 		cxl_do_recovery(pdev);
 
@@ -244,6 +244,8 @@ static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info)
 static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
 {
 	struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(sbdf_to_pci(err_info));
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	struct device *cxlmd_dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
 
 	if (!pdev) {
 		pr_err("Failed to find the CXL device\n");
@@ -260,15 +262,13 @@ static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
 
 	if (err_info->severity == AER_CORRECTABLE) {
 		int aer = pdev->aer_cap;
-		struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
-		struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
 
 		if (aer)
 			pci_clear_and_set_config_dword(pdev,
 						       aer + PCI_ERR_COR_STATUS,
 						       0, PCI_ERR_COR_INTERNAL);
 
-		cxl_cor_error_detected(pdev);
+		cxl_cor_error_detected(&pdev->dev);
 
 		pcie_clear_device_status(pdev);
 	} else {
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 73be66ef36a2..6fd9a42eb304 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -11,6 +11,7 @@
 #include <linux/log2.h>
 #include <linux/node.h>
 #include <linux/io.h>
+#include <linux/pci.h>
 
 extern const struct nvdimm_security_ops *cxl_security_ops;
 
@@ -797,6 +798,9 @@ static inline int cxl_root_decoder_autoremove(struct device *host,
 }
 int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint);
 
+void cxl_cor_error_detected(struct device *dev);
+pci_ers_result_t cxl_error_detected(struct device *dev);
+
 /**
  * struct cxl_endpoint_dvsec_info - Cached DVSEC info
  * @mem_enabled: cached value of mem_enabled in the DVSEC at init time
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 6f1396ef7b77..a572c57c6c63 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -133,7 +133,7 @@ struct cxl_dev_state;
 int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
 			struct cxl_endpoint_dvsec_info *info);
 void read_cdat_data(struct cxl_port *port);
-void cxl_cor_error_detected(struct pci_dev *pdev);
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
-				    pci_channel_state_t state);
+void pci_cor_error_detected(struct pci_dev *pdev);
+pci_ers_result_t pci_error_detected(struct pci_dev *pdev,
+				    pci_channel_state_t error);
 #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 785aa2af5eaa..2b948e94bc97 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1112,11 +1112,11 @@ static void cxl_reset_done(struct pci_dev *pdev)
 	}
 }
 
-static const struct pci_error_handlers cxl_error_handlers = {
-	.error_detected	= cxl_error_detected,
+static const struct pci_error_handlers pci_error_handlers = {
+	.error_detected = pci_error_detected,
 	.slot_reset	= cxl_slot_reset,
 	.resume		= cxl_error_resume,
-	.cor_error_detected	= cxl_cor_error_detected,
+	.cor_error_detected	= pci_cor_error_detected,
 	.reset_done	= cxl_reset_done,
 };
 
@@ -1124,7 +1124,7 @@ static struct pci_driver cxl_pci_driver = {
 	.name			= KBUILD_MODNAME,
 	.id_table		= cxl_mem_pci_tbl,
 	.probe			= cxl_pci_probe,
-	.err_handler		= &cxl_error_handlers,
+	.err_handler		= &pci_error_handlers,
 	.dev_groups		= cxl_rcd_groups,
 	.driver	= {
 		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
-- 
2.34.1