[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250827013539.903682-17-terry.bowman@amd.com>
Date: Tue, 26 Aug 2025 20:35:31 -0500
From: Terry Bowman <terry.bowman@....com>
To: <dave@...olabs.net>, <jonathan.cameron@...wei.com>,
<dave.jiang@...el.com>, <alison.schofield@...el.com>,
<dan.j.williams@...el.com>, <bhelgaas@...gle.com>, <shiju.jose@...wei.com>,
<ming.li@...omail.com>, <Smita.KoralahalliChannabasappa@....com>,
<rrichter@....com>, <dan.carpenter@...aro.org>,
<PradeepVineshReddy.Kodamati@....com>, <lukas@...ner.de>,
<Benjamin.Cheatham@....com>, <sathyanarayanan.kuppuswamy@...ux.intel.com>,
<linux-cxl@...r.kernel.org>, <alucerop@....com>, <ira.weiny@...el.com>
CC: <linux-kernel@...r.kernel.org>, <linux-pci@...r.kernel.org>
Subject: [PATCH v11 16/23] cxl/pci: Introduce CXL Endpoint protocol error handlers
CXL Endpoint protocol errors are currently handled using PCI error
handlers. The CXL Endpoint requires CXL-specific handling of uncorrectable
errors (UCE), which the PCI handlers do not provide.
Add CXL-specific handlers for CXL Endpoints. Rename the existing
cxl_error_handlers to pci_error_handlers to more accurately indicate
the error type and to keep the naming consistent.
The PCI handlers will be called if the CXL device is not trained for the
alternate protocol (CXL). Update the CXL Endpoint PCI handlers to call the
corresponding CXL correctable (CE) and uncorrectable (UCE) handlers.
The existing EP UCE handler checks the PCI channel state to choose among
several recovery results. These checks are no longer needed because CXL
UCE recovery will not be attempted. Change cxl_handle_ras() to return
PCI_ERS_RESULT_NONE or PCI_ERS_RESULT_PANIC. The CXL UCE handler is called
by cxl_do_recovery(), which acts on the return value. In the PCI handler
path, call panic() if the result is PCI_ERS_RESULT_PANIC.
Signed-off-by: Terry Bowman <terry.bowman@....com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@...ux.intel.com>
---
Changes in v10->v11:
- cxl_error_detected() - Change handlers' scoped_guard() to guard() (Jonathan)
- cxl_error_detected() - Remove extra line (Shiju)
- Changes moved to core/ras.c (Terry)
- cxl_error_detected(), remove 'ue' and return with function call. (Jonathan)
- Remove extra space in documentation for PCI_ERS_RESULT_PANIC definition
- Move #include "pci.h" from cxl.h to core.h (Terry)
- Remove unnecessary includes of cxl.h and core.h in mem.c (Terry)
---
drivers/cxl/core/core.h | 17 +++++++
drivers/cxl/core/ras.c | 110 +++++++++++++++++++---------------------
drivers/cxl/cxlpci.h | 15 ------
drivers/cxl/pci.c | 9 ++--
include/linux/pci.h | 3 ++
5 files changed, 78 insertions(+), 76 deletions(-)
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 2fa76a913264..6e3e7f2e0e2d 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -6,6 +6,7 @@
#include <cxl/mailbox.h>
#include <linux/rwsem.h>
+#include <linux/pci.h>
extern const struct device_type cxl_nvdimm_bridge_type;
extern const struct device_type cxl_nvdimm_type;
@@ -149,6 +150,11 @@ void cxl_ras_exit(void);
void cxl_switch_port_init_ras(struct cxl_port *port);
void cxl_endpoint_port_init_ras(struct cxl_port *ep);
void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host);
+pci_ers_result_t pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t error);
+void pci_cor_error_detected(struct pci_dev *pdev);
+void cxl_cor_error_detected(struct device *dev);
+pci_ers_result_t cxl_error_detected(struct device *dev);
#else
static inline int cxl_ras_init(void)
{
@@ -162,6 +168,17 @@ static inline void cxl_switch_port_init_ras(struct cxl_port *port) { }
static inline void cxl_endpoint_port_init_ras(struct cxl_port *ep) { }
static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport,
struct device *host) { }
+static inline pci_ers_result_t pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t error)
+{
+ return PCI_ERS_RESULT_NONE;
+}
+static inline void pci_cor_error_detected(struct pci_dev *pdev) { }
+static inline void cxl_cor_error_detected(struct device *dev) { }
+static inline pci_ers_result_t cxl_error_detected(struct device *dev)
+{
+ return PCI_ERS_RESULT_NONE;
+}
#endif // CONFIG_CXL_RAS
int cxl_gpf_port_setup(struct cxl_dport *dport);
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 42b6e0b092d5..b285448c2d9c 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -129,7 +129,7 @@ void cxl_ras_exit(void)
cancel_work_sync(&cxl_cper_prot_err_work);
}
-static bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base);
+static pci_ers_result_t cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base);
static void cxl_handle_cor_ras(struct device *dev, u64 serial, void __iomem *ras_base);
#ifdef CONFIG_CXL_RCH_RAS
@@ -371,7 +371,7 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
* Log the state of the RAS status registers and prepare them to log the
* next error status. Return 1 if reset needed.
*/
-static bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base)
+static pci_ers_result_t cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base)
{
u32 hl[CXL_HEADERLOG_SIZE_U32];
void __iomem *addr;
@@ -380,13 +380,13 @@ static bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_bas
if (!ras_base) {
dev_warn_once(dev, "CXL RAS register block is not mapped");
- return false;
+ return PCI_ERS_RESULT_NONE;
}
addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
status = readl(addr);
if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
- return false;
+ return PCI_ERS_RESULT_NONE;
/* If multiple errors, log header points to first error from ctrl reg */
if (hweight32(status) > 1) {
@@ -403,76 +403,72 @@ static bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_bas
trace_cxl_aer_uncorrectable_error(dev, status, fe, hl, serial);
writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
- return true;
+ return PCI_ERS_RESULT_PANIC;
}
-void cxl_cor_error_detected(struct pci_dev *pdev)
+void cxl_cor_error_detected(struct device *dev)
{
+ struct pci_dev *pdev = to_pci_dev(dev);
struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
- struct device *dev = &cxlds->cxlmd->dev;
+ struct device *cxlmd_dev = &cxlds->cxlmd->dev;
- scoped_guard(device, dev) {
- if (!dev->driver) {
- dev_warn(&pdev->dev,
- "%s: memdev disabled, abort error handling\n",
- dev_name(dev));
- return;
- }
+ guard(device)(cxlmd_dev);
- if (cxlds->rcd)
- cxl_handle_rdport_errors(cxlds);
-
- cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
+ if (!cxlmd_dev->driver) {
+ dev_warn(&pdev->dev, "%s: memdev disabled, abort error handling", dev_name(dev));
+ return;
}
+
+ if (cxlds->rcd)
+ cxl_handle_rdport_errors(cxlds);
+
+ cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
}
EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state)
+void pci_cor_error_detected(struct pci_dev *pdev)
{
+ cxl_cor_error_detected(&pdev->dev);
+}
+EXPORT_SYMBOL_NS_GPL(pci_cor_error_detected, "CXL");
+
+pci_ers_result_t cxl_error_detected(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
- struct cxl_memdev *cxlmd = cxlds->cxlmd;
- struct device *dev = &cxlmd->dev;
- bool ue;
+ struct device *cxlmd_dev = &cxlds->cxlmd->dev;
- scoped_guard(device, dev) {
- if (!dev->driver) {
- dev_warn(&pdev->dev,
- "%s: memdev disabled, abort error handling\n",
- dev_name(dev));
- return PCI_ERS_RESULT_DISCONNECT;
- }
+ guard(device)(cxlmd_dev);
- if (cxlds->rcd)
- cxl_handle_rdport_errors(cxlds);
- /*
- * A frozen channel indicates an impending reset which is fatal to
- * CXL.mem operation, and will likely crash the system. On the off
- * chance the situation is recoverable dump the status of the RAS
- * capability registers and bounce the active state of the memdev.
- */
- ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
- }
-
-
- switch (state) {
- case pci_channel_io_normal:
- if (ue) {
- device_release_driver(dev);
- return PCI_ERS_RESULT_NEED_RESET;
- }
- return PCI_ERS_RESULT_CAN_RECOVER;
- case pci_channel_io_frozen:
+ if (!dev->driver) {
dev_warn(&pdev->dev,
- "%s: frozen state error detected, disable CXL.mem\n",
+ "%s: memdev disabled, abort error handling\n",
dev_name(dev));
- device_release_driver(dev);
- return PCI_ERS_RESULT_NEED_RESET;
- case pci_channel_io_perm_failure:
- dev_warn(&pdev->dev,
- "failure state error detected, request disconnect\n");
return PCI_ERS_RESULT_DISCONNECT;
}
- return PCI_ERS_RESULT_NEED_RESET;
+
+ if (cxlds->rcd)
+ cxl_handle_rdport_errors(cxlds);
+
+ /*
+ * Log the state of the RAS capability registers. No recovery is
+ * attempted here: cxl_handle_ras() returns PCI_ERS_RESULT_PANIC when
+ * an uncorrectable error was logged and PCI_ERS_RESULT_NONE
+ * otherwise, and the caller acts on that result.
+ */
+ return cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
}
EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
+
+pci_ers_result_t pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t error)
+{
+ pci_ers_result_t rc;
+
+ rc = cxl_error_detected(&pdev->dev);
+ if (rc == PCI_ERS_RESULT_PANIC)
+ panic("CXL cachemem error.");
+
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(pci_error_detected, "CXL");
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index a6da0abfa506..ccf0ca36bc00 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -79,19 +79,4 @@ struct cxl_dev_state;
int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
struct cxl_endpoint_dvsec_info *info);
void read_cdat_data(struct cxl_port *port);
-
-#ifdef CONFIG_CXL_RAS
-void cxl_cor_error_detected(struct pci_dev *pdev);
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state);
-#else
-static inline void cxl_cor_error_detected(struct pci_dev *pdev) { }
-
-static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state)
-{
- return PCI_ERS_RESULT_NONE;
-}
-#endif
-
#endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index bd95be1f3d5c..6803c2fb906b 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -16,6 +16,7 @@
#include "cxlpci.h"
#include "cxl.h"
#include "pmu.h"
+#include "core/core.h"
/**
* DOC: cxl pci
@@ -1112,11 +1113,11 @@ static void cxl_reset_done(struct pci_dev *pdev)
}
}
-static const struct pci_error_handlers cxl_error_handlers = {
- .error_detected = cxl_error_detected,
+static const struct pci_error_handlers pci_error_handlers = {
+ .error_detected = pci_error_detected,
.slot_reset = cxl_slot_reset,
.resume = cxl_error_resume,
- .cor_error_detected = cxl_cor_error_detected,
+ .cor_error_detected = pci_cor_error_detected,
.reset_done = cxl_reset_done,
};
@@ -1124,7 +1125,7 @@ static struct pci_driver cxl_pci_driver = {
.name = KBUILD_MODNAME,
.id_table = cxl_mem_pci_tbl,
.probe = cxl_pci_probe,
- .err_handler = &cxl_error_handlers,
+ .err_handler = &pci_error_handlers,
.dev_groups = cxl_rcd_groups,
.driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 79878243b681..3dcab36c437f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -868,6 +868,9 @@ enum pci_ers_result {
/* No AER capabilities registered for the driver */
PCI_ERS_RESULT_NO_AER_DRIVER = (__force pci_ers_result_t) 6,
+
+ /* System is unstable, panic. Is CXL specific */
+ PCI_ERS_RESULT_PANIC = (__force pci_ers_result_t) 7,
};
/* PCI bus error event callbacks */
--
2.51.0.rc2.21.ge5ab6b3e5a
Powered by blists - more mailing lists