[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20250208002941.4135321-18-terry.bowman@amd.com>
Date: Fri, 7 Feb 2025 18:29:41 -0600
From: Terry Bowman <terry.bowman@....com>
To: <linux-cxl@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
<linux-pci@...r.kernel.org>, <nifan.cxl@...il.com>, <dave@...olabs.net>,
<jonathan.cameron@...wei.com>, <dave.jiang@...el.com>,
<alison.schofield@...el.com>, <vishal.l.verma@...el.com>,
<dan.j.williams@...el.com>, <bhelgaas@...gle.com>, <mahesh@...ux.ibm.com>,
<ira.weiny@...el.com>, <oohall@...il.com>, <Benjamin.Cheatham@....com>,
<rrichter@....com>, <nathan.fontenot@....com>, <terry.bowman@....com>,
<Smita.KoralahalliChannabasappa@....com>, <lukas@...ner.de>,
<ming.li@...omail.com>, <PradeepVineshReddy.Kodamati@....com>
Subject: [PATCH v6 17/17] cxl/pci: Handle CXL Endpoint and RCH protocol errors separately from PCIe errors
CXL Endpoint and Restricted CXL Host (RCH) Downstream Port Protocol Errors
are currently treated as PCIe errors, so CXL uncorrectable errors (UCE) are
not handled properly. When a CXL device encounters an uncorrectable
protocol error, the system should panic to prevent potential CXL memory
corruption.
Treat CXL Endpoint and RCH Downstream Port protocol errors as CXL errors.
This requires updates in both the CXL and AER drivers.
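Update the CXL Endpoint driver with a new struct cxl_error_handlers
instance named cxl_ep_error_handlers. Move the existing CE and UCE handler
assignments out of the driver's struct pci_error_handlers instance
(previously named cxl_error_handlers, renamed here to
pcie_ep_error_handlers) and into the new cxl_ep_error_handlers. Remove the
'state' parameter from the UCE handler interface because it is not used in
CXL recovery. The UCE handler now returns PCI_ERS_RESULT_PANIC when an
uncorrectable error is found or when the memdev is disabled.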
Update the AER driver to associate CXL Protocol errors with CXL error
handling. Change the detection in handles_cxl_errors() to use pcie_is_cxl()
instead of pcie_is_cxl_port().
Update the AER driver to use the CXL error handlers for RCH error handling.
Signed-off-by: Terry Bowman <terry.bowman@....com>
---
drivers/cxl/core/pci.c | 26 +++++---------------------
drivers/cxl/cxlpci.h | 3 +--
drivers/cxl/pci.c | 10 +++++++---
drivers/pci/pcie/aer.c | 11 ++++++-----
4 files changed, 19 insertions(+), 31 deletions(-)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 36e686a31045..18d47a14959e 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -1075,8 +1075,7 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
}
EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state)
+pci_ers_result_t cxl_error_detected(struct pci_dev *pdev)
{
struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
struct cxl_memdev *cxlmd = cxlds->cxlmd;
@@ -1088,7 +1087,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
dev_warn(&pdev->dev,
"%s: memdev disabled, abort error handling\n",
dev_name(dev));
- return PCI_ERS_RESULT_DISCONNECT;
+ return PCI_ERS_RESULT_PANIC;
}
if (cxlds->rcd)
@@ -1102,26 +1101,11 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
ue = cxl_handle_endpoint_ras(cxlds);
}
-
- switch (state) {
- case pci_channel_io_normal:
- if (ue) {
- device_release_driver(dev);
- return PCI_ERS_RESULT_NEED_RESET;
- }
- return PCI_ERS_RESULT_CAN_RECOVER;
- case pci_channel_io_frozen:
- dev_warn(&pdev->dev,
- "%s: frozen state error detected, disable CXL.mem\n",
- dev_name(dev));
+ if (ue) {
device_release_driver(dev);
- return PCI_ERS_RESULT_NEED_RESET;
- case pci_channel_io_perm_failure:
- dev_warn(&pdev->dev,
- "failure state error detected, request disconnect\n");
- return PCI_ERS_RESULT_DISCONNECT;
+ return PCI_ERS_RESULT_PANIC;
}
- return PCI_ERS_RESULT_NEED_RESET;
+ return PCI_ERS_RESULT_CAN_RECOVER;
}
EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 54e219b0049e..4b8910d934d5 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -133,6 +133,5 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
struct cxl_endpoint_dvsec_info *info);
void read_cdat_data(struct cxl_port *port);
void cxl_cor_error_detected(struct pci_dev *pdev);
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
- pci_channel_state_t state);
+pci_ers_result_t cxl_error_detected(struct pci_dev *pdev);
#endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index b2c943a4de0a..520570741402 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1104,11 +1104,14 @@ static void cxl_reset_done(struct pci_dev *pdev)
}
}
-static const struct pci_error_handlers cxl_error_handlers = {
+static const struct cxl_error_handlers cxl_ep_error_handlers = {
.error_detected = cxl_error_detected,
+ .cor_error_detected = cxl_cor_error_detected,
+};
+
+static const struct pci_error_handlers pcie_ep_error_handlers = {
.slot_reset = cxl_slot_reset,
.resume = cxl_error_resume,
- .cor_error_detected = cxl_cor_error_detected,
.reset_done = cxl_reset_done,
};
@@ -1116,7 +1119,8 @@ static struct pci_driver cxl_pci_driver = {
.name = KBUILD_MODNAME,
.id_table = cxl_mem_pci_tbl,
.probe = cxl_pci_probe,
- .err_handler = &cxl_error_handlers,
+ .err_handler = &pcie_ep_error_handlers,
+ .cxl_err_handler = &cxl_ep_error_handlers,
.dev_groups = cxl_rcd_groups,
.driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 8e3a60411610..07c888fd4c08 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -993,7 +993,7 @@ static bool cxl_error_is_native(struct pci_dev *dev)
static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
{
struct aer_err_info *info = (struct aer_err_info *)data;
- const struct pci_error_handlers *err_handler;
+ const struct cxl_error_handlers *err_handler;
if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
return 0;
@@ -1001,7 +1001,8 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
/* protect dev->driver */
device_lock(&dev->dev);
- err_handler = dev->driver ? dev->driver->err_handler : NULL;
+ err_handler = dev->driver ? dev->driver->cxl_err_handler : NULL;
+
if (!err_handler)
goto out;
@@ -1010,9 +1011,9 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
err_handler->cor_error_detected(dev);
} else if (err_handler->error_detected) {
if (info->severity == AER_NONFATAL)
- err_handler->error_detected(dev, pci_channel_io_normal);
+ err_handler->error_detected(dev);
else if (info->severity == AER_FATAL)
- err_handler->error_detected(dev, pci_channel_io_frozen);
+ err_handler->error_detected(dev);
cxl_do_recovery(dev);
}
@@ -1070,7 +1071,7 @@ static bool handles_cxl_errors(struct pci_dev *dev)
if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl);
else
- handles_cxl = pcie_is_cxl_port(dev);
+ handles_cxl = pcie_is_cxl(dev);
return handles_cxl;
}
--
2.34.1
Powered by blists - more mailing lists