[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250603172239.159260-6-terry.bowman@amd.com>
Date: Tue, 3 Jun 2025 12:22:28 -0500
From: Terry Bowman <terry.bowman@....com>
To: <PradeepVineshReddy.Kodamati@....com>, <dave@...olabs.net>,
<jonathan.cameron@...wei.com>, <dave.jiang@...el.com>,
<alison.schofield@...el.com>, <vishal.l.verma@...el.com>,
<ira.weiny@...el.com>, <dan.j.williams@...el.com>, <bhelgaas@...gle.com>,
<bp@...en8.de>, <ming.li@...omail.com>, <shiju.jose@...wei.com>,
<dan.carpenter@...aro.org>, <Smita.KoralahalliChannabasappa@....com>,
<kobayashi.da-06@...itsu.com>, <terry.bowman@....com>, <yanfei.xu@...el.com>,
<rrichter@....com>, <peterz@...radead.org>, <colyli@...e.de>,
<uaisheng.ye@...el.com>, <fabio.m.de.francesco@...ux.intel.com>,
<ilpo.jarvinen@...ux.intel.com>, <yazen.ghannam@....com>,
<linux-cxl@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
<linux-pci@...r.kernel.org>
Subject: [PATCH v9 05/16] CXL/PCI: Introduce CXL uncorrectable protocol error recovery
Create cxl_do_recovery() to provide uncorrectable protocol error (UCE)
handling. Follow a design similar to that of the PCIe error driver's
pcie_do_recovery(). One difference is that cxl_do_recovery() treats all UCEs
as fatal and panics the kernel. This is to prevent corruption of CXL memory.
Copy the PCI error driver's merge_result() and rename it cxl_merge_result().
Introduce PCI_ERS_RESULT_PANIC and add support for it in the
cxl_merge_result() routine.
Copy pci_walk_bridge() to cxl_walk_bridge(). Make a change to walk the
first device in all cases.
Copy the PCI error driver's report_error_detected() to cxl_report_error_detected().
Note that only CXL Endpoints are currently supported. Lock the PCI device,
as is done in PCI's report_error_detected(). Take a reference on the CXL
device responsible for cleanup of the CXL RAS registers. This is necessary
to prevent the RAS registers from disappearing before logging is completed.
In cxl_do_recovery(), call panic() to halt the system when an uncorrectable
error (UCE) is found. Export pci_aer_clear_fatal_status() for CXL to use when
a UCE is not found: in that case the AER status must still be cleared, which
is done with pci_aer_clear_fatal_status().
Signed-off-by: Terry Bowman <terry.bowman@....com>
---
drivers/cxl/core/ras.c | 79 ++++++++++++++++++++++++++++++++++++++++++
include/linux/pci.h | 3 ++
2 files changed, 82 insertions(+)
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 9ed5c682e128..715f7221ea3a 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -110,8 +110,87 @@ static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
#ifdef CONFIG_PCIEAER_CXL
+/*
+ * cxl_merge_result - fold one device's recovery vote into the running result
+ * @orig: result accumulated from the devices handled so far
+ * @new: vote returned for the device just handled
+ *
+ * Merge rules, in priority order:
+ *  - PCI_ERS_RESULT_PANIC is sticky: if either @orig or @new is PANIC the
+ *    merged result is PANIC, so a later vote can never hide a panic request.
+ *  - A PCI_ERS_RESULT_NO_AER_DRIVER vote replaces the accumulated result.
+ *  - A PCI_ERS_RESULT_NONE vote leaves the accumulated result untouched.
+ *  - An accumulated CAN_RECOVER or RECOVERED is replaced by the new vote.
+ *  - An accumulated DISCONNECT is changed only by a NEED_RESET vote.
+ *  - Any other accumulated result is kept as is.
+ *
+ * Return: the merged result.
+ */
+static pci_ers_result_t cxl_merge_result(enum pci_ers_result orig,
+					  enum pci_ers_result new)
+{
+	/*
+	 * Check both sides: testing only @new would let a subsequent
+	 * NO_AER_DRIVER vote overwrite a PANIC that was already recorded.
+	 */
+	if (orig == PCI_ERS_RESULT_PANIC || new == PCI_ERS_RESULT_PANIC)
+		return PCI_ERS_RESULT_PANIC;
+
+	if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
+		return PCI_ERS_RESULT_NO_AER_DRIVER;
+
+	if (new == PCI_ERS_RESULT_NONE)
+		return orig;
+
+	switch (orig) {
+	case PCI_ERS_RESULT_CAN_RECOVER:
+	case PCI_ERS_RESULT_RECOVERED:
+		orig = new;
+		break;
+	case PCI_ERS_RESULT_DISCONNECT:
+		if (new == PCI_ERS_RESULT_NEED_RESET)
+			orig = PCI_ERS_RESULT_NEED_RESET;
+		break;
+	default:
+		break;
+	}
+
+	return orig;
+}
+
+/*
+ * cxl_report_error_detected - per-device callback that collects a recovery vote
+ * @pdev: PCI device currently being visited
+ * @data: pointer to the pci_ers_result_t that accumulates the votes
+ *
+ * Devices whose PCIe type is neither PCI_EXP_TYPE_ENDPOINT nor
+ * PCI_EXP_TYPE_RC_END are skipped, as are devices that have no driver data
+ * or no memdev attached to it. For the remaining devices,
+ * cxl_error_detected() is called under the PCI device lock and its vote is
+ * merged into @data with cxl_merge_result().
+ *
+ * Return: always 0; cxl_walk_bridge() only stops on a non-zero return.
+ */
+static int cxl_report_error_detected(struct pci_dev *pdev, void *data)
+{
+	pci_ers_result_t vote, *result = data;
+	struct cxl_dev_state *cxlds;
+
+	if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
+	    (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_END))
+		return 0;
+
+	/*
+	 * The walk visits every device below the bridge, so this one may have
+	 * no driver data (or no memdev yet). Skip it rather than dereference
+	 * a NULL pointer below.
+	 * NOTE(review): the driver data is assumed to be a struct
+	 * cxl_dev_state; confirm that endpoints bound to a non-CXL driver
+	 * cannot reach this point.
+	 */
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds || !cxlds->cxlmd)
+		return 0;
+
+	/* Reference on the memdev is dropped automatically at scope exit. */
+	struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
+
+	device_lock(&pdev->dev);
+	vote = cxl_error_detected(pdev, pci_channel_io_frozen);
+	*result = cxl_merge_result(*result, vote);
+	device_unlock(&pdev->dev);
+
+	return 0;
+}
+
+/*
+ * cxl_walk_bridge - run a callback on a device and on its subordinate bus
+ * @bridge: device to start from; the callback is always run on it first
+ * @cb: callback invoked for each device
+ * @userdata: opaque pointer handed to every @cb invocation
+ *
+ * If @cb returns non-zero for @bridge the walk ends there. Otherwise, when
+ * @bridge has a subordinate bus, that bus is handed to pci_walk_bus() with
+ * the same callback and data.
+ */
+static void cxl_walk_bridge(struct pci_dev *bridge,
+			    int (*cb)(struct pci_dev *, void *),
+			    void *userdata)
+{
+	struct pci_bus *child;
+
+	/* The starting device is visited unconditionally. */
+	if (cb(bridge, userdata) != 0)
+		return;
+
+	child = bridge->subordinate;
+	if (!child)
+		return;
+
+	pci_walk_bus(child, cb, userdata);
+}
+
static void cxl_do_recovery(struct pci_dev *pdev)
{
+ struct pci_host_bridge *host = pci_find_host_bridge(pdev->bus);
+ pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
+
+ cxl_walk_bridge(pdev, cxl_report_error_detected, &status);
+ if (status == PCI_ERS_RESULT_PANIC)
+ panic("CXL cachemem error.");
+
+ /*
+ * If we have native control of AER, clear error status in the device
+ * that detected the error. If the platform retained control of AER,
+ * it is responsible for clearing this status. In that case, the
+ * signaling device may not even be visible to the OS.
+ */
+ if (host->native_aer) {
+ pcie_clear_device_status(pdev);
+ pci_aer_clear_nonfatal_status(pdev);
+ pci_aer_clear_fatal_status(pdev);
+ }
+
+ pci_info(pdev, "CXL uncorrectable error.\n");
}
static int cxl_rch_handle_error_iter(struct pci_dev *pdev, void *data)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index cd53715d53f3..b0e7545162de 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -870,6 +870,9 @@ enum pci_ers_result {
/* No AER capabilities registered for the driver */
PCI_ERS_RESULT_NO_AER_DRIVER = (__force pci_ers_result_t) 6,
+
+ /* System is unstable, panic */
+ PCI_ERS_RESULT_PANIC = (__force pci_ers_result_t) 7,
};
/* PCI bus error event callbacks */
--
2.34.1
Powered by blists - more mailing lists