[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241008221657.1130181-5-terry.bowman@amd.com>
Date: Tue, 8 Oct 2024 17:16:46 -0500
From: Terry Bowman <terry.bowman@....com>
To: <ming4.li@...el.com>, <linux-cxl@...r.kernel.org>,
<linux-kernel@...r.kernel.org>, <linux-pci@...r.kernel.org>,
<dave@...olabs.net>, <jonathan.cameron@...wei.com>, <dave.jiang@...el.com>,
<alison.schofield@...el.com>, <vishal.l.verma@...el.com>,
<dan.j.williams@...el.com>, <bhelgaas@...gle.com>, <mahesh@...ux.ibm.com>,
<oohall@...il.com>, <Benjamin.Cheatham@....com>, <rrichter@....com>,
<nathan.fontenot@....com>, <smita.koralahallichannabasappa@....com>,
<terry.bowman@....com>
Subject: [PATCH 04/15] cxl/aer/pci: Add CXL PCIe port correctable error support in AER service driver
The AER service driver currently does not manage CXL PCIe port
protocol errors reported by CXL root ports, CXL upstream switch ports,
and CXL downstream switch ports. Consequently, RAS protocol errors
from CXL PCIe port devices are not properly logged or handled.
These errors are reported to the OS via the root port's AER correctable
and uncorrectable internal error fields. While the AER driver supports
handling downstream port protocol errors in restricted CXL host (RCH)
mode, also known as CXL 1.1, it lacks the same functionality for CXL
PCIe ports operating in virtual hierarchy (VH) mode, introduced in
CXL 2.0.
To address this gap, update the AER driver to handle CXL PCIe port
device protocol correctable errors (CE).
Uncorrectable error (UCE) handling will be added in a future
patch.
Make this update alongside the existing downstream port RCH error
handling logic, extending support to CXL PCIe ports in VH.
Signed-off-by: Terry Bowman <terry.bowman@....com>
---
drivers/pci/pcie/aer.c | 54 +++++++++++++++++++++++++++++++++---------
1 file changed, 43 insertions(+), 11 deletions(-)
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index dc8b17999001..1c996287d4ce 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -40,6 +40,8 @@
#define AER_MAX_TYPEOF_COR_ERRS 16 /* as per PCI_ERR_COR_STATUS */
#define AER_MAX_TYPEOF_UNCOR_ERRS 27 /* as per PCI_ERR_UNCOR_STATUS*/
+#define CXL_DVSEC_PORT_EXTENSIONS 3
+
struct aer_err_source {
u32 status; /* PCI_ERR_ROOT_STATUS */
u32 id; /* PCI_ERR_ROOT_ERR_SRC */
@@ -941,6 +943,17 @@ static bool find_source_device(struct pci_dev *parent,
return true;
}
+static bool is_pcie_cxl_port(struct pci_dev *dev)
+{
+ if ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) &&
+ (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM) &&
+ (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))
+ return false;
+
+ return (!!pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL,
+ CXL_DVSEC_PORT_EXTENSIONS));
+}
+
static bool is_internal_error(struct aer_err_info *info)
{
if (info->severity == AER_CORRECTABLE)
@@ -1032,14 +1045,22 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info)
{
- /*
- * Internal errors of an RCEC indicate an AER error in an
- * RCH's downstream port. Check and handle them in the CXL.mem
- * device driver.
- */
- if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
- is_internal_error(info))
+ if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
+
+ if (info->severity == AER_CORRECTABLE) {
+ struct cxl_port_err_hndlrs *cxl_port_hndlrs =
+ find_cxl_port_hndlrs();
+ int aer = dev->aer_cap;
+
+ if (aer)
+ pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
+ info->status);
+
+ if (cxl_port_hndlrs && cxl_port_hndlrs->cor_error_detected)
+ cxl_port_hndlrs->cor_error_detected(dev);
+ pcie_clear_device_status(dev);
+ }
}
static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
@@ -1057,9 +1078,13 @@ static bool handles_cxl_errors(struct pci_dev *dev)
{
bool handles_cxl = false;
- if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
- pcie_aer_is_native(dev))
+ if (!pcie_aer_is_native(dev))
+ return false;
+
+ if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl);
+ else
+ handles_cxl = is_pcie_cxl_port(dev);
return handles_cxl;
}
@@ -1077,6 +1102,10 @@ static void cxl_enable_internal_errors(struct pci_dev *dev)
static inline void cxl_enable_internal_errors(struct pci_dev *dev) { }
static inline void cxl_handle_error(struct pci_dev *dev,
struct aer_err_info *info) { }
+static bool handles_cxl_errors(struct pci_dev *dev)
+{
+ return false;
+}
#endif
void register_cxl_port_hndlrs(struct cxl_port_err_hndlrs *_cxl_port_hndlrs)
@@ -1134,8 +1163,11 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
{
- cxl_handle_error(dev, info);
- pci_aer_handle_error(dev, info);
+ if (is_internal_error(info) && handles_cxl_errors(dev))
+ cxl_handle_error(dev, info);
+ else
+ pci_aer_handle_error(dev, info);
+
pci_dev_put(dev);
}
--
2.34.1
Powered by blists - more mailing lists