[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250208002941.4135321-15-terry.bowman@amd.com>
Date: Fri, 7 Feb 2025 18:29:38 -0600
From: Terry Bowman <terry.bowman@....com>
To: <linux-cxl@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
<linux-pci@...r.kernel.org>, <nifan.cxl@...il.com>, <dave@...olabs.net>,
<jonathan.cameron@...wei.com>, <dave.jiang@...el.com>,
<alison.schofield@...el.com>, <vishal.l.verma@...el.com>,
<dan.j.williams@...el.com>, <bhelgaas@...gle.com>, <mahesh@...ux.ibm.com>,
<ira.weiny@...el.com>, <oohall@...il.com>, <Benjamin.Cheatham@....com>,
<rrichter@....com>, <nathan.fontenot@....com>, <terry.bowman@....com>,
<Smita.KoralahalliChannabasappa@....com>, <lukas@...ner.de>,
<ming.li@...omail.com>, <PradeepVineshReddy.Kodamati@....com>
Subject: [PATCH v6 14/17] cxl/pci: Update CXL Port RAS logging to also display PCIe SBDF
CXL RAS errors are currently logged using the associated CXL port's name
returned from dev_name(). Ports are typically named 'port1', 'port2',
etc. to indicate their hierarchical location in the CXL topology. However,
this doesn't clearly indicate the CXL card or slot reporting the error.
Update the logging to also log the corresponding PCIe device name. This
gives the PCIe SBDF, or the ACPI object name in the case of a CXL host
bridge (HB), and helps users identify which physical slot and card
reported the error.
Below is example output after making these changes.
Correctable error example output:
cxl_port_aer_correctable_error: device=port1 (0000:0c:00.0) parent=root0 (pci0000:0c) status='Received Error From Physical Layer'
Uncorrectable error example output:
cxl_port_aer_uncorrectable_error: device=port1 (0000:0c:00.0) parent=root0 (pci0000:0c) status: 'Memory Byte Enable Parity Error' first_error: 'Memory Byte Enable Parity Error'
Signed-off-by: Terry Bowman <terry.bowman@....com>
---
drivers/cxl/core/pci.c | 31 +++++++++++++++--------------
drivers/cxl/core/trace.h | 42 +++++++++++++++++++++++++---------------
2 files changed, 42 insertions(+), 31 deletions(-)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 61e6d33d2270..f154dcf6dfda 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -652,14 +652,14 @@ void read_cdat_data(struct cxl_port *port)
}
EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
-static void __cxl_handle_cor_ras(struct device *dev,
+static void __cxl_handle_cor_ras(struct device *cxl_dev, struct device *pcie_dev,
void __iomem *ras_base)
{
void __iomem *addr;
u32 status;
if (!ras_base) {
- dev_warn_once(dev, "CXL RAS register block is not mapped");
+ dev_warn_once(cxl_dev, "CXL RAS register block is not mapped");
return;
}
@@ -677,7 +677,7 @@ static void __cxl_handle_cor_ras(struct device *dev,
static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds)
{
- return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+ return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, NULL, cxlds->regs.ras);
}
/* CXL spec rev3.0 8.2.4.16.1 */
@@ -701,7 +701,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
* Log the state of the RAS status registers and prepare them to log the
* next error status. Return 1 if reset needed.
*/
-static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
+static pci_ers_result_t __cxl_handle_ras(struct device *cxl_dev, struct device *pcie_dev,
+ void __iomem *ras_base)
{
u32 hl[CXL_HEADERLOG_SIZE_U32];
void __iomem *addr;
@@ -709,7 +710,7 @@ static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_b
u32 fe;
if (!ras_base) {
- dev_warn_once(dev, "CXL RAS register block is not mapped");
+ dev_warn_once(cxl_dev, "CXL RAS register block is not mapped");
return PCI_ERS_RESULT_NONE;
}
@@ -730,10 +731,10 @@ static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_b
}
header_log_copy(ras_base, hl);
- if (is_cxl_memdev(dev))
- trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl);
- else if (is_cxl_port(dev))
- trace_cxl_port_aer_uncorrectable_error(dev, status, fe, hl);
+ if (is_cxl_memdev(cxl_dev))
+ trace_cxl_aer_uncorrectable_error(to_cxl_memdev(cxl_dev), status, fe, hl);
+ else if (is_cxl_port(cxl_dev))
+ trace_cxl_port_aer_uncorrectable_error(cxl_dev, pcie_dev, status, fe, hl);
writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
@@ -742,7 +743,7 @@ static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_b
static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
{
- return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+ return __cxl_handle_ras(&cxlds->cxlmd->dev, NULL, cxlds->regs.ras);
}
#ifdef CONFIG_PCIEAER_CXL
@@ -814,7 +815,7 @@ static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev, struct device **dev)
struct cxl_dport *dport = NULL;
port = find_cxl_port(&pdev->dev, &dport);
- if (!port) {
+ if (!port || !is_cxl_port(&port->dev)) {
pci_err(pdev, "Failed to find root/dport in CXL topology\n");
return NULL;
}
@@ -848,7 +849,7 @@ static void cxl_port_cor_error_detected(struct pci_dev *pdev)
struct device *dev;
void __iomem *ras_base = cxl_pci_port_ras(pdev, &dev);
- __cxl_handle_cor_ras(dev, ras_base);
+ __cxl_handle_cor_ras(dev, &pdev->dev, ras_base);
}
static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev)
@@ -856,7 +857,7 @@ static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev)
struct device *dev;
void __iomem *ras_base = cxl_pci_port_ras(pdev, &dev);
- return __cxl_handle_ras(dev, ras_base);
+ return __cxl_handle_ras(dev, &pdev->dev, ras_base);
}
void cxl_uport_init_ras_reporting(struct cxl_port *port)
@@ -909,13 +910,13 @@ EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds,
struct cxl_dport *dport)
{
- return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+ return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, NULL, dport->regs.ras);
}
static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds,
struct cxl_dport *dport)
{
- return __cxl_handle_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+ return __cxl_handle_ras(&cxlds->cxlmd->dev, NULL, dport->regs.ras);
}
/*
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index b536233ac210..a74803f4aa22 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -49,18 +49,22 @@
)
TRACE_EVENT(cxl_port_aer_uncorrectable_error,
- TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
- TP_ARGS(dev, status, fe, hl),
+ TP_PROTO(struct device *cxl_dev, struct device *pcie_dev, u32 status, u32 fe, u32 *hl),
+ TP_ARGS(cxl_dev, pcie_dev, status, fe, hl),
TP_STRUCT__entry(
- __string(devname, dev_name(dev))
- __string(parent, dev_name(dev->parent))
+ __string(cxl_name, dev_name(cxl_dev))
+ __string(cxl_parent_name, dev_name(cxl_dev->parent))
+ __string(pcie_name, dev_name(pcie_dev))
+ __string(pcie_parent_name, dev_name(pcie_dev->parent))
__field(u32, status)
__field(u32, first_error)
__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
),
TP_fast_assign(
- __assign_str(devname);
- __assign_str(parent);
+ __assign_str(cxl_name);
+ __assign_str(cxl_parent_name);
+ __assign_str(pcie_name);
+ __assign_str(pcie_parent_name);
__entry->status = status;
__entry->first_error = fe;
/*
@@ -69,8 +73,9 @@ TRACE_EVENT(cxl_port_aer_uncorrectable_error,
*/
memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
),
- TP_printk("device=%s parent=%s status: '%s' first_error: '%s'",
- __get_str(devname), __get_str(parent),
+ TP_printk("device=%s (%s) parent=%s (%s) status: '%s' first_error: '%s'",
+ __get_str(cxl_name), __get_str(pcie_name),
+ __get_str(cxl_parent_name), __get_str(pcie_parent_name),
show_uc_errs(__entry->status),
show_uc_errs(__entry->first_error)
)
@@ -125,20 +130,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
)
TRACE_EVENT(cxl_port_aer_correctable_error,
- TP_PROTO(struct device *dev, u32 status),
- TP_ARGS(dev, status),
+ TP_PROTO(struct device *cxl_dev, struct device *pcie_dev, u32 status),
+ TP_ARGS(cxl_dev, pcie_dev, status),
TP_STRUCT__entry(
- __string(devname, dev_name(dev))
- __string(parent, dev_name(dev->parent))
+ __string(cxl_name, dev_name(cxl_dev))
+ __string(cxl_parent_name, dev_name(cxl_dev->parent))
+ __string(pcie_name, dev_name(pcie_dev))
+ __string(pcie_parent_name, dev_name(pcie_dev->parent))
__field(u32, status)
),
TP_fast_assign(
- __assign_str(devname);
- __assign_str(parent);
+ __assign_str(cxl_name);
+ __assign_str(cxl_parent_name);
+ __assign_str(pcie_name);
+ __assign_str(pcie_parent_name);
__entry->status = status;
),
- TP_printk("device=%s parent=%s status='%s'",
- __get_str(devname), __get_str(parent),
+ TP_printk("device=%s (%s) parent=%s (%s) status='%s'",
+ __get_str(cxl_name), __get_str(pcie_name),
+ __get_str(cxl_parent_name), __get_str(pcie_parent_name),
show_ce_errs(__entry->status)
)
);
--
2.34.1
Powered by blists - more mailing lists