[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260114182055.46029-23-terry.bowman@amd.com>
Date: Wed, 14 Jan 2026 12:20:43 -0600
From: Terry Bowman <terry.bowman@....com>
To: <dave@...olabs.net>, <jonathan.cameron@...wei.com>,
<dave.jiang@...el.com>, <alison.schofield@...el.com>,
<dan.j.williams@...el.com>, <bhelgaas@...gle.com>, <shiju.jose@...wei.com>,
<ming.li@...omail.com>, <Smita.KoralahalliChannabasappa@....com>,
<rrichter@....com>, <dan.carpenter@...aro.org>,
<PradeepVineshReddy.Kodamati@....com>, <lukas@...ner.de>,
<Benjamin.Cheatham@....com>, <sathyanarayanan.kuppuswamy@...ux.intel.com>,
<linux-cxl@...r.kernel.org>, <vishal.l.verma@...el.com>, <alucerop@....com>,
<ira.weiny@...el.com>
CC: <linux-kernel@...r.kernel.org>, <linux-pci@...r.kernel.org>,
<terry.bowman@....com>
Subject: [PATCH v14 22/34] cxl: Update CXL Endpoint tracing
CXL protocol error handling will be expanded to soon include CXL Port
support along with existing Endpoint support. 2 updates are needed first:
- Update calling interfaces to use 'struct device*'
- Log serial number
Add serial number parameter to the trace logging. This is used for EPs
and 0 is provided for CXL port devices without a serial number.
Leave the correctable and uncorrectable trace routines' TP_STRUCT__entry()
unchanged with respect to member data types and order.
Below is output of correctable and uncorrectable protocol error logging.
CXL Root Port and CXL Endpoint examples are included below.
Root Port:
cxl_aer_correctable_error: device=0000:0c:00.0 host=pci0000:0c serial: 0 status='CRC Threshold Hit'
cxl_aer_uncorrectable_error: device=0000:0c:00.0 host=pci0000:0c serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
Endpoint:
cxl_aer_correctable_error: memdev=mem3 host=0000:0f:00.0 serial=0 status='CRC Threshold Hit'
cxl_aer_uncorrectable_error: memdev=mem3 host=0000:0f:00.0 serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
Signed-off-by: Terry Bowman <terry.bowman@....com>
Reviewed-by: Shiju Jose <shiju.jose@...wei.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@...wei.com>
Reviewed-by: Dave Jiang <dave.jiang@...el.com>
---
Changes in v13->v14:
- Update commit headline (Bjorn)
Changes in v12->v13:
- Added Dave Jiang's review-by
Changes in v11 -> v12:
- Correct parameters to call trace_cxl_aer_correctable_error()
- Add reviewed-by for Jonathan and Shiju
Changes in v10->v11:
- Updated CE and UCE trace routines to maintain consistent TP_Struct ABI
and unchanged TP_printk() logging.
---
drivers/cxl/core/core.h | 4 ++--
drivers/cxl/core/ras.c | 35 ++++++++++++++++++++---------------
drivers/cxl/core/ras_rch.c | 4 ++--
drivers/cxl/core/trace.h | 25 +++++++++++++------------
4 files changed, 37 insertions(+), 31 deletions(-)
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 422531799af2..306762a15dc0 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -147,8 +147,8 @@ int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
#ifdef CONFIG_CXL_RAS
int cxl_ras_init(void);
void cxl_ras_exit(void);
-bool cxl_handle_ras(struct device *dev, void __iomem *ras_base);
-void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base);
+bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base);
+void cxl_handle_cor_ras(struct device *dev, u64 serial, void __iomem *ras_base);
void cxl_dport_map_rch_aer(struct cxl_dport *dport);
void cxl_disable_rch_root_ints(struct cxl_dport *dport);
void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds);
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index d71fcac31cf2..84abcf90fa99 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -13,7 +13,7 @@ static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
{
u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
- trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+ trace_cxl_aer_correctable_error(&pdev->dev, status, 0);
}
static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
@@ -28,8 +28,8 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
else
fe = status;
- trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
- ras_cap.header_log);
+ trace_cxl_aer_uncorrectable_error(&pdev->dev, status, fe,
+ ras_cap.header_log, 0);
}
static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
@@ -37,7 +37,7 @@ static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
{
u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
- trace_cxl_aer_correctable_error(cxlmd, status);
+ trace_cxl_aer_correctable_error(&cxlmd->dev, status, cxlmd->cxlds->serial);
}
static void
@@ -45,6 +45,7 @@ cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
struct cxl_ras_capability_regs ras_cap)
{
u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
u32 fe;
if (hweight32(status) > 1)
@@ -53,8 +54,9 @@ cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
else
fe = status;
- trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
- ras_cap.header_log);
+ trace_cxl_aer_uncorrectable_error(&cxlmd->dev, status, fe,
+ ras_cap.header_log,
+ cxlds->serial);
}
static int match_memdev_by_parent(struct device *dev, const void *uport)
@@ -160,7 +162,7 @@ void devm_cxl_dport_ras_setup(struct cxl_dport *dport)
}
EXPORT_SYMBOL_NS_GPL(devm_cxl_dport_ras_setup, "CXL");
-void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base)
+void cxl_handle_cor_ras(struct device *dev, u64 serial, void __iomem *ras_base)
{
void __iomem *addr;
u32 status;
@@ -170,10 +172,11 @@ void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base)
addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
status = readl(addr);
- if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
- writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
- trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status);
- }
+ if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK))
+ return;
+ writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
+
+ trace_cxl_aer_correctable_error(dev, status, serial);
}
/* CXL spec rev3.0 8.2.4.16.1 */
@@ -197,7 +200,7 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
* Log the state of the RAS status registers and prepare them to log the
* next error status. Return 1 if reset needed.
*/
-bool cxl_handle_ras(struct device *dev, void __iomem *ras_base)
+bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base)
{
u32 hl[CXL_HEADERLOG_SIZE_U32];
void __iomem *addr;
@@ -224,7 +227,7 @@ bool cxl_handle_ras(struct device *dev, void __iomem *ras_base)
}
header_log_copy(ras_base, hl);
- trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl);
+ trace_cxl_aer_uncorrectable_error(dev, status, fe, hl, serial);
writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
return true;
@@ -246,7 +249,8 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
if (cxlds->rcd)
cxl_handle_rdport_errors(cxlds);
- cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+ cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial,
+ cxlds->regs.ras);
}
}
EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
@@ -275,7 +279,8 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
* chance the situation is recoverable dump the status of the RAS
* capability registers and bounce the active state of the memdev.
*/
- ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+ ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial,
+ cxlds->regs.ras);
}
diff --git a/drivers/cxl/core/ras_rch.c b/drivers/cxl/core/ras_rch.c
index 0a8b3b9b6388..3e33374e07f2 100644
--- a/drivers/cxl/core/ras_rch.c
+++ b/drivers/cxl/core/ras_rch.c
@@ -115,7 +115,7 @@ void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
pci_print_aer(pdev, severity, &aer_regs);
if (severity == AER_CORRECTABLE)
- cxl_handle_cor_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+ cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, dport->regs.ras);
else
- cxl_handle_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+ cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, dport->regs.ras);
}
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index a972e4ef1936..c569d92b6000 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -77,11 +77,12 @@ TRACE_EVENT(cxl_port_aer_uncorrectable_error,
);
TRACE_EVENT(cxl_aer_uncorrectable_error,
- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
- TP_ARGS(cxlmd, status, fe, hl),
+ TP_PROTO(const struct device *cxlmd, u32 status, u32 fe, u32 *hl,
+ u64 serial),
+ TP_ARGS(cxlmd, status, fe, hl, serial),
TP_STRUCT__entry(
- __string(memdev, dev_name(&cxlmd->dev))
- __string(host, dev_name(cxlmd->dev.parent))
+ __string(memdev, dev_name(cxlmd))
+ __string(host, dev_name(cxlmd->parent))
__field(u64, serial)
__field(u32, status)
__field(u32, first_error)
@@ -90,7 +91,7 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
TP_fast_assign(
__assign_str(memdev);
__assign_str(host);
- __entry->serial = cxlmd->cxlds->serial;
+ __entry->serial = serial;
__entry->status = status;
__entry->first_error = fe;
/*
@@ -138,24 +139,24 @@ TRACE_EVENT(cxl_port_aer_correctable_error,
__entry->status = status;
),
TP_printk("device=%s host=%s status='%s'",
- __get_str(device), __get_str(host),
- show_ce_errs(__entry->status)
+ __get_str(device), __get_str(host),
+ show_ce_errs(__entry->status)
)
);
TRACE_EVENT(cxl_aer_correctable_error,
- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
- TP_ARGS(cxlmd, status),
+ TP_PROTO(const struct device *cxlmd, u32 status, u64 serial),
+ TP_ARGS(cxlmd, status, serial),
TP_STRUCT__entry(
- __string(memdev, dev_name(&cxlmd->dev))
- __string(host, dev_name(cxlmd->dev.parent))
+ __string(memdev, dev_name(cxlmd))
+ __string(host, dev_name(cxlmd->parent))
__field(u64, serial)
__field(u32, status)
),
TP_fast_assign(
__assign_str(memdev);
__assign_str(host);
- __entry->serial = cxlmd->cxlds->serial;
+ __entry->serial = serial;
__entry->status = status;
),
TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
--
2.34.1
Powered by blists - more mailing lists