lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20221021185615.605233-6-terry.bowman@amd.com>
Date:   Fri, 21 Oct 2022 13:56:15 -0500
From:   Terry Bowman <terry.bowman@....com>
To:     <alison.schofield@...el.com>, <vishal.l.verma@...el.com>,
        <dave.jiang@...el.com>, <ira.weiny@...el.com>,
        <bwidawsk@...nel.org>, <dan.j.williams@...el.com>
CC:     <terry.bowman@....com>, <linux-cxl@...r.kernel.org>,
        <linux-kernel@...r.kernel.org>, <bhelgaas@...gle.com>,
        <rafael@...nel.org>, <lenb@...nel.org>,
        <Jonathan.Cameron@...wei.com>, <dave@...olabs.net>,
        <rrichter@....com>
Subject: [PATCH 5/5] cxl/pci: Log CXL device's PCIe AER and CXL RAS error information

The CXL downport PCIe AER and CXL RAS capability information needs to be
logged during PCIe AER error handling.

The existing PCIe AER error handler logs native AER errors but does not
log the upport/downport AER capability residing in the RCRB. The CXL 1.1
RCRB does not have a BDF and is not enumerable. The existing error handler
logic does not display CXL RAS details either.

Add a CXL error handler to the existing PCI error handlers. Add a call
to the CXL error handler within the PCIe AER error handler. Implement the
driver's CXL callback to log downport PCIe AER and CXL RAS capability
information.

Signed-off-by: Terry Bowman <terry.bowman@....com>
---
 drivers/cxl/pci.c      | 76 ++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pcie/aer.c | 45 ++++++++++++++++++++++++-
 include/linux/pci.h    |  4 +++
 3 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 80a01b304efe..dceda9f9fc60 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -7,6 +7,7 @@
 #include <linux/sizes.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
+#include <linux/aer.h>
 #include <linux/pci.h>
 #include <linux/pci-doe.h>
 #include <linux/io.h>
@@ -14,6 +15,9 @@
 #include "cxlpci.h"
 #include "cxl.h"
 
+extern void cxl_print_aer(struct pci_dev *dev, int aer_severity,
+			  struct aer_capability_regs *aer);
+
 /**
  * DOC: cxl pci
  *
@@ -744,9 +748,80 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM))
 		rc = devm_cxl_add_nvdimm(&pdev->dev, cxlmd);
 
+	dev_set_drvdata(&pdev->dev, cxlmd);
+
 	return rc;
 }
 
+struct ras_cap {
+	u32 uc_error_status;
+	u32 uc_error_mask;
+	u32 uc_error_severity;
+	u32 c_error_status;
+	u32 c_error_mask;
+	u32 ctrl;
+	u32 log[];
+};
+
+/*
+ * Log the state of the CXL downport AER and RAS status registers.
+ */
+static void cxl_error_report(struct cxl_memdev *cxlmd)
+{
+	struct pci_dev *pdev = to_pci_dev(cxlmd->cxlds->dev);
+	struct aer_capability_regs *aer_cap;
+	struct ras_cap *ras_cap;
+
+	aer_cap = (struct aer_capability_regs *)cxlmd->cxlds->aer_map.base;
+	ras_cap = (struct ras_cap *)cxlmd->cxlds->ras_map.base;
+
+	pci_err(pdev, "CXL Error Report\n");
+	pci_err(pdev, "AER Errors:\n");
+	if (aer_cap) {
+		cxl_print_aer(pdev, AER_CORRECTABLE, aer_cap);
+		cxl_print_aer(pdev, AER_NONFATAL, aer_cap);
+		cxl_print_aer(pdev, AER_FATAL, aer_cap);
+	}
+
+	pci_err(pdev, "RAS Errors:\n");
+	if (ras_cap) {
+		pci_err(pdev, "RAS: uc_error_status = %X\n", readl(&ras_cap->uc_error_status));
+		pci_err(pdev, "RAS: uc_error_mask = %X\n", readl(&ras_cap->uc_error_mask));
+		pci_err(pdev, "RAS: uc_error_severity = %X\n", readl(&ras_cap->uc_error_severity));
+		pci_err(pdev, "RAS: c_error_status = %X\n", readl(&ras_cap->c_error_status));
+		pci_err(pdev, "RAS: c_error_mask = %X\n", readl(&ras_cap->c_error_mask));
+		pci_err(pdev, "RAS: ras_caps->ctrl = %X\n", readl(&ras_cap->ctrl));
+		pci_err(pdev, "RAS: log = %X\n", readl(&ras_cap->log));
+	}
+}
+
+static void cxl_error_detected(struct pci_dev *pdev)
+{
+	struct cxl_memdev *cxlmd;
+
+	if (!is_cxl_memdev(&pdev->dev)) {
+		pci_err(pdev, "CXL memory device is invalid\n");
+		return;
+	}
+
+	cxlmd = dev_get_drvdata(&pdev->dev);
+	if (!cxlmd) {
+		pci_err(pdev, "CXL memory device is NULL\n");
+		return;
+	}
+
+	if (!cxlmd->cxlds) {
+		pci_err(pdev, "CXL device state object is NULL\n");
+		return;
+	}
+
+	cxl_error_report(cxlmd);
+}
+
+static struct pci_error_handlers cxl_error_handlers = {
+	.cxl_error_detected = cxl_error_detected,
+};
+
 static const struct pci_device_id cxl_mem_pci_tbl[] = {
 	/* PCI class code for CXL.mem Type-3 Devices */
 	{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
@@ -761,6 +836,7 @@ static struct pci_driver cxl_pci_driver = {
 	.driver	= {
 		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
 	},
+	.err_handler		= &cxl_error_handlers,
 };
 
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e2d8a74f83c3..dea04d412406 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -811,6 +811,13 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
 }
 #endif
 
+void cxl_print_aer(struct pci_dev *dev, int aer_severity,
+		    struct aer_capability_regs *aer)
+{
+	cper_print_aer(dev, aer_severity, aer);
+}
+EXPORT_SYMBOL_GPL(cxl_print_aer);
+
 /**
  * add_error_device - list device to be handled
  * @e_info: pointer to error info
@@ -1169,6 +1176,40 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
 	}
 }
 
+static int report_cxl_errors_iter(struct pci_dev *pdev, void *data)
+{
+	struct pci_driver *pdrv = pdev->driver;
+
+	if (pdrv &&
+	    pdrv->err_handler &&
+	    pdrv->err_handler->cxl_error_detected)
+		pdrv->err_handler->cxl_error_detected(pdev);
+
+	return 0;
+}
+
+static void report_cxl_errors(struct aer_rpc *rpc,
+			      struct aer_err_source *e_src)
+{
+	struct pci_dev *pdev = rpc->rpd;
+	struct aer_err_info e_info;
+	u32 uncor_status, cor_status;
+
+	pci_read_config_dword(pdev, pdev->aer_cap + PCI_ERR_UNCOR_STATUS, &uncor_status);
+	pci_read_config_dword(pdev, pdev->aer_cap + PCI_ERR_COR_STATUS, &cor_status);
+
+	if (!uncor_status && !cor_status)
+		return;
+
+	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_EC)
+		pcie_walk_rcec(pdev, report_cxl_errors_iter, &e_info);
+	else
+		pci_walk_bus(pdev->subordinate, report_cxl_errors_iter, &e_info);
+
+	pci_write_config_dword(pdev, pdev->aer_cap + PCI_ERR_UNCOR_STATUS, uncor_status);
+	pci_write_config_dword(pdev, pdev->aer_cap + PCI_ERR_COR_STATUS, cor_status);
+}
+
 /**
  * aer_isr - consume errors detected by root port
  * @irq: IRQ assigned to Root Port
@@ -1185,8 +1226,10 @@ static irqreturn_t aer_isr(int irq, void *context)
 	if (kfifo_is_empty(&rpc->aer_fifo))
 		return IRQ_NONE;
 
-	while (kfifo_get(&rpc->aer_fifo, &e_src))
+	while (kfifo_get(&rpc->aer_fifo, &e_src)) {
+		report_cxl_errors(rpc, &e_src);
 		aer_isr_one_error(rpc, &e_src);
+	}
 	return IRQ_HANDLED;
 }
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 2bda4a4e47e8..4f4b3a8f5454 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -827,6 +827,10 @@ enum pci_ers_result {
 
 /* PCI bus error event callbacks */
 struct pci_error_handlers {
+
+	/* CXL error detected on this device */
+	void (*cxl_error_detected)(struct pci_dev *dev);
+
 	/* PCI bus error detected on this device */
 	pci_ers_result_t (*error_detected)(struct pci_dev *dev,
 					   pci_channel_state_t error);
-- 
2.34.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ