lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250603172239.159260-5-terry.bowman@amd.com>
Date: Tue, 3 Jun 2025 12:22:27 -0500
From: Terry Bowman <terry.bowman@....com>
To: <PradeepVineshReddy.Kodamati@....com>, <dave@...olabs.net>,
	<jonathan.cameron@...wei.com>, <dave.jiang@...el.com>,
	<alison.schofield@...el.com>, <vishal.l.verma@...el.com>,
	<ira.weiny@...el.com>, <dan.j.williams@...el.com>, <bhelgaas@...gle.com>,
	<bp@...en8.de>, <ming.li@...omail.com>, <shiju.jose@...wei.com>,
	<dan.carpenter@...aro.org>, <Smita.KoralahalliChannabasappa@....com>,
	<kobayashi.da-06@...itsu.com>, <terry.bowman@....com>, <yanfei.xu@...el.com>,
	<rrichter@....com>, <peterz@...radead.org>, <colyli@...e.de>,
	<uaisheng.ye@...el.com>, <fabio.m.de.francesco@...ux.intel.com>,
	<ilpo.jarvinen@...ux.intel.com>, <yazen.ghannam@....com>,
	<linux-cxl@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
	<linux-pci@...r.kernel.org>
Subject: [PATCH v9 04/16] PCI/AER: Dequeue forwarded CXL error

The AER driver is now designed to forward CXL protocol errors to the CXL
driver. Update the CXL driver with functionality to dequeue the forwarded
CXL error from the kfifo. Also, update the CXL driver to begin protocol
error handling using the work received from the FIFO.

Introduce function cxl_prot_err_work_fn() to dequeue work forwarded by the
AER service driver. This will begin the CXL protocol error processing
with a call to cxl_handle_prot_error().

Update cxl/core/ras.c by adding cxl_rch_handle_error_iter() that was
previously in the AER driver.

Introduce sbdf_to_pci() to take the SBDF values from 'struct cxl_prot_error_info'
and use them to discover the erring PCI device. Use scope-based reference
increments/decrements for the discovered PCI device and the associated
CXL device.

Implement cxl_handle_prot_error() to differentiate between Restricted CXL
Host (RCH) protocol errors and CXL virtual host (VH) protocol errors.
RCH errors will be processed with a call to walk the associated Root
Complex Event Collector's (RCEC) secondary bus looking for the Root Complex
Integrated Endpoint (RCiEP) to handle the RCH error. Export pcie_walk_rcec()
so the CXL driver can walk the RCEC's downstream bus, searching for
the RCiEP.

VH correctable error (CE) processing will call the CXL CE handler. VH
uncorrectable errors (UCE) will call cxl_do_recovery(), implemented as a
stub for now and to be updated in a future patch. Export
pci_aer_clear_fatal_status() and pcie_clear_device_status(), used to clean
up AER status after handling.

Signed-off-by: Terry Bowman <terry.bowman@....com>
---
 drivers/cxl/core/ras.c  | 92 +++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.c       |  1 +
 drivers/pci/pci.h       |  8 ----
 drivers/pci/pcie/aer.c  |  1 +
 drivers/pci/pcie/rcec.c |  1 +
 include/linux/aer.h     |  2 +
 include/linux/pci.h     | 10 +++++
 7 files changed, 107 insertions(+), 8 deletions(-)

diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index d35525e79e04..9ed5c682e128 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -110,8 +110,152 @@ static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
 
 #ifdef CONFIG_PCIEAER_CXL
 
+/*
+ * Stub for uncorrectable CXL protocol error recovery. It is empty for
+ * now and is to be filled in by a future patch.
+ */
+static void cxl_do_recovery(struct pci_dev *pdev)
+{
+}
+
+/*
+ * pcie_walk_rcec() callback that handles a Restricted CXL Host (RCH)
+ * protocol error on the CXL memory device found during the walk.
+ * @pdev: device visited by the walk
+ * @data: the struct cxl_prot_error_info describing the error
+ *
+ * Returns 0 when @pdev is skipped (not Device 0, Function 0, not a CXL
+ * memory class device, or not a bound CXL memdev) and 1 once the error
+ * has been dispatched for @pdev.
+ */
+static int cxl_rch_handle_error_iter(struct pci_dev *pdev, void *data)
+{
+	struct cxl_prot_error_info *err_info = data;
+	/* Hold a reference on @pdev for the duration of this call. */
+	struct pci_dev *pdev_ref __free(pci_dev_put) = pci_dev_get(pdev);
+	struct cxl_dev_state *cxlds;
+
+	/*
+	 * The capability, status, and control fields in Device 0,
+	 * Function 0 DVSEC control the CXL functionality of the
+	 * entire device (CXL 3.0, 8.1.3).
+	 */
+	if (pdev->devfn != PCI_DEVFN(0, 0))
+		return 0;
+
+	/*
+	 * CXL Memory Devices must have the 502h class code set (CXL
+	 * 3.0, 8.1.12.1).
+	 */
+	if ((pdev->class >> 8) != PCI_CLASS_MEMORY_CXL)
+		return 0;
+
+	/*
+	 * NOTE(review): is_cxl_memdev() is passed the PCI function's
+	 * struct device here; confirm it is expected to match a PCI
+	 * device rather than only the cxl_memdev child device.
+	 */
+	if (!is_cxl_memdev(&pdev->dev) || !pdev->dev.driver)
+		return 0;
+
+	cxlds = pci_get_drvdata(pdev);
+	/* Hold a reference on the CXL memdev until this function returns. */
+	struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
+
+	if (err_info->severity == AER_CORRECTABLE)
+		cxl_cor_error_detected(pdev);
+	else
+		cxl_do_recovery(pdev);
+
+	return 1;
+}
+
+/*
+ * Look up the PCI device named by the segment/bus/device/function in
+ * @err_info.
+ *
+ * Returns the device with a reference held, or NULL if it was not found.
+ * The caller owns the reference and must drop it with pci_dev_put().
+ */
+static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info)
+{
+	unsigned int devfn = PCI_DEVFN(err_info->device,
+				       err_info->function);
+
+	/*
+	 * Do not use __free(pci_dev_put) on a pointer that is returned: the
+	 * cleanup would drop the lookup's reference on the way out and hand
+	 * the caller an unreferenced device.
+	 */
+	return pci_get_domain_bus_and_slot(err_info->segment,
+					   err_info->bus,
+					   devfn);
+}
+
+/*
+ * Handle one CXL protocol error forwarded by the AER driver.
+ * @err_info: identifies the erring device and the error severity
+ *
+ * An RCEC is handled by walking it with pcie_walk_rcec() and
+ * cxl_rch_handle_error_iter(). For any other device, a correctable
+ * error is passed to cxl_cor_error_detected() and anything else to
+ * cxl_do_recovery().
+ */
+static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
+{
+	/* sbdf_to_pci() returns a referenced device; drop it at scope exit. */
+	struct pci_dev *pdev __free(pci_dev_put) = sbdf_to_pci(err_info);
+
+	if (!pdev) {
+		pr_err("Failed to find the CXL device\n");
+		return;
+	}
+
+	/*
+	 * Internal errors of an RCEC indicate an AER error in an
+	 * RCH's downstream port. Check and handle them in the CXL.mem
+	 * device driver.
+	 */
+	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_EC) {
+		pcie_walk_rcec(pdev, cxl_rch_handle_error_iter, err_info);
+		return;
+	}
+
+	if (err_info->severity == AER_CORRECTABLE) {
+		int aer = pdev->aer_cap;
+		struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+
+		/* Bail out rather than dereference NULL driver data. */
+		if (!cxlds) {
+			pr_err("No CXL driver data for the erring device\n");
+			return;
+		}
+
+		struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
+
+		if (aer)
+			pci_clear_and_set_config_dword(pdev,
+						       aer + PCI_ERR_COR_STATUS,
+						       0, PCI_ERR_COR_INTERNAL);
+
+		cxl_cor_error_detected(pdev);
+
+		pcie_clear_device_status(pdev);
+	} else {
+		cxl_do_recovery(pdev);
+	}
+}
+
+/*
+ * Work function that drains the CXL protocol error kfifo, handing each
+ * dequeued entry to cxl_handle_prot_error().
+ */
 static void cxl_prot_err_work_fn(struct work_struct *work)
 {
+	struct cxl_prot_err_work_data wd;
+
+	while (cxl_prot_err_kfifo_get(&wd))
+		cxl_handle_prot_error(&wd.err_info);
 }
 
 #else
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e77d5b53c0ce..524ac32b744a 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2328,6 +2328,7 @@ void pcie_clear_device_status(struct pci_dev *dev)
 	pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta);
 	pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta);
 }
+EXPORT_SYMBOL_NS_GPL(pcie_clear_device_status, "CXL");
 #endif
 
 /**
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index d6296500b004..3c54a5ed803e 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -649,16 +649,10 @@ static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; }
 void pci_rcec_init(struct pci_dev *dev);
 void pci_rcec_exit(struct pci_dev *dev);
 void pcie_link_rcec(struct pci_dev *rcec);
-void pcie_walk_rcec(struct pci_dev *rcec,
-		    int (*cb)(struct pci_dev *, void *),
-		    void *userdata);
 #else
 static inline void pci_rcec_init(struct pci_dev *dev) { }
 static inline void pci_rcec_exit(struct pci_dev *dev) { }
 static inline void pcie_link_rcec(struct pci_dev *rcec) { }
-static inline void pcie_walk_rcec(struct pci_dev *rcec,
-				  int (*cb)(struct pci_dev *, void *),
-				  void *userdata) { }
 #endif
 
 #ifdef CONFIG_PCI_ATS
@@ -967,7 +961,6 @@ void pci_no_aer(void);
 void pci_aer_init(struct pci_dev *dev);
 void pci_aer_exit(struct pci_dev *dev);
 extern const struct attribute_group aer_stats_attr_group;
-void pci_aer_clear_fatal_status(struct pci_dev *dev);
 int pci_aer_clear_status(struct pci_dev *dev);
 int pci_aer_raw_clear_status(struct pci_dev *dev);
 void pci_save_aer_state(struct pci_dev *dev);
@@ -976,7 +969,6 @@ void pci_restore_aer_state(struct pci_dev *dev);
 static inline void pci_no_aer(void) { }
 static inline void pci_aer_init(struct pci_dev *d) { }
 static inline void pci_aer_exit(struct pci_dev *d) { }
-static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
 static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; }
 static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
 static inline void pci_save_aer_state(struct pci_dev *dev) { }
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 5350fa5be784..6e88331c6303 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -290,6 +290,7 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev)
 	if (status)
 		pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status);
 }
+EXPORT_SYMBOL_GPL(pci_aer_clear_fatal_status);
 
 /**
  * pci_aer_raw_clear_status - Clear AER error registers.
diff --git a/drivers/pci/pcie/rcec.c b/drivers/pci/pcie/rcec.c
index d0bcd141ac9c..fb6cf6449a1d 100644
--- a/drivers/pci/pcie/rcec.c
+++ b/drivers/pci/pcie/rcec.c
@@ -145,6 +145,7 @@ void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *),
 
 	walk_rcec(walk_rcec_helper, &rcec_data);
 }
+EXPORT_SYMBOL_NS_GPL(pcie_walk_rcec, "CXL");
 
 void pci_rcec_init(struct pci_dev *dev)
 {
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 550407240ab5..c9a18eca16f8 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -77,12 +77,14 @@ struct cxl_prot_err_work_data {
 
 #if defined(CONFIG_PCIEAER)
 int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
+void pci_aer_clear_fatal_status(struct pci_dev *dev);
 int pcie_aer_is_native(struct pci_dev *dev);
 #else
 static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
 {
 	return -EINVAL;
 }
+static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
 static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
 #endif
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index bff3009f9ff0..cd53715d53f3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1806,6 +1806,9 @@ extern bool pcie_ports_native;
 
 int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
 			  bool use_lt);
+void pcie_walk_rcec(struct pci_dev *rcec,
+		    int (*cb)(struct pci_dev *, void *),
+		    void *userdata);
 #else
 #define pcie_ports_disabled	true
 #define pcie_ports_native	false
@@ -1816,8 +1819,15 @@ static inline int pcie_set_target_speed(struct pci_dev *port,
 {
 	return -EOPNOTSUPP;
 }
+
+static inline void pcie_walk_rcec(struct pci_dev *rcec,
+				  int (*cb)(struct pci_dev *, void *),
+				  void *userdata) { }
+
 #endif
 
+void pcie_clear_device_status(struct pci_dev *dev);
+
 #define PCIE_LINK_STATE_L0S		(BIT(0) | BIT(1)) /* Upstr/dwnstr L0s */
 #define PCIE_LINK_STATE_L1		BIT(2)	/* L1 state */
 #define PCIE_LINK_STATE_L1_1		BIT(3)	/* ASPM L1.1 state */
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ