[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250925223440.3539069-19-terry.bowman@amd.com>
Date: Thu, 25 Sep 2025 17:34:33 -0500
From: Terry Bowman <terry.bowman@....com>
To: <dave@...olabs.net>, <jonathan.cameron@...wei.com>,
<dave.jiang@...el.com>, <alison.schofield@...el.com>,
<dan.j.williams@...el.com>, <bhelgaas@...gle.com>, <shiju.jose@...wei.com>,
<ming.li@...omail.com>, <Smita.KoralahalliChannabasappa@....com>,
<rrichter@....com>, <dan.carpenter@...aro.org>,
<PradeepVineshReddy.Kodamati@....com>, <lukas@...ner.de>,
<Benjamin.Cheatham@....com>, <sathyanarayanan.kuppuswamy@...ux.intel.com>,
<linux-cxl@...r.kernel.org>, <alucerop@....com>, <ira.weiny@...el.com>
CC: <linux-kernel@...r.kernel.org>, <linux-pci@...r.kernel.org>,
<terry.bowman@....com>
Subject: [PATCH v12 18/25] CXL/AER: Introduce aer_cxl_vh.c in AER driver for forwarding CXL errors
CXL virtual hierarchy (VH) RAS handling for CXL Port devices will be added
soon. This requires a notification mechanism for the AER driver to share
the AER interrupt with the CXL driver. The notification will be used as an
indication for the CXL drivers to handle and log the CXL RAS errors.
Note, 'CXL protocol error' terminology will refer to CXL VH and not
CXL RCH errors unless specifically noted going forward.
Introduce a new file in the AER driver to handle the CXL protocol errors
named pci/pcie/aer_cxl_vh.c.
Add a kfifo work queue to be used by the AER and CXL drivers. The AER
driver will be the sole kfifo producer adding work and the cxl_core will be
the sole kfifo consumer removing work. Add the boilerplate kfifo support.
Encapsulate the kfifo, RW semaphore, and work pointer in a single structure.
Add CXL work queue handler registration functions in the AER driver. Export
the functions allowing CXL driver to access. Implement registration
functions for the CXL driver to assign or clear the work handler function.
Synchronize accesses using the RW semaphore.
Introduce 'struct cxl_proto_err_work_data' to serve as the kfifo work data.
This will contain a reference to the erring PCI device and the error
severity. This will be used when the work is dequeued by the cxl_core driver.
Signed-off-by: Terry Bowman <terry.bowman@....com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@...wei.com>
---
Changes in v11->v12:
- Rename drivers/pci/pcie/cxl_aer.c to drivers/pci/pcie/aer_cxl_vh.c (Lukas)
Changes in v10->v11:
- cxl_error_detected() - Change handlers' scoped_guard() to guard() (Jonathan)
- cxl_error_detected() - Remove extra line (Shiju)
- Changes moved to core/ras.c (Terry)
- cxl_error_detected(), remove 'ue' and return with function call. (Jonathan)
- Remove extra space in documentation for PCI_ERS_RESULT_PANIC definition
- Move #include "pci.h from cxl.h to core.h (Terry)
- Remove unnecessary includes of cxl.h and core.h in mem.c (Terry)
---
drivers/pci/pci.h | 4 ++
drivers/pci/pcie/Makefile | 1 +
drivers/pci/pcie/aer.c | 25 ++-------
drivers/pci/pcie/aer_cxl_vh.c | 95 +++++++++++++++++++++++++++++++++++
include/linux/aer.h | 17 +++++++
5 files changed, 121 insertions(+), 21 deletions(-)
create mode 100644 drivers/pci/pcie/aer_cxl_vh.c
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index f7631f40e57c..22e8f9a18a09 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -1234,8 +1234,12 @@ static inline void cxl_rch_enable_rcec(struct pci_dev *rcec) { }
#ifdef CONFIG_CXL_RAS
bool is_internal_error(struct aer_err_info *info);
+bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info);
+void cxl_forward_error(struct pci_dev *pdev, struct aer_err_info *info);
#else
static inline bool is_internal_error(struct aer_err_info *info) { return false; }
+static inline bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info) { return false; }
+static inline void cxl_forward_error(struct pci_dev *pdev, struct aer_err_info *info) { }
#endif
#endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 970e7cbc5b34..72992b3ea417 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o bwctrl.o
obj-y += aspm.o
obj-$(CONFIG_PCIEAER) += aer.o err.o tlp.o
obj-$(CONFIG_CXL_RCH_RAS) += aer_cxl_rch.o
+obj-$(CONFIG_CXL_RAS) += aer_cxl_vh.o
obj-$(CONFIG_PCIEAER_INJECT) += aer_inject.o
obj-$(CONFIG_PCIE_PME) += pme.o
obj-$(CONFIG_PCIE_DPC) += dpc.o
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 6ba8f84add70..ccefbcfe5145 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1093,8 +1093,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
}
-#ifdef CONFIG_CXL_RAS
-
/**
* pci_aer_unmask_internal_errors - unmask internal errors
* @dev: pointer to the pci_dev data structure
@@ -1120,24 +1118,6 @@ void pci_aer_unmask_internal_errors(struct pci_dev *dev)
}
EXPORT_SYMBOL_GPL(pci_aer_unmask_internal_errors);
-bool cxl_error_is_native(struct pci_dev *dev)
-{
- struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
-
- return (pcie_ports_native || host->native_aer);
-}
-EXPORT_SYMBOL_NS_GPL(cxl_error_is_native, "CXL");
-
-bool is_internal_error(struct aer_err_info *info)
-{
- if (info->severity == AER_CORRECTABLE)
- return info->status & PCI_ERR_COR_INTERNAL;
-
- return info->status & PCI_ERR_UNC_INTN;
-}
-EXPORT_SYMBOL_NS_GPL(is_internal_error, "CXL");
-#endif /* CONFIG_CXL_RAS */
-
/**
* pci_aer_handle_error - handle logging error into an event log
* @dev: pointer to pci_dev data structure of error source device
@@ -1174,7 +1154,10 @@ static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
{
cxl_rch_handle_error(dev, info);
- pci_aer_handle_error(dev, info);
+ if (is_cxl_error(dev, info))
+ cxl_forward_error(dev, info);
+ else
+ pci_aer_handle_error(dev, info);
pci_dev_put(dev);
}
diff --git a/drivers/pci/pcie/aer_cxl_vh.c b/drivers/pci/pcie/aer_cxl_vh.c
new file mode 100644
index 000000000000..8c0979299446
--- /dev/null
+++ b/drivers/pci/pcie/aer_cxl_vh.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2025 AMD Corporation. All rights reserved. */
+
+#include <linux/pci.h>
+#include <linux/aer.h>
+#include <linux/pci.h>
+#include <linux/bitfield.h>
+#include <linux/kfifo.h>
+#include "../pci.h"
+
+#define CXL_ERROR_SOURCES_MAX 128
+
+struct cxl_proto_err_kfifo {
+ struct work_struct *work;
+ struct rw_semaphore rw_sema;
+ DECLARE_KFIFO(fifo, struct cxl_proto_err_work_data,
+ CXL_ERROR_SOURCES_MAX);
+};
+
+static struct cxl_proto_err_kfifo cxl_proto_err_kfifo = {
+ .rw_sema = __RWSEM_INITIALIZER(cxl_proto_err_kfifo.rw_sema)
+};
+
+bool cxl_error_is_native(struct pci_dev *dev)
+{
+ struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
+
+ return (pcie_ports_native || host->native_aer);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_error_is_native, "CXL");
+
+bool is_internal_error(struct aer_err_info *info)
+{
+ if (info->severity == AER_CORRECTABLE)
+ return info->status & PCI_ERR_COR_INTERNAL;
+
+ return info->status & PCI_ERR_UNC_INTN;
+}
+EXPORT_SYMBOL_NS_GPL(is_internal_error, "CXL");
+
+bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info)
+{
+ if (!info || !info->is_cxl)
+ return false;
+
+ if (pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT)
+ return false;
+
+ return is_internal_error(info);
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_error, "CXL");
+
+void cxl_forward_error(struct pci_dev *pdev, struct aer_err_info *info)
+{
+ struct cxl_proto_err_work_data wd = (struct cxl_proto_err_work_data) {
+ .severity = info->severity,
+ .pdev = pdev
+ };
+
+ guard(rwsem_write)(&cxl_proto_err_kfifo.rw_sema);
+
+ if (!cxl_proto_err_kfifo.work) {
+ dev_warn_once(&pdev->dev, "CXL driver is not registered for kfifo");
+ return;
+ }
+
+ if (!kfifo_put(&cxl_proto_err_kfifo.fifo, wd)) {
+ dev_err_ratelimited(&pdev->dev, "CXL kfifo overflow\n");
+ return;
+ }
+
+ schedule_work(cxl_proto_err_kfifo.work);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_forward_error, "CXL");
+
+void cxl_register_proto_err_work(struct work_struct *work)
+{
+ guard(rwsem_write)(&cxl_proto_err_kfifo.rw_sema);
+ cxl_proto_err_kfifo.work = work;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_register_proto_err_work, "CXL");
+
+void cxl_unregister_proto_err_work(void)
+{
+ guard(rwsem_write)(&cxl_proto_err_kfifo.rw_sema);
+ cxl_proto_err_kfifo.work = NULL;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_unregister_proto_err_work, "CXL");
+
+int cxl_proto_err_kfifo_get(struct cxl_proto_err_work_data *wd)
+{
+ guard(rwsem_read)(&cxl_proto_err_kfifo.rw_sema);
+ return kfifo_get(&cxl_proto_err_kfifo.fifo, wd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_proto_err_kfifo_get, "CXL");
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 2ef820563996..6b2c87d1b5b6 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -10,6 +10,7 @@
#include <linux/errno.h>
#include <linux/types.h>
+#include <linux/workqueue_types.h>
#define AER_NONFATAL 0
#define AER_FATAL 1
@@ -53,6 +54,16 @@ struct aer_capability_regs {
u16 uncor_err_source;
};
+/**
+ * struct cxl_proto_err_work_data - Error information used in CXL error handling
+ * @severity: AER severity
+ * @pdev: PCI device detecting the error
+ */
+struct cxl_proto_err_work_data {
+ int severity;
+ struct pci_dev *pdev;
+};
+
#if defined(CONFIG_PCIEAER)
int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
int pcie_aer_is_native(struct pci_dev *dev);
@@ -68,8 +79,14 @@ static inline void pci_aer_unmask_internal_errors(struct pci_dev *dev) { }
#ifdef CONFIG_CXL_RAS
bool cxl_error_is_native(struct pci_dev *dev);
+int cxl_proto_err_kfifo_get(struct cxl_proto_err_work_data *wd);
+void cxl_register_proto_err_work(struct work_struct *work);
+void cxl_unregister_proto_err_work(void);
#else
static inline bool cxl_error_is_native(struct pci_dev *dev) { return false; }
+static inline int cxl_proto_err_kfifo_get(struct cxl_proto_err_work_data *wd) { return 0; }
+static inline void cxl_register_proto_err_work(struct work_struct *work) { }
+static inline void cxl_unregister_proto_err_work(void) { }
#endif
void pci_print_aer(struct pci_dev *dev, int aer_severity,
--
2.34.1
Powered by blists - more mailing lists