[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250910093347.75822-6-tianruidong@linux.alibaba.com>
Date: Wed, 10 Sep 2025 17:33:47 +0800
From: Ruidong Tian <tianruidong@...ux.alibaba.com>
To: xueshuai@...ux.alibaba.com,
palmer@...belt.com,
paul.walmsley@...ive.com,
linux-riscv@...ts.infradead.org,
linux-kernel@...r.kernel.org,
linux-acpi@...r.kernel.org
Cc: james.morse@....com,
tony.luck@...el.com,
cleger@...osinc.com,
hchauhan@...tanamicro.com,
tianruidong@...ux.alibaba.com
Subject: [RFC PATCH 5/5] riscv: Add Hardware Error Exception trap handler
Implement the Hardware Error Exception trap handler for RISC-V architecture
synchronous hardware error handling. This enables the OS to receive
hardware error notifications from firmware through the standardized ACPI
HEST (Hardware Error Source Table) interface.
The implementation includes:
- A new exception vector entry for the Hardware Error Exception
- A trap handler (do_trap_hardware_error) that processes hardware errors
in both kernel mode (currently panics) and user mode (delivers SIGBUS)
- Integration with APEI GHES (Generic Hardware Error Source) to report
hardware errors from firmware
This change enables RISC-V systems with ACPI to handle synchronous
hardware errors in a firmware-first manner.
Signed-off-by: Ruidong Tian <tianruidong@...ux.alibaba.com>
---
arch/riscv/include/asm/acpi.h | 2 ++
arch/riscv/kernel/acpi.c | 55 +++++++++++++++++++++++++++++++++++
arch/riscv/kernel/entry.S | 4 +++
arch/riscv/kernel/traps.c | 19 ++++++++++++
4 files changed, 80 insertions(+)
diff --git a/arch/riscv/include/asm/acpi.h b/arch/riscv/include/asm/acpi.h
index 0c599452ef48..ae861885b97d 100644
--- a/arch/riscv/include/asm/acpi.h
+++ b/arch/riscv/include/asm/acpi.h
@@ -91,6 +91,7 @@ int acpi_get_riscv_isa(struct acpi_table_header *table,
void acpi_get_cbo_block_size(struct acpi_table_header *table, u32 *cbom_size,
u32 *cboz_size, u32 *cbop_size);
+int apei_claim_hee(struct pt_regs *regs);
#else
static inline void acpi_init_rintc_map(void) { }
static inline struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu)
@@ -108,6 +109,7 @@ static inline void acpi_get_cbo_block_size(struct acpi_table_header *table,
u32 *cbom_size, u32 *cboz_size,
u32 *cbop_size) { }
+static inline int apei_claim_hee(struct pt_regs *regs) { return -ENOENT; }
#endif /* CONFIG_ACPI */
#ifdef CONFIG_ACPI_NUMA
diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c
index 3f6d5a6789e8..928f9474bfee 100644
--- a/arch/riscv/kernel/acpi.c
+++ b/arch/riscv/kernel/acpi.c
@@ -20,6 +20,11 @@
#include <linux/of_fdt.h>
#include <linux/pci.h>
#include <linux/serial_core.h>
+#include <linux/efi.h>
+#include <linux/irq_work.h>
+#include <linux/nmi.h>
+#include <acpi/ghes.h>
+#include <asm/csr.h>
int acpi_noirq = 1; /* skip ACPI IRQ initialization */
int acpi_disabled = 1;
@@ -334,3 +339,53 @@ int raw_pci_write(unsigned int domain, unsigned int bus,
}
#endif /* CONFIG_PCI */
+
+/*
+ * Claim a Hardware Error Exception as a firmware-first notification.
+ *
+ * Used by the RISC-V exception handler for hardware error processing.
+ *
+ * @regs: trap frame of the interrupted context, or NULL when called from
+ *        process context.
+ *
+ * Return: 0 when ghes_notify_hee() succeeded and the pending irq_work was
+ * run here; -EINPROGRESS when ghes_notify_hee() succeeded but the work was
+ * left queued because the interrupted context had interrupts masked;
+ * -ENOENT when CONFIG_ACPI_APEI_GHES is not enabled; otherwise the non-zero
+ * value returned by ghes_notify_hee().
+ */
+int apei_claim_hee(struct pt_regs *regs)
+{
+	bool return_to_irqs_enabled;
+	unsigned long flags;
+	int err;
+
+	if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES))
+		return -ENOENT;
+
+	local_irq_save(flags);
+
+	/*
+	 * Look at the saved flags, not the live state: after local_irq_save()
+	 * interrupts are always disabled, so irqs_disabled() would be
+	 * unconditionally true here.
+	 */
+	return_to_irqs_enabled = !irqs_disabled_flags(flags);
+
+	/*
+	 * On trap entry the hardware copies SIE into SPIE and clears SIE, so
+	 * the interrupted context's interrupt-enable state lives in SR_PIE of
+	 * the saved status, never in SR_SIE.
+	 */
+	if (regs)
+		return_to_irqs_enabled = (regs->status & SR_PIE) != 0;
+
+	/*
+	 * HEE can interrupt other operations, handle as NMI-like context
+	 * to ensure proper APEI processing
+	 */
+	nmi_enter();
+	err = ghes_notify_hee();
+	nmi_exit();
+
+	/*
+	 * APEI NMI-like notifications are deferred to irq_work. Unless
+	 * we interrupted irqs-masked code, we can do that now.
+	 * irq_work_run() must be called with interrupts still disabled,
+	 * so it runs before the flags are restored.
+	 */
+	if (!err) {
+		if (return_to_irqs_enabled) {
+			irq_work_run();
+		} else {
+			pr_warn_ratelimited("APEI work queued but not completed\n");
+			err = -EINPROGRESS;
+		}
+	}
+
+	/* Single exit: every path balances the local_irq_save() above. */
+	local_irq_restore(flags);
+
+	return err;
+}
+EXPORT_SYMBOL(apei_claim_hee);
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 3a0ec6fd5956..1cbefe934d84 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -459,6 +459,10 @@ SYM_DATA_START_LOCAL(excp_vect_table)
RISCV_PTR do_page_fault /* load page fault */
RISCV_PTR do_trap_unknown
RISCV_PTR do_page_fault /* store page fault */
+ RISCV_PTR do_trap_unknown
+ RISCV_PTR do_trap_unknown
+ RISCV_PTR do_trap_unknown
+ RISCV_PTR do_trap_hardware_error /* Hardware Error */
SYM_DATA_END_LABEL(excp_vect_table, SYM_L_LOCAL, excp_vect_table_end)
#ifndef CONFIG_MMU
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 80230de167de..48f1ea1e03e6 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -22,6 +22,7 @@
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/entry-common.h>
+#include <linux/acpi.h>
#include <asm/asm-prototypes.h>
#include <asm/bug.h>
@@ -442,3 +443,21 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs)
wait_for_interrupt();
}
#endif
+
+/*
+ * Trap handler for the Hardware Error exception.
+ *
+ * Kernel mode: the error is treated as fatal and die() is called inside an
+ * irqentry NMI section.
+ *
+ * User mode: the error is first offered to apei_claim_hee(); if that returns
+ * non-zero, do_trap_error() is called with SIGBUS/BUS_OBJERR and the faulting
+ * address taken from regs->badaddr.
+ */
+asmlinkage __visible __trap_section void do_trap_hardware_error(struct pt_regs *regs)
+{
+	irqentry_state_t nmi_state;
+
+	if (!user_mode(regs)) {
+		/* Error hit while in kernel mode: no recovery is attempted. */
+		nmi_state = irqentry_nmi_enter(regs);
+		die(regs, "Hardware Error");
+		irqentry_nmi_exit(regs, nmi_state);
+		return;
+	}
+
+	irqentry_enter_from_user_mode(regs);
+
+	/* A non-zero result means the error was not fully handled via APEI. */
+	if (apei_claim_hee(regs) != 0)
+		do_trap_error(regs, SIGBUS, BUS_OBJERR, regs->badaddr,
+			      "Hardware Error");
+
+	irqentry_exit_to_user_mode(regs);
+}
--
2.43.7
Powered by blists - more mailing lists