[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251028-scallion-list-c8aa5f350286@spud>
Date: Tue, 28 Oct 2025 10:42:12 +0000
From: Conor Dooley <conor@...nel.org>
To: Yunhui Cui <cuiyunhui@...edance.com>
Cc: paul.walmsley@...ive.com, palmer@...belt.com, aou@...s.berkeley.edu,
alex@...ti.fr, luxu.kernel@...edance.com, atishp@...osinc.com,
cleger@...osinc.com, ajones@...tanamicro.com,
apatel@...tanamicro.com, linux-kernel@...r.kernel.org,
linux-riscv@...ts.infradead.org, songshuaishuai@...ylab.org,
bjorn@...osinc.com, charlie@...osinc.com, masahiroy@...nel.org,
valentina.fernandezalanis@...rochip.com, jassisinghbrar@...il.com,
conor.dooley@...rochip.com
Subject: Re: [PATCH 3/3] riscv: crash: use NMI to stop the CPU
On Mon, Oct 27, 2025 at 09:34:31PM +0800, Yunhui Cui wrote:
> NMI is more robust than IPI for stopping CPUs during crashes,
> especially with interrupts disabled. Add SBI_SSE_EVENT_LOCAL_CRASH_NMI
> eventid to implement NMI for stopping CPUs.
>
> Signed-off-by: Yunhui Cui <cuiyunhui@...edance.com>
> ---
> arch/riscv/include/asm/crash.h | 1 +
> arch/riscv/include/asm/sbi.h | 1 +
> arch/riscv/kernel/crash.c | 31 +++++++++++++-
> drivers/firmware/riscv/sse_nmi.c | 71 +++++++++++++++++++++++++++++++-
> include/linux/sse_nmi.h | 8 ++++
> 5 files changed, 109 insertions(+), 3 deletions(-)
> create mode 100644 include/linux/sse_nmi.h
>
> diff --git a/arch/riscv/include/asm/crash.h b/arch/riscv/include/asm/crash.h
> index b64df919277d4..5076f297cbc15 100644
> --- a/arch/riscv/include/asm/crash.h
> +++ b/arch/riscv/include/asm/crash.h
> @@ -5,6 +5,7 @@
>
> #ifdef CONFIG_KEXEC_CORE
> void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs);
> +void cpu_crash_stop(unsigned int cpu, struct pt_regs *regs);
> #else
> static inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> {
> diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
> index 52d3fdf2d4cc1..65cce85237879 100644
> --- a/arch/riscv/include/asm/sbi.h
> +++ b/arch/riscv/include/asm/sbi.h
> @@ -487,6 +487,7 @@ enum sbi_sse_attr_id {
> #define SBI_SSE_EVENT_GLOBAL_LOW_PRIO_RAS 0x00108000
> #define SBI_SSE_EVENT_LOCAL_SOFTWARE_INJECTED 0xffff0000
> #define SBI_SSE_EVENT_LOCAL_UNKNOWN_NMI 0xffff0001
> +#define SBI_SSE_EVENT_LOCAL_CRASH_NMI 0xffff0002
> #define SBI_SSE_EVENT_GLOBAL_SOFTWARE_INJECTED 0xffff8000
>
> #define SBI_SSE_EVENT_PLATFORM BIT(14)
> diff --git a/arch/riscv/kernel/crash.c b/arch/riscv/kernel/crash.c
> index 12598bbc2df04..9f3f0becfdd95 100644
> --- a/arch/riscv/kernel/crash.c
> +++ b/arch/riscv/kernel/crash.c
> @@ -3,14 +3,16 @@
> #include <linux/cpu.h>
> #include <linux/delay.h>
> #include <linux/kexec.h>
> +#include <linux/sse_nmi.h>
> #include <linux/smp.h>
> #include <linux/sched.h>
>
> +#include <asm/crash.h>
> #include <asm/cpu_ops.h>
>
> static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0);
>
> -inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> +void cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> {
> crash_save_cpu(regs, cpu);
>
> @@ -27,6 +29,11 @@ inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> wait_for_interrupt();
> }
>
> +inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> +{
> + cpu_crash_stop(cpu, regs);
> +}
> +
> /*
> * The number of CPUs online, not counting this CPU (which may not be
> * fully online and so not counted in num_online_cpus()).
> @@ -38,6 +45,24 @@ static inline unsigned int num_other_online_cpus(void)
> return num_online_cpus() - this_cpu_online;
> }
>
> +#ifdef CONFIG_RISCV_SSE_NMI
> +static int send_nmi_stop_cpu(cpumask_t *mask)
> +{
> + unsigned int cpu;
> + int ret = 0;
> +
> + for_each_cpu(cpu, mask)
> + ret += carsh_nmi_stop_cpu(cpu);
+= ? I don't really get why this sort of overcomplication is needed. Why
not just return immediately here with a real error code, since you're
going to have to go to the IPI fallback anyway?
> +
> + return ret;
> +}
> +#else
> +static inline int send_nmi_stop_cpu(cpumask_t *mask)
> +{
> + return -EOPNOTSUPP;
> +}
> +#endif
> +
> void crash_smp_send_stop(void)
> {
> static int cpus_stopped;
> @@ -66,7 +91,9 @@ void crash_smp_send_stop(void)
> atomic_set(&waiting_for_crash_ipi, num_other_online_cpus());
>
> pr_crit("SMP: stopping secondary CPUs\n");
> - send_ipi_mask(&mask, IPI_CPU_CRASH_STOP);
> +
> + if (send_nmi_stop_cpu(&mask))
> + send_ipi_mask(&mask, IPI_CPU_CRASH_STOP);
>
> /* Wait up to one second for other CPUs to stop */
> timeout = USEC_PER_SEC;
> diff --git a/drivers/firmware/riscv/sse_nmi.c b/drivers/firmware/riscv/sse_nmi.c
> index 2c1eaea2bbabc..152d787075345 100644
> --- a/drivers/firmware/riscv/sse_nmi.c
> +++ b/drivers/firmware/riscv/sse_nmi.c
> @@ -4,13 +4,16 @@
>
> #include <linux/nmi.h>
> #include <linux/riscv_sbi_sse.h>
> +#include <linux/sse_nmi.h>
> #include <linux/sysctl.h>
>
> +#include <asm/crash.h>
> #include <asm/irq_regs.h>
> #include <asm/sbi.h>
>
> int unknown_nmi_panic;
> static struct sse_event *unknown_nmi_evt;
> +static struct sse_event *crash_nmi_evt;
> static struct ctl_table_header *unknown_nmi_sysctl_header;
>
> static int __init setup_unknown_nmi_panic(char *str)
> @@ -32,6 +35,12 @@ const struct ctl_table unknown_nmi_table[] = {
> },
> };
>
> +static inline struct sbiret sbi_sse_ecall(int fid, unsigned long arg0,
> + unsigned long arg1)
> +{
> + return sbi_ecall(SBI_EXT_SSE, fid, arg0, arg1, 0, 0, 0, 0);
> +}
> +
> static int unknown_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
> {
> pr_emerg("NMI received for unknown on CPU %d.\n", smp_processor_id());
> @@ -73,9 +82,69 @@ static int unknown_nmi_init(void)
> return ret;
> }
>
> +#ifdef CONFIG_KEXEC_CORE
> +int carsh_nmi_stop_cpu(unsigned int cpu)
Typo: "carsh" should be "crash".
> +{
> + unsigned int hart_id = cpuid_to_hartid_map(cpu);
> + u32 evt = SBI_SSE_EVENT_LOCAL_CRASH_NMI;
> + struct sbiret ret;
> +
> + ret = sbi_sse_ecall(SBI_SSE_EVENT_INJECT, evt, hart_id);
> + if (ret.error) {
> + pr_err("Failed to signal event %x, error %ld\n", evt, ret.error);
Isn't this going to emit pointless (and maybe confusing) error messages
on systems that enable the option but don't support SSE? And it's going
to be one for each secondary CPU too.
> + return sbi_err_map_linux_errno(ret.error);
> + }
> +
> + return 0;
> +}
> +
> +static int crash_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
> +{
> + cpu_crash_stop(smp_processor_id(), regs);
> +
> + return 0;
> +}
> +
> +static int crash_nmi_init(void)
> +{
> + int ret;
> +
> + crash_nmi_evt = sse_event_register(SBI_SSE_EVENT_LOCAL_CRASH_NMI, 0,
> + crash_nmi_handler, NULL);
> + if (IS_ERR(crash_nmi_evt))
> + return PTR_ERR(crash_nmi_evt);
> +
> + ret = sse_event_enable(crash_nmi_evt);
> + if (ret) {
> + sse_event_unregister(crash_nmi_evt);
> + return ret;
> + }
> +
> + pr_info("Using SSE for crash NMI event delivery\n");
> +
> + return 0;
> +}
> +#endif
> +
> static int __init sse_nmi_init(void)
> {
> - return unknown_nmi_init();
> + int ret;
> +
> + ret = unknown_nmi_init();
> + if (ret) {
> + pr_err("Unknown_nmi_init failed with error %d\n", ret);
> + return ret;
> + }
This change looks like it shouldn't be in this patch. If you want it to
print an error, why not just do that from the start?
> +
> +#ifdef CONFIG_KEXEC_CORE
Can this be IS_ENABLED() or does crash_nmi_init() not have a stub?
> + ret = crash_nmi_init();
> + if (ret) {
> + pr_err("Crash_nmi_init failed with error %d\n", ret);
> + return ret;
> + }
> +#endif
> +
> + return 0;
> }
>
> late_initcall(sse_nmi_init);
> diff --git a/include/linux/sse_nmi.h b/include/linux/sse_nmi.h
> new file mode 100644
> index 0000000000000..548a348ac0a46
> --- /dev/null
> +++ b/include/linux/sse_nmi.h
> @@ -0,0 +1,8 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __LINUX_RISCV_SSE_NMI_H
> +#define __LINUX_RISCV_SSE_NMI_H
> +
> +int carsh_nmi_stop_cpu(unsigned int cpu);
> +
> +#endif
> --
> 2.39.5
>
Download attachment "signature.asc" of type "application/pgp-signature" (229 bytes)
Powered by blists - more mailing lists