[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAEEQ3wngRa-YHbrodOVhSoRzam4MSd8ihxXEd-6rhBmx_rtzTg@mail.gmail.com>
Date: Mon, 3 Nov 2025 21:36:47 +0800
From: yunhui cui <cuiyunhui@...edance.com>
To: Conor Dooley <conor@...nel.org>
Cc: paul.walmsley@...ive.com, palmer@...belt.com, aou@...s.berkeley.edu,
alex@...ti.fr, luxu.kernel@...edance.com, atishp@...osinc.com,
cleger@...osinc.com, ajones@...tanamicro.com, apatel@...tanamicro.com,
linux-kernel@...r.kernel.org, linux-riscv@...ts.infradead.org,
songshuaishuai@...ylab.org, bjorn@...osinc.com, charlie@...osinc.com,
masahiroy@...nel.org, valentina.fernandezalanis@...rochip.com,
jassisinghbrar@...il.com, conor.dooley@...rochip.com
Subject: Re: [External] Re: [PATCH 3/3] riscv: crash: use NMI to stop the CPU
Hi Conor,
On Tue, Oct 28, 2025 at 6:42 PM Conor Dooley <conor@...nel.org> wrote:
>
> On Mon, Oct 27, 2025 at 09:34:31PM +0800, Yunhui Cui wrote:
> > NMI is more robust than IPI for stopping CPUs during crashes,
> > especially with interrupts disabled. Add SBI_SSE_EVENT_LOCAL_CRASH_NMI
> > eventid to implement NMI for stopping CPUs.
> >
> > Signed-off-by: Yunhui Cui <cuiyunhui@...edance.com>
> > ---
> > arch/riscv/include/asm/crash.h | 1 +
> > arch/riscv/include/asm/sbi.h | 1 +
> > arch/riscv/kernel/crash.c | 31 +++++++++++++-
> > drivers/firmware/riscv/sse_nmi.c | 71 +++++++++++++++++++++++++++++++-
> > include/linux/sse_nmi.h | 8 ++++
> > 5 files changed, 109 insertions(+), 3 deletions(-)
> > create mode 100644 include/linux/sse_nmi.h
> >
> > diff --git a/arch/riscv/include/asm/crash.h b/arch/riscv/include/asm/crash.h
> > index b64df919277d4..5076f297cbc15 100644
> > --- a/arch/riscv/include/asm/crash.h
> > +++ b/arch/riscv/include/asm/crash.h
> > @@ -5,6 +5,7 @@
> >
> > #ifdef CONFIG_KEXEC_CORE
> > void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs);
> > +void cpu_crash_stop(unsigned int cpu, struct pt_regs *regs);
> > #else
> > static inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> > {
> > diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
> > index 52d3fdf2d4cc1..65cce85237879 100644
> > --- a/arch/riscv/include/asm/sbi.h
> > +++ b/arch/riscv/include/asm/sbi.h
> > @@ -487,6 +487,7 @@ enum sbi_sse_attr_id {
> > #define SBI_SSE_EVENT_GLOBAL_LOW_PRIO_RAS 0x00108000
> > #define SBI_SSE_EVENT_LOCAL_SOFTWARE_INJECTED 0xffff0000
> > #define SBI_SSE_EVENT_LOCAL_UNKNOWN_NMI 0xffff0001
> > +#define SBI_SSE_EVENT_LOCAL_CRASH_NMI 0xffff0002
> > #define SBI_SSE_EVENT_GLOBAL_SOFTWARE_INJECTED 0xffff8000
> >
> > #define SBI_SSE_EVENT_PLATFORM BIT(14)
> > diff --git a/arch/riscv/kernel/crash.c b/arch/riscv/kernel/crash.c
> > index 12598bbc2df04..9f3f0becfdd95 100644
> > --- a/arch/riscv/kernel/crash.c
> > +++ b/arch/riscv/kernel/crash.c
> > @@ -3,14 +3,16 @@
> > #include <linux/cpu.h>
> > #include <linux/delay.h>
> > #include <linux/kexec.h>
> > +#include <linux/sse_nmi.h>
> > #include <linux/smp.h>
> > #include <linux/sched.h>
> >
> > +#include <asm/crash.h>
> > #include <asm/cpu_ops.h>
> >
> > static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0);
> >
> > -inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> > +void cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> > {
> > crash_save_cpu(regs, cpu);
> >
> > @@ -27,6 +29,11 @@ inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> > wait_for_interrupt();
> > }
> >
> > +inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> > +{
> > + cpu_crash_stop(cpu, regs);
> > +}
> > +
> > /*
> > * The number of CPUs online, not counting this CPU (which may not be
> > * fully online and so not counted in num_online_cpus()).
> > @@ -38,6 +45,24 @@ static inline unsigned int num_other_online_cpus(void)
> > return num_online_cpus() - this_cpu_online;
> > }
> >
> > +#ifdef CONFIG_RISCV_SSE_NMI
> > +static int send_nmi_stop_cpu(cpumask_t *mask)
> > +{
> > + unsigned int cpu;
> > + int ret = 0;
> > +
> > + for_each_cpu(cpu, mask)
> > + ret += carsh_nmi_stop_cpu(cpu);
>
> += ? I don't really get why this sort of overcomplication is needed, why
> not just return immediately here with a real error code, since you're
> going to have to go to the ipi fallback anyway?
A cpumask may be needed to record the CPUs on which the NMI failed, so
that those CPUs can then be stopped via IPI instead.
>
> > +
> > + return ret;
> > +}
> > +#else
> > +static inline int send_nmi_stop_cpu(cpumask_t *mask)
> > +{
> > + return -EOPNOTSUPP;
> > +}
> > +#endif
> > +
> > void crash_smp_send_stop(void)
> > {
> > static int cpus_stopped;
> > @@ -66,7 +91,9 @@ void crash_smp_send_stop(void)
> > atomic_set(&waiting_for_crash_ipi, num_other_online_cpus());
> >
> > pr_crit("SMP: stopping secondary CPUs\n");
> > - send_ipi_mask(&mask, IPI_CPU_CRASH_STOP);
> > +
> > + if (send_nmi_stop_cpu(&mask))
> > + send_ipi_mask(&mask, IPI_CPU_CRASH_STOP);
> >
> > /* Wait up to one second for other CPUs to stop */
> > timeout = USEC_PER_SEC;
> > diff --git a/drivers/firmware/riscv/sse_nmi.c b/drivers/firmware/riscv/sse_nmi.c
> > index 2c1eaea2bbabc..152d787075345 100644
> > --- a/drivers/firmware/riscv/sse_nmi.c
> > +++ b/drivers/firmware/riscv/sse_nmi.c
> > @@ -4,13 +4,16 @@
> >
> > #include <linux/nmi.h>
> > #include <linux/riscv_sbi_sse.h>
> > +#include <linux/sse_nmi.h>
> > #include <linux/sysctl.h>
> >
> > +#include <asm/crash.h>
> > #include <asm/irq_regs.h>
> > #include <asm/sbi.h>
> >
> > int unknown_nmi_panic;
> > static struct sse_event *unknown_nmi_evt;
> > +static struct sse_event *crash_nmi_evt;
> > static struct ctl_table_header *unknown_nmi_sysctl_header;
> >
> > static int __init setup_unknown_nmi_panic(char *str)
> > @@ -32,6 +35,12 @@ const struct ctl_table unknown_nmi_table[] = {
> > },
> > };
> >
> > +static inline struct sbiret sbi_sse_ecall(int fid, unsigned long arg0,
> > + unsigned long arg1)
> > +{
> > + return sbi_ecall(SBI_EXT_SSE, fid, arg0, arg1, 0, 0, 0, 0);
> > +}
> > +
> > static int unknown_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
> > {
> > pr_emerg("NMI received for unknown on CPU %d.\n", smp_processor_id());
> > @@ -73,9 +82,69 @@ static int unknown_nmi_init(void)
> > return ret;
> > }
> >
> > +#ifdef CONFIG_KEXEC_CORE
> > +int carsh_nmi_stop_cpu(unsigned int cpu)
>
> typo: crash
Okay.
>
> > +{
> > + unsigned int hart_id = cpuid_to_hartid_map(cpu);
> > + u32 evt = SBI_SSE_EVENT_LOCAL_CRASH_NMI;
> > + struct sbiret ret;
> > +
> > + ret = sbi_sse_ecall(SBI_SSE_EVENT_INJECT, evt, hart_id);
> > + if (ret.error) {
> > + pr_err("Failed to signal event %x, error %ld\n", evt, ret.error);
>
> Isn't this going to emit pointless (and maybe confusing) error messages
> on systems that enable the option but don't support SSE? And it's going
> to be one for each secondary CPU too.
Okay, I'll fix this in the next version.
>
> > + return sbi_err_map_linux_errno(ret.error);
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +static int crash_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
> > +{
> > + cpu_crash_stop(smp_processor_id(), regs);
> > +
> > + return 0;
> > +}
> > +
> > +static int crash_nmi_init(void)
> > +{
> > + int ret;
> > +
> > + crash_nmi_evt = sse_event_register(SBI_SSE_EVENT_LOCAL_CRASH_NMI, 0,
> > + crash_nmi_handler, NULL);
> > + if (IS_ERR(crash_nmi_evt))
> > + return PTR_ERR(crash_nmi_evt);
> > +
> > + ret = sse_event_enable(crash_nmi_evt);
> > + if (ret) {
> > + sse_event_unregister(crash_nmi_evt);
> > + return ret;
> > + }
> > +
> > + pr_info("Using SSE for crash NMI event delivery\n");
> > +
> > + return 0;
> > +}
> > +#endif
> > +
> > static int __init sse_nmi_init(void)
> > {
> > - return unknown_nmi_init();
> > + int ret;
> > +
> > + ret = unknown_nmi_init();
> > + if (ret) {
> > + pr_err("Unknown_nmi_init failed with error %d\n", ret);
> > + return ret;
> > + }
>
> This change looks like it shouldn't be in this patch, if you want it to
> print an error, just do that from the start?
Okay.
>
> > +
> > +#ifdef CONFIG_KEXEC_CORE
>
> Can this be IS_ENABLED() or does crash_nmi_init() not have a stub?
I'll add a stub function in the next version, so the #ifdef
CONFIG_KEXEC_CORE here will no longer be needed.
>
> > + ret = crash_nmi_init();
> > + if (ret) {
> > + pr_err("Crash_nmi_init failed with error %d\n", ret);
> > + return ret;
> > + }
> > +#endif
> > +
> > + return 0;
> > }
> >
> > late_initcall(sse_nmi_init);
> > diff --git a/include/linux/sse_nmi.h b/include/linux/sse_nmi.h
> > new file mode 100644
> > index 0000000000000..548a348ac0a46
> > --- /dev/null
> > +++ b/include/linux/sse_nmi.h
> > @@ -0,0 +1,8 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +
> > +#ifndef __LINUX_RISCV_SSE_NMI_H
> > +#define __LINUX_RISCV_SSE_NMI_H
> > +
> > +int carsh_nmi_stop_cpu(unsigned int cpu);
> > +
> > +#endif
> > --
> > 2.39.5
> >
Thanks,
Yunhui
Powered by blists - more mailing lists