[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <00ad5958-dc5f-4d18-ad24-9de607912bf8@intel.com>
Date: Thu, 29 Jan 2026 09:07:51 -0800
From: "Chang S. Bae" <chang.seok.bae@...el.com>
To: Thomas Gleixner <tglx@...nel.org>, <linux-kernel@...r.kernel.org>
CC: <x86@...nel.org>, <mingo@...hat.com>, <bp@...en8.de>,
<dave.hansen@...ux.intel.com>, <peterz@...radead.org>, <david.kaplan@....com>
Subject: Re: [PATCH 1/7] stop_machine: Introduce stop_machine_nmi()
On 1/28/2026 12:02 AM, Thomas Gleixner wrote:
> On Sun, Jan 25 2026 at 01:42, Chang S. Bae wrote:
>> +/**
>> + * stop_machine_nmi: freeze the machine and run this function in NMI context
>> + * @fn: the function to run
>> + * @data: the data ptr for the @fn()
>> + * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
>
> Please format these tabular, use uppercase letters to start the
> explanation, use CPU[s] all over the place and write out words instead
> of using made up abbreviations. This is documentation not twitter.
>
> * @fn: The function to invoke
> * @data: The data pointer for @fn()
> * @cpus: A cpumask containing the CPUs to run fn() on
>
> Also this NULL == any online CPU is just made up. What's wrong with
> cpu_online_mask?
>
>> +
>> +bool noinstr stop_machine_nmi_handler(void);
>> +DECLARE_STATIC_KEY_FALSE(stop_machine_nmi_handler_enable);
>> +static __always_inline bool stop_machine_nmi_handler_enabled(void)
>
> Can you please separate the declarations from the inline with an empty
> new line? This glued together way to write it is unreadable.
Yes, I have fixed them all in my local tree now.
>
>> +{
>> + return static_branch_unlikely(&stop_machine_nmi_handler_enable);
>> +}
>> +
>> #else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
>>
>> static __always_inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
>> @@ -186,5 +217,24 @@ stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
>> return stop_machine(fn, data, cpus);
>> }
>>
>> +/* stop_machine_nmi() is only supported in SMP systems. */
>> +static __always_inline int stop_machine_nmi(cpu_stop_fn_t fn, void *data,
>> + const struct cpumask *cpus)
>
> Align the second line argument with the first argument above.
>
> See https://www.kernel.org/doc/html/latest/process/maintainer-tip.html
Sorry for the many misalignment issues in this change that I missed.
>> +{
>> + return -EINVAL;
>> +}
>> +
>
>> +
>> +void arch_send_self_nmi(void);
>> #endif /* _LINUX_STOP_MACHINE */
>> diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
>> index 3fe6b0c99f3d..189b4b108d13 100644
>> --- a/kernel/stop_machine.c
>> +++ b/kernel/stop_machine.c
>> @@ -174,6 +174,8 @@ struct multi_stop_data {
>>
>> enum multi_stop_state state;
>> atomic_t thread_ack;
>> +
>> + bool use_nmi;
>> };
>>
>> static void set_state(struct multi_stop_data *msdata,
>> @@ -197,6 +199,42 @@ notrace void __weak stop_machine_yield(const struct cpumask *cpumask)
>> cpu_relax();
>> }
>>
>> +struct stop_machine_nmi_ctrl {
>> + bool nmi_enabled;
>> + struct multi_stop_data *msdata;
>> + int err;
>
> Please align the struct member names tabular. See documentation.
Fixed.
>
>> +};
>> +
>> +DEFINE_STATIC_KEY_FALSE(stop_machine_nmi_handler_enable);
>> +static DEFINE_PER_CPU(struct stop_machine_nmi_ctrl, stop_machine_nmi_ctrl);
>> +
>> +static void enable_nmi_handler(struct multi_stop_data *msdata)
>> +{
>> + this_cpu_write(stop_machine_nmi_ctrl.msdata, msdata);
>> + this_cpu_write(stop_machine_nmi_ctrl.nmi_enabled, true);
>> +}
>> +
>> +void __weak arch_send_self_nmi(void)
>> +{
>> + /* Arch code must implement this to support stop_machine_nmi() */
>
> Architecture
Fixed.
>
>> +}
>
> Also this weak function is wrong.
>
> All of this NMI mode needs to be guarded with a config option as it
> otherwise is compiled in unconditionally and any accidental usage on an
> architecture which does not support this will result in a undecodable
> malfunction. There is a world outside of x86.
>
> With that arch_send_self_nmi() becomes a plain declaration in a header.
I see.
>
>> +
>> +bool noinstr stop_machine_nmi_handler(void)
>> +{
>> + struct multi_stop_data *msdata;
>> + int err;
>> +
>> + if (!raw_cpu_read(stop_machine_nmi_ctrl.nmi_enabled))
>> + return false;
>> +
>> + raw_cpu_write(stop_machine_nmi_ctrl.nmi_enabled, false);
>> +
>> + msdata = raw_cpu_read(stop_machine_nmi_ctrl.msdata);
>> + err = msdata->fn(msdata->data);
>> + raw_cpu_write(stop_machine_nmi_ctrl.err, err);
>> + return true;
>> +}
>> +
>> /* This is the cpu_stop function which stops the CPU. */
>> static int multi_cpu_stop(void *data)
>> {
>> @@ -234,8 +272,15 @@ static int multi_cpu_stop(void *data)
>> hard_irq_disable();
>> break;
>> case MULTI_STOP_RUN:
>> - if (is_active)
>> - err = msdata->fn(msdata->data);
>> + if (is_active) {
>> + if (msdata->use_nmi) {
>> + enable_nmi_handler(msdata);
>> + arch_send_self_nmi();
>> + err = raw_cpu_read(stop_machine_nmi_ctrl.err);
>> + } else {
>> + err = msdata->fn(msdata->data);
>> + }
>
> And this wants to become
>
> if (IS_ENABLED(CONFIG_STOMP_MACHINE_NMI) && msdata->use_nmi)
> err = stop_this_cpu_nmi(msdata);
> else
> err = msdata->fn(msdata->data);
Although that config option is very clear and makes tons of sense, the
latter reads like a (silent) fallback path for a stop_machine_nmi()
invocation with CONFIG_STOMP_MACHINE_NMI=n.
Maybe it would be clearer to reject the NMI option right away with
something like:
stop_machine_cpuslocked_nmi(...)
{
if (!IS_ENABLED(CONFIG_STOMP_MACHINE_NMI))
return -EOPNOTSUPP;
...
};
>>
>> -int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
>> - const struct cpumask *cpus)
>> +static int __stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
>> + const struct cpumask *cpus, bool use_nmi)
>
> The argument alignment was correct before....
Sigh... fixed, again.
>> +int stop_machine_nmi(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
>> +{
>> + int ret;
>> +
>> + cpus_read_lock();
>> + ret = stop_machine_cpuslocked_nmi(fn, data, cpus);
>> + cpus_read_unlock();
>> +
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(stop_machine_nmi);
>
> Why needs this to be exported? No module has any business with stomp
> machine.
Not at all. Removed.
I really appreciate your time and effort for the review!
Powered by blists - more mailing lists