[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d255d8a9-8e45-485e-9853-80c343bbb73b@suse.com>
Date: Fri, 31 Oct 2025 12:22:58 +0200
From: Nikolay Borisov <nik.borisov@...e.com>
To: David Kaplan <david.kaplan@....com>, Thomas Gleixner
<tglx@...utronix.de>, Borislav Petkov <bp@...en8.de>,
Peter Zijlstra <peterz@...radead.org>, Josh Poimboeuf <jpoimboe@...nel.org>,
Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>,
Ingo Molnar <mingo@...hat.com>, Dave Hansen <dave.hansen@...ux.intel.com>,
x86@...nel.org, "H . Peter Anvin" <hpa@...or.com>
Cc: Alexander Graf <graf@...zon.com>,
Boris Ostrovsky <boris.ostrovsky@...cle.com>, linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH 50/56] x86/alternative: Add re-patch support
On 10/13/25 17:34, David Kaplan wrote:
> Updating alternatives is done under the biggest hammers possible. The
> freezer is used to freeze all processes and kernel threads at safe
> points to ensure they are not in the middle of a sequence we're about to
> patch. Then stop_machine_nmi() synchronizes all CPUs and puts them into
> a tight spin loop while re-patching occurs. The actual patching is done
> using simple memcpy, just like during boot.
>
> Signed-off-by: David Kaplan <david.kaplan@....com>
> ---
> arch/x86/include/asm/alternative.h | 6 ++
> arch/x86/kernel/alternative.c | 131 +++++++++++++++++++++++++++++
> 2 files changed, 137 insertions(+)
>
> diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
> index 61ce8a4b1aa6..f0b863292c3c 100644
> --- a/arch/x86/include/asm/alternative.h
> +++ b/arch/x86/include/asm/alternative.h
> @@ -19,6 +19,7 @@
> #ifndef __ASSEMBLER__
>
> #include <linux/stddef.h>
> +#include <linux/static_call_types.h>
>
> /*
> * Alternative inline assembly for SMP.
> @@ -89,6 +90,9 @@ extern s32 __cfi_sites[], __cfi_sites_end[];
> extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
> extern s32 __smp_locks[], __smp_locks_end[];
>
> +extern struct static_call_site __start_static_call_sites[],
> + __stop_static_call_sites[];
> +
> /*
> * Debug flag that can be tested to see whether alternative
> * instructions were patched in already:
> @@ -98,6 +102,8 @@ extern int alternatives_patched;
> struct module;
>
> #ifdef CONFIG_DYNAMIC_MITIGATIONS
> +extern void cpu_update_alternatives(void);
> +extern void cpu_prepare_repatch_alternatives(void);
> extern void reset_retpolines(s32 *start, s32 *end, struct module *mod);
> extern void reset_returns(s32 *start, s32 *end, struct module *mod);
> extern void reset_alternatives(struct alt_instr *start, struct alt_instr *end,
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 23bb3386ec5e..613cb645bd9f 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -6,12 +6,15 @@
> #include <linux/vmalloc.h>
> #include <linux/memory.h>
> #include <linux/execmem.h>
> +#include <linux/stop_machine.h>
> +#include <linux/freezer.h>
>
> #include <asm/text-patching.h>
> #include <asm/insn.h>
> #include <asm/ibt.h>
> #include <asm/set_memory.h>
> #include <asm/nmi.h>
> +#include <asm/bugs.h>
>
> int __read_mostly alternatives_patched;
>
> @@ -3468,4 +3471,132 @@ void its_free_all(struct module *mod)
> its_page = NULL;
> }
> #endif
> +static atomic_t thread_ack;
> +
> +/*
> + * This function is called by ALL online CPUs but only CPU0 will do the
> + * re-patching. It is important that all other cores spin in the tight loop
> + * below (and not in multi_cpu_stop) because they cannot safely do return
> + * instructions while returns are being patched. Therefore, spin them here
> + * (with interrupts disabled) until CPU0 has finished its work.
> + */
> +static int __cpu_update_alternatives(void *__unused)
> +{
> + if (smp_processor_id()) {
> + atomic_dec(&thread_ack);
> + while (!READ_ONCE(alternatives_patched))
> + cpu_relax();
> +
> + cpu_bugs_update_speculation_msrs();
> + } else {
> + repatch_in_progress = true;
> +
> + /* Wait for all cores to enter this function. */
> + while (atomic_read(&thread_ack))
> + cpu_relax();
> +
> + /* These must be un-done in the opposite order in which they were applied. */
> + reset_alternatives(__alt_instructions, __alt_instructions_end, NULL);
> + reset_builtin_callthunks();
> + reset_returns(__return_sites, __return_sites_end, NULL);
> + reset_retpolines(__retpoline_sites, __retpoline_sites_end, NULL);
> +
> + apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL);
> + apply_returns(__return_sites, __return_sites_end, NULL);
This triggers the following splat:
[ 363.467469] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:575
[ 363.467472] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 18, name: migration/0
[ 363.467472] preempt_count: 110001, expected: 0
[ 363.467473] RCU nest depth: 0, expected: 0
[ 363.467474] no locks held by migration/0/18.
[ 363.467474] irq event stamp: 1280
[ 363.467475] hardirqs last enabled at (1279): [<ffffffff91fd1444>] _raw_spin_unlock_irq+0x24/0x50
[ 363.467479] hardirqs last disabled at (1280): [<ffffffff913c98f9>] multi_cpu_stop+0x119/0x170
[ 363.467482] softirqs last enabled at (0): [<ffffffff9129eaab>] copy_process+0x7fb/0x1990
[ 363.467484] softirqs last disabled at (0): [<0000000000000000>] 0x0
[ 363.467485] Preemption disabled at:
[ 363.467486] [<ffffffff913c8e63>] cpu_stopper_thread+0x93/0x150
[ 363.467488] CPU: 0 UID: 0 PID: 18 Comm: migration/0 Not tainted 6.18.0-rc1-default+ #9 PREEMPT(none)
[ 363.467490] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[ 363.467491] Stopper: multi_cpu_stop+0x0/0x170 <- __stop_cpus.constprop.0+0x77/0xb0
[ 363.467493] Call Trace:
[ 363.467494] <NMI>
[ 363.467496] dump_stack_lvl+0x62/0x90
[ 363.467498] __might_resched+0x19f/0x2b0
[ 363.467501] ? its_return_thunk+0x10/0x10
[ 363.467503] __mutex_lock+0x67/0x1060
[ 363.467504] ? look_up_lock_class+0x59/0x130
[ 363.467506] ? look_up_lock_class+0x59/0x130
[ 363.467508] ? __static_call_fixup+0x4f/0xa0
[ 363.467510] ? insn_get_prefixes+0x1a4/0x3f0
[ 363.467512] ? __SCT__tp_func_emulate_vsyscall+0x8/0x8
[ 363.467513] ? its_return_thunk+0x10/0x10
[ 363.467514] ? its_return_thunk+0x10/0x10
[ 363.467516] ? __static_call_fixup+0x4f/0xa0
[ 363.467517] __static_call_fixup+0x4f/0xa0
[ 363.467518] ? __SCT__tp_func_emulate_vsyscall+0x8/0x8
[ 363.467519] apply_returns+0x13e/0x370
[ 363.467523] ? __SCT__tp_func_emulate_vsyscall+0x8/0x8
[ 363.467524] ? __SCT__x86_pmu_disable_all+0x7/0x8
[ 363.467525] ? __SCT__x86_pmu_handle_irq+0x5/0x8
[ 363.467527] ? __copy_user_flushcache+0xf3/0x100
[ 363.467528] ? its_return_thunk+0x10/0x10
[ 363.467529] __cpu_update_alternatives+0x1e3/0x240
[ 363.467531] ? x2apic_send_IPI+0x40/0x60
[ 363.467533] stop_machine_nmi_handler+0x29/0x40
[ 363.467534] default_do_nmi+0x137/0x1a0
[ 363.467536] exc_nmi+0xef/0x120
[ 363.467538] end_repeat_nmi+0xf/0x53
[ 363.467578] ================================
[ 363.467578] WARNING: inconsistent lock state
[ 363.467578] 6.18.0-rc1-default+ #9 Tainted: G W
[ 363.467579] --------------------------------
[ 363.467579] inconsistent {INITIAL USE} -> {IN-NMI} usage.
[ 363.467580] migration/0/18 [HC1[1]:SC0[0]:HE0:SE1] takes:
[ 363.467581] ffffffff92668c28 (text_mutex){+.+.}-{4:4}, at: __static_call_fixup+0x4f/0xa0
[ 363.467583] {INITIAL USE} state was registered at:
[ 363.467584] irq event stamp: 1280
[ 363.467584] hardirqs last enabled at (1279): [<ffffffff91fd1444>] _raw_spin_unlock_irq+0x24/0x50
[ 363.467586] hardirqs last disabled at (1280): [<ffffffff913c98f9>] multi_cpu_stop+0x119/0x170
[ 363.467587] softirqs last enabled at (0): [<ffffffff9129eaab>] copy_process+0x7fb/0x1990
[ 363.467588] softirqs last disabled at (0): [<0000000000000000>] 0x0
[ 363.467589]
other info that might help us debug this:
[ 363.467590] Possible unsafe locking scenario:
[ 363.467590] CPU0
[ 363.467590] ----
[ 363.467590] lock(text_mutex);
[ 363.467591] <Interrupt>
[ 363.467591] lock(text_mutex);
[ 363.467592]
*** DEADLOCK ***
[ 363.467592] no locks held by migration/0/18.
[ 363.467592]
stack backtrace:
[ 363.467593] CPU: 0 UID: 0 PID: 18 Comm: migration/0 Tainted: G W 6.18.0-rc1-default+ #9 PREEMPT(none)
[ 363.467594] Tainted: [W]=WARN
[ 363.467595] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[ 363.467595] Stopper: multi_cpu_stop+0x0/0x170 <- __stop_cpus.constprop.0+0x77/0xb0
[ 363.467597] Call Trace:
[ 363.467598] <NMI>
[ 363.467598] dump_stack_lvl+0x62/0x90
[ 363.467600] print_usage_bug.part.0+0x22c/0x2c0
[ 363.467602] lock_acquire+0x208/0x2d0
[ 363.467604] ? __static_call_fixup+0x4f/0xa0
[ 363.467605] ? its_return_thunk+0x10/0x10
[ 363.467607] __mutex_lock+0xb3/0x1060
[ 363.467607] ? __static_call_fixup+0x4f/0xa0
[ 363.467608] ? look_up_lock_class+0x59/0x130
[ 363.467610] ? look_up_lock_class+0x59/0x130
[ 363.467611] ? __static_call_fixup+0x4f/0xa0
[ 363.467613] ? insn_get_prefixes+0x1a4/0x3f0
[ 363.467614] ? __SCT__tp_func_emulate_vsyscall+0x8/0x8
[ 363.467615] ? its_return_thunk+0x10/0x10
[ 363.467617] ? its_return_thunk+0x10/0x10
[ 363.467618] ? __static_call_fixup+0x4f/0xa0
[ 363.467619] __static_call_fixup+0x4f/0xa0
[ 363.467619] ? __SCT__tp_func_emulate_vsyscall+0x8/0x8
[ 363.467621] apply_returns+0x13e/0x370
[ 363.467624] ? __SCT__tp_func_emulate_vsyscall+0x8/0x8
[ 363.467625] ? __SCT__x86_pmu_disable_all+0x7/0x8
[ 363.467626] ? __SCT__x86_pmu_handle_irq+0x5/0x8
[ 363.467627] ? __copy_user_flushcache+0xf3/0x100
[ 363.467628] ? its_return_thunk+0x10/0x10
[ 363.467630] __cpu_update_alternatives+0x1e3/0x240
[ 363.467631] ? x2apic_send_IPI+0x40/0x60
[ 363.467633] stop_machine_nmi_handler+0x29/0x40
[ 363.467634] default_do_nmi+0x137/0x1a0
[ 363.467635] exc_nmi+0xef/0x120
[ 363.467637] end_repeat_nmi+0xf/0x53
The reason is that apply_returns() -> __static_call_fixup() acquires text_mutex, which here is taken from NMI context (a sleeping mutex under an NMI handler), hence both the might-sleep splat and the lockdep inconsistent-lock-state warning.
<snip>
Powered by blists - more mailing lists