[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250918222607.186488-2-xiyou.wangcong@gmail.com>
Date: Thu, 18 Sep 2025 15:26:00 -0700
From: Cong Wang <xiyou.wangcong@...il.com>
To: linux-kernel@...r.kernel.org
Cc: pasha.tatashin@...een.com,
Cong Wang <cwang@...tikernel.io>,
Andrew Morton <akpm@...ux-foundation.org>,
Baoquan He <bhe@...hat.com>,
Alexander Graf <graf@...zon.com>,
Mike Rapoport <rppt@...nel.org>,
Changyuan Lyu <changyuanl@...gle.com>,
kexec@...ts.infradead.org,
linux-mm@...ck.org
Subject: [RFC Patch 1/7] kexec: Introduce multikernel support via kexec
From: Cong Wang <cwang@...tikernel.io>
This patch extends the kexec subsystem to support multikernel
functionality, allowing different kernel instances to be loaded and
executed on specific CPUs. The implementation introduces:
- New KEXEC_TYPE_MULTIKERNEL type and KEXEC_MULTIKERNEL flag
- multikernel_kick_ap() function for CPU-specific kernel booting
- LINUX_REBOOT_CMD_MULTIKERNEL reboot command with CPU parameter
- Specialized segment loading for multikernel images using memremap
- Integration with existing kexec infrastructure while bypassing
standard machine_kexec_prepare() to avoid resets
The multikernel_kexec() function validates CPU availability and uses
the existing kexec image start address to boot the target CPU with
a different kernel instance. This enables heterogeneous computing
scenarios where different CPUs can run specialized kernel variants.
Signed-off-by: Cong Wang <cwang@...tikernel.io>
---
arch/x86/include/asm/smp.h | 1 +
arch/x86/kernel/smpboot.c | 104 +++++++++++++++++++++++++++
include/linux/kexec.h | 6 +-
include/uapi/linux/kexec.h | 1 +
include/uapi/linux/reboot.h | 2 +-
kernel/kexec.c | 41 ++++++++++-
kernel/kexec_core.c | 135 ++++++++++++++++++++++++++++++++++++
kernel/reboot.c | 10 +++
8 files changed, 294 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 22bfebe6776d..1a59fd0de759 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -107,6 +107,7 @@ void native_smp_prepare_cpus(unsigned int max_cpus);
void native_smp_cpus_done(unsigned int max_cpus);
int common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_kick_ap(unsigned int cpu, struct task_struct *tidle);
+int multikernel_kick_ap(unsigned int cpu, unsigned long kernel_start_address);
int native_cpu_disable(void);
void __noreturn hlt_play_dead(void);
void native_play_dead(void);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 33e166f6ab12..c2844a493ebf 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -833,6 +833,72 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
return 0;
}
+/* Must be called with cpus_read_lock() held */
+static int do_multikernel_boot_cpu(u32 apicid, int cpu, unsigned long kernel_start_address)
+{
+ unsigned long start_ip = real_mode_header->trampoline_start;
+ int ret;
+
+ pr_info("do_multikernel_boot_cpu(apicid=%u, cpu=%u, kernel_start_address=%lx)\n", apicid, cpu, kernel_start_address);
+#ifdef CONFIG_X86_64
+ /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
+ if (apic->wakeup_secondary_cpu_64)
+ start_ip = real_mode_header->trampoline_start64;
+#endif
+ //initial_code = (unsigned long)start_secondary;
+ initial_code = (unsigned long)kernel_start_address;
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+ //initial_stack = idle->thread.sp;
+ } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) {
+ smpboot_control = cpu;
+ }
+
+ /* Skip init_espfix_ap(cpu); */
+
+ /* Skip announce_cpu(cpu, apicid); */
+
+ /*
+ * This grunge runs the startup process for
+ * the targeted processor.
+ */
+ if (x86_platform.legacy.warm_reset) {
+
+ pr_debug("Setting warm reset code and vector.\n");
+
+ smpboot_setup_warm_reset_vector(start_ip);
+ /*
+ * Be paranoid about clearing APIC errors.
+ */
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
+ }
+
+ smp_mb();
+
+ /*
+ * Wake up a CPU in different cases:
+ * - Use a method from the APIC driver if one defined, with wakeup
+ * straight to 64-bit mode preferred over wakeup to RM.
+ * Otherwise,
+ * - Use an INIT boot APIC message
+ */
+ if (apic->wakeup_secondary_cpu_64)
+ ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu);
+ else if (apic->wakeup_secondary_cpu)
+ ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu);
+ else
+ ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu);
+
+ pr_info("do_multikernel_boot_cpu end\n");
+ /* If the wakeup mechanism failed, cleanup the warm reset vector */
+ if (ret)
+ arch_cpuhp_cleanup_kick_cpu(cpu);
+ return ret;
+}
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -905,6 +971,44 @@ static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
return ret;
}
+/* Must be called with cpus_read_lock() held */
+int multikernel_kick_ap(unsigned int cpu, unsigned long kernel_start_address)
+{
+ u32 apicid = apic->cpu_present_to_apicid(cpu);
+ int err;
+
+ lockdep_assert_irqs_enabled();
+
+ pr_info("multikernel: bringing up CPU %u\n", cpu);
+
+ if (apicid == BAD_APICID || !apic_id_valid(apicid)) {
+ pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid);
+ return -EINVAL;
+ }
+
+ if (!test_bit(apicid, phys_cpu_present_map)) {
+ pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid);
+ return -EINVAL;
+ }
+
+ /*
+ * Save current MTRR state in case it was changed since early boot
+ * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+ */
+ mtrr_save_state();
+
+ /* the FPU context is blank, nobody can own it */
+ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
+ /* skip common_cpu_up(cpu, tidle); */
+
+ err = do_multikernel_boot_cpu(apicid, cpu, kernel_start_address);
+ if (err)
+ pr_err("do_multikernel_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
+
+ return err;
+}
+
+
int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
{
u32 apicid = apic->cpu_present_to_apicid(cpu);
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 39fe3e6cd282..a3ae3e561109 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -358,9 +358,10 @@ struct kimage {
unsigned long control_page;
/* Flags to indicate special processing */
- unsigned int type : 1;
+ unsigned int type : 2;
#define KEXEC_TYPE_DEFAULT 0
#define KEXEC_TYPE_CRASH 1
+#define KEXEC_TYPE_MULTIKERNEL 2
unsigned int preserve_context : 1;
/* If set, we are using file mode kexec syscall */
unsigned int file_mode:1;
@@ -434,6 +435,7 @@ extern void machine_kexec(struct kimage *image);
extern int machine_kexec_prepare(struct kimage *image);
extern void machine_kexec_cleanup(struct kimage *image);
extern int kernel_kexec(void);
+extern int multikernel_kexec(int cpu);
extern struct page *kimage_alloc_control_pages(struct kimage *image,
unsigned int order);
@@ -455,7 +457,7 @@ bool kexec_load_permitted(int kexec_image_type);
#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR | KEXEC_CRASH_HOTPLUG_SUPPORT)
#else
#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR | \
- KEXEC_CRASH_HOTPLUG_SUPPORT)
+ KEXEC_CRASH_HOTPLUG_SUPPORT | KEXEC_MULTIKERNEL)
#endif
/* List of defined/legal kexec file flags */
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 8958ebfcff94..4ed8660ef95e 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -14,6 +14,7 @@
#define KEXEC_PRESERVE_CONTEXT 0x00000002
#define KEXEC_UPDATE_ELFCOREHDR 0x00000004
#define KEXEC_CRASH_HOTPLUG_SUPPORT 0x00000008
+#define KEXEC_MULTIKERNEL 0x00000010
#define KEXEC_ARCH_MASK 0xffff0000
/*
diff --git a/include/uapi/linux/reboot.h b/include/uapi/linux/reboot.h
index 58e64398efc5..aac2f2f94a98 100644
--- a/include/uapi/linux/reboot.h
+++ b/include/uapi/linux/reboot.h
@@ -34,7 +34,7 @@
#define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4
#define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2
#define LINUX_REBOOT_CMD_KEXEC 0x45584543
-
+#define LINUX_REBOOT_CMD_MULTIKERNEL 0x4D4B4C49
#endif /* _UAPI_LINUX_REBOOT_H */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 28008e3d462e..49e62f804674 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
+#include <linux/memblock.h>
#include "kexec_internal.h"
@@ -27,6 +28,7 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
int ret;
struct kimage *image;
bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+ bool multikernel_load = flags & KEXEC_MULTIKERNEL;
#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
@@ -37,6 +39,30 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
}
#endif
+#if 0
+ if (multikernel_load) {
+ // Check if entry is in a reserved memory region
+ bool in_reserved_region = false;
+ phys_addr_t start, end;
+ u64 i;
+
+ for_each_reserved_mem_range(i, &start, &end) {
+ if (entry >= start && entry < end) {
+ in_reserved_region = true;
+ break;
+ }
+ }
+
+ if (!in_reserved_region) {
+ pr_err("Entry point 0x%lx is not in a reserved memory region\n", entry);
+ return -EADDRNOTAVAIL; // Return an error if not in a reserved region
+ }
+
+ pr_info("multikernel load: got to multikernel_load syscall, entry 0x%lx, nr_segments %lu, flags 0x%lx\n",
+ entry, nr_segments, flags);
+ }
+#endif
+
/* Allocate and initialize a controlling structure */
image = do_kimage_alloc_init();
if (!image)
@@ -54,10 +80,16 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
}
#endif
+ if (multikernel_load) {
+ image->type = KEXEC_TYPE_MULTIKERNEL;
+ }
+
ret = sanity_check_segment_list(image);
if (ret)
goto out_free_image;
+ if (multikernel_load)
+ goto done;
/*
* Find a location for the control code buffer, and add it
* the vector of segments so that it's pages will also be
@@ -79,6 +111,7 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
}
}
+done:
*rimage = image;
return 0;
out_free_control_pages:
@@ -139,9 +172,11 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
image->hotplug_support = 1;
#endif
- ret = machine_kexec_prepare(image);
- if (ret)
- goto out;
+ if (!(flags & KEXEC_MULTIKERNEL)) {
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+ }
/*
* Some architecture(like S390) may touch the crash memory before
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 31203f0bacaf..35a66c8dd78b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -41,6 +41,7 @@
#include <linux/objtool.h>
#include <linux/kmsg_dump.h>
#include <linux/dma-map-ops.h>
+#include <linux/memblock.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -211,6 +212,32 @@ int sanity_check_segment_list(struct kimage *image)
}
#endif
+#if 0
+ if (image->type == KEXEC_TYPE_MULTIKERNEL) {
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+ phys_addr_t start, end;
+ bool in_reserved_region = false;
+ u64 i;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ for_each_reserved_mem_range(i, &start, &end) {
+ if (mstart >= start && mend <= end) {
+ in_reserved_region = true;
+ break;
+ }
+ }
+
+ if (!in_reserved_region) {
+ pr_err("Segment 0x%lx-0x%lx is not in a reserved memory region\n",
+ mstart, mend);
+ return -EADDRNOTAVAIL;
+ }
+ }
+ }
+#endif
+
/*
* The destination addresses are searched from system RAM rather than
* being allocated from the buddy allocator, so they are not guaranteed
@@ -943,6 +970,84 @@ static int kimage_load_crash_segment(struct kimage *image, int idx)
}
#endif
+static int kimage_load_multikernel_segment(struct kimage *image, int idx)
+{
+ /* For multikernel we simply copy the data from
+ * user space to its destination.
+ * We do things a page at a time for the sake of kmap.
+ */
+ struct kexec_segment *segment = &image->segment[idx];
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+ pr_info("Loading multikernel segment: mem=0x%lx, memsz=0x%zx, buf=0x%px, bufsz=0x%zx\n",
+ maddr, mbytes, buf, ubytes);
+ while (mbytes) {
+ char *ptr;
+ size_t uchunk, mchunk;
+ unsigned long page_addr = maddr & PAGE_MASK;
+ unsigned long page_offset = maddr & ~PAGE_MASK;
+
+ /* Use memremap to map the physical address */
+ ptr = memremap(page_addr, PAGE_SIZE, MEMREMAP_WB);
+ if (!ptr) {
+ pr_err("Failed to memremap memory at 0x%lx\n", page_addr);
+ result = -ENOMEM;
+ goto out;
+ }
+
+ /* Adjust pointer to the offset within the page */
+ ptr += page_offset;
+
+ /* Calculate chunk sizes */
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE - page_offset);
+ uchunk = min(ubytes, mchunk);
+
+ /* Zero the trailing part of the page if needed */
+ if (mchunk > uchunk) {
+ /* Zero the trailing part of the page */
+ memset(ptr + uchunk, 0, mchunk - uchunk);
+ }
+
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
+
+ /* Clean up */
+ memunmap(ptr - page_offset);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ maddr += mchunk;
+ mbytes -= mchunk;
+
+ cond_resched();
+ }
+out:
+ return result;
+}
+
int kimage_load_segment(struct kimage *image, int idx)
{
int result = -ENOMEM;
@@ -956,6 +1061,9 @@ int kimage_load_segment(struct kimage *image, int idx)
result = kimage_load_crash_segment(image, idx);
break;
#endif
+ case KEXEC_TYPE_MULTIKERNEL:
+ result = kimage_load_multikernel_segment(image, idx);
+ break;
}
return result;
@@ -1230,3 +1338,30 @@ int kernel_kexec(void)
kexec_unlock();
return error;
}
+
+int multikernel_kexec(int cpu)
+{
+ int rc;
+
+ pr_info("multikernel kexec: cpu %d\n", cpu);
+
+ if (cpu_online(cpu)) {
+ pr_err("CPU %d is currently running with this kernel instance.\n", cpu);
+ return -EBUSY;
+ }
+
+ if (!kexec_trylock())
+ return -EBUSY;
+ if (!kexec_image) {
+ rc = -EINVAL;
+ goto unlock;
+ }
+
+ cpus_read_lock();
+ rc = multikernel_kick_ap(cpu, kexec_image->start);
+ cpus_read_unlock();
+
+unlock:
+ kexec_unlock();
+ return rc;
+}
diff --git a/kernel/reboot.c b/kernel/reboot.c
index ec087827c85c..f3ac703c4695 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -717,6 +717,10 @@ EXPORT_SYMBOL_GPL(kernel_power_off);
DEFINE_MUTEX(system_transition_mutex);
+struct multikernel_boot_args {
+ int cpu;
+};
+
/*
* Reboot system call: for obvious reasons only root may call it,
* and even root needs to set up some magic numbers in the registers
@@ -729,6 +733,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
void __user *, arg)
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
+ struct multikernel_boot_args boot_args;
char buffer[256];
int ret = 0;
@@ -799,6 +804,11 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
case LINUX_REBOOT_CMD_KEXEC:
ret = kernel_kexec();
break;
+ case LINUX_REBOOT_CMD_MULTIKERNEL:
+ if (copy_from_user(&boot_args, arg, sizeof(boot_args)))
+ return -EFAULT;
+ ret = multikernel_kexec(boot_args.cpu);
+ break;
#endif
#ifdef CONFIG_HIBERNATION
--
2.34.1
Powered by blists - more mailing lists