[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <35c58191-f774-40cf-8d66-d1e2aaf11a62@intel.com>
Date: Mon, 28 Apr 2025 15:05:55 -0700
From: Dave Hansen <dave.hansen@...el.com>
To: Changyuan Lyu <changyuanl@...gle.com>, linux-kernel@...r.kernel.org
Cc: akpm@...ux-foundation.org, anthony.yznaga@...cle.com, arnd@...db.de,
ashish.kalra@....com, benh@...nel.crashing.org, bp@...en8.de,
catalin.marinas@....com, corbet@....net, dave.hansen@...ux.intel.com,
devicetree@...r.kernel.org, dwmw2@...radead.org, ebiederm@...ssion.com,
graf@...zon.com, hpa@...or.com, jgowans@...zon.com,
kexec@...ts.infradead.org, krzk@...nel.org,
linux-arm-kernel@...ts.infradead.org, linux-doc@...r.kernel.org,
linux-mm@...ck.org, luto@...nel.org, mark.rutland@....com, mingo@...hat.com,
pasha.tatashin@...een.com, pbonzini@...hat.com, peterz@...radead.org,
ptyadav@...zon.de, robh@...nel.org, rostedt@...dmis.org, rppt@...nel.org,
saravanak@...gle.com, skinsburskii@...ux.microsoft.com, tglx@...utronix.de,
thomas.lendacky@....com, will@...nel.org, x86@...nel.org
Subject: Re: [PATCH v6 11/14] x86: add KHO support
On 4/10/25 22:37, Changyuan Lyu wrote:
> From: Alexander Graf <graf@...zon.com>
>
> We now have all bits in place to support KHO kexecs. This patch adds
Please use imperative voice and also avoid the "this patch" stuff.
...
> +/*
> + * If KHO is active, only process its scratch areas to ensure we are not
> + * stepping onto preserved memory.
> + */
Same thing on the imperative voice here.
I'm also not fully understanding the comment. Do these "scratch" regions
basically represent all the memory that's not being handed over? It's
not obvious.
> +#ifdef CONFIG_KEXEC_HANDOVER
> +static bool process_kho_entries(unsigned long minimum, unsigned long image_size)
> +{
> + struct kho_scratch *kho_scratch;
> + struct setup_data *ptr;
> + int i, nr_areas = 0;
Do these really need actual #ifdefs or will a nice IS_ENABLED() check
work instead?
> + ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
What's with the double cast?
> + while (ptr) {
> + if (ptr->type == SETUP_KEXEC_KHO) {
> + struct kho_data *kho = (struct kho_data *)ptr->data;
> +
> + kho_scratch = (void *)kho->scratch_addr;
> + nr_areas = kho->scratch_size / sizeof(*kho_scratch);
> +
> + break;
> + }
> +
> + ptr = (struct setup_data *)(unsigned long)ptr->next;
> + }
Wow, there are a solid number of these loops, each with similarly fun
casting.
> + if (!nr_areas)
> + return false;
> +
> + for (i = 0; i < nr_areas; i++) {
> + struct kho_scratch *area = &kho_scratch[i];
> + struct mem_vector region = {
> + .start = area->addr,
> + .size = area->size,
> + };
> +
> + if (process_mem_region(®ion, minimum, image_size))
> + break;
> + }
> +
> + return true;
> +}
> +#else
> +static inline bool process_kho_entries(unsigned long minimum,
> + unsigned long image_size)
> +{
> + return false;
> +}
> +#endif
> +
> static unsigned long find_random_phys_addr(unsigned long minimum,
> unsigned long image_size)
> {
> @@ -775,7 +824,8 @@ static unsigned long find_random_phys_addr(unsigned long minimum,
> return 0;
> }
>
> - if (!process_efi_entries(minimum, image_size))
> + if (!process_kho_entries(minimum, image_size) &&
> + !process_efi_entries(minimum, image_size))
> process_e820_entries(minimum, image_size);
Oh, so KHO really does replace the other memory maps here. That seems
important to call out.
> phys_addr = slots_fetch_random();
> diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
> index ad9212df0ec0c..906ab5e6d25fe 100644
> --- a/arch/x86/include/asm/setup.h
> +++ b/arch/x86/include/asm/setup.h
> @@ -67,6 +67,10 @@ extern void x86_ce4100_early_setup(void);
> static inline void x86_ce4100_early_setup(void) { }
> #endif
>
> +#ifdef CONFIG_KEXEC_HANDOVER
> +#include <linux/kexec_handover.h>
> +#endif
Please do this #ifdef'ing inside the kexec_handover.h header if at all
possible.
> diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h
> index 50c45ead4e7c9..2671c4e1b3a0b 100644
> --- a/arch/x86/include/uapi/asm/setup_data.h
> +++ b/arch/x86/include/uapi/asm/setup_data.h
> @@ -13,7 +13,8 @@
> #define SETUP_CC_BLOB 7
> #define SETUP_IMA 8
> #define SETUP_RNG_SEED 9
> -#define SETUP_ENUM_MAX SETUP_RNG_SEED
> +#define SETUP_KEXEC_KHO 10
> +#define SETUP_ENUM_MAX SETUP_KEXEC_KHO
>
> #define SETUP_INDIRECT (1<<31)
> #define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT)
> @@ -78,6 +79,16 @@ struct ima_setup_data {
> __u64 size;
> } __attribute__((packed));
>
> +/*
> + * Locations of kexec handover metadata
> + */
> +struct kho_data {
> + __u64 fdt_addr;
> + __u64 fdt_size;
> + __u64 scratch_addr;
> + __u64 scratch_size;
> +} __attribute__((packed));
> +
> #endif /* __ASSEMBLER__ */
>
> #endif /* _UAPI_ASM_X86_SETUP_DATA_H */
> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
> index 57120f0749cc3..c314212a5ecd5 100644
> --- a/arch/x86/kernel/e820.c
> +++ b/arch/x86/kernel/e820.c
> @@ -1300,6 +1300,24 @@ void __init e820__memblock_setup(void)
> memblock_add(entry->addr, entry->size);
> }
>
> + /*
> + * At this point with KHO we only allocate from scratch memory.
> + * At the same time, we configure memblock to only allow
> + * allocations from memory below ISA_END_ADDRESS which is not
> + * a natural scratch region, because Linux ignores memory below
> + * ISA_END_ADDRESS at runtime. Beside very few (if any) early
> + * allocations, we must allocate real-mode trapoline below
trampoline ^
> + * ISA_END_ADDRESS.
> + *
> + * To make sure that we can actually perform allocations during
> + * this phase, let's mark memory below ISA_END_ADDRESS as scratch
> + * so we can allocate from there in a scratch-only world.
> + *
> + * After real mode trampoline is allocated, we clear scratch
> + * marking from the memory below ISA_END_ADDRESS
> + */
> + memblock_mark_kho_scratch(0, ISA_END_ADDRESS);
This isn't making a whole ton of sense to me.
Is this *only* to facilitate possible users that need <ISA_END_ADDRESS
allocations? If so, please say that.
I _think_ this is trying to say that KHO kernels are special and are
trying to only allocate from scratch areas. But <ISA_END_ADDRESS
allocations are both necessary and not marked by KHO _as_ a scratch area
which causes a problem.
Also, it's a bit goofy that this code does the "mark" with
ISA_END_ADDRESS and the clear with SZ_1M:
memblock_clear_kho_scratch(0, SZ_1M);
That seems unnecessarily oblique.
> /* Throw away partial pages: */
> memblock_trim_memory(PAGE_SIZE);
>
> diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
> index 68530fad05f74..518635cc0876c 100644
> --- a/arch/x86/kernel/kexec-bzimage64.c
> +++ b/arch/x86/kernel/kexec-bzimage64.c
> @@ -233,6 +233,31 @@ setup_ima_state(const struct kimage *image, struct boot_params *params,
> #endif /* CONFIG_IMA_KEXEC */
> }
>
> +static void setup_kho(const struct kimage *image, struct boot_params *params,
> + unsigned long params_load_addr,
> + unsigned int setup_data_offset)
> +{
> +#ifdef CONFIG_KEXEC_HANDOVER
Can this #ifdef be replaced with IS_ENABLED()?
> + struct setup_data *sd = (void *)params + setup_data_offset;
> + struct kho_data *kho = (void *)sd + sizeof(*sd);
> +
> + sd->type = SETUP_KEXEC_KHO;
> + sd->len = sizeof(struct kho_data);
> +
> + /* Only add if we have all KHO images in place */
> + if (!image->kho.fdt || !image->kho.scratch)
> + return;
> +
> + /* Add setup data */
> + kho->fdt_addr = image->kho.fdt;
> + kho->fdt_size = PAGE_SIZE;
> + kho->scratch_addr = image->kho.scratch->mem;
> + kho->scratch_size = image->kho.scratch->bufsz;
> + sd->next = params->hdr.setup_data;
> + params->hdr.setup_data = params_load_addr + setup_data_offset;
> +#endif /* CONFIG_KEXEC_HANDOVER */
> +}
> +
> static int
> setup_boot_parameters(struct kimage *image, struct boot_params *params,
> unsigned long params_load_addr,
> @@ -312,6 +337,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
> sizeof(struct ima_setup_data);
> }
>
> + if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
> + /* Setup space to store preservation metadata */
> + setup_kho(image, params, params_load_addr, setup_data_offset);
> + setup_data_offset += sizeof(struct setup_data) +
> + sizeof(struct kho_data);
> + }
This is a bit weird that it needs this IS_ENABLED() check and the
earlier #ifdef. But I guess it's following the IMA_KEXEC code's pattern
at least.
> /* Setup RNG seed */
> setup_rng_seed(params, params_load_addr, setup_data_offset);
>
> @@ -479,6 +511,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
> kbuf.bufsz += sizeof(struct setup_data) +
> sizeof(struct ima_setup_data);
>
> + if (IS_ENABLED(CONFIG_KEXEC_HANDOVER))
> + kbuf.bufsz += sizeof(struct setup_data) +
> + sizeof(struct kho_data);
> +
> params = kzalloc(kbuf.bufsz, GFP_KERNEL);
> if (!params)
> return ERR_PTR(-ENOMEM);
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 766176c4f5ee8..496dae89cf95d 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -451,6 +451,28 @@ int __init ima_get_kexec_buffer(void **addr, size_t *size)
> }
> #endif
>
> +static void __init add_kho(u64 phys_addr, u32 data_len)
> +{
> +#ifdef CONFIG_KEXEC_HANDOVER
> + struct kho_data *kho;
> + u64 addr = phys_addr + sizeof(struct setup_data);
> + u64 size = data_len - sizeof(struct setup_data);
> +
> + kho = early_memremap(addr, size);
> + if (!kho) {
> + pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n",
> + addr, size);
> + return;
> + }
> +
> + kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size);
> +
> + early_memunmap(kho, size);
> +#else
> + pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. Ignoring.\n");
> +#endif
> +}
Please axe the #ifdef in the .c file if at all possible, just like the
others.
> static void __init parse_setup_data(void)
> {
> struct setup_data *data;
> @@ -479,6 +501,9 @@ static void __init parse_setup_data(void)
> case SETUP_IMA:
> add_early_ima_buffer(pa_data);
> break;
> + case SETUP_KEXEC_KHO:
> + add_kho(pa_data, data_len);
> + break;
> case SETUP_RNG_SEED:
> data = early_memremap(pa_data, data_len);
> add_bootloader_randomness(data->data, data->len);
> diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
> index f9bc444a3064d..9b9f4534086d2 100644
> --- a/arch/x86/realmode/init.c
> +++ b/arch/x86/realmode/init.c
> @@ -65,6 +65,8 @@ void __init reserve_real_mode(void)
> * setup_arch().
> */
> memblock_reserve(0, SZ_1M);
> +
> + memblock_clear_kho_scratch(0, SZ_1M);
Eek. Needs a comment, badly.
Overall this doesn't look bad. It might be useful to break it up into a
couple of patches, but it's not horrific as-is.
Powered by blists - more mailing lists