lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <19b78960-8c5e-fc5f-c102-12db7aedb548@loongson.cn>
Date:   Tue, 30 Aug 2022 11:25:56 +0800
From:   Youling Tang <tangyouling@...ngson.cn>
To:     Jinyang He <hejinyang@...ngson.cn>,
        Huacai Chen <chenhuacai@...nel.org>
Cc:     Baoquan He <bhe@...hat.com>,
        Eric Biederman <ebiederm@...ssion.com>,
        WANG Xuerui <kernel@...0n.name>,
        Vivek Goyal <vgoyal@...hat.com>,
        Dave Young <dyoung@...hat.com>, Guo Ren <guoren@...nel.org>,
        Jiaxun Yang <jiaxun.yang@...goat.com>,
        kexec@...ts.infradead.org, loongarch@...ts.linux.dev,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/3] LoongArch: Add kexec support

Hi, Jinyang

On 08/30/2022 09:53 AM, Jinyang He wrote:
> Hi, Youling,
>
>
> On 08/29/2022 12:37 PM, Youling Tang wrote:
>> Add three new files, kexec.h, machine_kexec.c and relocate_kernel.S to
>> the
>> LoongArch architecture that add support for the kexec re-boot mechanism
>> (CONFIG_KEXEC) on LoongArch platforms.
>>
>> Supports loading vmlinux (vmlinux.elf) in ELF format and vmlinux.efi in
>> PE format.
>>
>> I tested this on a LoongArch 3A5000 machine and it works as expected:
>>
>>   $ sudo kexec -l /boot/vmlinux.efi --reuse-cmdline
>>   $ sudo kexec -e
>>
>> Signed-off-by: Youling Tang <tangyouling@...ngson.cn>
>> ---
>>   arch/loongarch/Kconfig                  |  11 ++
>>   arch/loongarch/include/asm/kexec.h      |  58 ++++++++
>>   arch/loongarch/kernel/Makefile          |   2 +
>>   arch/loongarch/kernel/head.S            |   7 +-
>>   arch/loongarch/kernel/machine_kexec.c   | 178 ++++++++++++++++++++++++
>>   arch/loongarch/kernel/relocate_kernel.S | 125 +++++++++++++++++
>>   6 files changed, 380 insertions(+), 1 deletion(-)
>>   create mode 100644 arch/loongarch/include/asm/kexec.h
>>   create mode 100644 arch/loongarch/kernel/machine_kexec.c
>>   create mode 100644 arch/loongarch/kernel/relocate_kernel.S
>>
>> diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
>> index 45364cffc793..903c82fa958d 100644
>> --- a/arch/loongarch/Kconfig
>> +++ b/arch/loongarch/Kconfig
>> @@ -409,6 +409,17 @@ config FORCE_MAX_ZONEORDER
>>         The page size is not necessarily 4KB.  Keep this in mind
>>         when choosing a value for this option.
>>   +config KEXEC
>> +    bool "Kexec system call"
>> +    select KEXEC_CORE
>> +    help
>> +      kexec is a system call that implements the ability to shutdown
>> your
>> +      current kernel, and to start another kernel.  It is like a reboot
>> +      but it is independent of the system firmware.   And like a reboot
>> +      you can start any kernel with it, not just Linux.
>> +
>> +      The name comes from the similarity to the exec system call.
>> +
>>   config SECCOMP
>>       bool "Enable seccomp to safely compute untrusted bytecode"
>>       depends on PROC_FS
>> diff --git a/arch/loongarch/include/asm/kexec.h
>> b/arch/loongarch/include/asm/kexec.h
>> new file mode 100644
>> index 000000000000..5c9e7b5eccb8
>> --- /dev/null
>> +++ b/arch/loongarch/include/asm/kexec.h
>> @@ -0,0 +1,58 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * kexec.h for kexec
>> + *
>> + * Copyright (C) 2022 Loongson Technology Corporation Limited
>> + */
>> +
>> +#ifndef _ASM_KEXEC_H
>> +#define _ASM_KEXEC_H
>> +
>> +#include <asm/stacktrace.h>
>> +#include <asm/page.h>
>> +
>> +/* Maximum physical address we can use pages from */
>> +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
>> +/* Maximum address we can reach in physical address mode */
>> +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
>> + /* Maximum address we can use for the control code buffer */
>> +#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL)
>> +
>> +/* Reserve a page for the control code buffer */
>> +#define KEXEC_CONTROL_PAGE_SIZE PAGE_SIZE
>> +
>> +/* The native architecture */
>> +#define KEXEC_ARCH KEXEC_ARCH_LOONGARCH
>> +
>> +static inline void crash_setup_regs(struct pt_regs *newregs,
>> +                    struct pt_regs *oldregs)
>> +{
>> +    if (oldregs)
>> +        memcpy(newregs, oldregs, sizeof(*newregs));
>> +    else
>> +        prepare_frametrace(newregs);
>> +}
>> +
>> +#define ARCH_HAS_KIMAGE_ARCH
>> +
>> +struct kimage_arch {
>> +    unsigned long boot_flag;
>> +    unsigned long fdt_addr;
>> +};
>> +
>> +typedef void (*do_kexec_t)(unsigned long boot_flag,
>> +               unsigned long fdt_addr,
>> +               unsigned long first_ind_entry,
>> +               unsigned long jump_addr);
>> +
>> +struct kimage;
>> +extern const unsigned char relocate_new_kernel[];
>> +extern const size_t relocate_new_kernel_size;
>> +
>> +#ifdef CONFIG_SMP
>> +extern atomic_t kexec_ready_to_reboot;
>> +extern const unsigned char kexec_smp_wait[];
>> +extern void kexec_reboot(void);
>> +#endif
>> +
>> +#endif /* !_ASM_KEXEC_H */
>> diff --git a/arch/loongarch/kernel/Makefile
>> b/arch/loongarch/kernel/Makefile
>> index a213e994db68..20b64ac3f128 100644
>> --- a/arch/loongarch/kernel/Makefile
>> +++ b/arch/loongarch/kernel/Makefile
>> @@ -17,6 +17,8 @@ obj-$(CONFIG_CPU_HAS_FPU)    += fpu.o
>>   obj-$(CONFIG_MODULES)        += module.o module-sections.o
>>   obj-$(CONFIG_STACKTRACE)    += stacktrace.o
>>   +obj-$(CONFIG_KEXEC)             += machine_kexec.o relocate_kernel.o
>> +
>>   obj-$(CONFIG_PROC_FS)        += proc.o
>>     obj-$(CONFIG_SMP)        += smp.o
>> diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S
>> index 01bac62a6442..22bdf4928325 100644
>> --- a/arch/loongarch/kernel/head.S
>> +++ b/arch/loongarch/kernel/head.S
>> @@ -20,7 +20,12 @@
>>     _head:
>>       .word    MZ_MAGIC        /* "MZ", MS-DOS header */
>> -    .org    0x3c            /* 0x04 ~ 0x3b reserved */
>> +    .org    0x8
>> +    .quad    0            /* Image load offset from start of RAM */
>> +    .dword    _end - _text        /* Effective size of kernel image */
>> +    .quad    0
>> +    .dword    kernel_entry        /* Kernel entry point */
>> +    .org    0x3c            /* 0x28 ~ 0x3b reserved */
>>       .long    pe_header - _head    /* Offset to the PE header */
>>     pe_header:
>> diff --git a/arch/loongarch/kernel/machine_kexec.c
>> b/arch/loongarch/kernel/machine_kexec.c
>> new file mode 100644
>> index 000000000000..4ffcd4cd9c8c
>> --- /dev/null
>> +++ b/arch/loongarch/kernel/machine_kexec.c
>> @@ -0,0 +1,178 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * machine_kexec.c for kexec
>> + *
>> + * Copyright (C) 2022 Loongson Technology Corporation Limited
>> + */
>> +#include <linux/compiler.h>
>> +#include <linux/cpu.h>
>> +#include <linux/kexec.h>
>> +#include <linux/mm.h>
>> +#include <linux/delay.h>
>> +#include <linux/libfdt.h>
>> +#include <linux/of_fdt.h>
>> +
>> +#include <asm/bootinfo.h>
>> +#include <asm/cacheflush.h>
>> +#include <asm/page.h>
>> +
>> +/* 0x100000 ~ 0x200000 is safe */
>> +#define KEXEC_CTRL_CODE    TO_CACHE(0x100000UL)
>> +#define KEXEC_BLOB_ADDR    TO_CACHE(0x108000UL)
>> +
>> +static unsigned long reboot_code_buffer;
>> +#ifdef CONFIG_SMP
>> +void (*relocated_kexec_smp_wait)(void *);
>> +atomic_t kexec_ready_to_reboot = ATOMIC_INIT(0);
>> +#endif
>> +
>> +static unsigned long jump_addr;
>> +static unsigned long first_ind_entry;
>> +static unsigned long boot_flag;
>> +static unsigned long fdt_addr;
>> +
>> +static void kexec_image_info(const struct kimage *kimage)
>> +{
>> +    unsigned long i;
>> +
>> +    pr_debug("kexec kimage info:\n");
>> +    pr_debug("\ttype:        %d\n", kimage->type);
>> +    pr_debug("\tstart:       %lx\n", kimage->start);
>> +    pr_debug("\thead:        %lx\n", kimage->head);
>> +    pr_debug("\tnr_segments: %lu\n", kimage->nr_segments);
>> +
>> +    for (i = 0; i < kimage->nr_segments; i++) {
>> +        pr_debug("\t    segment[%lu]: %016lx - %016lx", i,
>> +            kimage->segment[i].mem,
>> +            kimage->segment[i].mem + kimage->segment[i].memsz);
>> +        pr_debug("\t\t0x%lx bytes, %lu pages\n",
>> +            (unsigned long)kimage->segment[i].memsz,
>> +            (unsigned long)kimage->segment[i].memsz /  PAGE_SIZE);
>> +    }
>> +}
>> +
>> +int machine_kexec_prepare(struct kimage *kimage)
>> +{
>> +    int i;
>> +    void *dtb = (void *)KEXEC_BLOB_ADDR;
>> +
>> +    kexec_image_info(kimage);
>> +
>> +    /* Find the Flattened Device Tree */
>> +    for (i = 0; i < kimage->nr_segments; i++) {
>> +        if (!fdt_check_header(kimage->segment[i].buf)) {
>> +            memcpy(dtb, kimage->segment[i].buf, SZ_64K);
>> +            kimage->arch.boot_flag = fw_arg0;
>> +            kimage->arch.fdt_addr = (unsigned long) dtb;
>> +            break;
>> +        }
>> +        continue;
>> +    }
>> +
>> +    /* kexec need a safe page to save reboot_code_buffer */
>> +    kimage->control_code_page = virt_to_page((void *)KEXEC_CTRL_CODE);
>> +
>> +    reboot_code_buffer =
>> +      (unsigned long)page_address(kimage->control_code_page);
>> +    memcpy((void *)reboot_code_buffer, relocate_new_kernel,
>> +           relocate_new_kernel_size);
> It copies the same content to KEXEC_CTRL_CODE each time; could we do this at
> boot time?
I think the copy can be done either at boot time or during the prepare
phase (RISC-V does it in the prepare phase, MIPS does it at boot time).

>
> BTW, our system always keeps the low 2MB unused, on mips-loongson and on
> LoongArch. Is that necessary on LoongArch? We cannot use the parameter
> 'mem=YYM' normally, but 'mem=YYM@2M' is OK. And the low 2MB is not
> under virtual memory management, although we can access it in the kernel.
For existing kernels, the low 2M is already reserved by
memblock_reserve(PHYS_OFFSET, 0x200000), so it may be acceptable to keep
the low-2M behavior.

Yes, we need to use "mem=YYM@2M" if the low 2M is reserved.

>
> In the kexec/kdump process, we can follow kimage_alloc_control_pages().
> Once the boot CPU has finished copying the second kernel, all CPUs can jump
> to a kernel-entry trampoline that lives in the kernel image. Then we don't
> need to worry about the code being destroyed. The kernel-entry trampoline
> gets its cpuid, keeps non-boot CPUs waiting as kexec_smp_wait does, and lets
> the boot CPU go to kernel-entry. This way we can drop the low 2MB, IMO.

It is also feasible to dynamically allocate control pages, but it is
easier to use a low 2M approach. What do you think, Huacai?

>
>> +
>> +    /* All secondary cpus now may jump to kexec_smp_wait cycle */
>> +    relocated_kexec_smp_wait = reboot_code_buffer +
>> +        (void *)(kexec_smp_wait - relocate_new_kernel);
>> +
>> +    return 0;
>> +}
>> +
>> +void machine_kexec_cleanup(struct kimage *kimage)
>> +{
>> +}
>> +
>> +#ifdef CONFIG_SMP
>> +void kexec_reboot(void)
>> +{
>> +    do_kexec_t do_kexec = NULL;
>> +
>> +    /* All secondary cpus go to kexec_smp_wait */
>> +    if (smp_processor_id() > 0) {
>> +        relocated_kexec_smp_wait(NULL);
>> +        unreachable();
>> +    }
>> +
>> +    do_kexec = (void *)reboot_code_buffer;
>> +    do_kexec(boot_flag, fdt_addr, first_ind_entry, jump_addr);
>> +
>> +    unreachable();
>> +}
>> +
>> +static void kexec_shutdown_secondary(void *)
>> +{
>> +    local_irq_disable();
>> +    while (!atomic_read(&kexec_ready_to_reboot))
>> +        cpu_relax();
>> +
>> +    kexec_reboot();
>> +}
>> +
>> +void machine_crash_shutdown(struct pt_regs *regs)
>> +{
>> +}
>> +#endif
>> +
>> +void machine_shutdown(void)
>> +{
>> +    smp_call_function(kexec_shutdown_secondary, NULL, 0);
>> +}
>> +
>> +void machine_kexec(struct kimage *image)
>> +{
>> +    unsigned long entry;
>> +    unsigned long *ptr;
>> +    struct kimage_arch *internal = &image->arch;
>> +
>> +    boot_flag = internal->boot_flag;
>> +    fdt_addr = internal->fdt_addr;
>> +
>> +    jump_addr = (unsigned long)phys_to_virt(image->start);
>> +
>> +    first_ind_entry = (unsigned long)phys_to_virt(image->head &
>> PAGE_MASK);
>> +
>> +    /*
>> +     * The generic kexec code builds a page list with physical
>> +     * addresses. they are directly accessible through XKPRANGE
>> +     * hence the phys_to_virt() call.
>> +     */
>> +    for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE);
>> +         ptr = (entry & IND_INDIRECTION) ?
>> +           phys_to_virt(entry & PAGE_MASK) : ptr + 1) {
>> +        if (*ptr & IND_SOURCE || *ptr & IND_INDIRECTION ||
>> +            *ptr & IND_DESTINATION)
>> +            *ptr = (unsigned long) phys_to_virt(*ptr);
>> +    }
>> +
>> +    /* Mark offline before disabling local irq. */
>> +    set_cpu_online(smp_processor_id(), false);
>> +
>> +    /* we do not want to be bothered. */
>> +    local_irq_disable();
>> +
>> +    pr_notice("Will call new kernel at %lx\n", jump_addr);
>> +    pr_notice("FDT image at %lx\n", fdt_addr);
>> +    pr_notice("Bye ...\n");
>> +
>> +    /* Make reboot code buffer available to the boot CPU. */
>> +    flush_cache_all();
>> +
>> +    atomic_set(&kexec_ready_to_reboot, 1);
>> +
>> +    /*
>> +     * We know we were online, and there will be no incoming IPIs at
>> +     * this point.
>> +     */
>> +    set_cpu_online(smp_processor_id(), true);
>> +
>> +    /* Ensure remote CPUs observe that we're online before rebooting. */
>> +    smp_mb__after_atomic();
>> +
>> +    kexec_reboot();
>> +}
>> diff --git a/arch/loongarch/kernel/relocate_kernel.S
>> b/arch/loongarch/kernel/relocate_kernel.S
>> new file mode 100644
>> index 000000000000..d1f242f74ea8
>> --- /dev/null
>> +++ b/arch/loongarch/kernel/relocate_kernel.S
>> @@ -0,0 +1,125 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * relocate_kernel.S for kexec
>> + *
>> + * Copyright (C) 2022 Loongson Technology Corporation Limited
>> + */
>> +
>> +#include <linux/kexec.h>
>> +
>> +#include <asm/asm.h>
>> +#include <asm/asmmacro.h>
>> +#include <asm/regdef.h>
>> +#include <asm/loongarch.h>
>> +#include <asm/stackframe.h>
>> +#include <asm/addrspace.h>
>> +
>> +#define IPI_REG_BASE 0x1fe01000
>> +
>> +SYM_CODE_START(relocate_new_kernel)
>> +    /*
>> +     * s0: Boot flag passed to the new kernel
>> +     * s1: Virt address of the FDT image
>> +     * s2: Pointer to the current entry
>> +     * s3: Virt address to jump to after relocation
>> +     */
>> +    move        s0, a0
>> +    move        s1, a1
>> +    move        s2, a2
>> +    move        s3, a3
>> +
>> +process_entry:
>> +    PTR_L        s4, s2, 0
>> +    PTR_ADDI    s2, s2, SZREG
>> +
>> +    /* destination page */
>> +    andi        s5, s4, IND_DESTINATION
>> +    beqz        s5, 1f
>> +    li.w        t0, ~0x1
>> +    and        s6, s4, t0    /* store destination addr in s6 */
>> +    b        process_entry
>> +
>> +1:
>> +    /* indirection page, update s2    */
>> +    andi        s5, s4, IND_INDIRECTION
>> +    beqz        s5, 1f
>> +    li.w        t0, ~0x2
>> +    and        s2, s4, t0
>> +    b        process_entry
>> +
>> +1:
>> +    /* done page */
>> +    andi        s5, s4, IND_DONE
>> +    beqz        s5, 1f
>> +    b        done
>> +1:
>> +    /* source page */
>> +    andi        s5, s4, IND_SOURCE
>> +    beqz        s5, process_entry
>> +    li.w        t0, ~0x8
>> +    and        s4, s4, t0
>> +    li.w        s8, (1 << _PAGE_SHIFT) / SZREG
>> +
>> +copy_word:
>> +    /* copy page word by word */
>> +    REG_L        s7, s4, 0
>> +    REG_S        s7, s6, 0
>> +    PTR_ADDI    s6, s6, SZREG
>> +    PTR_ADDI    s4, s4, SZREG
>> +    LONG_ADDI    s8, s8, -1
>> +    beqz        s8, process_entry
>> +    b        copy_word
>> +    b        process_entry
>> +
>> +done:
>> +    dbar        0
> ibar, too?

Will add ibar 0.

>> +
>> +    move        a0, s0
>> +    move        a1, s1
>> +    /* jump to the new kernel */
>> +    jr        s3
>> +SYM_CODE_END(relocate_new_kernel)
>> +
>> +#ifdef CONFIG_SMP
>> +/*
>> + * Other CPUs should wait until code is relocated and
>> + * then start at entry (?) point.
>> + */
>> +SYM_CODE_START(kexec_smp_wait)
>> +    li.d        t0, IPI_REG_BASE
>> +    li.d        t1, UNCACHE_BASE
>> +    or        t0, t0, t1
>> +
>> +    /*
>> +     * s1:initfn
>> +     * t0:base t1:cpuid t2:node t3:core t4:count
>> +     */
>> +    csrrd        t1, LOONGARCH_CSR_CPUID
>> +    andi        t1, t1, CSR_CPUID_COREID
>> +    andi        t3, t1, 0x3
>> +    slli.w        t3, t3, 8              /* get core id */
>> +    or        t0, t0, t3
>> +    andi        t2, t1, 0x3c
>> +    slli.d        t2, t2, 42             /* get node id */
>> +    or        t0, t0, t2
>> +
>> +1:    li.w        t4, 0x100              /* wait for init loop */
>> +2:    addi.w        t4, t4, -1             /* limit mailbox access */
>> +    bnez        t4, 2b
>> +    ld.w        s1, t0, 0x20           /* check PC as an indicator */
> Can we do this with iocsr*?

OK, I will consider the implementation in the iocsr way.

Thanks,
Youling
>
> Thanks,
> Jinyang
>> +    beqz        s1, 1b
>> +    ld.d        s1, t0, 0x20           /* get PC via mailbox */
>> +    ld.d        sp, t0, 0x28           /* get SP via mailbox */
>> +    ld.d        tp, t0, 0x30           /* get TP via mailbox */
>> +
>> +    li.d        t0, CACHE_BASE
>> +    or        s1, s1, t0
>> +    jr        s1                     /* jump to initial PC */
>> +SYM_CODE_END(kexec_smp_wait)
>> +#endif
>> +
>> +relocate_new_kernel_end:
>> +
>> +SYM_DATA_START(relocate_new_kernel_size)
>> +    PTR        relocate_new_kernel_end - relocate_new_kernel
>> +SYM_DATA_END(relocate_new_kernel_size)
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ