Message-ID: <1D279568-F296-40F1-9DFB-6AB3F3E589DD@infradead.org>
Date: Fri, 08 Nov 2024 04:29:33 -0800
From: David Woodhouse <dwmw2@...radead.org>
To: "H. Peter Anvin" <hpa@...or.com>, kexec@...ts.infradead.org
CC: Thomas Gleixner <tglx@...utronix.de>, Ingo Molnar <mingo@...hat.com>,
Borislav Petkov <bp@...en8.de>, Dave Hansen <dave.hansen@...ux.intel.com>,
x86@...nel.org, David Woodhouse <dwmw@...zon.co.uk>,
"Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>,
Kai Huang <kai.huang@...el.com>, Nikolay Borisov <nik.borisov@...e.com>,
linux-kernel@...r.kernel.org, Simon Horman <horms@...nel.org>
Subject: Re: [RFC PATCH 2/2] x86/kexec: Add data section to relocate_kernel
On 8 November 2024 03:26:58 GMT-08:00, "H. Peter Anvin" <hpa@...or.com> wrote:
>On November 8, 2024 6:22:41 AM GMT+01:00, David Woodhouse <dwmw2@...radead.org> wrote:
>>From: David Woodhouse <dwmw@...zon.co.uk>
>>
>>Now that it's handled sanely by a linker script we can have actual data,
>>and just use %rip-relative addressing to access it.
>>
>>If we could call the *copy* instead of the original relocate_kernel in
>>the kernel text, then we could use %rip-relative addressing everywhere.
>>
>>Signed-off-by: David Woodhouse <dwmw@...zon.co.uk>
>>---
>> arch/x86/kernel/relocate_kernel_64.S | 58 ++++++++++++++++------------
>> arch/x86/kernel/vmlinux.lds.S | 2 +-
>> 2 files changed, 35 insertions(+), 25 deletions(-)
>>
>>diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
>>index 1efcbd340528..577aa1672349 100644
>>--- a/arch/x86/kernel/relocate_kernel_64.S
>>+++ b/arch/x86/kernel/relocate_kernel_64.S
>>@@ -27,18 +27,28 @@
>> * ~ control_page + PAGE_SIZE are used as data storage and stack for
>> * jumping back
>> */
>>-#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
>>
>>+ .section .data.relocate_kernel,"a";
>> /* Minimal CPU state */
>>-#define RSP DATA(0x0)
>>-#define CR0 DATA(0x8)
>>-#define CR3 DATA(0x10)
>>-#define CR4 DATA(0x18)
>>-
>>+SYM_DATA_LOCAL(saved_rsp, .quad 0)
>>+SYM_DATA_LOCAL(saved_cr0, .quad 0)
>>+SYM_DATA_LOCAL(saved_cr3, .quad 0)
>>+SYM_DATA_LOCAL(saved_cr4, .quad 0)
>> /* other data */
>>-#define CP_PA_TABLE_PAGE DATA(0x20)
>>-#define CP_PA_SWAP_PAGE DATA(0x28)
>>-#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
>>+SYM_DATA_LOCAL(pa_table_page, .quad 0)
>>+SYM_DATA_LOCAL(pa_swap_page, .quad 0)
>>+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
>>+
>>+/*
>>+ * There are two physical copies of relocate_kernel(), one in the original
>>+ * kernel text and the other copied to the control page. There is a virtual
>>+ * mapping of each, in the original kernel. It is the *original* which is
>>+ * called from machine_kexec(), largely because the copy isn't mapped as an
>>+ * executable page. Thus, this code cannot just use %rip-relative addressing
>>+ * until after the %cr3 change and the jump to identity_mapped(). Until
>>+ * then, some pointer arithmetic is required.
>>+ */
>>+#define DATA(x) (x - relocate_kernel)
>>
>> .section .text.relocate_kernel,"ax";
>> .code64
>>@@ -63,13 +73,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
>> pushf
>>
>> movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
>>- movq %rsp, RSP(%r11)
>>+ movq %rsp, DATA(saved_rsp)(%r11)
>> movq %cr0, %rax
>>- movq %rax, CR0(%r11)
>>+ movq %rax, DATA(saved_cr0)(%r11)
>> movq %cr3, %rax
>>- movq %rax, CR3(%r11)
>>+ movq %rax, DATA(saved_cr3)(%r11)
>> movq %cr4, %rax
>>- movq %rax, CR4(%r11)
>>+ movq %rax, DATA(saved_cr4)(%r11)
>>
>> /* Save CR4. Required to enable the right paging mode later. */
>> movq %rax, %r13
>>@@ -94,9 +104,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
>> movq PTR(PA_SWAP_PAGE)(%rsi), %r10
>>
>> /* save some information for jumping back */
>>- movq %r9, CP_PA_TABLE_PAGE(%r11)
>>- movq %r10, CP_PA_SWAP_PAGE(%r11)
>>- movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
>>+ movq %r9, DATA(pa_table_page)(%r11)
>>+ movq %r10, DATA(pa_swap_page)(%r11)
>>+ movq %rdi, DATA(pa_backup_pages_map)(%r11)
>>
>> /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
>> movq %rcx, %r11
>>@@ -128,7 +138,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>> /* set return address to 0 if not preserving context */
>> pushq $0
>> /* store the start address on the stack */
>>- pushq %rdx
>>+ pushq start_address(%rip)
>>
>> /*
>> * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
>>@@ -227,9 +237,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>> /* get the re-entry point of the peer system */
>> movq 0(%rsp), %rbp
>> leaq relocate_kernel(%rip), %r8
>>- movq CP_PA_SWAP_PAGE(%r8), %r10
>>- movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
>>- movq CP_PA_TABLE_PAGE(%r8), %rax
>>+ movq pa_swap_page(%rip), %r10
>>+ movq pa_backup_pages_map(%rip), %rdi
>>+ movq pa_table_page(%rip), %rax
>> movq %rax, %cr3
>> lea PAGE_SIZE(%r8), %rsp
>> call swap_pages
>>@@ -243,11 +253,11 @@ SYM_CODE_END(identity_mapped)
>> SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
>> UNWIND_HINT_END_OF_STACK
>> ANNOTATE_NOENDBR // RET target, above
>>- movq RSP(%r8), %rsp
>>- movq CR4(%r8), %rax
>>+ movq saved_rsp(%rip), %rsp
>>+ movq saved_cr4(%rip), %rax
>> movq %rax, %cr4
>>- movq CR3(%r8), %rax
>>- movq CR0(%r8), %r8
>>+ movq saved_cr3(%rip), %rax
>>+ movq saved_cr0(%rip), %r8
>> movq %rax, %cr3
>> movq %r8, %cr0
>> movq %rbp, %rax
>>diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
>>index ad451371e179..65f879b31a82 100644
>>--- a/arch/x86/kernel/vmlinux.lds.S
>>+++ b/arch/x86/kernel/vmlinux.lds.S
>>@@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot;
>> . = ALIGN(PAGE_SIZE); \
>> __relocate_kernel_start = .; \
>> *(.text.relocate_kernel); \
>>- *(.rodata.relocate_kernel); \
>>+ *(.data.relocate_kernel); \
>> __relocate_kernel_end = .;
>> #else
>> #define KEXEC_RELOCATE_KERNEL_TEXT
>
>Looks good at first glance. I'm currently traveling so I haven't fully reviewed it though.

Ta. That's good enough for me to go ahead and port the rest over.

Is there a selftest for the preserve-context mode somewhere, with a payload that just does a "ret"?
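
To be concrete, what I have in mind is roughly the (untested) sketch below:
load a one-byte RET payload with KEXEC_PRESERVE_CONTEXT and jump to it via
LINUX_REBOOT_CMD_KEXEC. The 1MiB destination address is an arbitrary
assumption on my part, as is the notion that a bare RET is a sufficient
payload; it also obviously needs CONFIG_KEXEC_JUMP=y and root.

/*
 * Minimal preserve-context ("kexec jump") smoke test: load a payload
 * consisting of a single RET instruction, jump to it, and expect to
 * come straight back with the original kernel still running.
 *
 * Assumptions: CONFIG_KEXEC_JUMP=y, run as root, and the 1MiB
 * destination is just an arbitrary page-aligned choice.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/kexec.h>
#include <linux/reboot.h>

static unsigned char payload[4096] = {
        0xc3,   /* ret: straight back via the return address kexec left on the stack */
};

int main(void)
{
        unsigned long dest = 0x100000;  /* assumed, page-aligned */
        struct kexec_segment seg = {
                .buf    = payload,
                .bufsz  = sizeof(payload),
                .mem    = (void *)dest,
                .memsz  = sizeof(payload),      /* must be page-aligned */
        };

        if (syscall(SYS_kexec_load, dest, 1UL, &seg,
                    KEXEC_PRESERVE_CONTEXT | KEXEC_ARCH_DEFAULT)) {
                perror("kexec_load");
                return 1;
        }

        /* Jump into the payload; if context is preserved we return here. */
        if (syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                    LINUX_REBOOT_CMD_KEXEC, NULL)) {
                perror("reboot(LINUX_REBOOT_CMD_KEXEC)");
                return 1;
        }

        printf("Returned from kexec jump; context preserved\n");
        return 0;
}

If that comes back and prints the final message, the whole round trip through
relocate_kernel() and back has worked with context preserved.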