[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230207113813.rqwpuaung2hr433q@box.shutemov.name>
Date: Tue, 7 Feb 2023 14:38:13 +0300
From: "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: x86@...nel.org, Borislav Petkov <bp@...en8.de>,
Masami Hiramatsu <mhiramat@...nel.org>,
Andrew Cooper <Andrew.Cooper3@...rix.com>,
Josh Poimboeuf <jpoimboe@...hat.com>,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH] x86/alternative: Support relocations in alternatives
On Mon, Feb 06, 2023 at 04:05:38PM +0100, Peter Zijlstra wrote:
>
> A little while ago someone (Kirill) ran into the whole 'alternatives don't
> do relocations nonsense' again and I got annoyed enough to actually look
> at the code.
>
> Since the whole alternative machinery already fully decodes the
> instructions it is simple enough to adjust immediates and displacement
> when needed. Specifically, the immediates for IP modifying instructions
> (JMP, CALL, Jcc) and the displacement for RIP-relative instructions.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> ---
> arch/x86/kernel/alternative.c | 248 +++++++++++++++++++++++++--------------
> tools/objtool/arch/x86/special.c | 8 -
> 2 files changed, 161 insertions(+), 95 deletions(-)
>
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -123,71 +123,6 @@ extern s32 __smp_locks[], __smp_locks_en
> void text_poke_early(void *addr, const void *opcode, size_t len);
>
> /*
> - * Are we looking at a near JMP with a 1 or 4-byte displacement.
> - */
> -static inline bool is_jmp(const u8 opcode)
> -{
> - return opcode == 0xeb || opcode == 0xe9;
> -}
> -
> -static void __init_or_module
> -recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
> -{
> - u8 *next_rip, *tgt_rip;
> - s32 n_dspl, o_dspl;
> - int repl_len;
> -
> - if (a->replacementlen != 5)
> - return;
> -
> - o_dspl = *(s32 *)(insn_buff + 1);
> -
> - /* next_rip of the replacement JMP */
> - next_rip = repl_insn + a->replacementlen;
> - /* target rip of the replacement JMP */
> - tgt_rip = next_rip + o_dspl;
> - n_dspl = tgt_rip - orig_insn;
> -
> - DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
> -
> - if (tgt_rip - orig_insn >= 0) {
> - if (n_dspl - 2 <= 127)
> - goto two_byte_jmp;
> - else
> - goto five_byte_jmp;
> - /* negative offset */
> - } else {
> - if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
> - goto two_byte_jmp;
> - else
> - goto five_byte_jmp;
> - }
> -
> -two_byte_jmp:
> - n_dspl -= 2;
> -
> - insn_buff[0] = 0xeb;
> - insn_buff[1] = (s8)n_dspl;
> - add_nops(insn_buff + 2, 3);
> -
> - repl_len = 2;
> - goto done;
> -
> -five_byte_jmp:
> - n_dspl -= 5;
> -
> - insn_buff[0] = 0xe9;
> - *(s32 *)&insn_buff[1] = n_dspl;
> -
> - repl_len = 5;
> -
> -done:
> -
> - DPRINTK("final displ: 0x%08x, JMP 0x%lx",
> - n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
> -}
> -
> -/*
> * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
> *
> * @instr: instruction byte stream
> @@ -254,6 +189,127 @@ static void __init_or_module noinline op
> }
>
> /*
> + * What we start with is:
> + *
> + * src_imm = target - src_next_ip (1)
> + *
> + * what we want is:
> + *
> + * dst_imm = target - dst_next_ip (2)
> + *
> + * so what we do is rework (1) as an expression for target like:
> + *
> + * target = src_imm + src_next_ip (1a)
> + *
> + * and substitute in (2) to get:
> + *
> + * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
> + *
> + * Now, since the instruction stream is 'identical' at src and dst (we copy
> + * after all) we can state that:
> + *
> + * src_next_ip = src + ip_offset
> + * dst_next_ip = dst + ip_offset (4)
> + *
> + * Substitute (4) in (3) and observe ip_offset being cancelled out to
> + * obtain:
> + *
> + * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
> + * = src_imm + src - dst + ip_offset - ip_offset
> + * = src_imm + src - dst (5)
> + *
> + * IOW, only the relative displacement of the code block matters.
> + */
> +
> +#define apply_reloc_n(n_, p_, d_) \
> + do { \
> + s32 v = *(s##n_ *)(p_); \
> + v += (d_); \
> + BUG_ON((v >> 31) != (v >> (n_-1))); \
> + *(s##n_ *)(p_) = (s##n_)v; \
> + } while (0)
> +
> +
> +static __always_inline
> +void apply_reloc(int n, void *ptr, uintptr_t diff)
> +{
> + switch (n) {
> + case 1: apply_reloc_n(8, ptr, diff); break;
> + case 2: apply_reloc_n(16, ptr, diff); break;
> + case 4: apply_reloc_n(32, ptr, diff); break;
> + default: BUG();
> + }
> +}
> +
> +static __always_inline
> +bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
> +{
> + u8 *target = src + offset;
> + /*
> + * If the target is inside the patched block, it's relative to the
> + * block itself and does not need relocation.
> + */
> + return (target < src || target > src + src_len);
> +}
> +
> +static void __init_or_module noinline
> +apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
> +{
> + for (int next, i = 0; i < len; len = next) {
'len = next'? I guess it suppose to be 'i = next', right? Otherwise it
hangs for me.
And if I fix this, I get this crash later on:
BUG: unable to handle page fault for address: ffffffff53fb2f2e
#PF: supervisor instruction fetch in kernel mode
#PF: error_code(0x0010) - not-present page
PGD 283b067 P4D 283b067 PUD 0
Oops: 0010 [#1] PREEMPT SMP PTI
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.2.0-rc7-00012-ga7fd2d79efc5-dirty #1736
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
RIP: 0010:0xffffffff53fb2f2e
Code: Unable to access opcode bytes at 0xffffffff53fb2f04.
RSP: 0000:ffffffff82803d58 EFLAGS: 00010206
RAX: 0000000000000bc9 RBX: 0000000000000005 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffffff826be1f9 RDI: ffffffff82662566
RBP: ffffffff82803d8a R08: 0000000000000001 R09: 0000000000000001
R10: 0000000000000005 R11: 00000000ffffffff R12: ffffffff836f1a37
R13: ffffffff8388c65d R14: ffffffff8388bef0 R15: ffffffff82803d8a
FS: 0000000000000000(0000) GS:ffff8881f9c00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffff53fb2f04 CR3: 0000000002836001 CR4: 0000000000370ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? text_poke_early+0x46/0x5d
? text_poke_early+0x3a/0x5d
? apply_alternatives+0x258/0x287
? lock_release+0x1d8/0x2e0
? _raw_spin_unlock_irqrestore+0x2b/0x50
? synchronize_rcu+0x1b0/0x1d0
? synchronize_rcu+0x16c/0x1d0
? synchronize_rcu+0x16c/0x1d0
? lockdep_hardirqs_on+0x79/0x100
? synchronize_rcu+0x16c/0x1d0
? lock_release+0x13a/0x2e0
? _raw_spin_unlock_irqrestore+0x2b/0x50
? _raw_spin_unlock_irqrestore+0x2b/0x50
? lockdep_hardirqs_on+0x79/0x100
? alternative_instructions+0x35/0xd1
? check_bugs+0xd6f/0xdc3
? start_kernel+0x674/0x6a8
? secondary_startup_64_no_verify+0xe0/0xeb
</TASK>
CR2: ffffffff53fb2f2e
---[ end trace 0000000000000000 ]---
It happens when applied on top of current Linus' tree. My config is
attached just in case.
Maybe you forgot to fold in some fixup, I donno.
--
Kiryl Shutsemau / Kirill A. Shutemov
View attachment ".config" of type "text/plain" (143743 bytes)
Powered by blists - more mailing lists