From 68521da88d6d05ed2b9ad5e57a7475e8d3fdc904 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sat, 23 Dec 2017 01:59:04 +0000
Subject: [PATCH 2/8] x86: Replace indirect branches in asm with
 CALL_THUNK/JMP_THUNK macro

This uses the same retpoline thunks that the -mindirect-branch= GCC
option uses, so that indirect branches cannot be speculatively
executed through an attacker-poisoned branch predictor.

Open-code a version of the thunk in the syscall trampoline, because we
can't just jump to the existing ones.
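To illustrate what the macros expand to: with RETPOLINE enabled, each
CALL_THUNK/JMP_THUNK site branches to a per-register thunk. Below is a
rough sketch of such a thunk for %rax, modelled on the open-coded copy
added to the syscall trampoline in this patch; the label follows the
__x86.indirect_thunk.\reg naming the macros reference (the real thunks
live in retpoline.S):

	/* Sketch only; not the literal retpoline.S implementation. */
	__x86.indirect_thunk.rax:
		call	2f		/* push address of 1: as return addr */
	1:	lfence			/* speculation trap: predicted ret   */
		jmp	1b		/* targets can only spin here        */
	2:	mov	%rax, (%rsp)	/* overwrite return addr with target */
		ret			/* architecturally jumps to *%rax    */

The CPU's return-stack predictor can thus only ever send speculation
into the lfence/jmp loop, never to an attacker-chosen branch target.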
Signed-off-by: David Woodhouse
---
 arch/x86/crypto/aesni-intel_asm.S            |  5 +++--
 arch/x86/crypto/camellia-aesni-avx-asm_64.S  |  3 ++-
 arch/x86/crypto/camellia-aesni-avx2-asm_64.S |  3 ++-
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S    |  3 ++-
 arch/x86/entry/entry_32.S                    |  4 ++--
 arch/x86/entry/entry_64.S                    | 19 +++++++++++++++----
 arch/x86/include/asm/spec_ctrl.h             | 16 ++++++++++++++++
 arch/x86/kernel/ftrace_32.S                  |  6 ++++--
 arch/x86/kernel/ftrace_64.S                  |  7 ++++---
 arch/x86/lib/checksum_32.S                   |  5 +++--
 10 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 16627fe..52160d1 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,6 +32,7 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 #include <asm/frame.h>
+#include <asm/spec_ctrl.h>
 
 /*
  * The following macros are used to move an (un)aligned 16 byte value to/from
@@ -2884,7 +2885,7 @@ ENTRY(aesni_xts_crypt8)
 	pxor INC, STATE4
 	movdqu IV, 0x30(OUTP)
 
-	call *%r11
+	CALL_THUNK r11
 
 	movdqu 0x00(OUTP), INC
 	pxor INC, STATE1
@@ -2929,7 +2930,7 @@ ENTRY(aesni_xts_crypt8)
 	_aesni_gf128mul_x_ble()
 	movups IV, (IVP)
 
-	call *%r11
+	CALL_THUNK r11
 
 	movdqu 0x40(OUTP), INC
 	pxor INC, STATE1
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index f7c495e..3bf23db 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -17,6 +17,7 @@
 
 #include <linux/linkage.h>
 #include <asm/frame.h>
+#include <asm/spec_ctrl.h>
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
@@ -1227,7 +1228,7 @@ camellia_xts_crypt_16way:
 	vpxor 14 * 16(%rax), %xmm15, %xmm14;
 	vpxor 15 * 16(%rax), %xmm15, %xmm15;
 
-	call *%r9;
+	CALL_THUNK r9;
 
 	addq $(16 * 16), %rsp;
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index eee5b39..1f06c98 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -12,6 +12,7 @@
 
 #include <linux/linkage.h>
 #include <asm/frame.h>
+#include <asm/spec_ctrl.h>
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
@@ -1343,7 +1344,7 @@ camellia_xts_crypt_32way:
 	vpxor 14 * 32(%rax), %ymm15, %ymm14;
 	vpxor 15 * 32(%rax), %ymm15, %ymm15;
 
-	call *%r9;
+	CALL_THUNK r9;
 
 	addq $(16 * 32), %rsp;
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 7a7de27..fd89745 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -45,6 +45,7 @@
 
 #include <asm/inst.h>
 #include <linux/linkage.h>
+#include <asm/spec_ctrl.h>
 
 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
 
@@ -172,7 +173,7 @@ continue_block:
 	movzxw  (bufp, %rax, 2), len
 	lea	crc_array(%rip), bufp
 	lea	(bufp, len, 1), bufp
-	jmp	*bufp
+	JMP_THUNK bufp
 
 	################################################################
 	## 2a) PROCESS FULL BLOCKS:
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index ace8f32..608d198 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -290,7 +290,7 @@ ENTRY(ret_from_fork)
 
 	/* kernel thread */
 1:	movl	%edi, %eax
-	call	*%ebx
+	CALL_THUNK ebx
 	/*
 	 * A kernel thread is allowed to return here after successfully
 	 * calling do_execve().  Exit to userspace to complete the execve()
@@ -919,7 +919,7 @@ common_exception:
 	movl	%ecx, %es
 	TRACE_IRQS_OFF
 	movl	%esp, %eax			# pt_regs pointer
-	call	*%edi
+	CALL_THUNK edi
 	jmp	ret_from_exception
 END(common_exception)
 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index c6db10f..0d3ff7f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -189,7 +189,17 @@ ENTRY(entry_SYSCALL_64_trampoline)
 	 */
 	pushq	%rdi
 	movq	$entry_SYSCALL_64_stage2, %rdi
-	jmp	*%rdi
+	/*
+	 * Open-code the retpoline from retpoline.S, because we can't
+	 * just jump to it directly.
+	 */
+	ALTERNATIVE "call 2f", "jmp *%rdi", X86_FEATURE_IBRS_ATT
+1:
+	lfence
+	jmp	1b
+2:
+	mov	%rdi, (%rsp)
+	ret
 END(entry_SYSCALL_64_trampoline)
 
 .popsection
@@ -276,7 +286,8 @@ entry_SYSCALL_64_fastpath:
 	 * It might end up jumping to the slow path.  If it jumps, RAX
	 * and all argument registers are clobbered.
 	 */
-	call	*sys_call_table(, %rax, 8)
+	movq	sys_call_table(, %rax, 8), %rax
+	CALL_THUNK rax
 .Lentry_SYSCALL_64_after_fastpath_call:
 
 	movq	%rax, RAX(%rsp)
@@ -449,7 +460,7 @@ ENTRY(stub_ptregs_64)
 	jmp	entry_SYSCALL64_slow_path
 
 1:
-	jmp	*%rax				/* Called from C */
+	JMP_THUNK rax				/* Called from C */
 END(stub_ptregs_64)
 
 .macro ptregs_stub func
@@ -528,7 +539,7 @@ ENTRY(ret_from_fork)
 1:
 	/* kernel thread */
 	movq	%r12, %rdi
-	call	*%rbx
+	CALL_THUNK rbx
 	/*
 	 * A kernel thread is allowed to return here after successfully
 	 * calling do_execve().  Exit to userspace to complete the execve()
diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h
index e8ed6fd..59b664e 100644
--- a/arch/x86/include/asm/spec_ctrl.h
+++ b/arch/x86/include/asm/spec_ctrl.h
@@ -180,5 +180,21 @@ ALTERNATIVE "", __stringify(__ASM_DISABLE_IBRS_CLOBBER), X86_FEATURE_SPEC_CTRL
 ALTERNATIVE __stringify(__ASM_STUFF_RSB), "", X86_FEATURE_SMEP
 .endm
 
+.macro JMP_THUNK reg:req
+#ifdef RETPOLINE
+	ALTERNATIVE __stringify(jmp __x86.indirect_thunk.\reg), __stringify(jmp *%\reg), X86_FEATURE_IBRS_ATT
+#else
+	jmp	*%\reg
+#endif
+.endm
+
+.macro CALL_THUNK reg:req
+#ifdef RETPOLINE
+	ALTERNATIVE __stringify(call __x86.indirect_thunk.\reg), __stringify(call *%\reg), X86_FEATURE_IBRS_ATT
+#else
+	call	*%\reg
+#endif
+.endm
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_SPEC_CTRL_H */
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index b6c6468..e89c6be 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -8,6 +8,7 @@
 #include <asm/segment.h>
 #include <asm/export.h>
 #include <asm/ftrace.h>
+#include <asm/spec_ctrl.h>
 
 #ifdef CC_USING_FENTRY
 # define function_hook	__fentry__
@@ -197,7 +198,8 @@ ftrace_stub:
 	movl	0x4(%ebp), %edx
 	subl	$MCOUNT_INSN_SIZE, %eax
 
-	call	*ftrace_trace_function
+	movl	ftrace_trace_function, %ecx
+	CALL_THUNK ecx
 
 	popl	%edx
 	popl	%ecx
@@ -241,5 +243,5 @@ return_to_handler:
 	movl	%eax, %ecx
 	popl	%edx
 	popl	%eax
-	jmp	*%ecx
+	JMP_THUNK ecx
 #endif
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index c832291..6c97463 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -7,6 +7,7 @@
 #include <asm/ptrace.h>
 #include <asm/ftrace.h>
 #include <asm/export.h>
+#include <asm/spec_ctrl.h>
 
 .code64
 
@@ -286,8 +287,8 @@ trace:
 	 * ip and parent ip are used and the list function is called when
 	 * function tracing is enabled.
 	 */
-	call   *ftrace_trace_function
-
+	movq	ftrace_trace_function, %r8
+	CALL_THUNK r8
 	restore_mcount_regs
 
 	jmp fgraph_trace
@@ -329,5 +330,5 @@ GLOBAL(return_to_handler)
 	movq 8(%rsp), %rdx
 	movq (%rsp), %rax
 	addq $24, %rsp
-	jmp *%rdi
+	JMP_THUNK rdi
 #endif
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 4d34bb5..86df74c 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -29,7 +29,8 @@
 #include <asm/errno.h>
 #include <asm/asm.h>
 #include <asm/export.h>
-
+#include <asm/spec_ctrl.h>
+
 /*
  * computes a partial checksum, e.g. for TCP/UDP fragments
 */
@@ -156,7 +157,7 @@ ENTRY(csum_partial)
 	negl %ebx
 	lea 45f(%ebx,%ebx,2), %ebx
 	testl %esi, %esi
-	jmp *%ebx
+	JMP_THUNK ebx
 
 	# Handle 2-byte-aligned regions
 20:	addw (%esi), %ax
-- 
2.7.4