Message-ID: <20220716230954.036332074@linutronix.de>
Date: Sun, 17 Jul 2022 01:17:46 +0200 (CEST)
From: Thomas Gleixner <tglx@...utronix.de>
To: LKML <linux-kernel@...r.kernel.org>
Cc: x86@...nel.org, Linus Torvalds <torvalds@...ux-foundation.org>,
Tim Chen <tim.c.chen@...ux.intel.com>,
Josh Poimboeuf <jpoimboe@...nel.org>,
Andrew Cooper <Andrew.Cooper3@...rix.com>,
Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>,
Johannes Wikner <kwikner@...z.ch>,
Alyssa Milburn <alyssa.milburn@...ux.intel.com>,
Jann Horn <jannh@...gle.com>, "H.J. Lu" <hjl.tools@...il.com>,
Joao Moreira <joao.moreira@...el.com>,
Joseph Nuzman <joseph.nuzman@...el.com>,
Steven Rostedt <rostedt@...dmis.org>
Subject: [patch 23/38] x86/callthunks: Add call patching for call depth tracking

Mitigating the Intel SKL RSB underflow issue in software requires tracking
the call depth. This could be done with the help of the compiler by adding
at least 7 bytes of NOPs before every direct call, which amounts to a 15+
percent text size increase for a vmlinux built with a Debian kernel config.
While CPUs are quite efficient at ignoring NOPs, this is still a massive
I-cache penalty for all CPUs which do not have this issue.

Inflict the pain only on SKL CPUs by creating call thunks for each function
and patching the calls to invoke the thunks instead. The thunks are created
in module memory to stay within the 32-bit displacement boundary. The thunk
then does:

	ACCOUNT_DEPTH
	JMP function
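
For illustration, a rough sketch of the resulting code. The symbol names
(foo, foo_callthunk) are placeholders, and ACCOUNT_DEPTH stands in for the
actual SKL accounting template, which is only added later in the series:

	Original call site:	call	foo
	Patched call site:	call	foo_callthunk

	foo_callthunk:			# generated 16-byte thunk
		ACCOUNT_DEPTH		# template copied from callthunk_desc
		jmp	foo		# JMP32 back to the real function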

The function and call site lists are generated by objtool. The memory
requirement is 16 bytes per call thunk plus btree memory for keeping track
of them. For a Debian distro config this amounts to ~1.6MB of thunk memory
and 2MB of btree storage. This is only required when call depth tracking is
enabled on the kernel command line, so the burden falls solely on SKL[-X].
The thunks are all stored in one 2MB memory region which is mapped as a
large page to avoid ITLB pressure.
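
As a back-of-the-envelope check of those numbers (Debian distro config as
quoted above):

	~1.6MB / 16 bytes per thunk	~= 100k call targets
	thunk memory rounded up to PMD_SIZE -> a single 2MB large-page mapping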

The thunks are generated from a template and stored in the btree keyed by
their destination address. The actual call patching retrieves the thunk for
a given destination from the btree and replaces the original function call
with a call to the thunk.
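
Conceptually the patching step boils down to the following (simplified from
patch_call() in the new callthunks.c below; error handling, init text
filtering and the paravirt site variant are omitted):

	u8 bytes[8];
	void *dest, *thunk;

	dest  = call_get_dest(addr);		/* decode the 5 byte 'call dest' */
	thunk = btree_lookup64(&call_thunks, (unsigned long)dest);
	__text_gen_insn(bytes, CALL_INSN_OPCODE, addr, thunk, CALL_INSN_SIZE);
	text_poke_early(addr, bytes, CALL_INSN_SIZE);	/* addr now calls the thunk */
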
Module handling and the actual thunk code for SKL will be added in
subsequent steps.

Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
---
 arch/x86/Kconfig                   |   13 +
 arch/x86/include/asm/alternative.h |   13 +
 arch/x86/kernel/Makefile           |    2
 arch/x86/kernel/alternative.c      |    6
 arch/x86/kernel/callthunks.c       |  459 +++++++++++++++++++++++++++++++++++++
5 files changed, 493 insertions(+)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -125,6 +125,7 @@ config X86
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANTS_THP_SWAP if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
+ select BTREE if CALL_DEPTH_TRACKING
select BUILDTIME_TABLE_SORT
select CLKEVT_I8253
select CLOCKSOURCE_VALIDATE_LAST_CYCLE
@@ -2511,6 +2512,18 @@ config CALL_DEPTH_TRACKING
of this option is marginal as the call depth tracking is using
run-time generated call thunks and call patching.
+config CALL_THUNKS_DEBUG
+ bool "Enable call thunks and call depth tracking debugging"
+ depends on CALL_DEPTH_TRACKING
+ default n
+ help
+ Enable call/ret counters for imbalance detection and build in
+ noisy dmesg output about callthunk generation and call patching
+ for troubleshooting. The debug prints need to be enabled on the
+ kernel command line with 'debug-callthunks'.
+ Only enable this when you are debugging call thunks as this
+ creates a noticeable runtime overhead. If unsure, say N.
+
config CPU_IBPB_ENTRY
bool "Enable IBPB on kernel entry"
depends on CPU_SUP_AMD
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -80,6 +80,19 @@ extern void apply_returns(s32 *start, s3
extern void apply_ibt_endbr(s32 *start, s32 *end);
struct module;
+struct paravirt_patch_site;
+
+struct callthunk_sites {
+ s32 *syms_start, *syms_end;
+ s32 *call_start, *call_end;
+ struct paravirt_patch_site *pv_start, *pv_end;
+};
+
+#ifdef CONFIG_CALL_THUNKS
+extern void callthunks_patch_builtin_calls(void);
+#else
+static __always_inline void callthunks_patch_builtin_calls(void) {}
+#endif
#ifdef CONFIG_SMP
extern void alternatives_smp_module_add(struct module *mod, char *name,
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -141,6 +141,8 @@ obj-$(CONFIG_UNWINDER_GUESS) += unwind_
obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o
+obj-$(CONFIG_CALL_THUNKS) += callthunks.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -936,6 +936,12 @@ void __init alternative_instructions(voi
*/
apply_alternatives(__alt_instructions, __alt_instructions_end);
+ /*
+ * Now all calls are established. Apply the call thunks if
+ * required.
+ */
+ callthunks_patch_builtin_calls();
+
apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
#ifdef CONFIG_SMP
--- /dev/null
+++ b/arch/x86/kernel/callthunks.c
@@ -0,0 +1,459 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "callthunks: " fmt
+
+#include <linux/btree.h>
+#include <linux/memory.h>
+#include <linux/moduleloader.h>
+#include <linux/set_memory.h>
+#include <linux/vmalloc.h>
+
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/nospec-branch.h>
+#include <asm/paravirt.h>
+#include <asm/sections.h>
+#include <asm/switch_to.h>
+#include <asm/sync_core.h>
+#include <asm/text-patching.h>
+
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+static int __initdata_or_module debug_callthunks;
+
+#define prdbg(fmt, args...) \
+do { \
+ if (debug_callthunks) \
+ printk(KERN_DEBUG pr_fmt(fmt), ##args); \
+} while(0)
+
+static int __init debug_thunks(char *str)
+{
+ debug_callthunks = 1;
+ return 1;
+}
+__setup("debug-callthunks", debug_thunks);
+#else
+#define prdbg(fmt, args...) do { } while(0)
+#endif
+
+extern s32 __call_sites[], __call_sites_end[];
+extern s32 __sym_sites[], __sym_sites_end[];
+
+static struct btree_head64 call_thunks;
+
+static bool thunks_initialized __ro_after_init;
+static struct module_layout builtin_layout __ro_after_init;
+
+struct thunk_desc {
+ void *template;
+ unsigned int template_size;
+ unsigned int thunk_size;
+};
+
+static struct thunk_desc callthunk_desc __ro_after_init;
+
+struct thunk_mem {
+ void *base;
+ unsigned int size;
+ unsigned int nthunks;
+ bool is_rx;
+ struct list_head list;
+ unsigned long map[0];
+};
+
+struct thunk_mem_area {
+ struct thunk_mem *tmem;
+ unsigned long start;
+ unsigned long nthunks;
+};
+
+static LIST_HEAD(thunk_mem_list);
+
+extern void error_entry(void);
+extern void xen_error_entry(void);
+extern void paranoid_entry(void);
+
+static inline bool is_inittext(struct module_layout *layout, void *addr)
+{
+ if (!layout->mtn.mod)
+ return is_kernel_inittext((unsigned long)addr);
+
+ return within_module_init((unsigned long)addr, layout->mtn.mod);
+}
+
+static __init_or_module bool skip_addr(void *dest)
+{
+ if (dest == error_entry)
+ return true;
+ if (dest == paranoid_entry)
+ return true;
+ if (dest == xen_error_entry)
+ return true;
+ /* Does FILL_RSB... */
+ if (dest == __switch_to_asm)
+ return true;
+ /* Accounts directly */
+ if (dest == ret_from_fork)
+ return true;
+#ifdef CONFIG_FUNCTION_TRACER
+ if (dest == __fentry__)
+ return true;
+#endif
+ return false;
+}
+
+static __init_or_module void *call_get_dest(void *addr)
+{
+ struct insn insn;
+ void *dest;
+ int ret;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Patched out call? */
+ if (insn.opcode.bytes[0] != CALL_INSN_OPCODE)
+ return NULL;
+
+ dest = addr + insn.length + insn.immediate.value;
+ if (skip_addr(dest))
+ return NULL;
+ return dest;
+}
+
+static void *jump_get_dest(void *addr)
+{
+ struct insn insn;
+ int ret;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (WARN_ON_ONCE(ret))
+ return NULL;
+
+ if (insn.opcode.bytes[0] != JMP32_INSN_OPCODE) {
+ WARN_ON_ONCE(insn.opcode.bytes[0] != INT3_INSN_OPCODE);
+ return NULL;
+ }
+
+ return addr + insn.length + insn.immediate.value;
+}
+
+static __init_or_module void callthunk_free(struct thunk_mem_area *area,
+ bool set_int3)
+{
+ struct thunk_mem *tmem = area->tmem;
+ unsigned int i, size;
+ u8 *thunk, *tp;
+
+ lockdep_assert_held(&text_mutex);
+
+ prdbg("Freeing tmem %px %px %lu %lu\n", tmem->base,
+ tmem->base + area->start * callthunk_desc.thunk_size,
+ area->start, area->nthunks);
+
+ /* Jump starts right after the template */
+ thunk = tmem->base + area->start * callthunk_desc.thunk_size;
+ tp = thunk + callthunk_desc.template_size;
+
+ for (i = 0; i < area->nthunks; i++) {
+ void *dest = jump_get_dest(tp);
+
+ if (dest)
+ btree_remove64(&call_thunks, (unsigned long)dest);
+ tp += callthunk_desc.thunk_size;
+ }
+ bitmap_clear(tmem->map, area->start, area->nthunks);
+
+ if (bitmap_empty(tmem->map, tmem->nthunks)) {
+ list_del(&tmem->list);
+ prdbg("Freeing empty tmem: %px %u %u\n", tmem->base,
+ tmem->size, tmem->nthunks);
+ vfree(tmem->base);
+ kfree(tmem);
+ } else if (set_int3) {
+ size = area->nthunks * callthunk_desc.thunk_size;
+ text_poke_set_locked(thunk, 0xcc, size);
+ }
+ kfree(area);
+}
+
+static __init_or_module
+int callthunk_setup_one(void *dest, u8 *thunk, u8 *buffer,
+ struct module_layout *layout)
+{
+ unsigned long key = (unsigned long)dest;
+ u8 *jmp;
+
+ if (is_inittext(layout, dest)) {
+ prdbg("Ignoring init dest: %pS %px\n", dest, dest);
+ return 0;
+ }
+
+ /* Multiple symbols can have the same location. */
+ if (btree_lookup64(&call_thunks, key)) {
+ prdbg("Ignoring duplicate dest: %pS %px\n", dest, dest);
+ return 0;
+ }
+
+ memcpy(buffer, callthunk_desc.template, callthunk_desc.template_size);
+ jmp = thunk + callthunk_desc.template_size;
+ buffer += callthunk_desc.template_size;
+ __text_gen_insn(buffer, JMP32_INSN_OPCODE, jmp, dest, JMP32_INSN_SIZE);
+
+ return btree_insert64(&call_thunks, key, (void *)thunk, GFP_KERNEL) ? : 1;
+}
+
+static __always_inline char *layout_getname(struct module_layout *layout)
+{
+#ifdef CONFIG_MODULES
+ if (layout->mtn.mod)
+ return layout->mtn.mod->name;
+#endif
+ return "builtin";
+}
+
+static __init_or_module void patch_call(void *addr, struct module_layout *layout)
+{
+ void *thunk, *dest;
+ unsigned long key;
+ u8 bytes[8];
+
+ if (is_inittext(layout, addr))
+ return;
+
+ dest = call_get_dest(addr);
+ if (!dest || WARN_ON_ONCE(IS_ERR(dest)))
+ return;
+
+ key = (unsigned long)dest;
+ thunk = btree_lookup64(&call_thunks, key);
+
+ if (!thunk) {
+ WARN_ONCE(!is_inittext(layout, dest),
+ "Lookup %s thunk for %pS -> %pS %016lx failed\n",
+ layout_getname(layout), addr, dest, key);
+ return;
+ }
+
+ __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, thunk, CALL_INSN_SIZE);
+ text_poke_early(addr, bytes, CALL_INSN_SIZE);
+}
+
+static __init_or_module void patch_call_sites(s32 *start, s32 *end,
+ struct module_layout *layout)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++)
+ patch_call((void *)s + *s, layout);
+}
+
+static __init_or_module void
+patch_paravirt_call_sites(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end,
+ struct module_layout *layout)
+{
+ struct paravirt_patch_site *p;
+
+ for (p = start; p < end; p++)
+ patch_call(p->instr, layout);
+}
+
+static struct thunk_mem_area *callthunks_alloc(unsigned int nthunks)
+{
+ struct thunk_mem_area *area;
+ unsigned int size, mapsize;
+ struct thunk_mem *tmem;
+
+ area = kzalloc(sizeof(*area), GFP_KERNEL);
+ if (!area)
+ return NULL;
+
+ list_for_each_entry(tmem, &thunk_mem_list, list) {
+ unsigned long start;
+
+ start = bitmap_find_next_zero_area(tmem->map, tmem->nthunks,
+ 0, nthunks, 0);
+ if (start >= tmem->nthunks)
+ continue;
+ area->tmem = tmem;
+ area->start = start;
+ prdbg("Using tmem %px %px %lu %u\n", tmem->base,
+ tmem->base + start * callthunk_desc.thunk_size,
+ start, nthunks);
+ return area;
+ }
+
+ size = nthunks * callthunk_desc.thunk_size;
+ size = round_up(size, PMD_SIZE);
+ nthunks = size / callthunk_desc.thunk_size;
+ mapsize = nthunks / 8;
+
+ tmem = kzalloc(sizeof(*tmem) + mapsize, GFP_KERNEL);
+ if (!tmem)
+ goto free_area;
+ INIT_LIST_HEAD(&tmem->list);
+
+ tmem->base = __module_alloc(size, VM_HUGE_VMAP);
+ if (!tmem->base)
+ goto free_tmem;
+ memset(tmem->base, INT3_INSN_OPCODE, size);
+ tmem->size = size;
+ tmem->nthunks = nthunks;
+ list_add(&tmem->list, &thunk_mem_list);
+
+ area->tmem = tmem;
+ area->start = 0;
+ prdbg("Allocated tmem %px %x %u\n", tmem->base, size, nthunks);
+ return area;
+
+free_tmem:
+ kfree(tmem);
+free_area:
+ kfree(area);
+ return NULL;
+}
+
+static __init_or_module void callthunk_area_set_rx(struct thunk_mem_area *area)
+{
+ unsigned long base, size;
+
+ base = (unsigned long)area->tmem->base;
+ size = area->tmem->size / PAGE_SIZE;
+
+ prdbg("Set RX: %016lx %lx\n", base, size);
+ set_memory_ro(base, size);
+ set_memory_x(base, size);
+
+ area->tmem->is_rx = true;
+}
+
+static __init_or_module int callthunks_setup(struct callthunk_sites *cs,
+ struct module_layout *layout)
+{
+ u8 *tp, *thunk, *buffer, *vbuf = NULL;
+ unsigned int nthunks, bitpos;
+ struct thunk_mem_area *area;
+ int ret, text_size, size;
+ s32 *s;
+
+ lockdep_assert_held(&text_mutex);
+
+ prdbg("Setup %s\n", layout_getname(layout));
+ /* Calculate the number of thunks required */
+ nthunks = cs->syms_end - cs->syms_start;
+
+ /*
+ * thunk_size can be 0 when there are no intra module calls,
+ * but there might be still sites to patch.
+ */
+ if (!nthunks)
+ goto patch;
+
+ area = callthunks_alloc(nthunks);
+ if (!area)
+ return -ENOMEM;
+
+ bitpos = area->start;
+ thunk = area->tmem->base + bitpos * callthunk_desc.thunk_size;
+ tp = thunk;
+
+ prdbg("Thunk %px\n", tp);
+ /*
+ * If the memory area is already RX, use a temporary
+ * buffer. Otherwise just copy into the unused area
+ */
+ if (!area->tmem->is_rx) {
+ prdbg("Using thunk direct\n");
+ buffer = thunk;
+ } else {
+ size = nthunks * callthunk_desc.thunk_size;
+ vbuf = vmalloc(size);
+ if (!vbuf) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ memset(vbuf, INT3_INSN_OPCODE, size);
+ buffer = vbuf;
+ prdbg("Using thunk vbuf %px\n", vbuf);
+ }
+
+ for (s = cs->syms_start; s < cs->syms_end; s++, bitpos++) {
+ void *dest = (void *)s + *s;
+
+ ret = callthunk_setup_one(dest, tp, buffer, layout);
+ if (ret)
+ goto fail;
+ buffer += callthunk_desc.thunk_size;
+ tp += callthunk_desc.thunk_size;
+ bitmap_set(area->tmem->map, bitpos, 1);
+ area->nthunks++;
+ }
+
+ text_size = tp - thunk;
+ prdbg("Thunk %px .. %px 0x%x\n", thunk, tp, text_size);
+
+ /*
+ * If thunk memory is already RX, poke the buffer into it.
+ * Otherwise make the memory RX.
+ */
+ if (vbuf)
+ text_poke_copy_locked(thunk, vbuf, text_size);
+ else
+ callthunk_area_set_rx(area);
+ sync_core();
+
+ layout->base = thunk;
+ layout->size = text_size;
+ layout->text_size = text_size;
+ layout->arch_data = area;
+
+ vfree(vbuf);
+
+patch:
+ prdbg("Patching call sites %s\n", layout_getname(layout));
+ patch_call_sites(cs->call_start, cs->call_end, layout);
+ patch_paravirt_call_sites(cs->pv_start, cs->pv_end, layout);
+ prdbg("Patching call sites done%s\n", layout_getname(layout));
+ return 0;
+
+fail:
+ WARN_ON_ONCE(ret);
+ callthunk_free(area, false);
+ vfree(vbuf);
+ return ret;
+}
+
+static __init noinline void callthunks_init(struct callthunk_sites *cs)
+{
+ int ret;
+
+ if (!callthunk_desc.template)
+ return;
+
+ if (WARN_ON_ONCE(btree_init64(&call_thunks)))
+ return;
+
+ ret = callthunks_setup(cs, &builtin_layout);
+ if (WARN_ON_ONCE(ret))
+ return;
+
+ thunks_initialized = true;
+}
+
+void __init callthunks_patch_builtin_calls(void)
+{
+ struct callthunk_sites cs = {
+ .syms_start = __sym_sites,
+ .syms_end = __sym_sites_end,
+ .call_start = __call_sites,
+ .call_end = __call_sites_end,
+ .pv_start = __parainstructions,
+ .pv_end = __parainstructions_end
+ };
+
+ mutex_lock(&text_mutex);
+ callthunks_init(&cs);
+ mutex_unlock(&text_mutex);
+}