diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 356bf4b4273b..ccaf4be39d17 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1799,8 +1799,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nointroute [IA-64] + noinvpcid [X86] Disable the INVPCID cpu feature. + nojitter [IA-64] Disables jitter checking for ITC timers. + nopti [X86-64] Disable KAISER isolation of kernel from user. + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page @@ -1827,6 +1831,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nopat [X86] Disable PAT (page attribute table extension of pagetables) support. + nopcid [X86-64] Disable the PCID cpu feature. + norandmaps Don't use address space randomization. Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space @@ -2239,6 +2245,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. pt. [PARIDE] See Documentation/blockdev/paride.txt. + pti= [X86_64] + Control KAISER user/kernel address space isolation: + on - enable + off - disable + auto - default setting + pty.legacy_count= [KNL] Number of legacy pty's. Overwrites compiled-in default number. diff --git a/Makefile b/Makefile index 5f8f4c5d7c65..c726e4d97049 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 2 -SUBLEVEL = 97 +SUBLEVEL = 98 EXTRAVERSION = NAME = Saber-toothed Squirrel diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3f19c81a6203..ac9ae2289f23 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -7,6 +7,7 @@ * we just keep it from happening */ #undef CONFIG_PARAVIRT +#undef CONFIG_PAGE_TABLE_ISOLATION #ifdef CONFIG_X86_32 #define _ASM_X86_DESC_H 1 #endif diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 2b5527726ae1..130db0702d9f 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -12,6 +12,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -120,6 +124,7 @@ ENTRY(ia32_sysenter_target) CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK movq PER_CPU_VAR(kernel_stack), %rsp addq $(KERNEL_STACK_OFFSET),%rsp /* @@ -183,6 +188,7 @@ ENTRY(ia32_sysenter_target) popq_cfi %rcx /* User %esp */ CFI_REGISTER rsp,rcx TRACE_IRQS_ON + SWITCH_USER_CR3 ENABLE_INTERRUPTS_SYSEXIT32 #ifdef CONFIG_AUDITSYSCALL @@ -281,6 +287,7 @@ ENTRY(ia32_cstar_target) CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 movq PER_CPU_VAR(kernel_stack),%rsp @@ -337,6 +344,7 @@ ENTRY(ia32_cstar_target) xorq %r9,%r9 xorq %r8,%r8 TRACE_IRQS_ON + SWITCH_USER_CR3 movl RSP-ARGOFFSET(%rsp),%esp CFI_RESTORE rsp USERGS_SYSRET32 @@ -409,6 +417,7 @@ ENTRY(ia32_syscall) CFI_REL_OFFSET rip,RIP-RIP PARAVIRT_ADJUST_EXCEPTION_FRAME SWAPGS + SWITCH_KERNEL_CR3_NO_STACK /* * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 091508b533b4..e8c1d8b8d895 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -15,12 +15,53 @@ .endm #endif -.macro altinstruction_entry orig alt feature orig_len alt_len +.macro altinstruction_entry 
orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . .word \feature .byte \orig_len .byte \alt_len + .byte \pad_len +.endm + +.macro ALTERNATIVE oldinstr, newinstr, feature +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr +144: + .popsection +.endm + +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 + .skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection .endm #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 37ad100a2210..132bf12ddb6a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -47,8 +47,9 @@ struct alt_instr { s32 repl_offset; /* offset to replacement instruction */ u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction, <= instrlen */ -}; + u8 replacementlen; /* length of new instruction */ + u8 padlen; /* length of build-time padding */ +} __packed; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); @@ -75,23 +76,65 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ +#define b_replacement(num) "664"#num +#define e_replacement(num) "665"#num + +#define alt_end_marker "663" +#define alt_slen "662b-661b" +#define alt_pad_len alt_end_marker"b-662b" +#define alt_total_slen alt_end_marker"b-661b" +#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" + +#define __OLDINSTR(oldinstr, num) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \ + "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" + +#define OLDINSTR(oldinstr, num) \ + __OLDINSTR(oldinstr, num) \ + alt_end_marker ":\n" + +/* + * Pad the second replacement alternative with additional NOPs if it is + * additionally longer than the first replacement alternative. 
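The `.skip -(((new)-(old)) > 0) * ((new)-(old)),0x90` expressions above pad the original site with NOPs only when a replacement is longer than it. A minimal C rendering of that arithmetic (illustrative only, not part of the patch):

```c
#include <assert.h>

/* Computes the same value as the assembler ".skip" expression: zero when
 * the replacement fits, otherwise the number of 0x90 NOPs needed to
 * stretch the original site to the replacement length. */
static int alt_pad_len(int replacementlen, int oldlen)
{
	int diff = replacementlen - oldlen;

	return (diff > 0) * diff;
}

int main(void)
{
	assert(alt_pad_len(5, 2) == 3);	/* short original padded with 3 NOPs */
	assert(alt_pad_len(2, 5) == 0);	/* longer original needs no padding */
	return 0;
}
```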
+ */ +#define OLDINSTR_2(oldinstr, num1, num2) \ + __OLDINSTR(oldinstr, num1) \ + ".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \ + "((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n" \ + alt_end_marker ":\n" + +#define ALTINSTR_ENTRY(feature, num) \ + " .long 661b - .\n" /* label */ \ + " .long " b_replacement(num)"f - .\n" /* new instruction */ \ + " .word " __stringify(feature) "\n" /* feature bit */ \ + " .byte " alt_total_slen "\n" /* source len */ \ + " .byte " alt_rlen(num) "\n" /* replacement len */ \ + " .byte " alt_pad_len "\n" /* pad len */ + +#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ + b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t" + /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - \ - "661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - " .long 661b - .\n" /* label */ \ - " .long 663f - .\n" /* new instruction */ \ - " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .discard,\"aw\",@progbits\n" \ - " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ - ".previous\n" \ - ".section .altinstr_replacement, \"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" + OLDINSTR(oldinstr, 1) \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature, 1) \ + ".popsection\n" \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ + ".popsection" + +#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ + OLDINSTR_2(oldinstr, 1, 2) \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature1, 1) \ + ALTINSTR_ENTRY(feature2, 2) \ + ".popsection\n" \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ + ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ + ".popsection" /* * This must be included *after* the definition of ALTERNATIVE due to @@ -114,6 +157,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative(oldinstr, newinstr, feature) \ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") +#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ + asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") + /* * Alternative inline assembly with input. 
* diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h new file mode 100644 index 000000000000..84ae170bc3d0 --- /dev/null +++ b/arch/x86/include/asm/cmdline.h @@ -0,0 +1,8 @@ +#ifndef _ASM_X86_CMDLINE_H +#define _ASM_X86_CMDLINE_H + +int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); +int cmdline_find_option(const char *cmdline_ptr, const char *option, + char *buffer, int bufsize); + +#endif /* _ASM_X86_CMDLINE_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index b8a5fe545f95..5d1b51652d3a 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -176,6 +176,10 @@ #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ #define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */ + +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ +#define X86_FEATURE_KAISER ( 7*32+31) /* "" CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ @@ -198,8 +202,11 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) @@ -334,7 +341,7 @@ extern const char * const x86_power_flags[32]; */ static __always_inline __pure bool __static_cpu_has(u16 bit) { -#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 +#ifdef CC_HAVE_ASM_GOTO asm_volatile_goto("1: jmp %l[t_no]\n" "2:\n" ".section .altinstructions,\"a\"\n" @@ -343,6 +350,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (bit) : : t_no); @@ -360,6 +368,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 382ce8a9fd62..7f1ead938ec1 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -40,7 +40,7 @@ struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE))); -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); +DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page); static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 55e4de613f0e..33c8d1dea7a6 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -18,8 +18,8 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int 
irq_call_count; - unsigned int irq_tlb_count; #endif + unsigned int irq_tlb_count; #ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eb92a6ed2be7..3354a390cc71 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -164,7 +164,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); typedef int vector_irq_t[NR_VECTORS]; -DECLARE_PER_CPU(vector_irq_t, vector_irq); +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); extern void setup_vector_irq(int cpu); #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h new file mode 100644 index 000000000000..de6fd8166662 --- /dev/null +++ b/arch/x86/include/asm/kaiser.h @@ -0,0 +1,141 @@ +#ifndef _ASM_X86_KAISER_H +#define _ASM_X86_KAISER_H + +#include /* For PCID constants */ + +/* + * This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on + * the kernel virtual memory. It has a shadow pgd for every process: the + * shadow pgd has a minimalistic kernel-set mapped, but includes the whole + * user memory. Within a kernel context switch, or when an interrupt is handled, + * the pgd is switched to the normal one. When the system switches to user mode, + * the shadow pgd is enabled. By this, the virtual memory caches are freed, + * and the user may not attack the whole kernel memory. + * + * A minimalistic kernel mapping holds the parts needed to be mapped in user + * mode, such as the entry/exit functions of the user space, or the stacks. + */ + +#define KAISER_SHADOW_PGD_OFFSET 0x1000 + +#ifdef __ASSEMBLY__ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +.macro _SWITCH_TO_KERNEL_CR3 reg +movq %cr3, \reg +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ +ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID +movq \reg, %cr3 +.endm + +.macro _SWITCH_TO_USER_CR3 reg regb +/* + * regb must be the low byte portion of reg: because we have arranged + * for the low byte of the user PCID to serve as the high byte of NOFLUSH + * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are + * not enabled): so that the one register can update both memory and cr3. 
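The kernel CR3 value built by _SWITCH_TO_KERNEL_CR3 above can be restated in C as follows (a sketch for readability; the real switch stays in the assembly macro):

```c
#include <stdint.h>

#define KAISER_SHADOW_PGD_OFFSET	0x1000UL
#define X86_CR3_PCID_ASID_MASK		((1UL << 12) - 1)	/* low 12 bits */
#define X86_CR3_PCID_NOFLUSH		(1ULL << 63)

/* Kernel CR3: same page-table pair, kernel half (shadow offset cleared),
 * kernel ASID 0, and NOFLUSH set when PCID is available so the switch
 * does not throw away the kernel's TLB entries. */
static inline uint64_t kaiser_kernel_cr3(uint64_t cr3, int have_pcid)
{
	cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
	if (have_pcid)			/* ALTERNATIVE "bts $63, \reg" */
		cr3 |= X86_CR3_PCID_NOFLUSH;
	return cr3;
}
```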
+ */ +movq %cr3, \reg +orq PER_CPU_VAR(x86_cr3_pcid_user), \reg +js 9f +/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */ +movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) +9: +movq \reg, %cr3 +.endm + +.macro SWITCH_KERNEL_CR3 +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER +_SWITCH_TO_KERNEL_CR3 %rax +popq %rax +8: +.endm + +.macro SWITCH_USER_CR3 +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER +_SWITCH_TO_USER_CR3 %rax %al +popq %rax +8: +.endm + +.macro SWITCH_KERNEL_CR3_NO_STACK +ALTERNATIVE "jmp 8f", \ + __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \ + X86_FEATURE_KAISER +_SWITCH_TO_KERNEL_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +8: +.endm + +#else /* CONFIG_PAGE_TABLE_ISOLATION */ + +.macro SWITCH_KERNEL_CR3 +.endm +.macro SWITCH_USER_CR3 +.endm +.macro SWITCH_KERNEL_CR3_NO_STACK +.endm + +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Upon kernel/user mode switch, it may happen that the address + * space has to be switched before the registers have been + * stored. To change the address space, another register is + * needed. A register therefore has to be stored/restored. +*/ +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); + +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; + +extern int kaiser_enabled; +extern void __init kaiser_check_boottime_disable(void); +#else +#define kaiser_enabled 0 +static inline void __init kaiser_check_boottime_disable(void) {} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +/* + * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set, + * so as to build with tests on kaiser_enabled instead of #ifdefs. + */ + +/** + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping + * @addr: the start address of the range + * @size: the size of the range + * @flags: The mapping flags of the pages + * + * The mapping is done on a global scope, so no bigger + * synchronization has to be done. the pages have to be + * manually unmapped again when they are not needed any longer. + */ +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + +/** + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + */ +extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + +/** + * kaiser_init - Initialize the shadow mapping + * + * Most parts of the shadow mapping can be mapped upon boot + * time. Only per-process things like the thread stacks + * or a new LDT have to be mapped at runtime. These boot- + * time mappings are permanent and never unmapped. 
+ */ +extern void kaiser_init(void); + +#endif /* __ASSEMBLY */ + +#endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 926f67263287..b19dbcf5bdd6 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -20,12 +20,6 @@ typedef struct { void *vdso; } mm_context_t; -#ifdef CONFIG_SMP void leave_mm(int cpu); -#else -static inline void leave_mm(int cpu) -{ -} -#endif #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index babbcd18a7a7..6ab95775921d 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -69,82 +69,16 @@ void destroy_context(struct mm_struct *mm); static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { -#ifdef CONFIG_SMP if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); -#endif } -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned cpu = smp_processor_id(); +extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); - if (likely(prev != next)) { -#ifdef CONFIG_SMP - percpu_write(cpu_tlbstate.state, TLBSTATE_OK); - percpu_write(cpu_tlbstate.active_mm, next); -#endif - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * - */ - load_cr3(next->pgd); - - /* stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - - /* - * load the LDT, if the LDT is different: - */ - if (unlikely(prev->context.ldt != next->context.ldt)) - load_mm_ldt(next); - } -#ifdef CONFIG_SMP - else { - percpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); - - if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { - /* We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. - * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. 
- */ - load_cr3(next->pgd); - load_mm_ldt(next); - } - } -#endif -} +extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); +#define switch_mm_irqs_off switch_mm_irqs_off #define activate_mm(prev, next) \ do { \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6be990922d4b..859dc4ae92ea 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -17,6 +17,11 @@ #ifndef __ASSEMBLY__ #include +#ifdef CONFIG_PAGE_TABLE_ISOLATION +extern int kaiser_enabled; +#else +#define kaiser_enabled 0 +#endif /* * ZERO_PAGE is a global shared page that is always zero: used @@ -570,7 +575,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) static inline int pgd_bad(pgd_t pgd) { - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; + pgdval_t ignore_flags = _PAGE_USER; + /* + * We set NX on KAISER pgds that map userspace memory so + * that userspace can not meaningfully use the kernel + * page table by accident; it will fault on the first + * instruction it tries to run. See native_set_pgd(). + */ + if (kaiser_enabled) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) @@ -770,7 +785,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { - memcpy(dst, src, count * sizeof(pgd_t)); + memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (kaiser_enabled) { + /* Clone the shadow pgd part as well */ + memcpy(native_get_shadow_pgd(dst), + native_get_shadow_pgd(src), + count * sizeof(pgd_t)); + } +#endif } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 975f709e09ae..f810cca32c51 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -105,9 +105,31 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } +#ifdef CONFIG_PAGE_TABLE_ISOLATION +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); + +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) +{ +#ifdef CONFIG_DEBUG_VM + /* linux/mmdebug.h may not have been included at this point */ + BUG_ON(!kaiser_enabled); +#endif + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); +} +#else +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) +{ + return NULL; +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { - *pgdp = pgd; + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 013286a10c2c..34501e647214 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -62,7 +62,7 @@ #endif #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) @@ -74,6 +74,33 @@ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) +/* The ASID is the lower 12 bits of CR3 */ +#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) + +/* Mask for all the 
PCID-related bits in CR3: */ +#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) + +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64) +/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ +#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) + +#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) +#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) +#else +#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) +/* + * PCIDs are unsupported on 32-bit and none of these bits can be + * set in CR3: + */ +#define X86_CR3_PCID_KERN_FLUSH (0) +#define X86_CR3_PCID_USER_FLUSH (0) +#define X86_CR3_PCID_KERN_NOFLUSH (0) +#define X86_CR3_PCID_USER_NOFLUSH (0) +#endif + #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) #define _PAGE_CACHE_WB (0) #define _PAGE_CACHE_WC (_PAGE_PWT) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 2dddb317bb39..360e80d0d217 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -43,6 +43,8 @@ */ #define X86_CR3_PWT 0x00000008 /* Page Write Through */ #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH (_AC(1,ULL) << X86_CR3_PCID_NOFLUSH_BIT) /* * Intel CPU features in CR4 @@ -60,6 +62,7 @@ #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ +#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f7c89e231c6c..048249e983ca 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -266,7 +266,7 @@ struct tss_struct { } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss); /* * Save the original ist values for checking stack pointers during debugging diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index a7973ddf2d1b..e680388b0ea7 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -6,6 +6,55 @@ #include #include +#include + +static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +{ + struct { u64 d[2]; } desc = { { pcid, addr } }; + + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global + * mappings, we don't want the compiler to reorder any subsequent + * memory accesses before the TLB flush. + * + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and + * invpcid (%rcx), %rax in long mode. + */ + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR 0 +#define INVPCID_TYPE_SINGLE_CTXT 1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + +/* Flush all mappings for a given pcid and addr, not including globals. 
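native_get_shadow_pgd() in the pgtable_64.h hunk above relies on every pgd being an 8kB, 8kB-aligned pair: kernel half first, user (shadow) half second, so the shadow slot is reached by ORing PAGE_SIZE into the pointer. A standalone illustration of that address arithmetic:

```c
#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE	4096UL

/* Given an entry in the kernel half of an 8k-aligned pgd pair, the
 * corresponding shadow (user) entry lives one page higher; ORing works
 * because bit 12 of the kernel-half address is guaranteed to be clear. */
static uint64_t *shadow_pgd_entry(uint64_t *kernel_pgdp)
{
	return (uint64_t *)((uintptr_t)kernel_pgdp | PAGE_SIZE);
}

int main(void)
{
	static uint64_t pgd_pair[2 * PAGE_SIZE / sizeof(uint64_t)]
		__attribute__((aligned(2 * PAGE_SIZE)));

	/* entry 5 of the kernel pgd maps to entry 5 of the shadow pgd */
	assert(shadow_pgd_entry(&pgd_pair[5]) == &pgd_pair[512 + 5]);
	return 0;
}
```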
*/ +static inline void invpcid_flush_one(unsigned long pcid, + unsigned long addr) +{ + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} #ifdef CONFIG_PARAVIRT #include @@ -15,6 +64,24 @@ #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* + * Declare a couple of kaiser interfaces here for convenience, + * to avoid the need for asm/kaiser.h in unexpected places. + */ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +extern int kaiser_enabled; +extern void kaiser_setup_pcid(void); +extern void kaiser_flush_tlb_on_return_to_user(void); +#else +#define kaiser_enabled 0 +static inline void kaiser_setup_pcid(void) +{ +} +static inline void kaiser_flush_tlb_on_return_to_user(void) +{ +} +#endif + static inline void __native_flush_tlb(void) { /* @@ -23,6 +90,8 @@ static inline void __native_flush_tlb(void) * back: */ preempt_disable(); + if (kaiser_enabled) + kaiser_flush_tlb_on_return_to_user(); native_write_cr3(native_read_cr3()); preempt_enable(); } @@ -32,6 +101,17 @@ static inline void __native_flush_tlb_global(void) unsigned long flags; unsigned long cr4; + if (this_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all(); + return; + } + /* * Read-modify-write to CR4 - protect it from preemption and * from interrupts. (Use the raw variant because this code can @@ -40,25 +120,61 @@ static inline void __native_flush_tlb_global(void) raw_local_irq_save(flags); cr4 = native_read_cr4(); - /* clear PGE */ - native_write_cr4(cr4 & ~X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); + if (cr4 & X86_CR4_PGE) { + /* clear PGE and flush TLB of all entries */ + native_write_cr4(cr4 & ~X86_CR4_PGE); + /* restore PGE as it was before */ + native_write_cr4(cr4); + } else { + /* do it with cr3, letting kaiser flush user PCID */ + __native_flush_tlb(); + } raw_local_irq_restore(flags); } static inline void __native_flush_tlb_single(unsigned long addr) { - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + /* + * SIMICS #GP's if you run INVPCID with type 2/3 + * and X86_CR4_PCIDE clear. Shame! + * + * The ASIDs used below are hard-coded. But, we must not + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call + * invlpg in the case we are called early. + */ + + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { + if (kaiser_enabled) + kaiser_flush_tlb_on_return_to_user(); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + return; + } + /* Flush the address out of both PCIDs. */ + /* + * An optimization here might be to determine addresses + * that are only kernel-mapped and only flush the kernel + * ASID. But, userspace flushes are probably much more + * important performance-wise. + * + * Make sure to do only a single invpcid when KAISER is + * disabled and we have only a single ASID. 
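__invpcid() above hard-codes the opcode bytes (`.byte 0x66, 0x0f, 0x38, 0x82, 0x01`, i.e. `invpcid (%rcx), %rax`) so it still assembles with toolchains that predate the mnemonic. For reference, an equivalent sketch using the mnemonic; kernel-only, and it assumes an assembler new enough to know the instruction:

```c
#include <stdint.h>

/* Same descriptor layout as __invpcid(): PCID in the first quadword,
 * linear address in the second (only type 0 consumes the address;
 * types 2 and 3 ignore the PCID as well). */
static inline void invpcid_mnemonic(unsigned long pcid, unsigned long addr,
				    unsigned long type)
{
	struct { uint64_t d[2]; } desc = { { pcid, addr } };

	asm volatile("invpcid %[desc], %[type]"
		     : /* no outputs */
		     : [desc] "m" (desc), [type] "r" (type)
		     : "memory");	/* keep later accesses after the flush */
}
```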
+ */ + if (kaiser_enabled) + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); } static inline void __flush_tlb_all(void) { - if (cpu_has_pge) - __flush_tlb_global(); - else - __flush_tlb(); + __flush_tlb_global(); + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but + * we might fail to flush kernel translations for other cached ASIDs. + * + * To avoid this issue, we force PCID off if PGE is off. + */ } static inline void __flush_tlb_one(unsigned long addr) @@ -88,52 +204,8 @@ static inline void __flush_tlb_one(unsigned long addr) * * ..but the i386 has somewhat limited tlb flushing capabilities, * and page-granular flushes are available only on i486 and up. - * - * x86-64 can only flush individual pages or full VMs. For a range flush - * we always do the full VM. Might be worth trying if for a small - * range a few INVLPGs in a row are a win. */ -#ifndef CONFIG_SMP - -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - if (mm == current->active_mm) - __flush_tlb(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long addr) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); -} - -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb(); -} - -static inline void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long va) -{ -} - -static inline void reset_lazy_tlbstate(void) -{ -} - -#else /* SMP */ - -#include - #define local_flush_tlb() __flush_tlb() extern void flush_tlb_all(void); @@ -167,8 +239,6 @@ static inline void reset_lazy_tlbstate(void) percpu_write(cpu_tlbstate.active_mm, &init_mm); } -#endif /* SMP */ - #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va) #endif diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index eaea1d31f753..143e98b28081 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -22,6 +22,7 @@ enum vsyscall_num { /* kernel space (writeable) */ extern int vgetcpu_mode; extern struct timezone sys_tz; +extern unsigned long vsyscall_pgprot; #include diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index bda833c5f6ce..ca2430fbc6b3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -63,8 +63,26 @@ static int __init setup_noreplace_paravirt(char *str) __setup("noreplace-paravirt", setup_noreplace_paravirt); #endif -#define DPRINTK(fmt, args...) if (debug_alternative) \ - printk(KERN_DEBUG fmt, args) +#define DPRINTK(fmt, args...) \ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ +} while (0) + +#define DUMP_BYTES(buf, len, fmt, args...) 
\ +do { \ + if (unlikely(debug_alternative)) { \ + int j; \ + \ + if (!(len)) \ + break; \ + \ + printk(KERN_DEBUG fmt, ##args); \ + for (j = 0; j < (len) - 1; j++) \ + printk(KERN_CONT "%02hhx ", buf[j]); \ + printk(KERN_CONT "%02hhx\n", buf[j]); \ + } \ +} while (0) /* * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes @@ -251,12 +269,86 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void *text_poke_early(void *addr, const void *opcode, size_t len); -/* Replace instructions with better alternatives for this CPU type. - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that asymmetric systems where - APs have less capabilities than the boot processor are not handled. - Tough. Make sure you disable such features by hand. */ +/* + * Are we looking at a near JMP with a 1 or 4-byte displacement. + */ +static inline bool is_jmp(const u8 opcode) +{ + return opcode == 0xeb || opcode == 0xe9; +} + +static void __init_or_module +recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) +{ + u8 *next_rip, *tgt_rip; + s32 n_dspl, o_dspl; + int repl_len; + + if (a->replacementlen != 5) + return; + + o_dspl = *(s32 *)(insnbuf + 1); + + /* next_rip of the replacement JMP */ + next_rip = repl_insn + a->replacementlen; + /* target rip of the replacement JMP */ + tgt_rip = next_rip + o_dspl; + n_dspl = tgt_rip - orig_insn; + + DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); + + if (tgt_rip - orig_insn >= 0) { + if (n_dspl - 2 <= 127) + goto two_byte_jmp; + else + goto five_byte_jmp; + /* negative offset */ + } else { + if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) + goto two_byte_jmp; + else + goto five_byte_jmp; + } + +two_byte_jmp: + n_dspl -= 2; + + insnbuf[0] = 0xeb; + insnbuf[1] = (s8)n_dspl; + add_nops(insnbuf + 2, 3); + + repl_len = 2; + goto done; + +five_byte_jmp: + n_dspl -= 5; + + insnbuf[0] = 0xe9; + *(s32 *)&insnbuf[1] = n_dspl; + repl_len = 5; + +done: + + DPRINTK("final displ: 0x%08x, JMP 0x%lx", + n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); +} + +static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) +{ + add_nops(instr + (a->instrlen - a->padlen), a->padlen); + + DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", + instr, a->instrlen - a->padlen, a->padlen); +} + +/* + * Replace instructions with better alternatives for this CPU type. This runs + * before SMP is initialized to avoid SMP problems with self modifying code. + * This implies that asymmetric systems where APs have less capabilities than + * the boot processor are not handled. Tough. Make sure you disable such + * features by hand. + */ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { @@ -264,10 +356,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; - DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); + DPRINTK("alt table %p -> %p", start, end); /* * The scan order should be from start to end. A later scanned - * alternative code can overwrite a previous scanned alternative code. + * alternative code can overwrite previously scanned alternative code. * Some kernel functions (e.g. memcpy, memset, etc) use this order to * patch code. * @@ -275,29 +367,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start, * order. 
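recompute_jump() above re-encodes a copied `jmp` so it still reaches its original target, preferring the 2-byte EB rel8 form and falling back to the 5-byte E9 rel32 form. A simplified C sketch of that choice (not the kernel routine itself):

```c
#include <stdint.h>
#include <string.h>

/* Displacements are measured from the end of the encoded instruction,
 * which is why the short and near forms use different bases. */
static int encode_jmp(uint8_t *site, const uint8_t *target, uint8_t buf[5])
{
	intptr_t short_disp = target - (site + 2);	/* after EB rel8 */
	intptr_t near_disp  = target - (site + 5);	/* after E9 rel32 */

	if (short_disp >= -128 && short_disp <= 127) {
		buf[0] = 0xeb;
		buf[1] = (int8_t)short_disp;
		return 2;		/* remaining bytes become NOPs */
	}

	buf[0] = 0xe9;
	int32_t d32 = (int32_t)near_disp;
	memcpy(&buf[1], &d32, sizeof(d32));
	return 5;
}
```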
*/ for (a = start; a < end; a++) { + int insnbuf_sz = 0; + instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->cpuid >= NCAPINTS*32); - if (!boot_cpu_has(a->cpuid)) + if (!boot_cpu_has(a->cpuid)) { + if (a->padlen > 1) + optimize_nops(a, instr); + continue; + } + + DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)", + a->cpuid >> 5, + a->cpuid & 0x1f, + instr, a->instrlen, + replacement, a->replacementlen); + + DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); memcpy(insnbuf, replacement, a->replacementlen); + insnbuf_sz = a->replacementlen; /* 0xe8 is a relative jump; fix the offset. */ - if (*insnbuf == 0xe8 && a->replacementlen == 5) - *(s32 *)(insnbuf + 1) += replacement - instr; + if (*insnbuf == 0xe8 && a->replacementlen == 5) { + *(s32 *)(insnbuf + 1) += replacement - instr; + DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", + *(s32 *)(insnbuf + 1), + (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); + } + + if (a->replacementlen && is_jmp(replacement[0])) + recompute_jump(a, instr, replacement, insnbuf); - add_nops(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); + if (a->instrlen > a->replacementlen) { + add_nops(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + insnbuf_sz += a->instrlen - a->replacementlen; + } + DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); - text_poke_early(instr, insnbuf, a->instrlen); + text_poke_early(instr, insnbuf, insnbuf_sz); } } #ifdef CONFIG_SMP - static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { @@ -383,8 +500,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; - DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", - __func__, smp->locks, smp->locks_end, + DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", + smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); mutex_lock(&smp_alt); @@ -408,7 +525,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) continue; list_del(&item->next); mutex_unlock(&smp_alt); - DPRINTK("%s: %s\n", __func__, item->name); + DPRINTK("%s\n", item->name); kfree(item); return; } @@ -483,7 +600,7 @@ int alternatives_text_reserved(void *start, void *end) return 0; } -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_PARAVIRT void __init_or_module apply_paravirt(struct paravirt_patch_site *start, diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 46674fbb62ba..ad95476a08b9 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -159,6 +159,14 @@ static void __init check_config(void) void __init check_bugs(void) { +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. 
+ */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif + identify_boot_cpu(); #ifndef CONFIG_SMP printk(KERN_INFO "CPU: "); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 83df01dca3ac..d5ff61ffbe01 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -84,7 +84,7 @@ static const struct cpu_dev __cpuinitconst default_cpu = { static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 /* * We need valid kernel segments for data and code in long mode too @@ -155,6 +155,40 @@ static int __init x86_xsaveopt_setup(char *s) } __setup("noxsaveopt", x86_xsaveopt_setup); +#ifdef CONFIG_X86_64 +static int __init x86_pcid_setup(char *s) +{ + /* require an exact match without trailing characters */ + if (strlen(s)) + return 0; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); + return 1; +} +__setup("nopcid", x86_pcid_setup); +#endif + +static int __init x86_noinvpcid_setup(char *s) +{ + /* noinvpcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_INVPCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_INVPCID); + pr_info("noinvpcid: INVPCID feature disabled\n"); + return 0; +} +early_param("noinvpcid", x86_noinvpcid_setup); + #ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -276,6 +310,44 @@ static __cpuinit void setup_smep(struct cpuinfo_x86 *c) } } +static void setup_pcid(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PCID)) { + if (IS_ENABLED(CONFIG_X86_64) && + (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled)) { + /* + * Regardless of whether PCID is enumerated, the + * SDM says that it can't be enabled in 32-bit mode. + */ + set_in_cr4(X86_CR4_PCIDE); + /* + * INVPCID has two "groups" of types: + * 1/2: Invalidate an individual address + * 3/4: Invalidate all contexts + * + * 1/2 take a PCID, but 3/4 do not. So, 3/4 + * ignore the PCID argument in the descriptor. + * But, we have to be careful not to call 1/2 + * with an actual non-zero PCID in them before + * we do the above set_in_cr4(). + */ + if (cpu_has(c, X86_FEATURE_INVPCID)) + set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't + * work if PCID is on but PGE is not. Since that + * combination doesn't exist on real hardware, there's + * no reason to try to fully support it, but it's + * polite to avoid corrupting data if we're on + * an improperly configured VM. + */ + clear_cpu_cap(c, X86_FEATURE_PCID); + } + } + kaiser_setup_pcid(); +} + /* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization @@ -833,6 +905,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); + /* Set up PCID */ + setup_pcid(c); + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." 
@@ -1055,7 +1130,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ @@ -1155,6 +1230,15 @@ void __cpuinit cpu_init(void) int cpu; int i; + if (!kaiser_enabled) { + /* + * secondary_startup_64() deferred setting PGE in cr4: + * init_memory_mapping() sets it on the boot cpu, + * but it needs to be set on each secondary cpu. + */ + set_in_cr4(X86_CR4_PGE); + } + cpu = stack_smp_processor_id(); t = &per_cpu(init_tss, cpu); oist = &per_cpu(orig_ist, cpu); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 2d4e76ba2b5c..721d236d0c1e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -2,10 +2,14 @@ #include #include +#include #include #include "perf_event.h" +static +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store); + /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 @@ -60,6 +64,39 @@ void fini_debug_store_on_cpu(int cpu) wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); } +static void *dsalloc(size_t size, gfp_t flags, int node) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + unsigned int order = get_order(size); + struct page *page; + unsigned long addr; + + page = alloc_pages_node(node, flags | __GFP_ZERO, order); + if (!page) + return NULL; + addr = (unsigned long)page_address(page); + if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) { + __free_pages(page, order); + addr = 0; + } + return (void *)addr; +#else + return kmalloc_node(size, flags | __GFP_ZERO, node); +#endif +} + +static void dsfree(const void *buffer, size_t size) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (!buffer) + return; + kaiser_remove_mapping((unsigned long)buffer, size); + free_pages((unsigned long)buffer, get_order(size)); +#else + kfree(buffer); +#endif +} + static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -70,7 +107,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = dsalloc(PEBS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -94,7 +131,7 @@ static void release_pebs_buffer(int cpu) if (!ds || !x86_pmu.pebs) return; - kfree((void *)(unsigned long)ds->pebs_buffer_base); + dsfree((void *)(unsigned long)ds->pebs_buffer_base, PEBS_BUFFER_SIZE); ds->pebs_buffer_base = 0; } @@ -108,7 +145,7 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -132,19 +169,15 @@ static void release_bts_buffer(int cpu) if (!ds || !x86_pmu.bts) return; - kfree((void *)(unsigned long)ds->bts_buffer_base); + dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE); ds->bts_buffer_base = 0; } static int alloc_ds_buffer(int cpu) { - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); - if (unlikely(!ds)) - return -ENOMEM; + struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu); + memset(ds, 0, sizeof(*ds)); per_cpu(cpu_hw_events, cpu).ds = ds; return 0; @@ 
-158,7 +191,6 @@ static void release_ds_buffer(int cpu) return; per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); } void release_ds_buffers(void) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 0fa4f89125ae..061f09b46929 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -887,7 +887,7 @@ ENTRY(simd_coprocessor_error) 661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" - altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f + altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f, 0 .previous .section .altinstr_replacement,"ax" 663: pushl $do_simd_coprocessor_error diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f6daf3cdb878..b232bfcf44fb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,6 +56,9 @@ #include #include #include +#include +#include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -323,6 +326,7 @@ ENDPROC(native_usergs_sysret64) testl $3, CS(%rdi) je 1f SWAPGS + SWITCH_KERNEL_CR3 /* * irq_count is used to check if a CPU is already on an interrupt stack * or not. While this is essentially redundant with preempt_count it is @@ -362,6 +366,12 @@ END(save_rest) /* save complete stack frame */ .pushsection .kprobes.text, "ax" +/* + * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit + * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit + * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit + */ ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld @@ -387,7 +397,26 @@ ENTRY(save_paranoid) js 1f /* negative -> in kernel */ SWAPGS xorl %ebx,%ebx -1: ret +1: +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 + * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. + * Do a conditional SWITCH_KERNEL_CR3: this could safely be done + * unconditionally, but we need to find out whether the reverse + * should be done on return (conveyed to paranoid_exit in %ebx). + */ + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER + testl $KAISER_SHADOW_PGD_OFFSET, %eax + jz 2f + orl $2, %ebx + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID + movq %rax, %cr3 +2: +#endif + ret CFI_ENDPROC END(save_paranoid) .popsection @@ -464,6 +493,7 @@ ENTRY(system_call) CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK /* * A hypervisor implementation might want to use a label * after the swapgs, so that it can do the swapgs @@ -515,6 +545,14 @@ ENTRY(system_call_after_swapgs) CFI_REGISTER rip,rcx RESTORE_ARGS 1,-ARG_SKIP,0 /*CFI_REGISTER rflags,r11*/ + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. Normal interrupts are OK because + * they are off here. + */ + SWITCH_USER_CR3 movq PER_CPU_VAR(old_rsp), %rsp USERGS_SYSRET64 @@ -851,6 +889,14 @@ retint_swapgs: /* return to user-space */ */ DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. 
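save_paranoid above and paranoid_exit/nmi below communicate through a two-bit flag word in %ebx. Restated in C for readability (illustrative; the macro names are not from the patch):

```c
#include <stdbool.h>

/* bit 0 - GS was already the kernel's: skip SWAPGS on exit
 * bit 1 - entry CR3 was the user (shadow) one: SWITCH_USER_CR3 on exit
 * giving the four cases 0..3 listed above save_paranoid. */
#define PARANOID_SKIP_SWAPGS	0x1
#define PARANOID_RESTORE_CR3	0x2

static unsigned int save_paranoid_flags(bool gs_was_kernel,
					bool cr3_was_user)
{
	unsigned int ebx = 0;

	if (gs_was_kernel)
		ebx |= PARANOID_SKIP_SWAPGS;	/* "js 1f" skips the xorl */
	if (cr3_was_user)
		ebx |= PARANOID_RESTORE_CR3;	/* "orl $2, %ebx" */
	return ebx;
}
```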
Normal interrupts are OK because + * they are off here. + */ + SWITCH_USER_CR3 SWAPGS jmp restore_args @@ -891,6 +937,7 @@ ENTRY(native_iret) pushq_cfi %rax pushq_cfi %rdi SWAPGS + SWITCH_KERNEL_CR3 movq PER_CPU_VAR(espfix_waddr),%rdi movq %rax,(0*8)(%rdi) /* RAX */ movq (2*8)(%rsp),%rax /* RIP */ @@ -906,6 +953,7 @@ ENTRY(native_iret) andl $0xffff0000,%eax popq_cfi %rdi orq PER_CPU_VAR(espfix_stack),%rax + SWITCH_USER_CR3 SWAPGS movq %rax,%rsp popq_cfi %rax @@ -1366,30 +1414,41 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) * is fundamentally NMI-unsafe. (we cannot change the soft and * hard flags at once, atomically) */ - - /* ebx: no swapgs flag */ +/* + * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3 + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 + * ebx=2: needs both swapgs and SWITCH_USER_CR3 + * ebx=3: needs SWITCH_USER_CR3 but not swapgs + */ ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore - testl $3,CS(%rsp) - jnz paranoid_userspace -paranoid_swapgs: + movq %rbx, %r12 /* paranoid_userspace uses %ebx */ + testl $3, CS(%rsp) + jnz paranoid_userspace +paranoid_kernel: + movq %r12, %rbx /* restore after paranoid_userspace */ TRACE_IRQS_IRETQ 0 +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* No ALTERNATIVE for X86_FEATURE_KAISER: save_paranoid sets %ebx */ + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ + jz paranoid_exit_no_switch + SWITCH_USER_CR3 +paranoid_exit_no_switch: +#endif + testl $1, %ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs SWAPGS_UNSAFE_STACK +paranoid_exit_no_swapgs: RESTORE_ALL 8 - jmp irq_return -paranoid_restore: - TRACE_IRQS_IRETQ 0 - RESTORE_ALL 8 - jmp irq_return + jmp irq_return + paranoid_userspace: GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%ebx andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs + jz paranoid_kernel movq %rsp,%rdi /* &pt_regs */ call sync_regs movq %rax,%rsp /* switch stack for scheduling */ @@ -1438,6 +1497,13 @@ ENTRY(error_entry) movq_cfi r13, R13+8 movq_cfi r14, R14+8 movq_cfi r15, R15+8 + /* + * error_entry() always returns with a kernel gsbase and + * CR3. We must also have a kernel CR3/gsbase before + * calling TRACE_IRQS_*. Just unconditionally switch to + * the kernel CR3 here. + */ + SWITCH_KERNEL_CR3 xorl %ebx,%ebx testl $3,CS+8(%rsp) je error_kernelspace @@ -1527,22 +1593,32 @@ ENTRY(nmi) call do_nmi #ifdef CONFIG_TRACE_IRQFLAGS /* paranoidexit; without TRACE_IRQS_OFF */ - /* ebx: no swapgs flag */ + /* ebx: no-swapgs and kaiser-switch-cr3 flag */ DISABLE_INTERRUPTS(CLBR_NONE) - testl %ebx,%ebx /* swapgs needed? */ - jnz nmi_restore - testl $3,CS(%rsp) - jnz nmi_userspace -nmi_swapgs: + movq %rbx, %r12 /* nmi_userspace uses %ebx */ + testl $3, CS(%rsp) + jnz nmi_userspace +nmi_kernel: + movq %r12, %rbx /* restore after nmi_userspace */ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* No ALTERNATIVE for X86_FEATURE_KAISER: save_paranoid sets %ebx */ + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ + jz nmi_exit_no_switch + SWITCH_USER_CR3 +nmi_exit_no_switch: +#endif + testl $1, %ebx /* swapgs needed? 
*/ + jnz nmi_exit_no_swapgs SWAPGS_UNSAFE_STACK -nmi_restore: +nmi_exit_no_swapgs: RESTORE_ALL 8 - jmp irq_return + jmp irq_return + nmi_userspace: GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%ebx andl $_TIF_WORK_MASK,%ebx - jz nmi_swapgs + jz nmi_kernel movq %rsp,%rdi /* &pt_regs */ call sync_regs movq %rax,%rsp /* switch stack for scheduling */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 94d857fb1033..a1944faa739c 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -41,6 +41,7 @@ #include #include #include +#include /* * Note: we only need 6*8 = 48 bytes for the espfix stack, but round @@ -129,6 +130,15 @@ void __init init_espfix_bsp(void) /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + /* + * Just copy the top-level PGD that is mapping the espfix + * area to ensure it is mapped into the shadow user page + * tables. + */ + if (kaiser_enabled) { + set_pgd(native_get_shadow_pgd(pgd_p), + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); + } /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 0f8ebf78253a..1de317b20fac 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -166,8 +166,8 @@ ENTRY(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu - /* Enable PAE mode and PGE */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %eax + /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */ + movl $(X86_CR4_PAE | X86_CR4_PSE), %eax movq %rax, %cr4 /* Setup early boot stage 4 level pagetables. */ @@ -338,6 +338,27 @@ ENTRY(early_idt_handler) .balign PAGE_SIZE; \ ENTRY(name) +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define KAISER_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define KAISER_USER_PGD_FILL 0 +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -353,13 +374,14 @@ ENTRY(name) * 0xffffffff80000000 to physical address 0x000000. (always using * 2Mbyte large pages provided by PAE mode) */ -NEXT_PAGE(init_level4_pgt) +NEXT_PGD_PAGE(init_level4_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -385,6 +407,7 @@ NEXT_PAGE(level2_ident_pgt) * Don't set NX because code runs from these pages. 
*/ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level2_kernel_pgt) /* diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 4970ef070f2f..02fd03bf15dd 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -74,6 +75,8 @@ static inline void hpet_set_mapping(void) hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); #ifdef CONFIG_X86_64 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE); + kaiser_add_mapping(__fix_to_virt(VSYSCALL_HPET), PAGE_SIZE, + __PAGE_KERNEL_VVAR_NOCACHE); #endif } diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 43e9ccf44947..f00e6e734fbd 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task); * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index e328f691eeef..990f743e21b8 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -85,7 +85,7 @@ static struct irqaction irq2 = { .flags = IRQF_NO_THREAD, }; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = -1, }; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 1dd32307a494..8d4e15f59b5f 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) set_ldt(pc->ldt->entries, pc->ldt->size); } +static void __free_ldt_struct(struct ldt_struct *ldt) +{ + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(ldt->entries); + else + free_page((unsigned long)ldt->entries); + kfree(ldt); +} + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. 
*/ static struct ldt_struct *alloc_ldt_struct(int size) { struct ldt_struct *new_ldt; int alloc_size; + int ret; if (size > LDT_ENTRIES) return NULL; @@ -59,14 +70,20 @@ static struct ldt_struct *alloc_ldt_struct(int size) if (alloc_size > PAGE_SIZE) new_ldt->entries = vzalloc(alloc_size); else - new_ldt->entries = kzalloc(PAGE_SIZE, GFP_KERNEL); + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); if (!new_ldt->entries) { kfree(new_ldt); return NULL; } + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, + __PAGE_KERNEL); new_ldt->size = size; + if (ret) { + __free_ldt_struct(new_ldt); + return NULL; + } return new_ldt; } @@ -97,12 +114,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (likely(!ldt)) return; + kaiser_remove_mapping((unsigned long)ldt->entries, + ldt->size * LDT_ENTRY_SIZE); paravirt_free_ldt(ldt->entries, ldt->size); - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(ldt->entries); - else - kfree(ldt->entries); - kfree(ldt); + __free_ldt_struct(ldt); } /* diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index a1da6737ba5b..a91d9b9b4bde 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); @@ -57,7 +56,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); - PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); patch_site: diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 557eb3757edb..d2ce2a33d15b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -57,7 +57,7 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(unsigned long, old_rsp); +DEFINE_PER_CPU_USER_MAPPED(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); static ATOMIC_NOTIFIER_HEAD(idle_notifier); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a34bdddeca85..04da0bf5c314 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -357,6 +357,12 @@ void machine_real_restart(unsigned int type) lowmem_gdt[1] = GDT_ENTRY(0x009b, restart_pa, 0xffff); +#ifdef CONFIG_X86_64 + /* Exiting long mode will fail if CR4.PCIDE is set. */ + if (static_cpu_has(X86_FEATURE_PCID)) + clear_in_cr4(X86_CR4_PCIDE); +#endif + /* Jump to the identity-mapped low memory code */ restart_lowmem(type); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b506f41b7d37..cb3a58551770 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,6 +114,7 @@ #include #include #include +#include /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -921,6 +922,12 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + /* + * This needs to happen right after XENPV is set on xen and + * kaiser_enabled is checked below in cleanup_highmap(). 
+ */ + kaiser_check_boottime_disable(); + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index e4d4a22e8b94..3178f308609a 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -58,6 +58,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = }; static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE; +unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL; static int __init vsyscall_setup(char *str) { @@ -274,10 +275,10 @@ void __init map_vsyscall(void) extern char __vvar_page; unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); + if (vsyscall_mode != NATIVE) + vsyscall_pgprot = __PAGE_KERNEL_VVAR; __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); + __pgprot(vsyscall_pgprot)); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) != (unsigned long)VSYSCALL_START); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index b00f6785da74..2db5846d86ed 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -16,7 +16,7 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o -lib-y := delay.o +lib-y := delay.o cmdline.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index f2145cfa12a6..38e57faefd71 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -67,7 +67,7 @@ ENDPROC(clear_page) .previous .section .altinstructions,"a" altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ - .Lclear_page_end-clear_page, 2b-1b + .Lclear_page_end-clear_page, 2b-1b, 0 altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ - .Lclear_page_end-clear_page,3b-2b + .Lclear_page_end-clear_page,3b-2b, 0 .previous diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c new file mode 100644 index 000000000000..3261abb21ef4 --- /dev/null +++ b/arch/x86/lib/cmdline.c @@ -0,0 +1,215 @@ +/* + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * Misc librarized functions for cmdline poking. + */ +#include +#include +#include +#include + +static inline int myisspace(u8 c) +{ + return c <= ' '; /* Close enough approximation */ +} + +/** + * Find a boolean option (like quiet,noapic,nosmp....) + * + * @cmdline: the cmdline string + * @option: option string to look for + * + * Returns the position of that @option (starts counting with 1) + * or 0 on not found. @option will only be found if it is found + * as an entire word in @cmdline. For instance, if @option="car" + * then a cmdline which contains "cart" will not match. 
+ */ +static int +__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, + const char *option) +{ + char c; + int pos = 0, wstart = 0; + const char *opptr = NULL; + enum { + st_wordstart = 0, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + } state = st_wordstart; + + if (!cmdline) + return -1; /* No command line */ + + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos < max_cmdline_size) { + c = *(char *)cmdline++; + pos++; + + switch (state) { + case st_wordstart: + if (!c) + return 0; + else if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + wstart = pos; + /* fall through */ + + case st_wordcmp: + if (!*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for. If the + * command-line has a space _or_ ends, then + * we matched! + */ + if (!c || myisspace(c)) + return wstart; + /* + * We hit the end of the option, but _not_ + * the end of a word on the cmdline. Not + * a match. + */ + } else if (!c) { + /* + * Hit the NULL terminator on the end of + * cmdline. + */ + return 0; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; + } + state = st_wordskip; + /* fall through */ + + case st_wordskip: + if (!c) + return 0; + else if (myisspace(c)) + state = st_wordstart; + break; + } + } + + return 0; /* Buffer overrun */ +} + +/* + * Find a non-boolean option (i.e. option=argument). In accordance with + * standard Linux practice, if this option is repeated, this returns the + * last instance on the command line. + * + * @cmdline: the cmdline string + * @max_cmdline_size: the maximum size of cmdline + * @option: option string to look for + * @buffer: memory buffer to return the option argument + * @bufsize: size of the supplied memory buffer + * + * Returns the length of the argument (regardless of if it was + * truncated to fit in the buffer), or -1 on not found. + */ +static int +__cmdline_find_option(const char *cmdline, int max_cmdline_size, + const char *option, char *buffer, int bufsize) +{ + char c; + int pos = 0, len = -1; + const char *opptr = NULL; + char *bufptr = buffer; + enum { + st_wordstart = 0, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + st_bufcpy, /* Copying this to buffer */ + } state = st_wordstart; + + if (!cmdline) + return -1; /* No command line */ + + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos++ < max_cmdline_size) { + c = *(char *)cmdline++; + if (!c) + break; + + switch (state) { + case st_wordstart: + if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + /* fall through */ + + case st_wordcmp: + if ((c == '=') && !*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for, prepare to + * copy the argument. + */ + len = 0; + bufptr = buffer; + state = st_bufcpy; + break; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; + } + state = st_wordskip; + /* fall through */ + + case st_wordskip: + if (myisspace(c)) + state = st_wordstart; + break; + + case st_bufcpy: + if (myisspace(c)) { + state = st_wordstart; + } else { + /* + * Increment len, but don't overrun the + * supplied buffer and leave room for the + * NULL terminator. 
+ */ + if (++len < bufsize) + *bufptr++ = c; + } + break; + } + } + + if (bufsize) + *bufptr = '\0'; + + return len; +} + +int cmdline_find_option_bool(const char *cmdline, const char *option) +{ + return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); +} + +int cmdline_find_option(const char *cmdline, const char *option, char *buffer, + int bufsize) +{ + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, + buffer, bufsize); +} diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 01c805ba5359..8f1917e19a8d 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -112,5 +112,5 @@ ENDPROC(copy_page) .previous .section .altinstructions,"a" altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ - .Lcopy_page_end-copy_page, 2b-1b + .Lcopy_page_end-copy_page, 2b-1b, 0 .previous diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 024840266ba0..18b00595612f 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -26,19 +26,18 @@ */ .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 0: - .byte 0xe9 /* 32bit jump */ - .long \orig-1f /* by default jump to orig */ + jmp \orig 1: .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt1-1b /* offset */ /* or alternatively to alt1 */ -3: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt2-1b /* offset */ /* or alternatively to alt2 */ +2: + jmp \alt1 +3: + jmp \alt2 .previous .section .altinstructions,"a" - altinstruction_entry 0b,2b,\feature1,5,5 - altinstruction_entry 0b,3b,\feature2,5,5 + altinstruction_entry 0b,2b,\feature1,5,5,0 + altinstruction_entry 0b,3b,\feature2,5,5,0 .previous .endm diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index efbf2a0ecdea..6c32ff9b3e37 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -203,8 +203,8 @@ ENDPROC(__memcpy) * only outcome... */ .section .altinstructions, "a" - altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ - .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c - altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ - .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e + altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ + .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c,0 + altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ + .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e,0 .previous diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index ee164610ec46..f8f64ce99a17 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -218,6 +218,6 @@ ENTRY(memmove) altinstruction_entry .Lmemmove_begin_forward, \ .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ .Lmemmove_end_forward-.Lmemmove_begin_forward, \ - .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs + .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs,0 .previous ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 79bd454b78a3..ea830ff9d528 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -150,8 +150,8 @@ ENDPROC(__memset) * feature to implement the right patch order. 
*/ .section .altinstructions,"a" - altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ - .Lfinal-memset,.Lmemset_e-.Lmemset_c - altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ - .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e + altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ + .Lfinal-__memset,.Lmemset_e-.Lmemset_c,0 + altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ + .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e,0 .previous diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 3d11327c9ab4..08505d201bbe 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,5 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o + pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -7,7 +7,6 @@ CFLAGS_physaddr.o := $(nostackp) CFLAGS_setup_nx.o := $(nostackp) obj-$(CONFIG_X86_PAT) += pat_rbtree.o -obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o @@ -30,3 +29,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_MEMTEST) += memtest.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index dba71b25a546..f897b97ab897 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -161,7 +161,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, set_in_cr4(X86_CR4_PSE); /* Enable PGE if available */ - if (cpu_has_pge) { + if (cpu_has_pge && !kaiser_enabled) { set_in_cr4(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 44b93da18401..1f1b8ed9b06f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -312,6 +312,16 @@ void __init cleanup_highmap(void) continue; if (vaddr < (unsigned long) _text || vaddr > end) set_pmd(pmd, __pmd(0)); + else if (kaiser_enabled) { + /* + * level2_kernel_pgt is initialized with _PAGE_GLOBAL: + * clear that now. This is not important, so long as + * CR4.PGE remains clear, but it removes an anomaly. + * Physical mapping setup below avoids _PAGE_GLOBAL + * by use of massage_pgprot() inside pfn_pte() etc. + */ + set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); + } } } diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c new file mode 100644 index 000000000000..a95acd4c71fa --- /dev/null +++ b/arch/x86/mm/kaiser.c @@ -0,0 +1,452 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt + +extern struct mm_struct init_mm; + +#include +#include /* to verify its kaiser declarations */ +#include +#include +#include +#include +#include + +int kaiser_enabled __read_mostly = 1; +EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ + +DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +/* + * These can have bit 63 set, so we can not just use a plain "or" + * instruction to get their value or'd into CR3. It would take + * another register. So, we use a memory reference to these instead. + * + * This is also handy because systems that do not support PCIDs + * just end up or'ing a 0 into their CR3, which does no harm. 
+ */ +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); + +/* + * At runtime, the only things we map are some things for CPU + * hotplug, and stacks for new processes. No two CPUs will ever + * be populating the same addresses, so we only need to ensure + * that we protect between two CPUs trying to allocate and + * populate the same page table page. + * + * Only take this lock when doing a set_p[4um]d(), but it is not + * needed for doing a set_pte(). We assume that only the *owner* + * of a given allocation will be doing this for _their_ + * allocation. + * + * This ensures that once a system has been running for a while + * and there have been stacks all over and these page tables + * are fully populated, there will be no further acquisitions of + * this lock. + */ +static DEFINE_SPINLOCK(shadow_table_allocation_lock); + +/* + * Returns -1 on error. + */ +static inline unsigned long get_pa_from_mapping(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(vaddr); + /* + * We made all the kernel PGDs present in kaiser_init(). + * We expect them to stay that way. + */ + BUG_ON(pgd_none(*pgd)); + /* + * PGDs are either 512GB or 128TB on all x86_64 + * configurations. We don't handle these. + */ + BUG_ON(pgd_large(*pgd)); + + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + WARN_ON_ONCE(1); + return -1; + } + + if (pud_large(*pud)) + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); + + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + WARN_ON_ONCE(1); + return -1; + } + + if (pmd_large(*pmd)) + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); + + pte = pte_offset_kernel(pmd, vaddr); + if (pte_none(*pte)) { + WARN_ON_ONCE(1); + return -1; + } + + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); +} + +/* + * This is a relatively normal page table walk, except that it + * also tries to allocate page tables pages along the way. + * + * Returns a pointer to a PTE on success, or NULL on failure. 
+ */ +static pte_t *kaiser_pagetable_walk(unsigned long address) +{ + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + + if (pgd_none(*pgd)) { + WARN_ONCE(1, "All shadow pgds should have been populated"); + return NULL; + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + + pud = pud_offset(pgd, address); + /* The shadow page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (!new_pmd_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pud_none(*pud)) { + set_pud(pud, __pud(_PAGE_TABLE | __pa(new_pmd_page))); + __inc_zone_page_state(virt_to_page((void *) + new_pmd_page), NR_KAISERTABLE); + } else + free_page(new_pmd_page); + spin_unlock(&shadow_table_allocation_lock); + } + + pmd = pmd_offset(pud, address); + /* The shadow page tables do not use large mappings: */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(new_pte_page))); + __inc_zone_page_state(virt_to_page((void *) + new_pte_page), NR_KAISERTABLE); + } else + free_page(new_pte_page); + spin_unlock(&shadow_table_allocation_lock); + } + + return pte_offset_kernel(pmd, address); +} + +static int kaiser_add_user_map(const void *__start_addr, unsigned long size, + unsigned long flags) +{ + int ret = 0; + pte_t *pte; + unsigned long start_addr = (unsigned long )__start_addr; + unsigned long address = start_addr & PAGE_MASK; + unsigned long end_addr = PAGE_ALIGN(start_addr + size); + unsigned long target_address; + + /* + * It is convenient for callers to pass in __PAGE_KERNEL etc, + * and there is no actual harm from setting _PAGE_GLOBAL, so + * long as CR4.PGE is not set. But it is nonetheless troubling + * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" + * requires that not to be #defined to 0): so mask it off here. + */ + flags &= ~_PAGE_GLOBAL; + + if (flags & _PAGE_USER) + BUG_ON(address < FIXADDR_START || end_addr >= FIXADDR_TOP); + + for (; address < end_addr; address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + if (target_address == -1) { + ret = -EIO; + break; + } + pte = kaiser_pagetable_walk(address); + if (!pte) { + ret = -ENOMEM; + break; + } + if (pte_none(*pte)) { + set_pte(pte, __pte(flags | target_address)); + } else { + pte_t tmp; + set_pte(&tmp, __pte(flags | target_address)); + WARN_ON_ONCE(!pte_same(*pte, tmp)); + } + } + return ret; +} + +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) +{ + unsigned long size = end - start; + + return kaiser_add_user_map(start, size, flags); +} + +/* + * Ensure that the top level of the (shadow) page tables are + * entirely populated. This ensures that all processes that get + * forked have the same entries. This way, we do not have to + * ever go set up new entries in older processes. + * + * Note: we never free these, so there are no updates to them + * after this. 
+ */ +static void __init kaiser_init_all_pgds(void) +{ + pgd_t *pgd; + int i = 0; + + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { + pgd_t new_pgd; + pud_t *pud = pud_alloc_one(&init_mm, + PAGE_OFFSET + i * PGDIR_SIZE); + if (!pud) { + WARN_ON(1); + break; + } + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); + new_pgd = __pgd(_PAGE_TABLE |__pa(pud)); + /* + * Make sure not to stomp on some other pgd entry. + */ + if (!pgd_none(pgd[i])) { + WARN_ON(1); + continue; + } + set_pgd(pgd + i, new_pgd); + } +} + +#define kaiser_add_user_map_early(start, size, flags) do { \ + int __ret = kaiser_add_user_map(start, size, flags); \ + WARN_ON(__ret); \ +} while (0) + +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ + WARN_ON(__ret); \ +} while (0) + +void __init kaiser_check_boottime_disable(void) +{ + bool enable = true; + char arg[5]; + int ret; + + if (xen_pv_domain()) + goto silent_disable; + + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); + if (ret > 0) { + if (!strncmp(arg, "on", 2)) + goto enable; + + if (!strncmp(arg, "off", 3)) + goto disable; + + if (!strncmp(arg, "auto", 4)) + goto skip; + } + + if (cmdline_find_option_bool(boot_command_line, "nopti")) + goto disable; + +skip: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + goto disable; + +enable: + if (enable) + setup_force_cpu_cap(X86_FEATURE_KAISER); + + return; + +disable: + pr_info("disabled\n"); + +silent_disable: + kaiser_enabled = 0; + setup_clear_cpu_cap(X86_FEATURE_KAISER); +} + +/* + * If anything in here fails, we will likely die on one of the + * first kernel->user transitions and init will die. But, we + * will have most of the kernel up by then and should be able to + * get a clean warning out of it. If we BUG_ON() here, we run + * the risk of being before we have good console output. + */ +void __init kaiser_init(void) +{ + int cpu; + + if (!kaiser_enabled) + return; + + kaiser_init_all_pgds(); + + for_each_possible_cpu(cpu) { + void *percpu_vaddr = __per_cpu_user_mapped_start + + per_cpu_offset(cpu); + unsigned long percpu_sz = __per_cpu_user_mapped_end - + __per_cpu_user_mapped_start; + kaiser_add_user_map_early(percpu_vaddr, percpu_sz, + __PAGE_KERNEL); + } + + /* + * Map the entry/exit text section, which is needed at + * switches from user to and from kernel. 
+ */ + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, + __PAGE_KERNEL_RX); + kaiser_add_user_map_ptrs_early(__kprobes_text_start, __kprobes_text_end, + __PAGE_KERNEL_RX); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + kaiser_add_user_map_ptrs_early(__irqentry_text_start, + __irqentry_text_end, + __PAGE_KERNEL_RX); +#endif + kaiser_add_user_map_early((void *)idt_descr.address, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL_RO); + kaiser_add_user_map_early((void *)VVAR_ADDRESS, PAGE_SIZE, + __PAGE_KERNEL_VVAR); + kaiser_add_user_map_early((void *)VSYSCALL_START, PAGE_SIZE, + vsyscall_pgprot); + + pr_info("enabled\n"); +} + +/* Add a mapping to the shadow mapping, and synchronize the mappings */ +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +{ + if (!kaiser_enabled) + return 0; + return kaiser_add_user_map((const void *)addr, size, flags); +} + +void kaiser_remove_mapping(unsigned long start, unsigned long size) +{ + unsigned long end = start + size; + unsigned long addr; + pte_t *pte; + + if (!kaiser_enabled) + return; + for (addr = start; addr < end; addr += PAGE_SIZE) { + pte = kaiser_pagetable_walk(addr); + if (pte) + set_pte(pte, __pte(0)); + } +} + +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * This returns true for user pages that need to get copied into + * both the user and kernel copies of the page tables, and false + * for kernel pages that should only be in the kernel copy. + */ +static inline bool is_userspace_pgd(pgd_t *pgdp) +{ + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); +} + +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + if (!kaiser_enabled) + return pgd; + /* + * Do we need to also populate the shadow pgd? Check _PAGE_USER to + * skip cases like kexec and EFI which make temporary low mappings. + */ + if (pgd.pgd & _PAGE_USER) { + if (is_userspace_pgd(pgdp)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + /* + * Even if the entry is *mapping* userspace, ensure + * that userspace can not use it. This way, if we + * get out to userspace running on the kernel CR3, + * userspace will crash instead of running. + */ + pgd.pgd |= _PAGE_NX; + } + } else if (!pgd.pgd) { + /* + * pgd_clear() cannot check _PAGE_USER, and is even used to + * clear corrupted pgd entries: so just rely on cases like + * kexec and EFI never to be using pgd_clear(). + */ + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && + is_userspace_pgd(pgdp)) + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + return pgd; +} + +void kaiser_setup_pcid(void) +{ + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; + + if (this_cpu_has(X86_FEATURE_PCID)) + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; + /* + * These variables are used by the entry/exit + * code to change PCID and pgd and TLB flushing. + */ + this_cpu_write(x86_cr3_pcid_user, user_cr3); +} + +/* + * Make a note that this cpu will need to flush USER tlb on return to user. + * If cpu does not have PCID, then the NOFLUSH bit will never have been set. 
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+        if (this_cpu_has(X86_FEATURE_PCID))
+                this_cpu_write(x86_cr3_pcid_user,
+                        X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8573b83a63d0..0b246ef0fa48 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -5,7 +5,7 @@
 #include
 #include

-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)

 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -253,12 +253,31 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
         }
 }

+/*
+ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER kaiser_enabled
+
+static inline pgd_t *_pgd_alloc(void)
+{
+        /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
+        return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
+                                         PGD_ALLOCATION_ORDER);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+        free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+}
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
         pgd_t *pgd;
         pmd_t *pmds[PREALLOCATED_PMDS];

-        pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
+        pgd = _pgd_alloc();

         if (pgd == NULL)
                 goto out;
@@ -288,7 +307,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 out_free_pmds:
         free_pmds(pmds);
 out_free_pgd:
-        free_page((unsigned long)pgd);
+        _pgd_free(pgd);
 out:
         return NULL;
 }
@@ -298,7 +317,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
         pgd_mop_up_pmds(mm, pgd);
         pgd_dtor(pgd);
         paravirt_pgd_free(mm, pgd);
-        free_page((unsigned long)pgd);
+        _pgd_free(pgd);
 }

 int ptep_set_access_flags(struct vm_area_struct *vma,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 55034a15f13c..5e1536b47589 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,12 +12,43 @@
 #include
 #include
 #include
+#include

 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
                         = { &init_mm, 0, };

+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+        unsigned long new_mm_cr3 = __pa(pgdir);
+
+        if (kaiser_enabled) {
+                /*
+                 * We reuse the same PCID for different tasks, so we must
+                 * flush all the entries for the PCID out when we change tasks.
+                 * Flush KERN below, flush USER when returning to userspace in
+                 * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+                 *
+                 * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+                 * do it here, but can only be used if X86_FEATURE_INVPCID is
+                 * available - and many machines support pcid without invpcid.
+                 *
+                 * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
+                 * would be needed in the write_cr3() below - if PCIDs enabled.
+                 */
+                BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
+                kaiser_flush_tlb_on_return_to_user();
+        }
+
+        /*
+         * Caution: many callers of this function expect
+         * that load_new_mm_cr3() is serializing and orders TLB
+         * fills with respect to the mm_cpumask writes.
+         */
+        write_cr3(new_mm_cr3);
+}
+
 /*
- * Smarter SMP flushing macros.
+ * TLB flushing, formerly SMP-only
 * c/o Linus Torvalds.
* * These mean you can really definitely utterly forget about @@ -65,10 +96,85 @@ void leave_mm(int cpu) BUG(); cpumask_clear_cpu(cpu, mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); - load_cr3(swapper_pg_dir); + load_new_mm_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + + if (likely(prev != next)) { + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + percpu_write(cpu_tlbstate.active_mm, next); + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_new_mm_cr3() is serializing + * and gives the ordering guarantee we need. + */ + load_new_mm_cr3(next->pgd); + + /* stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + + /* + * load the LDT, if the LDT is different: + */ + if (unlikely(prev->context.ldt != next->context.ldt)) + load_mm_ldt(next); + } else { + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { + /* We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. + * + * As above, load_new_mm_cr3() is serializing and orders + * TLB fills with respect to the mm_cpumask write. 
+ */ + load_new_mm_cr3(next->pgd); + load_mm_ldt(next); + } + } +} + /* * * The flush IPI assumes that a thread switch happens in this order: @@ -172,6 +278,7 @@ void smp_invalidate_interrupt(struct pt_regs *regs) static void flush_tlb_others_ipi(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long va) { +#ifdef CONFIG_SMP unsigned int sender; union smp_flush_state *f; @@ -200,6 +307,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, f->flush_va = 0; if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) raw_spin_unlock(&f->tlbstate_lock); +#endif } void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b255312def9c..4cfb125fe5b0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -270,6 +270,12 @@ static void __init xen_init_cpuid_mask(void) (1 << X86_FEATURE_MTRR) | /* disable MTRR */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + /* + * Xen PV would need some work to support PCID: CR3 handling as well + * as xen_flush_tlb_others() would need updating. + */ + cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_PCID % 32)); /* disable PCID */ + if (!xen_initial_domain()) cpuid_leaf1_edx_mask &= ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index b5e2e4c6b017..01c8155dd613 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -692,7 +692,14 @@ */ #define PERCPU_INPUT(cacheline) \ VMLINUX_SYMBOL(__per_cpu_start) = .; \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ *(.data..percpu..first) \ + . = ALIGN(cacheline); \ + *(.data..percpu..user_mapped) \ + *(.data..percpu..user_mapped..shared_aligned) \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..user_mapped..page_aligned) \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h new file mode 100644 index 000000000000..58c55b1589d0 --- /dev/null +++ b/include/linux/kaiser.h @@ -0,0 +1,52 @@ +#ifndef _LINUX_KAISER_H +#define _LINUX_KAISER_H + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +#include + +static inline int kaiser_map_thread_stack(void *stack) +{ + /* + * Map that page of kernel stack on which we enter from user context. + */ + return kaiser_add_mapping((unsigned long)stack + + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL); +} + +static inline void kaiser_unmap_thread_stack(void *stack) +{ + /* + * Note: may be called even when kaiser_map_thread_stack() failed. + */ + kaiser_remove_mapping((unsigned long)stack + + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE); +} +#else + +/* + * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which + * includes architectures that support KAISER, but have it disabled. 
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr,
+                                     unsigned long size, unsigned long flags)
+{
+        return 0;
+}
+static inline void kaiser_remove_mapping(unsigned long start,
+                                         unsigned long size)
+{
+}
+static inline int kaiser_map_thread_stack(void *stack)
+{
+        return 0;
+}
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+}
+
+#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
+#endif /* _LINUX_KAISER_H */
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index 70fffeba7495..a4441784503b 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -1,9 +1,16 @@
 #ifndef _LINUX_MMU_CONTEXT_H
 #define _LINUX_MMU_CONTEXT_H

+#include
+
 struct mm_struct;

 void use_mm(struct mm_struct *mm);
 void unuse_mm(struct mm_struct *mm);

+/* Architectures that care about IRQ state in switch_mm can override this. */
+#ifndef switch_mm_irqs_off
+# define switch_mm_irqs_off switch_mm
+#endif
+
 #endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 25842b6e72e1..a0b4422a116a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -95,8 +95,9 @@ enum zone_stat_item {
         NR_SLAB_RECLAIMABLE,
         NR_SLAB_UNRECLAIMABLE,
         NR_PAGETABLE,           /* used for pagetables */
-        NR_KERNEL_STACK,
         /* Second 128 byte cacheline */
+        NR_KERNEL_STACK,
+        NR_KAISERTABLE,
         NR_UNSTABLE_NFS,        /* NFS unstable pages */
         NR_BOUNCE,
         NR_VMSCAN_WRITE,
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 27ef6b190ea6..ea0c9148f624 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -28,6 +28,12 @@
         (void)__vpp_verify;                                     \
 } while (0)

+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
 /*
  * s390 and alpha modules require percpu variables to be defined as
  * weak to force the compiler to generate GOT based external
@@ -90,6 +96,12 @@
 #define DEFINE_PER_CPU(type, name)                              \
         DEFINE_PER_CPU_SECTION(type, name, "")

+#define DECLARE_PER_CPU_USER_MAPPED(type, name)                 \
+        DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name)                  \
+        DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
 /*
  * Declaration/definition used for per-CPU variables that must come first in
  * the set of variables.
@@ -119,6 +131,14 @@
         DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
         ____cacheline_aligned_in_smp

+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)  \
+        DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+        ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)   \
+        DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+        ____cacheline_aligned_in_smp
+
 #define DECLARE_PER_CPU_ALIGNED(type, name)                     \
         DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)    \
         ____cacheline_aligned
@@ -137,11 +157,21 @@
 #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                 \
         DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")    \
         __aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */ +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE) + +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE) /* * Declaration/definition used for per-CPU variables that must be read mostly. */ -#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ DECLARE_PER_CPU_SECTION(type, name, "..readmostly") #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ diff --git a/init/main.c b/init/main.c index e937d9bda0f8..558a9fdd566d 100644 --- a/init/main.c +++ b/init/main.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include @@ -463,6 +464,7 @@ static void __init mm_init(void) percpu_init_late(); pgtable_cache_init(); vmalloc_init(); + kaiser_init(); } asmlinkage void __init start_kernel(void) diff --git a/kernel/fork.c b/kernel/fork.c index 29b460431c12..511131a15a75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -133,6 +134,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { + kaiser_unmap_thread_stack(ti); free_pages((unsigned long)ti, THREAD_SIZE_ORDER); } #endif @@ -275,6 +277,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->stack = ti; + err = kaiser_map_thread_stack(tsk->stack); + if (err) + goto out; + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); diff --git a/kernel/sched.c b/kernel/sched.c index 5cfb99789abf..f0f941f3634a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -3331,7 +3331,7 @@ context_switch(struct rq *rq, struct task_struct *prev, atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else - switch_mm(oldmm, mm, next); + switch_mm_irqs_off(oldmm, mm, next); if (!prev->mm) { prev->active_mm = NULL; diff --git a/mm/mmu_context.c b/mm/mmu_context.c index cf332bc0080a..745416731f77 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -4,9 +4,9 @@ */ #include +#include #include #include -#include #include diff --git a/mm/vmstat.c b/mm/vmstat.c index ff9060919c4b..eaf3db038652 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -699,6 +699,7 @@ const char * const vmstat_text[] = { "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", + "nr_overhead", "nr_unstable", "nr_bounce", "nr_vmscan_write", diff --git a/security/Kconfig b/security/Kconfig index 51bd5a0b69ae..b9bc07d97b15 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -96,6 +96,16 @@ config SECURITY If you are unsure how to answer this question, answer N. +config PAGE_TABLE_ISOLATION + bool "Remove the kernel mapping in user mode" + default y + depends on X86_64 && SMP + help + This enforces a strict kernel and user space isolation, in order + to close hardware side channels on kernel address information. + + If you are unsure how to answer this question, answer Y. + config SECURITYFS bool "Enable the securityfs filesystem" help
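
Illustrative sketch, not part of the patch: the arch/x86/mm/pgtable.c comment above explains that, with kaiser_enabled, the top-level page table becomes an order-1, 8k-aligned allocation and that flipping bit 12 of a pgd pointer swaps between its two 4k halves. The stand-alone C program below models that layout under the assumption that the low page holds the kernel copy and the high page the shadow (user) copy, which is how native_get_shadow_pgd() (defined in a header outside this excerpt) is assumed to locate the shadow entry, and it mirrors the lower-half test used by is_userspace_pgd() in arch/x86/mm/kaiser.c. The demo_* names, DEMO_PAGE_SIZE, and the user-space allocation are invented for the example only.

/*
 * Illustrative user-space sketch of the KAISER PGD-pair layout; not
 * kernel code and not part of the patch. Build with any C11 compiler.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_PAGE_SIZE 4096UL

/* Stand-in for pgd_t: a single 8-byte top-level entry. */
typedef struct { uint64_t pgd; } demo_pgd_t;

/* Kernel half -> shadow (user) half: set bit 12, i.e. move up one page. */
static demo_pgd_t *demo_get_shadow_pgd(demo_pgd_t *pgdp)
{
        return (demo_pgd_t *)((unsigned long)pgdp | DEMO_PAGE_SIZE);
}

/* Slots in the lower half of a PGD page cover userspace addresses. */
static int demo_is_userspace_pgd(demo_pgd_t *pgdp)
{
        return ((unsigned long)pgdp % DEMO_PAGE_SIZE) < (DEMO_PAGE_SIZE / 2);
}

int main(void)
{
        /* Order-1, 8k-aligned allocation, like _pgd_alloc() when kaiser_enabled. */
        demo_pgd_t *kernel_pgd = aligned_alloc(2 * DEMO_PAGE_SIZE,
                                               2 * DEMO_PAGE_SIZE);
        demo_pgd_t *slot;

        if (!kernel_pgd)
                return 1;

        slot = &kernel_pgd[1];          /* an entry in the lower (user) half */
        printf("kernel copy of slot: %p\n", (void *)slot);
        printf("shadow copy of slot: %p\n", (void *)demo_get_shadow_pgd(slot));
        printf("covers userspace?    %d\n", demo_is_userspace_pgd(slot));

        free(kernel_pgd);
        return 0;
}

Because PGD_ALLOCATION_ORDER is defined as kaiser_enabled, the second (shadow) page only exists when the feature is enabled; with kaiser_enabled == 0 the allocation degenerates to the usual single 4k PGD and kaiser_set_shadow_pgd() returns without touching a shadow copy.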