lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Wed, 20 Dec 2017 01:22:02 +0100 (CET)
From:   Thomas Gleixner <tglx@...utronix.de>
To:     Ingo Molnar <mingo@...nel.org>
cc:     Peter Zijlstra <peterz@...radead.org>,
        Dave Hansen <dave.hansen@...el.com>,
        LKML <linux-kernel@...r.kernel.org>, x86@...nel.org,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Andy Lutomirsky <luto@...nel.org>,
        Borislav Petkov <bpetkov@...e.de>,
        Greg KH <gregkh@...uxfoundation.org>, keescook@...gle.com,
        hughd@...gle.com, Brian Gerst <brgerst@...il.com>,
        Josh Poimboeuf <jpoimboe@...hat.com>,
        Denys Vlasenko <dvlasenk@...hat.com>,
        Rik van Riel <riel@...hat.com>,
        Boris Ostrovsky <boris.ostrovsky@...cle.com>,
        Juergen Gross <jgross@...e.com>,
        David Laight <David.Laight@...lab.com>,
        Eduardo Valentin <eduval@...zon.com>, aliguori@...zon.com,
        Will Deacon <will.deacon@....com>, daniel.gruss@...k.tugraz.at,
        Dave Hansen <dave.hansen@...ux.intel.com>,
        Borislav Petkov <bp@...e.de>, "H. Peter Anvin" <hpa@...or.com>
Subject: Re: [patch V163 27/51] x86/mm/pti: Populate user PGD

On Tue, 19 Dec 2017, Thomas Gleixner wrote:
> On Tue, 19 Dec 2017, Ingo Molnar wrote:
> We don't run out of space, but the 0-day robot triggered a nasty issue.
> 
> The fixmap bottom address, which contains the early_ioremap fixmap area, is:
> 
>     vaddr_bt = FIXADDR_TOP - FIX_BTMAP_BEGIN * PAGE_SIZE
> 
> If that address is lower than:
> 
>     vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
> 
> then cleanup_highmap() will happily 0 out the PMD entry for the PTE page of
> FIX_BTMAP. That entry was set up earlier in early_ioremap_init().
> 
> As a consequence the first call to __early_set_fixmap() which tries to
> install a PTE for early_ioremap() will crash and burn.
> 
> Below is a nasty hack which fixes the problem. Ideally we get all of this
> cpu_entry_stuff out of the fixmap. I'll look into that later, but for now
> the patch 'fixes' the issue.

I had a stab on moving the cpu_entry_area to some other place.

The patch below works, but:

 - it breaks i386 build because I have not yet found a way to place the
   CPU_ENTRY_AREA_BASE without creating include recursion hell

 - it probably does not work on XEN_PV, but I'm too tired now to figure
   that out.

Thanks,

	tglx

8<-------------------

 Documentation/x86/x86_64/mm.txt         |    4 
 arch/x86/events/intel/ds.c              |   53 ++++++------
 arch/x86/include/asm/desc.h             |    1 
 arch/x86/include/asm/fixmap.h           |   89 --------------------
 arch/x86/include/asm/pgtable_32_types.h |    6 -
 arch/x86/include/asm/pgtable_64_types.h |   49 ++++++-----
 arch/x86/kernel/cpu/common.c            |  125 ----------------------------
 arch/x86/kernel/dumpstack.c             |    1 
 arch/x86/kernel/traps.c                 |    5 -
 arch/x86/mm/Makefile                    |    2 
 arch/x86/mm/dump_pagetables.c           |    2 
 arch/x86/mm/kasan_init_64.c             |    6 -
 arch/x86/mm/pti.c                       |   39 +++------
 arch/x86/xen/mmu_pv.c                   |    2 
 b/arch/x86/include/asm/cpu_entry_area.h |   79 ++++++++++++++++++
 b/arch/x86/mm/cpu_entry_area.c          |  138 ++++++++++++++++++++++++++++++++
 16 files changed, 309 insertions(+), 292 deletions(-)

--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,7 +12,8 @@ ffffea0000000000 - ffffeaffffffffff (=40
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
-fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
@@ -36,6 +37,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -76,36 +76,39 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define MAXMEM		_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define MAXMEM			_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 #ifdef CONFIG_X86_5LEVEL
-#define VMALLOC_SIZE_TB _AC(12800, UL)
-#define __VMALLOC_BASE	_AC(0xffa0000000000000, UL)
-#define __VMEMMAP_BASE	_AC(0xffd4000000000000, UL)
-#define LDT_PGD_ENTRY _AC(-112, UL)
-#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+#define VMALLOC_SIZE_TB		_AC(12800, UL)
+#define __VMALLOC_BASE		_AC(0xffa0000000000000, UL)
+#define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL)
+#define LDT_PGD_ENTRY		_AC(-112, UL)
+#define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #else
-#define VMALLOC_SIZE_TB	_AC(32, UL)
-#define __VMALLOC_BASE	_AC(0xffffc90000000000, UL)
-#define __VMEMMAP_BASE	_AC(0xffffea0000000000, UL)
-#define LDT_PGD_ENTRY _AC(-3, UL)
-#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+#define VMALLOC_SIZE_TB		_AC(32, UL)
+#define __VMALLOC_BASE		_AC(0xffffc90000000000, UL)
+#define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL)
+#define LDT_PGD_ENTRY		_AC(-4, UL)
+#define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 #ifdef CONFIG_RANDOMIZE_MEMORY
-#define VMALLOC_START	vmalloc_base
-#define VMEMMAP_START	vmemmap_base
+#define VMALLOC_START		vmalloc_base
+#define VMEMMAP_START		vmemmap_base
 #else
-#define VMALLOC_START	__VMALLOC_BASE
-#define VMEMMAP_START	__VMEMMAP_BASE
+#define VMALLOC_START		__VMALLOC_BASE
+#define VMEMMAP_START		__VMEMMAP_BASE
 #endif /* CONFIG_RANDOMIZE_MEMORY */
-#define VMALLOC_END	(VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
-#define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+
+#define VMALLOC_END		(VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+#define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 /* The module sections ends with the start of the fixmap */
-#define MODULES_END   __fix_to_virt(__end_of_fixed_addresses + 1)
-#define MODULES_LEN   (MODULES_END - MODULES_VADDR)
-#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
-#define EFI_VA_START	 ( -4 * (_AC(1, UL) << 30))
-#define EFI_VA_END	 (-68 * (_AC(1, UL) << 30))
+#define MODULES_END		__fix_to_virt(__end_of_fixed_addresses + 1)
+#define MODULES_LEN		(MODULES_END - MODULES_VADDR)
+#define CPU_ENTRY_AREA_PGD	_AC(-3, UL)
+#define CPU_ENTRY_AREA_BASE	(CPU_ENTRY_AREA_PGD << P4D_SHIFT)
+#define ESPFIX_PGD_ENTRY	_AC(-2, UL)
+#define ESPFIX_BASE_ADDR	(ESPFIX_PGD_ENTRY << P4D_SHIFT)
+#define EFI_VA_START		( -4 * (_AC(1, UL) << 30))
+#define EFI_VA_END		(-68 * (_AC(1, UL) << 30))
 
 #define EARLY_DYNAMIC_PAGE_TABLES	64
 
--- /dev/null
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -0,0 +1,79 @@
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <asm/processor.h>
+#include <asm/intel_ds.h>
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code.  Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+	char gdt[PAGE_SIZE];
+
+	/*
+	 * The GDT is just below entry_stack and thus serves (on x86_64) as
+	 * a a read-only guard page.
+	 */
+	struct entry_stack_page entry_stack_page;
+
+	/*
+	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
+	 * we need task switches to work, and task switches write to the TSS.
+	 */
+	struct tss_struct tss;
+
+	char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Exception stacks used for IST entries.
+	 *
+	 * In the future, this should have a separate slot for each stack
+	 * with guard pages between them.
+	 */
+	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+#ifdef CONFIG_CPU_SUP_INTEL
+	/*
+	 * Per CPU debug store for Intel performance monitoring. Wastes a
+	 * full page at the moment.
+	 */
+	struct debug_store cpu_debug_store;
+	/*
+	 * The actual PEBS/BTS buffers must be mapped to user space
+	 * Reserve enough fixmap PTEs.
+	 */
+	struct debug_store_buffers cpu_debug_buffers;
+#endif
+};
+
+#define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area))
+#define CPU_ENTRY_AREA_TOT_SIZE	(CPU_ENTRY_AREA_SIZE * NR_CPUS)
+
+extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+#define	CPU_ENTRY_AREA_RO_IDT		CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU		(CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR	((void *)CPU_ENTRY_AREA_RO_IDT)
+
+static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+	return (struct cpu_entry_area *) va;
+}
+
+static inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+	return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
+
+#endif
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,7 +19,6 @@
 #include <asm/acpi.h>
 #include <asm/apicdef.h>
 #include <asm/page.h>
-#include <asm/intel_ds.h>
 #ifdef CONFIG_X86_32
 #include <linux/threads.h>
 #include <asm/kmap_types.h>
@@ -45,57 +44,6 @@ extern unsigned long __FIXADDR_TOP;
 			 PAGE_SIZE)
 #endif
 
-/*
- * cpu_entry_area is a percpu region in the fixmap that contains things
- * needed by the CPU and early entry/exit code.  Real types aren't used
- * for all fields here to avoid circular header dependencies.
- *
- * Every field is a virtual alias of some other allocated backing store.
- * There is no direct allocation of a struct cpu_entry_area.
- */
-struct cpu_entry_area {
-	char gdt[PAGE_SIZE];
-
-	/*
-	 * The GDT is just below entry_stack and thus serves (on x86_64) as
-	 * a a read-only guard page.
-	 */
-	struct entry_stack_page entry_stack_page;
-
-	/*
-	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
-	 * we need task switches to work, and task switches write to the TSS.
-	 */
-	struct tss_struct tss;
-
-	char entry_trampoline[PAGE_SIZE];
-
-#ifdef CONFIG_X86_64
-	/*
-	 * Exception stacks used for IST entries.
-	 *
-	 * In the future, this should have a separate slot for each stack
-	 * with guard pages between them.
-	 */
-	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
-#endif
-#ifdef CONFIG_CPU_SUP_INTEL
-	/*
-	 * Per CPU debug store for Intel performance monitoring. Wastes a
-	 * full page at the moment.
-	 */
-	struct debug_store cpu_debug_store;
-	/*
-	 * The actual PEBS/BTS buffers must be mapped to user space
-	 * Reserve enough fixmap PTEs.
-	 */
-	struct debug_store_buffers cpu_debug_buffers;
-#endif
-};
-
-#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
-
-extern void setup_cpu_entry_areas(void);
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -158,17 +106,7 @@ enum fixed_addresses {
 	FIX_TEXT_POKE1,	/* reserve 2 pages for text_poke() */
 	FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
 
-	/*
-	 * Fixmap entries to remap the IDT, and the per CPU entry areas.
-	 * Aligned to a PMD boundary.
-	 */
-	FIX_USR_SHARED_TOP = round_up(FIX_TEXT_POKE0 + 1, PTRS_PER_PMD),
-	FIX_RO_IDT,
-	FIX_CPU_ENTRY_AREA_TOP,
-	FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
-	FIX_USR_SHARED_BOTTOM  = round_up(FIX_CPU_ENTRY_AREA_BOTTOM + 2, PTRS_PER_PMD) - 1,
-
-	__end_of_permanent_fixed_addresses = FIX_USR_SHARED_BOTTOM,
+	__end_of_permanent_fixed_addresses = FIX_TEXT_POKE0,
 
 	/*
 	 * 512 temporary boot-time mappings, used by early_ioremap(),
@@ -249,30 +187,5 @@ void __init *early_memremap_decrypted_wp
 void __early_set_fixmap(enum fixed_addresses idx,
 			phys_addr_t phys, pgprot_t flags);
 
-static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
-{
-	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
-
-	return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
-}
-
-#define __get_cpu_entry_area_offset_index(cpu, offset) ({		\
-	BUILD_BUG_ON(offset % PAGE_SIZE != 0);				\
-	__get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE);	\
-	})
-
-#define get_cpu_entry_area_index(cpu, field)				\
-	__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
-
-static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
-{
-	return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
-}
-
-static inline struct entry_stack *cpu_entry_stack(int cpu)
-{
-	return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
-}
-
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -483,133 +483,8 @@ static const unsigned int exception_stac
 	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,
 	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
 };
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
-	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 #endif
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page,
-				   entry_stack_storage);
-
-/*
- * Force the population of PMDs for not yet allocated per cpu
- * memory like debug store buffers.
- */
-static void __init allocate_percpu_fixmap_ptes(int idx, int pages)
-{
-	for (; pages; pages--, idx--)
-		__set_fixmap(idx, 0, PAGE_NONE);
-}
-
-static void __init
-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
-{
-	for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
-		__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
-}
-
-static void percpu_setup_debug_store(int cpu)
-{
-#ifdef CONFIG_CPU_SUP_INTEL
-	int npages, idx;
-
-	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
-		return;
-
-	idx = get_cpu_entry_area_index(cpu, cpu_debug_store);
-	npages = sizeof(struct debug_store) / PAGE_SIZE;
-	BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
-	set_percpu_fixmap_pages(idx, &per_cpu(cpu_debug_store, cpu), npages,
-				PAGE_KERNEL);
-
-	idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers);
-	npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
-	allocate_percpu_fixmap_ptes(idx, npages);
-#endif
-}
-
-/* Setup the fixmap mappings only once per-processor */
-static void __init setup_cpu_entry_area(int cpu)
-{
-#ifdef CONFIG_X86_64
-	extern char _entry_trampoline[];
-
-	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
-	pgprot_t gdt_prot = PAGE_KERNEL_RO;
-	pgprot_t tss_prot = PAGE_KERNEL_RO;
-#else
-	/*
-	 * On native 32-bit systems, the GDT cannot be read-only because
-	 * our double fault handler uses a task gate, and entering through
-	 * a task gate needs to change an available TSS to busy.  If the
-	 * GDT is read-only, that will triple fault.  The TSS cannot be
-	 * read-only because the CPU writes to it on task switches.
-	 *
-	 * On Xen PV, the GDT must be read-only because the hypervisor
-	 * requires it.
-	 */
-	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
-		PAGE_KERNEL_RO : PAGE_KERNEL;
-	pgprot_t tss_prot = PAGE_KERNEL;
-#endif
-
-	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page),
-				per_cpu_ptr(&entry_stack_storage, cpu), 1,
-				PAGE_KERNEL);
-
-	/*
-	 * The Intel SDM says (Volume 3, 7.2.1):
-	 *
-	 *  Avoid placing a page boundary in the part of the TSS that the
-	 *  processor reads during a task switch (the first 104 bytes). The
-	 *  processor may not correctly perform address translations if a
-	 *  boundary occurs in this area. During a task switch, the processor
-	 *  reads and writes into the first 104 bytes of each TSS (using
-	 *  contiguous physical addresses beginning with the physical address
-	 *  of the first byte of the TSS). So, after TSS access begins, if
-	 *  part of the 104 bytes is not physically contiguous, the processor
-	 *  will access incorrect information without generating a page-fault
-	 *  exception.
-	 *
-	 * There are also a lot of errata involving the TSS spanning a page
-	 * boundary.  Assert that we're not doing that.
-	 */
-	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
-		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
-	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
-				&per_cpu(cpu_tss_rw, cpu),
-				sizeof(struct tss_struct) / PAGE_SIZE,
-				tss_prot);
-
-#ifdef CONFIG_X86_32
-	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
-#endif
-
-#ifdef CONFIG_X86_64
-	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
-	BUILD_BUG_ON(sizeof(exception_stacks) !=
-		     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
-				&per_cpu(exception_stacks, cpu),
-				sizeof(exception_stacks) / PAGE_SIZE,
-				PAGE_KERNEL);
-
-	__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
-		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
-#endif
-	percpu_setup_debug_store(cpu);
-}
-
-void __init setup_cpu_entry_areas(void)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu)
-		setup_cpu_entry_area(cpu);
-}
-
 /* Load the original GDT from the per-cpu structure */
 void load_direct_gdt(int cpu)
 {
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -60,6 +60,7 @@
 #include <asm/trace/mpx.h>
 #include <asm/mpx.h>
 #include <asm/vm86.h>
+#include <asm/cpu_entry_area.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -950,8 +951,8 @@ void __init trap_init(void)
 	 * "sidt" instruction will not leak the location of the kernel, and
 	 * to defend the IDT against arbitrary memory write vulnerabilities.
 	 * It will be reloaded in cpu_init() */
-	__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
-	idt_descr.address = fix_to_virt(FIX_RO_IDT);
+	cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), PAGE_KERNEL_RO);
+	idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
 
 	/*
 	 * Should be a barrier for any external CPU state:
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o	= -pg
 endif
 
 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o
+	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
 
 # Make sure __phys_addr has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
--- /dev/null
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -0,0 +1,138 @@
+#include <linux/percpu.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+
+
+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+{
+	unsigned long va = (unsigned long) cea_vaddr;
+
+	set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
+}
+
+/*
+ * Force the population of PMDs for not yet allocated per cpu
+ * memory like debug store buffers.
+ */
+static void __init cea_allocate_ptes(void *cea_vaddr, int pages)
+{
+	for (; pages; pages--, cea_vaddr += PAGE_SIZE)
+		cea_set_pte(cea_vaddr, 0, PAGE_NONE);
+}
+
+static void __init
+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+{
+	for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE)
+		cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+	int npages;
+	void *cea;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return;
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+	npages = sizeof(struct debug_store) / PAGE_SIZE;
+	BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+	cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+			     PAGE_KERNEL);
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+	npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+	cea_allocate_ptes(cea, npages);
+#endif
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
+{
+#ifdef CONFIG_X86_64
+	extern char _entry_trampoline[];
+
+	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+	pgprot_t gdt_prot = PAGE_KERNEL_RO;
+	pgprot_t tss_prot = PAGE_KERNEL_RO;
+#else
+	/*
+	 * On native 32-bit systems, the GDT cannot be read-only because
+	 * our double fault handler uses a task gate, and entering through
+	 * a task gate needs to change an available TSS to busy.  If the
+	 * GDT is read-only, that will triple fault.  The TSS cannot be
+	 * read-only because the CPU writes to it on task switches.
+	 *
+	 * On Xen PV, the GDT must be read-only because the hypervisor
+	 * requires it.
+	 */
+	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+		PAGE_KERNEL_RO : PAGE_KERNEL;
+	pgprot_t tss_prot = PAGE_KERNEL;
+#endif
+
+	cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
+		    gdt_prot);
+
+	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
+			     per_cpu_ptr(&entry_stack_storage, cpu), 1,
+			     PAGE_KERNEL);
+
+	/*
+	 * The Intel SDM says (Volume 3, 7.2.1):
+	 *
+	 *  Avoid placing a page boundary in the part of the TSS that the
+	 *  processor reads during a task switch (the first 104 bytes). The
+	 *  processor may not correctly perform address translations if a
+	 *  boundary occurs in this area. During a task switch, the processor
+	 *  reads and writes into the first 104 bytes of each TSS (using
+	 *  contiguous physical addresses beginning with the physical address
+	 *  of the first byte of the TSS). So, after TSS access begins, if
+	 *  part of the 104 bytes is not physically contiguous, the processor
+	 *  will access incorrect information without generating a page-fault
+	 *  exception.
+	 *
+	 * There are also a lot of errata involving the TSS spanning a page
+	 * boundary.  Assert that we're not doing that.
+	 */
+	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
+			     &per_cpu(cpu_tss_rw, cpu),
+			     sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
+
+#ifdef CONFIG_X86_32
+	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+	BUILD_BUG_ON(sizeof(exception_stacks) !=
+		     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
+			     &per_cpu(exception_stacks, cpu),
+			     sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
+
+	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
+		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+	percpu_setup_debug_store(cpu);
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		setup_cpu_entry_area(cpu);
+}
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,6 +3,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 
@@ -280,16 +281,22 @@ void fini_debug_store_on_cpu(int cpu)
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
-static u64 ds_update_fixmap(int idx, void *addr, size_t size, pgprot_t prot)
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
 {
-	phys_addr_t pa, va;
+	phys_addr_t pa;
 	size_t msz = 0;
 
-	va = __fix_to_virt(idx);
 	pa = virt_to_phys(addr);
-	for (; msz < size; idx--, msz += PAGE_SIZE, pa += PAGE_SIZE)
-		__set_fixmap(idx, pa, prot);
-	return va;
+	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+	size_t msz = 0;
+
+	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
 }
 
 static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
@@ -313,8 +320,8 @@ static int alloc_pebs_buffer(int cpu)
 	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
 	struct debug_store *ds = hwev->ds;
 	size_t bsiz = x86_pmu.pebs_buffer_size;
-	int idx, max, node = cpu_to_node(cpu);
-	void *buffer, *ibuffer;
+	int max, node = cpu_to_node(cpu);
+	void *buffer, *ibuffer, *cea;
 
 	if (!x86_pmu.pebs)
 		return 0;
@@ -336,10 +343,10 @@ static int alloc_pebs_buffer(int cpu)
 		per_cpu(insn_buffer, cpu) = ibuffer;
 	}
 	hwev->ds_pebs_vaddr = buffer;
-	/* Update the fixmap */
-	idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.pebs_buffer);
-	ds->pebs_buffer_base = ds_update_fixmap(idx, buffer, bsiz,
-						PAGE_KERNEL);
+	/* Update the cpu entry area mapping */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds->pebs_buffer_base = (u64) cea;
+	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
 	ds->pebs_index = ds->pebs_buffer_base;
 	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
 	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
@@ -350,7 +357,7 @@ static void release_pebs_buffer(int cpu)
 {
 	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
 	struct debug_store *ds = hwev->ds;
-	int idx;
+	void *cea;
 
 	if (!ds || !x86_pmu.pebs)
 		return;
@@ -359,8 +366,8 @@ static void release_pebs_buffer(int cpu)
 	per_cpu(insn_buffer, cpu) = NULL;
 
 	/* Clear the fixmap */
-	idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.pebs_buffer);
-	ds_update_fixmap(idx, 0, x86_pmu.pebs_buffer_size, PAGE_NONE);
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
 	ds->pebs_buffer_base = 0;
 	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
 	hwev->ds_pebs_vaddr = NULL;
@@ -370,8 +377,8 @@ static int alloc_bts_buffer(int cpu)
 {
 	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
 	struct debug_store *ds = hwev->ds;
-	int idx, max;
-	void *buffer;
+	void *buffer, *cea;
+	int max;
 
 	if (!x86_pmu.bts)
 		return 0;
@@ -383,9 +390,9 @@ static int alloc_bts_buffer(int cpu)
 	}
 	hwev->ds_bts_vaddr = buffer;
 	/* Update the fixmap */
-	idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.bts_buffer);
-	ds->bts_buffer_base = ds_update_fixmap(idx, buffer, BTS_BUFFER_SIZE,
-					       PAGE_KERNEL);
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds->bts_buffer_base = (u64) cea;
+	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
 	ds->bts_index = ds->bts_buffer_base;
 	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
 	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
@@ -397,14 +404,14 @@ static void release_bts_buffer(int cpu)
 {
 	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
 	struct debug_store *ds = hwev->ds;
-	int idx;
+	void *cea;
 
 	if (!ds || !x86_pmu.bts)
 		return;
 
 	/* Clear the fixmap */
-	idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.bts_buffer);
-	ds_update_fixmap(idx, 0, BTS_BUFFER_SIZE, PAGE_NONE);
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds_clear_cea(cea, BTS_BUFFER_SIZE);
 	ds->bts_buffer_base = 0;
 	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
 	hwev->ds_bts_vaddr = NULL;
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -7,6 +7,7 @@
 #include <asm/mmu.h>
 #include <asm/fixmap.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpu_entry_area.h>
 
 #include <linux/smp.h>
 #include <linux/percpu.h>
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -38,13 +38,13 @@ extern bool __vmalloc_start_set; /* set
 #define LAST_PKMAP 1024
 #endif
 
-#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1))	\
-		    & PMD_MASK)
+#define PKMAP_BASE		\
+	((FIXMAP_START - PAGE_SIZE * (LAST_PKMAP + 1)) & PMD_MASK)
 
 #ifdef CONFIG_HIGHMEM
 # define VMALLOC_END	(PKMAP_BASE - 2 * PAGE_SIZE)
 #else
-# define VMALLOC_END	(FIXADDR_START - 2 * PAGE_SIZE)
+# define VMALLOC_END	(FIXMAP_START - 2 * PAGE_SIZE)
 #endif
 
 #define MODULES_VADDR	VMALLOC_START
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,6 +18,7 @@
 #include <linux/nmi.h>
 #include <linux/sysfs.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
 
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -62,6 +62,7 @@ enum address_markers_idx {
 #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
 	LDT_NR,
 #endif
+	CPU_ENTRY_AREA_START_NR,
 # ifdef CONFIG_X86_ESPFIX64
 	ESPFIX_START_NR,
 # endif
@@ -97,6 +98,7 @@ static struct addr_marker address_marker
 #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
 	{ LDT_BASE_ADDR,	"LDT remap" },
 #endif
+	{ CPU_ENTRY_AREA_BASE,	"CPU entry Area" },
 # ifdef CONFIG_X86_ESPFIX64
 	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
 # endif
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -15,6 +15,7 @@
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>
 
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
@@ -330,12 +331,13 @@ void __init kasan_init(void)
 			      (unsigned long)kasan_mem_to_shadow(_end),
 			      early_pfn_to_nid(__pa(_stext)));
 
-	shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
+	shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
 	shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
 	shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
 						PAGE_SIZE);
 
-	shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
+	shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+					CPU_ENTRY_AREA_TOT_SIZE);
 	shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
 	shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
 					PAGE_SIZE);
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -275,21 +275,25 @@ pti_clone_pmds(unsigned long start, unsi
 	}
 }
 
-static void __init pti_setup_espfix64(void)
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems.
+ */
+static void __init pti_clone_p4d(unsigned long addr)
 {
-#ifdef CONFIG_X86_ESPFIX64
-	/*
-	 * ESPFIX64 uses a single p4d (i.e. a top-level entry on 4-level
-	 * systems and a next-level entry on 5-level systems.  Share that
-	 * entry between the user and kernel pagetables.
-	 */
-	pgd_t *kernel_pgd;
 	p4d_t *kernel_p4d, *user_p4d;
+	pgd_t *kernel_pgd;
 
-	user_p4d = pti_user_pagetable_walk_p4d(ESPFIX_BASE_ADDR);
-	kernel_pgd = pgd_offset_k(ESPFIX_BASE_ADDR);
-	kernel_p4d = p4d_offset(kernel_pgd, ESPFIX_BASE_ADDR);
+	user_p4d = pti_user_pagetable_walk_p4d(addr);
+	kernel_pgd = pgd_offset_k(addr);
+	kernel_p4d = p4d_offset(kernel_pgd, addr);
 	*user_p4d = *kernel_p4d;
+}
+
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+	pti_clone_p4d(ESPFIX_BASE_ADDR);
 #endif
 }
 
@@ -313,20 +317,11 @@ static void __init pti_setup_vsyscall(vo
 }
 
 /*
- * Clone the populated PMDs of the user shared fixmaps into the user space
- * visible page table.
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
  */
 static void __init pti_clone_user_shared(void)
 {
-	unsigned long bot, top;
-
-	bot = __fix_to_virt(FIX_USR_SHARED_BOTTOM);
-	top = __fix_to_virt(FIX_USR_SHARED_TOP) + PAGE_SIZE;
-
-	/* Top of the user shared block must be PMD-aligned. */
-	WARN_ON(top & ~PMD_MASK);
-
-	pti_clone_pmds(bot, top, 0);
+	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
 }
 
 /*
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2261,7 +2261,6 @@ static void xen_set_fixmap(unsigned idx,
 
 	switch (idx) {
 	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-	case FIX_RO_IDT:
 #ifdef CONFIG_X86_32
 	case FIX_WP_TEST:
 # ifdef CONFIG_HIGHMEM
@@ -2272,7 +2271,6 @@ static void xen_set_fixmap(unsigned idx,
 #endif
 	case FIX_TEXT_POKE0:
 	case FIX_TEXT_POKE1:
-	case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
 		/* All local page mappings */
 		pte = pfn_pte(phys, prot);
 		break;

Powered by blists - more mailing lists