diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 766ea16..2d88344 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PGTABLE_64_DEFS_H #define _ASM_X86_PGTABLE_64_DEFS_H +#include + #ifndef __ASSEMBLY__ #include @@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) +#define EARLY_DYNAMIC_PAGE_TABLES 64 + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 037df57..9443c77 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -26,11 +26,73 @@ #include #include -static void __init zap_identity_mappings(void) +/* + * Manage page tables very early on. + */ +extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +static unsigned int __initdata next_early_pgt = 2, early_pgt_resets = 0; + +/* Wipe all early page tables except for the kernel symbol map */ +static void __init reset_early_page_tables(void) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - __flush_tlb_all(); + unsigned long i; + + for (i = 0; i < PTRS_PER_PGD-1; i++) + early_level4_pgt[i].pgd = 0; + + next_early_pgt = 0; + early_pgt_resets++; + + __native_flush_tlb(); +} + +/* Create a new PMD entry */ +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + unsigned long i; + pgdval_t pgd, *pgd_p; + pudval_t *pud_p; + pmdval_t pmd, *pmd_p; + + if (physaddr >= MAXMEM) + return -1; /* Invalid address - puke */ + + i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1); + pgd_p = &early_level4_pgt[i].pgd; + pgd = *pgd_p; + + /* + * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is + * critical -- __PAGE_OFFSET would point us back into the dynamic + * range and we might end up looping forever... + */ + if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) { + pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map); + } else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1) + reset_early_page_tables(); + + pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PUD; i++) + pud_p[i] = 0; + + *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + _KERNPG_TABLE; + } + i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); + pud_p += i; + + pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; + pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL); + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd_p[i] = pmd; + pmd += PMD_SIZE; + } + + *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + _KERNPG_TABLE; + + return 0; } /* Don't add a printk in there. printk relies on the PDA which is not initialized @@ -70,12 +132,13 @@ void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + /* Kill off the identity-map trampoline */ + reset_early_page_tables(); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); - /* Make NULL pointers segfault */ - zap_identity_mappings(); - + /* XXX - this is wrong... we need to build page tables from scratch */ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 94bf9cc..e13ff91 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: - /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded an identity mapped page table * for us. These identity mapped page tables map all of the * kernel pages and possibly all of memory. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either directly from a 64bit bootloader, or from * arch/x86_64/boot/compressed/head.S. @@ -66,7 +65,8 @@ startup_64: * tables and then reload them. */ - /* Compute the delta between the address I am compiled to run at and the + /* + * Compute the delta between the address I am compiled to run at and the * address I am actually running at. */ leaq _text(%rip), %rbp @@ -78,53 +78,66 @@ startup_64: testl %eax, %eax jnz bad_address - /* Is the address too large? */ - leaq _text(%rip), %rdx - movq $PGDIR_SIZE, %rax - cmpq %rax, %rdx - jae bad_address - - /* Fixup the physical addresses in the page table + /* + * Is the address too large? */ - addq %rbp, init_level4_pgt + 0(%rip) - addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) - addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) + leaq _text(%rip), %rax + shrq $MAX_PHYSMEM_BITS, %rax + jnz bad_address - addq %rbp, level3_ident_pgt + 0(%rip) + /* + * Fixup the physical addresses in the page table + */ + addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) addq %rbp, level3_kernel_pgt + (510*8)(%rip) addq %rbp, level3_kernel_pgt + (511*8)(%rip) addq %rbp, level2_fixmap_pgt + (506*8)(%rip) - /* Add an Identity mapping if I am above 1G */ + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ leaq _text(%rip), %rdi - andq $PMD_PAGE_MASK, %rdi + leaq early_level4_pgt(%rip), %rbx movq %rdi, %rax - shrq $PUD_SHIFT, %rax - andq $(PTRS_PER_PUD - 1), %rax - jz ident_complete + shrq $PGDIR_SHIFT, %rax + + leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + movq %rdx, 0(%rbx,%rax,8) + movq %rdx, 8(%rbx,%rax,8) - leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx - leaq level3_ident_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) + addq $4096, %rdx + movq %rdi, %rax + shrq $PUD_SHIFT, %rax + andl $(PTRS_PER_PUD-1), %eax + movq %rdx, (4096+0)(%rbx,%rax,8) + movq %rdx, (4096+8)(%rbx,%rax,8) + addq $8192, %rbx movq %rdi, %rax - shrq $PMD_SHIFT, %rax - andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx - leaq level2_spare_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) -ident_complete: + shrq $PMD_SHIFT, %rdi + addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax + movl $PTRS_PER_PMD, %ecx +1: + andq $(PTRS_PER_PMD - 1), %rdi + movq %rax, (%rbx,%rdi,8) + incq %rdi + addq $PMD_SIZE, %rax + decl %ecx + jnz 1b + /* * Fixup the kernel text+data virtual addresses. Note that * we might write invalid pmds, when the kernel is relocated * cleanup_highmap() fixes this up along with the mappings * beyond _end. */ - leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ @@ -149,7 +162,7 @@ ENTRY(secondary_startup_64) * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded a mapped page table. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either from startup_64 (using physical addresses) * or from trampoline.S (using virtual addresses). @@ -196,7 +209,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip),%rsp + movq stack_start(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -236,31 +249,31 @@ ENTRY(secondary_startup_64) movl initial_gs+4(%rip),%edx wrmsr - /* esi is pointer to real mode structure with interesting info. + /* rsi is pointer to real mode structure with interesting info. pass it to C */ - movl %esi, %edi + movq %rsi, %rdi /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this - * a far return. + * a far jump. */ - movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder - pushq $__KERNEL_CS # set correct cs - pushq %rax # target address in negative space - lretq + /* gas 2.22 is buggy and mis-assembles ljmpq */ + rex64 ljmp *initial_code(%rip) /* SMP bootup changes these two */ __REFDATA - .align 8 - ENTRY(initial_code) + .balign 8 + GLOBAL(initial_code) .quad x86_64_start_kernel - ENTRY(initial_gs) + .word __KERNEL_CS + .balign 8 + GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - ENTRY(stack_start) + GLOBAL(stack_start) .quad init_thread_union+THREAD_SIZE-8 .word 0 __FINITDATA @@ -268,7 +281,7 @@ ENTRY(secondary_startup_64) bad_address: jmp bad_address - .section ".init.text","ax" + __INIT .globl early_idt_handlers early_idt_handlers: # 104(%rsp) %rflags @@ -305,14 +318,22 @@ ENTRY(early_idt_handler) pushq %r11 # 0(%rsp) cmpl $__KERNEL_CS,96(%rsp) - jne 10f + jne 11f + cmpl $14,72(%rsp) # Page fault? + jnz 10f + GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + call early_make_pgtable + andl %eax,%eax + jz 20f # All good + +10: leaq 88(%rsp),%rdi # Pointer to %rip call early_fixup_exception andl %eax,%eax jnz 20f # Found an exception entry -10: +11: #ifdef CONFIG_EARLY_PRINTK GET_CR2_INTO(%r9) # can clobber any volatile register if pv movl 80(%rsp),%r8d # error code @@ -334,7 +355,7 @@ ENTRY(early_idt_handler) 1: hlt jmp 1b -20: # Exception table entry found +20: # Exception table entry found or page table generated popq %r11 popq %r10 popq %r9 @@ -348,6 +369,8 @@ ENTRY(early_idt_handler) decl early_recursion_flag(%rip) INTERRUPT_RETURN + __INITDATA + .balign 4 early_recursion_flag: .long 0 @@ -358,11 +381,10 @@ early_idt_msg: early_idt_ripmsg: .asciz "RIP %s\n" #endif /* CONFIG_EARLY_PRINTK */ - .previous #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ -ENTRY(name) +GLOBAL(name) /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ @@ -372,46 +394,21 @@ ENTRY(name) i = i + 1 ; \ .endr - .data - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ -NEXT_PAGE(init_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_START_KERNEL*8, 0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + __INITDATA +NEXT_PAGE(early_level4_pgt) + .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE -NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 511,8,0 +NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 + .data NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE -NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 - -NEXT_PAGE(level1_fixmap_pgt) - .fill 512,8,0 - -NEXT_PAGE(level2_ident_pgt) - /* Since I easily can, map the first 1G. - * Don't set NX because code runs from these pages. - */ - PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) - NEXT_PAGE(level2_kernel_pgt) /* * 512 MB kernel mapping. We spend a full page on this pagetable @@ -426,11 +423,13 @@ NEXT_PAGE(level2_kernel_pgt) PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) -NEXT_PAGE(level2_spare_pgt) - .fill 512, 8, 0 +NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 #undef PMDS -#undef NEXT_PAGE .data .align 16 @@ -456,6 +455,7 @@ ENTRY(nmi_idt_table) .skip IDT_ENTRIES * 16 __PAGE_ALIGNED_BSS - .align PAGE_SIZE -ENTRY(empty_zero_page) +NEXT_PAGE(empty_zero_page) + .skip PAGE_SIZE +NEXT_PAGE(init_level4_pgt) .skip PAGE_SIZE