Add paravirt hooks into the initial pagetable setup. In the native case, the kernel builds itself a new initial pagetable from scratch. In the Xen case, the kernel starts with a pagetable provided by the hypervisor, which is used as the prototype for the kernel-generated pagetable. The hooks added in this patch allow either mode of operation without having special cases (the main change to the pagetable construction logic is a testing to make sure a pagetable slot is actually empty before populating it). Signed-off-by: Jeremy Fitzhardinge Cc: Chris Wright Cc: Zachary Amsden Cc: Andi Kleen Cc: Andrew Morton Cc: Rusty Russell =================================================================== --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -379,6 +379,43 @@ static fastcall void native_io_delay(voi { asm volatile("outb %al,$0x80"); } + +void native_pagetable_setup_start(pgd_t *base) +{ +#ifdef CONFIG_X86_PAE + int i; + + /* + * Init entries of the first-level page table to the + * zero page, if they haven't already been set up. + * + * In a normal native boot, we'll be running on a + * pagetable rooted in swapper_pg_dir, but not in PAE + * mode, so this will end up clobbering the mappings + * for the lower 24Mbytes of the address space, + * without affecting the kernel address space. + */ + for (i = 0; i < USER_PTRS_PER_PGD; i++) + set_pgd(&base[i], + __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); + memset(&base[USER_PTRS_PER_PGD], 0, sizeof(pgd_t)); +#endif +} + +void native_pagetable_setup_done(pgd_t *base) +{ +#ifdef CONFIG_X86_PAE + /* + * Add low memory identity-mappings - SMP needs it when + * starting up on an AP from real-mode. In the non-PAE + * case we already have these mappings through head.S. + * All user-space mappings are explicitly cleared after + * SMP startup. + */ + set_pgd(&base[0], base[USER_PTRS_PER_PGD]); +#endif +} + static fastcall void native_flush_tlb(void) { @@ -627,6 +664,9 @@ struct paravirt_ops paravirt_ops = { #endif .set_lazy_mode = (void *)native_nop, + .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_setup_done = native_pagetable_setup_done, + .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, =================================================================== --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -42,6 +42,7 @@ #include #include #include +#include unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -62,6 +63,8 @@ static pmd_t * __init one_md_table_init( #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + memset(pmd_table, 0, PAGE_SIZE); + paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -83,12 +86,11 @@ static pte_t * __init one_page_table_ini { if (pmd_none(*pmd)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + memset(page_table, 0, PAGE_SIZE); + paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); - if (page_table != pte_offset_kernel(pmd, 0)) - BUG(); - - return page_table; + BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } return pte_offset_kernel(pmd, 0); @@ -119,7 +121,7 @@ static void __init page_table_range_init pgd = pgd_base + pgd_idx; for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { - if (pgd_none(*pgd)) + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) one_md_table_init(pgd); pud = pud_offset(pgd, vaddr); pmd = pmd_offset(pud, vaddr); @@ -158,7 +160,11 @@ static void __init kernel_physical_mappi pfn = 0; for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { - pmd = one_md_table_init(pgd); + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) + pmd = one_md_table_init(pgd); + else + pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET); + if (pfn >= max_low_pfn) continue; for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { @@ -167,20 +173,26 @@ static void __init kernel_physical_mappi /* Map with big pages if possible, otherwise create normal page tables. */ if (cpu_has_pse) { unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - - if (is_kernel_text(address) || is_kernel_text(address2)) - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); - else - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + if (!pmd_present(*pmd)) { + if (is_kernel_text(address) || is_kernel_text(address2)) + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); + else + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + } pfn += PTRS_PER_PTE; } else { pte = one_page_table_init(pmd); - for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { - if (is_kernel_text(address)) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - else - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + for (pte_ofs = 0; + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; + pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { + if (pte_present(*pte)) + continue; + + if (is_kernel_text(address)) + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + else + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); } } } @@ -337,19 +349,32 @@ extern void __init remap_numa_kva(void); #define remap_numa_kva() do {} while (0) #endif +/* + * Build a proper pagetable for the kernel mappings. Up until this + * point, we've been running on some set of pagetables constructed by + * the boot process. + * + * If we're booting on native hardware, this will be a pagetable + * constructed in arch/i386/kernel/head.S, and not running in PAE mode + * (even if we'll end up running in PAE). The root of the pagetable + * will be swapper_pg_dir. + * + * If we're booting paravirtualized under a hypervisor, then there are + * more options: we may already be running PAE, and the pagetable may + * or may not be based in swapper_pg_dir. In any case, + * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * appropriately for the rest of the initialization to work. + * + * In general, pagetable_init() assumes that the pagetable may already + * be partially populated, and so it avoids stomping on any existing + * mappings. + */ static void __init pagetable_init (void) { - unsigned long vaddr; + unsigned long vaddr, end; pgd_t *pgd_base = swapper_pg_dir; -#ifdef CONFIG_X86_PAE - int i; - /* Init entries of the first-level page table to the zero page */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); -#else - paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); -#endif + paravirt_pagetable_setup_start(pgd_base); /* Enable PSE if available */ if (cpu_has_pse) { @@ -371,20 +396,12 @@ static void __init pagetable_init (void) * created - mappings will be set by set_fixmap(): */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, 0, pgd_base); + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; + page_table_range_init(vaddr, end, pgd_base); permanent_kmaps_init(pgd_base); -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]); -#endif + paravirt_pagetable_setup_done(pgd_base); } #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) =================================================================== --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -49,6 +49,9 @@ struct paravirt_ops void (*arch_setup)(void); char *(*memory_setup)(void); void (*init_IRQ)(void); + + void (*pagetable_setup_start)(pgd_t *pgd_base); + void (*pagetable_setup_done)(pgd_t *pgd_base); void (*banner)(void); @@ -185,6 +188,8 @@ struct paravirt_ops extern struct paravirt_ops paravirt_ops; +void native_pagetable_setup_start(pgd_t *pgd); + #ifdef CONFIG_X86_PAE fastcall unsigned long long native_pte_val(pte_t); fastcall unsigned long long native_pmd_val(pmd_t); @@ -389,6 +394,17 @@ static inline void setup_secondary_clock } #endif +static inline void paravirt_pagetable_setup_start(pgd_t *base) +{ + if (paravirt_ops.pagetable_setup_start) + (*paravirt_ops.pagetable_setup_start)(base); +} + +static inline void paravirt_pagetable_setup_done(pgd_t *base) +{ + if (paravirt_ops.pagetable_setup_done) + (*paravirt_ops.pagetable_setup_done)(base); +} fastcall void native_set_pte(pte_t *ptep, pte_t pteval); fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval); @@ -615,5 +631,43 @@ 772:; \ call *paravirt_ops+PARAVIRT_read_cr0 #endif /* __ASSEMBLY__ */ +#else /* !CONFIG_PARAVIRT */ +#include + +static inline void paravirt_pagetable_setup_start(pgd_t *base) +{ +#ifdef CONFIG_X86_PAE + int i; + + /* + * Init entries of the first-level page table to the + * zero page, if they haven't already been set up. + * + * In a normal native boot, we'll be running on a + * pagetable rooted in swapper_pg_dir, but not in PAE + * mode, so this will end up clobbering the mappings + * for the lower 24Mbytes of the address space, + * without affecting the kernel address space. + */ + for (i = 0; i < USER_PTRS_PER_PGD; i++) + set_pgd(&base[i], + __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); + memset(&base[USER_PTRS_PER_PGD], 0, sizeof(pgd_t)); +#endif +} + +static inline void paravirt_pagetable_setup_done(pgd_t *base) +{ +#ifdef CONFIG_X86_PAE + /* + * Add low memory identity-mappings - SMP needs it when + * starting up on an AP from real-mode. In the non-PAE + * case we already have these mappings through head.S. + * All user-space mappings are explicitly cleared after + * SMP startup. + */ + set_pgd(&base[0], base[USER_PTRS_PER_PGD]); +#endif +} #endif /* CONFIG_PARAVIRT */ #endif /* __ASM_PARAVIRT_H */ =================================================================== --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -15,7 +15,10 @@ #include #include #include + +#ifdef CONFIG_PARAVIRT /* guarded to prevent cyclic dependency */ #include +#endif #ifndef _I386_BITOPS_H #include -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/