x86_64: Map only usable memory in the identity map; all reserved memory
is mapped to a single zero page instead.

This is done late in the boot process, by pruning the page tables that
were set up earlier so that mappings covering only reserved regions are
redirected to the zero page. The prune runs after mem_init(), so pages
can still be allocated as needed, and before the APs are started.

Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Suresh Siddha

Index: linux-2.6.git/arch/x86/kernel/e820_64.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/e820_64.c	2008-01-08 03:41:30.000000000 -0800
+++ linux-2.6.git/arch/x86/kernel/e820_64.c	2008-01-08 04:00:59.000000000 -0800
@@ -121,6 +121,35 @@
 }
 EXPORT_SYMBOL_GPL(e820_any_mapped);
 
+int e820_any_non_reserved(unsigned long start, unsigned long end)
+{
+	int i;
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		if (ei->type == E820_RESERVED)
+			continue;
+		if (ei->addr >= end || ei->addr + ei->size <= start)
+			continue;
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_non_reserved);
+
+int is_memory_any_valid(unsigned long start, unsigned long end)
+{
+	/*
+	 * Keep the low PCI/ISA area always mapped.
+	 * Note: start is inclusive and end is exclusive here.
+	 */
+	if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS)
+		return 1;
+
+	/* Switch to EFI or e820 here in the future */
+	return e820_any_non_reserved(start, end);
+}
+EXPORT_SYMBOL_GPL(is_memory_any_valid);
+
 /*
  * This function checks if the entire range is mapped with type.
  *
@@ -156,6 +185,47 @@
 	return 0;
 }
 
+int e820_all_non_reserved(unsigned long start, unsigned long end)
+{
+	int i;
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		if (ei->type == E820_RESERVED)
+			continue;
+
+		/* Does this region overlap the range at all? */
+		if (ei->addr >= end || ei->addr + ei->size <= start)
+			continue;
+
+		/*
+		 * If this region covers the start of the range, move start
+		 * to the end of the region: coverage is proven up to there.
+		 */
+		if (ei->addr <= start)
+			start = ei->addr + ei->size;
+
+		/* If start reached end, the whole range is covered. */
+		if (start >= end)
+			return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(e820_all_non_reserved);
+
+int is_memory_all_valid(unsigned long start, unsigned long end)
+{
+	/*
+	 * Keep the low PCI/ISA area always mapped.
+	 * Note: start is inclusive and end is exclusive here.
+	 */
+	if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS)
+		return 1;
+
+	/* Switch to EFI or e820 here in the future */
+	return e820_all_non_reserved(start, end);
+}
+EXPORT_SYMBOL_GPL(is_memory_all_valid);
+
 /*
  * Find a free area in a specific range.
  */
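The full-coverage walk in e820_all_non_reserved() is the subtle part: each
non-reserved region that covers the current start advances start past its
end, and the range is accepted once start reaches end. Like
e820_all_mapped(), this relies on the e820 map being sorted by address. A
standalone userspace model of the same walk, with an invented toy map (not
kernel code), looks like this:

#include <stdio.h>

struct range_entry {
	unsigned long start, end;	/* [start, end) */
	int reserved;
};

/* invented map: usable RAM, a reserved hole, then usable RAM again */
static struct range_entry map[] = {
	{ 0x00000,  0x9f000,  0 },
	{ 0x9f000,  0x100000, 1 },
	{ 0x100000, 0x200000, 0 },
};

static int all_non_reserved(unsigned long start, unsigned long end)
{
	int i, n = sizeof(map) / sizeof(map[0]);

	for (i = 0; i < n; i++) {
		if (map[i].reserved)
			continue;
		/* skip entries that do not overlap [start, end) */
		if (map[i].start >= end || map[i].end <= start)
			continue;
		/* this entry covers the current start: advance past it */
		if (map[i].start <= start)
			start = map[i].end;
		if (start >= end)
			return 1;	/* every byte is non-reserved */
	}
	return 0;
}

int main(void)
{
	printf("%d\n", all_non_reserved(0x0, 0x9f000));		/* 1 */
	printf("%d\n", all_non_reserved(0x90000, 0x110000));	/* 0: crosses the hole */
	return 0;
}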
Index: linux-2.6.git/arch/x86/mm/init_64.c
===================================================================
--- linux-2.6.git.orig/arch/x86/mm/init_64.c	2008-01-08 03:43:46.000000000 -0800
+++ linux-2.6.git/arch/x86/mm/init_64.c	2008-01-08 03:59:28.000000000 -0800
@@ -215,8 +215,9 @@
 	int i, pmds;
 
 	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
-	vaddr = __START_KERNEL_map;
-	pmd = level2_kernel_pgt;
+	/* Skip PMDs meant for kernel text */
+	vaddr = __START_KERNEL_map + KERNEL_TEXT_SIZE;
+	pmd = level2_kernel_pgt + (KERNEL_TEXT_SIZE / PMD_SIZE);
 	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
 	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
 		for (i = 0; i < pmds; i++) {
@@ -299,11 +300,6 @@
 		if (addr >= end)
 			break;
 
-		if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
-			set_pud(pud, __pud(0));
-			continue;
-		}
-
 		if (pud_val(*pud)) {
 			phys_pmd_update(pud, addr, end);
 			continue;
@@ -344,6 +340,8 @@
 		(table_start << PAGE_SHIFT) + tables);
 }
 
+static unsigned long max_addr;
+
 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
    This runs before bootmem is initialized and gets pages directly from
    the physical memory. To access them they are temporarily mapped. */
@@ -370,10 +368,13 @@
 		pgd_t *pgd = pgd_offset_k(start);
 		pud_t *pud;
 
-		if (after_bootmem)
+		if (after_bootmem) {
 			pud = pud_offset(pgd, start & PGDIR_MASK);
-		else
+		} else {
 			pud = alloc_low_page(&pud_phys);
+			if (end > max_addr)
+				max_addr = end;
+		}
 
 		next = start + PGDIR_SIZE;
 		if (next > end)
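The hunk that follows adds the pruning machinery itself. Its key economy is
that every fully reserved 2MB region shares one zero-filled data page
through one PTE page, so the prune costs two pages in total rather than two
per region; get_res_page() and get_res_ptepage() memoize exactly that. A
minimal userspace sketch of the sharing (ours; the names and addresses are
invented):

#include <stdio.h>

#define PTRS_PER_PTE 512

static unsigned long zero_page_phys = 0x1000;		/* invented address */
static unsigned long res_pte_page[PTRS_PER_PTE];	/* models the shared PTE page */

static unsigned long *shared_pte_page(void)
{
	static int initialized;
	int i;

	if (!initialized) {	/* built once, then reused everywhere */
		for (i = 0; i < PTRS_PER_PTE; i++)
			res_pte_page[i] = zero_page_phys; /* NX etc. in the real code */
		initialized = 1;
	}
	return res_pte_page;
}

int main(void)
{
	/* two different fully reserved 2MB regions get the same PTE page */
	printf("region A -> %p\n", (void *)shared_pte_page());
	printf("region B -> %p\n", (void *)shared_pte_page());
	return 0;
}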
@@ -489,6 +490,187 @@
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
 			 kcore_modules, kcore_vsyscall;
 
+static unsigned long __init get_res_page(void)
+{
+	static unsigned long res_phys_page;
+	if (!res_phys_page) {
+		pte_t *pte;
+		pte = alloc_low_page(&res_phys_page);
+		unmap_low_page(pte);
+	}
+	return res_phys_page;
+}
+
+static unsigned long __init get_res_ptepage(void)
+{
+	static unsigned long res_phys_ptepage;
+	if (!res_phys_ptepage) {
+		pte_t *pte_page;
+		unsigned long page_phys;
+		unsigned long entry;
+		int i;
+
+		pte_page = alloc_low_page(&res_phys_ptepage);
+
+		page_phys = get_res_page();
+		entry = _PAGE_NX | _KERNPG_TABLE | _PAGE_GLOBAL | page_phys;
+		entry &= __supported_pte_mask;
+		for (i = 0; i < PTRS_PER_PTE; i++) {
+			pte_t *pte = pte_page + i;
+			set_pte(pte, __pte(entry));
+		}
+
+		unmap_low_page(pte_page);
+	}
+	return res_phys_ptepage;
+}
+
+static void __init phys_pte_prune(pte_t *pte_page, unsigned long address,
+	unsigned long end, unsigned long vaddr, unsigned int exec)
+{
+	int i = pte_index(vaddr);
+
+	for (; i < PTRS_PER_PTE; i++,
+	     address = (address & PAGE_MASK) + PAGE_SIZE,
+	     vaddr = (vaddr & PAGE_MASK) + PAGE_SIZE) {
+		unsigned long entry;
+		pte_t *pte = pte_page + i;
+
+		if (address >= end)
+			break;
+
+		if (pte_val(*pte))
+			continue;
+
+		/* Nothing usable to map here: map the shared zero page */
+		if (!(address & (~PAGE_MASK)) &&
+		    (address + PAGE_SIZE <= end) &&
+		    !is_memory_any_valid(address, address + PAGE_SIZE)) {
+			unsigned long phys_page;
+
+			phys_page = get_res_page();
+			entry = _PAGE_NX | _KERNPG_TABLE | _PAGE_GLOBAL |
+				phys_page;
+			entry &= __supported_pte_mask;
+			set_pte(pte, __pte(entry));
+			continue;
+		}
+
+		if (exec)
+			entry = _PAGE_NX|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+		else
+			entry = _KERNPG_TABLE|_PAGE_GLOBAL|address;
+		entry &= __supported_pte_mask;
+		set_pte(pte, __pte(entry));
+	}
+}
+
+static void __init phys_pmd_prune(pmd_t *pmd_page, unsigned long address,
+	unsigned long end, unsigned long vaddr, unsigned int exec)
+{
+	int i = pmd_index(vaddr);
+
+	for (; i < PTRS_PER_PMD; i++,
+	     address = (address & PMD_MASK) + PMD_SIZE,
+	     vaddr = (vaddr & PMD_MASK) + PMD_SIZE) {
+		pmd_t *pmd = pmd_page + i;
+		pte_t *pte;
+		unsigned long pte_phys;
+
+		if (address >= end)
+			break;
+
+		if (!pmd_val(*pmd))
+			continue;
+
+		/* Nothing usable in this 2M region: map the zero page */
+		if (!(address & (~PMD_MASK)) &&
+		    (address + PMD_SIZE <= end) &&
+		    !is_memory_any_valid(address, address + PMD_SIZE)) {
+			pte_phys = get_res_ptepage();
+			set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
+			continue;
+		}
+
+		/* Fully usable: keep the 2M mapping set up earlier */
+		if (is_memory_all_valid(address, address + PMD_SIZE))
+			continue;
+
+		/* Partially usable: split into 4k pages and prune those */
+		pte = alloc_low_page(&pte_phys);
+		phys_pte_prune(pte, address, address + PMD_SIZE, vaddr, exec);
+		set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
+		unmap_low_page(pte);
+	}
+}
+
+static void __init phys_pud_prune(pud_t *pud_page, unsigned long addr,
+	unsigned long end, unsigned long vaddr, unsigned int exec)
+{
+	int i = pud_index(vaddr);
+
+	for (; i < PTRS_PER_PUD; i++,
+	     addr = (addr & PUD_MASK) + PUD_SIZE,
+	     vaddr = (vaddr & PUD_MASK) + PUD_SIZE) {
+		pud_t *pud = pud_page + i;
+
+		if (addr >= end)
+			break;
+
+		if (pud_val(*pud)) {
+			pmd_t *pmd = pmd_offset(pud, 0);
+			phys_pmd_prune(pmd, addr, end, vaddr, exec);
+		}
+	}
+}
+
+void __init prune_reserved_region_maps(void)
+{
+	unsigned long start, end, next;
+
+	/* Prune the identity map of physical memory */
+	start = (unsigned long)__va(0);
+	end = max_addr;
+	for (; start < end; start = next) {
+		pgd_t *pgd = pgd_offset_k(start);
+		pud_t *pud;
+
+		pud = pud_offset(pgd, start & PGDIR_MASK);
+
+		next = start + PGDIR_SIZE;
+		if (next > end)
+			next = end;
+
+		phys_pud_prune(pud, __pa(start), __pa(next), start, 0);
+	}
+
+	/* Prune the kernel text region */
+	start = (unsigned long)KERNEL_TEXT_START;
+	end = start + (unsigned long)KERNEL_TEXT_SIZE;
+	for (; start < end; start = next) {
+		pgd_t *pgd = pgd_offset_k(start);
+		pud_t *pud;
+
+		pud = pud_offset(pgd, start & PGDIR_MASK);
+
+		next = (start & PGDIR_MASK) + (unsigned long)PGDIR_SIZE;
+		if (!next || next > end)
+			next = end;
+
+		phys_pud_prune(pud,
+			       start - (unsigned long)KERNEL_TEXT_START,
+			       next - (unsigned long)KERNEL_TEXT_START,
+			       start, 1);
+	}
+
+	__flush_tlb();
+}
+
 void __init mem_init(void)
 {
 	long codesize, reservedpages, datasize, initsize;
@@ -538,6 +720,8 @@
 		reservedpages << (PAGE_SHIFT-10),
 		datasize >> 10,
 		initsize >> 10);
+
+	prune_reserved_region_maps();
 }
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
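Per 2MB region, phys_pmd_prune() above makes a three-way decision. The toy
program below spells out that rule; it is ours, and the enum, predicates and
address layout are invented stand-ins for is_memory_any_valid() and
is_memory_all_valid():

#include <stdio.h>

enum region_kind {
	REGION_ALL_VALID,	/* keep the existing 2MB mapping */
	REGION_PARTIAL,		/* split into 4k pages and prune each */
	REGION_ALL_RESERVED	/* point the PMD at the shared zero-page PTEs */
};

/* toy stand-ins: pretend usable RAM is [0, 0x340000), reserved above */
static int any_valid(unsigned long s, unsigned long e) { (void)e; return s < 0x340000; }
static int all_valid(unsigned long s, unsigned long e) { (void)s; return e <= 0x340000; }

static enum region_kind classify(unsigned long addr, unsigned long size)
{
	if (!any_valid(addr, addr + size))
		return REGION_ALL_RESERVED;
	if (all_valid(addr, addr + size))
		return REGION_ALL_VALID;
	return REGION_PARTIAL;
}

int main(void)
{
	/* the 2MB regions at 0, 2MB and 4MB */
	printf("%d %d %d\n",
	       classify(0x000000, 0x200000),	/* 0: ALL_VALID */
	       classify(0x200000, 0x200000),	/* 1: PARTIAL */
	       classify(0x400000, 0x200000));	/* 2: ALL_RESERVED */
	return 0;
}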
Index: linux-2.6.git/arch/x86/mm/ioremap_64.c
===================================================================
--- linux-2.6.git.orig/arch/x86/mm/ioremap_64.c	2008-01-08 03:41:30.000000000 -0800
+++ linux-2.6.git/arch/x86/mm/ioremap_64.c	2008-01-08 03:59:28.000000000 -0800
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 
 unsigned long __phys_addr(unsigned long x)
 {
@@ -28,9 +29,6 @@
 }
 EXPORT_SYMBOL(__phys_addr);
 
-#define ISA_START_ADDRESS	0xa0000
-#define ISA_END_ADDRESS		0x100000
-
 /*
  * Fix up the linear direct mapping of the kernel to avoid cache attribute
  * conflicts.
Index: linux-2.6.git/arch/x86/mm/pageattr_64.c
===================================================================
--- linux-2.6.git.orig/arch/x86/mm/pageattr_64.c	2008-01-08 03:41:30.000000000 -0800
+++ linux-2.6.git/arch/x86/mm/pageattr_64.c	2008-01-08 04:03:33.000000000 -0800
@@ -53,9 +53,11 @@
 	/*
 	 * page_private is used to track the number of entries in
 	 * the page table page that have non-standard attributes.
+	 * A count of 1 means the page was split by split_large_page();
+	 * each additional count is one page with non-standard attributes.
 	 */
 	SetPagePrivate(base);
-	page_private(base) = 0;
+	page_private(base) = 1;
 
 	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK;
@@ -176,11 +178,8 @@
 			BUG();
 	}
 
-	/* on x86-64 the direct mapping set at boot is not using 4k pages */
-	BUG_ON(PageReserved(kpte_page));
-
 	save_page(kpte_page);
-	if (page_private(kpte_page) == 0)
+	if (page_private(kpte_page) == 1)
 		revert_page(address, ref_prot);
 	return 0;
 }
Index: linux-2.6.git/include/asm-x86/e820_64.h
===================================================================
--- linux-2.6.git.orig/include/asm-x86/e820_64.h	2008-01-08 03:41:30.000000000 -0800
+++ linux-2.6.git/include/asm-x86/e820_64.h	2008-01-08 03:59:28.000000000 -0800
@@ -26,6 +26,10 @@
 extern void e820_mark_nosave_regions(void);
 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
+extern int e820_any_non_reserved(unsigned long start, unsigned long end);
+extern int is_memory_any_valid(unsigned long start, unsigned long end);
+extern int e820_all_non_reserved(unsigned long start, unsigned long end);
+extern int is_memory_all_valid(unsigned long start, unsigned long end);
 extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
 extern void e820_setup_gap(void);
 
@@ -38,6 +42,10 @@
 extern unsigned ebda_addr, ebda_size;
 extern unsigned long nodemap_addr, nodemap_size;
 
+#define ISA_START_ADDRESS	0xa0000
+#define ISA_END_ADDRESS		0x100000
+
 #endif /*!__ASSEMBLY__*/
 
 #endif /*__E820_HEADER*/
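As a footnote on the pageattr_64.c change: the new page_private accounting
can be modelled in a few lines. The function names below are invented and
this is not kernel code; only the counting rule comes from the hunk above.
A split PTE page starts at a count of 1, each page with non-standard
attributes adds one, and the 2MB mapping is restored once only the split
itself (count 1) remains:

#include <stdio.h>

static long page_private;	/* models page_private(kpte_page) */

static void split_large_page_model(void)	{ page_private = 1; }
static void set_nonstd_attr(void)		{ page_private++; }
static void revert_attr(void)			{ page_private--; }

static void change_page_attr_done(void)
{
	/* the patched check: only the split itself remains */
	if (page_private == 1)
		printf("reverting to a 2MB mapping\n");
}

int main(void)
{
	split_large_page_model();	/* first non-standard attribute splits */
	set_nonstd_attr();		/* e.g. one page marked uncached */
	revert_attr();			/* attribute restored */
	change_page_attr_done();
	return 0;
}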