[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <5420EFD2.4070801@suse.com>
Date: Tue, 23 Sep 2014 05:58:10 +0200
From: Juergen Gross <jgross@...e.com>
To: linux-kernel@...r.kernel.org, xen-devel@...ts.xensource.com,
konrad.wilk@...cle.com, boris.ostrovsky@...cle.com,
david.vrabel@...rix.com, jbeulich@...e.com
Subject: Re: [PATCH V3] xen: eliminate scalability issues from initial mapping
setup
On 09/17/2014 04:59 PM, Juergen Gross wrote:
> Direct Xen to place the initial P->M table outside of the initial
> mapping, as otherwise the 1G (implementation) / 2G (theoretical)
> restriction on the size of the initial mapping limits the amount
> of memory a domain can be handed initially.
>
> As the initial P->M table is copied rather early during boot to
> domain private memory and its initial virtual mapping is dropped,
> the easiest way to avoid virtual address conflicts with other
> addresses in the kernel is to use a user address area for the
> virtual address of the initial P->M table. This allows us to just
> throw away the page tables of the initial mapping after the copy
> without having to care about address invalidation.
>
> It should be noted that this patch won't enable a pv-domain to USE
> more than 512 GB of RAM. It just enables it to be started with a
> P->M table covering more memory. This is especially important for
> being able to boot a Dom0 on a system with more than 512 GB memory.
>
> Signed-off-by: Juergen Gross <jgross@...e.com>
> Signed-off-by: Jan Beulich <jbeulich@...e.com>
Any Acks/Naks?
Juergen
> ---
> arch/x86/xen/mmu.c | 119 +++++++++++++++++++++++++++++++++++++++++++++---
> arch/x86/xen/setup.c | 65 ++++++++++++++------------
> arch/x86/xen/xen-head.S | 2 +
> 3 files changed, 151 insertions(+), 35 deletions(-)
>
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index 16fb009..3bd403b 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -1198,6 +1198,78 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
> * instead of somewhere later and be confusing. */
> xen_mc_flush();
> }
> +
> +/*
> + * Make a page range writeable and free it.
> + */
> +static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
> +{
> + void *vaddr = __va(paddr);
> + void *vaddr_end = vaddr + size;
> +
> + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
> + make_lowmem_page_readwrite(vaddr);
> +
> + memblock_free(paddr, size);
> +}
> +
> +static void xen_cleanmfnmap_free_pgtbl(void *pgtbl)
> +{
> + unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
> +
> + ClearPagePinned(virt_to_page(__va(pa)));
> + xen_free_ro_pages(pa, PAGE_SIZE);
> +}
> +
> +/*
> + * Since it is well isolated we can (and since it is perhaps large we should)
> + * also free the page tables mapping the initial P->M table.
> + */
> +static void __init xen_cleanmfnmap(unsigned long vaddr)
> +{
> + unsigned long va = vaddr & PMD_MASK;
> + unsigned long pa;
> + pgd_t *pgd = pgd_offset_k(va);
> + pud_t *pud_page = pud_offset(pgd, 0);
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *pte;
> + unsigned int i;
> +
> + set_pgd(pgd, __pgd(0));
> + do {
> + pud = pud_page + pud_index(va);
> + if (pud_none(*pud)) {
> + va += PUD_SIZE;
> + } else if (pud_large(*pud)) {
> + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
> + xen_free_ro_pages(pa, PUD_SIZE);
> + va += PUD_SIZE;
> + } else {
> + pmd = pmd_offset(pud, va);
> + if (pmd_large(*pmd)) {
> + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
> + xen_free_ro_pages(pa, PMD_SIZE);
> + } else if (!pmd_none(*pmd)) {
> + pte = pte_offset_kernel(pmd, va);
> + for (i = 0; i < PTRS_PER_PTE; ++i) {
> + if (pte_none(pte[i]))
> + break;
> + pa = pte_pfn(pte[i]) << PAGE_SHIFT;
> + xen_free_ro_pages(pa, PAGE_SIZE);
> + }
> + xen_cleanmfnmap_free_pgtbl(pte);
> + }
> + va += PMD_SIZE;
> + if (pmd_index(va))
> + continue;
> + xen_cleanmfnmap_free_pgtbl(pmd);
> + }
> +
> + } while (pud_index(va) || pmd_index(va));
> + xen_cleanmfnmap_free_pgtbl(pud_page);
> +}
> +
> static void __init xen_pagetable_p2m_copy(void)
> {
> unsigned long size;
> @@ -1217,18 +1289,23 @@ static void __init xen_pagetable_p2m_copy(void)
> /* using __ka address and sticking INVALID_P2M_ENTRY! */
> memset((void *)xen_start_info->mfn_list, 0xff, size);
>
> - /* We should be in __ka space. */
> - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
> addr = xen_start_info->mfn_list;
> - /* We roundup to the PMD, which means that if anybody at this stage is
> + /* We could be in __ka space.
> + * We roundup to the PMD, which means that if anybody at this stage is
> * using the __ka address of xen_start_info or xen_start_info->shared_info
> * they are in going to crash. Fortunatly we have already revectored
> * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
> size = roundup(size, PMD_SIZE);
> - xen_cleanhighmap(addr, addr + size);
>
> - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
> - memblock_free(__pa(xen_start_info->mfn_list), size);
> + if (addr >= __START_KERNEL_map) {
> + xen_cleanhighmap(addr, addr + size);
> + size = PAGE_ALIGN(xen_start_info->nr_pages *
> + sizeof(unsigned long));
> + memblock_free(__pa(addr), size);
> + } else {
> + xen_cleanmfnmap(addr);
> + }
> +
> /* And revector! Bye bye old array */
> xen_start_info->mfn_list = new_mfn_list;
>
> @@ -1529,6 +1606,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
> #else /* CONFIG_X86_64 */
> static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
> {
> + unsigned long pfn;
> +
> + if (xen_feature(XENFEAT_writable_page_tables) ||
> + xen_feature(XENFEAT_auto_translated_physmap) ||
> + xen_start_info->mfn_list >= __START_KERNEL_map)
> + return pte;
> +
> + /*
> + * Pages belonging to the initial p2m list mapped outside the default
> + * address range must be mapped read-only. This region contains the
> + * page tables for mapping the p2m list, too, and page tables MUST be
> + * mapped read-only.
> + */
> + pfn = pte_pfn(pte);
> + if (pfn >= xen_start_info->first_p2m_pfn &&
> + pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
> + pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
> +
> return pte;
> }
> #endif /* CONFIG_X86_64 */
> @@ -1884,7 +1979,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
> * mappings. Considering that on Xen after the kernel mappings we
> * have the mappings of some pages that don't exist in pfn space, we
> * set max_pfn_mapped to the last real pfn mapped. */
> - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
> + if (xen_start_info->mfn_list < __START_KERNEL_map)
> + max_pfn_mapped = xen_start_info->first_p2m_pfn;
> + else
> + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
>
> pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
> pt_end = pt_base + xen_start_info->nr_pt_frames;
> @@ -1924,6 +2022,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
> /* Graft it onto L4[511][510] */
> copy_page(level2_kernel_pgt, l2);
>
> + /* Copy the initial P->M table mappings if necessary. */
> + i = pgd_index(xen_start_info->mfn_list);
> + if (i && i < pgd_index(__START_KERNEL_map))
> + init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
> +
> if (!xen_feature(XENFEAT_auto_translated_physmap)) {
> /* Make pagetable pieces RO */
> set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
> @@ -1964,6 +2067,8 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
>
> /* Our (by three pages) smaller Xen pagetable that we are using */
> memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
> + /* protect xen_start_info */
> + memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
> /* Revector the xen_start_info */
> xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
> }
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index 2e555163..6412367 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -333,6 +333,41 @@ void xen_ignore_unusable(struct e820entry *list, size_t map_size)
> }
> }
>
> +/*
> + * Reserve Xen mfn_list.
> + * See comment above "struct start_info" in <xen/interface/xen.h>
> + * We tried to make the the memblock_reserve more selective so
> + * that it would be clear what region is reserved. Sadly we ran
> + * in the problem wherein on a 64-bit hypervisor with a 32-bit
> + * initial domain, the pt_base has the cr3 value which is not
> + * neccessarily where the pagetable starts! As Jan put it: "
> + * Actually, the adjustment turns out to be correct: The page
> + * tables for a 32-on-64 dom0 get allocated in the order "first L1",
> + * "first L2", "first L3", so the offset to the page table base is
> + * indeed 2. When reading xen/include/public/xen.h's comment
> + * very strictly, this is not a violation (since there nothing is said
> + * that the first thing in the page table space is pointed to by
> + * pt_base; I admit that this seems to be implied though, namely
> + * do I think that it is implied that the page table space is the
> + * range [pt_base, pt_base + nt_pt_frames), whereas that
> + * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
> + * which - without a priori knowledge - the kernel would have
> + * difficulty to figure out)." - so lets just fall back to the
> + * easy way and reserve the whole region.
> + */
> +static void __init xen_reserve_xen_mfnlist(void)
> +{
> + if (xen_start_info->mfn_list >= __START_KERNEL_map) {
> + memblock_reserve(__pa(xen_start_info->mfn_list),
> + xen_start_info->pt_base -
> + xen_start_info->mfn_list);
> + return;
> + }
> +
> + memblock_reserve(PFN_PHYS(xen_start_info->first_p2m_pfn),
> + PFN_PHYS(xen_start_info->nr_p2m_frames));
> +}
> +
> /**
> * machine_specific_memory_setup - Hook for machine specific memory setup.
> **/
> @@ -467,32 +502,7 @@ char * __init xen_memory_setup(void)
> e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
> E820_RESERVED);
>
> - /*
> - * Reserve Xen bits:
> - * - mfn_list
> - * - xen_start_info
> - * See comment above "struct start_info" in <xen/interface/xen.h>
> - * We tried to make the the memblock_reserve more selective so
> - * that it would be clear what region is reserved. Sadly we ran
> - * in the problem wherein on a 64-bit hypervisor with a 32-bit
> - * initial domain, the pt_base has the cr3 value which is not
> - * neccessarily where the pagetable starts! As Jan put it: "
> - * Actually, the adjustment turns out to be correct: The page
> - * tables for a 32-on-64 dom0 get allocated in the order "first L1",
> - * "first L2", "first L3", so the offset to the page table base is
> - * indeed 2. When reading xen/include/public/xen.h's comment
> - * very strictly, this is not a violation (since there nothing is said
> - * that the first thing in the page table space is pointed to by
> - * pt_base; I admit that this seems to be implied though, namely
> - * do I think that it is implied that the page table space is the
> - * range [pt_base, pt_base + nt_pt_frames), whereas that
> - * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
> - * which - without a priori knowledge - the kernel would have
> - * difficulty to figure out)." - so lets just fall back to the
> - * easy way and reserve the whole region.
> - */
> - memblock_reserve(__pa(xen_start_info->mfn_list),
> - xen_start_info->pt_base - xen_start_info->mfn_list);
> + xen_reserve_xen_mfnlist();
>
> sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
>
> @@ -522,8 +532,7 @@ char * __init xen_auto_xlated_memory_setup(void)
> for (i = 0; i < memmap.nr_entries; i++)
> e820_add_region(map[i].addr, map[i].size, map[i].type);
>
> - memblock_reserve(__pa(xen_start_info->mfn_list),
> - xen_start_info->pt_base - xen_start_info->mfn_list);
> + xen_reserve_xen_mfnlist();
>
> return "Xen";
> }
> diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
> index 46408e5..e7bd668 100644
> --- a/arch/x86/xen/xen-head.S
> +++ b/arch/x86/xen/xen-head.S
> @@ -112,6 +112,8 @@ NEXT_HYPERCALL(arch_6)
> ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
> #else
> ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
> + /* Map the p2m table to a 512GB-aligned user address. */
> + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE)
> #endif
> ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
> ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists