linux-kernel - [PATCH 13/13] xen: allow more than 512 GB of RAM for 64 bit pv-domains

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1424242326-26611-14-git-send-email-jgross@suse.com>
Date:	Wed, 18 Feb 2015 07:52:06 +0100
From:	Juergen Gross <jgross@...e.com>
To:	linux-kernel@...r.kernel.org, xen-devel@...ts.xensource.com,
	konrad.wilk@...cle.com, david.vrabel@...rix.com,
	boris.ostrovsky@...cle.com
Cc:	Juergen Gross <jgross@...e.com>
Subject: [PATCH 13/13] xen: allow more than 512 GB of RAM for 64 bit pv-domains

64 bit pv-domains under Xen are limited to 512 GB of RAM today. The
main reason has been the 3 level p2m tree, which was replaced by the
virtual mapped linear p2m list. Parallel to the p2m list which is
being used by the kernel itself there is a 3 level mfn tree for usage
by the Xen tools and eventually for crash dump analysis. For this tree
the linear p2m list can serve as a replacement, too. As the kernel
can't know whether the tools are capable of dealing with the p2m list
instead of the mfn tree, the limit of 512 GB can't be dropped in all
cases.

This patch replaces the hard limit by a kernel parameter which tells
the kernel to obey the 512 GB limit or not. The default is selected by
a configuration parameter which specifies whether the 512 GB limit
should be active per default for dom0 (only crash dump analysis is
affected) and/or for domUs (additionally domain save/restore/migration
are affected).

Memory above the domain limit is returned to the hypervisor instead of
being identity mapped, which was wrong anyways.

The kernel configuration parameter to specify the maximum size of a
domain can be deleted, as it is not relevant any more.

Signed-off-by: Juergen Gross <jgross@...e.com>
---
 Documentation/kernel-parameters.txt |  7 ++++
 arch/x86/include/asm/xen/page.h     |  4 ---
 arch/x86/xen/Kconfig                | 31 +++++++++++-----
 arch/x86/xen/p2m.c                  | 10 +++---
 arch/x86/xen/setup.c                | 72 ++++++++++++++++++++++++++++++-------
 5 files changed, 93 insertions(+), 31 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a89e326..7bf6342 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3959,6 +3959,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			plus one apbt timer for broadcast timer.
 			x86_intel_mid_timer=apbt_only | lapic_and_apbt
 
+	xen_512gb_limit		[KNL,X86-64,XEN]
+			Restricts the kernel running paravirtualized under Xen
+			to use only up to 512 GB of RAM. The reason to do so is
+			crash analysis tools and Xen tools for doing domain
+			save/restore/migration must be enabled to handle larger
+			domains.
+
 	xen_emul_unplug=		[HW,X86,XEN]
 			Unplug Xen emulated devices
 			Format: [unplug0,][unplug1]
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 358dcd3..18a11f2 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -35,10 +35,6 @@ typedef struct xpaddr {
 #define FOREIGN_FRAME(m)	((m) | FOREIGN_FRAME_BIT)
 #define IDENTITY_FRAME(m)	((m) | IDENTITY_FRAME_BIT)
 
-/* Maximum amount of memory we can handle in a domain in pages */
-#define MAX_DOMAIN_PAGES						\
-    ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
-
 extern unsigned long *machine_to_phys_mapping;
 extern unsigned long  machine_to_phys_nr;
 extern unsigned long *xen_p2m_addr;
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index e88fda8..b61a15e 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -23,14 +23,29 @@ config XEN_PVHVM
 	def_bool y
 	depends on XEN && PCI && X86_LOCAL_APIC
 
-config XEN_MAX_DOMAIN_MEMORY
-       int
-       default 500 if X86_64
-       default 64 if X86_32
-       depends on XEN
-       help
-         This only affects the sizing of some bss arrays, the unused
-         portions of which are freed.
+if X86_64
+choice
+	prompt "Support pv-domains larger than 512GB"
+	default XEN_512GB_NONE
+	help
+	  Support paravirtualized domains with more than 512GB of RAM.
+
+	  The Xen tools and crash dump analysis tools might not support
+	  pv-domains with more than 512 GB of RAM. This option controls the
+	  default setting of the kernel to use only up to 512 GB or more.
+	  It is always possible to change the default via specifying the
+	  boot parameter "xen_512gb_limit".
+
+	config XEN_512GB_NONE
+		bool "neither dom0 nor domUs can be larger than 512GB"
+	config XEN_512GB_DOM0
+		bool "dom0 can be larger than 512GB, domUs not"
+	config XEN_512GB_DOMU
+		bool "domUs can be larger than 512GB, dom0 not"
+	config XEN_512GB_ALL
+		bool "dom0 and domUs can be larger than 512GB"
+endchoice
+endif
 
 config XEN_SAVE_RESTORE
        bool
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index df73cc5..12a1e98 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -502,7 +502,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
  */
 static bool alloc_p2m(unsigned long pfn)
 {
-	unsigned topidx, mididx;
+	unsigned topidx;
 	unsigned long *top_mfn_p, *mid_mfn;
 	pte_t *ptep, *pte_pg;
 	unsigned int level;
@@ -510,9 +510,6 @@ static bool alloc_p2m(unsigned long pfn)
 	unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
 	unsigned long p2m_pfn;
 
-	topidx = p2m_top_index(pfn);
-	mididx = p2m_mid_index(pfn);
-
 	ptep = lookup_address(addr, &level);
 	BUG_ON(!ptep || level != PG_LEVEL_4K);
 	pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -524,7 +521,8 @@ static bool alloc_p2m(unsigned long pfn)
 			return false;
 	}
 
-	if (p2m_top_mfn) {
+	if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
+		topidx = p2m_top_index(pfn);
 		top_mfn_p = &p2m_top_mfn[topidx];
 		mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
 
@@ -579,7 +577,7 @@ static bool alloc_p2m(unsigned long pfn)
 				pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
 			HYPERVISOR_shared_info->arch.p2m_generation++;
 			if (mid_mfn)
-				mid_mfn[mididx] = virt_to_mfn(p2m);
+				mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
 			p2m = NULL;
 		}
 
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 84a6473..16d94de 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -32,6 +32,8 @@
 #include "p2m.h"
 #include "mmu.h"
 
+#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
+
 /* Amount of extra memory space we add to the e820 ranges */
 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 
@@ -85,6 +87,27 @@ static struct {
  */
 #define EXTRA_MEM_RATIO		(10)
 
+static bool xen_dom0_512gb_limit __initdata =
+	IS_ENABLED(CONFIG_XEN_512GB_NONE) || IS_ENABLED(CONFIG_XEN_512GB_DOMU);
+static bool xen_domu_512gb_limit __initdata =
+	IS_ENABLED(CONFIG_XEN_512GB_NONE) || IS_ENABLED(CONFIG_XEN_512GB_DOM0);
+
+static int __init xen_parse_512gb(char *arg)
+{
+	bool val = false;
+
+	if (!arg)
+		val = true;
+	else if (strtobool(arg, &val))
+		return 1;
+
+	xen_dom0_512gb_limit = val;
+	xen_domu_512gb_limit = val;
+
+	return 0;
+}
+early_param("xen_512gb_limit", xen_parse_512gb);
+
 static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
 {
 	int i;
@@ -242,14 +265,13 @@ static int __init xen_free_mfn(unsigned long mfn)
 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
 				unsigned long end_pfn, unsigned long *released)
 {
-	unsigned long pfn, end;
+	unsigned long pfn;
 	int ret;
 
 	WARN_ON(start_pfn > end_pfn);
 
 	/* Release pages first. */
-	end = min(end_pfn, xen_max_pfn);
-	for (pfn = start_pfn; pfn < end; pfn++) {
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		unsigned long mfn = pfn_to_mfn(pfn);
 
 		/* Make sure pfn exists to start with */
@@ -390,8 +412,9 @@ static void __init xen_set_identity_and_remap_chunk(unsigned long start_pfn,
 
 		/* Do not remap pages beyond the current allocation */
 		if (cur_pfn >= xen_max_pfn) {
-			/* Identity map remaining pages */
-			set_phys_range_identity(cur_pfn, cur_pfn + size);
+			/* Release remaining pages */
+			xen_set_identity_and_release_chunk(cur_pfn,
+				cur_pfn + size, released);
 			break;
 		}
 		if (cur_pfn + size > xen_max_pfn)
@@ -612,12 +635,34 @@ void __init xen_remap_memory(void)
 	pr_info("Remapped %ld page(s)\n", remapped);
 }
 
+static unsigned long __init xen_get_pages_limit(void)
+{
+	unsigned long limit;
+
+#ifdef CONFIG_X86_32
+	limit = GB(64) / PAGE_SIZE;
+#else
+	limit = ~0ul;
+	if (xen_initial_domain()) {
+		if (xen_dom0_512gb_limit)
+			limit = GB(512) / PAGE_SIZE;
+	} else {
+		if (xen_domu_512gb_limit)
+			limit = GB(512) / PAGE_SIZE;
+	}
+#endif
+	return limit;
+}
+
 static unsigned long __init xen_get_max_pages(void)
 {
-	unsigned long max_pages = MAX_DOMAIN_PAGES;
+	unsigned long max_pages, limit;
 	domid_t domid = DOMID_SELF;
 	int ret;
 
+	limit = xen_get_pages_limit();
+	max_pages = limit;
+
 	/*
 	 * For the initial domain we use the maximum reservation as
 	 * the maximum page.
@@ -633,7 +678,7 @@ static unsigned long __init xen_get_max_pages(void)
 			max_pages = ret;
 	}
 
-	return min(max_pages, MAX_DOMAIN_PAGES);
+	return min(max_pages, limit);
 }
 
 static void __init xen_align_and_add_e820_region(phys_addr_t start,
@@ -871,7 +916,8 @@ char * __init xen_memory_setup(void)
 
 	xen_reserve_xen_mfnlist();
 
-	xen_max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+	xen_max_pfn = xen_get_pages_limit();
+	xen_max_pfn = min(xen_max_pfn, xen_start_info->nr_pages);
 	mem_end = PFN_PHYS(xen_max_pfn);
 
 	memmap.nr_entries = E820MAX;
@@ -933,12 +979,15 @@ char * __init xen_memory_setup(void)
 	 * is limited to the max size of lowmem, so that it doesn't
 	 * get completely filled.
 	 *
+	 * Make sure we have no memory above max_pages, as this area
+	 * isn't handled by the p2m management.
+	 *
 	 * In principle there could be a problem in lowmem systems if
 	 * the initial memory is also very large with respect to
 	 * lowmem, but we won't try to deal with that here.
 	 */
-	extra_pages = min(EXTRA_MEM_RATIO * min(xen_max_pfn, PFN_DOWN(MAXMEM)),
-			  extra_pages);
+	extra_pages = min3(EXTRA_MEM_RATIO * min(xen_max_pfn, PFN_DOWN(MAXMEM)),
+			   extra_pages, max_pages - xen_max_pfn);
 	i = 0;
 	while (i < xen_e820_map_entries) {
 		phys_addr_t addr = xen_e820_map[i].addr;
@@ -968,9 +1017,6 @@ char * __init xen_memory_setup(void)
 	/*
 	 * Set the rest as identity mapped, in case PCI BARs are
 	 * located here.
-	 *
-	 * PFNs above MAX_P2M_PFN are considered identity mapped as
-	 * well.
 	 */
 	set_phys_range_identity(xen_e820_map[i - 1].addr / PAGE_SIZE, ~0ul);
 
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/