Message-Id: <20191230093828.24613-1-kirill.shutemov@linux.intel.com>
Date:   Mon, 30 Dec 2019 12:38:28 +0300
From:   "Kirill A. Shutemov" <kirill@...temov.name>
To:     Andrew Morton <akpm@...ux-foundation.org>
Cc:     Dan Williams <dan.j.williams@...el.com>,
        Michal Hocko <mhocko@...e.com>,
        Vlastimil Babka <vbabka@...e.cz>, Mel Gorman <mgorman@...e.de>,
        "Jin, Zhi" <zhi.jin@...el.com>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org,
        "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
Subject: [PATCH] mm/page_alloc: Skip non present sections on zone initialization

memmap_init_zone() can be called on ranges with holes during boot. It
skips any non-valid PFNs one by one. This works fine as long as the
holes are not too big.

But huge holes in the memory map cause a problem: it takes over 20
seconds to walk a 32 TiB hole. With 5-level paging, x86-64 allows much
larger holes in the memory map, which would practically hang the
system.

Deferred struct page init doesn't help here: it only works on present
ranges.

Skipping non-present sections would fix the issue.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
---
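
For scale, an illustrative calculation (not part of the patch; it
assumes 4 KiB pages and x86-64's 128 MiB sparsemem sections, i.e.
SECTION_SIZE_BITS == 27):

#include <stdio.h>

/* Iterations of the per-PFN walk over a 32 TiB hole, versus the
 * number of sections covering the same hole.
 */
int main(void)
{
	unsigned long long hole = 32ULL << 40;		/* 32 TiB */
	unsigned long long pfns = hole >> 12;		/* 4 KiB pages */
	unsigned long long sections = hole >> 27;	/* 128 MiB sections */

	printf("PFNs tested one-by-one: %llu\n", pfns);		/* 8589934592 */
	printf("sections to skip instead: %llu\n", sections);	/* 262144 */
	return 0;
}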

The situation can be emulated using the following QEMU patch:

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index ac08e6360437..f5f2258092e1 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1159,13 +1159,14 @@ void pc_memory_init(PCMachineState *pcms,
     memory_region_add_subregion(system_memory, 0, ram_below_4g);
     e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
     if (x86ms->above_4g_mem_size > 0) {
+        int shift = 45;
         ram_above_4g = g_malloc(sizeof(*ram_above_4g));
         memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
                                  x86ms->below_4g_mem_size,
                                  x86ms->above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
+        memory_region_add_subregion(system_memory, 1ULL << shift,
                                     ram_above_4g);
-        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
+        e820_add_entry(1ULL << shift, x86ms->above_4g_mem_size, E820_RAM);
     }
 
     if (!pcmc->has_reserved_memory &&
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index cde2a16b941a..694c26947bf6 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1928,7 +1928,7 @@ uint64_t cpu_get_tsc(CPUX86State *env);
 /* XXX: This value should match the one returned by CPUID
  * and in exec.c */
 # if defined(TARGET_X86_64)
-# define TCG_PHYS_ADDR_BITS 40
+# define TCG_PHYS_ADDR_BITS 52
 # else
 # define TCG_PHYS_ADDR_BITS 36
 # endif
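
With this applied, booting a TCG guest with any amount of RAM above
4 GiB places that RAM at 1ULL << 45, leaving a ~32 TiB hole in the
e820 map. An illustrative invocation (the flags are an assumption,
adjust to your setup):

	qemu-system-x86_64 -accel tcg -m 8G -kernel bzImage \
		-append "console=ttyS0" -nographic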

---
 mm/page_alloc.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df62a49cd09e..442dc0244bb4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5873,6 +5873,30 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
 	return false;
 }
 
+#ifdef CONFIG_SPARSEMEM
+/* Skip PFNs that belong to non-present sections */
+static inline __meminit unsigned long next_pfn(unsigned long pfn)
+{
+	unsigned long section_nr;
+
+	section_nr = pfn_to_section_nr(++pfn);
+	if (present_section_nr(section_nr))
+		return pfn;
+
+	while (++section_nr <= __highest_present_section_nr) {
+		if (present_section_nr(section_nr))
+			return section_nr_to_pfn(section_nr);
+	}
+
+	return -1;
+}
+#else
+static inline __meminit unsigned long next_pfn(unsigned long pfn)
+{
+	return pfn + 1;
+}
+#endif
+
 /*
  * Initially all pages are reserved - free ones are freed
  * up by memblock_free_all() once the early boot process is
@@ -5912,8 +5936,10 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		 * function.  They do not exist on hotplugged memory.
 		 */
 		if (context == MEMMAP_EARLY) {
-			if (!early_pfn_valid(pfn))
+			if (!early_pfn_valid(pfn)) {
+				pfn = next_pfn(pfn) - 1;
 				continue;
+			}
 			if (!early_pfn_in_nid(pfn, nid))
 				continue;
 			if (overlap_memmap_init(zone, &pfn))
-- 
2.24.1
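
Not part of the patch: a toy, self-contained model of the loop idiom in
the second hunk. next_pfn() first tries the very next PFN and otherwise
scans for the next present section; the caller subtracts one because the
for-loop's pfn++ immediately adds it back. The section size and presence
map below are made-up demo values.

#include <stdbool.h>
#include <stdio.h>

#define SECTION_PFNS	4UL	/* toy section size: 4 PFNs */

static bool section_present[] = { true, false, false, true };
#define NR_SECTIONS	(sizeof(section_present) / sizeof(section_present[0]))

static bool pfn_valid(unsigned long pfn)
{
	return section_present[pfn / SECTION_PFNS];
}

/* -1UL means "no present section left"; it terminates the caller's loop */
static unsigned long next_pfn(unsigned long pfn)
{
	unsigned long sec = ++pfn / SECTION_PFNS;

	if (sec < NR_SECTIONS && section_present[sec])
		return pfn;
	while (++sec < NR_SECTIONS) {
		if (section_present[sec])
			return sec * SECTION_PFNS;
	}
	return -1UL;
}

int main(void)
{
	for (unsigned long pfn = 0; pfn < NR_SECTIONS * SECTION_PFNS; pfn++) {
		if (!pfn_valid(pfn)) {
			pfn = next_pfn(pfn) - 1;	/* loop's pfn++ lands on it */
			continue;
		}
		printf("init pfn %lu\n", pfn);
	}
	return 0;
}

This prints PFNs 0-3, then jumps over the two absent sections straight
to 12-15 without touching PFNs 4-11.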
