lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20130403024344.GA4384@sgi.com>
Date:	Tue, 2 Apr 2013 21:43:44 -0500
From:	Robin Holt <holt@....com>
To:	Michal Hocko <mhocko@...e.cz>, Cliff Wickman <cpw@....com>
Cc:	linux-kernel@...r.kernel.org, linux-mm@...ck.org, x86@...nel.org,
	wli@...omorphy.com
Subject: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2

Reserving a large number of 1GB hugetlbfs pages at boot takes a very
long time due to the pages being memset to 0 during the reservation.
This is unneeded as the pages will be zeroed by clear_huge_page() when
being allocated by the user.

Large system sites would at times like to allocate a very large amount
of memory as 1GB pages.  They would put this on the kernel boot line:
   default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
 up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]

Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:
      start_kernel
        kernel_init
          do_pre_smp_initcalls
            hugetlb_init
              hugetlb_init_hstates
                hugetlb_hstate_alloc_pages

Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
on large numa systems).  This estimate is approximate (it depends on
core frequency & number of hops to remote memory) but should be within
a factor of 2 on most systems.  A benchmark attempting to reserve a TB
for 1GB pages would thus require ~1000 seconds of boot time just for
this allocation.  32TB would take 8 hours.

Signed-off-by: Robin Holt <holt@....com>
To: Cliff Wickman <cpw@....com>
To: Michal Hocko <mhocko@...e.cz>
Cc: lkml <linux-kernel@...r.kernel.org>
Cc: Linux mm <linux-mm@...ck.org>
Cc: x86 Maintainers <x86@...nel.org>
---

Changes since -v1
 - Reworked to remove the special NO_ZERO flag and push that down further
   in the call chain.

Note: I compiled this only with a .config which specified
CONFIG_NO_BOOTMEM (x86_64).  I have not tried a config which uses a
bootmem allocator.

 include/linux/bootmem.h |  8 +++++++-
 mm/bootmem.c            | 21 +++++++++++++++++----
 mm/hugetlb.c            |  2 +-
 mm/nobootmem.c          | 37 +++++++++++++++++++++++++++----------
 mm/sparse.c             |  2 +-
 5 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index cdc3bab..04563fc 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -92,11 +92,17 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal);
+extern void *__alloc_bootmem_node_nopanic_notzeroed(
+				  pg_data_t *pgdat,
+				  unsigned long size,
+				  unsigned long align,
+				  unsigned long goal);
 void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal,
-				  unsigned long limit);
+				  unsigned long limit,
+				  int zeroed);
 extern void *__alloc_bootmem_low(unsigned long size,
 				 unsigned long align,
 				 unsigned long goal);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb0..b2e4027 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 
 void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				unsigned long size, unsigned long align,
-				unsigned long goal, unsigned long limit)
+				unsigned long goal, unsigned long limit,
+				int zeroed)
 {
 	void *ptr;
 
 	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc(size, GFP_NOWAIT);
+		if (zeroed)
+			return kzalloc(size, GFP_NOWAIT);
+		else
+			return kmalloc(size, GFP_NOWAIT);
 again:
 
 	/* do not panic in alloc_bootmem_bdata() */
@@ -733,13 +737,22 @@ again:
 	return NULL;
 }
 
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -748,7 +761,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 	if (ptr)
 		return ptr;
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ca9a7c6..7683f6a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 	while (nr_nodes) {
 		void *addr;
 
-		addr = __alloc_bootmem_node_nopanic(
+		addr = __alloc_bootmem_node_nopanic_notzeroed(
 				NODE_DATA(hstate_next_node_to_alloc(h,
 						&node_states[N_MEMORY])),
 				huge_page_size(h), huge_page_size(h), 0);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36..342511b 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -32,8 +32,8 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
-					u64 goal, u64 limit)
+static void * __init ___alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit, int zeroed)
 {
 	void *ptr;
 	u64 addr;
@@ -46,7 +46,8 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 		return NULL;
 
 	ptr = phys_to_virt(addr);
-	memset(ptr, 0, size);
+	if (zeroed)
+		memset(ptr, 0, size);
 	memblock_reserve(addr, size);
 	/*
 	 * The min_count is set to 0 so that bootmem allocated blocks
@@ -56,6 +57,12 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 	return ptr;
 }
 
+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit)
+{
+	return ___alloc_memory_core_early(nid, size, align, goal, limit, 1);
+}
+
 /*
  * free_bootmem_late - free bootmem pages directly to page allocator
  * @addr: starting address of the range
@@ -291,18 +298,19 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 						   unsigned long size,
 						   unsigned long align,
 						   unsigned long goal,
-						   unsigned long limit)
+						   unsigned long limit,
+						   int zeroed)
 {
 	void *ptr;
 
 again:
-	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-					goal, limit);
+	ptr = ___alloc_memory_core_early(pgdat->node_id, size, align,
+					goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
-	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-					goal, limit);
+	ptr = ___alloc_memory_core_early(MAX_NUMNODES, size, align,
+					goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
@@ -314,13 +322,22 @@ again:
 	return NULL;
 }
 
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -329,7 +346,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 1);
 	if (ptr)
 		return ptr;
 
diff --git a/mm/sparse.c b/mm/sparse.c
index 7ca6dc8..8a1c5ad 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
 	p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
-					  SMP_CACHE_BYTES, goal, limit);
+					  SMP_CACHE_BYTES, goal, limit, 1);
 	if (!p && limit) {
 		limit = 0;
 		goto again;
-- 
1.8.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ