lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Sun, 16 Jun 2019 13:11:23 +0000
From:   Wei Yang <richard.weiyang@...il.com>
To:     Dan Williams <dan.j.williams@...el.com>
Cc:     akpm@...ux-foundation.org, Michal Hocko <mhocko@...e.com>,
        Vlastimil Babka <vbabka@...e.cz>,
        Logan Gunthorpe <logang@...tatee.com>,
        Oscar Salvador <osalvador@...e.de>,
        Pavel Tatashin <pasha.tatashin@...een.com>,
        Benjamin Herrenschmidt <benh@...nel.crashing.org>,
        Paul Mackerras <paulus@...ba.org>,
        Michael Ellerman <mpe@...erman.id.au>, linux-mm@...ck.org,
        linux-nvdimm@...ts.01.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v9 01/12] mm/sparsemem: Introduce struct mem_section_usage

On Wed, Jun 05, 2019 at 02:57:54PM -0700, Dan Williams wrote:
>Towards enabling memory hotplug to track partial population of a
>section, introduce 'struct mem_section_usage'.
>
>A pointer to a 'struct mem_section_usage' instance replaces the existing
>pointer to a 'pageblock_flags' bitmap. Effectively it adds one more
>'unsigned long' beyond the 'pageblock_flags' (usemap) allocation to
>house a new 'subsection_map' bitmap.  The new bitmap enables the memory
>hot{plug,remove} implementation to act on incremental sub-divisions of a
>section.
>
>The default SUBSECTION_SHIFT is chosen to keep the 'subsection_map' no
>larger than a single 'unsigned long' on the major architectures.
>Alternatively an architecture can define ARCH_SUBSECTION_SHIFT to
>override the default PMD_SHIFT. Note that PowerPC needs to use
>ARCH_SUBSECTION_SHIFT to workaround PMD_SHIFT being a non-constant
>expression on PowerPC.
>
>The primary motivation for this functionality is to support platforms
>that mix "System RAM" and "Persistent Memory" within a single section,
>or multiple PMEM ranges with different mapping lifetimes within a single
>section. The section restriction for hotplug has caused an ongoing saga
>of hacks and bugs for devm_memremap_pages() users.
>
>Beyond the fixups to teach existing paths how to retrieve the 'usemap'
>from a section, and updates to usemap allocation path, there are no
>expected behavior changes.
>
>Cc: Michal Hocko <mhocko@...e.com>
>Cc: Vlastimil Babka <vbabka@...e.cz>
>Cc: Logan Gunthorpe <logang@...tatee.com>
>Cc: Oscar Salvador <osalvador@...e.de>
>Cc: Pavel Tatashin <pasha.tatashin@...een.com>
>Cc: Benjamin Herrenschmidt <benh@...nel.crashing.org>
>Cc: Paul Mackerras <paulus@...ba.org>
>Cc: Michael Ellerman <mpe@...erman.id.au>
>Signed-off-by: Dan Williams <dan.j.williams@...el.com>
>---
> arch/powerpc/include/asm/sparsemem.h |    3 +
> include/linux/mmzone.h               |   48 +++++++++++++++++++-
> mm/memory_hotplug.c                  |   18 ++++----
> mm/page_alloc.c                      |    2 -
> mm/sparse.c                          |   81 +++++++++++++++++-----------------
> 5 files changed, 99 insertions(+), 53 deletions(-)
>
>diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
>index 3192d454a733..1aa3c9303bf8 100644
>--- a/arch/powerpc/include/asm/sparsemem.h
>+++ b/arch/powerpc/include/asm/sparsemem.h
>@@ -10,6 +10,9 @@
>  */
> #define SECTION_SIZE_BITS       24
> 
>+/* Reflect the largest possible PMD-size as the subsection-size constant */
>+#define ARCH_SUBSECTION_SHIFT 24
>+
> #endif /* CONFIG_SPARSEMEM */
> 
> #ifdef CONFIG_MEMORY_HOTPLUG
>diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>index 427b79c39b3c..ac163f2f274f 100644
>--- a/include/linux/mmzone.h
>+++ b/include/linux/mmzone.h
>@@ -1161,6 +1161,44 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
> #define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
> #define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)
> 
>+/*
>+ * SUBSECTION_SHIFT must be constant since it is used to declare
>+ * subsection_map and related bitmaps without triggering the generation
>+ * of variable-length arrays. The most natural size for a subsection is
>+ * a PMD-page. For architectures that do not have a constant PMD-size
>+ * ARCH_SUBSECTION_SHIFT can be set to a constant max size, or otherwise
>+ * fallback to 2MB.
>+ */
>+#if defined(ARCH_SUBSECTION_SHIFT)
>+#define SUBSECTION_SHIFT (ARCH_SUBSECTION_SHIFT)
>+#elif defined(PMD_SHIFT)
>+#define SUBSECTION_SHIFT (PMD_SHIFT)
>+#else
>+/*
>+ * Memory hotplug enabled platforms avoid this default because they
>+ * either define ARCH_SUBSECTION_SHIFT, or PMD_SHIFT is a constant, but
>+ * this is kept as a backstop to allow compilation on
>+ * !ARCH_ENABLE_MEMORY_HOTPLUG archs.
>+ */
>+#define SUBSECTION_SHIFT 21
>+#endif
>+
>+#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
>+#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
>+#define PAGE_SUBSECTION_MASK ((~(PAGES_PER_SUBSECTION-1)))

The outermost pair of parentheses here is redundant and could be removed, IMHO.

>+
>+#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
>+#error Subsection size exceeds section size
>+#else
>+#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
>+#endif
>+
>+struct mem_section_usage {
>+	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
>+	/* See declaration of similar field in struct zone */
>+	unsigned long pageblock_flags[0];
>+};
>+
> struct page;
> struct page_ext;
> struct mem_section {
>@@ -1178,8 +1216,7 @@ struct mem_section {
> 	 */
> 	unsigned long section_mem_map;
> 
>-	/* See declaration of similar field in struct zone */
>-	unsigned long *pageblock_flags;
>+	struct mem_section_usage *usage;
> #ifdef CONFIG_PAGE_EXTENSION
> 	/*
> 	 * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
>@@ -1210,6 +1247,11 @@ extern struct mem_section **mem_section;
> extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
> #endif
> 
>+static inline unsigned long *section_to_usemap(struct mem_section *ms)
>+{
>+	return ms->usage->pageblock_flags;

Do we need to consider the case when ms->usage is NULL?

>+}
>+
> static inline struct mem_section *__nr_to_section(unsigned long nr)
> {
> #ifdef CONFIG_SPARSEMEM_EXTREME
>@@ -1221,7 +1263,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
> 	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
> }
> extern int __section_nr(struct mem_section* ms);
>-extern unsigned long usemap_size(void);
>+extern size_t mem_section_usage_size(void);
> 
> /*
>  * We use the lower bits of the mem_map pointer to store
>diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>index a88c5f334e5a..7b963c2d3a0d 100644
>--- a/mm/memory_hotplug.c
>+++ b/mm/memory_hotplug.c
>@@ -166,9 +166,10 @@ void put_page_bootmem(struct page *page)
> #ifndef CONFIG_SPARSEMEM_VMEMMAP
> static void register_page_bootmem_info_section(unsigned long start_pfn)
> {
>-	unsigned long *usemap, mapsize, section_nr, i;
>+	unsigned long mapsize, section_nr, i;
> 	struct mem_section *ms;
> 	struct page *page, *memmap;
>+	struct mem_section_usage *usage;
> 
> 	section_nr = pfn_to_section_nr(start_pfn);
> 	ms = __nr_to_section(section_nr);
>@@ -188,10 +189,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
> 	for (i = 0; i < mapsize; i++, page++)
> 		get_page_bootmem(section_nr, page, SECTION_INFO);
> 
>-	usemap = ms->pageblock_flags;
>-	page = virt_to_page(usemap);
>+	usage = ms->usage;
>+	page = virt_to_page(usage);
> 
>-	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
>+	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
> 
> 	for (i = 0; i < mapsize; i++, page++)
> 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
>@@ -200,9 +201,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
> #else /* CONFIG_SPARSEMEM_VMEMMAP */
> static void register_page_bootmem_info_section(unsigned long start_pfn)
> {
>-	unsigned long *usemap, mapsize, section_nr, i;
>+	unsigned long mapsize, section_nr, i;
> 	struct mem_section *ms;
> 	struct page *page, *memmap;
>+	struct mem_section_usage *usage;
> 
> 	section_nr = pfn_to_section_nr(start_pfn);
> 	ms = __nr_to_section(section_nr);
>@@ -211,10 +213,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
> 
> 	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
> 
>-	usemap = ms->pageblock_flags;
>-	page = virt_to_page(usemap);
>+	usage = ms->usage;
>+	page = virt_to_page(usage);
> 
>-	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
>+	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
> 
> 	for (i = 0; i < mapsize; i++, page++)
> 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
>diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>index c061f66c2d0c..c6d8224d792e 100644
>--- a/mm/page_alloc.c
>+++ b/mm/page_alloc.c
>@@ -404,7 +404,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page,
> 							unsigned long pfn)
> {
> #ifdef CONFIG_SPARSEMEM
>-	return __pfn_to_section(pfn)->pageblock_flags;
>+	return section_to_usemap(__pfn_to_section(pfn));
> #else
> 	return page_zone(page)->pageblock_flags;
> #endif /* CONFIG_SPARSEMEM */
>diff --git a/mm/sparse.c b/mm/sparse.c
>index 1552c855d62a..71da15cc7432 100644
>--- a/mm/sparse.c
>+++ b/mm/sparse.c
>@@ -288,33 +288,31 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
> 
> static void __meminit sparse_init_one_section(struct mem_section *ms,
> 		unsigned long pnum, struct page *mem_map,
>-		unsigned long *pageblock_bitmap)
>+		struct mem_section_usage *usage)
> {
> 	ms->section_mem_map &= ~SECTION_MAP_MASK;
> 	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
> 							SECTION_HAS_MEM_MAP;
>- 	ms->pageblock_flags = pageblock_bitmap;
>+	ms->usage = usage;
> }
> 
>-unsigned long usemap_size(void)
>+static unsigned long usemap_size(void)
> {
> 	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
> }
> 
>-#ifdef CONFIG_MEMORY_HOTPLUG
>-static unsigned long *__kmalloc_section_usemap(void)
>+size_t mem_section_usage_size(void)
> {
>-	return kmalloc(usemap_size(), GFP_KERNEL);
>+	return sizeof(struct mem_section_usage) + usemap_size();
> }
>-#endif /* CONFIG_MEMORY_HOTPLUG */
> 
> #ifdef CONFIG_MEMORY_HOTREMOVE
>-static unsigned long * __init
>+static struct mem_section_usage * __init
> sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> 					 unsigned long size)
> {
>+	struct mem_section_usage *usage;
> 	unsigned long goal, limit;
>-	unsigned long *p;
> 	int nid;
> 	/*
> 	 * A page may contain usemaps for other sections preventing the
>@@ -330,15 +328,16 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> 	limit = goal + (1UL << PA_SECTION_SHIFT);
> 	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> again:
>-	p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
>-	if (!p && limit) {
>+	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
>+	if (!usage && limit) {
> 		limit = 0;
> 		goto again;
> 	}
>-	return p;
>+	return usage;
> }
> 
>-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
>+static void __init check_usemap_section_nr(int nid,
>+		struct mem_section_usage *usage)
> {
> 	unsigned long usemap_snr, pgdat_snr;
> 	static unsigned long old_usemap_snr;
>@@ -352,7 +351,7 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
> 		old_pgdat_snr = NR_MEM_SECTIONS;
> 	}
> 
>-	usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
>+	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
> 	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
> 	if (usemap_snr == pgdat_snr)
> 		return;
>@@ -380,14 +379,15 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
> 		usemap_snr, pgdat_snr, nid);
> }
> #else
>-static unsigned long * __init
>+static struct mem_section_usage * __init
> sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
> 					 unsigned long size)
> {
> 	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
> }
> 
>-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
>+static void __init check_usemap_section_nr(int nid,
>+		struct mem_section_usage *usage)
> {
> }
> #endif /* CONFIG_MEMORY_HOTREMOVE */
>@@ -474,14 +474,13 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
> 				   unsigned long pnum_end,
> 				   unsigned long map_count)
> {
>-	unsigned long pnum, usemap_longs, *usemap;
>+	struct mem_section_usage *usage;
>+	unsigned long pnum;
> 	struct page *map;
> 
>-	usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
>-	usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
>-							  usemap_size() *
>-							  map_count);
>-	if (!usemap) {
>+	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
>+			mem_section_usage_size() * map_count);
>+	if (!usage) {
> 		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
> 		goto failed;
> 	}
>@@ -497,9 +496,9 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
> 			pnum_begin = pnum;
> 			goto failed;
> 		}
>-		check_usemap_section_nr(nid, usemap);
>-		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
>-		usemap += usemap_longs;
>+		check_usemap_section_nr(nid, usage);
>+		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage);
>+		usage = (void *) usage + mem_section_usage_size();
> 	}
> 	sparse_buffer_fini();
> 	return;
>@@ -697,9 +696,9 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
> 				     struct vmem_altmap *altmap)
> {
> 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
>+	struct mem_section_usage *usage;
> 	struct mem_section *ms;
> 	struct page *memmap;
>-	unsigned long *usemap;
> 	int ret;
> 
> 	/*
>@@ -713,8 +712,8 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
> 	memmap = kmalloc_section_memmap(section_nr, nid, altmap);
> 	if (!memmap)
> 		return -ENOMEM;
>-	usemap = __kmalloc_section_usemap();
>-	if (!usemap) {
>+	usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
>+	if (!usage) {
> 		__kfree_section_memmap(memmap, altmap);
> 		return -ENOMEM;
> 	}
>@@ -732,11 +731,11 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
> 	page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
> 
> 	section_mark_present(ms);
>-	sparse_init_one_section(ms, section_nr, memmap, usemap);
>+	sparse_init_one_section(ms, section_nr, memmap, usage);
> 
> out:
> 	if (ret < 0) {
>-		kfree(usemap);
>+		kfree(usage);
> 		__kfree_section_memmap(memmap, altmap);
> 	}
> 	return ret;
>@@ -772,20 +771,20 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
> }
> #endif
> 
>-static void free_section_usemap(struct page *memmap, unsigned long *usemap,
>-		struct vmem_altmap *altmap)
>+static void free_section_usage(struct page *memmap,
>+		struct mem_section_usage *usage, struct vmem_altmap *altmap)
> {
>-	struct page *usemap_page;
>+	struct page *usage_page;
> 
>-	if (!usemap)
>+	if (!usage)
> 		return;
> 
>-	usemap_page = virt_to_page(usemap);
>+	usage_page = virt_to_page(usage);
> 	/*
> 	 * Check to see if allocation came from hot-plug-add
> 	 */
>-	if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
>-		kfree(usemap);
>+	if (PageSlab(usage_page) || PageCompound(usage_page)) {
>+		kfree(usage);
> 		if (memmap)
> 			__kfree_section_memmap(memmap, altmap);
> 		return;
>@@ -804,18 +803,18 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
> 			       struct vmem_altmap *altmap)
> {
> 	struct page *memmap = NULL;
>-	unsigned long *usemap = NULL;
>+	struct mem_section_usage *usage = NULL;
> 
> 	if (ms->section_mem_map) {
>-		usemap = ms->pageblock_flags;
>+		usage = ms->usage;
> 		memmap = sparse_decode_mem_map(ms->section_mem_map,
> 						__section_nr(ms));
> 		ms->section_mem_map = 0;
>-		ms->pageblock_flags = NULL;
>+		ms->usage = NULL;
> 	}
> 
> 	clear_hwpoisoned_pages(memmap + map_offset,
> 			PAGES_PER_SECTION - map_offset);
>-	free_section_usemap(memmap, usemap, altmap);
>+	free_section_usage(memmap, usage, altmap);
> }
> #endif /* CONFIG_MEMORY_HOTPLUG */

-- 
Wei Yang
Help you, Help me

Powered by blists - more mailing lists