lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <YFtPxH0CT5QZsnR1@dhcp22.suse.cz>
Date:   Wed, 24 Mar 2021 15:42:12 +0100
From:   Michal Hocko <mhocko@...e.com>
To:     Oscar Salvador <osalvador@...e.de>
Cc:     Andrew Morton <akpm@...ux-foundation.org>,
        David Hildenbrand <david@...hat.com>,
        Anshuman Khandual <anshuman.khandual@....com>,
        Vlastimil Babka <vbabka@...e.cz>,
        Pavel Tatashin <pasha.tatashin@...een.com>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH v5 1/5] mm,memory_hotplug: Allocate memmap from the added
 memory range

On Wed 24-03-21 13:03:29, Michal Hocko wrote:
> On Wed 24-03-21 11:12:59, Oscar Salvador wrote:
[...]
> > I kind of understand to be reluctant to use vmemmap_pages terminology here, but
> > unfortunately we need to know about it.
> > We could rename nr_vmemmap_pages to offset_buddy_pages or something like that.
> 
> I am not convinced. It seems you are justr trying to graft the new
> functionality in. But I still believe that {on,off}lining shouldn't care
> about where their vmemmaps come from at all. It should be a
> responsibility of the code which reserves that space to compansate for
> accounting. Otherwise we will end up with a hard to maintain code
> because expectations would be spread at way too many places. Not to
> mention different pfns that the code should care about.

The below is a quick hack on top of this patch to illustrate my
thinking. I have dug out all the vmemmap pieces out of the
{on,off}lining and hooked all the accounting when the space is reserved.
This just compiles without any deeper look so there are likely some
minor problems but I haven't really encountered any major problems or
hacks to introduce into the code. The separation seems to be possible.
The diffstat also looks promising. Am I missing something fundamental in
this?

--- 
 drivers/base/memory.c          |   8 +--
 include/linux/memory_hotplug.h |   6 +-
 mm/memory_hotplug.c            | 151 ++++++++++++++++++++---------------------
 3 files changed, 80 insertions(+), 85 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 5ea2b3fbce02..9697acfe96eb 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -181,15 +181,15 @@ memory_block_action(unsigned long start_section_nr, unsigned long action,
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	int ret;
 
-	start_pfn = section_nr_to_pfn(start_section_nr);
+	start_pfn = section_nr_to_pfn(start_section_nr) + nr_vmemmap_pages;
+	nr_pages -= nr_vmemmap_pages;
 
 	switch (action) {
 	case MEM_ONLINE:
-		ret = online_pages(start_pfn, nr_pages, nr_vmemmap_pages,
-				   online_type, nid);
+		ret = online_pages(start_pfn, nr_pages, online_type, nid);
 		break;
 	case MEM_OFFLINE:
-		ret = offline_pages(start_pfn, nr_pages, nr_vmemmap_pages);
+		ret = offline_pages(start_pfn, nr_pages);
 		break;
 	default:
 		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index a85d4b7d15c2..673d2d4a8443 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -109,8 +109,7 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
 extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
 /* VM interface that may be used by firmware interface */
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
-			unsigned long nr_vmemmap_pages, int online_type,
-			int nid);
+			int online_type, int nid);
 extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
 					 unsigned long end_pfn);
 extern void __offline_isolated_pages(unsigned long start_pfn,
@@ -317,8 +316,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
 #ifdef CONFIG_MEMORY_HOTREMOVE
 
 extern void try_offline_node(int nid);
-extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
-			 unsigned long nr_vmemmap_pages);
+extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern int remove_memory(int nid, u64 start, u64 size);
 extern void __remove_memory(int nid, u64 start, u64 size);
 extern int offline_and_remove_memory(int nid, u64 start, u64 size);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0c3a98cb8cde..754026a9164d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -844,30 +844,19 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
 }
 
 int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
-		       unsigned long nr_vmemmap_pages, int online_type, int nid)
+		       int online_type, int nid)
 {
-	unsigned long flags, buddy_start_pfn, buddy_nr_pages;
+	unsigned long flags;
 	struct zone *zone;
 	int need_zonelists_rebuild = 0;
 	int ret;
 	struct memory_notify arg;
 
-	/* We can only online full sections (e.g., SECTION_IS_ONLINE) */
-	if (WARN_ON_ONCE(!nr_pages ||
-			 !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
-		return -EINVAL;
-
-	buddy_start_pfn = pfn + nr_vmemmap_pages;
-	buddy_nr_pages = nr_pages - nr_vmemmap_pages;
-
 	mem_hotplug_begin();
 
 	/* associate pfn range with the zone */
 	zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
-	if (nr_vmemmap_pages)
-		move_pfn_range_to_zone(zone, pfn, nr_vmemmap_pages, NULL,
-				       MIGRATE_UNMOVABLE);
-	move_pfn_range_to_zone(zone, buddy_start_pfn, buddy_nr_pages, NULL,
+	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL,
 			       MIGRATE_ISOLATE);
 
 	arg.start_pfn = pfn;
@@ -884,7 +873,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	 * onlining, such that undo_isolate_page_range() works correctly.
 	 */
 	spin_lock_irqsave(&zone->lock, flags);
-	zone->nr_isolate_pageblock += buddy_nr_pages / pageblock_nr_pages;
+	zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
 	spin_unlock_irqrestore(&zone->lock, flags);
 
 	/*
@@ -897,7 +886,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 		setup_zone_pageset(zone);
 	}
 
-	online_pages_range(pfn, nr_pages, buddy_start_pfn);
+	online_pages_range(pfn, nr_pages, pfn);
 	zone->present_pages += nr_pages;
 
 	pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -910,9 +899,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	zone_pcp_update(zone);
 
 	/* Basic onlining is complete, allow allocation of onlined pages. */
-	undo_isolate_page_range(buddy_start_pfn,
-				buddy_start_pfn + buddy_nr_pages,
-				MIGRATE_MOVABLE);
+	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
 
 	/*
 	 * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
@@ -1126,6 +1113,59 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
 	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
 }
 
+static void reserve_vmmemmap_space(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+	struct zone *zone = &NODE_DATA(nid)->node_zones[ZONE_NORMAL];
+
+	altmap->free = nr_pages;
+	altmap->base_pfn = pfn;
+
+	/* initialize struct pages and account for this space */
+	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+}
+
+static void unaccount_vmemmap_space(int nid, unsigned long start_pfn, unsigned long nr_pages)
+{
+	struct zone *zone = &NODE_DATA(nid)->node_zones[ZONE_NORMAL];
+	unsigned long flags;
+
+	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+	zone->present_pages -= nr_pages;
+
+	pgdat_resize_lock(zone->zone_pgdat, &flags);
+	zone->zone_pgdat->node_present_pages -= nr_pages;
+	pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
+}
+
+static int remove_memory_block_cb(struct memory_block *mem, void *arg)
+{
+	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+	struct vmem_altmap mhp_altmap = {};
+	struct vmem_altmap *altmap = NULL;
+	u64 start = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
+	u64 size = memory_block_size_bytes();
+
+	if (!mem->nr_vmemmap_pages) {
+		arch_remove_memory(mem->nid, start, size, NULL);
+		return 0;
+	}
+
+	/*
+	 * Let remove_pmd_table->free_hugepage_table
+	 * do the right thing if we used vmem_altmap
+	 * when hot-adding the range.
+	 */
+	mhp_altmap.alloc = nr_vmemmap_pages;
+	altmap = &mhp_altmap;
+
+	unaccount_vmemmap_space(mem->nid, PHYS_PFN(start), nr_vmemmap_pages);
+	arch_remove_memory(mem->nid, start, size, altmap);
+
+	return 0;
+}
+
 /*
  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
  * and online/offline operations (triggered e.g. by sysfs).
@@ -1170,8 +1210,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 			ret = -EINVAL;
 			goto error;
 		}
-		mhp_altmap.free = PHYS_PFN(size);
-		mhp_altmap.base_pfn = PHYS_PFN(start);
+		reserve_vmmemmap_space(nid, PHYS_PFN(start), PHYS_PFN(size), &mhp_altmap);
 		params.altmap = &mhp_altmap;
 	}
 
@@ -1639,25 +1678,16 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
 	return 0;
 }
 
-int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
-			unsigned long nr_vmemmap_pages)
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	const unsigned long end_pfn = start_pfn + nr_pages;
-	unsigned long pfn, buddy_start_pfn, buddy_nr_pages, system_ram_pages = 0;
+	unsigned long pfn, system_ram_pages = 0;
 	unsigned long flags;
 	struct zone *zone;
 	struct memory_notify arg;
 	int ret, node;
 	char *reason;
 
-	/* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
-	if (WARN_ON_ONCE(!nr_pages ||
-			 !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
-		return -EINVAL;
-
-	buddy_start_pfn = start_pfn + nr_vmemmap_pages;
-	buddy_nr_pages = nr_pages - nr_vmemmap_pages;
-
 	mem_hotplug_begin();
 
 	/*
@@ -1693,7 +1723,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	zone_pcp_disable(zone);
 
 	/* set above range as isolated */
-	ret = start_isolate_page_range(buddy_start_pfn, end_pfn,
+	ret = start_isolate_page_range(start_pfn, end_pfn,
 				       MIGRATE_MOVABLE,
 				       MEMORY_OFFLINE | REPORT_FAILURE);
 	if (ret) {
@@ -1713,7 +1743,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	}
 
 	do {
-		pfn = buddy_start_pfn;
+		pfn = start_pfn;
 		do {
 			if (signal_pending(current)) {
 				ret = -EINTR;
@@ -1744,18 +1774,18 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 		 * offlining actually in order to make hugetlbfs's object
 		 * counting consistent.
 		 */
-		ret = dissolve_free_huge_pages(buddy_start_pfn, end_pfn);
+		ret = dissolve_free_huge_pages(start_pfn, end_pfn);
 		if (ret) {
 			reason = "failure to dissolve huge pages";
 			goto failed_removal_isolated;
 		}
 
-		ret = test_pages_isolated(buddy_start_pfn, end_pfn, MEMORY_OFFLINE);
+		ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
 
 	} while (ret);
 
 	/* Mark all sections offline and remove free pages from the buddy. */
-	__offline_isolated_pages(start_pfn, end_pfn, buddy_start_pfn);
+	__offline_isolated_pages(start_pfn, end_pfn, start_pfn);
 	pr_debug("Offlined Pages %ld\n", nr_pages);
 
 	/*
@@ -1764,13 +1794,13 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	 * of isolated pageblocks, memory onlining will properly revert this.
 	 */
 	spin_lock_irqsave(&zone->lock, flags);
-	zone->nr_isolate_pageblock -= buddy_nr_pages / pageblock_nr_pages;
+	zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
 	spin_unlock_irqrestore(&zone->lock, flags);
 
 	zone_pcp_enable(zone);
 
 	/* removal success */
-	adjust_managed_page_count(pfn_to_page(start_pfn), -buddy_nr_pages);
+	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
 	zone->present_pages -= nr_pages;
 
 	pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -1799,7 +1829,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	return 0;
 
 failed_removal_isolated:
-	undo_isolate_page_range(buddy_start_pfn, end_pfn, MIGRATE_MOVABLE);
+	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
 	memory_notify(MEM_CANCEL_OFFLINE, &arg);
 failed_removal_pcplists_disabled:
 	zone_pcp_enable(zone);
@@ -1830,14 +1860,6 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 	return 0;
 }
 
-static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
-{
-	/*
-	 * If not set, continue with the next block.
-	 */
-	return mem->nr_vmemmap_pages;
-}
-
 static int check_cpu_on_node(pg_data_t *pgdat)
 {
 	int cpu;
@@ -1912,9 +1934,6 @@ EXPORT_SYMBOL(try_offline_node);
 static int __ref try_remove_memory(int nid, u64 start, u64 size)
 {
 	int rc = 0;
-	struct vmem_altmap mhp_altmap = {};
-	struct vmem_altmap *altmap = NULL;
-	unsigned long nr_vmemmap_pages = 0;
 
 	BUG_ON(check_hotplug_memory_range(start, size));
 
@@ -1927,31 +1946,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 	if (rc)
 		return rc;
 
-	/*
-	 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
-	 * the same granularity it was added - a single memory block.
-	 */
-	if (memmap_on_memory) {
-		nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
-						      get_nr_vmemmap_pages_cb);
-		if (nr_vmemmap_pages) {
-			if (size != memory_block_size_bytes()) {
-				pr_warn("Refuse to remove %#llx - %#llx,"
-					"wrong granularity\n",
-					 start, start + size);
-				return -EINVAL;
-			}
-
-			/*
-			 * Let remove_pmd_table->free_hugepage_table
-			 * do the right thing if we used vmem_altmap
-			 * when hot-adding the range.
-			 */
-			mhp_altmap.alloc = nr_vmemmap_pages;
-			altmap = &mhp_altmap;
-		}
-	}
-
 	/* remove memmap entry */
 	firmware_map_remove(start, start + size, "System RAM");
 
@@ -1963,7 +1957,10 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 
 	mem_hotplug_begin();
 
-	arch_remove_memory(nid, start, size, altmap);
+	if (!memmap_on_memory)
+		arch_remove_memory(nid, start, size, NULL);
+	else
+		walk_memory_blocks(start, size, NULL, remove_memory_block_cb);
 
 	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
 		memblock_free(start, size);
-- 
Michal Hocko
SUSE Labs

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ