Message-ID: <20240521-mm-hotplug-sync-v1-2-6d53706c1ba8@google.com>
Date: Tue, 21 May 2024 12:57:19 +0000
From: Brendan Jackman <jackmanb@...gle.com>
To: David Hildenbrand <david@...hat.com>, Oscar Salvador <osalvador@...e.de>, 
	Andrew Morton <akpm@...ux-foundation.org>, Mike Rapoport <rppt@...nel.org>
Cc: Michal Hocko <mhocko@...e.com>, Anshuman Khandual <anshuman.khandual@....com>, 
	Vlastimil Babka <vbabka@...e.cz>, Pavel Tatashin <pasha.tatashin@...een.com>, linux-mm@...ck.org, 
	linux-kernel@...r.kernel.org, Brendan Jackman <jackmanb@...gle.com>
Subject: [PATCH 2/2] mm,memory_hotplug: {READ,WRITE}_ONCE unsynchronized zone data

These fields are written by memory hotplug under mem_hotplug_lock but
read without any lock. The reader code appears to be robust against the
value being stale or "from the future", but we also need to account
for:

1. Load/store tearing (according to Linus[1], this really happens,
   even when everything is aligned as you would hope).

2. Invented loads[2] - the compiler can spill a register and later
   re-read the field from memory, assuming the value has not changed
   in the meantime.

Note we don't need READ_ONCE in paths that hold mem_hotplug_lock for
write, but we still need WRITE_ONCE there to prevent store tearing; a
standalone sketch of the pattern follows below.

[1] https://lore.kernel.org/all/CAHk-=wj2t+GK+DGQ7Xy6U7zMf72e7Jkxn4_-kGyfH3WFEoH+YQ@mail.gmail.com/T/#u
    As discovered via the original big-bad article[2]
[2] https://lwn.net/Articles/793253/
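
For illustration only (not part of the patch): a minimal userspace
sketch of the pattern this series applies. The simplified
READ_ONCE()/WRITE_ONCE() below are stand-ins for the kernel macros
(modelled on their volatile-cast behaviour), and fake_zone,
hotplug_writer() and lockless_reader() are made-up names; the real
code uses the kernel macros and mem_hotplug_lock.

#include <stdio.h>

/*
 * Simplified stand-ins for the kernel macros: a volatile access forces
 * the compiler to emit exactly one load/store, so it cannot invent an
 * extra load later or tear the access into smaller pieces.
 */
#define READ_ONCE(x)     (*(volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile typeof(x) *)&(x) = (v))

struct fake_zone {
        unsigned long zone_start_pfn;
        unsigned long spanned_pages;    /* written under a lock, read without */
        unsigned long present_pages;
};

/*
 * Writer side: imagine this runs with a hotplug lock held for write.
 * READ_ONCE is unnecessary here, but WRITE_ONCE still prevents the
 * store from being torn as seen by lockless readers.
 */
static void hotplug_writer(struct fake_zone *z, unsigned long nr_pages)
{
        WRITE_ONCE(z->spanned_pages, z->spanned_pages + nr_pages);
        WRITE_ONCE(z->present_pages, z->present_pages + nr_pages);
}

/*
 * Reader side: no lock, so reads of the unstable fields go through
 * READ_ONCE; the result may be stale but is never torn or re-fetched.
 */
static unsigned long lockless_reader(struct fake_zone *z)
{
        return z->zone_start_pfn + READ_ONCE(z->spanned_pages);
}

int main(void)
{
        struct fake_zone z = { .zone_start_pfn = 0x1000 };

        hotplug_writer(&z, 512);
        printf("end pfn: %#lx\n", lockless_reader(&z));
        return 0;
}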

Signed-off-by: Brendan Jackman <jackmanb@...gle.com>
---
 include/linux/mmzone.h | 14 ++++++++++----
 mm/compaction.c        |  2 +-
 mm/memory_hotplug.c    | 20 ++++++++++++--------
 mm/mm_init.c           |  2 +-
 mm/page_alloc.c        |  2 +-
 mm/show_mem.c          |  8 ++++----
 mm/vmstat.c            |  4 ++--
 7 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 194ef7fed9d6..bdb3be76d10c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1018,11 +1018,13 @@ static inline unsigned long zone_cma_pages(struct zone *zone)
 #endif
 }
 
+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline unsigned long zone_end_pfn(const struct zone *zone)
 {
-	return zone->zone_start_pfn + zone->spanned_pages;
+	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
 }
 
+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
 {
 	return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
@@ -1033,9 +1035,10 @@ static inline bool zone_is_initialized(struct zone *zone)
 	return zone->initialized;
 }
 
+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline bool zone_is_empty(struct zone *zone)
 {
-	return zone->spanned_pages == 0;
+	return READ_ONCE(zone->spanned_pages) == 0;
 }
 
 #ifndef BUILD_VDSO32_64
@@ -1485,10 +1488,13 @@ static inline bool managed_zone(struct zone *zone)
 	return zone_managed_pages(zone);
 }
 
-/* Returns true if a zone has memory */
+/*
+ * Returns true if a zone has memory.
+ * This is unstable unless you hold mem_hotplug_lock.
+ */
 static inline bool populated_zone(struct zone *zone)
 {
-	return zone->present_pages;
+	return READ_ONCE(zone->present_pages);
 }
 
 #ifdef CONFIG_NUMA
diff --git a/mm/compaction.c b/mm/compaction.c
index e731d45befc7..b8066d1fdcf5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2239,7 +2239,7 @@ static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
 {
 	unsigned long score;
 
-	score = zone->present_pages * fragmentation_score_zone(zone);
+	score = READ_ONCE(zone->present_pages) * fragmentation_score_zone(zone);
 	return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
 }
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 431b1f6753c0..71b5e3d314a2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -463,6 +463,8 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 	int nid = zone_to_nid(zone);
 
 	if (zone->zone_start_pfn == start_pfn) {
+		unsigned long old_end_pfn = zone_end_pfn(zone);
+
 		/*
 		 * If the section is smallest section in the zone, it need
 		 * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -470,13 +472,13 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 		 * for shrinking zone.
 		 */
 		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
-						zone_end_pfn(zone));
+						old_end_pfn);
 		if (pfn) {
-			zone->spanned_pages = zone_end_pfn(zone) - pfn;
+			WRITE_ONCE(zone->spanned_pages, old_end_pfn - pfn);
 			zone->zone_start_pfn = pfn;
 		} else {
 			zone->zone_start_pfn = 0;
-			zone->spanned_pages = 0;
+			WRITE_ONCE(zone->spanned_pages, 0);
 		}
 	} else if (zone_end_pfn(zone) == end_pfn) {
 		/*
@@ -488,10 +490,11 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 		pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
 					       start_pfn);
 		if (pfn)
-			zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+			WRITE_ONCE(zone->spanned_pages,
+				   pfn - zone->zone_start_pfn + 1);
 		else {
 			zone->zone_start_pfn = 0;
-			zone->spanned_pages = 0;
+			WRITE_ONCE(zone->spanned_pages, 0);
 		}
 	}
 }
@@ -710,7 +713,8 @@ static void __meminit resize_zone_range(struct zone *zone, unsigned long start_p
 	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
 		zone->zone_start_pfn = start_pfn;
 
-	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
+	WRITE_ONCE(zone->spanned_pages,
+		   max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn);
 }
 
 static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
@@ -795,7 +799,7 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
 					    struct zone *zone)
 {
 	if (zone_idx(zone) == ZONE_MOVABLE) {
-		stats->movable_pages += zone->present_pages;
+		stats->movable_pages += READ_ONCE(zone->present_pages);
 	} else {
 		stats->kernel_early_pages += zone->present_early_pages;
 #ifdef CONFIG_CMA
@@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
 	 */
 	if (early_section(__pfn_to_section(page_to_pfn(page))))
 		zone->present_early_pages += nr_pages;
-	zone->present_pages += nr_pages;
+	WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
 	zone->zone_pgdat->node_present_pages += nr_pages;
 
 	if (group && movable)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c725618aeb58..ec66f2eadb95 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1540,7 +1540,7 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
 	for (z = 0; z < MAX_NR_ZONES; z++) {
 		struct zone *zone = pgdat->node_zones + z;
 
-		zone->present_pages = 0;
+		WRITE_ONCE(zone->present_pages, 0);
 		zone_init_internals(zone, z, nid, 0);
 	}
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5116a2b9ea6e..1eb9000ec7d7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5728,7 +5728,7 @@ __meminit void zone_pcp_init(struct zone *zone)
 
 	if (populated_zone(zone))
 		pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
-			 zone->present_pages, zone_batchsize(zone));
+			 READ_ONCE(zone->present_pages), zone_batchsize(zone));
 }
 
 void adjust_managed_page_count(struct page *page, long count)
diff --git a/mm/show_mem.c b/mm/show_mem.c
index bdb439551eef..667680a6107b 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -337,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 			K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
 			K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
 			K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
-			K(zone->present_pages),
+			K(READ_ONCE(zone->present_pages)),
 			K(zone_managed_pages(zone)),
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_BOUNCE)),
@@ -407,11 +407,11 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
 
 	for_each_populated_zone(zone) {
 
-		total += zone->present_pages;
-		reserved += zone->present_pages - zone_managed_pages(zone);
+		total += READ_ONCE(zone->present_pages);
+		reserved += READ_ONCE(zone->present_pages) - zone_managed_pages(zone);
 
 		if (is_highmem(zone))
-			highmem += zone->present_pages;
+			highmem += READ_ONCE(zone->present_pages);
 	}
 
 	printk("%lu pages RAM\n", total);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8507c497218b..5a9c4b5768e5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1708,8 +1708,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
-		   zone->spanned_pages,
-		   zone->present_pages,
+		   READ_ONCE(zone->spanned_pages),
+		   READ_ONCE(zone->present_pages),
 		   zone_managed_pages(zone),
 		   zone_cma_pages(zone));
 

-- 
2.45.0.rc1.225.g2a3ae87e7f-goog

