[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240521-mm-hotplug-sync-v1-2-6d53706c1ba8@google.com>
Date: Tue, 21 May 2024 12:57:19 +0000
From: Brendan Jackman <jackmanb@...gle.com>
To: David Hildenbrand <david@...hat.com>, Oscar Salvador <osalvador@...e.de>,
Andrew Morton <akpm@...ux-foundation.org>, Mike Rapoport <rppt@...nel.org>
Cc: Michal Hocko <mhocko@...e.com>, Anshuman Khandual <anshuman.khandual@....com>,
Vlastimil Babka <vbabka@...e.cz>, Pavel Tatashin <pasha.tatashin@...een.com>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org, Brendan Jackman <jackmanb@...gle.com>
Subject: [PATCH 2/2] mm,memory_hotplug: {READ,WRITE}_ONCE unsynchronized zone data
These fields are written by memory hotplug under mem_hotplug_lock but
read without any lock. It seems like reader code is robust against the
value being stale or "from the future", but we also need to account
for:
1. Load/store tearing (according to Linus[1], this really happens,
even when everything is aligned as you would hope).
2. Invented loads[2] - the compiler can spill and re-read these fields
([2] calls this "invented loads") and assume that they have not
changed.
Note we don't need READ_ONCE in paths that have the mem_hotplug_lock
for write, but we still need WRITE_ONCE to prevent store-tearing.
[1] https://lore.kernel.org/all/CAHk-=wj2t+GK+DGQ7Xy6U7zMf72e7Jkxn4_-kGyfH3WFEoH+YQ@mail.gmail.com/T/#u
As discovered via the original big-bad article[2]
[2] https://lwn.net/Articles/793253/
Signed-off-by: Brendan Jackman <jackmanb@...gle.com>
---
include/linux/mmzone.h | 14 ++++++++++----
mm/compaction.c | 2 +-
mm/memory_hotplug.c | 20 ++++++++++++--------
mm/mm_init.c | 2 +-
mm/page_alloc.c | 2 +-
mm/show_mem.c | 8 ++++----
mm/vmstat.c | 4 ++--
7 files changed, 31 insertions(+), 21 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 194ef7fed9d6..bdb3be76d10c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1018,11 +1018,13 @@ static inline unsigned long zone_cma_pages(struct zone *zone)
#endif
}
+/* This is unstable unless you hold mem_hotplug_lock. */
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
- return zone->zone_start_pfn + zone->spanned_pages;
+ return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
}
+/* This is unstable unless you hold mem_hotplug_lock. */
static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
@@ -1033,9 +1035,10 @@ static inline bool zone_is_initialized(struct zone *zone)
return zone->initialized;
}
+/* This is unstable unless you hold mem_hotplug_lock. */
static inline bool zone_is_empty(struct zone *zone)
{
- return zone->spanned_pages == 0;
+ return READ_ONCE(zone->spanned_pages) == 0;
}
#ifndef BUILD_VDSO32_64
@@ -1485,10 +1488,13 @@ static inline bool managed_zone(struct zone *zone)
return zone_managed_pages(zone);
}
-/* Returns true if a zone has memory */
+/*
+ * Returns true if a zone has memory.
+ * This is unstable unless you hold mem_hotplug_lock.
+ */
static inline bool populated_zone(struct zone *zone)
{
- return zone->present_pages;
+ return READ_ONCE(zone->present_pages);
}
#ifdef CONFIG_NUMA
diff --git a/mm/compaction.c b/mm/compaction.c
index e731d45befc7..b8066d1fdcf5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2239,7 +2239,7 @@ static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
{
unsigned long score;
- score = zone->present_pages * fragmentation_score_zone(zone);
+ score = READ_ONCE(zone->present_pages) * fragmentation_score_zone(zone);
return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 431b1f6753c0..71b5e3d314a2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -463,6 +463,8 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
int nid = zone_to_nid(zone);
if (zone->zone_start_pfn == start_pfn) {
+ unsigned long old_end_pfn = zone_end_pfn(zone);
+
/*
* If the section is smallest section in the zone, it need
* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -470,13 +472,13 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
* for shrinking zone.
*/
pfn = find_smallest_section_pfn(nid, zone, end_pfn,
- zone_end_pfn(zone));
+ old_end_pfn);
if (pfn) {
- zone->spanned_pages = zone_end_pfn(zone) - pfn;
+ WRITE_ONCE(zone->spanned_pages, old_end_pfn - pfn);
zone->zone_start_pfn = pfn;
} else {
zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
+ WRITE_ONCE(zone->spanned_pages, 0);
}
} else if (zone_end_pfn(zone) == end_pfn) {
/*
@@ -488,10 +490,11 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
start_pfn);
if (pfn)
- zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+ WRITE_ONCE(zone->spanned_pages,
+ pfn - zone->zone_start_pfn + 1);
else {
zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
+ WRITE_ONCE(zone->spanned_pages, 0);
}
}
}
@@ -710,7 +713,8 @@ static void __meminit resize_zone_range(struct zone *zone, unsigned long start_p
if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
zone->zone_start_pfn = start_pfn;
- zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
+ WRITE_ONCE(zone->spanned_pages,
+ max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn);
}
static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
@@ -795,7 +799,7 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
struct zone *zone)
{
if (zone_idx(zone) == ZONE_MOVABLE) {
- stats->movable_pages += zone->present_pages;
+ stats->movable_pages += READ_ONCE(zone->present_pages);
} else {
stats->kernel_early_pages += zone->present_early_pages;
#ifdef CONFIG_CMA
@@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
*/
if (early_section(__pfn_to_section(page_to_pfn(page))))
zone->present_early_pages += nr_pages;
- zone->present_pages += nr_pages;
+ WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
zone->zone_pgdat->node_present_pages += nr_pages;
if (group && movable)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c725618aeb58..ec66f2eadb95 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1540,7 +1540,7 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
for (z = 0; z < MAX_NR_ZONES; z++) {
struct zone *zone = pgdat->node_zones + z;
- zone->present_pages = 0;
+ WRITE_ONCE(zone->present_pages, 0);
zone_init_internals(zone, z, nid, 0);
}
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5116a2b9ea6e..1eb9000ec7d7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5728,7 +5728,7 @@ __meminit void zone_pcp_init(struct zone *zone)
if (populated_zone(zone))
pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
- zone->present_pages, zone_batchsize(zone));
+ READ_ONCE(zone->present_pages), zone_batchsize(zone));
}
void adjust_managed_page_count(struct page *page, long count)
diff --git a/mm/show_mem.c b/mm/show_mem.c
index bdb439551eef..667680a6107b 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -337,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
- K(zone->present_pages),
+ K(READ_ONCE(zone->present_pages)),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_BOUNCE)),
@@ -407,11 +407,11 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
for_each_populated_zone(zone) {
- total += zone->present_pages;
- reserved += zone->present_pages - zone_managed_pages(zone);
+ total += READ_ONCE(zone->present_pages);
+ reserved += READ_ONCE(zone->present_pages) - zone_managed_pages(zone);
if (is_highmem(zone))
- highmem += zone->present_pages;
+ highmem += READ_ONCE(zone->present_pages);
}
printk("%lu pages RAM\n", total);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8507c497218b..5a9c4b5768e5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1708,8 +1708,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
- zone->spanned_pages,
- zone->present_pages,
+ READ_ONCE(zone->spanned_pages),
+ READ_ONCE(zone->present_pages),
zone_managed_pages(zone),
zone_cma_pages(zone));
--
2.45.0.rc1.225.g2a3ae87e7f-goog
Powered by blists - more mailing lists