linux-kernel - Re: [PATCH 2/2] mm,memory_hotplug: {READ,WRITE}

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CABzRoyZXq3u4DYxO39Fcezo56HAbkNh6xLuK9jnuiNK5gVmV1w@mail.gmail.com>
Date: Wed, 22 May 2024 12:25:30 +0800
From: Lance Yang <ioworker0@...il.com>
To: Brendan Jackman <jackmanb@...gle.com>
Cc: David Hildenbrand <david@...hat.com>, Oscar Salvador <osalvador@...e.de>, 
	Andrew Morton <akpm@...ux-foundation.org>, Mike Rapoport <rppt@...nel.org>, 
	Michal Hocko <mhocko@...e.com>, Anshuman Khandual <anshuman.khandual@....com>, 
	Vlastimil Babka <vbabka@...e.cz>, Pavel Tatashin <pasha.tatashin@...een.com>, linux-mm@...ck.org, 
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/2] mm,memory_hotplug: {READ,WRITE}_ONCE unsynchronized
 zone data

Hi Brendan,

On Tue, May 21, 2024 at 8:57 PM Brendan Jackman <jackmanb@...gle.com> wrote:
>
> These fields are written by memory hotplug under mem_hotplug_lock but
> read without any lock. It seems like reader code is robust against the
> value being stale or "from the future", but we also need to account
> for:
>
> 1. Load/store tearing (according to Linus[1], this really happens,
>    even when everything is aligned as you would hope).
>
> 2. Invented loads[2] - the compiler can spill and re-read these fields
>    ([2] calls this "invented loads") and assume that they have not
>    changed.
>
> Note we don't need READ_ONCE in paths that have the mem_hotplug_lock
> for write, but we still need WRITE_ONCE to prevent store-tearing.
>
> [1] https://lore.kernel.org/all/CAHk-=wj2t+GK+DGQ7Xy6U7zMf72e7Jkxn4_-kGyfH3WFEoH+YQ@mail.gmail.com/T/#u
>     As discovered via the original big-bad article[2]
> [2] https://lwn.net/Articles/793253/
>
> Signed-off-by: Brendan Jackman <jackmanb@...gle.com>
> ---
>  include/linux/mmzone.h | 14 ++++++++++----
>  mm/compaction.c        |  2 +-
>  mm/memory_hotplug.c    | 20 ++++++++++++--------
>  mm/mm_init.c           |  2 +-
>  mm/page_alloc.c        |  2 +-
>  mm/show_mem.c          |  8 ++++----
>  mm/vmstat.c            |  4 ++--
>  7 files changed, 31 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 194ef7fed9d6..bdb3be76d10c 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1018,11 +1018,13 @@ static inline unsigned long zone_cma_pages(struct zone *zone)
>  #endif
>  }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
>  static inline unsigned long zone_end_pfn(const struct zone *zone)
>  {
> -       return zone->zone_start_pfn + zone->spanned_pages;
> +       return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
>  }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
>  static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
>  {
>         return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
> @@ -1033,9 +1035,10 @@ static inline bool zone_is_initialized(struct zone *zone)
>         return zone->initialized;
>  }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
>  static inline bool zone_is_empty(struct zone *zone)
>  {
> -       return zone->spanned_pages == 0;
> +       return READ_ONCE(zone->spanned_pages) == 0;
>  }
>
>  #ifndef BUILD_VDSO32_64
> @@ -1485,10 +1488,13 @@ static inline bool managed_zone(struct zone *zone)
>         return zone_managed_pages(zone);
>  }
>
> -/* Returns true if a zone has memory */
> +/*
> + * Returns true if a zone has memory.
> + * This is unstable unless you hold mem_hotplug_lock.
> + */
>  static inline bool populated_zone(struct zone *zone)
>  {
> -       return zone->present_pages;
> +       return READ_ONCE(zone->present_pages);
>  }
>
>  #ifdef CONFIG_NUMA
> diff --git a/mm/compaction.c b/mm/compaction.c
> index e731d45befc7..b8066d1fdcf5 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -2239,7 +2239,7 @@ static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
>  {
>         unsigned long score;
>
> -       score = zone->present_pages * fragmentation_score_zone(zone);
> +       score = READ_ONCE(zone->present_pages) * fragmentation_score_zone(zone);
>         return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
>  }
>
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 431b1f6753c0..71b5e3d314a2 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -463,6 +463,8 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>         int nid = zone_to_nid(zone);
>
>         if (zone->zone_start_pfn == start_pfn) {
> +               unsigned long old_end_pfn = zone_end_pfn(zone);
> +
>                 /*
>                  * If the section is smallest section in the zone, it need
>                  * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
> @@ -470,13 +472,13 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>                  * for shrinking zone.
>                  */
>                 pfn = find_smallest_section_pfn(nid, zone, end_pfn,
> -                                               zone_end_pfn(zone));
> +                                               old_end_pfn);
>                 if (pfn) {
> -                       zone->spanned_pages = zone_end_pfn(zone) - pfn;
> +                       WRITE_ONCE(zone->spanned_pages, old_end_pfn - pfn);
>                         zone->zone_start_pfn = pfn;
>                 } else {
>                         zone->zone_start_pfn = 0;
> -                       zone->spanned_pages = 0;
> +                       WRITE_ONCE(zone->spanned_pages, 0);
>                 }
>         } else if (zone_end_pfn(zone) == end_pfn) {
>                 /*
> @@ -488,10 +490,11 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>                 pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
>                                                start_pfn);
>                 if (pfn)
> -                       zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
> +                       WRITE_ONCE(zone->spanned_pages,
> +                                  pfn - zone->zone_start_pfn + 1);
>                 else {
>                         zone->zone_start_pfn = 0;
> -                       zone->spanned_pages = 0;
> +                       WRITE_ONCE(zone->spanned_pages, 0);
>                 }
>         }
>  }
> @@ -710,7 +713,8 @@ static void __meminit resize_zone_range(struct zone *zone, unsigned long start_p
>         if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
>                 zone->zone_start_pfn = start_pfn;
>
> -       zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
> +       WRITE_ONCE(zone->spanned_pages,
> +                  max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn);
>  }
>
>  static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
> @@ -795,7 +799,7 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
>                                             struct zone *zone)
>  {
>         if (zone_idx(zone) == ZONE_MOVABLE) {
> -               stats->movable_pages += zone->present_pages;
> +               stats->movable_pages += READ_ONCE(zone->present_pages);
>         } else {
>                 stats->kernel_early_pages += zone->present_early_pages;
>  #ifdef CONFIG_CMA
> @@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
>          */
>         if (early_section(__pfn_to_section(page_to_pfn(page))))
>                 zone->present_early_pages += nr_pages;
> -       zone->present_pages += nr_pages;
> +       WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);

I'm not sure that wrapping the store in WRITE_ONCE() would prevent load tearing
of the plain read of 'zone->present_pages' on the right-hand side, but it's
probably just me overthinking it :)

Thanks,
Lance

>         zone->zone_pgdat->node_present_pages += nr_pages;
>
>         if (group && movable)
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index c725618aeb58..ec66f2eadb95 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1540,7 +1540,7 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
>         for (z = 0; z < MAX_NR_ZONES; z++) {
>                 struct zone *zone = pgdat->node_zones + z;
>
> -               zone->present_pages = 0;
> +               WRITE_ONCE(zone->present_pages, 0);
>                 zone_init_internals(zone, z, nid, 0);
>         }
>  }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 5116a2b9ea6e..1eb9000ec7d7 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5728,7 +5728,7 @@ __meminit void zone_pcp_init(struct zone *zone)
>
>         if (populated_zone(zone))
>                 pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
> -                        zone->present_pages, zone_batchsize(zone));
> +                        READ_ONCE(zone->present_pages), zone_batchsize(zone));
>  }
>
>  void adjust_managed_page_count(struct page *page, long count)
> diff --git a/mm/show_mem.c b/mm/show_mem.c
> index bdb439551eef..667680a6107b 100644
> --- a/mm/show_mem.c
> +++ b/mm/show_mem.c
> @@ -337,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
>                         K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
>                         K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
>                         K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
> -                       K(zone->present_pages),
> +                       K(READ_ONCE(zone->present_pages)),
>                         K(zone_managed_pages(zone)),
>                         K(zone_page_state(zone, NR_MLOCK)),
>                         K(zone_page_state(zone, NR_BOUNCE)),
> @@ -407,11 +407,11 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
>
>         for_each_populated_zone(zone) {
>
> -               total += zone->present_pages;
> -               reserved += zone->present_pages - zone_managed_pages(zone);
> +               total += READ_ONCE(zone->present_pages);
> +               reserved += READ_ONCE(zone->present_pages) - zone_managed_pages(zone);
>
>                 if (is_highmem(zone))
> -                       highmem += zone->present_pages;
> +                       highmem += READ_ONCE(zone->present_pages);
>         }
>
>         printk("%lu pages RAM\n", total);
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 8507c497218b..5a9c4b5768e5 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1708,8 +1708,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
>                    min_wmark_pages(zone),
>                    low_wmark_pages(zone),
>                    high_wmark_pages(zone),
> -                  zone->spanned_pages,
> -                  zone->present_pages,
> +                  READ_ONCE(zone->spanned_pages),
> +                  READ_ONCE(zone->present_pages),
>                    zone_managed_pages(zone),
>                    zone_cma_pages(zone));
>
>
> --
> 2.45.0.rc1.225.g2a3ae87e7f-goog
>
>