linux-kernel - [patch 2/2] mm: vmstat: use node_page_state_snapshot in too_many_isolated

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20231113233502.587879658@redhat.com>
Date:   Mon, 13 Nov 2023 20:34:22 -0300
From:   Marcelo Tosatti <mtosatti@...hat.com>
To:     linux-kernel@...r.kernel.org, linux-mm@...ck.org
Cc:     Michal Hocko <mhocko@...e.com>, Vlastimil Babka <vbabka@...e.cz>,
        Andrew Morton <akpm@...ux-foundation.org>,
        David Hildenbrand <david@...hat.com>,
        Peter Xu <peterx@...hat.com>,
        Marcelo Tosatti <mtosatti@...hat.com>
Subject: [patch 2/2] mm: vmstat: use node_page_state_snapshot in too_many_isolated

A customer reported seeing processes hung at too_many_isolated,
while analysis indicated that the problem occurred due to out
of sync per-CPU stats (see below).

The fix is to use node_page_state_snapshot to avoid the stale values.

2136 static unsigned long
    2137 shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
    2138                      struct scan_control *sc, enum lru_list lru)
    2139 {
    :
    2145         bool file = is_file_lru(lru);
    :
    2147         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
    :
    2150         while (unlikely(too_many_isolated(pgdat, file, sc))) {
    2151                 if (stalled)
    2152                         return 0;
    2153 
    2154                 /* wait a bit for the reclaimer. */
    2155                 msleep(100);   <--- some processes were sleeping here, with pending SIGKILL.
    2156                 stalled = true;
    2157 
    2158                 /* We are about to die and free our memory. Return now. */
    2159                 if (fatal_signal_pending(current))
    2160                         return SWAP_CLUSTER_MAX;
    2161         }

msleep() must be called only when there are too many isolated pages: 

    2019 static int too_many_isolated(struct pglist_data *pgdat, int file,
    2020                 struct scan_control *sc)
    2021 {
    :
    2030         if (file) {
    2031                 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
    2032                 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
    2033         } else {
    :
    2046         return isolated > inactive;

The return value was true since:

    crash> p ((struct pglist_data *) 0xffff00817fffe580)->vm_stat[NR_INACTIVE_FILE]
    $8 = {
      counter = 1
    }
    crash> p ((struct pglist_data *) 0xffff00817fffe580)->vm_stat[NR_ISOLATED_FILE]
    $9 = {
      counter = 2
    }

while per_cpu stats had:

    crash> p ((struct pglist_data *) 0xffff00817fffe580)->per_cpu_nodestats
    $85 = (struct per_cpu_nodestat *) 0xffff8000118832e0
    crash> p/x 0xffff8000118832e0 + __per_cpu_offset[42]
    $86 = 0xffff00917fcc32e0
    crash> p ((struct per_cpu_nodestat *) 0xffff00917fcc32e0)->vm_node_stat_diff[NR_ISOLATED_FILE]
    $87 = -1 '\377'
    
    crash> p/x 0xffff8000118832e0 + __per_cpu_offset[44]
    $89 = 0xffff00917fe032e0
    crash> p ((struct per_cpu_nodestat *) 0xffff00917fe032e0)->vm_node_stat_diff[NR_ISOLATED_FILE]
    $91 = -1 '\377' 

It seems that processes were trapped in a direct reclaim/compaction loop
because these nodes had very few free pages, at or below the min watermark.

  crash> kmem -z | grep -A 3 Normal
  :
  NODE: 4  ZONE: 1  ADDR: ffff00817fffec40  NAME: "Normal"
    SIZE: 8454144  PRESENT: 98304  MIN/LOW/HIGH: 68/166/264
    VM_STAT:
          NR_FREE_PAGES: 68
  --
  NODE: 5  ZONE: 1  ADDR: ffff00897fffec40  NAME: "Normal"
    SIZE: 118784  MIN/LOW/HIGH: 82/200/318
    VM_STAT:
          NR_FREE_PAGES: 45
  --
  NODE: 6  ZONE: 1  ADDR: ffff00917fffec40  NAME: "Normal"
    SIZE: 118784  MIN/LOW/HIGH: 82/200/318
    VM_STAT:
          NR_FREE_PAGES: 53
  --
  NODE: 7  ZONE: 1  ADDR: ffff00997fbbec40  NAME: "Normal"
    SIZE: 118784  MIN/LOW/HIGH: 82/200/318
    VM_STAT:
          NR_FREE_PAGES: 52

Signed-off-by: Marcelo Tosatti <mtosatti@...hat.com>

---
 mm/compaction.c |    6 +++---
 mm/vmscan.c     |    8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

Index: linux/mm/compaction.c
===================================================================
--- linux.orig/mm/compaction.c
+++ linux/mm/compaction.c
@@ -791,11 +791,11 @@ static bool too_many_isolated(struct com
 
 	unsigned long active, inactive, isolated;
 
-	inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
+	inactive = node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
 			node_page_state(pgdat, NR_INACTIVE_ANON);
-	active = node_page_state(pgdat, NR_ACTIVE_FILE) +
+	active = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
 			node_page_state(pgdat, NR_ACTIVE_ANON);
-	isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
+	isolated = node_page_state_snapshot(pgdat, NR_ISOLATED_FILE) +
 			node_page_state(pgdat, NR_ISOLATED_ANON);
 
 	/*
Index: linux/mm/vmscan.c
===================================================================
--- linux.orig/mm/vmscan.c
+++ linux/mm/vmscan.c
@@ -1756,11 +1756,11 @@ static int too_many_isolated(struct pgli
 		return 0;
 
 	if (file) {
-		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
-		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
+		inactive = node_page_state_snapshot(pgdat, NR_INACTIVE_FILE);
+		isolated = node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
 	} else {
-		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
-		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
+		inactive = node_page_state_snapshot(pgdat, NR_INACTIVE_ANON);
+		isolated = node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
 	}
 
 	/*