Message-ID: <20251022135735.246203-1-akinobu.mita@gmail.com>
Date: Wed, 22 Oct 2025 22:57:35 +0900
From: Akinobu Mita <akinobu.mita@...il.com>
To: linux-kernel@...r.kernel.org
Cc: linux-cxl@...r.kernel.org,
linux-mm@...ck.org,
akinobu.mita@...il.com
Subject: oom-killer not invoked on systems with multiple memory-tiers
On systems with multiple memory-tiers consisting of DRAM and CXL memory,
the OOM killer is not invoked properly.
Here's the command to reproduce:
$ stress-ng --oomable -v --memrate 20 --memrate-bytes 10G \
--memrate-rd-mbs 1 --memrate-wr-mbs 1
The total memory usage is the number of workers specified with the --memrate
option multiplied by the buffer size specified with the --memrate-bytes
option, so adjust these values so that the total exceeds the combined size
of the installed DRAM and CXL memory.
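For example, the command above uses 20 workers with 10G buffers, so it
targets roughly 20 x 10 GiB = 200 GiB in total.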
If swap is disabled, you can usually expect the OOM killer to terminate
the stress-ng process when memory usage approaches the installed memory size.
However, if multiple memory-tiers exist (i.e. multiple
/sys/devices/virtual/memory_tiering/memory_tier<N> directories exist),
/sys/kernel/mm/numa/demotion_enabled is true, and
/sys/kernel/mm/lru_gen/min_ttl_ms is 0, the OOM killer is not invoked
and the system becomes inoperable.
If /sys/kernel/mm/numa/demotion_enabled is false, or if demotion_enabled
is true but /sys/kernel/mm/lru_gen/min_ttl_ms is set to a non-zero value
such as 1000, the OOM killer will be invoked properly.
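For reference, the settings involved can be inspected, and min_ttl_ms can be
raised as a workaround, roughly like this (same sysfs paths as above):
$ ls -d /sys/devices/virtual/memory_tiering/memory_tier*
$ cat /sys/kernel/mm/numa/demotion_enabled
$ cat /sys/kernel/mm/lru_gen/min_ttl_ms
# echo 1000 > /sys/kernel/mm/lru_gen/min_ttl_ms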
This issue can also be reproduced using NUMA emulation, even on systems with
only DRAM. However, to configure multiple memory-tiers using fake nodes,
you need to apply the patch below.
You can create two fake memory-tiers by booting a single-node system with
the following boot options:
numa=fake=2
numa_emulation.default_dram=1,0
numa_emulation.read_latency=100,1000
numa_emulation.write_latency=100,1000
numa_emulation.read_bandwidth=100000,10000
numa_emulation.write_bandwidth=100000,10000
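After booting with these options, the expectation is that two separate
memory_tier<N> directories appear (the exact tier numbers depend on the
computed abstract distance):
$ ls -d /sys/devices/virtual/memory_tiering/memory_tier*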
---
mm/numa_emulation.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 45 insertions(+)
diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c
index 703c8fa05048..b1c283b99038 100644
--- a/mm/numa_emulation.c
+++ b/mm/numa_emulation.c
@@ -6,6 +6,9 @@
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
+#include <linux/memory-tiers.h>
+#include <linux/module.h>
+#include <linux/node.h>
#include <linux/numa_memblks.h>
#include <asm/numa.h>
#include <acpi/acpi_numa.h>
@@ -344,6 +347,46 @@ static int __init setup_emu2phys_nid(int *dfl_phys_nid)
return max_emu_nid;
}
+static bool default_dram[MAX_NUMNODES];
+module_param_array(default_dram, bool, NULL, 0400);
+
+static unsigned int read_latency[MAX_NUMNODES];
+module_param_array(read_latency, uint, NULL, 0400);
+
+static unsigned int write_latency[MAX_NUMNODES];
+module_param_array(write_latency, uint, NULL, 0400);
+
+static unsigned int read_bandwidth[MAX_NUMNODES];
+module_param_array(read_bandwidth, uint, NULL, 0400);
+
+static unsigned int write_bandwidth[MAX_NUMNODES];
+module_param_array(write_bandwidth, uint, NULL, 0400);
+
+static int emu_calculate_adistance(struct notifier_block *self,
+ unsigned long nid, void *data)
+{
+ struct access_coordinate perf = {
+ .read_bandwidth = read_bandwidth[nid],
+ .write_bandwidth = write_bandwidth[nid],
+ .read_latency = read_latency[nid],
+ .write_latency = write_latency[nid],
+ };
+ int *adist = data;
+
+ if (default_dram[nid])
+ mt_set_default_dram_perf(nid, &perf, "numa_emu");
+
+ if (mt_perf_to_adistance(&perf, adist))
+ return NOTIFY_OK;
+
+ return NOTIFY_STOP;
+}
+
+static struct notifier_block emu_adist_nb = {
+ .notifier_call = emu_calculate_adistance,
+ .priority = INT_MIN,
+};
+
/**
* numa_emulation - Emulate NUMA nodes
* @numa_meminfo: NUMA configuration to massage
@@ -532,6 +575,8 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
}
}
+ register_mt_adistance_algorithm(&emu_adist_nb);
+
/* free the copied physical distance table */
memblock_free(phys_dist, phys_size);
return;
--
2.43.0