[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260129144043.231636-5-bharata@amd.com>
Date: Thu, 29 Jan 2026 20:10:37 +0530
From: Bharata B Rao <bharata@....com>
To: <linux-kernel@...r.kernel.org>, <linux-mm@...ck.org>
CC: <Jonathan.Cameron@...wei.com>, <dave.hansen@...el.com>,
<gourry@...rry.net>, <mgorman@...hsingularity.net>, <mingo@...hat.com>,
<peterz@...radead.org>, <raghavendra.kt@....com>, <riel@...riel.com>,
<rientjes@...gle.com>, <sj@...nel.org>, <weixugc@...gle.com>,
<willy@...radead.org>, <ying.huang@...ux.alibaba.com>, <ziy@...dia.com>,
<dave@...olabs.net>, <nifan.cxl@...il.com>, <xuezhengchu@...wei.com>,
<yiannis@...corp.com>, <akpm@...ux-foundation.org>, <david@...hat.com>,
<byungchul@...com>, <kinseyho@...gle.com>, <joshua.hahnjy@...il.com>,
<yuanchu@...gle.com>, <balbirs@...dia.com>, <alok.rathore@...sung.com>,
<shivankg@....com>, Bharata B Rao <bharata@....com>
Subject: [RFC PATCH v5 04/10] mm: pghot: Precision mode for pghot
By default, one byte per PFN is used to store hotness information.
Only a limited number of bits are available to store the access time,
which leads to coarse-grained time tracking. Also, there aren't enough
bits to track the toptier NID explicitly and hence the default
target_nid is used for promotion.
Precision mode relaxes these limitations by storing the
hotness information in 4 bytes per PFN. More fine-grained
access time tracking and toptier NID tracking become possible
in this mode.
This is typically useful when the toptier consists of more than one node.
Signed-off-by: Bharata B Rao <bharata@....com>
---
Documentation/admin-guide/mm/pghot.txt | 4 +-
include/linux/mmzone.h | 2 +-
include/linux/pghot.h | 31 ++++++++++++
mm/Kconfig | 11 ++++
mm/Makefile | 7 ++-
mm/pghot-precise.c | 70 ++++++++++++++++++++++++++
mm/pghot.c | 13 +++--
7 files changed, 130 insertions(+), 8 deletions(-)
create mode 100644 mm/pghot-precise.c
diff --git a/Documentation/admin-guide/mm/pghot.txt b/Documentation/admin-guide/mm/pghot.txt
index 01291b72e7ab..b329e692ef89 100644
--- a/Documentation/admin-guide/mm/pghot.txt
+++ b/Documentation/admin-guide/mm/pghot.txt
@@ -38,7 +38,7 @@ Path: /sys/kernel/debug/pghot/
3. **freq_threshold**
- Minimum access frequency before a page is marked ready for promotion.
- - Range: 1 to 3
+ - Range: 1 to 3 in default mode, 1 to 7 in precision mode.
- Default: 2
- Example:
# echo 3 > /sys/kernel/debug/pghot/freq_threshold
@@ -60,7 +60,7 @@ Path: /proc/sys/vm/pghot_promote_freq_window_ms
- Controls the time window (in ms) for counting access frequency. A page is
considered hot only when **freq_threshold** number of accesses occur with
this time period.
-- Default: 4000 (4 seconds)
+- Default: 4000 (4 seconds) in default mode and 5000 (5 seconds) in precision mode.
- Example:
# sysctl vm.pghot_promote_freq_window_ms=3000
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 22e08befb096..49c374064fc2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1924,7 +1924,7 @@ struct mem_section {
#ifdef CONFIG_PGHOT
/*
* Per-PFN hotness data for this section.
- * Array of phi_t (u8 in default mode).
+ * Array of phi_t (u8 in default mode, u32 in precision mode).
* LSB is used as PGHOT_SECTION_HOT_BIT flag.
*/
void *hot_map;
diff --git a/include/linux/pghot.h b/include/linux/pghot.h
index 88e57aab697b..d3d59b0c0cf6 100644
--- a/include/linux/pghot.h
+++ b/include/linux/pghot.h
@@ -48,6 +48,36 @@ enum pghot_src_enabled {
#define PGHOT_DEFAULT_NODE 0
+#if defined(CONFIG_PGHOT_PRECISE)
+#define PGHOT_DEFAULT_FREQ_WINDOW (5 * MSEC_PER_SEC)
+
+/*
+ * Bits 0-26 store nid (bits 0-9), frequency (10-12) and time (13-26).
+ * Bits 27-30 are unused now.
+ * Bit 31 is used to indicate the page is ready for migration.
+ */
+#define PGHOT_MIGRATE_READY 31
+
+#define PGHOT_NID_WIDTH 10
+#define PGHOT_FREQ_WIDTH 3
+/* time is stored in 14 bits, i.e. up to 2^14 jiffies (~16.4s with HZ=1000) */
+#define PGHOT_TIME_WIDTH 14
+
+#define PGHOT_NID_SHIFT 0
+#define PGHOT_FREQ_SHIFT (PGHOT_NID_SHIFT + PGHOT_NID_WIDTH)
+#define PGHOT_TIME_SHIFT (PGHOT_FREQ_SHIFT + PGHOT_FREQ_WIDTH)
+
+#define PGHOT_NID_MASK GENMASK(PGHOT_NID_WIDTH - 1, 0)
+#define PGHOT_FREQ_MASK GENMASK(PGHOT_FREQ_WIDTH - 1, 0)
+#define PGHOT_TIME_MASK GENMASK(PGHOT_TIME_WIDTH - 1, 0)
+
+#define PGHOT_NID_MAX ((1 << PGHOT_NID_WIDTH) - 1)
+#define PGHOT_FREQ_MAX ((1 << PGHOT_FREQ_WIDTH) - 1)
+#define PGHOT_TIME_MAX ((1 << PGHOT_TIME_WIDTH) - 1)
+
+typedef u32 phi_t;
+
+#else /* !CONFIG_PGHOT_PRECISE */
#define PGHOT_DEFAULT_FREQ_WINDOW (4 * MSEC_PER_SEC)
/*
@@ -74,6 +104,7 @@ enum pghot_src_enabled {
#define PGHOT_TIME_MAX ((1 << PGHOT_TIME_WIDTH) - 1)
typedef u8 phi_t;
+#endif /* CONFIG_PGHOT_PRECISE */
#define PGHOT_RECORD_SIZE sizeof(phi_t)
diff --git a/mm/Kconfig b/mm/Kconfig
index f4f0147faac5..fde5aee3e16f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1478,6 +1478,17 @@ config PGHOT
This adds 1 byte of metadata overhead per page in lower-tier
memory nodes.
+config PGHOT_PRECISE
+	bool "Hot page tracking precision mode"
+	default n
+	depends on PGHOT
+	help
+	  Enables precision mode for tracking hot pages with pghot sub-system.
+	  Adds fine-grained access time tracking and explicit toptier target
+	  NID tracking. Precise hot page tracking comes at the cost of using
+	  4 bytes per page against the default one byte per page. Preferable
+	  to enable this on systems with multiple nodes in toptier.
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 655a27f3a215..89f999647752 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -147,4 +147,9 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
-obj-$(CONFIG_PGHOT) += pghot.o pghot-tunables.o pghot-default.o
+obj-$(CONFIG_PGHOT) += pghot.o pghot-tunables.o
+ifdef CONFIG_PGHOT_PRECISE
+obj-$(CONFIG_PGHOT) += pghot-precise.o
+else
+obj-$(CONFIG_PGHOT) += pghot-default.o
+endif
diff --git a/mm/pghot-precise.c b/mm/pghot-precise.c
new file mode 100644
index 000000000000..d8d4f15b3f9f
--- /dev/null
+++ b/mm/pghot-precise.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * pghot: Precision mode
+ *
+ * 4 byte hotness record per PFN (u32)
+ * NID, time and frequency tracked as part of the record.
+ */
+
+#include <linux/pghot.h>
+#include <linux/jiffies.h>
+
+/**
+ * pghot_access_latency - Time elapsed between two recorded access times
+ * @old_time: Earlier access time in jiffies
+ * @time: Later access time in jiffies
+ *
+ * The difference is masked with PGHOT_TIME_MASK, so the result is correct
+ * for timestamps truncated to PGHOT_TIME_WIDTH bits as long as they are
+ * less than one wrap-around apart.
+ *
+ * Return: The elapsed time in milliseconds.
+ */
+unsigned long pghot_access_latency(unsigned long old_time, unsigned long time)
+{
+	return jiffies_to_msecs((time - old_time) & PGHOT_TIME_MASK);
+}
+
+/**
+ * pghot_update_record - Record an access in a page's hotness record
+ * @phi: Hotness record of the page
+ * @nid: Target NID to store in the record; NUMA_NO_NODE selects
+ *       pghot_target_nid
+ * @now: Access time in jiffies
+ *
+ * The frequency restarts at 1 when the previous access is older than
+ * sysctl_pghot_freq_window, otherwise it is incremented, saturating at
+ * PGHOT_FREQ_MAX. PGHOT_MIGRATE_READY is set once the frequency reaches
+ * pghot_freq_threshold. The bit is never cleared here, so a record that
+ * is already marked ready stays marked.
+ *
+ * The record is updated with try_cmpxchg() and the update is retried if
+ * the record changed in the meantime.
+ *
+ * Return: true if the record is marked ready for migration.
+ */
+bool pghot_update_record(phi_t *phi, int nid, unsigned long now)
+{
+	phi_t freq, old_freq, hotness, old_hotness, old_time;
+	phi_t time = now & PGHOT_TIME_MASK;
+
+	/* Loop-invariant, so resolve it before the cmpxchg retry loop. */
+	if (nid == NUMA_NO_NODE)
+		nid = pghot_target_nid;
+
+	old_hotness = READ_ONCE(*phi);
+	do {
+		hotness = old_hotness;
+		old_freq = (hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK;
+		old_time = (hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK;
+
+		if (pghot_access_latency(old_time, time) > sysctl_pghot_freq_window)
+			freq = 1;	/* new window */
+		else if (old_freq < PGHOT_FREQ_MAX)
+			freq = old_freq + 1;
+		else
+			freq = old_freq;
+
+		hotness &= ~(PGHOT_NID_MASK << PGHOT_NID_SHIFT);
+		hotness &= ~(PGHOT_FREQ_MASK << PGHOT_FREQ_SHIFT);
+		hotness &= ~(PGHOT_TIME_MASK << PGHOT_TIME_SHIFT);
+
+		hotness |= (nid & PGHOT_NID_MASK) << PGHOT_NID_SHIFT;
+		hotness |= (freq & PGHOT_FREQ_MASK) << PGHOT_FREQ_SHIFT;
+		hotness |= (time & PGHOT_TIME_MASK) << PGHOT_TIME_SHIFT;
+
+		if (freq >= pghot_freq_threshold)
+			hotness |= BIT(PGHOT_MIGRATE_READY);
+	} while (unlikely(!try_cmpxchg(phi, &old_hotness, hotness)));
+	return !!(hotness & BIT(PGHOT_MIGRATE_READY));
+}
+
+/**
+ * pghot_get_record - Fetch and reset a migration-ready hotness record
+ * @phi: Hotness record of the page
+ * @nid: Filled with the NID stored in the record
+ * @freq: Filled with the access frequency stored in the record
+ * @time: Filled with the masked access time stored in the record
+ *
+ * The record is reset to 0 with try_cmpxchg() before its fields are
+ * returned. The output arguments are not written on failure.
+ *
+ * Return: 0 on success, -EINVAL if the record is not marked ready for
+ * migration.
+ */
+int pghot_get_record(phi_t *phi, int *nid, int *freq, unsigned long *time)
+{
+	phi_t old_hotness, hotness = 0;
+
+	old_hotness = READ_ONCE(*phi);
+	do {
+		if (!(old_hotness & BIT(PGHOT_MIGRATE_READY)))
+			return -EINVAL;
+	} while (unlikely(!try_cmpxchg(phi, &old_hotness, hotness)));
+
+	*nid = (old_hotness >> PGHOT_NID_SHIFT) & PGHOT_NID_MASK;
+	*freq = (old_hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK;
+	*time = (old_hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK;
+	return 0;
+}
diff --git a/mm/pghot.c b/mm/pghot.c
index 95b5012d5b99..bf1d9029cbaa 100644
--- a/mm/pghot.c
+++ b/mm/pghot.c
@@ -10,6 +10,9 @@
* the frequency of access and last access time. Promotions are done
* to a default toptier NID.
*
+ * In the precision mode, 4 bytes are used to store the frequency
+ * of access, last access time and the accessing NID.
+ *
* A kernel thread named kmigrated is provided to migrate or promote
* the hot pages. kmigrated runs for each lower tier node. It iterates
* over the node's PFNs and migrates pages marked for migration into
@@ -52,13 +55,15 @@ static bool kmigrated_started __ro_after_init;
* for the purpose of tracking page hotness and subsequent promotion.
*
* @pfn: PFN of the page
- * @nid: Unused
+ * @nid: Target NID to which the page should be migrated (precision mode
+ *       only; unused in default mode)
* @src: The identifier of the sub-system that reports the access
* @now: Access time in jiffies
*
- * Updates the frequency and time of access and marks the page as
- * ready for migration if the frequency crosses a threshold. The pages
- * marked for migration are migrated by kmigrated kernel thread.
+ * Updates the NID (in precision mode only), frequency and time of access
+ * and marks the page as ready for migration if the frequency crosses a
+ * threshold. The pages marked for migration are migrated by kmigrated
+ * kernel thread.
*
* Return: 0 on success and -EINVAL on failure to record the access.
*/
--
2.34.1
Powered by blists - more mailing lists