lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260129144043.231636-5-bharata@amd.com>
Date: Thu, 29 Jan 2026 20:10:37 +0530
From: Bharata B Rao <bharata@....com>
To: <linux-kernel@...r.kernel.org>, <linux-mm@...ck.org>
CC: <Jonathan.Cameron@...wei.com>, <dave.hansen@...el.com>,
	<gourry@...rry.net>, <mgorman@...hsingularity.net>, <mingo@...hat.com>,
	<peterz@...radead.org>, <raghavendra.kt@....com>, <riel@...riel.com>,
	<rientjes@...gle.com>, <sj@...nel.org>, <weixugc@...gle.com>,
	<willy@...radead.org>, <ying.huang@...ux.alibaba.com>, <ziy@...dia.com>,
	<dave@...olabs.net>, <nifan.cxl@...il.com>, <xuezhengchu@...wei.com>,
	<yiannis@...corp.com>, <akpm@...ux-foundation.org>, <david@...hat.com>,
	<byungchul@...com>, <kinseyho@...gle.com>, <joshua.hahnjy@...il.com>,
	<yuanchu@...gle.com>, <balbirs@...dia.com>, <alok.rathore@...sung.com>,
	<shivankg@....com>, Bharata B Rao <bharata@....com>
Subject: [RFC PATCH v5 04/10] mm: pghot: Precision mode for pghot

By default, one byte per PFN is used to store hotness information.
Only a limited number of bits is available for the access time,
which leads to coarse-grained time tracking. There also aren't
enough bits to track the toptier NID explicitly, so the default
target_nid is used for promotion.

Precision mode relaxes these limitations by storing the hotness
information in 4 bytes per PFN. Finer-grained access time tracking
and explicit toptier NID tracking become possible in this mode.

This is typically useful when the toptier consists of more than one node.

Signed-off-by: Bharata B Rao <bharata@....com>
---
 Documentation/admin-guide/mm/pghot.txt |  4 +-
 include/linux/mmzone.h                 |  2 +-
 include/linux/pghot.h                  | 31 ++++++++++++
 mm/Kconfig                             | 11 ++++
 mm/Makefile                            |  7 ++-
 mm/pghot-precise.c                     | 70 ++++++++++++++++++++++++++
 mm/pghot.c                             | 13 +++--
 7 files changed, 130 insertions(+), 8 deletions(-)
 create mode 100644 mm/pghot-precise.c

diff --git a/Documentation/admin-guide/mm/pghot.txt b/Documentation/admin-guide/mm/pghot.txt
index 01291b72e7ab..b329e692ef89 100644
--- a/Documentation/admin-guide/mm/pghot.txt
+++ b/Documentation/admin-guide/mm/pghot.txt
@@ -38,7 +38,7 @@ Path: /sys/kernel/debug/pghot/
 
 3. **freq_threshold**
    - Minimum access frequency before a page is marked ready for promotion.
-   - Range: 1 to 3
+   - Range: 1 to 3 in default mode, 1 to 7 in precision mode.
    - Default: 2
    - Example:
      # echo 3 > /sys/kernel/debug/pghot/freq_threshold
@@ -60,7 +60,7 @@ Path: /proc/sys/vm/pghot_promote_freq_window_ms
 - Controls the time window (in ms) for counting access frequency. A page is
   considered hot only when **freq_threshold** number of accesses occur with
   this time period.
-- Default: 4000 (4 seconds)
+- Default: 4000 (4 seconds) in default mode and 5000 (5 seconds) in precision mode.
 - Example:
   # sysctl vm.pghot_promote_freq_window_ms=3000
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 22e08befb096..49c374064fc2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1924,7 +1924,7 @@ struct mem_section {
 #ifdef CONFIG_PGHOT
 	/*
 	 * Per-PFN hotness data for this section.
-	 * Array of phi_t (u8 in default mode).
+	 * Array of phi_t (u8 in default mode, u32 in precision mode).
 	 * LSB is used as PGHOT_SECTION_HOT_BIT flag.
 	 */
 	void *hot_map;
diff --git a/include/linux/pghot.h b/include/linux/pghot.h
index 88e57aab697b..d3d59b0c0cf6 100644
--- a/include/linux/pghot.h
+++ b/include/linux/pghot.h
@@ -48,6 +48,36 @@ enum pghot_src_enabled {
 
 #define PGHOT_DEFAULT_NODE		0
 
+#if defined(CONFIG_PGHOT_PRECISE)
+#define PGHOT_DEFAULT_FREQ_WINDOW	(5 * MSEC_PER_SEC)
+
+/*
+ * Bits 0-26 are used to store nid, frequency and time.
+ * Bits 27-30 are unused now.
+ * Bit 31 is used to indicate the page is ready for migration.
+ */
+#define PGHOT_MIGRATE_READY		31
+
+#define PGHOT_NID_WIDTH			10
+#define PGHOT_FREQ_WIDTH		3
+/* time is stored in 14 bits which can represent up to 16s with HZ=1000 */
+#define PGHOT_TIME_WIDTH		14
+
+#define PGHOT_NID_SHIFT			0
+#define PGHOT_FREQ_SHIFT		(PGHOT_NID_SHIFT + PGHOT_NID_WIDTH)
+#define PGHOT_TIME_SHIFT		(PGHOT_FREQ_SHIFT + PGHOT_FREQ_WIDTH)
+
+#define PGHOT_NID_MASK			GENMASK(PGHOT_NID_WIDTH - 1, 0)
+#define PGHOT_FREQ_MASK			GENMASK(PGHOT_FREQ_WIDTH - 1, 0)
+#define PGHOT_TIME_MASK			GENMASK(PGHOT_TIME_WIDTH - 1, 0)
+
+#define PGHOT_NID_MAX			((1 << PGHOT_NID_WIDTH) - 1)
+#define PGHOT_FREQ_MAX			((1 << PGHOT_FREQ_WIDTH) - 1)
+#define PGHOT_TIME_MAX			((1 << PGHOT_TIME_WIDTH) - 1)
+
+typedef u32 phi_t;
+
+#else	/* !CONFIG_PGHOT_PRECISE */
 #define PGHOT_DEFAULT_FREQ_WINDOW	(4 * MSEC_PER_SEC)
 
 /*
@@ -74,6 +104,7 @@ enum pghot_src_enabled {
 #define PGHOT_TIME_MAX			((1 << PGHOT_TIME_WIDTH) - 1)
 
 typedef u8 phi_t;
+#endif /* CONFIG_PGHOT_PRECISE */
 
 #define PGHOT_RECORD_SIZE		sizeof(phi_t)
 
diff --git a/mm/Kconfig b/mm/Kconfig
index f4f0147faac5..fde5aee3e16f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1478,6 +1478,17 @@ config PGHOT
 	  This adds 1 byte of metadata overhead per page in lower-tier
 	  memory nodes.
 
+config PGHOT_PRECISE
+	bool "Hot page tracking precision mode"
+	def_bool n
+	depends on PGHOT
+	help
+	  Enables precision mode for tracking hot pages with the pghot
+	  sub-system. Adds fine-grained access time tracking and explicit
+	  toptier target NID tracking. This comes at the cost of using
+	  4 bytes per page instead of the default one byte per page.
+	  Recommended on systems with multiple nodes in the toptier.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 655a27f3a215..89f999647752 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -147,4 +147,9 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
 obj-$(CONFIG_EXECMEM) += execmem.o
 obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
 obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
-obj-$(CONFIG_PGHOT) += pghot.o pghot-tunables.o pghot-default.o
+obj-$(CONFIG_PGHOT) += pghot.o pghot-tunables.o
+ifdef CONFIG_PGHOT_PRECISE
+obj-$(CONFIG_PGHOT) += pghot-precise.o
+else
+obj-$(CONFIG_PGHOT) += pghot-default.o
+endif
diff --git a/mm/pghot-precise.c b/mm/pghot-precise.c
new file mode 100644
index 000000000000..d8d4f15b3f9f
--- /dev/null
+++ b/mm/pghot-precise.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * pghot: Precision mode
+ *
+ * 4 byte hotness record per PFN (u32)
+ * NID, time and frequency tracked as part of the record.
+ */
+
+#include <linux/pghot.h>
+#include <linux/jiffies.h>
+
+unsigned long pghot_access_latency(unsigned long old_time, unsigned long time)
+{
+	return jiffies_to_msecs((time - old_time) & PGHOT_TIME_MASK);
+}
+
+bool pghot_update_record(phi_t *phi, int nid, unsigned long now)
+{
+	phi_t freq, old_freq, hotness, old_hotness, old_time, old_nid;
+	phi_t time = now & PGHOT_TIME_MASK;
+
+	old_hotness = READ_ONCE(*phi);
+	do {
+		bool new_window = false;
+
+		hotness = old_hotness;
+		old_nid = (hotness >> PGHOT_NID_SHIFT) & PGHOT_NID_MASK;
+		old_freq = (hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK;
+		old_time = (hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK;
+
+		if (pghot_access_latency(old_time, time) > sysctl_pghot_freq_window)
+			new_window = true;
+
+		if (new_window)
+			freq = 1;
+		else if (old_freq < PGHOT_FREQ_MAX)
+			freq = old_freq + 1;
+		else
+			freq = old_freq;
+		nid = (nid == NUMA_NO_NODE) ? pghot_target_nid : nid;
+
+		hotness &= ~(PGHOT_NID_MASK << PGHOT_NID_SHIFT);
+		hotness &= ~(PGHOT_FREQ_MASK << PGHOT_FREQ_SHIFT);
+		hotness &= ~(PGHOT_TIME_MASK << PGHOT_TIME_SHIFT);
+
+		hotness |= (nid & PGHOT_NID_MASK) << PGHOT_NID_SHIFT;
+		hotness |= (freq & PGHOT_FREQ_MASK) << PGHOT_FREQ_SHIFT;
+		hotness |= (time & PGHOT_TIME_MASK) << PGHOT_TIME_SHIFT;
+
+		if (freq >= pghot_freq_threshold)
+			hotness |= BIT(PGHOT_MIGRATE_READY);
+	} while (unlikely(!try_cmpxchg(phi, &old_hotness, hotness)));
+	return !!(hotness & BIT(PGHOT_MIGRATE_READY));
+}
+
+int pghot_get_record(phi_t *phi, int *nid, int *freq, unsigned long *time)
+{
+	phi_t old_hotness, hotness = 0;
+
+	old_hotness = READ_ONCE(*phi);
+	do {
+		if (!(old_hotness & BIT(PGHOT_MIGRATE_READY)))
+			return -EINVAL;
+	} while (unlikely(!try_cmpxchg(phi, &old_hotness, hotness)));
+
+	*nid = (old_hotness >> PGHOT_NID_SHIFT) & PGHOT_NID_MASK;
+	*freq = (old_hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK;
+	*time = (old_hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK;
+	return 0;
+}
diff --git a/mm/pghot.c b/mm/pghot.c
index 95b5012d5b99..bf1d9029cbaa 100644
--- a/mm/pghot.c
+++ b/mm/pghot.c
@@ -10,6 +10,9 @@
  * the frequency of access and last access time. Promotions are done
  * to a default toptier NID.
  *
+ * In the precision mode, 4 bytes are used to store the frequency
+ * of access, last access time and the accessing NID.
+ *
  * A kernel thread named kmigrated is provided to migrate or promote
  * the hot pages. kmigrated runs for each lower tier node. It iterates
  * over the node's PFNs and  migrates pages marked for migration into
@@ -52,13 +55,15 @@ static bool kmigrated_started __ro_after_init;
  * for the purpose of tracking page hotness and subsequent promotion.
  *
  * @pfn: PFN of the page
- * @nid: Unused
+ * @nid: Target NID to which the page needs to be migrated in precision
+ *       mode; unused in default mode
  * @src: The identifier of the sub-system that reports the access
  * @now: Access time in jiffies
  *
- * Updates the frequency and time of access and marks the page as
- * ready for migration if the frequency crosses a threshold. The pages
- * marked for migration are migrated by kmigrated kernel thread.
+ * Updates the NID (in precision mode only), frequency and time of access
+ * and marks the page as ready for migration if the frequency crosses a
+ * threshold. The pages marked for migration are migrated by kmigrated
+ * kernel thread.
  *
  * Return: 0 on success and -EINVAL on failure to record the access.
  */
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ