Message-ID: <20250306054532.221138-3-bharata@amd.com>
Date: Thu, 6 Mar 2025 11:15:30 +0530
From: Bharata B Rao <bharata@....com>
To: <linux-kernel@...r.kernel.org>, <linux-mm@...ck.org>
CC: <AneeshKumar.KizhakeVeetil@....com>, <Hasan.Maruf@....com>,
<Jonathan.Cameron@...wei.com>, <Michael.Day@....com>,
<akpm@...ux-foundation.org>, <dave.hansen@...el.com>, <david@...hat.com>,
<feng.tang@...el.com>, <gourry@...rry.net>, <hannes@...xchg.org>,
<honggyu.kim@...com>, <hughd@...gle.com>, <jhubbard@...dia.com>,
<k.shutemov@...il.com>, <kbusch@...a.com>, <kmanaouil.dev@...il.com>,
<leesuyeon0506@...il.com>, <leillc@...gle.com>, <liam.howlett@...cle.com>,
<mgorman@...hsingularity.net>, <mingo@...hat.com>, <nadav.amit@...il.com>,
<nphamcs@...il.com>, <peterz@...radead.org>, <raghavendra.kt@....com>,
<riel@...riel.com>, <rientjes@...gle.com>, <rppt@...nel.org>,
<shivankg@....com>, <shy828301@...il.com>, <sj@...nel.org>, <vbabka@...e.cz>,
<weixugc@...gle.com>, <willy@...radead.org>, <ying.huang@...ux.alibaba.com>,
<ziy@...dia.com>, <dave@...olabs.net>, <yuanchu@...gle.com>,
<hyeonggon.yoo@...com>, Bharata B Rao <bharata@....com>
Subject: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
kpromoted is a kernel daemon that accumulates hot page info
from different sources and tries to promote pages from slower
tiers to the top tier. One instance of this thread runs on each
node that has CPUs.
Subsystems that generate hot page access info can report that
to kpromoted via this API:
int kpromoted_record_access(u64 pfn, int nid, int src,
			    unsigned long time)

@pfn: The PFN of the memory accessed
@nid: The accessing NUMA node ID
@src: The temperature source (subsystem) that generated the access info
@time: The access time in jiffies
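
As an illustration (not part of this patch), a temperature source
such as a hypothetical hardware-hint driver would report an access
roughly like this, timestamping it with jiffies:

	/*
	 * Hypothetical caller sketch: report one access to @pfn
	 * observed from node @nid via the hardware-hints source.
	 */
	static void report_hot_page(u64 pfn, int nid)
	{
		kpromoted_record_access(pfn, nid, KPROMOTED_HW_HINTS,
					jiffies);
	}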
Some temperature sources may not provide the nid from which
the page was accessed. This is true for sources that scan
page tables for the PTE Accessed bit. Currently, the toptier
node to which such pages should be promoted is hard-coded.
Also, the access time provided by some sources may at best be
considered approximate. This is especially true for hot pages
detected by PTE A bit scanning.
kpromoted currently maintains the hot PFN records in hash lists
hashed by PFN value. Each record stores the following info:
struct page_hotness_info {
	unsigned long pfn;

	/* Time when this record was updated last */
	unsigned long last_update;

	/*
	 * Number of times this page was accessed in the
	 * current window
	 */
	int frequency;

	/* Most recent access time */
	unsigned long recency;

	/* Most recent access from this node */
	int hot_node;
	struct hlist_node hnode;
};
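
In short, a record and its per-bucket lock are found by hashing the
PFN (see kpromoted_record_access() in this patch):

	int bkt = hash_min(pfn, KPROMOTED_HASH_ORDER);

	mutex_lock(&page_hotness_lock[bkt]);
	phi = kpromoted_lookup(pfn, bkt, now);
	/* ... update frequency, recency and hot_node ... */
	mutex_unlock(&page_hotness_lock[bkt]);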
The heuristic that decides whether a page is hot enough to be
promoted is pretty primitive for now.
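
Concretely, with the current defaults a page qualifies only if its
folio is on the LRU and not already resident on the hot node, and
additionally (see page_should_be_promoted()):

	/* Accessed at least twice within the current 5s window... */
	hot = (phi->frequency >= KPROMOTED_FREQ_THRESHOLD) &&
	      /* ...and the record was refreshed within two windows */
	      ((now - phi->last_update) <=
	       2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW));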
Signed-off-by: Bharata B Rao <bharata@....com>
---
include/linux/kpromoted.h | 54 ++++++
include/linux/mmzone.h | 4 +
include/linux/vm_event_item.h | 13 ++
mm/Kconfig | 7 +
mm/Makefile | 1 +
mm/kpromoted.c | 305 ++++++++++++++++++++++++++++++++++
mm/mm_init.c | 10 ++
mm/vmstat.c | 13 ++
8 files changed, 407 insertions(+)
create mode 100644 include/linux/kpromoted.h
create mode 100644 mm/kpromoted.c
diff --git a/include/linux/kpromoted.h b/include/linux/kpromoted.h
new file mode 100644
index 000000000000..2bef3d74f03a
--- /dev/null
+++ b/include/linux/kpromoted.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_KPROMOTED_H
+#define _LINUX_KPROMOTED_H
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/workqueue_types.h>
+
+/* Page hotness temperature sources */
+enum kpromoted_src {
+ KPROMOTED_HW_HINTS,
+ KPROMOTED_PGTABLE_SCAN,
+};
+
+#ifdef CONFIG_KPROMOTED
+
+#define KPROMOTED_FREQ_WINDOW (5 * MSEC_PER_SEC)
+
+/* 2 accesses within a window will make the page a promotion candidate */
+#define KPROMOTED_FREQ_THRESHOLD 2
+
+#define KPROMOTED_HASH_ORDER 16
+
+struct page_hotness_info {
+ unsigned long pfn;
+
+ /* Time when this record was updated last */
+ unsigned long last_update;
+
+ /*
+ * Number of times this page was accessed in the
+ * current window
+ */
+ int frequency;
+
+ /* Most recent access time */
+ unsigned long recency;
+
+ /* Most recent access from this node */
+ int hot_node;
+ struct hlist_node hnode;
+};
+
+#define KPROMOTE_DELAY MSEC_PER_SEC
+
+int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now);
+#else
+static inline int kpromoted_record_access(u64 pfn, int nid, int src,
+ unsigned long now)
+{
+ return 0;
+}
+#endif /* CONFIG_KPROMOTED */
+#endif /* _LINUX_KPROMOTED_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9540b41894da..a5c4e789aa55 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1459,6 +1459,10 @@ typedef struct pglist_data {
#ifdef CONFIG_MEMORY_FAILURE
struct memory_failure_stats mf_stats;
#endif
+#ifdef CONFIG_KPROMOTED
+ struct task_struct *kpromoted;
+ wait_queue_head_t kpromoted_wait;
+#endif
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index f70d0958095c..b5823b037883 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -182,6 +182,19 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
KSTACK_REST,
#endif
#endif /* CONFIG_DEBUG_STACK_USAGE */
+ KPROMOTED_RECORDED_ACCESSES,
+ KPROMOTED_RECORD_HWHINTS,
+ KPROMOTED_RECORD_PGTSCANS,
+ KPROMOTED_RECORD_TOPTIER,
+ KPROMOTED_RECORD_ADDED,
+ KPROMOTED_RECORD_EXISTS,
+ KPROMOTED_MIG_RIGHT_NODE,
+ KPROMOTED_MIG_NON_LRU,
+ KPROMOTED_MIG_COLD_OLD,
+ KPROMOTED_MIG_COLD_NOT_ACCESSED,
+ KPROMOTED_MIG_CANDIDATE,
+ KPROMOTED_MIG_PROMOTED,
+ KPROMOTED_MIG_DROPPED,
NR_VM_EVENT_ITEMS
};
diff --git a/mm/Kconfig b/mm/Kconfig
index 1b501db06417..ceaa462a0ce6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1358,6 +1358,13 @@ config PT_RECLAIM
Note: now only empty user PTE page table pages will be reclaimed.
+config KPROMOTED
+ bool "Kernel hot page promotion daemon"
+	default y
+ depends on NUMA && MIGRATION && MMU
+ help
+	  Promote hot pages from lower tiers to the top tier using the
+	  memory access information provided by various sources.
source "mm/damon/Kconfig"
diff --git a/mm/Makefile b/mm/Makefile
index 850386a67b3e..bf4f5f18f1f9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -147,3 +147,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
+obj-$(CONFIG_KPROMOTED) += kpromoted.o
diff --git a/mm/kpromoted.c b/mm/kpromoted.c
new file mode 100644
index 000000000000..2a8b8495b6b3
--- /dev/null
+++ b/mm/kpromoted.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kpromoted is a kernel thread that runs on each node that has CPUs,
+ * i.e., on regular nodes.
+ *
+ * Maintains list of hot pages from lower tiers and promotes them.
+ */
+#include <linux/kpromoted.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/mmzone.h>
+#include <linux/migrate.h>
+#include <linux/memory-tiers.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/cpuhotplug.h>
+#include <linux/hashtable.h>
+
+static DEFINE_HASHTABLE(page_hotness_hash, KPROMOTED_HASH_ORDER);
+static struct mutex page_hotness_lock[1UL << KPROMOTED_HASH_ORDER];
+
+static int kpromote_page(struct page_hotness_info *phi)
+{
+ struct page *page = pfn_to_page(phi->pfn);
+ struct folio *folio;
+ int ret;
+
+ if (!page)
+ return 1;
+
+ folio = page_folio(page);
+ ret = migrate_misplaced_folio_prepare(folio, NULL, phi->hot_node);
+ if (ret)
+ return 1;
+
+ return migrate_misplaced_folio(folio, phi->hot_node);
+}
+
+static bool page_should_be_promoted(struct page_hotness_info *phi)
+{
+ struct page *page = pfn_to_online_page(phi->pfn);
+ unsigned long now = jiffies;
+ struct folio *folio;
+
+ if (!page || is_zone_device_page(page))
+ return false;
+
+ folio = page_folio(page);
+ if (!folio_test_lru(folio)) {
+ count_vm_event(KPROMOTED_MIG_NON_LRU);
+ return false;
+ }
+ if (folio_nid(folio) == phi->hot_node) {
+ count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
+ return false;
+ }
+
+ /* If the page was hot a while ago, don't promote */
+ if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
+ count_vm_event(KPROMOTED_MIG_COLD_OLD);
+ return false;
+ }
+
+	/* If the page hasn't been accessed enough times, don't promote */
+	if (phi->frequency < KPROMOTED_FREQ_THRESHOLD) {
+ count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Go through the page hotness information and migrate pages if required.
+ *
+ * Promoted pages are no longer tracked in the hot list.
+ * Cold pages are pruned from the list as well.
+ *
+ * TODO: Batching could be done
+ */
+static void kpromoted_migrate(pg_data_t *pgdat)
+{
+ int nid = pgdat->node_id;
+ struct page_hotness_info *phi;
+ struct hlist_node *tmp;
+ int nr_bkts = HASH_SIZE(page_hotness_hash);
+ int bkt;
+
+ for (bkt = 0; bkt < nr_bkts; bkt++) {
+ mutex_lock(&page_hotness_lock[bkt]);
+ hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], hnode) {
+ if (phi->hot_node != nid)
+ continue;
+
+ if (page_should_be_promoted(phi)) {
+ count_vm_event(KPROMOTED_MIG_CANDIDATE);
+ if (!kpromote_page(phi)) {
+ count_vm_event(KPROMOTED_MIG_PROMOTED);
+ hlist_del_init(&phi->hnode);
+ kfree(phi);
+ }
+ } else {
+ /*
+ * Not a suitable page or cold page, stop tracking it.
+ * TODO: Identify cold pages and drive demotion?
+ */
+ count_vm_event(KPROMOTED_MIG_DROPPED);
+ hlist_del_init(&phi->hnode);
+ kfree(phi);
+ }
+ }
+ mutex_unlock(&page_hotness_lock[bkt]);
+ }
+}
+
+static struct page_hotness_info *__kpromoted_lookup(unsigned long pfn, int bkt)
+{
+ struct page_hotness_info *phi;
+
+ hlist_for_each_entry(phi, &page_hotness_hash[bkt], hnode) {
+ if (phi->pfn == pfn)
+ return phi;
+ }
+ return NULL;
+}
+
+static struct page_hotness_info *kpromoted_lookup(unsigned long pfn, int bkt, unsigned long now)
+{
+ struct page_hotness_info *phi;
+
+ phi = __kpromoted_lookup(pfn, bkt);
+ if (!phi) {
+ phi = kzalloc(sizeof(struct page_hotness_info), GFP_KERNEL);
+ if (!phi)
+ return ERR_PTR(-ENOMEM);
+
+ phi->pfn = pfn;
+ phi->frequency = 1;
+ phi->last_update = now;
+ phi->recency = now;
+ hlist_add_head(&phi->hnode, &page_hotness_hash[bkt]);
+ count_vm_event(KPROMOTED_RECORD_ADDED);
+ } else {
+ count_vm_event(KPROMOTED_RECORD_EXISTS);
+ }
+ return phi;
+}
+
+/*
+ * Called by subsystems that generate page hotness/access information.
+ *
+ * Records the memory access info for further action by kpromoted.
+ */
+int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now)
+{
+ struct page_hotness_info *phi;
+ struct page *page;
+ struct folio *folio;
+	int ret = 0, bkt;
+
+ count_vm_event(KPROMOTED_RECORDED_ACCESSES);
+
+ switch (src) {
+ case KPROMOTED_HW_HINTS:
+ count_vm_event(KPROMOTED_RECORD_HWHINTS);
+ break;
+ case KPROMOTED_PGTABLE_SCAN:
+ count_vm_event(KPROMOTED_RECORD_PGTSCANS);
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * Record only accesses from lower tiers.
+ * Assuming node having CPUs as toptier for now.
+ */
+ if (node_is_toptier(pfn_to_nid(pfn))) {
+ count_vm_event(KPROMOTED_RECORD_TOPTIER);
+ return 0;
+ }
+
+ page = pfn_to_online_page(pfn);
+ if (!page || is_zone_device_page(page))
+ return 0;
+
+ folio = page_folio(page);
+ if (!folio_test_lru(folio))
+ return 0;
+
+ bkt = hash_min(pfn, KPROMOTED_HASH_ORDER);
+ mutex_lock(&page_hotness_lock[bkt]);
+ phi = kpromoted_lookup(pfn, bkt, now);
+	if (IS_ERR(phi)) {
+ ret = PTR_ERR(phi);
+ goto out;
+ }
+
+	if ((now - phi->last_update) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
+ /* New window */
+ phi->frequency = 1; /* TODO: Factor in the history */
+ phi->last_update = now;
+ } else {
+ phi->frequency++;
+ }
+ phi->recency = now;
+
+ /*
+ * TODOs:
+ * 1. Source nid is hard-coded for some temperature sources
+ * 2. Take action if hot_node changes - may be a shared page?
+ * 3. Maintain node info for every access within the window?
+ */
+	phi->hot_node = (nid == NUMA_NO_NODE) ? 1 : nid;
+out:
+	mutex_unlock(&page_hotness_lock[bkt]);
+	return ret;
+}
+
+/*
+ * Go through the accumulated mem_access_info and migrate
+ * pages if required.
+ */
+static void kpromoted_do_work(pg_data_t *pgdat)
+{
+ kpromoted_migrate(pgdat);
+}
+
+static inline bool kpromoted_work_requested(pg_data_t *pgdat)
+{
+ return false;
+}
+
+static int kpromoted(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t *)p;
+ struct task_struct *tsk = current;
+ long timeout = msecs_to_jiffies(KPROMOTE_DELAY);
+
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+
+ while (!kthread_should_stop()) {
+ wait_event_timeout(pgdat->kpromoted_wait,
+ kpromoted_work_requested(pgdat), timeout);
+ kpromoted_do_work(pgdat);
+ }
+ return 0;
+}
+
+static void kpromoted_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ if (pgdat->kpromoted)
+ return;
+
+ pgdat->kpromoted = kthread_run(kpromoted, pgdat, "kpromoted%d", nid);
+ if (IS_ERR(pgdat->kpromoted)) {
+ pr_err("Failed to start kpromoted on node %d\n", nid);
+ pgdat->kpromoted = NULL;
+ }
+}
+
+static int kpromoted_cpu_online(unsigned int cpu)
+{
+ int nid;
+
+ for_each_node_state(nid, N_CPU) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
+
+ mask = cpumask_of_node(pgdat->node_id);
+
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ if (pgdat->kpromoted)
+ set_cpus_allowed_ptr(pgdat->kpromoted, mask);
+ }
+ return 0;
+}
+
+static int __init kpromoted_init(void)
+{
+ int nid, ret, i;
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "mm/promotion:online",
+ kpromoted_cpu_online, NULL);
+ if (ret < 0) {
+ pr_err("kpromoted: failed to register hotplug callbacks.\n");
+ return ret;
+ }
+
+ for (i = 0; i < (1UL << KPROMOTED_HASH_ORDER); i++)
+ mutex_init(&page_hotness_lock[i]);
+
+ for_each_node_state(nid, N_CPU)
+ kpromoted_run(nid);
+
+ return 0;
+}
+
+subsys_initcall(kpromoted_init);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2630cc30147e..d212df24f89b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1362,6 +1362,15 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat)
static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
#endif
+#ifdef CONFIG_KPROMOTED
+static void pgdat_init_kpromoted(struct pglist_data *pgdat)
+{
+ init_waitqueue_head(&pgdat->kpromoted_wait);
+}
+#else
+static void pgdat_init_kpromoted(struct pglist_data *pgdat) {}
+#endif
+
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
int i;
@@ -1371,6 +1380,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
pgdat_init_split_queue(pgdat);
pgdat_init_kcompactd(pgdat);
+ pgdat_init_kpromoted(pgdat);
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 16bfe1c694dd..618f44bae5c8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1466,6 +1466,19 @@ const char * const vmstat_text[] = {
"kstack_rest",
#endif
#endif
+ "kpromoted_recorded_accesses",
+ "kpromoted_recorded_hwhints",
+ "kpromoted_recorded_pgtscans",
+ "kpromoted_record_toptier",
+ "kpromoted_record_added",
+ "kpromoted_record_exists",
+ "kpromoted_mig_right_node",
+ "kpromoted_mig_non_lru",
+ "kpromoted_mig_cold_old",
+ "kpromoted_mig_cold_not_accessed",
+ "kpromoted_mig_candidate",
+ "kpromoted_mig_promoted",
+ "kpromoted_mig_dropped",
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
--
2.34.1