Message-ID: <c9c7e0da-115e-4f81-be81-193431fe838e@nvidia.com>
Date: Mon, 24 Mar 2025 14:35:44 +1100
From: Balbir Singh <balbirs@...dia.com>
To: Bharata B Rao <bharata@....com>, linux-kernel@...r.kernel.org,
linux-mm@...ck.org
Cc: AneeshKumar.KizhakeVeetil@....com, Hasan.Maruf@....com,
Jonathan.Cameron@...wei.com, Michael.Day@....com, akpm@...ux-foundation.org,
dave.hansen@...el.com, david@...hat.com, feng.tang@...el.com,
gourry@...rry.net, hannes@...xchg.org, honggyu.kim@...com, hughd@...gle.com,
jhubbard@...dia.com, k.shutemov@...il.com, kbusch@...a.com,
kmanaouil.dev@...il.com, leesuyeon0506@...il.com, leillc@...gle.com,
liam.howlett@...cle.com, mgorman@...hsingularity.net, mingo@...hat.com,
nadav.amit@...il.com, nphamcs@...il.com, peterz@...radead.org,
raghavendra.kt@....com, riel@...riel.com, rientjes@...gle.com,
rppt@...nel.org, shivankg@....com, shy828301@...il.com, sj@...nel.org,
vbabka@...e.cz, weixugc@...gle.com, willy@...radead.org,
ying.huang@...ux.alibaba.com, ziy@...dia.com, dave@...olabs.net,
yuanchu@...gle.com, hyeonggon.yoo@...com
Subject: Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and
promotion daemon
On 3/6/25 16:45, Bharata B Rao wrote:
> kpromoted is a kernel daemon that accumulates hot page info
> from different sources and tries to promote pages from slow
> tiers to top tiers. One instance of this thread runs on each
> node that has CPUs.
>
Could you please elaborate on what is meant by slow vs top tier here? A top
tier is defined via adist (which is a combination of bandwidth and latency),
so I am not sure the terminology holds.
> Subsystems that generate hot page access info can report that
> to kpromoted via this API:
>
> int kpromoted_record_access(u64 pfn, int nid, int src,
> unsigned long time)
>
> @pfn: The PFN of the memory accessed
> @nid: The accessing NUMA node ID
> @src: The temperature source (subsystem) that generated the
> access info
> @time: The access time in jiffies
>
> Some temperature sources may not provide the nid from which
What is a temperature source?
> the page was accessed. This is true for sources that use
> page table scanning for PTE Accessed bit. Currently the toptier
> node to which such pages should be promoted is hard coded.
>
What would it take to make this flexible?
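For example, instead of hard coding the target node, something along these
lines could pick it at run time (an untested sketch; kpromoted_default_target()
is a name I made up here):

static int kpromoted_default_target(int accessing_nid)
{
        int nid;

        /* Prefer the accessing node itself if it is top tier */
        if (accessing_nid != NUMA_NO_NODE && node_is_toptier(accessing_nid))
                return accessing_nid;

        /* Otherwise fall back to the first top-tier node with CPUs */
        for_each_node_state(nid, N_CPU) {
                if (node_is_toptier(nid))
                        return nid;
        }

        return NUMA_NO_NODE;
}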
> Also, the access time provided by some sources may at best be
> considered approximate. This is especially true for hot pages
> detected by PTE A bit scanning.
>
> kpromoted currently maintains the hot PFN records in hash lists
> hashed by PFN value. Each record stores the following info:
>
> struct page_hotness_info {
> unsigned long pfn;
>
> /* Time when this record was updated last */
> unsigned long last_update;
>
> /*
> * Number of times this page was accessed in the
> * current window
> */
> int frequency;
>
> /* Most recent access time */
> unsigned long recency;
>
> /* Most recent access from this node */
> int hot_node;
>
> struct hlist_node hnode;
> };
>
> The way in which a page is categorized as hot enough to be
> promoted is pretty primitive now.
>
> Signed-off-by: Bharata B Rao <bharata@....com>
> ---
> include/linux/kpromoted.h | 54 ++++++
> include/linux/mmzone.h | 4 +
> include/linux/vm_event_item.h | 13 ++
> mm/Kconfig | 7 +
> mm/Makefile | 1 +
> mm/kpromoted.c | 305 ++++++++++++++++++++++++++++++++++
> mm/mm_init.c | 10 ++
> mm/vmstat.c | 13 ++
> 8 files changed, 407 insertions(+)
> create mode 100644 include/linux/kpromoted.h
> create mode 100644 mm/kpromoted.c
>
> diff --git a/include/linux/kpromoted.h b/include/linux/kpromoted.h
> new file mode 100644
> index 000000000000..2bef3d74f03a
> --- /dev/null
> +++ b/include/linux/kpromoted.h
> @@ -0,0 +1,54 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_KPROMOTED_H
> +#define _LINUX_KPROMOTED_H
> +
> +#include <linux/types.h>
> +#include <linux/init.h>
> +#include <linux/workqueue_types.h>
> +
> +/* Page hotness temperature sources */
> +enum kpromoted_src {
> + KPROMOTED_HW_HINTS,
> + KPROMOTED_PGTABLE_SCAN,
> +};
> +
> +#ifdef CONFIG_KPROMOTED
> +
> +#define KPROMOTED_FREQ_WINDOW (5 * MSEC_PER_SEC)
> +
> +/* 2 accesses within a window will make the page a promotion candidate */
> +#define KPROMOTED_FREQ_THRESHOLD 2
> +
Were these values derived empirically?
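If they were, stating that in the changelog would help. Making them runtime
tunables would also make experimentation easier -- a rough sketch, with
made-up parameter names:

static unsigned int kpromoted_freq_window_ms = 5 * MSEC_PER_SEC;
static unsigned int kpromoted_freq_threshold = 2;

/* Visible under /sys/module/.../parameters/ even when built in */
module_param(kpromoted_freq_window_ms, uint, 0644);
module_param(kpromoted_freq_threshold, uint, 0644);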
> +#define KPROMOTED_HASH_ORDER 16
> +
> +struct page_hotness_info {
> + unsigned long pfn;
> +
> + /* Time when this record was updated last */
> + unsigned long last_update;
> +
> + /*
> + * Number of times this page was accessed in the
> + * current window
> + */
> + int frequency;
> +
> + /* Most recent access time */
> + unsigned long recency;
> +
> + /* Most recent access from this node */
> + int hot_node;
> + struct hlist_node hnode;
> +};
> +
> +#define KPROMOTE_DELAY MSEC_PER_SEC
> +
> +int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now);
> +#else
> +static inline int kpromoted_record_access(u64 pfn, int nid, int src,
> + unsigned long now)
> +{
> + return 0;
> +}
> +#endif /* CONFIG_KPROMOTED */
> +#endif /* _LINUX_KPROMOTED_H */
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 9540b41894da..a5c4e789aa55 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1459,6 +1459,10 @@ typedef struct pglist_data {
> #ifdef CONFIG_MEMORY_FAILURE
> struct memory_failure_stats mf_stats;
> #endif
> +#ifdef CONFIG_KPROMOTED
> + struct task_struct *kpromoted;
> + wait_queue_head_t kpromoted_wait;
> +#endif
> } pg_data_t;
>
> #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index f70d0958095c..b5823b037883 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -182,6 +182,19 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
> KSTACK_REST,
> #endif
> #endif /* CONFIG_DEBUG_STACK_USAGE */
> + KPROMOTED_RECORDED_ACCESSES,
> + KPROMOTED_RECORD_HWHINTS,
> + KPROMOTED_RECORD_PGTSCANS,
> + KPROMOTED_RECORD_TOPTIER,
> + KPROMOTED_RECORD_ADDED,
> + KPROMOTED_RECORD_EXISTS,
> + KPROMOTED_MIG_RIGHT_NODE,
> + KPROMOTED_MIG_NON_LRU,
> + KPROMOTED_MIG_COLD_OLD,
> + KPROMOTED_MIG_COLD_NOT_ACCESSED,
> + KPROMOTED_MIG_CANDIDATE,
> + KPROMOTED_MIG_PROMOTED,
> + KPROMOTED_MIG_DROPPED,
> NR_VM_EVENT_ITEMS
> };
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 1b501db06417..ceaa462a0ce6 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1358,6 +1358,13 @@ config PT_RECLAIM
>
> Note: now only empty user PTE page table pages will be reclaimed.
>
> +config KPROMOTED
> + bool "Kernel hot page promotion daemon"
> + def_bool y
> + depends on NUMA && MIGRATION && MMU
> + help
> + Promote hot pages from lower tier to top tier by using the
> + memory access information provided by various sources.
>
> source "mm/damon/Kconfig"
>
> diff --git a/mm/Makefile b/mm/Makefile
> index 850386a67b3e..bf4f5f18f1f9 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -147,3 +147,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> obj-$(CONFIG_EXECMEM) += execmem.o
> obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
> obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
> +obj-$(CONFIG_KPROMOTED) += kpromoted.o
> diff --git a/mm/kpromoted.c b/mm/kpromoted.c
> new file mode 100644
> index 000000000000..2a8b8495b6b3
> --- /dev/null
> +++ b/mm/kpromoted.c
> @@ -0,0 +1,305 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * kpromoted is a kernel thread that runs on each node that has CPUs, i.e.,
> + * on regular nodes.
> + *
> + * Maintains list of hot pages from lower tiers and promotes them.
> + */
> +#include <linux/kpromoted.h>
> +#include <linux/kthread.h>
> +#include <linux/mutex.h>
> +#include <linux/mmzone.h>
> +#include <linux/migrate.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/slab.h>
> +#include <linux/sched.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/hashtable.h>
> +
> +static DEFINE_HASHTABLE(page_hotness_hash, KPROMOTED_HASH_ORDER);
> +static struct mutex page_hotness_lock[1UL << KPROMOTED_HASH_ORDER];
> +
> +static int kpromote_page(struct page_hotness_info *phi)
> +{
Why not just call it kpromote_folio?
> + struct page *page = pfn_to_page(phi->pfn);
> + struct folio *folio;
> + int ret;
> +
> + if (!page)
> + return 1;
Do we need to check for is_zone_device_page() here?
> +
> + folio = page_folio(page);
> + ret = migrate_misplaced_folio_prepare(folio, NULL, phi->hot_node);
> + if (ret)
> + return 1;
> +
> + return migrate_misplaced_folio(folio, phi->hot_node);
> +}
Could you please document the assumptions for kpromote_page(): what locks
should be held, and does the refcount need to be incremented before the
folio is handed to the migration code?
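For instance, I would have expected something like the below (an untested
sketch following the do_numa_page() pattern), together with a comment spelling
out the expected calling context:

        struct page *page = pfn_to_online_page(phi->pfn);
        struct folio *folio;
        int ret;

        if (!page || is_zone_device_page(page))
                return 1;

        folio = page_folio(page);

        /* Hold a reference, since PFN walkers can race with freeing */
        if (!folio_try_get(folio))
                return 1;

        ret = migrate_misplaced_folio_prepare(folio, NULL, phi->hot_node);
        if (ret) {
                folio_put(folio);
                return 1;
        }

        /* migrate_misplaced_folio() consumes the isolated folio */
        return migrate_misplaced_folio(folio, phi->hot_node);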
> +
> +static int page_should_be_promoted(struct page_hotness_info *phi)
> +{
> + struct page *page = pfn_to_online_page(phi->pfn);
> + unsigned long now = jiffies;
> + struct folio *folio;
> +
> + if (!page || is_zone_device_page(page))
> + return false;
> +
> + folio = page_folio(page);
> + if (!folio_test_lru(folio)) {
> + count_vm_event(KPROMOTED_MIG_NON_LRU);
> + return false;
> + }
> + if (folio_nid(folio) == phi->hot_node) {
> + count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
> + return false;
> + }
> +
> + /* If the page was hot a while ago, don't promote */
> + if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
> + count_vm_event(KPROMOTED_MIG_COLD_OLD);
Shouldn't we update phi->last_update here?
> + return false;
> + }
> +
> + /* If the page hasn't been accessed enough number of times, don't promote */
> + if (phi->frequency < KPROMOTED_FREQ_THRESHOLD) {
> + count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
> + return false;
> + }
> + return true;
> +}
> +
> +/*
> + * Go through page hotness information and migrate pages if required.
> + *
> + * Promoted pages are no longer tracked in the hot list.
> + * Cold pages are pruned from the list as well.
> + *
> + * TODO: Batching could be done
> + */
> +static void kpromoted_migrate(pg_data_t *pgdat)
> +{
> + int nid = pgdat->node_id;
> + struct page_hotness_info *phi;
> + struct hlist_node *tmp;
> + int nr_bkts = HASH_SIZE(page_hotness_hash);
> + int bkt;
> +
> + for (bkt = 0; bkt < nr_bkts; bkt++) {
> + mutex_lock(&page_hotness_lock[bkt]);
> + hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], hnode) {
> + if (phi->hot_node != nid)
> + continue;
> +
> + if (page_should_be_promoted(phi)) {
> + count_vm_event(KPROMOTED_MIG_CANDIDATE);
> + if (!kpromote_page(phi)) {
> + count_vm_event(KPROMOTED_MIG_PROMOTED);
> + hlist_del_init(&phi->hnode);
> + kfree(phi);
> + }
> + } else {
> + /*
> + * Not a suitable page or cold page, stop tracking it.
> + * TODO: Identify cold pages and drive demotion?
> + */
> + count_vm_event(KPROMOTED_MIG_DROPPED);
> + hlist_del_init(&phi->hnode);
> + kfree(phi);
Won't existing demotion already handle this?
> + }
> + }
> + mutex_unlock(&page_hotness_lock[bkt]);
> + }
> +}
> +
It sounds like NUMA balancing, promotion and demotion can all act in parallel
on these folios; if that's not the case, could you clarify their relationship
and dependencies?
> +static struct page_hotness_info *__kpromoted_lookup(unsigned long pfn, int bkt)
> +{
> + struct page_hotness_info *phi;
> +
> + hlist_for_each_entry(phi, &page_hotness_hash[bkt], hnode) {
> + if (phi->pfn == pfn)
> + return phi;
> + }
> + return NULL;
> +}
> +
> +static struct page_hotness_info *kpromoted_lookup(unsigned long pfn, int bkt, unsigned long now)
> +{
> + struct page_hotness_info *phi;
> +
> + phi = __kpromoted_lookup(pfn, bkt);
> + if (!phi) {
> + phi = kzalloc(sizeof(struct page_hotness_info), GFP_KERNEL);
> + if (!phi)
> + return ERR_PTR(-ENOMEM);
> +
> + phi->pfn = pfn;
> + phi->frequency = 1;
> + phi->last_update = now;
> + phi->recency = now;
> + hlist_add_head(&phi->hnode, &page_hotness_hash[bkt]);
> + count_vm_event(KPROMOTED_RECORD_ADDED);
> + } else {
> + count_vm_event(KPROMOTED_RECORD_EXISTS);
> + }
> + return phi;
> +}
> +
> +/*
> + * Called by subsystems that generate page hotness/access information.
> + *
> + * Records the memory access info for further action by kpromoted.
> + */
> +int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now)
> +{
> + struct page_hotness_info *phi;
> + struct page *page;
> + struct folio *folio;
> + int ret, bkt;
> +
> + count_vm_event(KPROMOTED_RECORDED_ACCESSES);
> +
> + switch (src) {
> + case KPROMOTED_HW_HINTS:
> + count_vm_event(KPROMOTED_RECORD_HWHINTS);
> + break;
> + case KPROMOTED_PGTABLE_SCAN:
> + count_vm_event(KPROMOTED_RECORD_PGTSCANS);
> + break;
> + default:
> + break;
> + }
> +
> + /*
> + * Record only accesses from lower tiers.
> + * Assuming node having CPUs as toptier for now.
> + */
> + if (node_is_toptier(pfn_to_nid(pfn))) {
> + count_vm_event(KPROMOTED_RECORD_TOPTIER);
> + return 0;
> + }
> +
> + page = pfn_to_online_page(pfn);
> + if (!page || is_zone_device_page(page))
> + return 0;
> +
> + folio = page_folio(page);
> + if (!folio_test_lru(folio))
> + return 0;
> +
> + bkt = hash_min(pfn, KPROMOTED_HASH_ORDER);
> + mutex_lock(&page_hotness_lock[bkt]);
> + phi = kpromoted_lookup(pfn, bkt, now);
> + if (IS_ERR(phi)) {
> + ret = PTR_ERR(phi);
> + goto out;
> + }
> +
> + if ((now - phi->last_update) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
> + /* New window */
> + phi->frequency = 1; /* TODO: Factor in the history */
> + phi->last_update = now;
> + } else {
> + phi->frequency++;
> + }
> + phi->recency = now;
> +
> + /*
> + * TODOs:
> + * 1. Source nid is hard-coded for some temperature sources
> + * 2. Take action if hot_node changes - may be a shared page?
> + * 3. Maintain node info for every access within the window?
> + */
> + phi->hot_node = (nid == NUMA_NO_NODE) ? 1 : nid;
I don't understand why hot_node needs to be 1 when nid is NUMA_NO_NODE. Does
it mean that the page gets promoted to the top tier? The mix of hot_node,
tier and nid is not very clear here.
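If the intent is "promote to some top-tier node when the source does not tell
us who accessed the page", spelling that out would help, e.g. by reusing the
hypothetical helper from my earlier comment:

        phi->hot_node = (nid == NUMA_NO_NODE) ?
                        kpromoted_default_target(nid) : nid;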
> +out:
> + mutex_unlock(&page_hotness_lock[bkt]);
> + return 0;
> +}
> +
> +/*
> + * Go through the accumulated mem_access_info and migrate
> + * pages if required.
> + */
> +static void kpromoted_do_work(pg_data_t *pgdat)
> +{
> + kpromoted_migrate(pgdat);
> +}
> +
> +static inline bool kpromoted_work_requested(pg_data_t *pgdat)
> +{
> + return false;
> +}
> +
> +static int kpromoted(void *p)
> +{
> + pg_data_t *pgdat = (pg_data_t *)p;
> + struct task_struct *tsk = current;
> + long timeout = msecs_to_jiffies(KPROMOTE_DELAY);
> +
> + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> +
> + if (!cpumask_empty(cpumask))
> + set_cpus_allowed_ptr(tsk, cpumask);
> +
> + while (!kthread_should_stop()) {
> + wait_event_timeout(pgdat->kpromoted_wait,
> + kpromoted_work_requested(pgdat), timeout);
> + kpromoted_do_work(pgdat);
> + }
> + return 0;
> +}
> +
> +static void kpromoted_run(int nid)
> +{
> + pg_data_t *pgdat = NODE_DATA(nid);
> +
> + if (pgdat->kpromoted)
> + return;
> +
> + pgdat->kpromoted = kthread_run(kpromoted, pgdat, "kpromoted%d", nid);
> + if (IS_ERR(pgdat->kpromoted)) {
> + pr_err("Failed to start kpromoted on node %d\n", nid);
> + pgdat->kpromoted = NULL;
> + }
> +}
> +
> +static int kpromoted_cpu_online(unsigned int cpu)
> +{
> + int nid;
> +
> + for_each_node_state(nid, N_CPU) {
> + pg_data_t *pgdat = NODE_DATA(nid);
> + const struct cpumask *mask;
> +
> + mask = cpumask_of_node(pgdat->node_id);
> +
> + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
> + /* One of our CPUs online: restore mask */
> + if (pgdat->kpromoted)
> + set_cpus_allowed_ptr(pgdat->kpromoted, mask);
> + }
> + return 0;
> +}
> +
> +static int __init kpromoted_init(void)
> +{
> + int nid, ret, i;
> +
> + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
> + "mm/promotion:online",
> + kpromoted_cpu_online, NULL);
> + if (ret < 0) {
> + pr_err("kpromoted: failed to register hotplug callbacks.\n");
> + return ret;
> + }
> +
> + for (i = 0; i < (1UL << KPROMOTED_HASH_ORDER); i++)
> + mutex_init(&page_hotness_lock[i]);
> +
> + for_each_node_state(nid, N_CPU)
> + kpromoted_run(nid);
> +
I think we also need a dynamic way of disabling promotion at run time,
right?
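Something in the spirit of /proc/sys/kernel/numa_balancing would do -- a
sketch, with a made-up knob name:

static int sysctl_kpromoted_enabled = 1;

static struct ctl_table kpromoted_sysctls[] = {
        {
                .procname       = "kpromoted_enabled",
                .data           = &sysctl_kpromoted_enabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_ONE,
        },
};

/*
 * kpromoted_init() would then call
 * register_sysctl_init("vm", kpromoted_sysctls), and the main loop
 * would skip kpromoted_do_work() while the knob is 0.
 */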
> + return 0;
> +}
> +
> +subsys_initcall(kpromoted_init)
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 2630cc30147e..d212df24f89b 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1362,6 +1362,15 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat)
> static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
> #endif
>
> +#ifdef CONFIG_KPROMOTED
> +static void pgdat_init_kpromoted(struct pglist_data *pgdat)
> +{
> + init_waitqueue_head(&pgdat->kpromoted_wait);
> +}
> +#else
> +static void pgdat_init_kpromoted(struct pglist_data *pgdat) {}
> +#endif
> +
> static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
> {
> int i;
> @@ -1371,6 +1380,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>
> pgdat_init_split_queue(pgdat);
> pgdat_init_kcompactd(pgdat);
> + pgdat_init_kpromoted(pgdat);
>
> init_waitqueue_head(&pgdat->kswapd_wait);
> init_waitqueue_head(&pgdat->pfmemalloc_wait);
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 16bfe1c694dd..618f44bae5c8 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1466,6 +1466,19 @@ const char * const vmstat_text[] = {
> "kstack_rest",
> #endif
> #endif
> + "kpromoted_recorded_accesses",
> + "kpromoted_recorded_hwhints",
> + "kpromoted_recorded_pgtscans",
> + "kpromoted_record_toptier",
> + "kpromoted_record_added",
> + "kpromoted_record_exists",
> + "kpromoted_mig_right_node",
> + "kpromoted_mig_non_lru",
> + "kpromoted_mig_cold_old",
> + "kpromoted_mig_cold_not_accessed",
> + "kpromoted_mig_candidate",
> + "kpromoted_mig_promoted",
> + "kpromoted_mig_dropped",
> #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
> };
> #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */