Message-ID: <c9c7e0da-115e-4f81-be81-193431fe838e@nvidia.com>
Date: Mon, 24 Mar 2025 14:35:44 +1100
From: Balbir Singh <balbirs@...dia.com>
To: Bharata B Rao <bharata@....com>, linux-kernel@...r.kernel.org,
 linux-mm@...ck.org
Cc: AneeshKumar.KizhakeVeetil@....com, Hasan.Maruf@....com,
 Jonathan.Cameron@...wei.com, Michael.Day@....com, akpm@...ux-foundation.org,
 dave.hansen@...el.com, david@...hat.com, feng.tang@...el.com,
 gourry@...rry.net, hannes@...xchg.org, honggyu.kim@...com, hughd@...gle.com,
 jhubbard@...dia.com, k.shutemov@...il.com, kbusch@...a.com,
 kmanaouil.dev@...il.com, leesuyeon0506@...il.com, leillc@...gle.com,
 liam.howlett@...cle.com, mgorman@...hsingularity.net, mingo@...hat.com,
 nadav.amit@...il.com, nphamcs@...il.com, peterz@...radead.org,
 raghavendra.kt@....com, riel@...riel.com, rientjes@...gle.com,
 rppt@...nel.org, shivankg@....com, shy828301@...il.com, sj@...nel.org,
 vbabka@...e.cz, weixugc@...gle.com, willy@...radead.org,
 ying.huang@...ux.alibaba.com, ziy@...dia.com, dave@...olabs.net,
 yuanchu@...gle.com, hyeonggon.yoo@...com
Subject: Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and
 promotion daemon

On 3/6/25 16:45, Bharata B Rao wrote:
> kpromoted is a kernel daemon that accumulates hot page info
> from different sources and tries to promote pages from slow
> tiers to top tiers. One instance of this thread runs on each
> node that has CPUs.
> 

Could you please elaborate on what is meant by slow vs top tier? A top
tier is determined by adist (a combination of bandwidth and latency), so
I am not sure the terminology here holds.

> Subsystems that generate hot page access info can report that
> to kpromoted via this API:
> 
> int kpromoted_record_access(u64 pfn, int nid, int src,
> 			    unsigned long time)
> 
> @pfn: The PFN of the memory accessed
> @nid: The accessing NUMA node ID
> @src: The temperature source (subsystem) that generated the
>       access info
> @time: The access time in jiffies
> 
> Some temperature sources may not provide the nid from which

What is a temperature source?
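
For reference, a minimal sketch of how I read the intended usage from a
page-table scanner (names taken from this patch, untested):

	/* Accessed bit seen for @pfn; origin node unknown to the scanner */
	kpromoted_record_access(pfn, NUMA_NO_NODE, KPROMOTED_PGTABLE_SCAN,
				jiffies);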

> the page was accessed. This is true for sources that use
> page table scanning for the PTE Accessed bit. Currently the toptier
> node to which such pages should be promoted is hard-coded.
> 

What would it take to make this flexible?

> Also, the access time provided by some sources may at best be
> considered approximate. This is especially true for hot pages
> detected by PTE A bit scanning.
> 
> kpromoted currently maintains the hot PFN records in hash lists
> hashed by PFN value. Each record stores the following info:
> 
> struct page_hotness_info {
> 	unsigned long pfn;
> 
> 	/* Time when this record was updated last */
> 	unsigned long last_update;
> 
> 	/*
> 	 * Number of times this page was accessed in the
> 	 * current window
> 	 */
> 	int frequency;
> 
> 	/* Most recent access time */
> 	unsigned long recency;
> 
> 	/* Most recent access from this node */
> 	int hot_node;
> 
> 	struct hlist_node hnode;
> };
> 
> The way in which a page is categorized as hot enough to be
> promoted is pretty primitive now.
> 
> Signed-off-by: Bharata B Rao <bharata@....com>
> ---
>  include/linux/kpromoted.h     |  54 ++++++
>  include/linux/mmzone.h        |   4 +
>  include/linux/vm_event_item.h |  13 ++
>  mm/Kconfig                    |   7 +
>  mm/Makefile                   |   1 +
>  mm/kpromoted.c                | 305 ++++++++++++++++++++++++++++++++++
>  mm/mm_init.c                  |  10 ++
>  mm/vmstat.c                   |  13 ++
>  8 files changed, 407 insertions(+)
>  create mode 100644 include/linux/kpromoted.h
>  create mode 100644 mm/kpromoted.c
> 
> diff --git a/include/linux/kpromoted.h b/include/linux/kpromoted.h
> new file mode 100644
> index 000000000000..2bef3d74f03a
> --- /dev/null
> +++ b/include/linux/kpromoted.h
> @@ -0,0 +1,54 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_KPROMOTED_H
> +#define _LINUX_KPROMOTED_H
> +
> +#include <linux/types.h>
> +#include <linux/init.h>
> +#include <linux/workqueue_types.h>
> +
> +/* Page hotness temperature sources */
> +enum kpromoted_src {
> +	KPROMOTED_HW_HINTS,
> +	KPROMOTED_PGTABLE_SCAN,
> +};
> +
> +#ifdef CONFIG_KPROMOTED
> +
> +#define KPROMOTED_FREQ_WINDOW	(5 * MSEC_PER_SEC)
> +
> +/* 2 accesses within a window will make the page a promotion candidate */
> +#define KPROMOTED_FREQ_THRESHOLD	2
> +

Were these values derived empirically?


> +#define KPROMOTED_HASH_ORDER	16
> +
> +struct page_hotness_info {
> +	unsigned long pfn;
> +
> +	/* Time when this record was updated last */
> +	unsigned long last_update;
> +
> +	/*
> +	 * Number of times this page was accessed in the
> +	 * current window
> +	 */
> +	int frequency;
> +
> +	/* Most recent access time */
> +	unsigned long recency;
> +
> +	/* Most recent access from this node */
> +	int hot_node;
> +	struct hlist_node hnode;
> +};
> +
> +#define KPROMOTE_DELAY	MSEC_PER_SEC
> +
> +int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now);
> +#else
> +static inline int kpromoted_record_access(u64 pfn, int nid, int src,
> +					  unsigned long now)
> +{
> +	return 0;
> +}
> +#endif /* CONFIG_KPROMOTED */
> +#endif /* _LINUX_KPROMOTED_H */
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 9540b41894da..a5c4e789aa55 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1459,6 +1459,10 @@ typedef struct pglist_data {
>  #ifdef CONFIG_MEMORY_FAILURE
>  	struct memory_failure_stats mf_stats;
>  #endif
> +#ifdef CONFIG_KPROMOTED
> +	struct task_struct *kpromoted;
> +	wait_queue_head_t kpromoted_wait;
> +#endif
>  } pg_data_t;
>  
>  #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index f70d0958095c..b5823b037883 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -182,6 +182,19 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>  		KSTACK_REST,
>  #endif
>  #endif /* CONFIG_DEBUG_STACK_USAGE */
> +		KPROMOTED_RECORDED_ACCESSES,
> +		KPROMOTED_RECORD_HWHINTS,
> +		KPROMOTED_RECORD_PGTSCANS,
> +		KPROMOTED_RECORD_TOPTIER,
> +		KPROMOTED_RECORD_ADDED,
> +		KPROMOTED_RECORD_EXISTS,
> +		KPROMOTED_MIG_RIGHT_NODE,
> +		KPROMOTED_MIG_NON_LRU,
> +		KPROMOTED_MIG_COLD_OLD,
> +		KPROMOTED_MIG_COLD_NOT_ACCESSED,
> +		KPROMOTED_MIG_CANDIDATE,
> +		KPROMOTED_MIG_PROMOTED,
> +		KPROMOTED_MIG_DROPPED,
>  		NR_VM_EVENT_ITEMS
>  };
>  
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 1b501db06417..ceaa462a0ce6 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1358,6 +1358,13 @@ config PT_RECLAIM
>  
>  	  Note: now only empty user PTE page table pages will be reclaimed.
>  
> +config KPROMOTED
> +	bool "Kernel hot page promotion daemon"
> +	default y
> +	depends on NUMA && MIGRATION && MMU
> +	help
> +	  Promote hot pages from lower tier to top tier by using the
> +	  memory access information provided by various sources.
>  
>  source "mm/damon/Kconfig"
>  
> diff --git a/mm/Makefile b/mm/Makefile
> index 850386a67b3e..bf4f5f18f1f9 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -147,3 +147,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
>  obj-$(CONFIG_EXECMEM) += execmem.o
>  obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
>  obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
> +obj-$(CONFIG_KPROMOTED) += kpromoted.o
> diff --git a/mm/kpromoted.c b/mm/kpromoted.c
> new file mode 100644
> index 000000000000..2a8b8495b6b3
> --- /dev/null
> +++ b/mm/kpromoted.c
> @@ -0,0 +1,305 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * kpromoted is a kernel thread that runs on each node that has CPUs, i.e.,
> + * on regular nodes.
> + *
> + * Maintains list of hot pages from lower tiers and promotes them.
> + */
> +#include <linux/kpromoted.h>
> +#include <linux/kthread.h>
> +#include <linux/mutex.h>
> +#include <linux/mmzone.h>
> +#include <linux/migrate.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/slab.h>
> +#include <linux/sched.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/hashtable.h>
> +
> +static DEFINE_HASHTABLE(page_hotness_hash, KPROMOTED_HASH_ORDER);
> +static struct mutex page_hotness_lock[1UL << KPROMOTED_HASH_ORDER];
> +
> +static int kpromote_page(struct page_hotness_info *phi)
> +{

Why not just call it kpromote_folio?

> +	struct page *page = pfn_to_page(phi->pfn);
> +	struct folio *folio;
> +	int ret;
> +
> +	if (!page)
> +		return 1;

Do we need to check for is_zone_device_page() here?
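
i.e., should this mirror what page_should_be_promoted() below does:

	struct page *page = pfn_to_online_page(phi->pfn);

	if (!page || is_zone_device_page(page))
		return 1;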

> +
> +	folio = page_folio(page);
> +	ret = migrate_misplaced_folio_prepare(folio, NULL, phi->hot_node);
> +	if (ret)
> +		return 1;
> +
> +	return migrate_misplaced_folio(folio, phi->hot_node);
> +}


Could you please document the assumptions for kpromote_page(): what locks
should be held? Does the refcount need to be incremented?
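
Something like the below kernel-doc would capture it (my guesses as
placeholders, please correct):

	/**
	 * kpromote_page - try to promote a hot page to phi->hot_node
	 * @phi: hotness record of the page to be promoted
	 *
	 * Context: Is page_hotness_lock[bkt] expected to be held? Must the
	 * caller hold a reference on the folio across the call?
	 *
	 * Returns 0 on successful promotion, non-zero otherwise.
	 */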

> +
> +static int page_should_be_promoted(struct page_hotness_info *phi)
> +{
> +	struct page *page = pfn_to_online_page(phi->pfn);
> +	unsigned long now = jiffies;
> +	struct folio *folio;
> +
> +	if (!page || is_zone_device_page(page))
> +		return false;
> +
> +	folio = page_folio(page);
> +	if (!folio_test_lru(folio)) {
> +		count_vm_event(KPROMOTED_MIG_NON_LRU);
> +		return false;
> +	}
> +	if (folio_nid(folio) == phi->hot_node) {
> +		count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
> +		return false;
> +	}
> +
> +	/* If the page was hot a while ago, don't promote */
> +	if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
> +		count_vm_event(KPROMOTED_MIG_COLD_OLD);

Shouldn't we update phi->last_update here?
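
i.e., something like (untested):

	if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
		count_vm_event(KPROMOTED_MIG_COLD_OLD);
		phi->last_update = now;	/* reset the window */
		return false;
	}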

> +		return false;
> +	}
> +
> +	/* If the page hasn't been accessed enough times, don't promote */
> +	if (phi->frequency < KPROMOTED_FREQ_THRESHOLD) {
> +		count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
> +		return false;
> +	}
> +	return true;
> +}
> +
> +/*
> + * Go through page hotness information and migrate pages if required.
> + *
> + * Promoted pages are no longer tracked in the hot list.
> + * Cold pages are pruned from the list as well.
> + *
> + * TODO: Batching could be done
> + */
> +static void kpromoted_migrate(pg_data_t *pgdat)
> +{
> +	int nid = pgdat->node_id;
> +	struct page_hotness_info *phi;
> +	struct hlist_node *tmp;
> +	int nr_bkts = HASH_SIZE(page_hotness_hash);
> +	int bkt;
> +
> +	for (bkt = 0; bkt < nr_bkts; bkt++) {
> +		mutex_lock(&page_hotness_lock[bkt]);
> +		hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], hnode) {
> +			if (phi->hot_node != nid)
> +				continue;
> +
> +			if (page_should_be_promoted(phi)) {
> +				count_vm_event(KPROMOTED_MIG_CANDIDATE);
> +				if (!kpromote_page(phi)) {
> +					count_vm_event(KPROMOTED_MIG_PROMOTED);
> +					hlist_del_init(&phi->hnode);
> +					kfree(phi);
> +				}
> +			} else {
> +				/*
> +				 * Not a suitable page or cold page, stop tracking it.
> +				 * TODO: Identify cold pages and drive demotion?
> +				 */
> +				count_vm_event(KPROMOTED_MIG_DROPPED);
> +				hlist_del_init(&phi->hnode);
> +				kfree(phi);

Won't existing demotion already handle this?

> +			}
> +		}
> +		mutex_unlock(&page_hotness_lock[bkt]);
> +	}
> +}
> +

It sounds like NUMA balancing, promotion and demotion can all act in
parallel on these folios. If not, could you clarify their relationship
and dependencies?


> +static struct page_hotness_info *__kpromoted_lookup(unsigned long pfn, int bkt)
> +{
> +	struct page_hotness_info *phi;
> +
> +	hlist_for_each_entry(phi, &page_hotness_hash[bkt], hnode) {
> +		if (phi->pfn == pfn)
> +			return phi;
> +	}
> +	return NULL;
> +}
> +
> +static struct page_hotness_info *kpromoted_lookup(unsigned long pfn, int bkt, unsigned long now)
> +{
> +	struct page_hotness_info *phi;
> +
> +	phi = __kpromoted_lookup(pfn, bkt);
> +	if (!phi) {
> +		phi = kzalloc(sizeof(struct page_hotness_info), GFP_KERNEL);
> +		if (!phi)
> +			return ERR_PTR(-ENOMEM);
> +
> +		phi->pfn = pfn;
> +		phi->frequency = 1;
> +		phi->last_update = now;
> +		phi->recency = now;
> +		hlist_add_head(&phi->hnode, &page_hotness_hash[bkt]);
> +		count_vm_event(KPROMOTED_RECORD_ADDED);
> +	} else {
> +		count_vm_event(KPROMOTED_RECORD_EXISTS);
> +	}
> +	return phi;
> +}
> +
> +/*
> + * Called by subsystems that generate page hotness/access information.
> + *
> + * Records the memory access info for further action by kpromoted.
> + */
> +int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now)
> +{
> +	struct page_hotness_info *phi;
> +	struct page *page;
> +	struct folio *folio;
> +	int ret = 0, bkt;
> +
> +	count_vm_event(KPROMOTED_RECORDED_ACCESSES);
> +
> +	switch (src) {
> +	case KPROMOTED_HW_HINTS:
> +		count_vm_event(KPROMOTED_RECORD_HWHINTS);
> +		break;
> +	case KPROMOTED_PGTABLE_SCAN:
> +		count_vm_event(KPROMOTED_RECORD_PGTSCANS);
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	/*
> +	 * Record only accesses from lower tiers.
> +	 * Assuming nodes with CPUs are toptier for now.
> +	 */
> +	if (node_is_toptier(pfn_to_nid(pfn))) {
> +		count_vm_event(KPROMOTED_RECORD_TOPTIER);
> +		return 0;
> +	}
> +
> +	page = pfn_to_online_page(pfn);
> +	if (!page || is_zone_device_page(page))
> +		return 0;
> +
> +	folio = page_folio(page);
> +	if (!folio_test_lru(folio))
> +		return 0;
> +
> +	bkt = hash_min(pfn, KPROMOTED_HASH_ORDER);
> +	mutex_lock(&page_hotness_lock[bkt]);
> +	phi = kpromoted_lookup(pfn, bkt, now);
> +	if (IS_ERR(phi)) {
> +		ret = PTR_ERR(phi);
> +		goto out;
> +	}
> +
> +	if ((now - phi->last_update) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
> +		/* New window */
> +		phi->frequency = 1; /* TODO: Factor in the history */
> +		phi->last_update = now;
> +	} else {
> +		phi->frequency++;
> +	}
> +	phi->recency = now;
> +
> +	/*
> +	 * TODOs:
> +	 * 1. Source nid is hard-coded for some temperature sources
> +	 * 2. Take action if hot_node changes - may be a shared page?
> +	 * 3. Maintain node info for every access within the window?
> +	 */
> +	phi->hot_node = (nid == NUMA_NO_NODE) ? 1 : nid;

I don't understand why hot_node needs to be 1 when nid is NUMA_NO_NODE.
Does it mean the page is being promoted to the top tier? The mix of
hot_node, tier and nid is not very clear here.
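
If the intent is "promote to the nearest top-tier node", making that
explicit would help; a sketch (untested, default_promotion_node() is my
name for it):

	static int default_promotion_node(unsigned long pfn)
	{
		int from = pfn_to_nid(pfn);
		int nid, best = NUMA_NO_NODE;

		/* pick the top-tier node closest to where the page sits */
		for_each_node_state(nid, N_CPU) {
			if (!node_is_toptier(nid))
				continue;
			if (best == NUMA_NO_NODE ||
			    node_distance(from, nid) < node_distance(from, best))
				best = nid;
		}
		return best;
	}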

> +out:
> +	mutex_unlock(&page_hotness_lock[bkt]);
> +	return ret;
> +}
> +
> +/*
> + * Go through the accumulated mem_access_info and migrate
> + * pages if required.
> + */
> +static void kpromoted_do_work(pg_data_t *pgdat)
> +{
> +	kpromoted_migrate(pgdat);
> +}
> +
> +static inline bool kpromoted_work_requested(pg_data_t *pgdat)
> +{
> +	return false;
> +}
> +
> +static int kpromoted(void *p)
> +{
> +	pg_data_t *pgdat = (pg_data_t *)p;
> +	struct task_struct *tsk = current;
> +	long timeout = msecs_to_jiffies(KPROMOTE_DELAY);
> +
> +	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> +
> +	if (!cpumask_empty(cpumask))
> +		set_cpus_allowed_ptr(tsk, cpumask);
> +
> +	while (!kthread_should_stop()) {
> +		wait_event_timeout(pgdat->kpromoted_wait,
> +				   kpromoted_work_requested(pgdat), timeout);
> +		kpromoted_do_work(pgdat);
> +	}
> +	return 0;
> +}
> +
> +static void kpromoted_run(int nid)
> +{
> +	pg_data_t *pgdat = NODE_DATA(nid);
> +
> +	if (pgdat->kpromoted)
> +		return;
> +
> +	pgdat->kpromoted = kthread_run(kpromoted, pgdat, "kpromoted%d", nid);
> +	if (IS_ERR(pgdat->kpromoted)) {
> +		pr_err("Failed to start kpromoted on node %d\n", nid);
> +		pgdat->kpromoted = NULL;
> +	}
> +}
> +
> +static int kpromoted_cpu_online(unsigned int cpu)
> +{
> +	int nid;
> +
> +	for_each_node_state(nid, N_CPU) {
> +		pg_data_t *pgdat = NODE_DATA(nid);
> +		const struct cpumask *mask;
> +
> +		mask = cpumask_of_node(pgdat->node_id);
> +
> +		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
> +			/* One of our CPUs online: restore mask */
> +			if (pgdat->kpromoted)
> +				set_cpus_allowed_ptr(pgdat->kpromoted, mask);
> +	}
> +	return 0;
> +}
> +
> +static int __init kpromoted_init(void)
> +{
> +	int nid, ret, i;
> +
> +	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
> +					"mm/promotion:online",
> +					kpromoted_cpu_online, NULL);
> +	if (ret < 0) {
> +		pr_err("kpromoted: failed to register hotplug callbacks.\n");
> +		return ret;
> +	}
> +
> +	for (i = 0; i < (1UL << KPROMOTED_HASH_ORDER); i++)
> +		mutex_init(&page_hotness_lock[i]);
> +
> +	for_each_node_state(nid, N_CPU)
> +		kpromoted_run(nid);
> +

I think we also need a dynamic way of disabling promotion at run time,
right?
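
e.g., a sysctl along these lines (a sketch, the knob name is mine):

	static bool kpromoted_enabled = true;

	static struct ctl_table kpromoted_sysctls[] = {
		{
			.procname	= "kpromoted_enabled",
			.data		= &kpromoted_enabled,
			.maxlen		= sizeof(bool),
			.mode		= 0644,
			.proc_handler	= proc_dobool,
		},
	};

plus an "if (!kpromoted_enabled)" check before kpromoted_do_work() in the
main loop, and register_sysctl("vm", kpromoted_sysctls) at init time.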


> +	return 0;
> +}
> +
> +subsys_initcall(kpromoted_init);
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 2630cc30147e..d212df24f89b 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1362,6 +1362,15 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat)
>  static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
>  #endif
>  
> +#ifdef CONFIG_KPROMOTED
> +static void pgdat_init_kpromoted(struct pglist_data *pgdat)
> +{
> +	init_waitqueue_head(&pgdat->kpromoted_wait);
> +}
> +#else
> +static void pgdat_init_kpromoted(struct pglist_data *pgdat) {}
> +#endif
> +
>  static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>  {
>  	int i;
> @@ -1371,6 +1380,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>  
>  	pgdat_init_split_queue(pgdat);
>  	pgdat_init_kcompactd(pgdat);
> +	pgdat_init_kpromoted(pgdat);
>  
>  	init_waitqueue_head(&pgdat->kswapd_wait);
>  	init_waitqueue_head(&pgdat->pfmemalloc_wait);
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 16bfe1c694dd..618f44bae5c8 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1466,6 +1466,19 @@ const char * const vmstat_text[] = {
>  	"kstack_rest",
>  #endif
>  #endif
> +	"kpromoted_recorded_accesses",
> +	"kpromoted_recorded_hwhints",
> +	"kpromoted_recorded_pgtscans",
> +	"kpromoted_record_toptier",
> +	"kpromoted_record_added",
> +	"kpromoted_record_exists",
> +	"kpromoted_mig_right_node",
> +	"kpromoted_mig_non_lru",
> +	"kpromoted_mig_cold_old",
> +	"kpromoted_mig_cold_not_accessed",
> +	"kpromoted_mig_candidate",
> +	"kpromoted_mig_promoted",
> +	"kpromoted_mig_dropped",
>  #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
>  };
>  #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */

