lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1983025922.01755775202288.JavaMail.epsvc@epcpadp1new>
Date: Thu, 21 Aug 2025 16:47:19 +0530
From: Alok Rathore <alok.rathore@...sung.com>
To: Bharata B Rao <bharata@....com>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org,
	Jonathan.Cameron@...wei.com, dave.hansen@...el.com, gourry@...rry.net,
	hannes@...xchg.org, mgorman@...hsingularity.net, mingo@...hat.com,
	peterz@...radead.org, raghavendra.kt@....com, riel@...riel.com,
	rientjes@...gle.com, sj@...nel.org, weixugc@...gle.com, willy@...radead.org,
	ying.huang@...ux.alibaba.com, ziy@...dia.com, dave@...olabs.net,
	nifan.cxl@...il.com, xuezhengchu@...wei.com, yiannis@...corp.com,
	akpm@...ux-foundation.org, david@...hat.com, byungchul@...com,
	kinseyho@...gle.com, joshua.hahnjy@...il.com, yuanchu@...gle.com,
	balbirs@...dia.com, alokrathore20@...il.com, gost.dev@...sung.com,
	cpgs@...sung.com
Subject: Re: [RFC PATCH v1 3/7] mm: Hot page tracking and promotion

On 14/08/25 07:18PM, Bharata B Rao wrote:
>This introduces a sub-system for collecting memory access
>information from different sources. It maintains the hotness
>information based on the access history and time of access.
>
>Additionally, it provides per-lowertier-node kernel threads
>(named kpromoted) that periodically promote the pages that
>are eligible for promotion.
>
>Sub-systems that generate hot page access info can report that
>using this API:
>
>int pghot_record_access(u64 pfn, int nid, int src,
>			unsigned long time)
>
>@pfn: The PFN of the memory accessed
>@nid: The accessing NUMA node ID
>@src: The temperature source (sub-system) that generated the
>      access info
>@time: The access time in jiffies
>
>Some temperature sources may not provide the nid from which
>the page was accessed. This is true for sources that use
>page table scanning for PTE Accessed bit. For such sources,
>the default toptier node to which such pages should be promoted
>is hard coded.
>
>Also, the access time provided by some sources may at best be
>considered approximate. This is especially true for hot pages
>detected by PTE A bit scanning.
>
>The hot PFN records are stored in hash lists hashed by PFN value.
>The PFN records that are categorized as hot enough to be promoted
>are maintained in a per-lowertier-node max heap from which
>kpromoted extracts and promotes them.
>
>Each record stores the following info:
>
>struct pghot_info {
>	unsigned long pfn;
>
>	unsigned long last_update; /* Most recent access time */
>	int frequency; /* Number of accesses within current window */
>	int nid; /* Most recent access from this node */
>
>	struct hlist_node hnode;
>	size_t heap_idx; /* Position in max heap for quick retrieval */
>};
>
>The way in which a page is categorized as hot enough to be
>promoted is pretty primitive now.
>
>Signed-off-by: Bharata B Rao <bharata@....com>
>---
> include/linux/mmzone.h        |  11 +
> include/linux/pghot.h         |  87 ++++++
> include/linux/vm_event_item.h |   9 +
> mm/Kconfig                    |  11 +
> mm/Makefile                   |   1 +
> mm/mm_init.c                  |  10 +
> mm/pghot.c                    | 501 ++++++++++++++++++++++++++++++++++
> mm/vmstat.c                   |   9 +
> 8 files changed, 639 insertions(+)
> create mode 100644 include/linux/pghot.h
> create mode 100644 mm/pghot.c
>
>diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>index 0c5da9141983..f7094babed10 100644
>--- a/include/linux/mmzone.h
>+++ b/include/linux/mmzone.h
>@@ -1349,6 +1349,10 @@ struct memory_failure_stats {
> };
> #endif
>
>+#ifdef CONFIG_PGHOT
>+#include <linux/pghot.h>
>+#endif
>+
> /*
>  * On NUMA machines, each NUMA node would have a pg_data_t to describe
>  * it's memory layout. On UMA machines there is a single pglist_data which
>@@ -1497,6 +1501,13 @@ typedef struct pglist_data {
> #ifdef CONFIG_MEMORY_FAILURE
> 	struct memory_failure_stats mf_stats;
> #endif
>+#ifdef CONFIG_PGHOT
>+	struct task_struct *kpromoted;
>+	wait_queue_head_t kpromoted_wait;
>+	struct pghot_info **phi_buf;
>+	struct max_heap heap;
>+	spinlock_t heap_lock;
>+#endif
> } pg_data_t;
>
> #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
>diff --git a/include/linux/pghot.h b/include/linux/pghot.h
>new file mode 100644
>index 000000000000..6b8496944e7f
>--- /dev/null
>+++ b/include/linux/pghot.h
>@@ -0,0 +1,87 @@
>+/* SPDX-License-Identifier: GPL-2.0 */
>+#ifndef _LINUX_KPROMOTED_H
>+#define _LINUX_KPROMOTED_H
>+
>+#include <linux/types.h>
>+#include <linux/init.h>
>+#include <linux/workqueue_types.h>
>+
>+/* Page hotness temperature sources */
>+enum pghot_src {
>+	PGHOT_HW_HINTS,
>+	PGHOT_PGTABLE_SCAN,
>+};
>+
>+#ifdef CONFIG_PGHOT
>+
>+#define KPROMOTED_FREQ_WINDOW	(5 * MSEC_PER_SEC)
>+
>+/* 2 accesses within a window will make the page a promotion candidate */
>+#define KPRMOTED_FREQ_THRESHOLD	2
>+
>+/*
>+ * The following two defines control the number of hash lists
>+ * that are maintained for tracking PFN accesses.
>+ */
>+#define PGHOT_HASH_PCT		50	/* % of lower tier memory pages to track */
>+#define PGHOT_HASH_ENTRIES	1024	/* Number of entries per list, ideal case */
>+
>+/*
>+ * Percentage of hash entries that can reside in heap as migrate-ready
>+ * candidates
>+ */
>+#define PGHOT_HEAP_PCT		25
>+
>+#define KPRMOTED_MIGRATE_BATCH	1024
>+
>+/*
>+ * If target NID isn't available, kpromoted promotes to node 0
>+ * by default.
>+ *
>+ * TODO: Need checks to validate that default node is indeed
>+ * present and is a toptier node.
>+ */
>+#define KPROMOTED_DEFAULT_NODE	0
>+
>+struct pghot_info {
>+	unsigned long pfn;
>+
>+	/*
>+	 * The following are the three fundamental parameters
>+	 * required to track the hotness of page/PFN.
>+	 *
>+	 * TODO:
>+	 * Check if these three can fit into a u32.
>+	 * With 3 bits for frequency (8 most recent accesses),
>+	 * 10 bits for nid (1024 nodes), the remaining 19 bits
>+	 * are available for timestamp.
>+	 */
>+	unsigned long last_update; /* Most recent access time */
>+	int frequency; /* Number of accesses within current window */
>+	int nid; /* Most recent access from this node */
>+
>+	struct hlist_node hnode;
>+	size_t heap_idx; /* Position in max heap for quick retrieval */
>+};
>+
>+struct max_heap {
>+	size_t nr;
>+	size_t size;
>+	struct pghot_info **data;
>+	DECLARE_FLEX_ARRAY(struct pghot_info *, preallocated);
>+};
>+
>+/*
>+ * The wakeup interval of kpromoted threads
>+ */
>+#define KPROMOTE_DELAY	20	/* 20ms */
>+
>+int pghot_record_access(u64 pfn, int nid, int src, unsigned long now);
>+#else
>+static inline int pghot_record_access(u64 pfn, int nid, int src,
>+				      unsigned long now)
>+{
>+	return 0;
>+}
>+#endif /* CONFIG_PGHOT */
>+#endif /* _LINUX_KPROMOTED_H */
>diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
>index 9e15a088ba38..9085e5c2d4aa 100644
>--- a/include/linux/vm_event_item.h
>+++ b/include/linux/vm_event_item.h
>@@ -186,6 +186,15 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
> 		KSTACK_REST,
> #endif
> #endif /* CONFIG_DEBUG_STACK_USAGE */
>+		PGHOT_RECORDED_ACCESSES,
>+		PGHOT_RECORD_HWHINTS,
>+		PGHOT_RECORD_PGTSCANS,
>+		PGHOT_RECORDS_HASH,
>+		PGHOT_RECORDS_HEAP,
>+		KPROMOTED_RIGHT_NODE,
>+		KPROMOTED_NON_LRU,
>+		KPROMOTED_COLD_OLD,
>+		KPROMOTED_DROPPED,
> 		NR_VM_EVENT_ITEMS
> };
>
>diff --git a/mm/Kconfig b/mm/Kconfig
>index e443fe8cd6cf..8b236eb874cf 100644
>--- a/mm/Kconfig
>+++ b/mm/Kconfig
>@@ -1381,6 +1381,17 @@ config PT_RECLAIM
>
> 	  Note: now only empty user PTE page table pages will be reclaimed.
>
>+config PGHOT
>+	bool "Hot page tracking and promotion"
>+	def_bool y
>+	depends on NUMA && MIGRATION && MMU
>+	select MIN_HEAP
>+	help
>+	  A sub-system to track page accesses in lower tier memory and
>+	  maintain hot page information. Promotes hot pages from lower
>+	  tiers to top tier by using the memory access information provided
>+	  by various sources. Asynchronous promotion is done by per-node
>+	  kernel threads.
>
> source "mm/damon/Kconfig"
>
>diff --git a/mm/Makefile b/mm/Makefile
>index ef54aa615d9d..8799bd0c68ed 100644
>--- a/mm/Makefile
>+++ b/mm/Makefile
>@@ -147,3 +147,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> obj-$(CONFIG_EXECMEM) += execmem.o
> obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
> obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
>+obj-$(CONFIG_PGHOT) += kpromoted.o

It looks like an older file name was used here by mistake. It should be pghot.o.

Can you please provide the base commit? I am unable to apply the patch cleanly using the b4 utility.

Regards,
Alok Rathore


>diff --git a/mm/mm_init.c b/mm/mm_init.c
>index 5c21b3af216b..f7992be3ff7f 100644
>--- a/mm/mm_init.c
>+++ b/mm/mm_init.c
>@@ -1402,6 +1402,15 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat)
> static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
> #endif
>
>+#ifdef CONFIG_PGHOT
>+static void pgdat_init_kpromoted(struct pglist_data *pgdat)
>+{
>+	init_waitqueue_head(&pgdat->kpromoted_wait);
>+}
>+#else
>+static void pgdat_init_kpromoted(struct pglist_data *pgdat) {}
>+#endif
>+
> static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
> {
> 	int i;
>@@ -1411,6 +1420,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>
> 	pgdat_init_split_queue(pgdat);
> 	pgdat_init_kcompactd(pgdat);
>+	pgdat_init_kpromoted(pgdat);
>
> 	init_waitqueue_head(&pgdat->kswapd_wait);
> 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
>diff --git a/mm/pghot.c b/mm/pghot.c
>new file mode 100644
>index 000000000000..eadcf970c3ef
>--- /dev/null
>+++ b/mm/pghot.c
>@@ -0,0 +1,501 @@
>+// SPDX-License-Identifier: GPL-2.0
>+/*
>+ * Maintains information about hot pages from slower tier nodes and
>+ * promotes them.
>+ *
>+ * Info about accessed pages are stored in hash lists indexed by PFN.
>+ * Info about pages that are hot enough to be promoted are stored in
>+ * a per-toptier-node max_heap.
>+ *
>+ * kpromoted is a kernel thread that runs on each toptier node and
>+ * promotes pages from max_heap.
>+ *
>+ * TODO:
>+ * - Compact pghot_info so that nid, time and frequency can fit
>+ * - Scalar hotness value as a function of frequency and recency
>+ * - Possibility of moving migration rate limiting to kpromoted
>+ */
>+#include <linux/pghot.h>
>+#include <linux/kthread.h>
>+#include <linux/mmzone.h>
>+#include <linux/migrate.h>
>+#include <linux/memory-tiers.h>
>+#include <linux/slab.h>
>+#include <linux/sched.h>
>+#include <linux/vmalloc.h>
>+#include <linux/hashtable.h>
>+#include <linux/min_heap.h>
>+
>+struct pghot_hash {
>+	struct hlist_head hash;
>+	spinlock_t lock;
>+};
>+
>+static struct pghot_hash *phi_hash;
>+static int phi_hash_order;
>+static int phi_heap_entries;
>+static struct kmem_cache *phi_cache __ro_after_init;
>+static bool kpromoted_started __ro_after_init;
>+
>+static bool phi_heap_less(const void *lhs, const void *rhs, void *args)
>+{
>+	return (*(struct pghot_info **)lhs)->frequency >
>+		(*(struct pghot_info **)rhs)->frequency;
>+}
>+
>+static void phi_heap_swp(void *lhs, void *rhs, void *args)
>+{
>+	struct pghot_info **l = (struct pghot_info **)lhs;
>+	struct pghot_info **r = (struct pghot_info **)rhs;
>+	int lindex = l - (struct pghot_info **)args;
>+	int rindex = r - (struct pghot_info **)args;
>+	struct pghot_info *tmp = *l;
>+
>+	*l = *r;
>+	*r = tmp;
>+
>+	(*l)->heap_idx = lindex;
>+	(*r)->heap_idx = rindex;
>+}
>+
>+static const struct min_heap_callbacks phi_heap_cb = {
>+	.less = phi_heap_less,
>+	.swp = phi_heap_swp,
>+};
>+
>+static void phi_heap_update_entry(struct max_heap *phi_heap, struct pghot_info *phi)
>+{
>+	int orig_idx = phi->heap_idx;
>+
>+	min_heap_sift_up(phi_heap, phi->heap_idx, &phi_heap_cb,
>+			 phi_heap->data);
>+	if (phi_heap->data[phi->heap_idx]->heap_idx == orig_idx)
>+		min_heap_sift_down(phi_heap, phi->heap_idx,
>+				   &phi_heap_cb, phi_heap->data);
>+}
>+
>+static bool phi_heap_insert(struct max_heap *phi_heap, struct pghot_info *phi)
>+{
>+	if (phi_heap->nr >= phi_heap_entries)
>+		return false;
>+
>+	phi->heap_idx = phi_heap->nr;
>+	min_heap_push(phi_heap, &phi, &phi_heap_cb, phi_heap->data);
>+
>+	return true;
>+}
>+
>+static bool phi_is_pfn_hot(struct pghot_info *phi)
>+{
>+	struct page *page = pfn_to_online_page(phi->pfn);
>+	unsigned long now = jiffies;
>+	struct folio *folio;
>+
>+	if (!page || is_zone_device_page(page))
>+		return false;
>+
>+	folio = page_folio(page);
>+	if (!folio_test_lru(folio)) {
>+		count_vm_event(KPROMOTED_NON_LRU);
>+		return false;
>+	}
>+	if (folio_nid(folio) == phi->nid) {
>+		count_vm_event(KPROMOTED_RIGHT_NODE);
>+		return false;
>+	}
>+
>+	/* If the page was hot a while ago, don't promote */
>+	if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
>+		count_vm_event(KPROMOTED_COLD_OLD);
>+		return false;
>+	}
>+	return true;
>+}
>+
>+static struct folio *kpromoted_isolate_folio(struct pghot_info *phi)
>+{
>+	struct page *page = pfn_to_page(phi->pfn);
>+	struct folio *folio;
>+
>+	if (!page)
>+		return NULL;
>+
>+	folio = page_folio(page);
>+	if (migrate_misplaced_folio_prepare(folio, NULL, phi->nid))
>+		return NULL;
>+	else
>+		return folio;
>+}
>+
>+static struct pghot_info *phi_alloc(unsigned long pfn)
>+{
>+	struct pghot_info *phi;
>+
>+	phi = kmem_cache_zalloc(phi_cache, GFP_NOWAIT);
>+	if (!phi)
>+		return NULL;
>+
>+	phi->pfn = pfn;
>+	phi->heap_idx = -1;
>+	return phi;
>+}
>+
>+static inline void phi_free(struct pghot_info *phi)
>+{
>+	kmem_cache_free(phi_cache, phi);
>+}
>+
>+static int phi_heap_extract(pg_data_t *pgdat, int batch_count, int freq_th,
>+			    struct list_head *migrate_list, int *count)
>+{
>+	spinlock_t *phi_heap_lock = &pgdat->heap_lock;
>+	struct max_heap *phi_heap = &pgdat->heap;
>+	int max_retries = 10;
>+	int bkt, i = 0;
>+
>+	if (batch_count < 0 || !migrate_list || !count || freq_th < 1 ||
>+	    freq_th > KPRMOTED_FREQ_THRESHOLD)
>+		return -EINVAL;
>+
>+	*count = 0;
>+	for (i = 0; i < batch_count; i++) {
>+		struct pghot_info *top = NULL;
>+		bool should_continue = false;
>+		struct folio *folio;
>+		int retries = 0;
>+
>+		while (retries < max_retries) {
>+			spin_lock(phi_heap_lock);
>+			if (phi_heap->nr > 0 && phi_heap->data[0]->frequency >= freq_th) {
>+				should_continue = true;
>+				bkt = hash_min(phi_heap->data[0]->pfn, phi_hash_order);
>+				top = phi_heap->data[0];
>+			}
>+			spin_unlock(phi_heap_lock);
>+
>+			if (!should_continue)
>+				goto done;
>+
>+			spin_lock(&phi_hash[bkt].lock);
>+			spin_lock(phi_heap_lock);
>+			if (phi_heap->nr == 0 || phi_heap->data[0] != top ||
>+			    phi_heap->data[0]->frequency < freq_th) {
>+				spin_unlock(phi_heap_lock);
>+				spin_unlock(&phi_hash[bkt].lock);
>+				retries++;
>+				continue;
>+			}
>+
>+			top = phi_heap->data[0];
>+			hlist_del_init(&top->hnode);
>+
>+			phi_heap->nr--;
>+			if (phi_heap->nr > 0) {
>+				phi_heap->data[0] = phi_heap->data[phi_heap->nr];
>+				phi_heap->data[0]->heap_idx = 0;
>+				min_heap_sift_down(phi_heap, 0, &phi_heap_cb,
>+						   phi_heap->data);
>+			}
>+
>+			spin_unlock(phi_heap_lock);
>+			spin_unlock(&phi_hash[bkt].lock);
>+
>+			if (!phi_is_pfn_hot(top)) {
>+				count_vm_event(KPROMOTED_DROPPED);
>+				goto skip;
>+			}
>+
>+			folio = kpromoted_isolate_folio(top);
>+			if (folio) {
>+				list_add(&folio->lru, migrate_list);
>+				(*count)++;
>+			}
>+skip:
>+			phi_free(top);
>+			break;
>+		}
>+		if (retries >= max_retries) {
>+			pr_warn("%s: Too many retries\n", __func__);
>+			break;
>+		}
>+
>+	}
>+done:
>+	return 0;
>+}
>+
>+static void phi_heap_add_or_adjust(struct pghot_info *phi)
>+{
>+	pg_data_t *pgdat = NODE_DATA(phi->nid);
>+	struct max_heap *phi_heap = &pgdat->heap;
>+
>+	spin_lock(&pgdat->heap_lock);
>+	if (phi->heap_idx >= 0 && phi->heap_idx < phi_heap->nr &&
>+	    phi_heap->data[phi->heap_idx] == phi) {
>+		/* Entry exists in heap */
>+		if (phi->frequency < KPRMOTED_FREQ_THRESHOLD) {
>+			/* Below threshold, remove from the heap */
>+			phi_heap->nr--;
>+			if (phi->heap_idx < phi_heap->nr) {
>+				phi_heap->data[phi->heap_idx] =
>+					phi_heap->data[phi_heap->nr];
>+				phi_heap->data[phi->heap_idx]->heap_idx =
>+					phi->heap_idx;
>+				min_heap_sift_down(phi_heap, phi->heap_idx,
>+						   &phi_heap_cb, phi_heap->data);
>+			}
>+			phi->heap_idx = -1;
>+
>+		} else {
>+			/* Update position in heap */
>+			phi_heap_update_entry(phi_heap, phi);
>+		}
>+	} else if (phi->frequency >= KPRMOTED_FREQ_THRESHOLD) {
>+		/* Add to the heap */
>+		if (phi_heap_insert(phi_heap, phi))
>+			count_vm_event(PGHOT_RECORDS_HEAP);
>+	}
>+	spin_unlock(&pgdat->heap_lock);
>+}
>+
>+static struct pghot_info *phi_lookup(unsigned long pfn, int bkt)
>+{
>+	struct pghot_info *phi;
>+
>+	hlist_for_each_entry(phi, &phi_hash[bkt].hash, hnode) {
>+		if (phi->pfn == pfn)
>+			return phi;
>+	}
>+	return NULL;
>+}
>+
>+/*
>+ * Called by subsystems that generate page hotness/access information.
>+ *
>+ *  @pfn: The PFN of the memory accessed
>+ *  @nid: The accessing NUMA node ID
>+ *  @src: The temperature source (sub-system) that generated the
>+ *        access info
>+ *  @time: The access time in jiffies
>+ *
>+ * Maintains the access records per PFN, classifies them as
>+ * hot based on subsequent accesses and finally hands over
>+ * them to kpromoted for migration.
>+ */
>+int pghot_record_access(u64 pfn, int nid, int src, unsigned long now)
>+{
>+	struct pghot_info *phi;
>+	struct page *page;
>+	struct folio *folio;
>+	int bkt;
>+	bool new_entry = false, new_window = false;
>+
>+	if (!kpromoted_started)
>+		return -EINVAL;
>+
>+	count_vm_event(PGHOT_RECORDED_ACCESSES);
>+
>+	switch (src) {
>+	case PGHOT_HW_HINTS:
>+		count_vm_event(PGHOT_RECORD_HWHINTS);
>+		break;
>+	case PGHOT_PGTABLE_SCAN:
>+		count_vm_event(PGHOT_RECORD_PGTSCANS);
>+		break;
>+	default:
>+		return -EINVAL;
>+	}
>+
>+	/*
>+	 * Record only accesses from lower tiers.
>+	 */
>+	if (node_is_toptier(pfn_to_nid(pfn)))
>+		return 0;
>+
>+	/*
>+	 * Reject the non-migratable pages right away.
>+	 */
>+	page = pfn_to_online_page(pfn);
>+	if (!page || is_zone_device_page(page))
>+		return 0;
>+
>+	folio = page_folio(page);
>+	if (!folio_test_lru(folio))
>+		return 0;
>+
>+	bkt = hash_min(pfn, phi_hash_order);
>+	spin_lock(&phi_hash[bkt].lock);
>+	phi = phi_lookup(pfn, bkt);
>+	if (!phi) {
>+		phi = phi_alloc(pfn);
>+		if (!phi)
>+			goto out;
>+		new_entry = true;
>+	}
>+
>+	if (((now - phi->last_update) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) ||
>+	    (nid != NUMA_NO_NODE && phi->nid != nid))
>+		new_window = true;
>+
>+	if (new_entry || new_window) {
>+		/* New window */
>+		phi->frequency = 1; /* TODO: Factor in the history */
>+	} else
>+		phi->frequency++;
>+	phi->last_update = now;
>+	phi->nid = (nid == NUMA_NO_NODE) ? KPROMOTED_DEFAULT_NODE : nid;
>+
>+	if (new_entry) {
>+		/* Insert the new entry into hash table */
>+		hlist_add_head(&phi->hnode, &phi_hash[bkt].hash);
>+		count_vm_event(PGHOT_RECORDS_HASH);
>+	} else {
>+		/* Add/update the position in heap */
>+		phi_heap_add_or_adjust(phi);
>+	}
>+out:
>+	spin_unlock(&phi_hash[bkt].lock);
>+	return 0;
>+}
>+
>+/*
>+ * Extract the hot page records and batch-migrate the
>+ * hot pages.
>+ */
>+static void kpromoted_migrate(pg_data_t *pgdat)
>+{
>+	int count, ret;
>+	LIST_HEAD(migrate_list);
>+
>+	/*
>+	 * Extract the top N elements from the heap that match
>+	 * the requested hotness threshold.
>+	 *
>+	 * PFNs ineligible from migration standpoint are removed
>+	 * from the heap and hash.
>+	 *
>+	 * Folios eligible for migration are isolated and returned
>+	 * in @migrate_list.
>+	 */
>+	ret = phi_heap_extract(pgdat, KPRMOTED_MIGRATE_BATCH,
>+			       KPRMOTED_FREQ_THRESHOLD, &migrate_list, &count);
>+	if (ret)
>+		return;
>+
>+	if (!list_empty(&migrate_list))
>+		migrate_misplaced_folios_batch(&migrate_list, pgdat->node_id);
>+}
>+
>+static int kpromoted(void *p)
>+{
>+	pg_data_t *pgdat = (pg_data_t *)p;
>+
>+	while (!kthread_should_stop()) {
>+		wait_event_timeout(pgdat->kpromoted_wait, false,
>+				   msecs_to_jiffies(KPROMOTE_DELAY));
>+		kpromoted_migrate(pgdat);
>+	}
>+	return 0;
>+}
>+
>+static int kpromoted_run(int nid)
>+{
>+	pg_data_t *pgdat = NODE_DATA(nid);
>+	int ret = 0;
>+
>+	if (!node_is_toptier(nid))
>+		return 0;
>+
>+	if (!pgdat->phi_buf) {
>+		pgdat->phi_buf = vzalloc_node(phi_heap_entries * sizeof(struct pghot_info *),
>+					      nid);
>+		if (!pgdat->phi_buf)
>+			return -ENOMEM;
>+
>+		min_heap_init(&pgdat->heap, pgdat->phi_buf, phi_heap_entries);
>+		spin_lock_init(&pgdat->heap_lock);
>+	}
>+
>+	if (!pgdat->kpromoted)
>+		pgdat->kpromoted = kthread_create_on_node(kpromoted, pgdat, nid,
>+							  "kpromoted%d", nid);
>+	if (IS_ERR(pgdat->kpromoted)) {
>+		ret = PTR_ERR(pgdat->kpromoted);
>+		pgdat->kpromoted = NULL;
>+		pr_info("Failed to start kpromoted%d, ret %d\n", nid, ret);
>+	} else {
>+		wake_up_process(pgdat->kpromoted);
>+	}
>+	return ret;
>+}
>+
>+static int __init pghot_init(void)
>+{
>+	unsigned int hash_size;
>+	size_t hash_entries;
>+	size_t nr_pages = 0;
>+	pg_data_t *pgdat;
>+	int i, nid, ret;
>+
>+	/*
>+	 * Arrive at the hash and heap sizes based on the
>+	 * number of pages present in the lower tier nodes.
>+	 */
>+	for_each_node_state(nid, N_MEMORY) {
>+		if (!node_is_toptier(nid))
>+			nr_pages += NODE_DATA(nid)->node_present_pages;
>+	}
>+
>+	if (!nr_pages)
>+		return 0;
>+
>+	hash_entries = nr_pages * PGHOT_HASH_PCT / 100;
>+	hash_size = hash_entries / PGHOT_HASH_ENTRIES;
>+	phi_hash_order = ilog2(hash_size);
>+
>+	phi_hash = vmalloc(sizeof(struct pghot_hash) * hash_size);
>+	if (!phi_hash) {
>+		ret = -ENOMEM;
>+		goto out;
>+	}
>+
>+	for (i = 0; i < hash_size; i++) {
>+		INIT_HLIST_HEAD(&phi_hash[i].hash);
>+		spin_lock_init(&phi_hash[i].lock);
>+	}
>+
>+	phi_cache = KMEM_CACHE(pghot_info, 0);
>+	if (unlikely(!phi_cache)) {
>+		ret = -ENOMEM;
>+		goto out;
>+	}
>+
>+	phi_heap_entries = hash_entries * PGHOT_HEAP_PCT / 100;
>+	for_each_node_state(nid, N_CPU) {
>+		ret = kpromoted_run(nid);
>+		if (ret)
>+			goto out_stop_kthread;
>+	}
>+
>+	kpromoted_started = true;
>+	pr_info("pghot: Started page hotness monitoring and promotion thread\n");
>+	pr_info("pghot: nr_pages %ld hash_size %d hash_entries %ld hash_order %d heap_entries %d\n",
>+	       nr_pages, hash_size, hash_entries, phi_hash_order, phi_heap_entries);
>+	return 0;
>+
>+out_stop_kthread:
>+	for_each_node_state(nid, N_CPU) {
>+		pgdat = NODE_DATA(nid);
>+		if (pgdat->kpromoted) {
>+			kthread_stop(pgdat->kpromoted);
>+			pgdat->kpromoted = NULL;
>+			vfree(pgdat->phi_buf);
>+		}
>+	}
>+out:
>+	kmem_cache_destroy(phi_cache);
>+	vfree(phi_hash);
>+	return ret;
>+}
>+
>+late_initcall(pghot_init)
>diff --git a/mm/vmstat.c b/mm/vmstat.c
>index 71cd1ceba191..9edbdd71c6f7 100644
>--- a/mm/vmstat.c
>+++ b/mm/vmstat.c
>@@ -1496,6 +1496,15 @@ const char * const vmstat_text[] = {
> #endif
> #undef I
> #endif /* CONFIG_VM_EVENT_COUNTERS */
>+	"pghot_recorded_accesses",
>+	"pghot_recorded_hwhints",
>+	"pghot_recorded_pgtscans",
>+	"pghot_records_hash",
>+	"pghot_records_heap",
>+	"kpromoted_right_node",
>+	"kpromoted_non_lru",
>+	"kpromoted_cold_old",
>+	"kpromoted_dropped",
> };
> #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
>
>-- 
>2.34.1
>


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ