Message-ID: <1983025922.01766400002783.JavaMail.epsvc@epcpadp1new>
Date: Mon, 22 Dec 2025 15:56:55 +0530
From: Alok Rathore <alok.rathore@...sung.com>
To: Bharata B Rao <bharata@....com>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org,
Jonathan.Cameron@...wei.com, dave.hansen@...el.com, gourry@...rry.net,
mgorman@...hsingularity.net, mingo@...hat.com, peterz@...radead.org,
raghavendra.kt@....com, riel@...riel.com, rientjes@...gle.com,
sj@...nel.org, weixugc@...gle.com, willy@...radead.org,
ying.huang@...ux.alibaba.com, ziy@...dia.com, dave@...olabs.net,
nifan.cxl@...il.com, xuezhengchu@...wei.com, yiannis@...corp.com,
akpm@...ux-foundation.org, david@...hat.com, byungchul@...com,
kinseyho@...gle.com, joshua.hahnjy@...il.com, yuanchu@...gle.com,
balbirs@...dia.com, shivankg@....com, alokrathore20@...il.com,
gost.dev@...sung.com, cpgs@...sung.com
Subject: Re: [RFC PATCH v4 8/9] mm: sched: Move hot page promotion from
NUMAB=2 to pghot tracking
On 06/12/25 03:44PM, Bharata B Rao wrote:
>Currently hot page promotion (NUMA_BALANCING_MEMORY_TIERING
>mode of NUMA Balancing) does hot page detection (via hint faults),
>hot page classification and eventual promotion, all by itself and
>sits within the scheduler.
>
>With the new hot page tracking and promotion mechanism being
>available, NUMA Balancing can limit itself to detection of
>hot pages (via hint faults) and off-load rest of the
>functionality to the common hot page tracking system.
>
>pghot_record_access(PGHOT_HINT_FAULT) API is used to feed the
>hot page info. In addition, the migration rate limiting and
>dynamic threshold logic are moved to kmigrated so that the same
>can be used for hot pages reported by other sources too.
>
>Signed-off-by: Bharata B Rao <bharata@....com>
<snip>
>--- a/mm/pghot.c
>+++ b/mm/pghot.c
>@@ -12,6 +12,9 @@
> * the hot pages. kmigrated runs for each lower tier node. It iterates
> * over the node's PFNs and migrates pages marked for migration into
> * their targeted nodes.
>+ *
>+ * Migration rate-limiting and dynamic threshold logic implementations
>+ * were moved from NUMA Balancing mode 2.
> */
> #include <linux/mm.h>
> #include <linux/migrate.h>
>@@ -25,6 +28,8 @@ static unsigned int pghot_freq_threshold = PGHOT_DEFAULT_FREQ_THRESHOLD;
> static unsigned int kmigrated_sleep_ms = KMIGRATED_DEFAULT_SLEEP_MS;
> static unsigned int kmigrated_batch_nr = KMIGRATED_DEFAULT_BATCH_NR;
>
>+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
>+static unsigned int sysctl_pghot_promote_rate_limit = 65536;
> static unsigned int sysctl_pghot_freq_window = PGHOT_DEFAULT_FREQ_WINDOW;
>
> static DEFINE_STATIC_KEY_FALSE(pghot_src_hwhints);
>@@ -43,6 +48,14 @@ static const struct ctl_table pghot_sysctls[] = {
> .proc_handler = proc_dointvec_minmax,
> .extra1 = SYSCTL_ZERO,
> },
>+ {
>+ .procname = "pghot_promote_rate_limit_MBps",
>+ .data = &sysctl_pghot_promote_rate_limit,
>+ .maxlen = sizeof(unsigned int),
>+ .mode = 0644,
>+ .proc_handler = proc_dointvec_minmax,
>+ .extra1 = SYSCTL_ZERO,
>+ },
> };
> #endif
>
>@@ -137,8 +150,13 @@ int pghot_record_access(unsigned long pfn, int nid, int src, unsigned long now)
> old_freq = (hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK;
> old_time = (hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK;
>
>- if (((time - old_time) > msecs_to_jiffies(sysctl_pghot_freq_window))
>- || (nid != NUMA_NO_NODE && old_nid != nid))
>+ /*
>+ * Bypass the new window logic for NUMA hint fault source
>+ * as it is too slow in reporting accesses.
>+ * TODO: Fix this.
>+ */
>+ if ((((time - old_time) > msecs_to_jiffies(sysctl_pghot_freq_window))
>+ && (src != PGHOT_HINT_FAULT)) || (nid != NUMA_NO_NODE && old_nid != nid))
> new_window = true;
>
> if (new_window)
>@@ -166,6 +184,110 @@ int pghot_record_access(unsigned long pfn, int nid, int src, unsigned long now)
> return 0;
> }
>
>+/*
>+ * For memory tiering mode, if there are enough free pages (more than
>+ * enough watermark defined here) in fast memory node, to take full
>+ * advantage of fast memory capacity, all recently accessed slow
>+ * memory pages will be migrated to fast memory node without
>+ * considering hot threshold.
>+ */
>+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
>+{
>+ int z;
>+ unsigned long enough_wmark;
>+
>+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
>+ pgdat->node_present_pages >> 4);
>+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
>+ struct zone *zone = pgdat->node_zones + z;
>+
>+ if (!populated_zone(zone))
>+ continue;
>+
>+ if (zone_watermark_ok(zone, 0,
>+ promo_wmark_pages(zone) + enough_wmark,
>+ ZONE_MOVABLE, 0))
>+ return true;
>+ }
>+ return false;
>+}
>+
>+/*
>+ * For memory tiering mode, too high promotion/demotion throughput may
>+ * hurt application latency. So we provide a mechanism to rate limit
>+ * the number of pages that are tried to be promoted.
>+ */
>+static bool kmigrated_promotion_rate_limit(struct pglist_data *pgdat, unsigned long rate_limit,
>+ int nr, unsigned long now_ms)
>+{
>+ unsigned long nr_cand;
>+ unsigned int start;
>+
>+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
>+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
>+ start = pgdat->nbp_rl_start;
>+ if (now_ms - start > MSEC_PER_SEC &&
>+ cmpxchg(&pgdat->nbp_rl_start, start, now_ms) == start)
>+ pgdat->nbp_rl_nr_cand = nr_cand;
>+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
>+ return true;
>+ return false;
>+}
>+
>+static void kmigrated_promotion_adjust_threshold(struct pglist_data *pgdat,
>+ unsigned long rate_limit, unsigned int ref_th,
>+ unsigned long now_ms)
>+{
>+ unsigned int start, th_period, unit_th, th;
>+ unsigned long nr_cand, ref_cand, diff_cand;
>+
>+ th_period = KMIGRATED_PROMOTION_THRESHOLD_WINDOW;
>+ start = pgdat->nbp_th_start;
>+ if (now_ms - start > th_period &&
>+ cmpxchg(&pgdat->nbp_th_start, start, now_ms) == start) {
>+ ref_cand = rate_limit *
>+ KMIGRATED_PROMOTION_THRESHOLD_WINDOW / MSEC_PER_SEC;
>+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
>+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
>+ unit_th = ref_th * 2 / KMIGRATED_MIGRATION_ADJUST_STEPS;
>+ th = pgdat->nbp_threshold ? : ref_th;
>+ if (diff_cand > ref_cand * 11 / 10)
>+ th = max(th - unit_th, unit_th);
>+ else if (diff_cand < ref_cand * 9 / 10)
>+ th = min(th + unit_th, ref_th * 2);
>+ pgdat->nbp_th_nr_cand = nr_cand;
>+ pgdat->nbp_threshold = th;
>+ }
>+}
>+
>+static bool kmigrated_should_migrate_memory(unsigned long nr_pages, unsigned long nid,
>+ unsigned long time)
>+{
>+ struct pglist_data *pgdat;
>+ unsigned long rate_limit;
>+ unsigned int th, def_th;
>+ unsigned long now = jiffies;
Shouldn't this be:

	now = jiffies & PGHOT_TIME_MASK;

(see the comment on the threshold check below)
>+ unsigned long now_ms = jiffies_to_msecs(now);
>+
>+ pgdat = NODE_DATA(nid);
>+ if (pgdat_free_space_enough(pgdat)) {
>+ /* workload changed, reset hot threshold */
>+ pgdat->nbp_threshold = 0;
>+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr_pages);
>+ return true;
>+ }
>+
>+ def_th = sysctl_pghot_freq_window;
>+ rate_limit = MB_TO_PAGES(sysctl_pghot_promote_rate_limit);
>+ kmigrated_promotion_adjust_threshold(pgdat, rate_limit, def_th, now_ms);
>+
>+ th = pgdat->nbp_threshold ? : def_th;
>+ if (jiffies_to_msecs(now - time) >= th)
pghot_record_access() stores the access time in the pfn hotness word masked with
PGHOT_TIME_MASK. So 'now' here should also be computed with PGHOT_TIME_MASK,
otherwise the comparison against the masked 'time' read back from the hotness
word can go wrong once jiffies grows beyond the mask.
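Something along these lines (untested, just to illustrate the idea; masking the
subtraction as well is my assumption, to keep the delta within the mask width
if the stored time wraps):

	/* Keep 'now' in the same domain as the time stored by pghot_record_access(). */
	unsigned long now = jiffies & PGHOT_TIME_MASK;
	unsigned long now_ms = jiffies_to_msecs(now);
	...
	th = pgdat->nbp_threshold ? : def_th;
	/* Mask the delta so a wrapped stored time does not produce a huge value. */
	if (jiffies_to_msecs((now - time) & PGHOT_TIME_MASK) >= th)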
Regards,
Alok Rathore