Message-ID: <20251003133025.00006f4b@huawei.com>
Date: Fri, 3 Oct 2025 13:30:25 +0100
From: Jonathan Cameron <jonathan.cameron@...wei.com>
To: Bharata B Rao <bharata@....com>, <akpm@...ux-foundation.org>,
<david@...hat.com>
CC: <linux-kernel@...r.kernel.org>, <linux-mm@...ck.org>,
<dave.hansen@...el.com>, <gourry@...rry.net>, <hannes@...xchg.org>,
<mgorman@...hsingularity.net>, <mingo@...hat.com>, <peterz@...radead.org>,
<raghavendra.kt@....com>, <riel@...riel.com>, <rientjes@...gle.com>,
<sj@...nel.org>, <weixugc@...gle.com>, <willy@...radead.org>,
<ying.huang@...ux.alibaba.com>, <ziy@...dia.com>, <dave@...olabs.net>,
<nifan.cxl@...il.com>, <xuezhengchu@...wei.com>, <yiannis@...corp.com>,
<byungchul@...com>, <kinseyho@...gle.com>, <joshua.hahnjy@...il.com>,
<yuanchu@...gle.com>, <balbirs@...dia.com>, <alok.rathore@...sung.com>
Subject: Re: [RFC PATCH v2 7/8] mm: klruscand: use mglru scanning for page
promotion
On Wed, 10 Sep 2025 20:16:52 +0530
Bharata B Rao <bharata@....com> wrote:
> From: Kinsey Ho <kinseyho@...gle.com>
>
> Introduce a new kernel daemon, klruscand, that periodically invokes the
> MGLRU page table walk. It leverages the new callbacks to gather access
> information and forwards it to the pghot hot page tracking sub-system
> for promotion decisions.
>
> This benefits from reusing the existing MGLRU page table walk
> infrastructure, which is optimized with features such as hierarchical
> scanning and bloom filters to reduce CPU overhead.
>
> As an additional optimization to be added in the future, we can tune
> the scan intervals for each memcg.
>
> Signed-off-by: Kinsey Ho <kinseyho@...gle.com>
> Signed-off-by: Yuanchu Xie <yuanchu@...gle.com>
> Signed-off-by: Bharata B Rao <bharata@....com>
> [Reduced the scan interval to 100ms, pfn_t to unsigned long]
Some very minor comments inline. I know even less about the
infrastructure this is using than I do about IBS (and I don't know much
about that ;)
J
> ---
> mm/Kconfig | 8 ++++
> mm/Makefile | 1 +
> mm/klruscand.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 127 insertions(+)
> create mode 100644 mm/klruscand.c
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 8b236eb874cf..6d53c1208729 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1393,6 +1393,14 @@ config PGHOT
> by various sources. Asynchronous promotion is done by per-node
> kernel threads.
>
> +config KLRUSCAND
> + bool "Kernel lower tier access scan daemon"
> + default y
Why default to y? That's very rarely done for new features.
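Unless there's a strong reason, I'd just drop that line entirely, i.e.
(sketch only):

	config KLRUSCAND
		bool "Kernel lower tier access scan daemon"
		depends on PGHOT && LRU_GEN_WALKS_MMU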
> + depends on PGHOT && LRU_GEN_WALKS_MMU
> + help
> + Scan for accesses from lower tiers by invoking MGLRU to perform
> + page table walks.
> diff --git a/mm/klruscand.c b/mm/klruscand.c
> new file mode 100644
> index 000000000000..1a51aab29bd9
> --- /dev/null
> +++ b/mm/klruscand.c
> @@ -0,0 +1,118 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/memcontrol.h>
Probably pick some ordering scheme for includes.
I'm not spotting what is currently used here.
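Plain alphabetical ordering would be one option, e.g. (same set of
headers, untested for hidden ordering dependencies):

#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/memory-tiers.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/module.h>
#include <linux/pghot.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>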
> +#include <linux/kthread.h>
> +#include <linux/module.h>
> +#include <linux/vmalloc.h>
> +#include <linux/random.h>
> +#include <linux/migrate.h>
> +#include <linux/mm_inline.h>
> +#include <linux/slab.h>
> +#include <linux/sched/clock.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/sched/mm.h>
> +#include <linux/sched.h>
> +#include <linux/pghot.h>
> +
> +#include "internal.h"
> +
> +#define KLRUSCAND_INTERVAL_MS 100
> +#define BATCH_SIZE (2 << 16)
> +
> +static struct task_struct *scan_thread;
> +static unsigned long pfn_batch[BATCH_SIZE];
> +static int batch_index;
> +
> +static void flush_cb(void)
> +{
> + int i = 0;
> +
> + for (; i < batch_index; i++) {
> + u64 pfn = pfn_batch[i];
Why dance through types? pfn_batch is unsigned long and it is
cast back to that below.
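i.e. just (untested):

		unsigned long pfn = pfn_batch[i];

		pghot_record_access(pfn, NUMA_NO_NODE,
				    PGHOT_PGTABLE_SCAN, jiffies);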
> +
> + pghot_record_access((unsigned long)pfn, NUMA_NO_NODE,
> + PGHOT_PGTABLE_SCAN, jiffies);
> +
> + if (i % 16 == 0)
No problem with this, but maybe a comment on why 16?
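Even a one-liner would do, e.g. (guessing at the intent here):

		/* Don't hog the CPU: offer to reschedule every 16 entries. */
		if (i % 16 == 0)
			cond_resched();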
> + cond_resched();
> + }
> + batch_index = 0;
> +}
> +
> +static int klruscand_run(void *unused)
> +{
> + struct lru_gen_mm_walk *walk;
> +
> + walk = kzalloc(sizeof(*walk),
> + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
Maybe use __free() magic so we can forget about having to clear this up on exit.
Entirely up to you though as it doesn't simplify the code much in this case.
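For reference, roughly (untested sketch; needs linux/cleanup.h if that
isn't already pulled in via linux/slab.h):

	struct lru_gen_mm_walk *walk __free(kfree) =
		kzalloc(sizeof(*walk),
			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);

	if (!walk)
		return -ENOMEM;

with the kfree() before the final return then dropped.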
> + if (!walk)
> + return -ENOMEM;
> +
> + while (!kthread_should_stop()) {
> + unsigned long next_wake_time;
> + long sleep_time;
> + struct mem_cgroup *memcg;
> + int flags;
> + int nid;
> +
> + next_wake_time = jiffies + msecs_to_jiffies(KLRUSCAND_INTERVAL_MS);
> +
> + for_each_node_state(nid, N_MEMORY) {
> + pg_data_t *pgdat = NODE_DATA(nid);
> + struct reclaim_state rs = { 0 };
> +
> + if (node_is_toptier(nid))
> + continue;
> +
> + rs.mm_walk = walk;
> + set_task_reclaim_state(current, &rs);
> + flags = memalloc_noreclaim_save();
> +
> + memcg = mem_cgroup_iter(NULL, NULL, NULL);
> + do {
> + struct lruvec *lruvec =
> + mem_cgroup_lruvec(memcg, pgdat);
> + unsigned long max_seq =
> + READ_ONCE((lruvec)->lrugen.max_seq);
> +
> + lru_gen_scan_lruvec(lruvec, max_seq,
> + accessed_cb, flush_cb);
> + cond_resched();
> + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
> +
> + memalloc_noreclaim_restore(flags);
> + set_task_reclaim_state(current, NULL);
> + memset(walk, 0, sizeof(*walk));
> + }
> +
> + sleep_time = next_wake_time - jiffies;
> + if (sleep_time > 0 && sleep_time != MAX_SCHEDULE_TIMEOUT)
> + schedule_timeout_idle(sleep_time);
> + }
> + kfree(walk);
> + return 0;
> +}