Message-ID: <0d1cc457c6a97178fc68880957757f3c27088f53.camel@linux.intel.com>
Date: Fri, 07 Mar 2025 14:54:10 -0800
From: Tim Chen <tim.c.chen@...ux.intel.com>
To: Chen Yu <yu.c.chen@...el.com>, Ingo Molnar <mingo@...hat.com>, Peter
Zijlstra <peterz@...radead.org>, Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>, Andrew Morton
<akpm@...ux-foundation.org>
Cc: Rik van Riel <riel@...hat.com>, Mel Gorman <mgorman@...e.de>, Johannes
Weiner <hannes@...xchg.org>, Michal Hocko <mhocko@...nel.org>, Roman
Gushchin <roman.gushchin@...ux.dev>, Shakeel Butt <shakeel.butt@...ux.dev>,
Muchun Song <muchun.song@...ux.dev>, "Liam R. Howlett"
<Liam.Howlett@...cle.com>, Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
"Huang, Ying" <ying.huang@...ux.alibaba.com>, Tim Chen
<tim.c.chen@...el.com>, Aubrey Li <aubrey.li@...el.com>, Michael Wang
<yun.wang@...ux.alibaba.com>, Kaiyang Zhao <kaiyang2@...cmu.edu>, David
Rientjes <rientjes@...gle.com>, Raghavendra K T <raghavendra.kt@....com>,
cgroups@...r.kernel.org, linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH 2/3] sched/numa: Introduce per cgroup numa balance
control
On Tue, 2025-02-25 at 22:00 +0800, Chen Yu wrote:
> [Problem Statement]
> Currently, NUMA balancing is configured system-wide. However,
>
>
> A simple example to show how to use per-cgroup NUMA balancing:
>
> Step1
> // Globally enable per-cgroup NUMA balancing mode;
> // every cgroup's NUMA balancing is disabled by default.
> echo 4 > /proc/sys/kernel/numa_balancing
>
Can you add documentation for this additional numa_balancing feature
in Documentation/admin-guide/sysctl/kernel.rst?
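
For example, the mode table in that file could gain a row for the
new bit, along these lines (sketch only, exact wording up to you):

  =	=================================
  0	NUMA_BALANCING_DISABLED
  1	NUMA_BALANCING_NORMAL
  2	NUMA_BALANCING_MEMORY_TIERING
  4	NUMA_BALANCING_CGROUP
  =	=================================

A blurb for the new cpu.numa_load_balance file in
Documentation/admin-guide/cgroup-v2.rst would be useful too.
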
Should you make NUMA_BALANCING_NORMAL and NUMA_BALANCING_CGROUP
mutually exclusive? In other words, should
echo 5 > /proc/sys/kernel/numa_balancing
result in numa_balancing being set to 1? Otherwise
tg_numa_balance_enabled() can return false with the
NUMA_BALANCING_CGROUP bit turned on, even though the
NUMA_BALANCING_NORMAL bit is also on.
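
E.g. something like this in the numa_balancing sysctl write path
(untested sketch; "state" stands for whatever local variable holds
the newly written value):

	/* Let NUMA_BALANCING_NORMAL win when both bits are set,
	 * so "echo 5" behaves like plain numa_balancing = 1.
	 */
	if ((state & NUMA_BALANCING_NORMAL) &&
	    (state & NUMA_BALANCING_CGROUP))
		state &= ~NUMA_BALANCING_CGROUP;
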
Tim
>
> Suggested-by: Tim Chen <tim.c.chen@...el.com>
> Signed-off-by: Chen Yu <yu.c.chen@...el.com>
> ---
> include/linux/sched/sysctl.h | 1 +
> kernel/sched/core.c | 31 +++++++++++++++++++++++++++++++
> kernel/sched/fair.c | 18 ++++++++++++++++++
> kernel/sched/sched.h | 3 +++
> mm/mprotect.c | 5 +++--
> 5 files changed, 56 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index 5a64582b086b..1e4d5a9ddb26 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -22,6 +22,7 @@ enum sched_tunable_scaling {
> #define NUMA_BALANCING_DISABLED 0x0
> #define NUMA_BALANCING_NORMAL 0x1
> #define NUMA_BALANCING_MEMORY_TIERING 0x2
> +#define NUMA_BALANCING_CGROUP 0x4
>
> #ifdef CONFIG_NUMA_BALANCING
> extern int sysctl_numa_balancing_mode;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 44efc725054a..f4f048b3da68 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10023,6 +10023,30 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
> }
> #endif
>
> +#ifdef CONFIG_NUMA_BALANCING
> +static DEFINE_MUTEX(numa_balance_mutex);
> +static int numa_balance_write_u64(struct cgroup_subsys_state *css,
> + struct cftype *cftype, u64 enable)
> +{
> + struct task_group *tg;
> +
> + guard(mutex)(&numa_balance_mutex);
> + tg = css_tg(css);
> + if (tg->nlb_enabled == enable)
> + return 0;
> +
> + tg->nlb_enabled = enable;
> +
> + return 0;
> +}
> +
> +static u64 numa_balance_read_u64(struct cgroup_subsys_state *css,
> + struct cftype *cft)
> +{
> + return css_tg(css)->nlb_enabled;
> +}
> +#endif /* CONFIG_NUMA_BALANCING */
> +
> static struct cftype cpu_files[] = {
> #ifdef CONFIG_GROUP_SCHED_WEIGHT
> {
> @@ -10071,6 +10096,13 @@ static struct cftype cpu_files[] = {
> .seq_show = cpu_uclamp_max_show,
> .write = cpu_uclamp_max_write,
> },
> +#endif
> +#ifdef CONFIG_NUMA_BALANCING
> + {
> + .name = "numa_load_balance",
> + .read_u64 = numa_balance_read_u64,
> + .write_u64 = numa_balance_write_u64,
> + },
> #endif
> { } /* terminate */
> };
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1c0ef435a7aa..526cb33b007c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3146,6 +3146,18 @@ void task_numa_free(struct task_struct *p, bool final)
> }
> }
>
> +/* Return true if the task group has NUMA balancing enabled */
> +static bool tg_numa_balance_enabled(struct task_struct *p)
> +{
> + struct task_group *tg = task_group(p);
> +
> + if (tg && (sysctl_numa_balancing_mode & NUMA_BALANCING_CGROUP) &&
> + !tg->nlb_enabled)
> + return false;
> +
> + return true;
> +}
> +
> /*
> * Got a PROT_NONE fault for a page on @node.
> */
> @@ -3174,6 +3186,9 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
> !cpupid_valid(last_cpupid)))
> return;
>
> + if (!tg_numa_balance_enabled(p))
> + return;
> +
> /* Allocate buffer to track faults on a per-node basis */
> if (unlikely(!p->numa_faults)) {
> int size = sizeof(*p->numa_faults) *
> @@ -3596,6 +3611,9 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
> if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
> return;
>
> + if (!tg_numa_balance_enabled(curr))
> + return;
> +
> /*
> * Using runtime rather than walltime has the dual advantage that
> * we (mostly) drive the selection from busy threads and that the
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 38e0e323dda2..9f478fb2c03a 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -491,6 +491,9 @@ struct task_group {
> /* Effective clamp values used for a task group */
> struct uclamp_se uclamp[UCLAMP_CNT];
> #endif
> +#ifdef CONFIG_NUMA_BALANCING
> + u64 nlb_enabled;
> +#endif
>
> };
>
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 516b1d847e2c..ddaaf20ef94c 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -155,10 +155,11 @@ static long change_pte_range(struct mmu_gather *tlb,
> toptier = node_is_toptier(nid);
>
> /*
> - * Skip scanning top tier node if normal numa
> + * Skip scanning top tier node if normal/cgroup numa
> * balancing is disabled
> */
> - if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
> + if (!(sysctl_numa_balancing_mode &
> + (NUMA_BALANCING_CGROUP | NUMA_BALANCING_NORMAL)) &&
> toptier)
> continue;
> if (folio_use_access_time(folio))