[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <e78c0d2d-c5bf-41f1-9786-981c60b7b50c@redhat.com>
Date: Tue, 4 Mar 2025 10:17:45 -0500
From: Waiman Long <llong@...hat.com>
To: Juri Lelli <juri.lelli@...hat.com>, linux-kernel@...r.kernel.org,
cgroups@...r.kernel.org
Cc: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
Tejun Heo <tj@...nel.org>, Johannes Weiner <hannes@...xchg.org>,
Michal Koutný <mkoutny@...e.com>,
Qais Yousef <qyousef@...alina.io>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
Swapnil Sapkal <swapnil.sapkal@....com>,
Shrikanth Hegde <sshegde@...ux.ibm.com>, Phil Auld <pauld@...hat.com>,
luca.abeni@...tannapisa.it, tommaso.cucinotta@...tannapisa.it,
Jon Hunter <jonathanh@...dia.com>
Subject: Re: [PATCH 4/5] sched/deadline: Rebuild root domain accounting after
every update
On 3/4/25 3:40 AM, Juri Lelli wrote:
> Rebuilding of root domains accounting information (total_bw) is
> currently broken in some cases, e.g. suspend/resume on aarch64. The problem
> is that the way we keep track of domain changes and try to add bandwidth
> back is convoluted and fragile.
>
> Fix it by simplifying things: make sure bandwidth accounting is cleared
> and completely restored after root domains change (once root domains
> are stable again).
>
> Reported-by: Jon Hunter <jonathanh@...dia.com>
> Fixes: 53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug")
> Signed-off-by: Juri Lelli <juri.lelli@...hat.com>
> ---
> include/linux/sched/deadline.h | 4 ++++
> include/linux/sched/topology.h | 2 ++
> kernel/cgroup/cpuset.c | 16 +++++++++-------
> kernel/sched/deadline.c | 16 ++++++++++------
> kernel/sched/topology.c | 1 +
> 5 files changed, 26 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
> index 6ec578600b24..a780068aa1a5 100644
> --- a/include/linux/sched/deadline.h
> +++ b/include/linux/sched/deadline.h
> @@ -34,6 +34,10 @@ static inline bool dl_time_before(u64 a, u64 b)
> struct root_domain;
> extern void dl_add_task_root_domain(struct task_struct *p);
> extern void dl_clear_root_domain(struct root_domain *rd);
> +extern void dl_clear_root_domain_cpu(int cpu);
> +
> +extern u64 dl_cookie;
> +extern bool dl_bw_visited(int cpu, u64 gen);
>
> #endif /* CONFIG_SMP */
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 7f3dbafe1817..1622232bd08b 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -166,6 +166,8 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
> return to_cpumask(sd->span);
> }
>
> +extern void dl_rebuild_rd_accounting(void);
> +
> extern void partition_sched_domains_locked(int ndoms_new,
> cpumask_var_t doms_new[],
> struct sched_domain_attr *dattr_new);
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index f87526edb2a4..f66b2aefdc04 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -954,10 +954,12 @@ static void dl_update_tasks_root_domain(struct cpuset *cs)
> css_task_iter_end(&it);
> }
>
> -static void dl_rebuild_rd_accounting(void)
> +void dl_rebuild_rd_accounting(void)
> {
> struct cpuset *cs = NULL;
> struct cgroup_subsys_state *pos_css;
> + int cpu;
> + u64 cookie = ++dl_cookie;
>
> lockdep_assert_held(&cpuset_mutex);
> lockdep_assert_cpus_held();
> @@ -965,11 +967,12 @@ static void dl_rebuild_rd_accounting(void)
>
> rcu_read_lock();
>
> - /*
> - * Clear default root domain DL accounting, it will be computed again
> - * if a task belongs to it.
> - */
> - dl_clear_root_domain(&def_root_domain);
> + for_each_possible_cpu(cpu) {
> + if (dl_bw_visited(cpu, cookie))
> + continue;
> +
> + dl_clear_root_domain_cpu(cpu);
> + }
>
> cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
>
> @@ -996,7 +999,6 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
> {
> sched_domains_mutex_lock();
> partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
> - dl_rebuild_rd_accounting();
> sched_domains_mutex_unlock();
> }
With this patch, partition_and_rebuild_sched_domains() is essentially
the same as partition_sched_domains(). We can remove
partition_and_rebuild_sched_domains() and use partition_sched_domains()
directly. Also, we no longer need to expose partition_sched_domains_locked(),
as there are no more callers outside of topology.c.
Cheers,
Longman
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 339434271cba..17b040c92885 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -166,7 +166,7 @@ static inline unsigned long dl_bw_capacity(int i)
> }
> }
>
> -static inline bool dl_bw_visited(int cpu, u64 cookie)
> +bool dl_bw_visited(int cpu, u64 cookie)
> {
> struct root_domain *rd = cpu_rq(cpu)->rd;
>
> @@ -207,7 +207,7 @@ static inline unsigned long dl_bw_capacity(int i)
> return SCHED_CAPACITY_SCALE;
> }
>
> -static inline bool dl_bw_visited(int cpu, u64 cookie)
> +bool dl_bw_visited(int cpu, u64 cookie)
> {
> return false;
> }
> @@ -2981,18 +2981,22 @@ void dl_clear_root_domain(struct root_domain *rd)
> rd->dl_bw.total_bw = 0;
>
> /*
> - * dl_server bandwidth is only restored when CPUs are attached to root
> - * domains (after domains are created or CPUs moved back to the
> - * default root doamin).
> + * dl_servers are not tasks. Since dl_add_task_root_domain ignores
> + * them, we need to account for them here explicitly.
> */
> for_each_cpu(i, rd->span) {
> struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
>
> if (dl_server(dl_se) && cpu_active(i))
> - rd->dl_bw.total_bw += dl_se->dl_bw;
> + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i));
> }
> }
>
> +void dl_clear_root_domain_cpu(int cpu)
> +{
> + dl_clear_root_domain(cpu_rq(cpu)->rd);
> +}
> +
> #endif /* CONFIG_SMP */
>
> static void switched_from_dl(struct rq *rq, struct task_struct *p)
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index b70d6002bb93..bdfda0ef1bd9 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2796,6 +2796,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
> ndoms_cur = ndoms_new;
>
> update_sched_domain_debugfs();
> + dl_rebuild_rd_accounting();
> }
>
> /*
Powered by blists - more mailing lists