Message-ID: <20250326102553.GA12071@noisy.programming.kicks-ass.net>
Date: Wed, 26 Mar 2025 11:25:53 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: "Chen, Yu C" <yu.c.chen@...el.com>
Cc: juri.lelli@...hat.com, vincent.guittot@...aro.org,
dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com,
mgorman@...e.de, vschneid@...hat.com, linux-kernel@...r.kernel.org,
tim.c.chen@...ux.intel.com, tglx@...utronix.de, len.brown@...el.com,
gautham.shenoy@....com, mingo@...nel.org, kprateek.nayak@....com,
yu.chen.surf@...mail.com
Subject: Re: [RFC][PATCH] sched: Cache aware load-balancing

On Wed, Mar 26, 2025 at 10:38:41AM +0100, Peter Zijlstra wrote:
> Nah, the saner thing to do is to preserve the topology averages and look
> at those instead of the per-cpu values.
>
> Eg. have task_cache_work() compute and store averages in the
> sched_domain structure and then use those.

A little something like so perhaps?

This immediately also gives the information required for clusters and
finding the best LLC of a Node and things like that.
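
To make that last point concrete: a consumer wanting, say, the mm's busiest
LLC within a node could then walk the cached per-LLC averages through the
existing sd_llc_shared pointers instead of touching any per-cpu state.
Illustration only, not part of the patch, and the helper name is made up:

/*
 * Illustration only: pick the CPU whose LLC has the highest avg_occ as
 * last aggregated by task_cache_work(), scanning the CPUs of @node.
 * All CPUs of an LLC share the same sched_domain_shared, so siblings
 * just re-read the same value.
 */
static int mm_busiest_llc_cpu(int node)
{
	unsigned long best_occ = 0;
	int best_cpu = -1;
	int cpu;

	guard(rcu)();
	for_each_cpu(cpu, cpumask_of_node(node)) {
		struct sched_domain_shared *sds =
			rcu_dereference(per_cpu(sd_llc_shared, cpu));

		if (!sds)
			continue;

		if (sds->avg_occ > best_occ) {
			best_occ = sds->avg_occ;
			best_cpu = cpu;
		}
	}

	return best_cpu;
}
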
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -82,6 +82,9 @@ struct sched_domain_shared {
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
 	int		nr_idle_scan;
+
+	unsigned long	sum_occ;
+	unsigned long	avg_occ;
 };
 
 struct sched_domain {
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1286,8 +1286,8 @@ static void task_cache_work(struct callback_head *work)
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
 	unsigned long m_a_occ = 0;
-	int cpu, m_a_cpu = -1;
-	cpumask_var_t cpus;
+	int m_a_cpu = -1;
+	int cpu;
 
 	WARN_ON_ONCE(work != &p->cache_work);
 
@@ -1296,46 +1296,46 @@ static void task_cache_work(struct callback_head *work)
 	if (p->flags & PF_EXITING)
 		return;
 
-	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
-		return;
-
 	scoped_guard (cpus_read_lock) {
-		cpumask_copy(cpus, cpu_online_mask);
-		for_each_cpu(cpu, cpus) {
-			/* XXX sched_cluster_active */
-			struct sched_domain *sd = per_cpu(sd_llc, cpu);
-			unsigned long occ, m_occ = 0, a_occ = 0;
-			int m_cpu = -1, nr = 0, i;
+		for_each_online_cpu(cpu) {
+			struct sched_domain *sd;
+			struct sched_domain_shared *sds;
+			unsigned long occ;
+
+			for_each_domain(cpu, sd) {
+				if (!(sd->flags & SD_SHARE_LLC))
+					break;

-			for_each_cpu(i, sched_domain_span(sd)) {
+				sds = sd->shared;
-				occ = fraction_mm_sched(cpu_rq(i),
-							per_cpu_ptr(mm->pcpu_sched, i));
+				occ = fraction_mm_sched(cpu_rq(cpu),
+							per_cpu_ptr(mm->pcpu_sched, cpu));
-				a_occ += occ;
-				if (occ > m_occ) {
-					m_occ = occ;
-					m_cpu = i;
-				}
-				nr++;
-				trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
-					     per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
-			}
-
-			a_occ /= nr;
-			if (a_occ > m_a_occ) {
-				m_a_occ = a_occ;
-				m_a_cpu = m_cpu;
+				sds->sum_occ += occ + 1;
 			}
+		}

-			trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
-				     per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
+		for_each_online_cpu(cpu) {
+			struct sched_domain *sd;
+			struct sched_domain_shared *sds;
+
+			for_each_domain(cpu, sd) {
+				if (!(sd->flags & SD_SHARE_LLC))
+					break;
+
+				sds = sd->shared;
+				if (sds->sum_occ) {
+					sds->avg_occ = (sds->sum_occ - sd->span_weight) /
+						       sd->span_weight;
+					sds->sum_occ = 0;
+				}

-			for_each_cpu(i, sched_domain_span(sd)) {
-				/* XXX threshold ? */
-				per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ;
+				if (sd == per_cpu(sd_llc, cpu)) {
+					if (sds->avg_occ > m_a_occ) {
+						m_a_occ = sds->avg_occ;
+						m_a_cpu = cpu;
+					}
+				}
 			}
-
-			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
 		}
 	}
@@ -1346,8 +1346,6 @@ static void task_cache_work(struct callback_head *work)
 		m_a_cpu = -1;
 
 	mm->mm_sched_cpu = m_a_cpu;
-
-	free_cpumask_var(cpus);
 }
 
 void init_sched_mm(struct task_struct *p)
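
Note on the aggregation arithmetic: the first pass adds occ + 1 per CPU, so
any LLC that got visited ends up with a non-zero sum_occ even when all of
its occupancies are zero. The second pass then treats sum_occ as a
'pending' flag: the first CPU to reach an LLC recomputes avg_occ,
subtracting the span_weight that the +1s added (assuming the whole span is
online) before dividing, and clears sum_occ so the LLC's remaining CPUs
skip it. E.g. a 4-CPU LLC with per-CPU occupancies 3, 1, 0, 0 accumulates
sum_occ = 4 + 4 = 8 and yields avg_occ = (8 - 4) / 4 = 1.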