Message-ID: <SL2PR06MB30828CF9FF2879AFC9DC53D2BD0C9@SL2PR06MB3082.apcprd06.prod.outlook.com>
Date: Fri, 11 Mar 2022 02:03:46 +0000
From: 王擎 <wangqing@...o.com>
To: Vincent Guittot <vincent.guittot@...aro.org>
CC: Catalin Marinas <Catalin.Marinas@....com>,
Will Deacon <will@...nel.org>,
Sudeep Holla <sudeep.holla@....com>,
Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
"Rafael J. Wysocki" <rafael@...nel.org>,
Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Daniel Bristot de Oliveira <bristot@...hat.com>,
"linux-arm-kernel@...ts.infradead.org"
<linux-arm-kernel@...ts.infradead.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: RE: [PATCH] sched: topology: make cache topology separate from cpu
topology
>On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@...o.com> wrote:
>>
>> From: Wang Qing <wangqing@...o.com>
>>
>> On some architectures (e.g. ARM64), caches are organized as shown below:
>> cluster:               ****** cluster 0 ******     ****** cluster 1 ******
>> core:                    0     1     2     3         4     5     6     7
(add cache level 1)        c0    c1    c2    c3        c4    c5    c6    c7
>> cache(Leveln):         **cache0**  **cache1**      **cache2**  **cache3**
(add cache level 3)       **************** shared level 3 cache ****************
>> sd_llc_id(current):      0     0     0     0         4     4     4     4
>> sd_llc_id(should be):    0     0     2     2         4     4     6     6
>>
Here, n is always 2 on ARM64, but other values are also possible.
core[0,1] form a complex (ARMv9) which shares an L2 cache; core[2,3] is the same.
>> Caches and CPUs have different topologies; this causes cpus_share_cache()
>> to return the wrong value, which affects CPU load balancing.
>>
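To make the effect concrete, here is a minimal user-space sketch (not
kernel code; the llc_id tables are hard-coded from the diagram above) of
what cpus_share_cache() effectively computes from sd_llc_id:

#include <stdio.h>
#include <stdbool.h>

/* sd_llc_id as derived today (each whole cluster counted as sharing)... */
static const int llc_id_current[8]  = { 0, 0, 0, 0, 4, 4, 4, 4 };
/* ...and as it should be if the L2 complexes were respected. */
static const int llc_id_expected[8] = { 0, 0, 2, 2, 4, 4, 6, 6 };

/* Simplified model of the kernel's cpus_share_cache(): two CPUs share
 * cache iff they resolve to the same last-level-cache id. */
static bool share_cache(const int *llc_id, int a, int b)
{
	return llc_id[a] == llc_id[b];
}

int main(void)
{
	/* CPUs 1 and 2 sit in the same cluster but in different L2
	 * complexes: today the scheduler wrongly sees a shared cache. */
	printf("current:  cpus_share_cache(1, 2) = %d\n",
	       share_cache(llc_id_current, 1, 2));	/* 1 (wrong)   */
	printf("expected: cpus_share_cache(1, 2) = %d\n",
	       share_cache(llc_id_expected, 1, 2));	/* 0 (correct) */
	return 0;
}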
>What does your current scheduler topology look like?
>
>For CPUs 0 to 3, do you have the below?
>DIE [0-3] [4-7]
>MC  [0] [1] [2] [3]
The current scheduler topology is consistent with the CPU topology:
DIE [0-7]
MC [0-3] [4-7] (SD_SHARE_PKG_RESOURCES)
Most Android phones have this topology.
>
>But you would like something like below for CPUs 0-1 instead?
>DIE [0-3] [4-7]
>CLS [0-1] [2-3]
>MC  [0] [1]
>
>with SD_SHARE_PKG_RESOURCES only set at the MC level?
We don't change the current scheduler topology; rather, the cache
topology should be kept separate, like below:
[0-7]                     (shared level 3 cache)
[0-1] [2-3] [4-5] [6-7]   (shared level 2 cache)
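
Concretely, once init_cpu_cache_topology() in the patch has walked
"next-level-cache" (assuming each CPU node's first hop is its per-complex
L2 node and the second hop is the shared L3 node), the parsed table for
this example system should satisfy checks like the following hypothetical
self-test, which is not part of the patch:

/* Hypothetical self-check for the example system above: CPUs 0 and 1
 * share their first next-level cache (the L2 complex), CPUs 1 and 2 do
 * not, and every CPU reaches the same level-3 node. */
static void __init check_cache_topology_example(void)
{
	WARN_ON(cache_topology[0][0] != cache_topology[1][0]); /* same L2      */
	WARN_ON(cache_topology[1][0] == cache_topology[2][0]); /* different L2 */
	WARN_ON(cache_topology[0][1] != cache_topology[7][1]); /* shared L3    */
}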
>
>>
>> Cache topology should be separated from CPU topology; it can preferentially
>> be obtained from "next-level-cache" in the DTS.
>>
>> Signed-off-by: Wang Qing <wangqing@...o.com>
>> ---
>> arch/arm64/kernel/smp.c | 1 +
>> drivers/base/arch_topology.c | 23 +++++++++++++++++++++++
>> include/linux/arch_topology.h | 3 +++
>> kernel/sched/topology.c | 33 +++++++++++++++++++++++++++++++--
>> 4 files changed, 58 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
>> index 27df5c1..94cf649
>> --- a/arch/arm64/kernel/smp.c
>> +++ b/arch/arm64/kernel/smp.c
>> @@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>> 	unsigned int this_cpu;
>>
>> 	init_cpu_topology();
>> +	init_cpu_cache_topology();
>>
>> 	this_cpu = smp_processor_id();
>> 	store_cpu_topology(this_cpu);
>> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
>> index 9761541..613213f
>> --- a/drivers/base/arch_topology.c
>> +++ b/drivers/base/arch_topology.c
>> @@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
>> */
>> struct cpu_topology cpu_topology[NR_CPUS];
>> EXPORT_SYMBOL_GPL(cpu_topology);
>> +struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
>
>AFAICT, arch_topology.c is only used by arm/arm64 and riscv, so this is
>not initialized for other archs.
I see, this will be fixed in V2.
Thanks,
Wang
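
One possible shape for such a fix (a sketch only, not necessarily the
actual V2) is a zero-initialized stand-in for architectures that do not
build drivers/base/arch_topology.c, so the per-level loop in set_sd_llc()
breaks immediately and the old behaviour is kept there:

/* Hypothetical fallback for kernel/sched/topology.c: without
 * CONFIG_GENERIC_ARCH_TOPOLOGY there is no parsed cache table, so
 * provide a static (hence NULL-filled) one that makes set_sd_llc()
 * keep the plain first-CPU/weight answer. */
#ifndef CONFIG_GENERIC_ARCH_TOPOLOGY
#define MAX_CPU_CACHE_LEVEL 1
static struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
#endif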
>
>>
>> const struct cpumask *cpu_coregroup_mask(int cpu)
>> {
>> @@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
>> 	else if (of_have_populated_dt() && parse_dt_topology())
>> 		reset_cpu_topology();
>> }
>> +
>> +void __init init_cpu_cache_topology(void)
>> +{
>> +	struct device_node *node_cpu, *node_cache;
>> +	int cpu, level;
>> +
>> +	for_each_possible_cpu(cpu) {
>> +		node_cpu = of_get_cpu_node(cpu, NULL);
>> +		if (!node_cpu)
>> +			continue;
>> +
>> +		level = 0;
>> +		node_cache = node_cpu;
>> +		while (level < MAX_CPU_CACHE_LEVEL) {
>> +			node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
>> +			if (!node_cache)
>> +				break;
>> +			cache_topology[cpu][level++] = node_cache;
>> +		}
>> +		of_node_put(node_cpu);
>> +	}
>> +}
>> #endif
>> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
>> index cce6136b..d37f47d
>> --- a/include/linux/arch_topology.h
>> +++ b/include/linux/arch_topology.h
>> @@ -72,6 +72,8 @@ struct cpu_topology {
>> };
>>
>> #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
>> +#define MAX_CPU_CACHE_LEVEL 7
>> +extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
>> extern struct cpu_topology cpu_topology[NR_CPUS];
>>
>> #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id)
>> @@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
>> #define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling)
>> #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling)
>> void init_cpu_topology(void);
>> +void init_cpu_cache_topology(void);
>> void store_cpu_topology(unsigned int cpuid);
>> const struct cpumask *cpu_coregroup_mask(int cpu);
>> const struct cpumask *cpu_clustergroup_mask(int cpu);
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index d201a70..10850d6
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>>
>> +static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
>> +{
>> +	int cache_level, cpu_id;
>> +	int first, last;
>> +	int id = cpumask_first(sched_domain_span(sd));
>> +	int size = cpumask_weight(sched_domain_span(sd));
>> +
>> +	*first_cpu = id;
>> +	*cpu_num = size;
>> +
>> +	for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
>> +		if (!cache_topology[cpu][cache_level])
>> +			break;
>> +
>> +		first = -1;
>> +		last = id;
>> +		for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
>> +			if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
>> +				if (cpu_id >= id && cpu_id < id + size) {
>> +					first = (first == -1) ? cpu_id : first;
>> +					last = cpu_id;
>> +				} else
>> +					return;
>> +			}
>> +		}
>> +		*first_cpu = first;
>> +		*cpu_num = last - first + 1;
>> +	}
>> +}
>> +
>> static void update_top_cache_domain(int cpu)
>> {
>> 	struct sched_domain_shared *sds = NULL;
>> @@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
>>
>> 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>> 	if (sd) {
>> -		id = cpumask_first(sched_domain_span(sd));
>> -		size = cpumask_weight(sched_domain_span(sd));
>> +		set_sd_llc(cpu, sd, &id, &size);
>
>In the scheduler, we look at the last level with SD_SHARE_PKG_RESOURCES
>to find the CPUs that share memory. It seems that cpu_coregroup_mask()
>doesn't return the correct cpumask in your case, as it returns the full
>cluster instead of a subset.
>
>> 		sds = sd->shared;
>
>sds must stay aligned with id and size, so instead of modifying id and
>size you should return a cpumask that reflects your topology.
>
>> 	}
>>
>> --
>> 2.7.4
>
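For illustration, that suggestion could take roughly the following shape
(a hypothetical, untested sketch): populate the existing
cpu_topology[].llc_sibling mask from the parsed cache nodes, similar to
what ACPI PPTT already does, so that cpu_coregroup_mask() returns the
right subset and id, size and sds are all derived from one span:

/* Hypothetical sketch for drivers/base/arch_topology.c: CPUs that share
 * their first next-level cache node (the L2 complex in the example) are
 * marked as LLC siblings; cpu_coregroup_mask() can then return this
 * subset instead of the full cluster. */
static void __init update_llc_siblings_from_dt(void)
{
	int cpu, other;

	for_each_possible_cpu(cpu) {
		struct device_node *llc = cache_topology[cpu][0];

		if (!llc)
			continue;
		for_each_possible_cpu(other) {
			if (cache_topology[other][0] == llc)
				cpumask_set_cpu(other,
						&cpu_topology[cpu].llc_sibling);
		}
	}
}

With llc_sibling filled in this way, update_top_cache_domain() could keep
taking id and size straight from the sched_domain span, and sd->shared
would stay aligned with both.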