Message-Id: <1689842053-5291-2-git-send-email-Kenan.Liu@linux.alibaba.com>
Date:   Thu, 20 Jul 2023 16:34:12 +0800
From:   "Kenan.Liu" <Kenan.Liu@...ux.alibaba.com>
To:     mingo@...hat.com, peterz@...radead.org, juri.lelli@...hat.com,
        vincent.guittot@...aro.org, dietmar.eggemann@....com,
        rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de,
        bristot@...hat.com, vschneid@...hat.com
Cc:     luoben@...ux.alibaba.com, linux-kernel@...r.kernel.org
Subject: [RFC PATCH 1/2] sched/fair: Adjust CFS load balancing for machines with QEMU native CPU topology.

From: "Kenan.Liu" <Kenan.Liu@...ux.alibaba.com>

Multithreading workloads running in a VM under QEMU may encounter an
unexpected phenomenon: one hyperthread of a physical core is busy while
its sibling is idle. The main reason is that the hyperthread index is
consecutive in the QEMU native x86 CPU model, which differs from the
physical topology. With the current kernel scheduler implementation,
the hyperthread with an even ID number is picked with a much higher
probability during load balancing and load placement. To fix this
imbalance, on a machine with multiple cores whose hyperthread indexes
are consecutive within each core, change the result of
select_idle_core() according to the hyperthread on which the task ran
before.
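
As an illustration (not part of the patch), the following minimal
userspace sketch checks the same condition the patch detects in-kernel:
whether CPU0's SMT sibling has a consecutive ID. It assumes the usual
Linux sysfs topology layout, and the labels "consecutive" vs. "offset"
sibling numbering are only an illustrative interpretation of the two
layouts discussed above.

/* sketch: detect consecutive SMT sibling numbering from sysfs */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	FILE *f = fopen("/sys/devices/system/cpu/cpu0/topology/thread_siblings_list", "r");
	char buf[64];
	int first, second;

	if (!f) {
		perror("open topology");
		return 1;
	}
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		fprintf(stderr, "read topology failed\n");
		return 1;
	}
	fclose(f);

	/* Typical formats: "0-1" or "0,1" (consecutive), "0,8" (offset). */
	if (sscanf(buf, "%d-%d", &first, &second) == 2 ||
	    sscanf(buf, "%d,%d", &first, &second) == 2)
		printf("cpu0 sibling is %d: %s sibling numbering\n", second,
		       abs(second - first) == 1 ? "consecutive" : "offset");
	else
		printf("cpu0 has no SMT sibling or unexpected format: %s", buf);

	return 0;
}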

Signed-off-by: Kenan.Liu <Kenan.Liu@...ux.alibaba.com>
Signed-off-by: Ben Luo <luoben@...ux.alibaba.com>
---
 kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a80a739..ad7c93f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -125,6 +125,9 @@
 static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
+static bool smt_neighbour_topo;
+static bool core_smt_topo_detect;
+static unsigned int smt_nr_cpu = 2;
 
 int sched_thermal_decay_shift;
 static int __init setup_sched_thermal_decay_shift(char *str)
@@ -140,6 +143,26 @@ static int __init setup_sched_thermal_decay_shift(char *str)
 __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
 
 #ifdef CONFIG_SMP
+static void explore_core_smp_topology(void)
+{
+	int cpu = smp_processor_id(), sibling;
+	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+	if (nr_cpu_ids <= 2)
+		return;
+
+	smt_nr_cpu = cpumask_weight(smt_mask);
+	if (smt_nr_cpu < 2)
+		return;
+
+	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+		if (cpu == sibling)
+			continue;
+		if (abs(cpu - sibling) == 1)
+			smt_neighbour_topo = true;
+	}
+}
+
 /*
  * For asym packing, by default the lower numbered CPU has higher priority.
  */
@@ -6887,9 +6910,16 @@ void __update_idle_core(struct rq *rq)
 static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
 {
 	bool idle = true;
-	int cpu;
+	int cpu, sibling = core;
+
+	if (!core_smt_topo_detect) {
+		explore_core_smp_topology();
+		core_smt_topo_detect = true;
+	}
 
 	for_each_cpu(cpu, cpu_smt_mask(core)) {
+		if (cpu != core)
+			sibling = cpu;
 		if (!available_idle_cpu(cpu)) {
 			idle = false;
 			if (*idle_cpu == -1) {
@@ -6905,8 +6935,12 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
 			*idle_cpu = cpu;
 	}
 
-	if (idle)
+	if (idle) {
+		if (!smt_neighbour_topo || unlikely(core % smt_nr_cpu))
+			return core;
+		core = task_cpu(p) % smt_nr_cpu ? core : sibling;
 		return core;
+	}
 
 	cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
 	return -1;
-- 
1.8.3.1
