[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <fab8eabb-1cfa-9bf6-02af-3afdff3f955d@linux.intel.com>
Date: Fri, 6 Sep 2019 11:30:20 -0700
From: Tim Chen <tim.c.chen@...ux.intel.com>
To: Dario Faggioli <dfaggioli@...e.com>,
Julien Desfossez <jdesfossez@...italocean.com>,
"Li, Aubrey" <aubrey.li@...ux.intel.com>
Cc: Aaron Lu <aaron.lu@...ux.alibaba.com>,
Aubrey Li <aubrey.intel@...il.com>,
Subhra Mazumdar <subhra.mazumdar@...cle.com>,
Vineeth Remanan Pillai <vpillai@...italocean.com>,
Nishanth Aravamudan <naravamudan@...italocean.com>,
Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...nel.org>,
Thomas Gleixner <tglx@...utronix.de>,
Paul Turner <pjt@...gle.com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Linux List Kernel Mailing <linux-kernel@...r.kernel.org>,
Frédéric Weisbecker <fweisbec@...il.com>,
Kees Cook <keescook@...omium.org>,
Greg Kerr <kerrnel@...gle.com>, Phil Auld <pauld@...hat.com>,
Valentin Schneider <valentin.schneider@....com>,
Mel Gorman <mgorman@...hsingularity.net>,
Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>,
Paolo Bonzini <pbonzini@...hat.com>
Subject: Re: [RFC PATCH v3 00/16] Core scheduling v3
On 8/7/19 10:10 AM, Tim Chen wrote:
> 3) Load balancing between CPU cores
> -----------------------------------
> Say if one CPU core's sibling threads get forced idled
> a lot as it has mostly incompatible tasks between the siblings,
> moving the incompatible load to other cores and pulling
> compatible load to the core could help CPU utilization.
>
> So just considering the load of a task is not enough during
> load balancing, task compatibility also needs to be considered.
> Peter has put in mechanisms to balance compatible tasks between
> CPU thread siblings, but not across cores.
>
> Status:
> I have not seen patches on this issue. This issue could lead to
> large variance in workload performance based on your luck
> in placing the workload among the cores.
>
I've made an attempt in the following two patches to address
the load balancing of mismatched load between the siblings.
They apply on top of Aaron's patches:
- sched: Fix incorrect rq tagged as forced idle
- wrapper for cfs_rq->min_vruntime
https://lore.kernel.org/lkml/20190725143127.GB992@aaronlu/
- core vruntime comparison
https://lore.kernel.org/lkml/20190725143248.GC992@aaronlu/
I would love for Julien, Aaron and others to try it out. Suggestions
for tuning it are welcome.
Tim
---
>From c7b91fb26d787d020f0795c3fbec82914889dc67 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@...ux.intel.com>
Date: Wed, 21 Aug 2019 15:48:15 -0700
Subject: [PATCH 1/2] sched: scan core sched load mismatch
When the core scheduler is running, calculate the mismatched
load imbalance on a core while updating the load balance
statistics. This will later guide the load balancer to move
load to another CPU where doing so reduces the mismatched
load.
Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
---
kernel/sched/fair.c | 150 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 149 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 730c9359e9c9..b3d6a6482553 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7507,6 +7507,9 @@ static inline int migrate_degrades_locality(struct task_struct *p,
}
#endif
+static inline s64 core_sched_imbalance_improvement(int src_cpu, int dst_cpu,
+ struct task_struct *p);
+
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
@@ -7970,6 +7973,11 @@ struct sg_lb_stats {
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
+#ifdef CONFIG_SCHED_CORE
+ int imbl_cpu;
+ struct task_group *imbl_tg;
+ s64 imbl_load;
+#endif
};
/*
@@ -8314,6 +8322,145 @@ static bool update_nohz_stats(struct rq *rq, bool force)
#endif
}
+#ifdef CONFIG_SCHED_CORE
+/*
+ * cpu_sibling - find an SMT sibling of @cpu.
+ *
+ * Walks cpu_smt_mask(@cpu) and returns the first CPU in it that is not
+ * @cpu itself, or -1 if the mask contains no other CPU.
+ */
+static inline int cpu_sibling(int cpu)
+{
+	int sibling;
+
+	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+		if (sibling != cpu)
+			return sibling;
+	}
+
+	return -1;
+}
+
+/*
+ * core_sched_imbalance_delta - change in @tg's sibling load mismatch if
+ * @task_load worth of load is moved from @src_cpu to @dst_cpu.
+ *
+ * The mismatch of a core is the absolute difference between the load_avg
+ * of @tg's entity on a CPU and on that CPU's sibling.
+ *
+ * Returns the old mismatch minus the new mismatch, so a positive value
+ * means the move reduces the mismatch.  Returns -1 when @src_cpu equals
+ * @dst_cpu, and, for a move between cores, when the source thread has no
+ * excess over its sibling or the destination thread has no deficit.
+ */
+static inline s64 core_sched_imbalance_delta(int src_cpu, int dst_cpu,
+					     int src_sibling, int dst_sibling,
+					     struct task_group *tg, u64 task_load)
+{
+	struct sched_entity *se, *se_sibling, *dst_se, *dst_se_sibling;
+	s64 excess, deficit, old_mismatch, new_mismatch;
+	s64 load = (s64)task_load;
+
+	if (src_cpu == dst_cpu)
+		return -1;
+
+	/* XXX SMT4 will require additional logic */
+
+	se = tg->se[src_cpu];
+	se_sibling = tg->se[src_sibling];
+
+	/*
+	 * Widen to s64 before subtracting: load_avg is an unsigned type
+	 * (unsigned long in mainline struct sched_avg), so an unsigned
+	 * subtraction would wrap instead of going negative when
+	 * unsigned long is narrower than s64.
+	 */
+	excess = (s64)se->avg.load_avg - (s64)se_sibling->avg.load_avg;
+	if (src_sibling == dst_cpu) {
+		/*
+		 * Move within one core: the source loses the load and its
+		 * sibling gains it, so their difference shifts by twice
+		 * the load.
+		 */
+		old_mismatch = abs(excess);
+		new_mismatch = abs(excess - 2 * load);
+		return old_mismatch - new_mismatch;
+	}
+
+	dst_se = tg->se[dst_cpu];
+	dst_se_sibling = tg->se[dst_sibling];
+	deficit = (s64)dst_se->avg.load_avg - (s64)dst_se_sibling->avg.load_avg;
+
+	/* no mismatch improvement */
+	if (excess <= 0 || deficit >= 0)
+		return -1;
+
+	old_mismatch = abs(excess) + abs(deficit);
+	new_mismatch = abs(excess - load) + abs(deficit + load);
+
+	return old_mismatch - new_mismatch;
+}
+
+/*
+ * core_sched_imbalance_improvement - how much migrating @p from @src_cpu
+ * to @dst_cpu would reduce the sibling load mismatch of @p's task group.
+ *
+ * Returns 0 when @p has no parent entity, when its task group is not
+ * tagged, or when either CPU has no SMT sibling.  Otherwise returns the
+ * result of core_sched_imbalance_delta() for @p's hierarchical load.
+ */
+static inline s64 core_sched_imbalance_improvement(int src_cpu, int dst_cpu,
+						   struct task_struct *p)
+{
+	int src_sibling, dst_sibling;
+	struct task_group *tg;
+
+	if (!p->se.parent)
+		return 0;
+
+	/*
+	 * Use the group @p itself is queued in.  p->se.parent is that
+	 * group's entity, and a group entity's ->cfs_rq is the queue the
+	 * entity sits on, which belongs to the parent group; going through
+	 * it would test the tag and the per-CPU entities of the wrong group.
+	 * NOTE(review): relies on mainline struct sched_entity semantics
+	 * (->cfs_rq vs ->my_q) -- confirm against the target tree.
+	 */
+	tg = p->se.cfs_rq->tg;
+	if (!tg->tagged)
+		return 0;
+
+	/* XXX SMT4 will require additional logic */
+	src_sibling = cpu_sibling(src_cpu);
+	dst_sibling = cpu_sibling(dst_cpu);
+
+	if (src_sibling == -1 || dst_sibling == -1)
+		return 0;
+
+	/* task_h_load() is only evaluated once all the early exits are past */
+	return core_sched_imbalance_delta(src_cpu, dst_cpu,
+					  src_sibling, dst_sibling,
+					  tg, task_h_load(p));
+}
+
+/*
+ * core_sched_imbalance_scan - look on @src_cpu for the tagged task group
+ * whose sibling load mismatch would shrink the most by moving load to
+ * @dst_cpu.
+ *
+ * For every tagged group on @src_cpu's leaf cfs_rq list, the gain of
+ * moving one average task's load (cpu_avg_load_per_task(@src_cpu)) is
+ * estimated with core_sched_imbalance_delta().  A candidate is recorded in
+ * @sgs (imbl_load, imbl_tg, imbl_cpu) only if its gain exceeds both the
+ * gain already recorded in @sgs and that average task load.
+ *
+ * Does nothing when core scheduling is not enabled on either runqueue,
+ * when the two CPUs are the same, when either CPU has no SMT sibling, or
+ * when the average task load on @src_cpu is zero.
+ */
+static inline void core_sched_imbalance_scan(struct sg_lb_stats *sgs,
+					     int src_cpu,
+					     int dst_cpu)
+{
+	struct rq *rq = cpu_rq(src_cpu);
+	struct cfs_rq *cfs_rq, *pos;
+	struct task_group *tg;
+	s64 mismatch, avg_task_load;
+	int src_sibling, dst_sibling;
+
+	if (src_cpu == dst_cpu ||
+	    !sched_core_enabled(rq) ||
+	    !sched_core_enabled(cpu_rq(dst_cpu)))
+		return;
+
+	src_sibling = cpu_sibling(src_cpu);
+	dst_sibling = cpu_sibling(dst_cpu);
+
+	if (src_sibling == -1 || dst_sibling == -1)
+		return;
+
+	/*
+	 * Kept signed so the threshold test below compares s64 with s64;
+	 * a negative delta must never pass it by being converted to a
+	 * large unsigned value.
+	 */
+	avg_task_load = cpu_avg_load_per_task(src_cpu);
+
+	if (avg_task_load == 0)
+		return;
+
+	/*
+	 * Imbalance in tagged task group's load causes forced
+	 * idle time in sibling, that will be counted as mismatched load
+	 * on the forced idled cpu.  Record the source cpu in the sched
+	 * group causing the largest mismatched load.
+	 */
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+		tg = cfs_rq->tg;
+		if (!tg->tagged)
+			continue;
+
+		mismatch = core_sched_imbalance_delta(src_cpu, dst_cpu,
+						      src_sibling, dst_sibling,
+						      tg, avg_task_load);
+
+		if (mismatch > sgs->imbl_load &&
+		    mismatch > avg_task_load) {
+			sgs->imbl_load = mismatch;
+			sgs->imbl_tg = tg;
+			sgs->imbl_cpu = src_cpu;
+		}
+	}
+}
+
+#else
+/*
+ * !CONFIG_SCHED_CORE: no-op stubs.  Both are static inline functions
+ * rather than empty macros so that arguments are still type-checked and
+ * count as used when core scheduling is compiled out.
+ */
+static inline void core_sched_imbalance_scan(struct sg_lb_stats *sgs,
+					     int src_cpu, int dst_cpu)
+{
+}
+
+static inline s64 core_sched_imbalance_improvement(int src_cpu, int dst_cpu,
+						   struct task_struct *p)
+{
+	return 0;
+}
+#endif /* CONFIG_SCHED_CORE */
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8345,7 +8492,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
else
load = source_load(i, load_idx);
- sgs->group_load += load;
+ core_sched_imbalance_scan(sgs, i, env->dst_cpu);
+
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
--
2.20.1
>From a11084f84de9c174f36cf2701ba5bbe1546e45f5 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@...ux.intel.com>
Date: Wed, 28 Aug 2019 11:22:43 -0700
Subject: [PATCH 2/2] sched: load balance core imbalanced load
If moving mismatched core scheduling load can reduce load imbalance
more than regular load balancing, move the mismatched load instead.
On regular load balancing, also skip moving a task that could increase
load mismatch.
Move only one mismatched task at a time to reduce load disturbance.
Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
---
kernel/sched/fair.c | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b3d6a6482553..69939c977797 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7412,6 +7412,11 @@ struct lb_env {
enum fbq_type fbq_type;
enum group_type src_grp_type;
struct list_head tasks;
+#ifdef CONFIG_SCHED_CORE
+ int imbl_cpu;
+ struct task_group *imbl_tg;
+ s64 imbl_load;
+#endif
};
/*
@@ -7560,6 +7565,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0;
}
+#ifdef CONFIG_SCHED_CORE
+ /* Don't migrate if we increase core imbalance */
+ if (core_sched_imbalance_improvement(env->src_cpu, env->dst_cpu, p) < 0)
+ return 0;
+#endif
+
/* Record that we found atleast one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
@@ -8533,6 +8544,14 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_no_capacity = group_is_overloaded(env, sgs);
sgs->group_type = group_classify(group, sgs);
+
+#ifdef CONFIG_SCHED_CORE
+ if (sgs->imbl_load > env->imbl_load) {
+ env->imbl_cpu = sgs->imbl_cpu;
+ env->imbl_tg = sgs->imbl_tg;
+ env->imbl_load = sgs->imbl_load;
+ }
+#endif
}
/**
@@ -9066,6 +9085,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
+#ifdef CONFIG_SCHED_CORE
+ if (env->imbl_load > env->imbalance) {
+ env->imbalance = cpu_avg_load_per_task(env->imbl_cpu);
+ return cpu_rq(env->imbl_cpu);
+ } else {
+ env->imbl_load = 0;
+ }
+#endif
+
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
--
2.20.1
Powered by blists - more mailing lists