[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <b62804cb-2b60-a534-5096-56785a1940bd@bytedance.com>
Date: Wed, 10 Aug 2022 12:02:05 +0800
From: Abel Wu <wuyun.abel@...edance.com>
To: zhangsong <zhangsong34@...wei.com>, mingo@...hat.com,
peterz@...radead.org, juri.lelli@...hat.com,
vincent.guittot@...aro.org
Cc: dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com,
mgorman@...e.de, bristot@...hat.com, vschneid@...hat.com,
linux-kernel@...r.kernel.org, kernel test robot <lkp@...el.com>
Subject: Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce
interference from IDLE tasks
Hi Zhang Song,
On 8/10/22 9:56 AM, zhangsong Wrote:
> For co-location with NORMAL and IDLE tasks, when CFS trigger load balance,
> it is reasonable to prefer migrating NORMAL(Latency Sensitive) tasks from
> the busy src CPU to dst CPU, and migrating IDLE tasks lastly.
Considering the large weight difference between normal and idle tasks,
does the re-ordering really change things? It would be helpful if you
can offer more detailed info.
>
> This is very important for reducing interference from IDLE tasks.
> So the CFS load balance can be optimized to below:
>
> 1.`cfs_tasks` list of CPU rq is owned by NORMAL tasks.
> 2.`cfs_idle_tasks` list of CPU rq which is owned by IDLE tasks.
> 3.Prefer to migrate NORMAL tasks of cfs_tasks to dst CPU.
> 4.Lastly migrate IDLE tasks of cfs_idle_tasks to dst CPU.
>
> This was tested with the following reproduction:
> - small number of NORMAL tasks colocated with a large number of IDLE tasks
>
> With this patch, NORMAL tasks latency can be reduced
> about 5~10% compared with current.
>
> Signed-off-by: zhangsong <zhangsong34@...wei.com>
> Reported-by: kernel test robot <lkp@...el.com>
The Reported-by tag is usually used for reporting a bug in the mainline
kernel, and a build error in your patch is not one of them :)
> ---
> V1->V2:
> - fix build test error
> ---
> kernel/sched/core.c | 1 +
> kernel/sched/fair.c | 45 ++++++++++++++++++++++++++++++++++++++++----
> kernel/sched/sched.h | 1 +
> 3 files changed, 43 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index ee28253c9ac0..7325c6e552d8 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -9733,6 +9733,7 @@ void __init sched_init(void)
> rq->max_idle_balance_cost = sysctl_sched_migration_cost;
>
> INIT_LIST_HEAD(&rq->cfs_tasks);
> + INIT_LIST_HEAD(&rq->cfs_idle_tasks);
>
> rq_attach_root(rq, &def_root_domain);
> #ifdef CONFIG_NO_HZ_COMMON
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 914096c5b1ae..b62bec5b1eb9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3034,6 +3034,21 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
>
> #endif /* CONFIG_NUMA_BALANCING */
>
> +#ifdef CONFIG_SMP
> +static void
> +adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
> + struct rq *rq,
> + struct sched_entity *se)
> +{
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> + if (task_has_idle_policy(task_of(se)) || tg_is_idle(cfs_rq->tg))
tg_is_idle() only checks the given task group and does not take its
parent task groups into account, while rq->cfs{,_idle}_tasks are
rq-wide. Say A->B where tgA is idle and tgB isn't: a task from B will
be added to the non-idle list. Is this what you want?
> + (*list_op)(&se->group_node, &rq->cfs_idle_tasks);
> + else
> + (*list_op)(&se->group_node, &rq->cfs_tasks);
> +}
> +#endif
> +
> static void
> account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> @@ -3043,7 +3058,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> struct rq *rq = rq_of(cfs_rq);
>
> account_numa_enqueue(rq, task_of(se));
> - list_add(&se->group_node, &rq->cfs_tasks);
> + adjust_rq_cfs_tasks(list_add, rq, se);
> }
> #endif
> cfs_rq->nr_running++;
> @@ -7465,7 +7480,7 @@ done: __maybe_unused;
> * the list, so our cfs_tasks list becomes MRU
> * one.
> */
> - list_move(&p->se.group_node, &rq->cfs_tasks);
> + adjust_rq_cfs_tasks(list_move, rq, &p->se);
> #endif
>
> if (hrtick_enabled_fair(rq))
> @@ -7788,6 +7803,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> if (unlikely(task_has_idle_policy(p)))
> return 0;
>
> + if (tg_is_idle(cfs_rq_of(&p->se)->tg))
> + return 0;
> +
Same as above. But I am not sure this is the right way to do it. We
still want to maintain policy behavior inside an idle task group.
> /* SMT siblings share cache */
> if (env->sd->flags & SD_SHARE_CPUCAPACITY)
> return 0;
> @@ -7800,6 +7818,11 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> &p->se == cfs_rq_of(&p->se)->last))
> return 1;
>
> + /* Preempt sched idle cpu do not consider migration cost */
> + if (cpus_share_cache(env->src_cpu, env->dst_cpu) &&
> + sched_idle_cpu(env->dst_cpu))
> + return 0;
> +
> if (sysctl_sched_migration_cost == -1)
> return 1;
>
> @@ -7990,11 +8013,14 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
> static struct task_struct *detach_one_task(struct lb_env *env)
> {
> struct task_struct *p;
> + struct list_head *tasks = &env->src_rq->cfs_tasks;
> + int loop = 0;
Maybe a boolean variable is enough (and more readable)?
Thanks,
Abel
>
> lockdep_assert_rq_held(env->src_rq);
>
> +again:
> list_for_each_entry_reverse(p,
> - &env->src_rq->cfs_tasks, se.group_node) {
> + tasks, se.group_node) {
> if (!can_migrate_task(p, env))
> continue;
>
> @@ -8009,6 +8035,10 @@ static struct task_struct *detach_one_task(struct lb_env *env)
> schedstat_inc(env->sd->lb_gained[env->idle]);
> return p;
> }
> + if (++loop == 1) {
> + tasks = &env->src_rq->cfs_idle_tasks;
> + goto again;
> + }
> return NULL;
> }
>
> @@ -8026,6 +8056,7 @@ static int detach_tasks(struct lb_env *env)
> unsigned long util, load;
> struct task_struct *p;
> int detached = 0;
> + int loop = 0;
>
> lockdep_assert_rq_held(env->src_rq);
>
> @@ -8041,6 +8072,7 @@ static int detach_tasks(struct lb_env *env)
> if (env->imbalance <= 0)
> return 0;
>
> +again:
> while (!list_empty(tasks)) {
> /*
> * We don't want to steal all, otherwise we may be treated likewise,
> @@ -8142,6 +8174,11 @@ static int detach_tasks(struct lb_env *env)
> list_move(&p->se.group_node, tasks);
> }
>
> + if (env->imbalance > 0 && ++loop == 1) {
> + tasks = &env->src_rq->cfs_idle_tasks;
> + goto again;
> + }
> +
> /*
> * Right now, this is one of only two places we collect this stat
> * so we can safely collect detach_one_task() stats here rather
> @@ -11643,7 +11680,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
> * Move the next running task to the front of the list, so our
> * cfs_tasks list becomes MRU one.
> */
> - list_move(&se->group_node, &rq->cfs_tasks);
> + adjust_rq_cfs_tasks(list_move, rq, se);
> }
> #endif
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e26688d387ae..accb4eea9769 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1068,6 +1068,7 @@ struct rq {
> int online;
>
> struct list_head cfs_tasks;
> + struct list_head cfs_idle_tasks;
>
> struct sched_avg avg_rt;
> struct sched_avg avg_dl;
Powered by blists - more mailing lists