[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKfTPtBDA1uZ1xvo_uhqnNVg69DPXXJBeo0+aQdVJiosB_qgrw@mail.gmail.com>
Date: Tue, 29 Mar 2022 11:03:44 +0200
From: Vincent Guittot <vincent.guittot@...aro.org>
To: Tim Chen <tim.c.chen@...ux.intel.com>
Cc: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...e.hu>,
Juri Lelli <juri.lelli@...hat.com>,
Yu Chen <yu.c.chen@...el.com>,
Walter Mack <walter.mack@...el.com>,
Mel Gorman <mgorman@...e.de>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/2] sched/fair: Simple runqueue order on migrate
On Sat, 26 Mar 2022 at 00:52, Tim Chen <tim.c.chen@...ux.intel.com> wrote:
>
> From: "Peter Zijlstra (Intel)" <peterz@...radead.org>
>
> From: Peter Zijlstra (Intel) <peterz@...radead.org>
>
> There's a number of problems with SMP migration of fair tasks, but
> basically it boils down to a task not receiving equal service on each
> runqueue (consider the trivial 3 tasks 2 cpus infeasible weight
> scenario).
>
> Fully solving that with vruntime placement is 'hard', not least
> because a task might be very under-served on a busy runqueue and
> would need to be placed so far left on the new runqueue that it would
> significantly impact latency on the existing tasks.
>
> Instead, do minimal / basic placement; when moving to a less
> busy queue place at the front of the queue to receive time sooner.
> When moving to a busier queue, place at the end of the queue to
> receive time later.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
> Tested-by: Chen Yu <yu.c.chen@...el.com>
> Tested-by: Walter Mack <walter.mack@...el.com>
> ---
> kernel/sched/fair.c | 33 +++++++++++++++++++++++++++++----
> kernel/sched/features.h | 2 ++
> 2 files changed, 31 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 2498e97804fd..c5d2cb3a8f42 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4223,6 +4223,27 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> se->vruntime = max_vruntime(se->vruntime, vruntime);
> }
>
> +static void place_entity_migrate(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> + if (!sched_feat(PLACE_MIGRATE))
> + return;
> +
> + if (cfs_rq->nr_running < se->migrated) {
> + /*
> + * Migrated to a shorter runqueue, go first because
> + * we were under-served on the old runqueue.
> + */
> + se->vruntime = cfs_rq->min_vruntime;
> + return;
> + }
> +
> + /*
> + * Migrated to a longer runqueue, go last because
> + * we got over-served on the old runqueue.
> + */
> + se->vruntime = cfs_rq->min_vruntime + sched_vslice(cfs_rq, se);
> +}
> +
> static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
>
> static inline bool cfs_bandwidth_used(void);
> @@ -4296,6 +4317,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>
> if (flags & ENQUEUE_WAKEUP)
> place_entity(cfs_rq, se, 0);
> + else if (se->migrated)
> + place_entity_migrate(cfs_rq, se);
>
> check_schedstat_required();
> update_stats_enqueue_fair(cfs_rq, se, flags);
> @@ -6930,6 +6953,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
> */
> static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> {
> + struct sched_entity *se = &p->se;
> /*
> * As blocked tasks retain absolute vruntime the migration needs to
> * deal with this by subtracting the old and adding the new
> @@ -6962,7 +6986,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> * rq->lock and can modify state directly.
> */
> lockdep_assert_rq_held(task_rq(p));
> - detach_entity_cfs_rq(&p->se);
> + detach_entity_cfs_rq(se);
>
> } else {
> /*
> @@ -6973,14 +6997,15 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
> * wakee task is less decayed, but giving the wakee more load
> * sounds not bad.
> */
> - remove_entity_load_avg(&p->se);
> + remove_entity_load_avg(se);
> }
>
> /* Tell new CPU we are migrated */
> - p->se.avg.last_update_time = 0;
> + se->avg.last_update_time = 0;
>
> /* We have migrated, no longer consider this task hot */
> - p->se.migrated = 1;
> + for_each_sched_entity(se)
> + se->migrated = READ_ONCE(cfs_rq_of(se)->nr_running) + !se->on_rq;
Why do we need to loop over se? Isn't p->se enough?
>
> update_scan_period(p, new_cpu);
> }
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 1cf435bbcd9c..681c84fd062c 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -100,3 +100,5 @@ SCHED_FEAT(LATENCY_WARN, false)
>
> SCHED_FEAT(ALT_PERIOD, true)
> SCHED_FEAT(BASE_SLICE, true)
> +
> +SCHED_FEAT(PLACE_MIGRATE, true)
> --
> 2.32.0
>
Powered by blists - more mailing lists