linux-kernel - Re: [PATCH 5/7 v5] sched/fair: Add push task mechanism for EAS

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAB8ipk-MgPPptz36W0zYx3TGiESyMwoY3Xoh8xw+zubPSJsZMg@mail.gmail.com>
Date: Tue, 15 Apr 2025 10:31:00 +0800
From: Xuewen Yan <xuewen.yan94@...il.com>
To: Vincent Guittot <vincent.guittot@...aro.org>
Cc: mingo@...hat.com, peterz@...radead.org, juri.lelli@...hat.com, 
	dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com, 
	mgorman@...e.de, vschneid@...hat.com, lukasz.luba@....com, 
	rafael.j.wysocki@...el.com, pierre.gondois@....com, 
	linux-kernel@...r.kernel.org, qyousef@...alina.io, hongyan.xia2@....com, 
	christian.loehle@....com, luis.machado@....com, qperret@...gle.com
Subject: Re: [PATCH 5/7 v5] sched/fair: Add push task mechanism for EAS

Hi Vincent,

On Mon, Mar 3, 2025 at 5:06 AM Vincent Guittot
<vincent.guittot@...aro.org> wrote:
>
> EAS is based on wakeup events to efficiently place tasks on the system, but
> there are cases where a task doesn't have wakeup events anymore or at a far
> too low pace. For such situation, we can take advantage of the task being
> put back in the enqueued list to check if it should be pushed on another
> CPU. When the task is alone on the CPU, it's never put back in the enqueued
> list; In this special case, we use the tick to run the check.
>
> Wake up events remain the main way to migrate tasks but we now detect
> situation where a task is stuck on a CPU by checking that its utilization
> is larger than the max available compute capacity (max cpu capacity or
> uclamp max setting)
>
> Signed-off-by: Vincent Guittot <vincent.guittot@...aro.org>
> ---
>  kernel/sched/fair.c  | 220 +++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/sched.h |   2 +
>  2 files changed, 222 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a9b97bbc085f..c3e383b86808 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7051,6 +7051,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>         hrtick_update(rq);
>  }
>
> +static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
>  static void set_next_buddy(struct sched_entity *se);
>
>  /*
> @@ -7081,6 +7082,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>                 h_nr_idle = task_has_idle_policy(p);
>                 if (task_sleep || task_delayed || !se->sched_delayed)
>                         h_nr_runnable = 1;
> +
> +               fair_remove_pushable_task(rq, p);
>         } else {
>                 cfs_rq = group_cfs_rq(se);
>                 slice = cfs_rq_min_slice(cfs_rq);
> @@ -8589,6 +8592,197 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>         return target;
>  }
>
> +static inline bool task_stuck_on_cpu(struct task_struct *p, int cpu)
> +{
> +       unsigned long max_capa, util;
> +
> +       max_capa = min(get_actual_cpu_capacity(cpu),
> +                      uclamp_eff_value(p, UCLAMP_MAX));
> +       util = max(task_util_est(p), task_runnable(p));
> +
> +       /*
> +        * Return true only if the task might not sleep/wakeup because of a low
> +        * compute capacity. Tasks, which wake up regularly, will be handled by
> +        * feec().
> +        */
I am studying this patch series carefully, and I have a couple of
questions about this part.

Do we need to check the task state here? For example:
READ_ONCE(p->__state) != TASK_RUNNING;
I ask because this check is also reached from the tick.

Also, do we need to check sched_delayed here?
I ask because sched_delayed is already checked on the put_prev_task_fair() path.

Thanks!

> +       return (util > max_capa);
> +}
> +
> +static inline bool sched_energy_push_task(struct task_struct *p, struct rq *rq)
> +{
> +       if (p->nr_cpus_allowed == 1)
> +               return false;
> +
> +       if (is_rd_overutilized(rq->rd))
> +               return false;
> +
> +       if (task_stuck_on_cpu(p, cpu_of(rq)))
> +               return true;
> +
> +       return false;
> +}
> +
> +static int active_load_balance_cpu_stop(void *data);
> +
> +static inline void check_pushable_task(struct task_struct *p, struct rq *rq)
> +{
> +       int new_cpu, cpu = cpu_of(rq);
> +
> +       if (!sched_energy_enabled())
> +               return;
> +
> +       if (WARN_ON(!p))
> +               return;
> +
> +       if (WARN_ON(!task_current(rq, p)))
> +               return;
> +
> +       if (is_migration_disabled(p))
> +               return;
> +
> +       /* If there are several task, wait for being put back */
> +       if (rq->nr_running > 1)
> +               return;
> +
> +       if (!sched_energy_push_task(p, rq))
> +               return;
> +
> +       new_cpu = find_energy_efficient_cpu(p, cpu);
> +
> +       if (new_cpu == cpu)
> +               return;
> +
> +       /*
> +        * ->active_balance synchronizes accesses to
> +        * ->active_balance_work.  Once set, it's cleared
> +        * only after active load balance is finished.
> +        */
> +       if (!rq->active_balance) {
> +               rq->active_balance = 1;
> +               rq->push_cpu = new_cpu;
> +       } else
> +               return;
> +
> +       raw_spin_rq_unlock(rq);
> +       stop_one_cpu_nowait(cpu,
> +               active_load_balance_cpu_stop, rq,
> +               &rq->active_balance_work);
> +       raw_spin_rq_lock(rq);
> +}
> +
> +static inline int has_pushable_tasks(struct rq *rq)
> +{
> +       return !plist_head_empty(&rq->cfs.pushable_tasks);
> +}
> +
> +static struct task_struct *pick_next_pushable_fair_task(struct rq *rq)
> +{
> +       struct task_struct *p;
> +
> +       if (!has_pushable_tasks(rq))
> +               return NULL;
> +
> +       p = plist_first_entry(&rq->cfs.pushable_tasks,
> +                             struct task_struct, pushable_tasks);
> +
> +       WARN_ON_ONCE(rq->cpu != task_cpu(p));
> +       WARN_ON_ONCE(task_current(rq, p));
> +       WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
> +       WARN_ON_ONCE(!task_on_rq_queued(p));
> +
> +       /*
> +        * Remove task from the pushable list as we try only once after that
> +        * the task has been put back in enqueued list.
> +        */
> +       plist_del(&p->pushable_tasks, &rq->cfs.pushable_tasks);
> +
> +       return p;
> +}
> +
> +/*
> + * See if the non running fair tasks on this rq can be sent on other CPUs
> + * that fits better with their profile.
> + */
> +static bool push_fair_task(struct rq *rq)
> +{
> +       struct task_struct *next_task;
> +       int prev_cpu, new_cpu;
> +       struct rq *new_rq;
> +
> +       next_task = pick_next_pushable_fair_task(rq);
> +       if (!next_task)
> +               return false;
> +
> +       if (is_migration_disabled(next_task))
> +               return true;
> +
> +       /* We might release rq lock */
> +       get_task_struct(next_task);
> +
> +       prev_cpu = rq->cpu;
> +
> +       new_cpu = find_energy_efficient_cpu(next_task, prev_cpu);
> +
> +       if (new_cpu == prev_cpu)
> +               goto out;
> +
> +       new_rq = cpu_rq(new_cpu);
> +
> +       if (double_lock_balance(rq, new_rq)) {
> +               /* The task has already migrated in between */
> +               if (task_cpu(next_task) != rq->cpu) {
> +                       double_unlock_balance(rq, new_rq);
> +                       goto out;
> +               }
> +
> +               deactivate_task(rq, next_task, 0);
> +               set_task_cpu(next_task, new_cpu);
> +               activate_task(new_rq, next_task, 0);
> +
> +               resched_curr(new_rq);
> +
> +               double_unlock_balance(rq, new_rq);
> +       }
> +
> +out:
> +       put_task_struct(next_task);
> +
> +       return true;
> +}
> +
> +static void push_fair_tasks(struct rq *rq)
> +{
> +       /* push_fair_task() will return true if it moved a fair task */
> +       while (push_fair_task(rq))
> +               ;
> +}
> +
> +static DEFINE_PER_CPU(struct balance_callback, fair_push_head);
> +
> +static inline void fair_queue_pushable_tasks(struct rq *rq)
> +{
> +       if (!sched_energy_enabled() || !has_pushable_tasks(rq))
> +               return;
> +
> +       queue_balance_callback(rq, &per_cpu(fair_push_head, rq->cpu), push_fair_tasks);
> +}
> +static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p)
> +{
> +       if (sched_energy_enabled())
> +               plist_del(&p->pushable_tasks, &rq->cfs.pushable_tasks);
> +}
> +
> +static void fair_add_pushable_task(struct rq *rq, struct task_struct *p)
> +{
> +       if (sched_energy_enabled() && task_on_rq_queued(p) && !p->se.sched_delayed) {
> +               if (sched_energy_push_task(p, rq)) {
> +                       plist_del(&p->pushable_tasks, &rq->cfs.pushable_tasks);
> +                       plist_node_init(&p->pushable_tasks, p->prio);
> +                       plist_add(&p->pushable_tasks, &rq->cfs.pushable_tasks);
> +               }
> +       }
> +}
> +
>  /*
>   * select_task_rq_fair: Select target runqueue for the waking task in domains
>   * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -8758,6 +8952,10 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>         return sched_balance_newidle(rq, rf) != 0;
>  }
>  #else
> +static inline void check_pushable_task(struct task_struct *p, struct rq *rq) {}
> +static inline void fair_queue_pushable_tasks(struct rq *rq) {}
> +static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p) {}
> +static inline void fair_add_pushable_task(struct rq *rq, struct task_struct *p) {}
>  static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
>  #endif /* CONFIG_SMP */
>
> @@ -8947,6 +9145,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
>                 put_prev_entity(cfs_rq, pse);
>                 set_next_entity(cfs_rq, se);
>
> +               /*
> +                * The previous task might be eligible for being pushed on
> +                * another cpu if it is still active.
> +                */
> +               fair_add_pushable_task(rq, prev);
> +
>                 __set_next_task_fair(rq, p, true);
>         }
>
> @@ -9019,6 +9223,13 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t
>                 cfs_rq = cfs_rq_of(se);
>                 put_prev_entity(cfs_rq, se);
>         }
> +
> +       /*
> +        * The previous task might be eligible for being pushed on another cpu
> +        * if it is still active.
> +        */
> +       fair_add_pushable_task(rq, prev);
> +
>  }
>
>  /*
> @@ -13151,6 +13362,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>         if (static_branch_unlikely(&sched_numa_balancing))
>                 task_tick_numa(rq, curr);
>
> +       check_pushable_task(curr, rq);
>         update_misfit_status(curr, rq);
>         check_update_overutilized_status(task_rq(curr));
>
> @@ -13303,6 +13515,8 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
>  {
>         struct sched_entity *se = &p->se;
>
> +       fair_remove_pushable_task(rq, p);
> +
>  #ifdef CONFIG_SMP
>         if (task_on_rq_queued(p)) {
>                 /*
> @@ -13320,6 +13534,11 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
>         if (hrtick_enabled_fair(rq))
>                 hrtick_start_fair(rq, p);
>
> +       /*
> +        * Try to push prev task before checking misfit for next task as
> +        * the migration of prev can make next fitting the CPU
> +        */
> +       fair_queue_pushable_tasks(rq);
>         update_misfit_status(p, rq);
>         sched_fair_update_stop_tick(rq, p);
>  }
> @@ -13350,6 +13569,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
>         cfs_rq->tasks_timeline = RB_ROOT_CACHED;
>         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
>  #ifdef CONFIG_SMP
> +       plist_head_init(&cfs_rq->pushable_tasks);
>         raw_spin_lock_init(&cfs_rq->removed.lock);
>  #endif
>  }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index ab16d3d0e51c..2db198dccf21 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -722,6 +722,8 @@ struct cfs_rq {
>         struct list_head        leaf_cfs_rq_list;
>         struct task_group       *tg;    /* group that "owns" this runqueue */
>
> +       struct plist_head       pushable_tasks;
> +
>         /* Locally cached copy of our task_group's idle value */
>         int                     idle;
>
> --
> 2.43.0
>
>