linux-kernel - Re: [PATCH 4/6 v8] sched/fair: Add push task mechanism for fair

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKfTPtA1NHY=aN_KgSRRhjS0KwTvwMqmUwgG+khm2-mvyS=nSw@mail.gmail.com>
Date: Mon, 9 Feb 2026 14:17:43 +0100
From: Vincent Guittot <vincent.guittot@...aro.org>
To: Shrikanth Hegde <sshegde@...ux.ibm.com>
Cc: mingo@...hat.com, peterz@...radead.org, vschneid@...hat.com, 
	juri.lelli@...hat.com, qyousef@...alina.io, hongyan.xia2@....com, 
	christian.loehle@....com, luis.machado@....com, dietmar.eggemann@....com, 
	rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de, 
	linux-kernel@...r.kernel.org, pierre.gondois@....com, kprateek.nayak@....com
Subject: Re: [PATCH 4/6 v8] sched/fair: Add push task mechanism for fair

On Sun, 7 Dec 2025 at 13:13, Shrikanth Hegde <sshegde@...ux.ibm.com> wrote:
>
>
>
> On 12/2/25 11:42 PM, Vincent Guittot wrote:
> > EAS is based on wakeup events to efficiently place tasks on the system, but
> > there are cases where a task doesn't have wakeup events anymore or at a far
> > too low pace. For such situation, we can take advantage of the task being
> > put back in the enqueued list to check if it should be pushed on another
> > CPU.
> > When the task is alone on the CPU, it's never put back in the enqueued
> > list; In this special case, we use the tick to run the check.
> >
> > Add a push task mechanism that enables fair scheduler to push runnable
> > tasks. EAS will be one user but other feature like filling idle CPUs
> > can also take advantage of it.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@...aro.org>
> > ---
> >   kernel/sched/fair.c  | 212 ++++++++++++++++++++++++++++++++++++++++++-
> >   kernel/sched/sched.h |   4 +
> >   2 files changed, 214 insertions(+), 2 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 80c4131fb35b..252254168c92 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -6989,6 +6989,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >       hrtick_update(rq);
> >   }
> >
> > +static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
> > +
> >   /*
> >    * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
> >    * failing half-way through and resume the dequeue later.
> > @@ -7017,6 +7019,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> >               h_nr_idle = task_has_idle_policy(p);
> >               if (task_sleep || task_delayed || !se->sched_delayed)
> >                       h_nr_runnable = 1;
> > +
> > +             fair_remove_pushable_task(rq, p);
> >       }
> >
> >       for_each_sched_entity(se) {
> > @@ -8504,6 +8508,187 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >       return target;
> >   }
> >
> > +DEFINE_STATIC_KEY_FALSE(sched_push_task);
> > +
> > +static inline bool sched_push_task_enabled(void)
> > +{
> > +     return static_branch_unlikely(&sched_push_task);
> > +}
> > +
> > +static bool fair_push_task(struct rq *rq, struct task_struct *p)
> > +{
> > +     return false;
> > +}
> > +
> > +static inline int has_pushable_tasks(struct rq *rq)
> > +{
> > +     return !plist_head_empty(&rq->cfs.pushable_tasks);
> > +}
> > +
> > +static struct task_struct *pick_next_pushable_fair_task(struct rq *rq)
> > +{
> > +     struct task_struct *p;
> > +
> > +     if (!has_pushable_tasks(rq))
> > +             return NULL;
> > +
> > +     p = plist_first_entry(&rq->cfs.pushable_tasks,
> > +                           struct task_struct, pushable_tasks);
> > +
> > +     WARN_ON_ONCE(rq->cpu != task_cpu(p));
> > +     WARN_ON_ONCE(task_current(rq, p));
> > +     WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
> > +     WARN_ON_ONCE(!task_on_rq_queued(p));
> > +
> > +     /*
> > +      * Remove task from the pushable list as we try only once after that
> > +      * the task has been put back in enqueued list.
> > +      */
> > +     plist_del(&p->pushable_tasks, &rq->cfs.pushable_tasks);
> > +
> > +     return p;
> > +}
> > +
> > +static int
> > +select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags);
> > +
> > +/*
> > + * See if the non running fair tasks on this rq can be sent on other CPUs
> > + * that fits better with their profile.
> > + */
> > +static bool push_fair_task(struct rq *rq)
> > +{
> > +     struct task_struct *next_task;
> > +     int prev_cpu, new_cpu;
> > +     struct rq *new_rq;
> > +
> > +     next_task = pick_next_pushable_fair_task(rq);
> > +     if (!next_task)
> > +             return false;
> > +
> > +     if (is_migration_disabled(next_task))
> > +             return true;
> > +
> > +     /* We might release rq lock */
> > +     get_task_struct(next_task);
> > +
> > +     prev_cpu = rq->cpu;
> > +
> > +     new_cpu = select_task_rq_fair(next_task, prev_cpu, 0);
> > +
> > +     if (new_cpu == prev_cpu)
> > +             goto out;
> > +
> > +     new_rq = cpu_rq(new_cpu);
> > +
> > +     if (double_lock_balance(rq, new_rq)) {
> > +             /* The task has already migrated in between */
> > +             if (task_cpu(next_task) != rq->cpu) {
> > +                     double_unlock_balance(rq, new_rq);
> > +                     goto out;
> > +             }
> > +
> > +             deactivate_task(rq, next_task, 0);
> > +             set_task_cpu(next_task, new_cpu);
> > +             activate_task(new_rq, next_task, 0);
> > +
> > +             resched_curr(new_rq);
> > +
> > +             double_unlock_balance(rq, new_rq);
> > +     }
> > +
> > +out:
> > +     put_task_struct(next_task);
> > +
> > +     return true;
> > +}
> > +
> > +static void push_fair_tasks(struct rq *rq)
> > +{
> > +     /* push_fair_task() will return true if it moved a fair task */
> > +     while (push_fair_task(rq))
> > +             ;
> > +}
> > +
> > +static DEFINE_PER_CPU(struct balance_callback, fair_push_head);
> > +
> > +static inline void fair_queue_pushable_tasks(struct rq *rq)
> > +{
> > +     if (!sched_push_task_enabled() || !has_pushable_tasks(rq))
> > +             return;
> > +
> > +     queue_balance_callback(rq, &per_cpu(fair_push_head, rq->cpu), push_fair_tasks);
> > +}
> > +
> > +static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p)
> > +{
> > +     if (sched_push_task_enabled())
> > +             plist_del(&p->pushable_tasks, &rq->cfs.pushable_tasks);
> > +}
> > +
> > +static void fair_add_pushable_task(struct rq *rq, struct task_struct *p)
> > +{
> > +     if (sched_push_task_enabled() && fair_push_task(rq, p)) {
> > +             plist_del(&p->pushable_tasks, &rq->cfs.pushable_tasks);
> > +             plist_node_init(&p->pushable_tasks, p->prio);
> > +             plist_add(&p->pushable_tasks, &rq->cfs.pushable_tasks);
> > +     }
> > +}
> > +
> > +static int active_load_balance_cpu_stop(void *data);
> > +
> > +/*
> > + * See if the alone task running on the CPU should migrate on a better than
> > + * the local one.
> > + */
> > +static inline bool check_pushable_task(struct task_struct *p, struct rq *rq)
> > +{
> > +     int new_cpu, cpu = cpu_of(rq);
> > +
> > +     if (!sched_push_task_enabled())
> > +             return false;
> > +
> > +     if (WARN_ON(!p))
> > +             return false;
> > +
> > +     if (WARN_ON(!task_current(rq, p)))
> > +             return false;
> > +
> > +     if (is_migration_disabled(p))
> > +             return false;
> > +
> > +     /* If there are several task, wait for being put back */
> > +     if (rq->nr_running > 1)
> > +             return false;
> > +
> > +     if (!fair_push_task(rq, p))
> > +             return false;
> > +
>
> RT matters for EAS too? or only CFS?
>
> Since we have quite a few patches floating around push task framework,
> can we generalize the framework for pushing the current task out?
>
> push_current_task(rq, CFS|RT|DL|IDLE|EXT|ALL)
> - Depending on the second argument push the task out after doing necessary
> class specific checks? Maybe a new method be added per class.

Sorry, I thought I had already answered your email, but I can't find my reply.

The generalization is not straightforward: the classes do not all use
the same kind of list (DL, for example, uses an rb-tree), and the place
where we want to check whether a task should be added to the pushable
list differs as well.

>
> - current cpu hotplug code can make use of this infra with (ALL)
> - push_rt_task with (RT), sched_balance_rq (CFS)
> - push_current_from_paravirt_cpu (CFS|RT)  (Patch series which i sent few days ago)
>
> I know it is tricky right now due to specific checks in each path and
> the way new cpu is found is different and all that. affine_move_task seems
> quite complicated to fit in.
>
> Maybe i thinking too far.

This could come in a 2nd step of consolidation once we know what we
want to put in each push callback.

>
>
> > +     new_cpu = select_task_rq_fair(p, cpu, 0);
> > +
> > +     if (new_cpu == cpu)
> > +             return false;
> > +
> > +     /*
> > +      * ->active_balance synchronizes accesses to
> > +      * ->active_balance_work.  Once set, it's cleared
> > +      * only after active load balance is finished.
> > +      */
> > +     if (!rq->active_balance) {
> > +             rq->active_balance = 1;
> > +             rq->push_cpu = new_cpu;
> > +     } else
> > +             return false;
> > +
> > +     raw_spin_rq_unlock(rq);
>
> can this race with sched_balance_rq?
> I think it is okay since rq->active_balance = 0 at the end. so work buffer
> should be protected.

Yeah, rq->active_balance protects it

>
> > +     stop_one_cpu_nowait(cpu,
> > +             active_load_balance_cpu_stop, rq,
> > +             &rq->active_balance_work);
> > +     raw_spin_rq_lock(rq);
> > +
> > +     return true;
> > +}
> > +
> >   /*
> >    * select_task_rq_fair: Select target runqueue for the waking task in domains
> >    * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
> > @@ -8973,6 +9158,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
> >               put_prev_entity(cfs_rq, pse);
> >               set_next_entity(cfs_rq, se);
> >
> > +             /*
> > +              * The previous task might be eligible for being pushed on
> > +              * another cpu if it is still active.
> > +              */
> > +             fair_add_pushable_task(rq, prev);
> > +
> >               __set_next_task_fair(rq, p, true);
> >       }
> >
> > @@ -9036,6 +9227,13 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t
> >               cfs_rq = cfs_rq_of(se);
> >               put_prev_entity(cfs_rq, se);
> >       }
> > +
> > +     /*
> > +      * The previous task might be eligible for being pushed on another cpu
> > +      * if it is still active.
> > +      */
> > +     fair_add_pushable_task(rq, prev);
> > +
> >   }
> >
> >   /*
> > @@ -13390,8 +13588,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
> >       if (static_branch_unlikely(&sched_numa_balancing))
> >               task_tick_numa(rq, curr);
> >
> > -     update_misfit_status(curr, rq);
> > -     check_update_overutilized_status(task_rq(curr));
> > +     if (!check_pushable_task(curr, rq)) {
> > +             update_misfit_status(curr, rq);
> > +             check_update_overutilized_status(task_rq(curr));
> > +     }
> >
> >       task_tick_core(rq, curr);
> >   }
> > @@ -13552,6 +13752,8 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
> >   {
> >       struct sched_entity *se = &p->se;
> >
> > +     fair_remove_pushable_task(rq, p);
> > +
> >       if (task_on_rq_queued(p)) {
> >               /*
> >                * Move the next running task to the front of the list, so our
> > @@ -13567,6 +13769,11 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
> >       if (hrtick_enabled_fair(rq))
> >               hrtick_start_fair(rq, p);
> >
> > +     /*
> > +      * Try to push prev task before checking misfit for next task as
> > +      * the migration of prev can make next fitting the CPU
> > +      */
> > +     fair_queue_pushable_tasks(rq);
> >       update_misfit_status(p, rq);
> >       sched_fair_update_stop_tick(rq, p);
> >   }
> > @@ -13596,6 +13803,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
> >   {
> >       cfs_rq->tasks_timeline = RB_ROOT_CACHED;
> >       cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
> > +     plist_head_init(&cfs_rq->pushable_tasks);
> >       raw_spin_lock_init(&cfs_rq->removed.lock);
> >   }
> >
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index b419a4d98461..697bd654298a 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -711,6 +711,8 @@ struct cfs_rq {
> >               unsigned long   runnable_avg;
> >       } removed;
> >
> > +     struct plist_head       pushable_tasks;
> > +
> >   #ifdef CONFIG_FAIR_GROUP_SCHED
> >       u64                     last_update_tg_load_avg;
> >       unsigned long           tg_load_avg_contrib;
> > @@ -3620,6 +3622,8 @@ static inline bool sched_energy_enabled(void) { return false; }
> >
> >   #endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
> >
> > +DECLARE_STATIC_KEY_FALSE(sched_push_task);
> > +
> You have sched_energy_present which is also enabled at the same point.
> Do you see more usecases for sched_push_task?

In my current patchset, sched_push_task is only enabled for EAS, but I
wanted to make it possible to enable it for other cases as well.