Message-ID: <CAKfTPtDOQVEMRWaK9xEVqSDKcvUfai4CUck6G=oOdaeRBhZQUw@mail.gmail.com>
Date: Fri, 6 Jun 2025 17:03:36 +0200
From: Vincent Guittot <vincent.guittot@...aro.org>
To: Peter Zijlstra <peterz@...radead.org>
Cc: mingo@...hat.com, juri.lelli@...hat.com, dietmar.eggemann@....com, 
	rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de, vschneid@...hat.com, 
	clm@...a.com, linux-kernel@...r.kernel.org
Subject: Re: [RFC][PATCH 5/5] sched: Add ttwu_queue support for delayed tasks

On Tue, 20 May 2025 at 12:18, Peter Zijlstra <peterz@...radead.org> wrote:
>
> One of the things lost with the introduction of DELAY_DEQUEUE is the
> ability of TTWU to move those tasks around on wakeup, since they're
> on_rq and, as such, need to be woken in-place.

I was expecting you to call select_task_rq() somewhere in the wakeup
path of a delayed entity, to give it a chance to migrate; the lack of
such a migration was one reason for the perf regression (and it would
also have been useful for the EAS case). But IIUC, the task is still
enqueued on the same CPU, only the target CPU now does the enqueue
itself instead of the local CPU. Or am I missing something?
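
Roughly, I had something like the below in mind (untested sketch, only
to illustrate the idea; the select_task_rq() call site and how it
combines with the WF_DELAYED queueing are my guesses, not code from
this patch):

if (sched_feat(TTWU_QUEUE_DELAYED) && READ_ONCE(p->se.sched_delayed)) {
	int cpu;

	smp_acquire__after_ctrl_dep();
	WRITE_ONCE(p->__state, TASK_WAKING);

	/* Let the delayed entity pick a (possibly different) CPU. */
	cpu = select_task_rq(p, task_cpu(p), &wake_flags);

	/*
	 * A real version would also need to dequeue the delayed entity
	 * from its current rq before enqueueing it elsewhere; that part
	 * is hand-waved here.
	 */
	if (ttwu_queue_wakelist(p, cpu, wake_flags | WF_DELAYED))
		return 1;
}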

>
> Doing the in-place thing adds quite a bit of cross-CPU latency; add a
> little something that gets remote CPUs to do their own in-place
> wakeups, significantly reducing the rq->lock contention.
>
> Reported-by: Chris Mason <clm@...a.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> ---
>  kernel/sched/core.c     |   74 ++++++++++++++++++++++++++++++++++++++++++------
>  kernel/sched/fair.c     |    5 ++-
>  kernel/sched/features.h |    1
>  kernel/sched/sched.h    |    1
>  4 files changed, 72 insertions(+), 9 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3784,6 +3784,8 @@ static int __ttwu_runnable(struct rq *rq
>         return 1;
>  }
>
> +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags);
> +
>  /*
>   * Consider @p being inside a wait loop:
>   *
> @@ -3811,6 +3813,33 @@ static int __ttwu_runnable(struct rq *rq
>   */
>  static int ttwu_runnable(struct task_struct *p, int wake_flags)
>  {
> +#ifdef CONFIG_SMP
> +       if (sched_feat(TTWU_QUEUE_DELAYED) && READ_ONCE(p->se.sched_delayed)) {
> +               /*
> +                * Similar to try_to_block_task():
> +                *
> +                * __schedule()                         ttwu()
> +                *   prev_state = prev->state             if (p->sched_delayed)
> +                *   if (prev_state)                         smp_acquire__after_ctrl_dep()
> +                *     try_to_block_task()                   p->state = TASK_WAKING
> +                *       ... set_delayed()
> +                *         RELEASE p->sched_delayed = 1
> +                *
> +                * __schedule() and ttwu() have matching control dependencies.
> +                *
> +                * Notably, once we observe sched_delayed we know the task has
> +                * passed try_to_block_task() and p->state is ours to modify.
> +                *
> +                * TASK_WAKING controls ttwu() concurrency.
> +                */
> +               smp_acquire__after_ctrl_dep();
> +               WRITE_ONCE(p->__state, TASK_WAKING);
> +
> +               if (ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_DELAYED))
> +                       return 1;
> +       }
> +#endif
> +
>         CLASS(__task_rq_lock, guard)(p);
>         return __ttwu_runnable(guard.rq, p, wake_flags);
>  }
> @@ -3830,12 +3859,41 @@ void sched_ttwu_pending(void *arg)
>         update_rq_clock(rq);
>
>         llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
> +               struct rq *p_rq = task_rq(p);
> +               int ret;
> +
> +               /*
> +                * This is the ttwu_runnable() case. Notably it is possible for
> +                * on-rq entities to get migrated -- even sched_delayed ones.

I haven't found where a sched_delayed task could migrate to another CPU.
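
(For reference, the generic path that migrates a queued task between
runqueues; a simplified sketch modeled on move_queued_task() in
kernel/sched/core.c, so details may differ from the tree this patch is
against:)

static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
				   struct task_struct *p, int new_cpu)
{
	lockdep_assert_rq_held(rq);

	/* dequeue from the old rq while it is still locked */
	deactivate_task(rq, p, DEQUEUE_NOCLOCK);
	set_task_cpu(p, new_cpu);
	rq_unlock(rq, rf);

	rq = cpu_rq(new_cpu);

	rq_lock(rq, rf);
	WARN_ON_ONCE(task_cpu(p) != new_cpu);
	/* enqueue on the new rq and preempt it if needed */
	activate_task(rq, p, 0);
	wakeup_preempt(rq, p, 0);

	return rq;
}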

> +                */
> +               if (unlikely(p_rq != rq)) {
> +                       rq_unlock(rq, &rf);
> +                       p_rq = __task_rq_lock(p, &rf);
> +               }
> +
> +               ret = __ttwu_runnable(p_rq, p, WF_TTWU);
> +
> +               if (unlikely(p_rq != rq)) {
> +                       if (!ret)
> +                               set_task_cpu(p, cpu_of(rq));
> +
> +                       __task_rq_unlock(p_rq, &rf);
> +                       rq_lock(rq, &rf);
> +                       update_rq_clock(rq);
> +               }
> +
> +               if (ret) {
> +                       // XXX ttwu_stat()
> +                       continue;
> +               }
> +
> +               /*
> +                * This is the 'normal' case where the task is blocked.
> +                */
> +
>                 if (WARN_ON_ONCE(p->on_cpu))
>                         smp_cond_load_acquire(&p->on_cpu, !VAL);
>
> -               if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
> -                       set_task_cpu(p, cpu_of(rq));
> -
>                 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
>         }
>
> @@ -3974,7 +4032,7 @@ static inline bool ttwu_queue_cond(struc
>
>  static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
>  {
> -       bool def = sched_feat(TTWU_QUEUE_DEFAULT);
> +       bool def = sched_feat(TTWU_QUEUE_DEFAULT) || (wake_flags & WF_DELAYED);
>
>         if (!ttwu_queue_cond(p, cpu, def))
>                 return false;
> @@ -4269,8 +4327,8 @@ int try_to_wake_up(struct task_struct *p
>                  * __schedule().  See the comment for smp_mb__after_spinlock().
>                  *
>                  * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
> -                * schedule()'s deactivate_task() has 'happened' and p will no longer
> -                * care about it's own p->state. See the comment in __schedule().
> +                * schedule()'s try_to_block_task() has 'happened' and p will no longer
> +                * care about its own p->state. See the comment in try_to_block_task().
>                  */
>                 smp_acquire__after_ctrl_dep();
>
> @@ -6712,8 +6770,8 @@ static void __sched notrace __schedule(i
>         preempt = sched_mode == SM_PREEMPT;
>
>         /*
> -        * We must load prev->state once (task_struct::state is volatile), such
> -        * that we form a control dependency vs deactivate_task() below.
> +        * We must load prev->state once, such that we form a control
> +        * dependency vs try_to_block_task() below.
>          */
>         prev_state = READ_ONCE(prev->__state);
>         if (sched_mode == SM_IDLE) {
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5395,7 +5395,10 @@ static __always_inline void return_cfs_r
>
>  static void set_delayed(struct sched_entity *se)
>  {
> -       se->sched_delayed = 1;
> +       /*
> +        * See TTWU_QUEUE_DELAYED in ttwu_runnable().
> +        */
> +       smp_store_release(&se->sched_delayed, 1);
>
>         /*
>          * Delayed se of cfs_rq have no tasks queued on them.
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -82,6 +82,7 @@ SCHED_FEAT(TTWU_QUEUE, false)
>  SCHED_FEAT(TTWU_QUEUE, true)
>  #endif
>  SCHED_FEAT(TTWU_QUEUE_ON_CPU, true)
> +SCHED_FEAT(TTWU_QUEUE_DELAYED, false)

I'm not sure the feature will get tested, as people mainly test the
default config.
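
For anyone who does want to exercise it, toggling the knob at runtime
should work (assuming debugfs is mounted at the usual place):

  echo TTWU_QUEUE_DELAYED > /sys/kernel/debug/sched/features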

>  SCHED_FEAT(TTWU_QUEUE_DEFAULT, false)
>
>  /*
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2313,6 +2313,7 @@ static inline int task_on_rq_migrating(s
>  #define WF_RQ_SELECTED         0x80 /* ->select_task_rq() was called */
>
>  #define WF_ON_CPU              0x0100
> +#define WF_DELAYED             0x0200
>
>  #ifdef CONFIG_SMP
>  static_assert(WF_EXEC == SD_BALANCE_EXEC);
>
>
