[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAJZ5v0j=GN43RWi0zqwHOch-NJWMtfLoRFeMsHPGnOKPUxfGvA@mail.gmail.com>
Date: Tue, 14 Jan 2025 15:50:12 +0100
From: "Rafael J. Wysocki" <rafael@...nel.org>
To: Frederic Weisbecker <frederic@...nel.org>
Cc: LKML <linux-kernel@...r.kernel.org>, Peter Zijlstra <peterz@...radead.org>,
Daniel Lezcano <daniel.lezcano@...aro.org>, linux-pm@...r.kernel.org,
Thomas Gleixner <tglx@...utronix.de>, Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
Dave Hansen <dave.hansen@...ux.intel.com>
Subject: Re: [PATCH 4/6] cpuidle: Handle TIF_NR_POLLING on behalf of
CPUIDLE_FLAG_MWAIT states
On Thu, Jan 2, 2025 at 4:02 PM Frederic Weisbecker <frederic@...nel.org> wrote:
>
> From: Peter Zijlstra <peterz@...radead.org>
>
> The current handling of TIF_NR_POLLING is a bit of a maze:
>
> 1) TIF_NR_POLLING is set on idle entry (one atomic set)
>
> 2) Once cpuidle has selected an appropriate state and the tick is
> evaluated and then possibly stopped, TIF_NR_POLLING is cleared
> (one RmW operation)
>
> 3) The cpuidle state is then called with TIF_NR_POLLING cleared but if
> the state polls on (or monitors) need_resched() it sets
> TIF_NR_POLLING again before sleeping and clears it on wake-up. Summary:
> another pair of set/clear
>
> 4) Set back TIF_NR_POLLING (one atomic set)
>
> 5) goto 2) if need_resched() is not set
>
> All those costly atomic operations, fully ordered RmW for some of
> them, could be avoided if the cpuidle core knew in advance if the target
> state polls on (or monitors) need_resched(). If so, TIF_NR_POLLING could
> simply be set once upon entering the idle loop and cleared once after
> idle loop exit.
>
> Start dealing with that with handling TIF_NR_POLLING on behalf of
> mwait based states.
>
> [fweisbec: _ Handle broadcast properly
> _ Ignore mwait_idle() as it can be used by default_idle_call()]
>
> Signed-off-by: Peter Zijlstra <peterz@...radead.org>
> Signed-off-by: Frederic Weisbecker <frederic@...nel.org>
> ---
> arch/x86/include/asm/mwait.h | 27 +++++++++++-------------
> drivers/cpuidle/cpuidle.c | 22 +++++++++++++++++++-
> include/linux/sched/idle.h | 7 ++++++-
> kernel/sched/idle.c | 40 +++++++++++++-----------------------
> 4 files changed, 53 insertions(+), 43 deletions(-)
>
> diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
> index 920426d691ce..3e06a7f3bf5a 100644
> --- a/arch/x86/include/asm/mwait.h
> +++ b/arch/x86/include/asm/mwait.h
> @@ -116,25 +116,22 @@ static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx)
> */
> static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
> {
> - if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
> - if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
> - mb();
> - clflush((void *)&current_thread_info()->flags);
> - mb();
> - }
> + if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
> + mb();
> + clflush((void *)&current_thread_info()->flags);
> + mb();
> + }
>
> - __monitor((void *)&current_thread_info()->flags, 0, 0);
> + __monitor((void *)&current_thread_info()->flags, 0, 0);
>
> - if (!need_resched()) {
> - if (ecx & 1) {
> - __mwait(eax, ecx);
> - } else {
> - __sti_mwait(eax, ecx);
> - raw_local_irq_disable();
> - }
> + if (!need_resched()) {
> + if (ecx & 1) {
> + __mwait(eax, ecx);
> + } else {
> + __sti_mwait(eax, ecx);
> + raw_local_irq_disable();
> }
> }
> - current_clr_polling();
> }
>
> /*
> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> index 0835da449db8..46c0a2726f67 100644
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -217,10 +217,10 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
> int index)
> {
> int entered_state;
> -
> struct cpuidle_state *target_state = &drv->states[index];
> bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP);
> ktime_t time_start, time_end;
> + bool polling;
>
> instrumentation_begin();
>
> @@ -237,6 +237,23 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
> broadcast = false;
> }
>
> + polling = target_state->flags & CPUIDLE_FLAG_MWAIT;
Hmmm. What about "polling" states, like state 0 on all x86?
They also monitor need_resched() - see poll_idle().
> +
> + /*
> + * If the target state doesn't poll on need_resched(), this is
> + * the last check after which further TIF_NEED_RESCHED remote setting
> + * will involve an IPI.
> + */
> + if (!polling && current_clr_polling_and_test()) {
> + if (broadcast)
> + tick_broadcast_exit();
> + dev->last_residency_ns = 0;
> + local_irq_enable();
> + instrumentation_end();
> + return -EBUSY;
> + }
> +
> +
> if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
> leave_mm();
>
> @@ -336,6 +353,9 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
> dev->states_usage[index].rejected++;
> }
>
> + if (!polling)
> + __current_set_polling();
> +
> instrumentation_end();
>
> return entered_state;
> diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
> index e670ac282333..3e3482bfb028 100644
> --- a/include/linux/sched/idle.h
> +++ b/include/linux/sched/idle.h
> @@ -68,6 +68,8 @@ static __always_inline bool __must_check current_set_polling_and_test(void)
>
> static __always_inline bool __must_check current_clr_polling_and_test(void)
> {
> + bool ret;
> +
> __current_clr_polling();
>
> /*
> @@ -76,7 +78,10 @@ static __always_inline bool __must_check current_clr_polling_and_test(void)
> */
> smp_mb__after_atomic();
>
> - return unlikely(tif_need_resched());
> + ret = unlikely(tif_need_resched());
> + if (ret)
> + __current_set_polling();
> + return ret;
> }
>
> #else
> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> index 621696269584..9eece3df1080 100644
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -114,12 +114,13 @@ void __cpuidle default_idle_call(void)
> stop_critical_timings();
>
> ct_cpuidle_enter();
> - arch_cpu_idle();
> + arch_cpu_idle(); // XXX assumes !polling
Well, what if x86_idle is default_idle()? Say, somebody boots with
IDLE_NOMWAIT?
> ct_cpuidle_exit();
>
> start_critical_timings();
> trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
> cond_tick_broadcast_exit();
> + __current_set_polling();
> }
> local_irq_enable();
> instrumentation_end();
> @@ -128,31 +129,14 @@ void __cpuidle default_idle_call(void)
> static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
> struct cpuidle_device *dev)
> {
> + int ret;
> +
> if (current_clr_polling_and_test())
> return -EBUSY;
>
> - return cpuidle_enter_s2idle(drv, dev);
> -}
> -
> -static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
> - int next_state)
> -{
> - /*
> - * The idle task must be scheduled, it is pointless to go to idle, just
> - * update no idle residency and return.
> - */
> - if (current_clr_polling_and_test()) {
> - dev->last_residency_ns = 0;
> - local_irq_enable();
> - return -EBUSY;
> - }
> -
> - /*
> - * Enter the idle state previously returned by the governor decision.
> - * This function will block until an interrupt occurs and will take
> - * care of re-enabling the local interrupts
> - */
> - return cpuidle_enter(drv, dev, next_state);
> + ret = cpuidle_enter_s2idle(drv, dev);
> + __current_set_polling();
> + return ret;
> }
>
> /**
> @@ -213,7 +197,7 @@ static void cpuidle_idle_call(void)
> tick_nohz_idle_stop_tick();
>
> next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
> - call_cpuidle(drv, dev, next_state);
> + cpuidle_enter(drv, dev, next_state);
> } else {
> bool stop_tick = true;
>
> @@ -227,7 +211,12 @@ static void cpuidle_idle_call(void)
> else
> tick_nohz_idle_retain_tick();
>
> - entered_state = call_cpuidle(drv, dev, next_state);
> + /*
> + * Enter the idle state previously returned by the governor decision.
> + * This function will block until an interrupt occurs and will take
> + * care of re-enabling the local interrupts.
> + */
> + entered_state = cpuidle_enter(drv, dev, next_state);
> /*
> * Give the governor an opportunity to reflect on the outcome
> */
> @@ -235,7 +224,6 @@ static void cpuidle_idle_call(void)
> }
>
> exit_idle:
> - __current_set_polling();
>
> /*
> * It is up to the idle functions to re-enable local interrupts
> --
Powered by blists - more mailing lists