[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87cyakmhdv.ffs@tglx>
Date: Tue, 01 Jul 2025 10:42:36 +0200
From: Thomas Gleixner <tglx@...utronix.de>
To: Prakash Sangappa <prakash.sangappa@...cle.com>,
linux-kernel@...r.kernel.org
Cc: peterz@...radead.org, rostedt@...dmis.org,
mathieu.desnoyers@...icios.com, bigeasy@...utronix.de,
kprateek.nayak@....com, vineethr@...ux.ibm.com
Subject: Re: [PATCH V6 1/7] Sched: Scheduler time slice extension
On Tue, Jul 01 2025 at 00:37, Prakash Sangappa wrote:
The subsystem prefix for the scheduler is 'sched:' It's not that hard to
figure out.
> unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> - unsigned long ti_work);
> + unsigned long ti_work,
> + bool irq);
No need for a new line
> /**
> * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
> @@ -316,7 +317,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> * EXIT_TO_USER_MODE_WORK are set
> * 4) check that interrupts are still disabled
> */
> -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
> +static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs,
> + bool irq)
Ditto. 100 characters line width, please use it. And if you need a line
break, please align the second lines arguments properly. This is
documented:
https://www.kernel.org/doc/html/latest/process/maintainer-tip.html
> {
> unsigned long ti_work;
>
> @@ -327,7 +329,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
>
> ti_work = read_thread_flags();
> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> - ti_work = exit_to_user_mode_loop(regs, ti_work);
> + ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
> +
> + if (irq)
> + rseq_delay_resched_fini();
This is an unconditional function call for every interrupt return and
it's even done when the whole thing is known to be non-functional at
compile time:
> +void rseq_delay_resched_fini(void)
> +{
> +#ifdef CONFIG_SCHED_HRTICK
....
> +#endif
> +}
Seriously?
> arch_exit_to_user_mode_prepare(regs, ti_work);
>
> @@ -396,6 +401,10 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>
> CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
>
> + /* reschedule if sched delay was granted */
Sentences start with an upper case letter and please use full words and
not arbitrary abbreviations. This is neither twatter nor SMS.
> + if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay)
> + set_tsk_need_resched(current);
> +
> if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
> if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
> local_irq_enable();
> @@ -411,7 +420,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
> if (unlikely(work & SYSCALL_WORK_EXIT))
> syscall_exit_work(regs, work);
> local_irq_disable_exit_to_user();
> - exit_to_user_mode_prepare(regs);
> + exit_to_user_mode_prepare(regs, false);
> }
>
> /**
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 5bcf44ae6c79..9b4670d85131 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -338,6 +338,7 @@ extern int __must_check io_schedule_prepare(void);
> extern void io_schedule_finish(int token);
> extern long io_schedule_timeout(long timeout);
> extern void io_schedule(void);
> +extern void hrtick_local_start(u64 delay);
>
> /* wrapper function to trace from this header file */
> DECLARE_TRACEPOINT(sched_set_state_tp);
> @@ -1263,6 +1264,7 @@ struct task_struct {
> int softirq_context;
> int irq_config;
> #endif
> + unsigned sched_time_delay:1;
Find an arbitrary place by rolling a dice and stick it in, right?
There is already a section with bit fields in this struct. So it's more
than bloody obvious to stick it there instead of creating a hole in the
middle of task struct.
> #ifdef CONFIG_PREEMPT_RT
> int softirq_disable_cnt;
> #endif
> @@ -2245,6 +2247,20 @@ static inline bool owner_on_cpu(struct task_struct *owner)
> unsigned long sched_cpu_util(int cpu);
> #endif /* CONFIG_SMP */
>
> +#ifdef CONFIG_RSEQ
> +
Remove these newlines please. They have zero value.
> +extern bool rseq_delay_resched(void);
> +extern void rseq_delay_resched_fini(void);
> +extern void rseq_delay_resched_tick(void);
> +
> +#else
> @@ -98,8 +99,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>
> local_irq_enable_exit_to_user(ti_work);
>
> - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
> - schedule();
> + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
> + if (irq && rseq_delay_resched())
unlikely() and again this results in an unconditional function call for
every interrupt when CONFIG_RSEQ is enabled. A pointless exercise for
the majority of use cases.
What's worse is that it breaks the LAZY semantics. I explained this to
you before and this thing needs to be tied on the LAZY bit otherwise a
SCHED_OTHER task can prevent a real-time task from running, which is
fundamentally wrong.
So this wants to be:
if (likely(!irq || !rseq_delay_resched(ti_work))
schedule();
and
static inline bool rseq_delay_resched(unsigned long ti_work)
{
// Set when all Kconfig conditions are met
if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
return false;
// Only NEED_RESCHED_LAZY can be delayed
if (ti_work & _TIF_NEED_RESCHED)
return false;
// NONE indicates that current::rseq == NULL
// PROBE indicates that current::rseq::flags needs to be
// evaluated.
// REQUESTED indicates that there was a successful request
// already.
if (likely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_PROBE))
return false;
return __rseq_delay_resched();
}
or something like that.
> +bool rseq_delay_resched(void)
> +{
> + struct task_struct *t = current;
> + u32 flags;
> +
> + if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
> + return false;
> +
> + if (!t->rseq)
> + return false;
> +
> + if (t->sched_time_delay)
> + return false;
Then all of the above conditions go away.
> + if (copy_from_user_nofault(&flags, &t->rseq->flags, sizeof(flags)))
> + return false;
> +
> + if (!(flags & RSEQ_CS_FLAG_DELAY_RESCHED))
> + return false;
> +
> + flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
> + if (copy_to_user_nofault(&t->rseq->flags, &flags, sizeof(flags)))
> + return false;
> +
> + t->sched_time_delay = 1;
and this becomes:
t->rseq_delay_resched = RSEQ_RESCHED_DELAY_REQUESTED;
> + return true;
> +}
> +
> +void rseq_delay_resched_fini(void)
What's does _fini() mean here? Absolutely nothing. This wants to be a
self explaining function name and see below
> +{
> +#ifdef CONFIG_SCHED_HRTICK
You really are fond of pointless function calls. Obviously performance
is not really a concern in your work.
> + extern void hrtick_local_start(u64 delay);
header files with prototypes exist for a reason....
> + struct task_struct *t = current;
> + /*
> + * IRQs off, guaranteed to return to userspace, start timer on this CPU
> + * to limit the resched-overdraft.
> + *
> + * If your critical section is longer than 30 us you get to keep the
> + * pieces.
> + */
> + if (t->sched_time_delay)
> + hrtick_local_start(30 * NSEC_PER_USEC);
> +#endif
This whole thing can be condensed into:
static inline void rseq_delay_resched_arm_timer(void)
{
if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
return;
if (unlikely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_REQUESTED))
hrtick_local_start(...);
}
> +}
> +
> +void rseq_delay_resched_tick(void)
> +{
> +#ifdef CONFIG_SCHED_HRTICK
> + struct task_struct *t = current;
> +
> + if (t->sched_time_delay)
> + set_tsk_need_resched(t);
> +#endif
Oh well.....
> +}
> +
> #ifdef CONFIG_DEBUG_RSEQ
>
> /*
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 4ad7cf3cfdca..c1b64879115f 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -845,6 +845,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
>
> WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
>
> + rseq_delay_resched_tick();
> +
> rq_lock(rq, &rf);
> update_rq_clock(rq);
> rq->donor->sched_class->task_tick(rq, rq->curr, 1);
> @@ -918,6 +920,16 @@ void hrtick_start(struct rq *rq, u64 delay)
>
> #endif /* CONFIG_SMP */
>
> +void hrtick_local_start(u64 delay)
How is this supposed to compile cleanly without a prototype?
Thanks,
tglx
Powered by blists - more mailing lists