[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9238C4ED-6F19-4B25-8AAB-BEFB16A705BC@oracle.com>
Date: Tue, 1 Jul 2025 18:40:13 +0000
From: Prakash Sangappa <prakash.sangappa@...cle.com>
To: Thomas Gleixner <tglx@...utronix.de>
CC: "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"peterz@...radead.org" <peterz@...radead.org>,
"rostedt@...dmis.org"
<rostedt@...dmis.org>,
"mathieu.desnoyers@...icios.com"
<mathieu.desnoyers@...icios.com>,
"bigeasy@...utronix.de"
<bigeasy@...utronix.de>,
"kprateek.nayak@....com" <kprateek.nayak@....com>,
"vineethr@...ux.ibm.com" <vineethr@...ux.ibm.com>
Subject: Re: [PATCH V6 1/7] Sched: Scheduler time slice extension
> On Jul 1, 2025, at 1:42 AM, Thomas Gleixner <tglx@...utronix.de> wrote:
>
> On Tue, Jul 01 2025 at 00:37, Prakash Sangappa wrote:
>
> The subsystem prefix for the scheduler is 'sched:' It's not that hard to
> figure out.
Will fix that.
>
>> unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>> - unsigned long ti_work);
>> + unsigned long ti_work,
>> + bool irq);
>
> No need for a new line
Ok.
>
>> /**
>> * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
>> @@ -316,7 +317,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>> * EXIT_TO_USER_MODE_WORK are set
>> * 4) check that interrupts are still disabled
>> */
>> -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
>> +static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs,
>> + bool irq)
>
> Ditto. 100 characters line width, please use it. And if you need a line
> break, please align the second lines arguments properly. This is
> documented:
>
> https://urldefense.com/v3/__https://www.kernel.org/doc/html/latest/process/maintainer-tip.html__;!!ACWV5N9M2RV99hQ!KeHcAVTEgkkOTIZCEsRYgcQtDwYHZ4s77sjO9fKsjO530m5mavRMOnDMB_v6fm5TBvEfRPV2tcR2KTqt1865VZ0$
Got it.
>
>> {
>> unsigned long ti_work;
>>
>> @@ -327,7 +329,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
>>
>> ti_work = read_thread_flags();
>> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
>> - ti_work = exit_to_user_mode_loop(regs, ti_work);
>> + ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
>> +
>> + if (irq)
>> + rseq_delay_resched_fini();
>
> This is an unconditional function call for every interrupt return and
> it's even done when the whole thing is known to be non-functional at
> compile time:
Will include IS_CONFIG check.
>
>> +void rseq_delay_resched_fini(void)
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
> ....
>> +#endif
>> +}
>
> Seriously?
Will make the new config CONFIG_SCHED_PREEMPT_DELAY dependent not SCHED_HRTICK,
So, I can remove these #ifdef, #endif.
>
>> arch_exit_to_user_mode_prepare(regs, ti_work);
>>
>> @@ -396,6 +401,10 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>>
>> CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
>>
>> + /* reschedule if sched delay was granted */
>
> Sentences start with an upper case letter and please use full words and
> not arbitrary abbreviations. This is neither twatter nor SMS.
Will fix.
>
>> + if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay)
>> + set_tsk_need_resched(current);
>> +
>> if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
>> if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
>> local_irq_enable();
>> @@ -411,7 +420,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>> if (unlikely(work & SYSCALL_WORK_EXIT))
>> syscall_exit_work(regs, work);
>> local_irq_disable_exit_to_user();
>> - exit_to_user_mode_prepare(regs);
>> + exit_to_user_mode_prepare(regs, false);
>> }
>>
>> /**
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 5bcf44ae6c79..9b4670d85131 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -338,6 +338,7 @@ extern int __must_check io_schedule_prepare(void);
>> extern void io_schedule_finish(int token);
>> extern long io_schedule_timeout(long timeout);
>> extern void io_schedule(void);
>> +extern void hrtick_local_start(u64 delay);
>>
>> /* wrapper function to trace from this header file */
>> DECLARE_TRACEPOINT(sched_set_state_tp);
>> @@ -1263,6 +1264,7 @@ struct task_struct {
>> int softirq_context;
>> int irq_config;
>> #endif
>> + unsigned sched_time_delay:1;
>
> Find an arbitrary place by rolling a dice and stick it in, right?
Sorry, merging issue. I had it next to the following
> unsigned in_thrashing:1;
Will fix it.
>
> There is already a section with bit fields in this struct. So it's more
> than bloody obvious to stick it there instead of creating a hole in the
> middle of task struct.
>
>> #ifdef CONFIG_PREEMPT_RT
>> int softirq_disable_cnt;
>> #endif
>> @@ -2245,6 +2247,20 @@ static inline bool owner_on_cpu(struct task_struct *owner)
>> unsigned long sched_cpu_util(int cpu);
>> #endif /* CONFIG_SMP */
>>
>> +#ifdef CONFIG_RSEQ
>> +
>
> Remove these newlines please. They have zero value.
Ok
>
>> +extern bool rseq_delay_resched(void);
>> +extern void rseq_delay_resched_fini(void);
>> +extern void rseq_delay_resched_tick(void);
>> +
>> +#else
>
>> @@ -98,8 +99,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>
>> local_irq_enable_exit_to_user(ti_work);
>>
>> - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
>> - schedule();
>> + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
>> + if (irq && rseq_delay_resched())
>
> unlikely() and again this results in an unconditional function call for
> every interrupt when CONFIG_RSEQ is enabled. A pointless exercise for
> the majority of use cases.
>
> What's worse is that it breaks the LAZY semantics. I explained this to
> you before and this thing needs to be tied on the LAZY bit otherwise a
> SCHED_OTHER task can prevent a real-time task from running, which is
> fundamentally wrong.
>
> So this wants to be:
>
> if (likely(!irq || !rseq_delay_resched(ti_work))
> schedule();
>
> and
>
> static inline bool rseq_delay_resched(unsigned long ti_work)
> {
> // Set when all Kconfig conditions are met
> if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
> return false;
>
> // Only NEED_RESCHED_LAZY can be delayed
> if (ti_work & _TIF_NEED_RESCHED)
> return false;
>
> // NONE indicates that current::rseq == NULL
> // PROBE indicates that current::rseq::flags needs to be
> // evaluated.
> // REQUESTED indicates that there was a successful request
> // already.
> if (likely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_PROBE))
> return false;
>
> return __rseq_delay_resched();
> }
>
> or something like that.
Will refactor the code.
>
>> +bool rseq_delay_resched(void)
>> +{
>> + struct task_struct *t = current;
>> + u32 flags;
>> +
>> + if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
>> + return false;
>> +
>> + if (!t->rseq)
>> + return false;
>> +
>> + if (t->sched_time_delay)
>> + return false;
>
> Then all of the above conditions go away.
>
>> + if (copy_from_user_nofault(&flags, &t->rseq->flags, sizeof(flags)))
>> + return false;
>> +
>> + if (!(flags & RSEQ_CS_FLAG_DELAY_RESCHED))
>> + return false;
>> +
>> + flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
>> + if (copy_to_user_nofault(&t->rseq->flags, &flags, sizeof(flags)))
>> + return false;
>> +
>> + t->sched_time_delay = 1;
>
> and this becomes:
>
> t->rseq_delay_resched = RSEQ_RESCHED_DELAY_REQUESTED;
>
>> + return true;
>> +}
>> +
>> +void rseq_delay_resched_fini(void)
>
> What's does _fini() mean here? Absolutely nothing. This wants to be a
> self explaining function name and see below
>
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
>
> You really are fond of pointless function calls. Obviously performance
> is not really a concern in your work.
>
>> + extern void hrtick_local_start(u64 delay);
>
> header files with prototypes exist for a reason....
>
>> + struct task_struct *t = current;
>> + /*
>> + * IRQs off, guaranteed to return to userspace, start timer on this CPU
>> + * to limit the resched-overdraft.
>> + *
>> + * If your critical section is longer than 30 us you get to keep the
>> + * pieces.
>> + */
>> + if (t->sched_time_delay)
>> + hrtick_local_start(30 * NSEC_PER_USEC);
>> +#endif
>
> This whole thing can be condensed into:
>
> static inline void rseq_delay_resched_arm_timer(void)
> {
> if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
> return;
> if (unlikely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_REQUESTED))
> hrtick_local_start(...);
> }
Got it, will make the changes.
>
>> +}
>> +
>> +void rseq_delay_resched_tick(void)
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
>> + struct task_struct *t = current;
>> +
>> + if (t->sched_time_delay)
>> + set_tsk_need_resched(t);
>> +#endif
>
> Oh well.....
>
>> +}
>> +
>> #ifdef CONFIG_DEBUG_RSEQ
>>
>> /*
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 4ad7cf3cfdca..c1b64879115f 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -845,6 +845,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
>>
>> WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
>>
>> + rseq_delay_resched_tick();
>> +
>> rq_lock(rq, &rf);
>> update_rq_clock(rq);
>> rq->donor->sched_class->task_tick(rq, rq->curr, 1);
>> @@ -918,6 +920,16 @@ void hrtick_start(struct rq *rq, u64 delay)
>>
>> #endif /* CONFIG_SMP */
>>
>> +void hrtick_local_start(u64 delay)
>
> How is this supposed to compile cleanly without a prototype?
Will fix.
Thanks for your comments.
-Prakash
>
> Thanks,
>
> tglx
Powered by blists - more mailing lists