lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9238C4ED-6F19-4B25-8AAB-BEFB16A705BC@oracle.com>
Date: Tue, 1 Jul 2025 18:40:13 +0000
From: Prakash Sangappa <prakash.sangappa@...cle.com>
To: Thomas Gleixner <tglx@...utronix.de>
CC: "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "peterz@...radead.org" <peterz@...radead.org>,
        "rostedt@...dmis.org"
	<rostedt@...dmis.org>,
        "mathieu.desnoyers@...icios.com"
	<mathieu.desnoyers@...icios.com>,
        "bigeasy@...utronix.de"
	<bigeasy@...utronix.de>,
        "kprateek.nayak@....com" <kprateek.nayak@....com>,
        "vineethr@...ux.ibm.com" <vineethr@...ux.ibm.com>
Subject: Re: [PATCH V6 1/7] Sched: Scheduler time slice extension



> On Jul 1, 2025, at 1:42 AM, Thomas Gleixner <tglx@...utronix.de> wrote:
> 
> On Tue, Jul 01 2025 at 00:37, Prakash Sangappa wrote:
> 
> The subsystem prefix for the scheduler is 'sched:' It's not that hard to
> figure out.

Will fix that. 

> 
>> unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>> -     unsigned long ti_work);
>> +     unsigned long ti_work,
>> +     bool irq);
> 
> No need for a new line

Ok.

> 
>> /**
>>  * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
>> @@ -316,7 +317,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>  *    EXIT_TO_USER_MODE_WORK are set
>>  * 4) check that interrupts are still disabled
>>  */
>> -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
>> +static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs,
>> + bool irq)
> 
> Ditto. 100 characters line width, please use it. And if you need a line
> break, please align the second lines arguments properly. This is
> documented:
> 
> https://urldefense.com/v3/__https://www.kernel.org/doc/html/latest/process/maintainer-tip.html__;!!ACWV5N9M2RV99hQ!KeHcAVTEgkkOTIZCEsRYgcQtDwYHZ4s77sjO9fKsjO530m5mavRMOnDMB_v6fm5TBvEfRPV2tcR2KTqt1865VZ0$

Got it.

> 
>> {
>> unsigned long ti_work;
>> 
>> @@ -327,7 +329,10 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
>> 
>> ti_work = read_thread_flags();
>> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
>> - ti_work = exit_to_user_mode_loop(regs, ti_work);
>> + ti_work = exit_to_user_mode_loop(regs, ti_work, irq);
>> +
>> + if (irq)
>> + rseq_delay_resched_fini();
> 
> This is an unconditional function call for every interrupt return and
> it's even done when the whole thing is known to be non-functional at
> compile time:

Will include IS_CONFIG check. 

> 
>> +void rseq_delay_resched_fini(void)
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
>  ....
>> +#endif
>> +}
> 
> Seriously?

Will make the new config CONFIG_SCHED_PREEMPT_DELAY dependent not SCHED_HRTICK,
So, I can remove these #ifdef, #endif.

> 
>> arch_exit_to_user_mode_prepare(regs, ti_work);
>> 
>> @@ -396,6 +401,10 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>> 
>> CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
>> 
>> + /* reschedule if sched delay was granted */
> 
> Sentences start with an upper case letter and please use full words and
> not arbitrary abbreviations. This is neither twatter nor SMS.

Will fix. 

> 
>> + if (IS_ENABLED(CONFIG_RSEQ) && current->sched_time_delay)
>> + set_tsk_need_resched(current);
>> +
>> if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
>> if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
>> local_irq_enable();
>> @@ -411,7 +420,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
>> if (unlikely(work & SYSCALL_WORK_EXIT))
>> syscall_exit_work(regs, work);
>> local_irq_disable_exit_to_user();
>> - exit_to_user_mode_prepare(regs);
>> + exit_to_user_mode_prepare(regs, false);
>> }
>> 
>> /**
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 5bcf44ae6c79..9b4670d85131 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -338,6 +338,7 @@ extern int __must_check io_schedule_prepare(void);
>> extern void io_schedule_finish(int token);
>> extern long io_schedule_timeout(long timeout);
>> extern void io_schedule(void);
>> +extern void hrtick_local_start(u64 delay);
>> 
>> /* wrapper function to trace from this header file */
>> DECLARE_TRACEPOINT(sched_set_state_tp);
>> @@ -1263,6 +1264,7 @@ struct task_struct {
>> int softirq_context;
>> int irq_config;
>> #endif
>> + unsigned sched_time_delay:1;
> 
> Find an arbitrary place by rolling a dice and stick it in, right?

Sorry, merging issue. I had it next to the following 
>         unsigned                        in_thrashing:1;

Will fix it. 



> 
> There is already a section with bit fields in this struct. So it's more
> than bloody obvious to stick it there instead of creating a hole in the
> middle of task struct.
> 
>> #ifdef CONFIG_PREEMPT_RT
>> int softirq_disable_cnt;
>> #endif
>> @@ -2245,6 +2247,20 @@ static inline bool owner_on_cpu(struct task_struct *owner)
>> unsigned long sched_cpu_util(int cpu);
>> #endif /* CONFIG_SMP */
>> 
>> +#ifdef CONFIG_RSEQ
>> +
> 
> Remove these newlines please. They have zero value.

Ok

> 
>> +extern bool rseq_delay_resched(void);
>> +extern void rseq_delay_resched_fini(void);
>> +extern void rseq_delay_resched_tick(void);
>> +
>> +#else
> 
>> @@ -98,8 +99,12 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>> 
>> local_irq_enable_exit_to_user(ti_work);
>> 
>> - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
>> - schedule();
>> + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
>> +       if (irq && rseq_delay_resched())
> 
> unlikely() and again this results in an unconditional function call for
> every interrupt when CONFIG_RSEQ is enabled. A pointless exercise for
> the majority of use cases.
> 
> What's worse is that it breaks the LAZY semantics. I explained this to
> you before and this thing needs to be tied on the LAZY bit otherwise a
> SCHED_OTHER task can prevent a real-time task from running, which is
> fundamentally wrong.
> 
> So this wants to be:
> 
> if (likely(!irq || !rseq_delay_resched(ti_work))
>         schedule();
> 
> and
> 
> static inline bool rseq_delay_resched(unsigned long ti_work)
> {
>        // Set when all Kconfig conditions are met
>        if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
>         return false;
> 
>        // Only NEED_RESCHED_LAZY can be delayed
> if (ti_work & _TIF_NEED_RESCHED)
>         return false;
> 
>        // NONE indicates that current::rseq == NULL
>        // PROBE indicates that current::rseq::flags needs to be
>        // evaluated.
>        // REQUESTED indicates that there was a successful request
>        // already.
>        if (likely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_PROBE))
>         return false;
> 
>        return __rseq_delay_resched();
> }
> 
> or something like that.

Will refactor the code.

> 
>> +bool rseq_delay_resched(void)
>> +{
>> + struct task_struct *t = current;
>> + u32 flags;
>> +
>> + if (!IS_ENABLED(CONFIG_SCHED_HRTICK))
>> + return false;
>> +
>> + if (!t->rseq)
>> + return false;
>> +
>> + if (t->sched_time_delay)
>> + return false;
> 
> Then all of the above conditions go away.
> 
>> + if (copy_from_user_nofault(&flags, &t->rseq->flags, sizeof(flags)))
>> + return false;
>> +
>> + if (!(flags & RSEQ_CS_FLAG_DELAY_RESCHED))
>> + return false;
>> +
>> + flags &= ~RSEQ_CS_FLAG_DELAY_RESCHED;
>> + if (copy_to_user_nofault(&t->rseq->flags, &flags, sizeof(flags)))
>> + return false;
>> +
>> + t->sched_time_delay = 1;
> 
> and this becomes:
> 
>        t->rseq_delay_resched = RSEQ_RESCHED_DELAY_REQUESTED;
> 
>> + return true;
>> +}
>> +
>> +void rseq_delay_resched_fini(void)
> 
> What's does _fini() mean here? Absolutely nothing. This wants to be a
> self explaining function name and see below
> 
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
> 
> You really are fond of pointless function calls. Obviously performance
> is not really a concern in your work.
> 
>> + extern void hrtick_local_start(u64 delay);
> 
> header files with prototypes exist for a reason....
> 
>> + struct task_struct *t = current;
>> + /*
>> + * IRQs off, guaranteed to return to userspace, start timer on this CPU
>> + * to limit the resched-overdraft.
>> + *
>> + * If your critical section is longer than 30 us you get to keep the
>> + * pieces.
>> + */
>> + if (t->sched_time_delay)
>> + hrtick_local_start(30 * NSEC_PER_USEC);
>> +#endif
> 
> This whole thing can be condensed into:
> 
> static inline void rseq_delay_resched_arm_timer(void)
> {
> if (!IS_ENABLED(CONFIG_RSEQ_RESCHED_DELAY))
>         return;
>        if (unlikely(current->rseq_delay_resched != RSEQ_RESCHED_DELAY_REQUESTED))
>         hrtick_local_start(...);
> }

Got it, will make the changes.

> 
>> +}
>> +
>> +void rseq_delay_resched_tick(void)
>> +{
>> +#ifdef CONFIG_SCHED_HRTICK
>> + struct task_struct *t = current;
>> +
>> + if (t->sched_time_delay)
>> + set_tsk_need_resched(t);
>> +#endif
> 
> Oh well.....
> 
>> +}
>> +
>> #ifdef CONFIG_DEBUG_RSEQ
>> 
>> /*
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 4ad7cf3cfdca..c1b64879115f 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -845,6 +845,8 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
>> 
>> WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
>> 
>> + rseq_delay_resched_tick();
>> +
>> rq_lock(rq, &rf);
>> update_rq_clock(rq);
>> rq->donor->sched_class->task_tick(rq, rq->curr, 1);
>> @@ -918,6 +920,16 @@ void hrtick_start(struct rq *rq, u64 delay)
>> 
>> #endif /* CONFIG_SMP */
>> 
>> +void hrtick_local_start(u64 delay)
> 
> How is this supposed to compile cleanly without a prototype?

Will fix.

Thanks for your comments.
-Prakash
> 
> Thanks,
> 
>        tglx

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ