Message-ID: <a01aea0c-f5bd-ff19-4c90-63e94392bd6d@amd.com>
Date: Fri, 12 Jul 2024 12:10:45 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Vincent Guittot <vincent.guittot@...aro.org>, Peter Zijlstra
	<peterz@...radead.org>
CC: Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
	<linux-kernel@...r.kernel.org>, Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, "Mel
 Gorman" <mgorman@...e.de>, Daniel Bristot de Oliveira <bristot@...hat.com>,
	Valentin Schneider <vschneid@...hat.com>, "Paul E. McKenney"
	<paulmck@...nel.org>, Imran Khan <imran.f.khan@...cle.com>, Leonardo Bras
	<leobras@...hat.com>, Guo Ren <guoren@...nel.org>, Rik van Riel
	<riel@...riel.com>, Tejun Heo <tj@...nel.org>, Cruz Zhao
	<CruzZhao@...ux.alibaba.com>, Lai Jiangshan <jiangshanlai@...il.com>, "Joel
 Fernandes" <joel@...lfernandes.org>, Zqiang <qiang.zhang1211@...il.com>,
	"Julia Lawall" <julia.lawall@...ia.fr>, "Gautham R. Shenoy"
	<gautham.shenoy@....com>
Subject: Re: [PATCH 2/3] sched/core: Introduce SM_IDLE and an idle re-entry
 fast-path in __schedule()

Hello Vincent, Peter,

On 7/11/2024 6:44 PM, Vincent Guittot wrote:
> On Thu, 11 Jul 2024 at 11:19, Peter Zijlstra <peterz@...radead.org> wrote:
>>
>> On Thu, Jul 11, 2024 at 10:00:15AM +0200, Vincent Guittot wrote:
>>> On Wed, 10 Jul 2024 at 11:03, K Prateek Nayak <kprateek.nayak@....com> wrote:
>>
>>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>>>> index 1e0c77eac65a..417d3ebbdf60 100644
>>>> --- a/kernel/sched/core.c
>>>> +++ b/kernel/sched/core.c
>>>> @@ -6343,19 +6343,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>>>>    * Constants for the sched_mode argument of __schedule().
>>>>    *
>>>>    * The mode argument allows RT enabled kernels to differentiate a
>>>> - * preemption from blocking on an 'sleeping' spin/rwlock. Note that
>>>> - * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
>>>> - * optimize the AND operation out and just check for zero.
>>>> + * preemption from blocking on an 'sleeping' spin/rwlock.
>>>>    */
>>>> -#define SM_NONE                        0x0
>>>> -#define SM_PREEMPT             0x1
>>>> -#define SM_RTLOCK_WAIT         0x2
>>>> -
>>>> -#ifndef CONFIG_PREEMPT_RT
>>>> -# define SM_MASK_PREEMPT       (~0U)
>>>> -#else
>>>> -# define SM_MASK_PREEMPT       SM_PREEMPT
>>>> -#endif
>>>> +#define SM_IDLE                        (-1)
>>>> +#define SM_NONE                        0
>>>> +#define SM_PREEMPT             1
>>>> +#define SM_RTLOCK_WAIT         2
>>>>
>>>>   /*
>>>>    * __schedule() is the main scheduler function.
>>>> @@ -6396,11 +6389,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>>>>    *
>>>>    * WARNING: must be called with preemption disabled!
>>>>    */
>>>> -static void __sched notrace __schedule(unsigned int sched_mode)
>>>> +static void __sched notrace __schedule(int sched_mode)
>>>>   {
>>>>          struct task_struct *prev, *next;
>>>>          unsigned long *switch_count;
>>>>          unsigned long prev_state;
>>>> +       bool preempt = sched_mode > 0;
>>>>          struct rq_flags rf;
>>>>          struct rq *rq;
>>>>          int cpu;
>>>> @@ -6409,13 +6403,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>>>>          rq = cpu_rq(cpu);
>>>>          prev = rq->curr;
>>>>
>>>> -       schedule_debug(prev, !!sched_mode);
>>>> +       schedule_debug(prev, preempt);
>>>>
>>>>          if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
>>>>                  hrtick_clear(rq);
>>>>
>>>>          local_irq_disable();
>>>> -       rcu_note_context_switch(!!sched_mode);
>>>> +       rcu_note_context_switch(preempt);
>>>>
>>>>          /*
>>>>           * Make sure that signal_pending_state()->signal_pending() below
>>>> @@ -6449,7 +6443,12 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>>>>           * that we form a control dependency vs deactivate_task() below.
>>>>           */
>>>>          prev_state = READ_ONCE(prev->__state);
>>>> -       if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
>>>> +       if (sched_mode == SM_IDLE) {
>>>> +               if (!rq->nr_running) {
>>>> +                       next = prev;
>>>> +                       goto picked;
>>>> +               }
>>>> +       } else if (!preempt && prev_state) {
>>>
>>> With CONFIG_PREEMPT_RT, it was only for SM_PREEMPT but not for SM_RTLOCK_WAIT
>>
>> Bah, yes. But then schedule_debug() and rcu_note_context_switch() have
>> an argument that is called 'preempt' but is set for SM_RTLOCK_WAIT.
>>
>> Now, I think the RCU thing is actually correct here: it doesn't want to
>> consider SM_RTLOCK_WAIT as a voluntary schedule point, because spinlocks
>> don't either. But it is confusing as heck.
>>
>> We can either write things like:
>>
>>          } else if (sched_mode != SM_PREEMPT && prev_state) {
> 
> this would work with something like below
> 
> #ifdef CONFIG_PREEMPT_RT
> # define SM_RTLOCK_WAIT	2
> #else
> # define SM_RTLOCK_WAIT	SM_PREEMPT
> #endif

Since "SM_RTLOCK_WAIT" is only used by "schedule_rtlock()" which is only
defined for PREEMPT_RT kernels (from a quick grep on linux-6.10.y-rt),
it should just work (famous last words) and we can perhaps skip the else
part too?

With this patch, the components in __schedule() that look at "sched_mode"
need the following view of what "preempt" should be:

		schedule_debug()/		SM_MASK_PREEMPT check/
		rcu_note_context_switch()	trace_sched_switch()
SM_IDLE			F				F
SM_NONE			F				F
SM_PREEMPT		T				T
SM_RTLOCK_WAIT *	T				F

   * SM_RTLOCK_WAIT is only used in PREEMPT_RT
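
As a sanity check of the table, here is a quick userspace sketch (not
kernel code) that derives both views from the constants in the patch
above, using "mode > SM_NONE" for the first column and
"mode == SM_PREEMPT" for the second:

#include <stdio.h>
#include <stdbool.h>

#define SM_IDLE		(-1)
#define SM_NONE		0
#define SM_PREEMPT	1
#define SM_RTLOCK_WAIT	2	/* only ever passed on PREEMPT_RT */

int main(void)
{
	const int modes[] = { SM_IDLE, SM_NONE, SM_PREEMPT, SM_RTLOCK_WAIT };
	const char *names[] = { "SM_IDLE", "SM_NONE", "SM_PREEMPT", "SM_RTLOCK_WAIT" };

	for (int i = 0; i < 4; i++) {
		/* view of schedule_debug()/rcu_note_context_switch() */
		bool debug_rcu = modes[i] > SM_NONE;
		/* view of the SM_MASK_PREEMPT check/trace_sched_switch() */
		bool state_check = modes[i] == SM_PREEMPT;

		printf("%-16s debug/rcu=%c state-check=%c\n", names[i],
		       debug_rcu ? 'T' : 'F', state_check ? 'T' : 'F');
	}
	return 0;
}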

> 
>>
>> or do silly things like:

... and since we are talking about silly ideas, here is one:

(only build tested on tip:sched/core and linux-6.10.y-rt)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 417d3ebbdf60..d9273af69f9e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6345,10 +6345,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   * The mode argument allows RT enabled kernels to differentiate a
   * preemption from blocking on an 'sleeping' spin/rwlock.
   */
-#define SM_IDLE			(-1)
-#define SM_NONE			0
-#define SM_PREEMPT		1
-#define SM_RTLOCK_WAIT		2
+#ifdef CONFIG_PREEMPT_RT
+#define SM_RTLOCK_WAIT		(-2)
+#endif
+#define SM_IDLE			0
+#define SM_NONE			1
+#define SM_PREEMPT		2
  
  /*
   * __schedule() is the main scheduler function.
@@ -6391,10 +6393,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   */
  static void __sched notrace __schedule(int sched_mode)
  {
+	/*
+	 * For PREEMPT_RT kernels, SM_RTLOCK_WAIT is considered
+	 * preemption by schedule_debug() and
+	 * rcu_note_context_switch().
+	 */
+	bool preempt = (unsigned int) sched_mode > SM_NONE;
  	struct task_struct *prev, *next;
  	unsigned long *switch_count;
  	unsigned long prev_state;
-	bool preempt = sched_mode > 0;
  	struct rq_flags rf;
  	struct rq *rq;
  	int cpu;
@@ -6438,6 +6445,14 @@ static void __sched notrace __schedule(int sched_mode)
  
  	switch_count = &prev->nivcsw;
  
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * PREEMPT_RT kernels do not consider SM_RTLOCK_WAIT as
+	 * preemption when looking at prev->state.
+	 */
+	preempt = sched_mode > SM_NONE;
+#endif
+
  	/*
  	 * We must load prev->state once (task_struct::state is volatile), such
  	 * that we form a control dependency vs deactivate_task() below.
--
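
Since the cast is admittedly a bit too clever, here is the same kind of
userspace sketch (again, not kernel code) showing how the single
unsigned comparison folds SM_RTLOCK_WAIT in with SM_PREEMPT, while the
later signed comparison excludes it again:

#include <stdio.h>
#include <stdbool.h>

#define SM_RTLOCK_WAIT	(-2)
#define SM_IDLE		0
#define SM_NONE		1
#define SM_PREEMPT	2

int main(void)
{
	const int modes[] = { SM_IDLE, SM_NONE, SM_PREEMPT, SM_RTLOCK_WAIT };
	const char *names[] = { "SM_IDLE", "SM_NONE", "SM_PREEMPT", "SM_RTLOCK_WAIT" };

	for (int i = 0; i < 4; i++) {
		/* (unsigned int)-2 == 0xfffffffe, which is > SM_NONE */
		bool debug_rcu = (unsigned int)modes[i] > SM_NONE;
		/* signed: -2 > 1 is false, so SM_RTLOCK_WAIT drops out */
		bool state_check = modes[i] > SM_NONE;

		printf("%-16s debug/rcu=%c state-check=%c\n", names[i],
		       debug_rcu ? 'T' : 'F', state_check ? 'T' : 'F');
	}
	return 0;
}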

>>
>> #define SM_IDLE (-16)
>>
>> keep the SM_MASK_PREEMPT trickery and do:
>>
>>          } else if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
>>
>> Not sure that is actually going to matter at this point though.
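
For completeness, the same kind of sketch for that alternative as well
(SM_IDLE at -16, SM_MASK_PREEMPT kept as before; build with
-DCONFIG_PREEMPT_RT to see the RT side):

#include <stdio.h>

#define SM_IDLE		(-16)
#define SM_NONE		0x0
#define SM_PREEMPT	0x1
#define SM_RTLOCK_WAIT	0x2

#ifndef CONFIG_PREEMPT_RT
# define SM_MASK_PREEMPT	(~0U)
#else
# define SM_MASK_PREEMPT	SM_PREEMPT
#endif

int main(void)
{
	const int modes[] = { SM_IDLE, SM_NONE, SM_PREEMPT, SM_RTLOCK_WAIT };
	const char *names[] = { "SM_IDLE", "SM_NONE", "SM_PREEMPT", "SM_RTLOCK_WAIT" };

	for (int i = 0; i < 4; i++) {
		/*
		 * The prev_state branch is taken only when this is zero.
		 * SM_IDLE never actually reaches it, since __schedule()
		 * handles sched_mode == SM_IDLE first; -16 just keeps its
		 * low bits clear of SM_PREEMPT/SM_RTLOCK_WAIT.
		 */
		unsigned int masked = modes[i] & SM_MASK_PREEMPT;

		printf("%-16s masked=%#x -> %s prev_state check\n",
		       names[i], masked, masked ? "skips" : "takes");
	}
	return 0;
}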

-- 
Thanks and Regards,
Prateek
