lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <e334aff9-248c-4a00-98e1-7bcb7cdd5e90@linux.ibm.com>
Date: Tue, 22 Oct 2024 22:14:41 +0530
From: Shrikanth Hegde <sshegde@...ux.ibm.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: linux-kernel@...r.kernel.org, juri.lelli@...hat.com,
        vincent.guittot@...aro.org, dietmar.eggemann@....com,
        rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de,
        vschneid@...hat.com, ankur.a.arora@...cle.com, efault@....de,
        bigeasy@...utronix.de, tglx@...utronix.de, mingo@...nel.org
Subject: Re: [PATCH 2/5] sched: Add Lazy preemption model



On 10/7/24 13:16, Peter Zijlstra wrote:
> Change fair to use resched_curr_lazy(), which, when the lazy
> preemption model is selected, will set TIF_NEED_RESCHED_LAZY.
> 
> This LAZY bit will be promoted to the full NEED_RESCHED bit on tick.
> As such, the average delay between setting LAZY and actually
> rescheduling will be TICK_NSEC/2.
> 
> In short, Lazy preemption will delay preemption for fair class but
> will function as Full preemption for all the other classes, most
> notably the realtime (RR/FIFO/DEADLINE) classes.
> 
> The goal is to bridge the performance gap with Voluntary, such that we
> might eventually remove that option entirely.
> 
> Suggested-by: Thomas Gleixner <tglx@...utronix.de>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> ---
>   include/linux/preempt.h |    8 ++++-
>   kernel/Kconfig.preempt  |   15 +++++++++
>   kernel/sched/core.c     |   76 ++++++++++++++++++++++++++++++++++++++++++++++--
>   kernel/sched/debug.c    |    5 +--
>   kernel/sched/fair.c     |    6 +--
>   kernel/sched/sched.h    |    1
>   6 files changed, 103 insertions(+), 8 deletions(-)
> 
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_dis
>   extern bool preempt_model_none(void);
>   extern bool preempt_model_voluntary(void);
>   extern bool preempt_model_full(void);
> +extern bool preempt_model_lazy(void);
>   
>   #else
>   
> @@ -502,6 +503,11 @@ static inline bool preempt_model_full(vo
>   	return IS_ENABLED(CONFIG_PREEMPT);
>   }
>   
> +static inline bool preempt_model_lazy(void)
> +{
> +	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
> +}
> +
>   #endif
>   
>   static inline bool preempt_model_rt(void)
> @@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void
>    */
>   static inline bool preempt_model_preemptible(void)
>   {
> -	return preempt_model_full() || preempt_model_rt();
> +	return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
>   }
>   
>   #endif /* __LINUX_PREEMPT_H */
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -11,6 +11,9 @@ config PREEMPT_BUILD
>   	select PREEMPTION
>   	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
>   
> +config ARCH_HAS_PREEMPT_LAZY
> +	bool
> +
>   choice
>   	prompt "Preemption Model"
>   	default PREEMPT_NONE
> @@ -67,6 +70,18 @@ config PREEMPT
>   	  embedded system with latency requirements in the milliseconds
>   	  range.
>   
> +config PREEMPT_LAZY
> +	bool "Scheduler controlled preemption model"
> +	depends on !ARCH_NO_PREEMPT
> +	depends on ARCH_HAS_PREEMPT_LAZY
> +	select PREEMPT_BUILD
> +	help
> +	  This option provides a scheduler driven preemption model that
> +	  is fundamentally similar to full preemption, but is less
> +	  eager to preempt SCHED_NORMAL tasks in an attempt to
> +	  reduce lock holder preemption and recover some of the performance
> +	  gains seen from using Voluntary preemption.
> +
>   config PREEMPT_RT
>   	bool "Fully Preemptible Kernel (Real-Time)"
>   	depends on EXPERT && ARCH_SUPPORTS_RT
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1078,6 +1078,9 @@ static void __resched_curr(struct rq *rq
>   
>   	lockdep_assert_rq_held(rq);
>   
> +	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
> +		tif = TIF_NEED_RESCHED;
> +
>   	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
>   		return;
>   
> @@ -1103,6 +1106,32 @@ void resched_curr(struct rq *rq)
>   	__resched_curr(rq, TIF_NEED_RESCHED);
>   }
>   
> +#ifdef CONFIG_PREEMPT_DYNAMIC
> +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
> +}
> +#else
> +static __always_inline bool dynamic_preempt_lazy(void)
> +{
> +	return IS_ENABLED(PREEMPT_LAZY);

I had to change this to IS_ENABLED(CONFIG_PREEMPT_LAZY) for lazy preemption to work
on systems where CONFIG_PREEMPT_DYNAMIC=n.

> +}
> +#endif
> +
> +static __always_inline int tif_need_resched_lazy(void)
> +{
> +	if (dynamic_preempt_lazy())
> +		return TIF_NEED_RESCHED_LAZY;
> +
> +	return TIF_NEED_RESCHED;
> +}
> +
> +void resched_curr_lazy(struct rq *rq)
> +{
> +	__resched_curr(rq, tif_need_resched_lazy());
> +}
> +
>   void resched_cpu(int cpu)
>   {
>   	struct rq *rq = cpu_rq(cpu);
> @@ -5598,6 +5627,10 @@ void sched_tick(void)
>   	update_rq_clock(rq);
>   	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
>   	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> +
> +	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
> +		resched_curr(rq);
> +
>   	curr->sched_class->task_tick(rq, curr, 0);
>   	if (sched_feat(LATENCY_WARN))
>   		resched_latency = cpu_resched_latency(rq);
> @@ -7334,6 +7367,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>    *   preempt_schedule           <- NOP
>    *   preempt_schedule_notrace   <- NOP
>    *   irqentry_exit_cond_resched <- NOP
> + *   dynamic_preempt_lazy       <- false
>    *
>    * VOLUNTARY:
>    *   cond_resched               <- __cond_resched
> @@ -7341,6 +7375,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>    *   preempt_schedule           <- NOP
>    *   preempt_schedule_notrace   <- NOP
>    *   irqentry_exit_cond_resched <- NOP
> + *   dynamic_preempt_lazy       <- false
>    *
>    * FULL:
>    *   cond_resched               <- RET0
> @@ -7348,6 +7383,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_writ
>    *   preempt_schedule           <- preempt_schedule
>    *   preempt_schedule_notrace   <- preempt_schedule_notrace
>    *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
> + *   dynamic_preempt_lazy       <- false
> + *
> + * LAZY:
> + *   cond_resched               <- RET0
> + *   might_resched              <- RET0
> + *   preempt_schedule           <- preempt_schedule
> + *   preempt_schedule_notrace   <- preempt_schedule_notrace
> + *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
> + *   dynamic_preempt_lazy       <- true
>    */
>   
>   enum {
> @@ -7355,6 +7399,7 @@ enum {
>   	preempt_dynamic_none,
>   	preempt_dynamic_voluntary,
>   	preempt_dynamic_full,
> +	preempt_dynamic_lazy,
>   };
>   
>   int preempt_dynamic_mode = preempt_dynamic_undefined;
> @@ -7370,15 +7415,23 @@ int sched_dynamic_mode(const char *str)
>   	if (!strcmp(str, "full"))
>   		return preempt_dynamic_full;
>   
> +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
> +	if (!strcmp(str, "lazy"))
> +		return preempt_dynamic_lazy;
> +#endif
> +
>   	return -EINVAL;
>   }
>   
> +#define preempt_dynamic_key_enable(f)	static_key_enable(&sk_dynamic_##f.key)
> +#define preempt_dynamic_key_disable(f)	static_key_disable(&sk_dynamic_##f.key)
> +
>   #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
>   #define preempt_dynamic_enable(f)	static_call_update(f, f##_dynamic_enabled)
>   #define preempt_dynamic_disable(f)	static_call_update(f, f##_dynamic_disabled)
>   #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
> -#define preempt_dynamic_enable(f)	static_key_enable(&sk_dynamic_##f.key)
> -#define preempt_dynamic_disable(f)	static_key_disable(&sk_dynamic_##f.key)
> +#define preempt_dynamic_enable(f)	preempt_dynamic_key_enable(f)
> +#define preempt_dynamic_disable(f)	preempt_dynamic_key_disable(f)
>   #else
>   #error "Unsupported PREEMPT_DYNAMIC mechanism"
>   #endif
> @@ -7398,6 +7451,7 @@ static void __sched_dynamic_update(int m
>   	preempt_dynamic_enable(preempt_schedule);
>   	preempt_dynamic_enable(preempt_schedule_notrace);
>   	preempt_dynamic_enable(irqentry_exit_cond_resched);
> +	preempt_dynamic_key_disable(preempt_lazy);
>   
>   	switch (mode) {
>   	case preempt_dynamic_none:
> @@ -7407,6 +7461,7 @@ static void __sched_dynamic_update(int m
>   		preempt_dynamic_disable(preempt_schedule);
>   		preempt_dynamic_disable(preempt_schedule_notrace);
>   		preempt_dynamic_disable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_disable(preempt_lazy);
>   		if (mode != preempt_dynamic_mode)
>   			pr_info("Dynamic Preempt: none\n");
>   		break;
> @@ -7418,6 +7473,7 @@ static void __sched_dynamic_update(int m
>   		preempt_dynamic_disable(preempt_schedule);
>   		preempt_dynamic_disable(preempt_schedule_notrace);
>   		preempt_dynamic_disable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_disable(preempt_lazy);
>   		if (mode != preempt_dynamic_mode)
>   			pr_info("Dynamic Preempt: voluntary\n");
>   		break;
> @@ -7429,9 +7485,22 @@ static void __sched_dynamic_update(int m
>   		preempt_dynamic_enable(preempt_schedule);
>   		preempt_dynamic_enable(preempt_schedule_notrace);
>   		preempt_dynamic_enable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_disable(preempt_lazy);
>   		if (mode != preempt_dynamic_mode)
>   			pr_info("Dynamic Preempt: full\n");
>   		break;
> +
> +	case preempt_dynamic_lazy:
> +		if (!klp_override)
> +			preempt_dynamic_disable(cond_resched);
> +		preempt_dynamic_disable(might_resched);
> +		preempt_dynamic_enable(preempt_schedule);
> +		preempt_dynamic_enable(preempt_schedule_notrace);
> +		preempt_dynamic_enable(irqentry_exit_cond_resched);
> +		preempt_dynamic_key_enable(preempt_lazy);
> +		if (mode != preempt_dynamic_mode)
> +			pr_info("Dynamic Preempt: lazy\n");
> +		break;
>   	}
>   
>   	preempt_dynamic_mode = mode;
> @@ -7494,6 +7563,8 @@ static void __init preempt_dynamic_init(
>   			sched_dynamic_update(preempt_dynamic_none);
>   		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
>   			sched_dynamic_update(preempt_dynamic_voluntary);
> +		} else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
> +			sched_dynamic_update(preempt_dynamic_lazy);
>   		} else {
>   			/* Default static call setting, nothing to do */
>   			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
> @@ -7514,6 +7585,7 @@ static void __init preempt_dynamic_init(
>   PREEMPT_MODEL_ACCESSOR(none);
>   PREEMPT_MODEL_ACCESSOR(voluntary);
>   PREEMPT_MODEL_ACCESSOR(full);
> +PREEMPT_MODEL_ACCESSOR(lazy);
>   
>   #else /* !CONFIG_PREEMPT_DYNAMIC: */
>   
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struc
>   static int sched_dynamic_show(struct seq_file *m, void *v)
>   {
>   	static const char * preempt_modes[] = {
> -		"none", "voluntary", "full"
> +		"none", "voluntary", "full", "lazy",
>   	};
> +	int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
>   	int i;
>   
> -	for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
> +	for (i = 0; i < j; i++) {
>   		if (preempt_dynamic_mode == i)
>   			seq_puts(m, "(");
>   		seq_puts(m, preempt_modes[i]);
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1251,7 +1251,7 @@ static void update_curr(struct cfs_rq *c
>   		return;
>   
>   	if (resched || did_preempt_short(cfs_rq, curr)) {



If there is a long-running task, LAZY is set only after the task is no longer eligible, and
the subsequent tick upgrades it to NR. If one sets sysctl_sched_base_slice to a large
value (max 4 seconds), LAZY would be set only after that interval (up to 4 seconds) if there
is no wakeup on that CPU.

If I set sysctl_sched_base_slice=300ms and spawn 2 stress-ng on one CPU, then the LAZY bit is
usually set about 300ms after sched_switch if there are no wakeups. On the subsequent tick, NR is set.
Initially I thought that, with a long-running process, LAZY would be set after
one tick and NR would be set on the subsequent tick. I was wrong. It might take a long time for LAZY
to be set, and NR is then set on the subsequent tick.

Is that the expected behavior, given that anyone setting sysctl_sched_base_slice knows what to expect?

> -		resched_curr(rq);
> +		resched_curr_lazy(rq);
>   		clear_buddies(cfs_rq, curr);
>   	}
>   }
> @@ -5677,7 +5677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
>   	 * validating it and just reschedule.
>   	 */
>   	if (queued) {

What is `queued` used for here? hrtick seems to set it. I haven't understood how it works.

> -		resched_curr(rq_of(cfs_rq));
> +		resched_curr_lazy(rq_of(cfs_rq));
>   		return;
>   	}
>   	/*
> @@ -8832,7 +8832,7 @@ static void check_preempt_wakeup_fair(st
>   	return;
>   
>   preempt:
> -	resched_curr(rq);

Would it be better to call resched_curr() here? When the code arrives here, it wants to
run pse as soon as possible, right?

> +	resched_curr_lazy(rq);
>   }
>   
>   static struct task_struct *pick_task_fair(struct rq *rq)
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2692,6 +2692,7 @@ extern void init_sched_rt_class(void);
>   extern void init_sched_fair_class(void);
>   
>   extern void resched_curr(struct rq *rq);
> +extern void resched_curr_lazy(struct rq *rq);
>   extern void resched_cpu(int cpu);
>   
>   extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
> 
> 
> 


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ