lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20140604015812.140a00d1@jacob-desktop>
Date:	Wed, 4 Jun 2014 01:58:12 -0700
From:	Jacob Pan <jacob.jun.pan@...ux.intel.com>
To:	Peter Zijlstra <peterz@...radead.org>
Cc:	rafael.j.wysocki@...el.com, linux-kernel@...r.kernel.org,
	linux-pm@...r.kernel.org, lenb@...nel.org, mingo@...nel.org,
	tglx@...utronix.de, hpa@...or.com, arjan@...ux.intel.com,
	rui.zhang@...el.com, luto@...capital.net
Subject: Re: [PATCH] idle, thermal, acpi: Remove home grown idle
 implementations

On Wed, 4 Jun 2014 10:54:18 +0200
Peter Zijlstra <peterz@...radead.org> wrote:

> 
> I'm still sitting on this patch. Jacob, you were going to make it play
> nice with QoS?
> 
I had a patchset to work through system PM QOS and still maintain the
idle injection efficiency. When I saw you did not merge the patch
below, I thought you had abandoned it :)

The only issue as per our last discussion is the lack of notification
when PM QOS cannot be met. But that is intrinsic to PM QOS itself.

I also consulted with Arjan and looked at integrating directly with
intel_idle since both intel_powerclamp and intel_idle are arch specific
drivers. But I think that is hard to do on a per-idle-period basis,
since we should still allow "natural" idle during the forced idle time.

So, I think we can take a two-step approach:
1. integrate your patch with an
updated version of https://lkml.org/lkml/2013/11/26/534 such that there
is no performance/efficiency regression.
2. add notification mechanism to system qos when constraints cannot be
met.


Thanks,

Jacob
> ---
> Subject: idle, thermal, acpi: Remove home grown idle implementations
> From: Peter Zijlstra <peterz@...radead.org>
> Date: Wed Nov 20 14:32:37 CET 2013
> 
> People are starting to grow their own idle implementations in various
> disgusting ways. Collapse the lot and use the generic idle code to
> provide a proper idle cycle implementation.
> 
> This does not fully preserve existing behaviour in that the generic
> idle cycle function calls into the normal cpuidle governed idle
> routines and should thus respect things like QoS parameters and the
> like.
> 
> If people want to over-ride the idle state they should talk to the
> cpuidle folks about extending the interface and attempt to preserve
> QoS guarantees, instead of jumping straight to the deepest possible C
> state -- Jacob Pan said he was going to do this.
> 
> This is reported to work for intel_powerclamp by Jacob Pan, the
> acpi_pad driver is untested.
> 
> Cc: rui.zhang@...el.com
> Cc: jacob.jun.pan@...ux.intel.com
> Cc: lenb@...nel.org
> Cc: Rafael J. Wysocki <rafael.j.wysocki@...el.com>
> Cc: Ingo Molnar <mingo@...nel.org>
> Cc: Thomas Gleixner <tglx@...utronix.de>
> Cc: hpa@...or.com
> Cc: arjan@...ux.intel.com
> Signed-off-by: Peter Zijlstra <peterz@...radead.org>
> ---
>  drivers/acpi/acpi_pad.c            |   41 -----------
>  drivers/thermal/intel_powerclamp.c |   38 ----------
>  include/linux/cpu.h                |    2 
>  include/linux/sched.h              |    3 
>  kernel/sched/core.c                |    9 ++
>  kernel/sched/idle.c                |  129
> ++++++++++++++++++++++---------------
> kernel/time/tick-sched.c           |    2 7 files changed, 95
> insertions(+), 129 deletions(-)
> 
> --- a/drivers/acpi/acpi_pad.c
> +++ b/drivers/acpi/acpi_pad.c
> @@ -40,9 +40,7 @@ static DEFINE_MUTEX(round_robin_lock);
>  static unsigned long power_saving_mwait_eax;
>  
>  static unsigned char tsc_detected_unstable;
> -static unsigned char tsc_marked_unstable;
>  static unsigned char lapic_detected_unstable;
> -static unsigned char lapic_marked_unstable;
>  
>  static void power_saving_mwait_init(void)
>  {
> @@ -152,10 +150,9 @@ static int power_saving_thread(void *dat
>  	unsigned int tsk_index = (unsigned long)data;
>  	u64 last_jiffies = 0;
>  
> -	sched_setscheduler(current, SCHED_RR, &param);
> +	sched_setscheduler(current, SCHED_FIFO, &param);
>  
>  	while (!kthread_should_stop()) {
> -		int cpu;
>  		u64 expire_time;
>  
>  		try_to_freeze();
> @@ -170,41 +167,7 @@ static int power_saving_thread(void *dat
>  
>  		expire_time = jiffies + HZ * (100 - idle_pct) / 100;
>  
> -		while (!need_resched()) {
> -			if (tsc_detected_unstable
> && !tsc_marked_unstable) {
> -				/* TSC could halt in idle, so notify
> users */
> -				mark_tsc_unstable("TSC halts in
> idle");
> -				tsc_marked_unstable = 1;
> -			}
> -			if (lapic_detected_unstable
> && !lapic_marked_unstable) {
> -				int i;
> -				/* LAPIC could halt in idle, so
> notify users */
> -				for_each_online_cpu(i)
> -					clockevents_notify(
> -
> CLOCK_EVT_NOTIFY_BROADCAST_ON,
> -						&i);
> -				lapic_marked_unstable = 1;
> -			}
> -			local_irq_disable();
> -			cpu = smp_processor_id();
> -			if (lapic_marked_unstable)
> -				clockevents_notify(
> -
> CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
> -			stop_critical_timings();
> -
> -
> mwait_idle_with_hints(power_saving_mwait_eax, 1); -
> -			start_critical_timings();
> -			if (lapic_marked_unstable)
> -				clockevents_notify(
> -
> CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
> -			local_irq_enable();
> -
> -			if (jiffies > expire_time) {
> -				do_sleep = 1;
> -				break;
> -			}
> -		}
> +		play_idle(expire_time);
>  
>  		/*
>  		 * current sched_rt has threshold for rt task
> running time. --- a/drivers/thermal/intel_powerclamp.c
> +++ b/drivers/thermal/intel_powerclamp.c
> @@ -256,11 +256,6 @@ static u64 pkg_state_counter(void)
>  	return count;
>  }
>  
> -static void noop_timer(unsigned long foo)
> -{
> -	/* empty... just the fact that we get the interrupt wakes us
> up */ -}
> -
>  static unsigned int get_compensation(int ratio)
>  {
>  	unsigned int comp = 0;
> @@ -365,7 +360,6 @@ static bool powerclamp_adjust_controls(u
>  static int clamp_thread(void *arg)
>  {
>  	int cpunr = (unsigned long)arg;
> -	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
>  	static const struct sched_param param = {
>  		.sched_priority = MAX_USER_RT_PRIO/2,
>  	};
> @@ -374,11 +368,9 @@ static int clamp_thread(void *arg)
>  
>  	set_bit(cpunr, cpu_clamping_mask);
>  	set_freezable();
> -	init_timer_on_stack(&wakeup_timer);
>  	sched_setscheduler(current, SCHED_FIFO, &param);
>  
> -	while (true == clamping && !kthread_should_stop() &&
> -		cpu_online(cpunr)) {
> +	while (clamping && !kthread_should_stop() &&
> cpu_online(cpunr)) { int sleeptime;
>  		unsigned long target_jiffies;
>  		unsigned int guard;
> @@ -426,35 +418,11 @@ static int clamp_thread(void *arg)
>  		if (should_skip)
>  			continue;
>  
> -		target_jiffies = jiffies + duration_jiffies;
> -		mod_timer(&wakeup_timer, target_jiffies);
>  		if (unlikely(local_softirq_pending()))
>  			continue;
> -		/*
> -		 * stop tick sched during idle time, interrupts are
> still
> -		 * allowed. thus jiffies are updated properly.
> -		 */
> -		preempt_disable();
> -		tick_nohz_idle_enter();
> -		/* mwait until target jiffies is reached */
> -		while (time_before(jiffies, target_jiffies)) {
> -			unsigned long ecx = 1;
> -			unsigned long eax = target_mwait;
> -
> -			/*
> -			 * REVISIT: may call enter_idle() to notify
> drivers who
> -			 * can save power during cpu idle. same for
> exit_idle()
> -			 */
> -			local_touch_nmi();
> -			stop_critical_timings();
> -			mwait_idle_with_hints(eax, ecx);
> -			start_critical_timings();
> -			atomic_inc(&idle_wakeup_counter);
> -		}
> -		tick_nohz_idle_exit();
> -		preempt_enable();
> +
> +		play_idle(duration_jiffies);
>  	}
> -	del_timer_sync(&wakeup_timer);
>  	clear_bit(cpunr, cpu_clamping_mask);
>  
>  	return 0;
> --- a/include/linux/cpu.h
> +++ b/include/linux/cpu.h
> @@ -255,6 +255,8 @@ enum cpuhp_state {
>  	CPUHP_ONLINE,
>  };
>  
> +void play_idle(unsigned long jiffies);
> +
>  void cpu_startup_entry(enum cpuhp_state state);
>  void cpu_idle(void);
>  
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1892,6 +1892,7 @@ extern void thread_group_cputime_adjuste
>  /*
>   * Per process flags
>   */
> +#define PF_IDLE		0x00000002	/* I am an IDLE
> thread */ #define PF_EXITING	0x00000004	/* getting shut
> down */ #define PF_EXITPIDONE	0x00000008	/* pi exit
> done on shut down */ #define PF_VCPU
> 0x00000010	/* I'm a virtual CPU */ @@ -2204,7 +2205,7 @@
> extern struct task_struct *idle_task(int */
>  static inline bool is_idle_task(const struct task_struct *p)
>  {
> -	return p->pid == 0;
> +	return !!(p->flags & PF_IDLE);
>  }
>  extern struct task_struct *curr_task(int cpu);
>  extern void set_curr_task(int cpu, struct task_struct *p);
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -676,10 +676,12 @@ static void wake_up_idle_cpu(int cpu)
>  	if (cpu == smp_processor_id())
>  		return;
>  
> -	if (set_nr_and_not_polling(rq->idle))
> +	rcu_read_lock();
> +	if (set_nr_and_not_polling(rq->curr))
>  		smp_send_reschedule(cpu);
>  	else
>  		trace_sched_wake_polling_cpu(cpu);
> +	rcu_read_unlock();
>  }
>  
>  static bool wake_up_full_nohz_cpu(int cpu)
> @@ -1605,10 +1607,12 @@ static void ttwu_queue_remote(struct tas
>  	struct rq *rq = cpu_rq(cpu);
>  
>  	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
> -		if (!set_nr_if_polling(rq->idle))
> +		rcu_read_lock();
> +		if (!set_nr_if_polling(rq->curr))
>  			smp_send_reschedule(cpu);
>  		else
>  			trace_sched_wake_polling_cpu(cpu);
> +		rcu_read_unlock();
>  	}
>  }
>  
> @@ -4537,6 +4541,7 @@ void init_idle(struct task_struct *idle,
>  	__sched_fork(0, idle);
>  	idle->state = TASK_RUNNING;
>  	idle->se.exec_start = sched_clock();
> +	idle->flags |= PF_IDLE;
>  
>  	do_set_cpus_allowed(idle, cpumask_of(cpu));
>  	/*
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -184,66 +184,94 @@ static void cpuidle_idle_call(void)
>   *
>   * Called with polling cleared.
>   */
> -static void cpu_idle_loop(void)
> +static void do_idle(void)
>  {
> -	while (1) {
> -		/*
> -		 * If the arch has a polling bit, we maintain an
> invariant:
> -		 *
> -		 * The polling bit is clear if we're not scheduled
> (i.e. if
> -		 * rq->curr != rq->idle).  This means that, if
> rq->idle has
> -		 * the polling bit set, then setting need_resched is
> -		 * guaranteed to cause the cpu to reschedule.
> -		 */
> +	/*
> +	 * If the arch has a polling bit, we maintain an invariant:
> +	 *
> +	 * The polling bit is clear if we're not scheduled (i.e. if
> rq->curr !=
> +	 * rq->idle).  This means that, if rq->idle has the polling
> bit set,
> +	 * then setting need_resched is guaranteed to cause the cpu
> to
> +	 * reschedule.
> +	 */
>  
> -		__current_set_polling();
> -		tick_nohz_idle_enter();
> +	__current_set_polling();
> +	tick_nohz_idle_enter();
>  
> -		while (!need_resched()) {
> -			check_pgt_cache();
> -			rmb();
> -
> -			if (cpu_is_offline(smp_processor_id()))
> -				arch_cpu_idle_dead();
> -
> -			local_irq_disable();
> -			arch_cpu_idle_enter();
> -
> -			/*
> -			 * In poll mode we reenable interrupts and
> spin.
> -			 *
> -			 * Also if we detected in the wakeup from
> idle
> -			 * path that the tick broadcast device
> expired
> -			 * for us, we don't want to go deep idle as
> we
> -			 * know that the IPI is going to arrive right
> -			 * away
> -			 */
> -			if (cpu_idle_force_poll ||
> tick_check_broadcast_expired())
> -				cpu_idle_poll();
> -			else
> -				cpuidle_idle_call();
> +	while (!need_resched()) {
> +		check_pgt_cache();
> +		rmb();
>  
> -			arch_cpu_idle_exit();
> -		}
> +		if (cpu_is_offline(smp_processor_id()))
> +			arch_cpu_idle_dead();
> +
> +		local_irq_disable();
> +		arch_cpu_idle_enter();
>  
>  		/*
> -		 * Since we fell out of the loop above, we know
> -		 * TIF_NEED_RESCHED must be set, propagate it into
> -		 * PREEMPT_NEED_RESCHED.
> +		 * In poll mode we reenable interrupts and spin.
>  		 *
> -		 * This is required because for polling idle loops
> we will
> -		 * not have had an IPI to fold the state for us.
> +		 * Also if we detected in the wakeup from idle path
> that the
> +		 * tick broadcast device expired for us, we don't
> want to go
> +		 * deep idle as we know that the IPI is going to
> arrive right
> +		 * away
>  		 */
> -		preempt_set_need_resched();
> -		tick_nohz_idle_exit();
> -		__current_clr_polling();
> -		smp_mb__after_clear_bit();
> -		sched_ttwu_pending();
> -		schedule_preempt_disabled();
> +		if (cpu_idle_force_poll ||
> tick_check_broadcast_expired())
> +			cpu_idle_poll();
> +		else
> +			cpuidle_idle_call();
> +
> +		arch_cpu_idle_exit();
>  	}
> +
> +	/*
> +	 * Since we fell out of the loop above, we know
> TIF_NEED_RESCHED must
> +	 * be set, propagate it into PREEMPT_NEED_RESCHED.
> +	 *
> +	 * This is required because for polling idle loops we will
> not have had
> +	 * an IPI to fold the state for us.
> +	 */
> +	preempt_set_need_resched();
> +	tick_nohz_idle_exit();
> +	__current_clr_polling();
> +	smp_mb__after_clear_bit();
> +	sched_ttwu_pending();
> +	schedule_preempt_disabled();
> +}
> +
> +static void play_idle_timer(unsigned long foo)
> +{
> +	set_tsk_need_resched(current);
> +}
> +
> +void play_idle(unsigned long duration)
> +{
> +	DEFINE_TIMER(wakeup_timer, play_idle_timer, 0, 0);
> +
> +	/*
> +	 * Only FIFO tasks can disable the tick since they don't
> need the forced
> +	 * preemption.
> +	 */
> +	WARN_ON_ONCE(current->policy != SCHED_FIFO);
> +	WARN_ON_ONCE(current->nr_cpus_allowed != 1);
> +	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
> +	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
> +	rcu_sleep_check();
> +
> +	init_timer_on_stack(&wakeup_timer);
> +	mod_timer_pinned(&wakeup_timer, jiffies + duration);
> +
> +	preempt_disable();
> +	current->flags |= PF_IDLE;
> +	do_idle();
> +	current->flags &= ~PF_IDLE;
> +	del_timer_sync(&wakeup_timer);
> +	preempt_fold_need_resched();
> +	preempt_enable();
>  }
> +EXPORT_SYMBOL_GPL(play_idle);
>  
> -void cpu_startup_entry(enum cpuhp_state state)
> +__noreturn void cpu_startup_entry(enum cpuhp_state state)
>  {
>  	/*
>  	 * This #ifdef needs to die, but it's too late in the cycle
> to @@ -261,5 +289,6 @@ void cpu_startup_entry(enum cpuhp_state
>  	boot_init_stack_canary();
>  #endif
>  	arch_cpu_idle_prepare();
> -	cpu_idle_loop();
> +	while (1)
> +		do_idle();
>  }
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -807,7 +807,6 @@ void tick_nohz_idle_enter(void)
>  
>  	local_irq_enable();
>  }
> -EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
>  
>  /**
>   * tick_nohz_irq_exit - update next tick event from interrupt exit
> @@ -934,7 +933,6 @@ void tick_nohz_idle_exit(void)
>  
>  	local_irq_enable();
>  }
> -EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
>  
>  static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
>  {

[Jacob Pan]
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ