Date:	Wed, 22 Apr 2015 23:56:01 +0200 (CEST)
From:	Thomas Gleixner <tglx@...utronix.de>
To:	Eric Dumazet <eric.dumazet@...il.com>
cc:	Peter Zijlstra <peterz@...radead.org>,
	viresh kumar <viresh.kumar@...aro.org>,
	Ingo Molnar <mingo@...hat.com>, linaro-kernel@...ts.linaro.org,
	linux-kernel@...r.kernel.org, Steven Miao <realmz6@...il.com>,
	shashim@...eaurora.org
Subject: Re: [PATCH 1/2] timer: Avoid waking up an idle-core by migrate
 running timer

On Wed, 22 Apr 2015, Eric Dumazet wrote:
> On Wed, 2015-04-22 at 20:56 +0200, Thomas Gleixner wrote:
> > On Wed, 22 Apr 2015, Eric Dumazet wrote:
> > > Check commit 4a8e320c929991c9480 ("net: sched: use pinned timers")
> > > for a specific example of the problems that can arise.
> > 
> > If you have a problem with the core timer code then it should be fixed
> > there and not worked around in some place which will ruin stuff for
> > users interested in power saving. I'm so tired of this 'I fix it in my
> > sandbox' attitude, really. If the core code has a shortcoming we fix
> > it there right away because you are probably not the only one who runs
> > into that shortcoming. So if we don't fix it in the core we end up
> > with a metric ton of slightly different (or broken) workarounds which
> > affect the workload/system characteristics of other people.
> > 
> > Just for the record. Even the changelog of this commit is blatantly
> > wrong:
> > 
> >   "We can see that timers get migrated into a single cpu, presumably
> >    idle at the time timers are set up."
> 
> Spare me the obvious typo. A 'not' is missing.

:)
 
> > 
> > The timer migration moves timers to non-idle cpus to leave the idle
> > ones alone for power saving's sake.
> > 
> > I can see and understand the reason why you want to avoid that, but I
> > have to ask whether this pinning is the correct behaviour under all
> > workloads and system characteristics. If yes, then the patch is the
> > right answer; if no, then it is simply the wrong approach.

I take your 'Awesome' below as a no then.
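
For reference, the target selection we are talking about is roughly this
today (get_nohz_timer_target() in kernel/sched/core.c, comments trimmed):

int get_nohz_timer_target(int pinned)
{
	int cpu = smp_processor_id();
	int i;
	struct sched_domain *sd;

	/* Pinned timer, migration disabled or busy cpu: stay local */
	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
		return cpu;

	/* Otherwise pick the first non-idle cpu from the sched domains */
	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}
unlock:
	rcu_read_unlock();
	return cpu;
}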

> > > but /proc/sys/kernel/timer_migration adds a fair overhead in many
> > > workloads.
> > > 
> > > get_nohz_timer_target() has to touch 3 cache lines per cpu...
> > 
> > And this is something we can fix and completely avoid if we think
> > about it. Looking at the code I have to admit that the out-of-line
> > call and the sysctl variable lookup are silly. But it's not rocket
> > science to fix this.
> 
> Awesome.

Here you go. Completely untested, at least it compiles.
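
The idea is to make timer_migration a static key. With jump labels enabled,
static_key_true() compiles to a patched branch, so the hot path no longer
reads sysctl_timer_migration at all, and when migration is disabled (or the
timer is pinned) get_nohz_timer_target() is never called. On the timer.c
side it boils down to this (condensed from the diff below):

static inline struct tvec_base *get_target_base(int pinned)
{
	/* Pinned timers always stay on the current cpu */
	if (pinned)
		return __this_cpu_read(tvec_bases);
	/* Patched branch instead of a sysctl_timer_migration load */
	if (static_key_true(&timer_migration_enabled))
		return per_cpu(tvec_bases, get_nohz_timer_target());
	return __this_cpu_read(tvec_bases);
}

The sysctl handler keeps the key in sync by calling static_key_slow_inc()
or static_key_slow_dec() only when the written value actually changes.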

Thanks,

	tglx
---

Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -343,14 +343,10 @@ extern int runqueue_is_locked(int cpu);
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void nohz_balance_enter_idle(int cpu);
 extern void set_cpu_sd_state_idle(void);
-extern int get_nohz_timer_target(int pinned);
+extern int get_nohz_timer_target(void);
 #else
 static inline void nohz_balance_enter_idle(int cpu) { }
 static inline void set_cpu_sd_state_idle(void) { }
-static inline int get_nohz_timer_target(int pinned)
-{
-	return smp_processor_id();
-}
 #endif
 
 /*
Index: linux/include/linux/sched/sysctl.h
===================================================================
--- linux.orig/include/linux/sched/sysctl.h
+++ linux/include/linux/sched/sysctl.h
@@ -57,24 +57,12 @@ extern unsigned int sysctl_numa_balancin
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_time_avg;
-extern unsigned int sysctl_timer_migration;
 extern unsigned int sysctl_sched_shares_window;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
 		loff_t *ppos);
 #endif
-#ifdef CONFIG_SCHED_DEBUG
-static inline unsigned int get_sysctl_timer_migration(void)
-{
-	return sysctl_timer_migration;
-}
-#else
-static inline unsigned int get_sysctl_timer_migration(void)
-{
-	return 1;
-}
-#endif
 
 /*
  *  control realtime throttling:
Index: linux/include/linux/timer.h
===================================================================
--- linux.orig/include/linux/timer.h
+++ linux/include/linux/timer.h
@@ -254,6 +254,16 @@ extern void run_local_timers(void);
 struct hrtimer;
 extern enum hrtimer_restart it_real_fn(struct hrtimer *);
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#include <linux/sysctl.h>
+
+extern unsigned int sysctl_timer_migration;
+int timer_migration_handler(struct ctl_table *table, int write,
+			    void __user *buffer, size_t *lenp,
+			    loff_t *ppos);
+extern struct static_key timer_migration_enabled;
+#endif
+
 unsigned long __round_jiffies(unsigned long j, int cpu);
 unsigned long __round_jiffies_relative(unsigned long j, int cpu);
 unsigned long round_jiffies(unsigned long j);
Index: linux/kernel/sched/core.c
===================================================================
--- linux.orig/kernel/sched/core.c
+++ linux/kernel/sched/core.c
@@ -593,13 +593,12 @@ void resched_cpu(int cpu)
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
 {
-	int cpu = smp_processor_id();
-	int i;
+	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+	if (!idle_cpu(cpu))
 		return cpu;
 
 	rcu_read_lock();
@@ -7088,8 +7087,6 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
 
-const_debug unsigned int sysctl_timer_migration = 1;
-
 int in_sched_functions(unsigned long addr)
 {
 	return in_lock_functions(addr) ||
Index: linux/kernel/sysctl.c
===================================================================
--- linux.orig/kernel/sysctl.c
+++ linux/kernel/sysctl.c
@@ -349,15 +349,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "timer_migration",
-		.data		= &sysctl_timer_migration,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
-	},
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
 	{
@@ -1132,6 +1123,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+	{
+		.procname	= "timer_migration",
+		.data		= &sysctl_timer_migration,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= timer_migration_handler,
+	},
+#endif
 	{ }
 };
 
Index: linux/kernel/time/hrtimer.c
===================================================================
--- linux.orig/kernel/time/hrtimer.c
+++ linux/kernel/time/hrtimer.c
@@ -190,6 +190,23 @@ hrtimer_check_target(struct hrtimer *tim
 #endif
 }
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static inline struct hrtimer_cpu_base *get_target_base(int pinned)
+{
+	if (pinned)
+		return this_cpu_ptr(&hrtimer_bases);
+	if (static_key_true(&timer_migration_enabled))
+		return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+	return this_cpu_ptr(&hrtimer_bases);
+}
+#else
+
+static inline struct hrtimer_cpu_base *get_target_base(int pinned)
+{
+	return this_cpu_ptr(&hrtimer_bases);
+}
+#endif
+
 /*
  * Switch the timer base to the current CPU when possible.
  */
@@ -197,14 +214,13 @@ static inline struct hrtimer_clock_base
 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 		    int pinned)
 {
+	struct hrtimer_cpu_base *new_cpu_base, *this_base;
 	struct hrtimer_clock_base *new_base;
-	struct hrtimer_cpu_base *new_cpu_base;
-	int this_cpu = smp_processor_id();
-	int cpu = get_nohz_timer_target(pinned);
 	int basenum = base->index;
 
+	this_base = this_cpu_ptr(&hrtimer_bases);
+	new_cpu_base = get_target_base(pinned);
 again:
-	new_cpu_base = &per_cpu(hrtimer_bases, cpu);
 	new_base = &new_cpu_base->clock_base[basenum];
 
 	if (base != new_base) {
@@ -225,17 +241,19 @@ again:
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
-		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-			cpu = this_cpu;
+		if (new_cpu_base != this_base &&
+		    hrtimer_check_target(timer, new_base)) {
 			raw_spin_unlock(&new_base->cpu_base->lock);
 			raw_spin_lock(&base->cpu_base->lock);
+			new_cpu_base = this_base;
 			timer->base = base;
 			goto again;
 		}
 		timer->base = new_base;
 	} else {
-		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-			cpu = this_cpu;
+		if (new_cpu_base != this_base &&
+		    hrtimer_check_target(timer, new_base)) {
+			new_cpu_base = this_base;
 			goto again;
 		}
 	}
Index: linux/kernel/time/timer.c
===================================================================
--- linux.orig/kernel/time/timer.c
+++ linux/kernel/time/timer.c
@@ -104,6 +104,49 @@ EXPORT_SYMBOL(boot_tvec_bases);
 
 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+unsigned int sysctl_timer_migration = 1;
+struct static_key timer_migration_enabled = STATIC_KEY_INIT_TRUE;
+
+int timer_migration_handler(struct ctl_table *table, int write,
+			    void __user *buffer, size_t *lenp,
+			    loff_t *ppos)
+{
+	static DEFINE_MUTEX(mutex);
+	bool keyon;
+	int ret;
+
+	mutex_lock(&mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		goto unlock;
+
+	keyon = static_key_enabled(&timer_migration_enabled);
+	if (sysctl_timer_migration && !keyon)
+		static_key_slow_inc(&timer_migration_enabled);
+	else if (!sysctl_timer_migration && keyon)
+		static_key_slow_dec(&timer_migration_enabled);
+
+unlock:
+	mutex_unlock(&mutex);
+	return ret;
+}
+
+static inline struct tvec_base *get_target_base(int pinned)
+{
+	if (pinned)
+		return __this_cpu_read(tvec_bases);
+	if (static_key_true(&timer_migration_enabled))
+		return per_cpu(tvec_bases, get_nohz_timer_target());
+	return __this_cpu_read(tvec_bases);
+}
+#else
+static inline struct tvec_base *get_target_base(int pinned)
+{
+	return __this_cpu_read(tvec_bases);
+}
+#endif
+
 /* Functions below help us manage 'deferrable' flag */
 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
 {
@@ -774,7 +817,7 @@ __mod_timer(struct timer_list *timer, un
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
-	int ret = 0 , cpu;
+	int ret = 0;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(!timer->function);
@@ -787,8 +830,7 @@ __mod_timer(struct timer_list *timer, un
 
 	debug_activate(timer, expires);
 
-	cpu = get_nohz_timer_target(pinned);
-	new_base = per_cpu(tvec_bases, cpu);
+	new_base = get_target_base(pinned);
 
 	if (base != new_base) {
 		/*