lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <80182e47a7103608d2ddab7f62c0c3dffc99fdcc.1427782893.git.viresh.kumar@linaro.org>
Date:	Tue, 31 Mar 2015 12:25:16 +0530
From:	Viresh Kumar <viresh.kumar@...aro.org>
To:	Ingo Molnar <mingo@...hat.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Thomas Gleixner <tglx@...utronix.de>
Cc:	linaro-kernel@...ts.linaro.org, linux-kernel@...r.kernel.org,
	Viresh Kumar <viresh.kumar@...aro.org>,
	Steven Miao <realmz6@...il.com>
Subject: [PATCH 1/2] timer: Avoid waking up an idle-core by migrate running timer

While queuing a timer, we try to migrate it to a non-idle core if the local core
is idle, but we don't try that if the timer is re-armed from its handler.

There were few unsolved problems due to which it was avoided until now. But
there are cases where solving these problems can be useful. When the timer is
always re-armed from its handler, it never migrates to other cores. And many a
times, it ends up waking an idle core to just service the timer, which could
have been handled by a non-idle core.

Problems to solve, if we allow such migrations:

- Serialization of timer with itself. Its possible that timer may fire on the
  new base, before the timer handler finishes on old base.

- del_timer_sync() can't detect that the timer's handler has not finished.

__mod_timer migrates the timer with following code:

	spin_lock(&old_base->lock);
	timer_set_base(timer, NULL);
	spin_unlock(&old_base->lock);

	spin_lock(&new_base->lock);
	timer_set_base(timer, new_base);
	spin_unlock(&new_base->lock);

Once the new_base->lock is released, del_timer_sync() can take the
new_base->lock and will get new_base->running_timer != timer. del_timer_sync()
will then remove the timer and return while timer's handler hasn't finished yet
on the old base.

To fix these issues, lets use another bit from base pointer to mark if a timer's
handler is currently running or not. This can be used to verify timer's state in
del_timer_sync().

Before running timer's handler we must ensure timer isn't running on any other
CPU. If it is, then process other expired timers first, if any, and then wait
until the handler finishes.

Cc: Steven Miao <realmz6@...il.com>
Suggested-by: Peter Zijlstra (Intel) <peterz@...radead.org>
Signed-off-by: Viresh Kumar <viresh.kumar@...aro.org>
---
 include/linux/timer.h |  3 +-
 kernel/time/timer.c   | 93 ++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 76 insertions(+), 20 deletions(-)

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 8c5a197e1587..68bf09d69352 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -67,8 +67,9 @@ extern struct tvec_base boot_tvec_bases;
  */
 #define TIMER_DEFERRABLE		0x1LU
 #define TIMER_IRQSAFE			0x2LU
+#define TIMER_RUNNING			0x4LU
 
-#define TIMER_FLAG_MASK			0x3LU
+#define TIMER_FLAG_MASK			0x7LU
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .prev = TIMER_ENTRY_STATIC },	\
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c504939..364644811485 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -105,6 +105,21 @@ static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
 	return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
 }
 
+static inline unsigned int timer_running(struct timer_list *timer)
+{
+	return ((unsigned int)(unsigned long)timer->base & TIMER_RUNNING);
+}
+
+static inline void timer_set_running(struct timer_list *timer)
+{
+	timer->base = (struct tvec_base *)((unsigned long)timer->base | TIMER_RUNNING);
+}
+
+static inline void timer_clear_running(struct timer_list *timer)
+{
+	timer->base = (struct tvec_base *)((unsigned long)timer->base & ~TIMER_RUNNING);
+}
+
 static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
 {
 	return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
@@ -781,21 +796,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 	new_base = per_cpu(tvec_bases, cpu);
 
 	if (base != new_base) {
-		/*
-		 * We are trying to schedule the timer on the local CPU.
-		 * However we can't change timer's base while it is running,
-		 * otherwise del_timer_sync() can't detect that the timer's
-		 * handler yet has not finished. This also guarantees that
-		 * the timer is serialized wrt itself.
-		 */
-		if (likely(base->running_timer != timer)) {
-			/* See the comment in lock_timer_base() */
-			timer_set_base(timer, NULL);
-			spin_unlock(&base->lock);
-			base = new_base;
-			spin_lock(&base->lock);
-			timer_set_base(timer, base);
-		}
+		/* See the comment in lock_timer_base() */
+		timer_set_base(timer, NULL);
+		spin_unlock(&base->lock);
+		base = new_base;
+		spin_lock(&base->lock);
+		timer_set_base(timer, base);
 	}
 
 	timer->expires = expires;
@@ -1016,7 +1022,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
 
 	base = lock_timer_base(timer, &flags);
 
-	if (base->running_timer != timer) {
+	if (!timer_running(timer)) {
 		timer_stats_timer_clear_start_info(timer);
 		ret = detach_if_pending(timer, base, true);
 	}
@@ -1050,12 +1056,12 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  *    ----                             ----
  *                                   <SOFTIRQ>
  *                                   call_timer_fn();
- *                                     base->running_timer = mytimer;
+ *                                     timer_set_running(mytimer);
  *  spin_lock_irq(somelock);
  *                                     <IRQ>
  *                                        spin_lock(somelock);
  *  del_timer_sync(mytimer);
- *   while (base->running_timer == mytimer);
+ *   while (timer_running(mytimer));
  *
  * Now del_timer_sync() will never return and never release somelock.
  * The interrupt on the other CPU is waiting to grab somelock but
@@ -1189,12 +1195,41 @@ static inline void __run_timers(struct tvec_base *base)
 			cascade(base, &base->tv5, INDEX(3));
 		++base->timer_jiffies;
 		list_replace_init(base->tv1.vec + index, head);
+
+again:
 		while (!list_empty(head)) {
 			void (*fn)(unsigned long);
 			unsigned long data;
 			bool irqsafe;
 
-			timer = list_first_entry(head, struct timer_list,entry);
+			timer = list_first_entry(head, struct timer_list, entry);
+
+			if (unlikely(timer_running(timer))) {
+				/*
+				 * The only way to get here is if the handler,
+				 * running on another base, re-queued itself on
+				 * this base, and the handler hasn't finished
+				 * yet.
+				 */
+
+				if (list_is_last(&timer->entry, head)) {
+					/*
+					 * Drop lock, so that TIMER_RUNNING can
+					 * be cleared on another base.
+					 */
+					spin_unlock(&base->lock);
+
+					while (timer_running(timer))
+						cpu_relax();
+
+					spin_lock(&base->lock);
+				} else {
+					/* Rotate the list and try someone else */
+					list_move_tail(&timer->entry, head);
+				}
+				goto again;
+			}
+
 			fn = timer->function;
 			data = timer->data;
 			irqsafe = tbase_get_irqsafe(timer->base);
@@ -1202,6 +1237,7 @@ static inline void __run_timers(struct tvec_base *base)
 			timer_stats_account_timer(timer);
 
 			base->running_timer = timer;
+			timer_set_running(timer);
 			detach_expired_timer(timer, base);
 
 			if (irqsafe) {
@@ -1213,6 +1249,25 @@ static inline void __run_timers(struct tvec_base *base)
 				call_timer_fn(timer, fn, data);
 				spin_lock_irq(&base->lock);
 			}
+
+			/*
+			 * Handler running on this base, re-queued itself on
+			 * another base ?
+			 */
+			if (unlikely(timer->base != base)) {
+				unsigned long flags;
+				struct tvec_base *tbase;
+
+				spin_unlock(&base->lock);
+
+				tbase = lock_timer_base(timer, &flags);
+				timer_clear_running(timer);
+				spin_unlock(&tbase->lock);
+
+				spin_lock(&base->lock);
+			} else {
+				timer_clear_running(timer);
+			}
 		}
 	}
 	base->running_timer = NULL;
-- 
2.3.0.rc0.44.ga94655d

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ