Date:	Sat, 3 May 2014 19:01:11 +0200
From:	Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To:	linux-rt-users <linux-rt-users@...r.kernel.org>
Cc:	LKML <linux-kernel@...r.kernel.org>,
	Thomas Gleixner <tglx@...utronix.de>, rostedt@...dmis.org,
	John Kacur <jkacur@...hat.com>
Subject: [ANNOUNCE] 3.14.2-rt3

Dear RT folks!

I'm pleased to announce the v3.14.2-rt3 patch set.

Changes since v3.14.2-rt2
- rwsem readers are no longer allowed to nest. A patch from Steven Rostedt.
- A few bugs in the hotplug code, introduced during the v3.14 port,
  were fixed by Mike Galbraith.
- Mike Galbraith sent a patch which might fix lazy preempt on x86_64.
  The patch is applied, but my machine still explodes, so lazy preempt
  remains disabled on x86_64.
- Mike Galbraith sent a few patches to get CPU hotplug to work. This
  includes lg_global_trylock_relax() (a sketch of the trylock-and-relax
  pattern follows this list).
- A few push-downs of migrate_disable() (where migrate_disable() is
  called after rt_mutex_trylock()) have been reverted; hotplug is not
  too happy about them. A patch by Steven Rostedt and Mike Galbraith
  (a sketch of the restored ordering also follows this list).
- There was a complaint about a backtrace from run_local_timers() in
  UP mode because a spin_do_trylock() failed. _This_ particular case
  was not an error: the optimization is for FULL_NO_HZ, which is
  pointless on UP because there is no spare CPU. The optimization is
  therefore disabled in UP mode and the backtrace is gone. Reported by
  Stanislav Meduna.
- The blk-mq notifier now uses a spinlock and runs at CPU_POST_DEAD
  instead of CPU_DEAD time. lockdep complained about taking the
  sleeping ctx->lock inside the raw lock (blk_mq_cpu_notify_lock), and
  CPU_DEAD runs with irqs off.
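
Since lg_global_trylock_relax() may be new to some readers, here is a
minimal userspace C sketch of the idea: keep retrying a trylock and
relax the CPU between attempts instead of blocking, for callers that
must not sleep (the stop-machine path running from an inactive CPU).
The lock type and helper names are illustrative stand-ins, not the
kernel's lglock API.

#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative stand-in for a spinning lock; not the kernel's lglock. */
struct tinylock { atomic_flag taken; };

static bool tiny_trylock(struct tinylock *l)
{
	/* Succeeds only if the flag was clear before we set it. */
	return !atomic_flag_test_and_set_explicit(&l->taken,
						  memory_order_acquire);
}

static void tiny_unlock(struct tinylock *l)
{
	atomic_flag_clear_explicit(&l->taken, memory_order_release);
}

/*
 * Busy-wait until the trylock succeeds; the caller never blocks. The
 * loop body is where the kernel would place cpu_relax().
 */
static void tiny_trylock_relax(struct tinylock *l)
{
	while (!tiny_trylock(l))
		;	/* cpu_relax() / a pause instruction goes here */
}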
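
Here is also a minimal sketch, using userspace no-op stubs, of the
trylock ordering the revert goes back to: migration is disabled before
the trylock is attempted and re-enabled if the trylock fails, instead
of being disabled only after a successful trylock. The stub_* names
below only mark the call sites; in the kernel these are
migrate_disable()/migrate_enable() and rt_mutex_trylock().

#include <stdbool.h>

/* Hypothetical no-op stand-ins for the kernel primitives. */
static void stub_migrate_disable(void) { }
static void stub_migrate_enable(void) { }
static bool stub_rt_mutex_trylock(void *lock) { (void)lock; return true; }

/* Post-revert pattern (cf. rt_write_trylock() in the delta below):
 * pin the task first, undo the pin if the trylock fails. */
static bool write_trylock_sketch(void *lock)
{
	bool locked;

	stub_migrate_disable();		/* pin before trying the lock */
	locked = stub_rt_mutex_trylock(lock);
	if (!locked)
		stub_migrate_enable();	/* trylock failed: undo the pin */
	return locked;
}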

Known issues:

      - bcache is disabled.

      - lazy preempt on x86_64 leads to a crash with some load.

      - CPU hotplug works in general. However, Steven's test script
        usually deadlocks on the second invocation.

The delta patch against v3.14.2-rt2 is appended below and can be found
here:
   https://www.kernel.org/pub/linux/kernel/projects/rt/3.14/incr/patch-3.14.2-rt2-rt3.patch.xz

The RT patch against 3.14.2 can be found here:

   https://www.kernel.org/pub/linux/kernel/projects/rt/3.14/patch-3.14.2-rt3.patch.xz

The split quilt queue is available at:

   https://www.kernel.org/pub/linux/kernel/projects/rt/3.14/patches-3.14.2-rt3.tar.xz

Sebastian

diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 752fe56..1e649c4 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -94,7 +94,11 @@ static __always_inline bool __preempt_count_dec_and_test(void)
 {
 	if (____preempt_count_dec_and_test())
 		return true;
+#ifdef CONFIG_PREEMPT_LAZY
 	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
+	return false;
+#endif
 }
 
 /*
@@ -102,8 +106,12 @@ static __always_inline bool __preempt_count_dec_and_test(void)
  */
 static __always_inline bool should_resched(void)
 {
+#ifdef CONFIG_PREEMPT_LAZY
 	return unlikely(!__this_cpu_read_4(__preempt_count) || \
 			test_thread_flag(TIF_NEED_RESCHED_LAZY));
+#else
+	return unlikely(!__this_cpu_read_4(__preempt_count));
+#endif
 }
 
 #ifdef CONFIG_PREEMPT
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 7c8b356..5701b50 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -72,4 +72,5 @@ void common(void) {
 
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+	DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
 }
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index fd2d976..6157ed6 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -365,19 +365,22 @@ ENTRY(resume_kernel)
 need_resched:
 	# preempt count == 0 + NEED_RS set?
 	cmpl $0,PER_CPU_VAR(__preempt_count)
+#ifndef CONFIG_PREEMPT_LAZY
+	jnz restore_all
+#else
 	jz test_int_off
 
 	# atleast preempt count == 0 ?
-	cmpl $_TIF_NEED_RESCHED,PER_CPU_VAR(__preempt_count)
+	cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
 	jne restore_all
 
 	cmpl $0,TI_preempt_lazy_count(%ebp)	# non-zero preempt_lazy_count ?
 	jnz restore_all
 
-	testl $_TIF_NEED_RESCHED_LAZY, %ecx
+	testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp)
 	jz restore_all
-
 test_int_off:
+#endif
 	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)	# interrupts off (exception path) ?
 	jz restore_all
 	call preempt_schedule_irq
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b650b43..d893814 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -658,8 +658,8 @@ GLOBAL(system_call_after_swapgs)
 	/* Handle reschedules */
 	/* edx:	work, edi: workmask */
 sysret_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc sysret_signal
+	testl $_TIF_NEED_RESCHED_MASK,%edx
+	jz sysret_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq_cfi %rdi
@@ -771,8 +771,8 @@ GLOBAL(int_with_check)
 	/* First do a reschedule test. */
 	/* edx:	work, edi: workmask */
 int_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc  int_very_careful
+	testl $_TIF_NEED_RESCHED_MASK,%edx
+	jz  int_very_careful
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq_cfi %rdi
@@ -1071,8 +1071,8 @@ ENTRY(native_iret)
 	/* edi: workmask, edx: work */
 retint_careful:
 	CFI_RESTORE_STATE
-	bt    $TIF_NEED_RESCHED,%edx
-	jnc   retint_signal
+	testl $_TIF_NEED_RESCHED_MASK,%edx
+	jz   retint_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq_cfi %rdi
@@ -1104,7 +1104,22 @@ ENTRY(native_iret)
 	/* rcx:	 threadinfo. interrupts off. */
 ENTRY(retint_kernel)
 	cmpl $0,PER_CPU_VAR(__preempt_count)
+#ifndef CONFIG_PREEMPT_LAZY
 	jnz  retint_restore_args
+#else
+	jz  check_int_off
+
+	# atleast preempt count == 0 ?
+	cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
+	jnz retint_restore_args
+
+	cmpl $0, TI_preempt_lazy_count(%rcx)
+	jnz retint_restore_args
+
+	bt $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
+	jnc  retint_restore_args
+check_int_off:
+#endif
 	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
 	jnc  retint_restore_args
 	call preempt_schedule_irq
@@ -1540,7 +1555,7 @@ ENTRY(paranoid_exit)
 	movq %rsp,%rdi			/* &pt_regs */
 	call sync_regs
 	movq %rax,%rsp			/* switch stack for scheduling */
-	testl $_TIF_NEED_RESCHED,%ebx
+	testl $_TIF_NEED_RESCHED_MASK,%ebx
 	jnz paranoid_schedule
 	movl %ebx,%edx			/* arg3: thread flags */
 	TRACE_IRQS_ON
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
index 136ef86..37acc3a 100644
--- a/block/blk-mq-cpu.c
+++ b/block/blk-mq-cpu.c
@@ -11,7 +11,7 @@
 #include "blk-mq.h"
 
 static LIST_HEAD(blk_mq_cpu_notify_list);
-static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
+static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
 
 static int blk_mq_main_cpu_notify(struct notifier_block *self,
 				  unsigned long action, void *hcpu)
@@ -19,12 +19,15 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
 	unsigned int cpu = (unsigned long) hcpu;
 	struct blk_mq_cpu_notifier *notify;
 
-	raw_spin_lock(&blk_mq_cpu_notify_lock);
+	if (action != CPU_POST_DEAD && action != CPU_POST_DEAD)
+		return NOTIFY_OK;
+
+	spin_lock(&blk_mq_cpu_notify_lock);
 
 	list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
 		notify->notify(notify->data, action, cpu);
 
-	raw_spin_unlock(&blk_mq_cpu_notify_lock);
+	spin_unlock(&blk_mq_cpu_notify_lock);
 	return NOTIFY_OK;
 }
 
@@ -32,16 +35,16 @@ void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
 {
 	BUG_ON(!notifier->notify);
 
-	raw_spin_lock(&blk_mq_cpu_notify_lock);
+	spin_lock(&blk_mq_cpu_notify_lock);
 	list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
-	raw_spin_unlock(&blk_mq_cpu_notify_lock);
+	spin_unlock(&blk_mq_cpu_notify_lock);
 }
 
 void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
 {
-	raw_spin_lock(&blk_mq_cpu_notify_lock);
+	spin_lock(&blk_mq_cpu_notify_lock);
 	list_del(&notifier->list);
-	raw_spin_unlock(&blk_mq_cpu_notify_lock);
+	spin_unlock(&blk_mq_cpu_notify_lock);
 }
 
 void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a5f25f9..5fb26f7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -48,9 +48,14 @@ static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
 	return __blk_mq_get_ctx(q, get_cpu_light());
 }
 
-static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
+static void __blk_mq_put_ctx(struct blk_mq_ctx *ctx)
 {
 	spin_unlock(&ctx->cpu_lock);
+}
+
+static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
+{
+	__blk_mq_put_ctx(ctx);
 	put_cpu_light();
 }
 
@@ -966,7 +971,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	struct blk_mq_ctx *ctx;
 	LIST_HEAD(tmp);
 
-	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
+	if (action != CPU_POST_DEAD && action != CPU_POST_DEAD)
 		return;
 
 	/*
@@ -980,6 +985,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 		clear_bit(ctx->index_hw, hctx->ctx_map);
 	}
 	spin_unlock(&ctx->lock);
+	__blk_mq_put_ctx(ctx);
 
 	if (list_empty(&tmp))
 		return;
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 2b2204e..534b16e 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -74,4 +74,10 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu);
 void lg_global_lock(struct lglock *lg);
 void lg_global_unlock(struct lglock *lg);
 
+#ifndef CONFIG_PREEMPT_RT_FULL
+#define lg_global_trylock_relax(name)	lg_global_lock(name)
+#else
+void lg_global_trylock_relax(struct lglock *lg);
+#endif
+
 #endif
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 116af6a..5b2cdf4 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -126,8 +126,7 @@ do { \
 #define preempt_enable_notrace() \
 do { \
 	barrier(); \
-	if (unlikely(__preempt_count_dec_and_test() || \
-				test_thread_flag(TIF_NEED_RESCHED_LAZY))) \
+	if (unlikely(__preempt_count_dec_and_test())) \
 		__preempt_schedule_context(); \
 } while (0)
 #else
diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
index 924c2d2..0065b08 100644
--- a/include/linux/rwsem_rt.h
+++ b/include/linux/rwsem_rt.h
@@ -20,7 +20,6 @@
 
 struct rw_semaphore {
 	struct rt_mutex		lock;
-	int			read_depth;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
 #endif
diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
index ac6f08b..c0d1367 100644
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -35,6 +35,7 @@ extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
  */
 extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
 extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
+extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
 
 #define spin_lock(lock)				\
 	do {					\
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 041fada..ce00329 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -649,7 +649,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		smpboot_unpark_threads(cpu);
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
-		goto out_cancel;
+		goto out_release;
 	}
 	BUG_ON(cpu_online(cpu));
 
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index f2356df..9397974 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -105,3 +105,28 @@ void lg_global_unlock(struct lglock *lg)
 	preempt_enable_nort();
 }
 EXPORT_SYMBOL(lg_global_unlock);
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * HACK: If you use this, you get to keep the pieces.
+ * Used in queue_stop_cpus_work() when stop machinery
+ * is called from inactive CPU, so we can't schedule.
+ */
+# define lg_do_trylock_relax(l)			\
+	do {					\
+		while (!__rt_spin_trylock(l))	\
+			cpu_relax();		\
+	} while (0)
+
+void lg_global_trylock_relax(struct lglock *lg)
+{
+	int i;
+
+	lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+	for_each_possible_cpu(i) {
+		lg_lock_ptr *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		lg_do_trylock_relax(lock);
+	}
+}
+#endif
diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
index 5d17727..055a3df 100644
--- a/kernel/locking/rt.c
+++ b/kernel/locking/rt.c
@@ -180,12 +180,14 @@ EXPORT_SYMBOL(_mutex_unlock);
  */
 int __lockfunc rt_write_trylock(rwlock_t *rwlock)
 {
-	int ret = rt_mutex_trylock(&rwlock->lock);
+	int ret;
 
-	if (ret) {
+	migrate_disable();
+	ret = rt_mutex_trylock(&rwlock->lock);
+	if (ret)
 		rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
-		migrate_disable();
-	}
+	else
+		migrate_enable();
 
 	return ret;
 }
@@ -212,11 +214,13 @@ int __lockfunc rt_read_trylock(rwlock_t *rwlock)
 	 * write locked.
 	 */
 	if (rt_mutex_owner(lock) != current) {
+		migrate_disable();
 		ret = rt_mutex_trylock(lock);
-		if (ret) {
+		if (ret)
 			rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
-			migrate_disable();
-		}
+		else
+			migrate_enable();
+
 	} else if (!rwlock->read_depth) {
 		ret = 0;
 	}
@@ -240,13 +244,14 @@ void __lockfunc rt_read_lock(rwlock_t *rwlock)
 {
 	struct rt_mutex *lock = &rwlock->lock;
 
+
 	/*
 	 * recursive read locks succeed when current owns the lock
 	 */
 	if (rt_mutex_owner(lock) != current) {
-		rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
-		__rt_spin_lock(lock);
 		migrate_disable();
+		rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+		__rt_spin_lock(lock);
 	}
 	rwlock->read_depth++;
 }
@@ -316,10 +321,8 @@ EXPORT_SYMBOL(rt_up_write);
 
 void  rt_up_read(struct rw_semaphore *rwsem)
 {
-	if (--rwsem->read_depth == 0) {
-		rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
-		rt_mutex_unlock(&rwsem->lock);
-	}
+	rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
+	rt_mutex_unlock(&rwsem->lock);
 }
 EXPORT_SYMBOL(rt_up_read);
 
@@ -330,7 +333,6 @@ EXPORT_SYMBOL(rt_up_read);
 void  rt_downgrade_write(struct rw_semaphore *rwsem)
 {
 	BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
-	rwsem->read_depth = 1;
 }
 EXPORT_SYMBOL(rt_downgrade_write);
 
@@ -367,37 +369,20 @@ void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
 
 int  rt_down_read_trylock(struct rw_semaphore *rwsem)
 {
-	struct rt_mutex *lock = &rwsem->lock;
-	int ret = 1;
-
-	/*
-	 * recursive read locks succeed when current owns the rwsem,
-	 * but not when read_depth == 0 which means that the rwsem is
-	 * write locked.
-	 */
-	if (rt_mutex_owner(lock) != current) {
-		ret = rt_mutex_trylock(&rwsem->lock);
-		if (ret)
-			rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
-	} else if (!rwsem->read_depth) {
-		ret = 0;
-	}
+	int ret;
 
+	ret = rt_mutex_trylock(&rwsem->lock);
 	if (ret)
-		rwsem->read_depth++;
+		rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
+
 	return ret;
 }
 EXPORT_SYMBOL(rt_down_read_trylock);
 
 static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
 {
-	struct rt_mutex *lock = &rwsem->lock;
-
-	if (rt_mutex_owner(lock) != current) {
-		rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
-		rt_mutex_lock(&rwsem->lock);
-	}
-	rwsem->read_depth++;
+	rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
+	rt_mutex_lock(&rwsem->lock);
 }
 
 void  rt_down_read(struct rw_semaphore *rwsem)
@@ -422,7 +407,6 @@ void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
 	debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
 	lockdep_init_map(&rwsem->dep_map, name, key, 0);
 #endif
-	rwsem->read_depth = 0;
 	rwsem->lock.save_state = 0;
 }
 EXPORT_SYMBOL(__rt_rwsem_init);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 42f4f28..5c5cc76 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1001,6 +1001,11 @@ void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
 }
 EXPORT_SYMBOL(rt_spin_unlock_wait);
 
+int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
+{
+	return rt_mutex_trylock(lock);
+}
+
 int __lockfunc rt_spin_trylock(spinlock_t *lock)
 {
 	int ret = rt_mutex_trylock(&lock->lock);
@@ -1045,12 +1050,12 @@ int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
 	/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
 	if (atomic_add_unless(atomic, -1, 1))
 		return 0;
+	migrate_disable();
 	rt_spin_lock(lock);
-	if (atomic_dec_and_test(atomic)){
-		migrate_disable();
+	if (atomic_dec_and_test(atomic))
 		return 1;
-	}
 	rt_spin_unlock(lock);
+	migrate_enable();
 	return 0;
 }
 EXPORT_SYMBOL(atomic_dec_and_spin_lock);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index aaae9f1..bcbae9c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -266,7 +266,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 	struct irq_cpu_stop_queue_work_info call_args;
 	struct multi_stop_data msdata;
 
-	preempt_disable();
+	preempt_disable_nort();
 	msdata = (struct multi_stop_data){
 		.fn = fn,
 		.data = arg,
@@ -299,7 +299,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 	 * This relies on the stopper workqueues to be FIFO.
 	 */
 	if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
-		preempt_enable();
+		preempt_enable_nort();
 		return -ENOENT;
 	}
 
@@ -313,7 +313,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 				 &irq_cpu_stop_queue_work,
 				 &call_args, 1);
 	lg_local_unlock(&stop_cpus_lock);
-	preempt_enable();
+	preempt_enable_nort();
 
 	wait_for_stop_done(&done);
 
@@ -346,7 +346,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
 
 static void queue_stop_cpus_work(const struct cpumask *cpumask,
 				 cpu_stop_fn_t fn, void *arg,
-				 struct cpu_stop_done *done)
+				 struct cpu_stop_done *done, bool inactive)
 {
 	struct cpu_stop_work *work;
 	unsigned int cpu;
@@ -360,11 +360,13 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
 	}
 
 	/*
-	 * Disable preemption while queueing to avoid getting
-	 * preempted by a stopper which might wait for other stoppers
-	 * to enter @fn which can lead to deadlock.
+	 * Make sure that all work is queued on all cpus before
+	 * any of the cpus can execute it.
 	 */
-	lg_global_lock(&stop_cpus_lock);
+	if (!inactive)
+		lg_global_lock(&stop_cpus_lock);
+	else
+		lg_global_trylock_relax(&stop_cpus_lock);
 	for_each_cpu(cpu, cpumask)
 		cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
 	lg_global_unlock(&stop_cpus_lock);
@@ -376,7 +378,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
 	struct cpu_stop_done done;
 
 	cpu_stop_init_done(&done, cpumask_weight(cpumask));
-	queue_stop_cpus_work(cpumask, fn, arg, &done);
+	queue_stop_cpus_work(cpumask, fn, arg, &done, false);
 	wait_for_stop_done(&done);
 	return done.executed ? done.ret : -ENOENT;
 }
@@ -572,6 +574,8 @@ static int __init cpu_stop_init(void)
 		INIT_LIST_HEAD(&stopper->works);
 	}
 
+	lg_lock_init(&stop_cpus_lock, "stop_cpus_lock");
+
 	BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
 	stop_machine_initialized = true;
 	return 0;
@@ -667,11 +671,11 @@ int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
 	set_state(&msdata, MULTI_STOP_PREPARE);
 	cpu_stop_init_done(&done, num_active_cpus());
 	queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
-			     &done);
+			     &done, true);
 	ret = multi_cpu_stop(&msdata);
 
 	/* Busy wait for completion. */
-	while (!atomic_read(&done.nr_todo))
+	while (atomic_read(&done.nr_todo))
 		cpu_relax();
 
 	mutex_unlock(&stop_cpus_mutex);
diff --git a/kernel/timer.c b/kernel/timer.c
index 54596b5..8750875 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1461,6 +1461,19 @@ void run_local_timers(void)
 	 * the timer softirq.
 	 */
 #ifdef CONFIG_PREEMPT_RT_FULL
+
+#ifndef CONFIG_SMP
+	/*
+	 * The spin_do_trylock() later may fail as the lock may be hold before
+	 * the interrupt arrived. The spin-lock debugging code will raise a
+	 * warning if the try_lock fails on UP. Since this is only an
+	 * optimization for the FULL_NO_HZ case (not to run the timer softirq on
+	 * an nohz_full CPU) we don't really care and shedule the softirq.
+	 */
+	raise_softirq(TIMER_SOFTIRQ);
+	return;
+#endif
+
 	/* On RT, irq work runs from softirq */
 	if (irq_work_needs_cpu()) {
 		raise_softirq(TIMER_SOFTIRQ);
diff --git a/localversion-rt b/localversion-rt
index c3054d0..1445cd6 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt2
+-rt3
--
