Message-ID: <20160215170618.GL6375@twins.programming.kicks-ass.net>
Date: Mon, 15 Feb 2016 18:06:18 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Joonas Lahtinen <joonas.lahtinen@...ux.intel.com>
Cc: Intel graphics driver community testing & development
<intel-gfx@...ts.freedesktop.org>,
Linux kernel development <linux-kernel@...r.kernel.org>,
Ingo Molnar <mingo@...nel.org>,
David Hildenbrand <dahi@...ux.vnet.ibm.com>,
"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
"Gautham R. Shenoy" <ego@...ux.vnet.ibm.com>,
Chris Wilson <chris@...is-wilson.co.uk>
Subject: Re: [PATCH] [RFC] kernel/cpu: Use lockref for online CPU reference counting

On Mon, Feb 15, 2016 at 03:17:55PM +0100, Peter Zijlstra wrote:
> On Mon, Feb 15, 2016 at 02:36:43PM +0200, Joonas Lahtinen wrote:
> > Instead of implementing a custom locked reference counting, use lockref.
> >
> > Current implementation leads to a deadlock splat on Intel SKL platforms
> > when lockdep debugging is enabled.
> >
> > This is due to a few of the CPUfreq drivers (including Intel P-state) doing the
> > following: policy->rwsem is locked during driver initialization, and the functions
> > called during init that actually apply CPU limits use get_online_cpus() (because
> > they have other calling paths too), which briefly takes cpu_hotplug.lock to
> > increase cpu_hotplug.refcount.
> >
> > On a later calling path, during suspend, cpu_hotplug_begin() is called in
> > disable_nonboot_cpus(), and the CPUfreq callbacks invoked after it take
> > policy->rwsem while cpu_hotplug.lock is still held by cpu_hotplug_begin(),
> > so we have a potential (if very unlikely) deadlock scenario, reported by
> > our CI system. See the Bugzilla link for more details.
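
To make the inversion concrete, here is a toy userspace model of the
two paths (pthread mutexes standing in for policy->rwsem and
cpu_hotplug.lock; a sketch of the lock ordering only, not the actual
kernel call chains):

#include <pthread.h>

static pthread_mutex_t policy_rwsem = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

static void *cpufreq_init_path(void *arg)
{
	pthread_mutex_lock(&policy_rwsem);	/* driver init locks policy->rwsem */
	pthread_mutex_lock(&hotplug_lock);	/* get_online_cpus() */
	pthread_mutex_unlock(&hotplug_lock);
	pthread_mutex_unlock(&policy_rwsem);
	return NULL;
}

static void *suspend_path(void *arg)
{
	pthread_mutex_lock(&hotplug_lock);	/* cpu_hotplug_begin() */
	pthread_mutex_lock(&policy_rwsem);	/* cpufreq callback */
	pthread_mutex_unlock(&policy_rwsem);
	pthread_mutex_unlock(&hotplug_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, cpufreq_init_path, NULL);
	pthread_create(&b, NULL, suspend_path, NULL);
	pthread_join(a, NULL);	/* may never return: classic AB-BA deadlock */
	pthread_join(b, NULL);
	return 0;
}
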
>
> I've been meaning to change the thing into a percpu-rwsem; I just
> haven't had time to look into the lockdep splat that conversion generated.
The below has plenty of lockdep issues: because percpu-rwsem is
reader-writer fair (like the regular rwsem), it throws up a fair
number of very icky ordering problems.
If at all possible, I'd really rather fix those and have a 'saner'
hotplug lock than muddle on with the open-coded horror lock we have
now.

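The per-task counter in the patch below is what allows read-side
recursion on top of a fair rwsem. A userspace sketch of just that
trick (pthread_rwlock_t standing in for the percpu-rwsem, a __thread
counter for the new task_struct::cpuhp_ref; the toy_* names are mine,
not kernel code):

#include <pthread.h>

static pthread_rwlock_t hotplug = PTHREAD_RWLOCK_INITIALIZER;
static __thread int cpuhp_ref;	/* models p->cpuhp_ref */

static void toy_get_online_cpus(void)
{
	if (cpuhp_ref++)	/* read recursion: lock already held */
		return;
	pthread_rwlock_rdlock(&hotplug);
}

static void toy_put_online_cpus(void)
{
	if (--cpuhp_ref)	/* still nested */
		return;
	pthread_rwlock_unlock(&hotplug);
}

Because a nested toy_get_online_cpus() never touches the rwlock, a
writer that queued in the meantime cannot deadlock a reader against
itself.
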
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -225,6 +225,8 @@ extern struct bus_type cpu_subsys;
#ifdef CONFIG_HOTPLUG_CPU
/* Stop CPUs going up and down. */
+extern void cpu_hotplug_init_task(struct task_struct *p);
+
extern void cpu_hotplug_begin(void);
extern void cpu_hotplug_done(void);
extern void get_online_cpus(void);
@@ -242,6 +244,8 @@ int cpu_down(unsigned int cpu);
#else /* CONFIG_HOTPLUG_CPU */
+static inline void cpu_hotplug_init_task(struct task_struct *p) {}
+
static inline void cpu_hotplug_begin(void) {}
static inline void cpu_hotplug_done(void) {}
#define get_online_cpus() do { } while (0)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -16,6 +16,15 @@ struct percpu_rw_semaphore {
wait_queue_head_t write_waitq;
};
+#define DEFINE_STATIC_PERCPU_RWSEM(name) \
+static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_frc_##name); \
+static struct percpu_rw_semaphore name = { \
+ .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \
+ .fast_read_ctr = &__percpu_rwsem_frc_##name, \
+ .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
+ .write_waitq = __WAIT_QUEUE_HEAD_INITIALIZER(name.write_waitq), \
+}
+
extern void percpu_down_read(struct percpu_rw_semaphore *);
extern int percpu_down_read_trylock(struct percpu_rw_semaphore *);
extern void percpu_up_read(struct percpu_rw_semaphore *);
@@ -33,9 +42,11 @@ extern void percpu_free_rwsem(struct per
__percpu_init_rwsem(brw, #brw, &rwsem_key); \
})
-
#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
+#define percpu_rwsem_assert_held(sem) \
+ lockdep_assert_held(&(sem)->rw_sem)
+
static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
bool read, unsigned long ip)
{
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1403,6 +1403,9 @@ struct task_struct {
struct task_struct *last_wakee;
int wake_cpu;
+#ifdef CONFIG_HOTPLUG_CPU
+ int cpuhp_ref;
+#endif
#endif
int on_rq;
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -23,6 +23,7 @@
#include <linux/tick.h>
#include <linux/irq.h>
#include <trace/events/power.h>
+#include <linux/percpu-rwsem.h>
#include "smpboot.h"
@@ -51,121 +52,52 @@ EXPORT_SYMBOL(cpu_notifier_register_done
static RAW_NOTIFIER_HEAD(cpu_chain);
-/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+/*
+ * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
#ifdef CONFIG_HOTPLUG_CPU
-static struct {
- struct task_struct *active_writer;
- /* wait queue to wake up the active_writer */
- wait_queue_head_t wq;
- /* verifies that no writer will get active while readers are active */
- struct mutex lock;
- /*
- * Also blocks the new readers during
- * an ongoing cpu hotplug operation.
- */
- atomic_t refcount;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-} cpu_hotplug = {
- .active_writer = NULL,
- .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
- .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = {.name = "cpu_hotplug.lock" },
-#endif
-};
-
-/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
-#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire_tryread() \
- lock_map_acquire_tryread(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
-#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+DEFINE_STATIC_PERCPU_RWSEM(hotplug);
+void cpu_hotplug_init_task(struct task_struct *p)
+{
+ if (WARN_ON_ONCE(p->cpuhp_ref))
+ p->cpuhp_ref = 0;
+}
void get_online_cpus(void)
{
might_sleep();
- if (cpu_hotplug.active_writer == current)
+
+ if (current->cpuhp_ref++) /* read recursion */
return;
- cpuhp_lock_acquire_read();
- mutex_lock(&cpu_hotplug.lock);
- atomic_inc(&cpu_hotplug.refcount);
- mutex_unlock(&cpu_hotplug.lock);
+
+ percpu_down_read(&hotplug);
}
EXPORT_SYMBOL_GPL(get_online_cpus);
void put_online_cpus(void)
{
- int refcount;
-
- if (cpu_hotplug.active_writer == current)
+ if (--current->cpuhp_ref)
return;
- refcount = atomic_dec_return(&cpu_hotplug.refcount);
- if (WARN_ON(refcount < 0)) /* try to fix things up */
- atomic_inc(&cpu_hotplug.refcount);
-
- if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
- wake_up(&cpu_hotplug.wq);
-
- cpuhp_lock_release();
-
+ percpu_up_read(&hotplug);
}
EXPORT_SYMBOL_GPL(put_online_cpus);
-/*
- * This ensures that the hotplug operation can begin only when the
- * refcount goes to zero.
- *
- * Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
- *
- * Since cpu_hotplug_begin() is always called after invoking
- * cpu_maps_update_begin(), we can be sure that only one writer is active.
- *
- * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
- * writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- * non zero and goes to sleep again.
- *
- * However, this is very difficult to achieve in practice since
- * get_online_cpus() not an api which is called all that often.
- *
- */
void cpu_hotplug_begin(void)
{
- DEFINE_WAIT(wait);
-
- cpu_hotplug.active_writer = current;
- cpuhp_lock_acquire();
-
- for (;;) {
- mutex_lock(&cpu_hotplug.lock);
- prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
- if (likely(!atomic_read(&cpu_hotplug.refcount)))
- break;
- mutex_unlock(&cpu_hotplug.lock);
- schedule();
- }
- finish_wait(&cpu_hotplug.wq, &wait);
+ percpu_down_write(&hotplug);
+ current->cpuhp_ref++; /* allow read-in-write recursion */
}
void cpu_hotplug_done(void)
{
- cpu_hotplug.active_writer = NULL;
- mutex_unlock(&cpu_hotplug.lock);
- cpuhp_lock_release();
+ current->cpuhp_ref--;
+ percpu_up_write(&hotplug);
}
/*
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1414,6 +1414,8 @@ static struct task_struct *copy_process(
p->sequential_io_avg = 0;
#endif
+ cpu_hotplug_init_task(p);
+
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -53,6 +53,11 @@ config GENERIC_IO
config STMP_DEVICE
bool
+config PERCPU_RWSEM_HOTPLUG
+ def_bool y
+ depends on HOTPLUG_CPU
+ select PERCPU_RWSEM
+
config ARCH_USE_CMPXCHG_LOCKREF
bool
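
For completeness, the nesting the new scheme is meant to permit, i.e.
a hotplug writer calling into paths that also take the read side
(annotations mine, following the patch above):

	cpu_hotplug_begin();	/* percpu_down_write(), cpuhp_ref -> 1 */

	/* a callback that also has non-hotplug callers */
	get_online_cpus();	/* cpuhp_ref 1 -> 2, no lock operation */
	put_online_cpus();	/* cpuhp_ref 2 -> 1 */

	cpu_hotplug_done();	/* cpuhp_ref -> 0, percpu_up_write() */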