Message-Id: <20190906031300.1647-5-mathieu.desnoyers@efficios.com>
Date:   Thu,  5 Sep 2019 23:13:00 -0400
From:   Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To:     Peter Zijlstra <peterz@...radead.org>,
        "Paul E. McKenney" <paulmck@...ux.ibm.com>
Cc:     Ingo Molnar <mingo@...nel.org>, linux-kernel@...r.kernel.org,
        Oleg Nesterov <oleg@...hat.com>,
        "Eric W. Biederman" <ebiederm@...ssion.com>,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Russell King - ARM Linux admin <linux@...linux.org.uk>,
        Chris Metcalf <cmetcalf@...hip.com>,
        Christoph Lameter <cl@...ux.com>,
        Kirill Tkhai <tkhai@...dex.ru>, Mike Galbraith <efault@....de>,
        Thomas Gleixner <tglx@...utronix.de>,
        Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Subject: [RFC PATCH 4/4] Fix: sched/membarrier: p->mm->membarrier_state racy load

The membarrier_state field is located within the mm_struct, which
is not guaranteed to still exist when accessed from the membarrier
system call's runqueue-lock-free iteration over runqueues.
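
For context, the racy load being fixed is the pre-patch pattern removed
in the membarrier_global_expedited() hunk below (simplified here, with
the tmpmask handling elided): a remote runqueue's current task and its
mm are dereferenced without holding the runqueue lock and without
taking a reference on the mm, so the mm_struct may be freed
concurrently:

	rcu_read_lock();
	p = task_rcu_dereference(&cpu_rq(cpu)->curr);
	if (p && p->mm &&	/* racy: p->mm may already be freed */
	    (atomic_read(&p->mm->membarrier_state) &
	     MEMBARRIER_STATE_GLOBAL_EXPEDITED))
		smp_call_function_single(cpu, ipi_mb, NULL, 1);
	rcu_read_unlock();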

Copy the membarrier_state from the mm_struct into the scheduler
runqueue during prepare task switch, whenever the scheduler switches
between mms.

When registering membarrier for an mm, after setting the registration
bit in the mm membarrier state, issue a synchronize_rcu() to ensure the
scheduler observes the change. To handle the case where a runqueue
keeps running the target mm without switching to another mm, iterate
over each runqueue and issue an IPI to copy the membarrier_state from
the mm_struct into each runqueue whose current task uses the mm whose
state has just been modified.
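
For reference, not part of this patch: the user-space side of the
registration and expedited-command flow this code serves can be
exercised roughly as follows. The membarrier() helper below is a local
convenience around the raw system call (glibc provides no wrapper);
the MEMBARRIER_CMD_* constants come from <linux/membarrier.h>.

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdio.h>

	static int membarrier(int cmd, int flags)
	{
		return syscall(__NR_membarrier, cmd, flags);
	}

	int main(void)
	{
		/*
		 * One-time registration: sets the private-expedited bit in
		 * mm->membarrier_state and, with this patch, propagates it
		 * to every runqueue currently running this mm.
		 */
		if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
			perror("membarrier register");
		/*
		 * From any thread of the process: IPI all CPUs currently
		 * running a thread of this mm so that they issue a full
		 * memory barrier.
		 */
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
			perror("membarrier expedited");
		return 0;
	}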

Move the mm membarrier_state field closer to pgd in mm_struct so the
scheduler's prepare task switch hits a cache line that is already
touched by switch_mm().

The membarrier_execve() hook now needs to clear the runqueue's
membarrier state in addition to clearing the mm membarrier state, so
move its implementation into the scheduler membarrier code so it can
access the runqueue structure.

As suggested by Linus, move all membarrier.c RCU read-side locks
outside of the for-each-cpu loops. It turns out that
membarrier_global_expedited() no longer needs an RCU read-side critical
section at all, because it no longer uses task_rcu_dereference() and
relies instead on the runqueue's membarrier_state.

Suggested-by: Linus Torvalds <torvalds@...ux-foundation.org>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Cc: "Paul E. McKenney" <paulmck@...ux.ibm.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Oleg Nesterov <oleg@...hat.com>
Cc: "Eric W. Biederman" <ebiederm@...ssion.com>
Cc: Linus Torvalds <torvalds@...ux-foundation.org>
Cc: Russell King - ARM Linux admin <linux@...linux.org.uk>
Cc: Chris Metcalf <cmetcalf@...hip.com>
Cc: Christoph Lameter <cl@...ux.com>
Cc: Kirill Tkhai <tkhai@...dex.ru>
Cc: Mike Galbraith <efault@....de>
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: Ingo Molnar <mingo@...nel.org>
---
 include/linux/mm_types.h  |   7 +-
 include/linux/sched/mm.h  |   6 +-
 kernel/sched/core.c       |   1 +
 kernel/sched/membarrier.c | 141 +++++++++++++++++++++++++++-----------
 kernel/sched/sched.h      |  33 +++++++++
 5 files changed, 141 insertions(+), 47 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6a7a1083b6fb..7020572eb605 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -382,6 +382,9 @@ struct mm_struct {
 		unsigned long task_size;	/* size of task vm space */
 		unsigned long highest_vm_end;	/* highest vma end address */
 		pgd_t * pgd;
+#ifdef CONFIG_MEMBARRIER
+		atomic_t membarrier_state;
+#endif
 
 		/**
 		 * @mm_users: The number of users including userspace.
@@ -452,9 +455,7 @@ struct mm_struct {
 		unsigned long flags; /* Must use atomic bitops to access */
 
 		struct core_state *core_state; /* coredumping support */
-#ifdef CONFIG_MEMBARRIER
-		atomic_t membarrier_state;
-#endif
+
 #ifdef CONFIG_AIO
 		spinlock_t			ioctx_lock;
 		struct kioctx_table __rcu	*ioctx_table;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 8557ec664213..7070dbef8066 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -370,10 +370,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 	sync_core_before_usermode();
 }
 
-static inline void membarrier_execve(struct task_struct *t)
-{
-	atomic_set(&t->mm->membarrier_state, 0);
-}
+extern void membarrier_execve(struct task_struct *t);
+
 #else
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 010d578118d6..1cffc1aa403c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3038,6 +3038,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 	perf_event_task_sched_out(prev, next);
 	rseq_preempt(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
+	membarrier_prepare_task_switch(rq, prev, next);
 	prepare_task(next);
 	prepare_arch_switch(next);
 }
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 7e0a0d6535f3..5744c300d29e 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -30,6 +30,28 @@ static void ipi_mb(void *info)
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_sync_rq_state(void *info)
+{
+	struct mm_struct *mm = (struct mm_struct *) info;
+
+	if (current->mm != mm)
+		return;
+	WRITE_ONCE(this_rq()->membarrier_state,
+		   atomic_read(&mm->membarrier_state));
+	/*
+	 * Issue a memory barrier after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED
+	 * in the current runqueue to guarantee that no memory access following
+	 * registration is reordered before registration.
+	 */
+	smp_mb();
+}
+
+void membarrier_execve(struct task_struct *t)
+{
+	atomic_set(&t->mm->membarrier_state, 0);
+	WRITE_ONCE(this_rq()->membarrier_state, 0);
+}
+
 static int membarrier_global_expedited(void)
 {
 	int cpu;
@@ -57,8 +79,6 @@ static int membarrier_global_expedited(void)
 
 	cpus_read_lock();
 	for_each_online_cpu(cpu) {
-		struct task_struct *p;
-
 		/*
 		 * Skipping the current CPU is OK even through we can be
 		 * migrated at any point. The current CPU, at the point
@@ -70,16 +90,13 @@ static int membarrier_global_expedited(void)
 		if (cpu == raw_smp_processor_id())
 			continue;
 
-		rcu_read_lock();
-		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
-		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
-				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+		if (READ_ONCE(cpu_rq(cpu)->membarrier_state) &
+		    MEMBARRIER_STATE_GLOBAL_EXPEDITED) {
 			if (!fallback)
 				__cpumask_set_cpu(cpu, tmpmask);
 			else
 				smp_call_function_single(cpu, ipi_mb, NULL, 1);
 		}
-		rcu_read_unlock();
 	}
 	if (!fallback) {
 		preempt_disable();
@@ -136,6 +153,7 @@ static int membarrier_private_expedited(int flags)
 	}
 
 	cpus_read_lock();
+	rcu_read_lock();
 	for_each_online_cpu(cpu) {
 		struct task_struct *p;
 
@@ -149,7 +167,6 @@ static int membarrier_private_expedited(int flags)
 		 */
 		if (cpu == raw_smp_processor_id())
 			continue;
-		rcu_read_lock();
 		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
 		if (p && p->mm == current->mm) {
 			if (!fallback)
@@ -157,8 +174,8 @@ static int membarrier_private_expedited(int flags)
 			else
 				smp_call_function_single(cpu, ipi_mb, NULL, 1);
 		}
-		rcu_read_unlock();
 	}
+	rcu_read_unlock();
 	if (!fallback) {
 		preempt_disable();
 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
@@ -177,6 +194,71 @@ static int membarrier_private_expedited(int flags)
 	return 0;
 }
 
+static void sync_runqueues_membarrier_state(struct mm_struct *mm)
+{
+	int membarrier_state = atomic_read(&mm->membarrier_state);
+	bool fallback = false;
+	cpumask_var_t tmpmask;
+	int cpu;
+
+	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
+		WRITE_ONCE(this_rq()->membarrier_state, membarrier_state);
+
+		/*
+		 * For single mm user, we can simply issue a memory barrier
+		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
+		 * mm and in the current runqueue to guarantee that no memory
+		 * access following registration is reordered before
+		 * registration.
+		 */
+		smp_mb();
+		return;
+	}
+
+	/*
+	 * For mm with multiple users, we need to ensure all future
+	 * scheduler executions will observe @mm's new membarrier
+	 * state.
+	 */
+	synchronize_rcu();
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+		/* Fallback for OOM. */
+		fallback = true;
+	}
+
+	/*
+	 * For each cpu runqueue, if the task's mm matches @mm, ensure that all
+	 * @mm's membarrier state set bits are also set in the runqueue's
+	 * membarrier state. This ensures that a runqueue scheduling
+	 * between threads which are users of @mm has its membarrier state
+	 * updated.
+	 */
+	cpus_read_lock();
+	rcu_read_lock();
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct task_struct *p;
+
+		p = task_rcu_dereference(&rq->curr);
+		if (p && p->mm == mm) {
+			if (!fallback)
+				__cpumask_set_cpu(cpu, tmpmask);
+			else
+				smp_call_function_single(cpu, ipi_sync_rq_state,
+							 mm, 1);
+		}
+	}
+	rcu_read_unlock();
+	if (!fallback) {
+		preempt_disable();
+		smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
+		preempt_enable();
+		free_cpumask_var(tmpmask);
+	}
+	cpus_read_unlock();
+}
+
 static int membarrier_register_global_expedited(void)
 {
 	struct task_struct *p = current;
@@ -186,23 +268,7 @@ static int membarrier_register_global_expedited(void)
 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 		return 0;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
-	if (atomic_read(&mm->mm_users) == 1) {
-		/*
-		 * For single mm user, single threaded process, we can
-		 * simply issue a memory barrier after setting
-		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
-		 * no memory access following registration is reordered
-		 * before registration.
-		 */
-		smp_mb();
-	} else {
-		/*
-		 * For multi-mm user threads, we need to ensure all
-		 * future scheduler executions will observe the new
-		 * thread flag state for this mm.
-		 */
-		synchronize_rcu();
-	}
+	sync_runqueues_membarrier_state(mm);
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 		  &mm->membarrier_state);
 
@@ -213,12 +279,14 @@ static int membarrier_register_private_expedited(int flags)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
-	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED;
 
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
-		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+		ready_state =
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 	}
 
 	/*
@@ -226,20 +294,13 @@ static int membarrier_register_private_expedited(int flags)
 	 * groups, which use the same mm. (CLONE_VM but not
 	 * CLONE_THREAD).
 	 */
-	if ((atomic_read(&mm->membarrier_state) & state) == state)
+	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
 		return 0;
-	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
-		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
-			  &mm->membarrier_state);
-	if (atomic_read(&mm->mm_users) != 1) {
-		/*
-		 * Ensure all future scheduler executions will observe the
-		 * new thread flag state for this process.
-		 */
-		synchronize_rcu();
-	}
-	atomic_or(state, &mm->membarrier_state);
+		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+	atomic_or(set_state, &mm->membarrier_state);
+	sync_runqueues_membarrier_state(mm);
+	atomic_or(ready_state, &mm->membarrier_state);
 
 	return 0;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 802b1f3405f2..ac26baa855e8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -907,6 +907,10 @@ struct rq {
 
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_MEMBARRIER
+	int membarrier_state;
+#endif
+
 #ifdef CONFIG_SMP
 	struct root_domain		*rd;
 	struct sched_domain __rcu	*sd;
@@ -2423,3 +2427,32 @@ static inline bool sched_energy_enabled(void)
 static inline bool sched_energy_enabled(void) { return false; }
 
 #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+#ifdef CONFIG_MEMBARRIER
+/*
+ * The scheduler provides memory barriers required by membarrier between:
+ * - prior user-space memory accesses and store to rq->membarrier_state,
+ * - store to rq->membarrier_state and following user-space memory accesses.
+ * In the same way it provides those guarantees around store to rq->curr.
+ */
+static inline void membarrier_prepare_task_switch(struct rq *rq,
+						  struct task_struct *prev,
+						  struct task_struct *next)
+{
+	int membarrier_state = 0;
+	struct mm_struct *next_mm = next->mm;
+
+	if (prev->mm == next_mm)
+		return;
+	if (next_mm)
+		membarrier_state = atomic_read(&next_mm->membarrier_state);
+	if (READ_ONCE(rq->membarrier_state) != membarrier_state)
+		WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+#else
+static inline void membarrier_prepare_task_switch(struct rq *rq,
+						  struct task_struct *prev,
+						  struct task_struct *next)
+{
+}
+#endif
-- 
2.17.1
