lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <1484596275-30412-1-git-send-email-mathieu.desnoyers@efficios.com>
Date:   Mon, 16 Jan 2017 14:51:15 -0500
From:   Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To:     "Paul E . McKenney" <paulmck@...ux.vnet.ibm.com>
Cc:     linux-kernel@...r.kernel.org,
        Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
        Josh Triplett <josh@...htriplett.org>,
        KOSAKI Motohiro <kosaki.motohiro@...fujitsu.com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Nicholas Miell <nmiell@...cast.net>,
        Ingo Molnar <mingo@...hat.com>,
        Alan Cox <gnomes@...rguk.ukuu.org.uk>,
        Lai Jiangshan <laijs@...fujitsu.com>,
        Stephen Hemminger <stephen@...workplumber.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        Peter Zijlstra <peterz@...radead.org>,
        David Howells <dhowells@...hat.com>,
        Pranith Kumar <bobby.prani@...il.com>,
        Michael Kerrisk <mtk.manpages@...il.com>,
        Shuah Khan <shuahkh@....samsung.com>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Linus Torvalds <torvalds@...ux-foundation.org>
Subject: [RFC PATCH] membarrier: handle nohz_full with expedited thread registration

Threads running on nohz_full CPUs are not considered by
synchronize_sched(), but they should be covered by the membarrier system
call when it is invoked with the MEMBARRIER_CMD_SHARED command.

Introduce two new commands to membarrier:
MEMBARRIER_CMD_REGISTER_EXPEDITED and
MEMBARRIER_CMD_UNREGISTER_EXPEDITED.

Threads running on nohz_full CPUs that need to receive interrupts to
ensure correct memory ordering, when pairing compiler barriers with the
membarrier system call, should register as "expedited" threads.

[ This RFC patch lacks documentation. I mainly want feedback to see if
  everyone is OK with the general approach. ]

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Cc: Paul E. McKenney <paulmck@...ux.vnet.ibm.com>
Cc: Josh Triplett <josh@...htriplett.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@...fujitsu.com>
Cc: Steven Rostedt <rostedt@...dmis.org>
Cc: Nicholas Miell <nmiell@...cast.net>
Cc: Ingo Molnar <mingo@...hat.com>
Cc: Alan Cox <gnomes@...rguk.ukuu.org.uk>
Cc: Lai Jiangshan <laijs@...fujitsu.com>
Cc: Stephen Hemminger <stephen@...workplumber.org>
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: David Howells <dhowells@...hat.com>
Cc: Pranith Kumar <bobby.prani@...il.com>
Cc: Michael Kerrisk <mtk.manpages@...il.com>
Cc: Shuah Khan <shuahkh@....samsung.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>
Cc: Linus Torvalds <torvalds@...ux-foundation.org>
---
 fs/exec.c                       |  1 +
 include/linux/sched.h           | 27 +++++++++++++++
 include/uapi/linux/membarrier.h |  6 ++++
 kernel/fork.c                   |  2 ++
 kernel/membarrier.c             | 77 +++++++++++++++++++++++++++++++++++++++--
 5 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index e579466..2cf1f87 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1771,6 +1771,7 @@ static int do_execveat_common(int fd, struct filename *filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
+	membarrier_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current);
 	free_bprm(bprm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad3ec9e..1242eb9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1998,6 +1998,9 @@ struct task_struct {
 	/* A live task holds one reference. */
 	atomic_t stack_refcount;
 #endif
+#ifdef CONFIG_MEMBARRIER
+	unsigned int membarrier_expedited;
+#endif
 /* CPU-specific state of this task */
 	struct thread_struct thread;
 /*
@@ -3671,4 +3674,28 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
 void cpufreq_remove_update_util_hook(int cpu);
 #endif /* CONFIG_CPU_FREQ */
 
+#ifdef CONFIG_MEMBARRIER
+static inline void membarrier_fork(struct task_struct *t,
+		unsigned long clone_flags)
+{
+	if (clone_flags & CLONE_THREAD)
+		t->membarrier_expedited = 0;
+	else
+		t->membarrier_expedited = current->membarrier_expedited;
+}
+
+static inline void membarrier_execve(struct task_struct *t)
+{
+	t->membarrier_expedited = 0;
+}
+#else
+static inline void membarrier_fork(struct task_struct *t,
+		unsigned long clone_flags)
+{
+}
+static inline void membarrier_execve(struct task_struct *t)
+{
+}
+#endif
+
 #endif
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index e0b108b..4b78f07 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -40,6 +40,10 @@
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_REGISTER_EXPEDITED:
+ *                          TODO
+ * @MEMBARRIER_CMD_UNREGISTER_EXPEDITED:
+ *                          TODO
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
@@ -48,6 +52,8 @@
 enum membarrier_cmd {
 	MEMBARRIER_CMD_QUERY = 0,
 	MEMBARRIER_CMD_SHARED = (1 << 0),
+	MEMBARRIER_CMD_REGISTER_EXPEDITED = (1 << 1),
+	MEMBARRIER_CMD_UNREGISTER_EXPEDITED = (1 << 2),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8a..cec23e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1769,6 +1769,8 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 	copy_seccomp(p);
 
+	membarrier_fork(p, clone_flags);
+
 	/*
 	 * Process group and session signals need to be delivered to just the
 	 * parent before the fork or both the parent and the child after the
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 536c727..65a6fbf 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -16,12 +16,79 @@
 
 #include <linux/syscalls.h>
 #include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+
+/*
+ * TODO: private sched.h is needed for runqueue. Should we move the
+ * sched code under kernel/sched/ ?
+ */
+#include "sched/sched.h"
 
 /*
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  * except MEMBARRIER_CMD_QUERY.
  */
-#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
+#define MEMBARRIER_CMD_BITMASK	\
+	(MEMBARRIER_CMD_SHARED \
+	| MEMBARRIER_CMD_REGISTER_EXPEDITED \
+	| MEMBARRIER_CMD_UNREGISTER_EXPEDITED)
+
+static int membarrier_register_expedited(struct task_struct *t)
+{
+	struct rq *rq;
+
+	if (t->membarrier_expedited == UINT_MAX)
+		return -EOVERFLOW;
+	rq = this_rq();
+	raw_spin_lock(&rq->lock);
+	t->membarrier_expedited++;
+	raw_spin_unlock(&rq->lock);
+	return 0;
+}
+
+static int membarrier_unregister_expedited(struct task_struct *t)
+{
+	struct rq *rq;
+
+	if (!t->membarrier_expedited)
+		return -ENOENT;
+	rq = this_rq();
+	raw_spin_lock(&rq->lock);
+	t->membarrier_expedited--;
+	raw_spin_unlock(&rq->lock);
+	return 0;
+}
+
+static void memory_barrier(void *info)
+{
+	smp_mb();
+}
+
+static void membarrier_nohz_full_expedited(void)
+{
+	int cpu;
+
+	if (!tick_nohz_full_enabled())
+		return;
+	for_each_cpu(cpu, tick_nohz_full_mask) {
+		struct rq *rq;
+		struct task_struct *t;
+
+		rq = cpu_rq(cpu);
+		raw_spin_lock(&rq->lock);
+		t = rq->curr;
+		if (t->membarrier_expedited) {
+			int ret;
+
+			ret = smp_call_function_single(cpu, memory_barrier,
+					NULL, 1);
+			WARN_ON_ONCE(ret);
+		}
+		raw_spin_unlock(&rq->lock);
+	}
+}
 
 /**
  * sys_membarrier - issue memory barriers on a set of threads
@@ -57,9 +124,15 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 	case MEMBARRIER_CMD_QUERY:
 		return MEMBARRIER_CMD_BITMASK;
 	case MEMBARRIER_CMD_SHARED:
-		if (num_online_cpus() > 1)
+		if (num_online_cpus() > 1) {
 			synchronize_sched();
+			membarrier_nohz_full_expedited();
+		}
 		return 0;
+	case MEMBARRIER_CMD_REGISTER_EXPEDITED:
+		return membarrier_register_expedited(current);
+	case MEMBARRIER_CMD_UNREGISTER_EXPEDITED:
+		return membarrier_unregister_expedited(current);
 	default:
 		return -EINVAL;
 	}
-- 
2.1.4

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ