linux-kernel - Re: [PATCH for 4.14] membarrier: Provide register expedited private command

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <20171019174846.GZ3521@linux.vnet.ibm.com>
Date:   Thu, 19 Oct 2017 10:48:46 -0700
From:   "Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>
To:     Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Cc:     Linus Torvalds <torvalds@...ux-foundation.org>,
        linux-kernel@...r.kernel.org,
        Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Alexander Viro <viro@...iv.linux.org.uk>
Subject: Re: [PATCH for 4.14] membarrier: Provide register expedited private
 command

On Thu, Oct 19, 2017 at 01:30:15PM -0400, Mathieu Desnoyers wrote:
> [ This patch is sent directly to Linus, because it needs to be merged
>   before the end of the 4.14 rc cycle. It introduces a "register private
>   expedited" membarrier command which allows eventual removal of
>   important memory barrier constraints on the scheduler fast-paths. It
>   changes how the "private expedited" membarrier command (new to 4.14)
>   is used from user-space. Sorry to send this late in the cycle. ]
> 
> Provide a command allowing processes to register their intent to use
> the private expedited command. This affects how the expedited private
> command introduced in 4.14-rc is meant to be used, and should be merged
> before 4.14 final.
> 
> Processes are now required to register before using
> MEMBARRIER_CMD_PRIVATE_EXPEDITED, otherwise that command returns EPERM.
> 
> This fixes a problem that arose when designing requested extensions to
> sys_membarrier() to allow JITs to efficiently flush old code from
> instruction caches.  Several potential algorithms are much less painful
> if the user registers intent to use this functionality early on, for
> example, before the process spawns the second thread.  Registering at
> this time removes the need to interrupt each and every thread in that
> process at the first expedited sys_membarrier() system call.
> 
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
> CC: Paul E. McKenney <paulmck@...ux.vnet.ibm.com>

This looks much less intrusive than the earlier series!

Acked-by: Paul E. McKenney <paulmck@...ux.vnet.ibm.com>

> CC: Peter Zijlstra <peterz@...radead.org>
> CC: Ingo Molnar <mingo@...hat.com>
> CC: Alexander Viro <viro@...iv.linux.org.uk>
> CC: Linus Torvalds <torvalds@...ux-foundation.org>
> ---
>  fs/exec.c                       |  1 +
>  include/linux/mm_types.h        |  3 +++
>  include/linux/sched/mm.h        | 16 ++++++++++++++++
>  include/uapi/linux/membarrier.h | 23 ++++++++++++++++-------
>  kernel/sched/membarrier.c       | 34 ++++++++++++++++++++++++++++++----
>  5 files changed, 66 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/exec.c b/fs/exec.c
> index 5470d3c1892a..3e14ba25f678 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1802,6 +1802,7 @@ static int do_execveat_common(int fd, struct filename *filename,
>  	/* execve succeeded */
>  	current->fs->in_exec = 0;
>  	current->in_execve = 0;
> +	membarrier_execve(current);
>  	acct_update_integrals(current);
>  	task_numa_free(current);
>  	free_bprm(bprm);
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 46f4ecf5479a..1861ea8dba77 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -445,6 +445,9 @@ struct mm_struct {
>  	unsigned long flags; /* Must use atomic bitops to access the bits */
> 
>  	struct core_state *core_state; /* coredumping support */
> +#ifdef CONFIG_MEMBARRIER
> +	atomic_t membarrier_state;
> +#endif
>  #ifdef CONFIG_AIO
>  	spinlock_t			ioctx_lock;
>  	struct kioctx_table __rcu	*ioctx_table;
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index ae53e413fb13..ab9bf7b73954 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -211,4 +211,20 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
>  	current->flags = (current->flags & ~PF_MEMALLOC) | flags;
>  }
> 
> +#ifdef CONFIG_MEMBARRIER
> +enum {
> +	MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY	= (1U << 0),
> +	MEMBARRIER_STATE_SWITCH_MM			= (1U << 1),
> +};
> +
> +static inline void membarrier_execve(struct task_struct *t)
> +{
> +	atomic_set(&t->mm->membarrier_state, 0);
> +}
> +#else
> +static inline void membarrier_execve(struct task_struct *t)
> +{
> +}
> +#endif
> +
>  #endif /* _LINUX_SCHED_MM_H */
> diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
> index 6d47b3249d8a..4e01ad7ffe98 100644
> --- a/include/uapi/linux/membarrier.h
> +++ b/include/uapi/linux/membarrier.h
> @@ -52,21 +52,30 @@
>   *                          (non-running threads are de facto in such a
>   *                          state). This only covers threads from the
>   *                          same processes as the caller thread. This
> - *                          command returns 0. The "expedited" commands
> - *                          complete faster than the non-expedited ones,
> - *                          they never block, but have the downside of
> - *                          causing extra overhead.
> + *                          command returns 0 on success. The
> + *                          "expedited" commands complete faster than
> + *                          the non-expedited ones, they never block,
> + *                          but have the downside of causing extra
> + *                          overhead. A process needs to register its
> + *                          intent to use the private expedited command
> + *                          prior to using it, otherwise this command
> + *                          returns -EPERM.
> + * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
> + *                          Register the process intent to use
> + *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
> + *                          returns 0.
>   *
>   * Command to be passed to the membarrier system call. The commands need to
>   * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
>   * the value 0.
>   */
>  enum membarrier_cmd {
> -	MEMBARRIER_CMD_QUERY			= 0,
> -	MEMBARRIER_CMD_SHARED			= (1 << 0),
> +	MEMBARRIER_CMD_QUERY				= 0,
> +	MEMBARRIER_CMD_SHARED				= (1 << 0),
>  	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
>  	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
> -	MEMBARRIER_CMD_PRIVATE_EXPEDITED	= (1 << 3),
> +	MEMBARRIER_CMD_PRIVATE_EXPEDITED		= (1 << 3),
> +	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED	= (1 << 4),
>  };
> 
>  #endif /* _UAPI_LINUX_MEMBARRIER_H */
> diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
> index a92fddc22747..dd7908743dab 100644
> --- a/kernel/sched/membarrier.c
> +++ b/kernel/sched/membarrier.c
> @@ -18,6 +18,7 @@
>  #include <linux/membarrier.h>
>  #include <linux/tick.h>
>  #include <linux/cpumask.h>
> +#include <linux/atomic.h>
> 
>  #include "sched.h"	/* for cpu_rq(). */
> 
> @@ -26,21 +27,26 @@
>   * except MEMBARRIER_CMD_QUERY.
>   */
>  #define MEMBARRIER_CMD_BITMASK	\
> -	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
> +	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
> +	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
> 
>  static void ipi_mb(void *info)
>  {
>  	smp_mb();	/* IPIs should be serializing but paranoid. */
>  }
> 
> -static void membarrier_private_expedited(void)
> +static int membarrier_private_expedited(void)
>  {
>  	int cpu;
>  	bool fallback = false;
>  	cpumask_var_t tmpmask;
> 
> +	if (!(atomic_read(&current->mm->membarrier_state)
> +			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
> +		return -EPERM;
> +
>  	if (num_online_cpus() == 1)
> -		return;
> +		return 0;
> 
>  	/*
>  	 * Matches memory barriers around rq->curr modification in
> @@ -94,6 +100,24 @@ static void membarrier_private_expedited(void)
>  	 * rq->curr modification in scheduler.
>  	 */
>  	smp_mb();	/* exit from system call is not a mb */
> +	return 0;
> +}
> +
> +static void membarrier_register_private_expedited(void)
> +{
> +	struct task_struct *p = current;
> +	struct mm_struct *mm = p->mm;
> +
> +	/*
> +	 * We need to consider threads belonging to different thread
> +	 * groups, which use the same mm. (CLONE_VM but not
> +	 * CLONE_THREAD).
> +	 */
> +	if (atomic_read(&mm->membarrier_state)
> +			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
> +		return;
> +	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
> +			&mm->membarrier_state);
>  }
> 
>  /**
> @@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
>  			synchronize_sched();
>  		return 0;
>  	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
> -		membarrier_private_expedited();
> +		return membarrier_private_expedited();
> +	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
> +		membarrier_register_private_expedited();
>  		return 0;
>  	default:
>  		return -EINVAL;
> -- 
> 2.11.0
>