linux-kernel - Re: [PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ZhUixk5I_n53dbBb@FVFF77S0Q05N>
Date: Tue, 9 Apr 2024 12:13:10 +0100
From: Mark Rutland <mark.rutland@....com>
To: Uros Bizjak <ubizjak@...il.com>
Cc: x86@...nel.org, linux-kernel@...r.kernel.org,
	Thomas Gleixner <tglx@...utronix.de>,
	Ingo Molnar <mingo@...nel.org>, Borislav Petkov <bp@...en8.de>,
	Dave Hansen <dave.hansen@...ux.intel.com>,
	"H. Peter Anvin" <hpa@...or.com>,
	Peter Zijlstra <peterz@...radead.org>
Subject: Re: [PATCH 2/6] locking/atomic/x86: Rewrite x86_32
 arch_atomic64_{,fetch}_{and,or,xor}() functions

On Tue, Apr 09, 2024 at 12:03:53PM +0200, Uros Bizjak wrote:
> Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions to
> use arch_atomic64_try_cmpxchg.  This implementation avoids one extra
> trip through the cmpxchg loop.
> 
> The value preload before the cmpxchg loop does not need to be atomic,
> but should use READ_ONCE to prevent compiler from merging, refetching
> or reordering the read.
> 
> The generated code improves from:
> 
>   1917d5:	31 c9                	xor    %ecx,%ecx
>   1917d7:	31 db                	xor    %ebx,%ebx
>   1917d9:	89 4c 24 3c          	mov    %ecx,0x3c(%esp)
>   1917dd:	8b 74 24 24          	mov    0x24(%esp),%esi
>   1917e1:	89 c8                	mov    %ecx,%eax
>   1917e3:	89 5c 24 34          	mov    %ebx,0x34(%esp)
>   1917e7:	8b 7c 24 28          	mov    0x28(%esp),%edi
>   1917eb:	21 ce                	and    %ecx,%esi
>   1917ed:	89 74 24 4c          	mov    %esi,0x4c(%esp)
>   1917f1:	21 df                	and    %ebx,%edi
>   1917f3:	89 de                	mov    %ebx,%esi
>   1917f5:	89 7c 24 50          	mov    %edi,0x50(%esp)
>   1917f9:	8b 54 24 4c          	mov    0x4c(%esp),%edx
>   1917fd:	8b 7c 24 2c          	mov    0x2c(%esp),%edi
>   191801:	8b 4c 24 50          	mov    0x50(%esp),%ecx
>   191805:	89 d3                	mov    %edx,%ebx
>   191807:	89 f2                	mov    %esi,%edx
>   191809:	f0 0f c7 0f          	lock cmpxchg8b (%edi)
>   19180d:	89 c1                	mov    %eax,%ecx
>   19180f:	8b 74 24 34          	mov    0x34(%esp),%esi
>   191813:	89 d3                	mov    %edx,%ebx
>   191815:	89 44 24 4c          	mov    %eax,0x4c(%esp)
>   191819:	8b 44 24 3c          	mov    0x3c(%esp),%eax
>   19181d:	89 df                	mov    %ebx,%edi
>   19181f:	89 54 24 44          	mov    %edx,0x44(%esp)
>   191823:	89 ca                	mov    %ecx,%edx
>   191825:	31 de                	xor    %ebx,%esi
>   191827:	31 c8                	xor    %ecx,%eax
>   191829:	09 f0                	or     %esi,%eax
>   19182b:	75 ac                	jne    1917d9 <...>
> 
> to:
> 
>   1912ba:	8b 06                	mov    (%esi),%eax
>   1912bc:	8b 56 04             	mov    0x4(%esi),%edx
>   1912bf:	89 44 24 3c          	mov    %eax,0x3c(%esp)
>   1912c3:	89 c1                	mov    %eax,%ecx
>   1912c5:	23 4c 24 34          	and    0x34(%esp),%ecx
>   1912c9:	89 d3                	mov    %edx,%ebx
>   1912cb:	23 5c 24 38          	and    0x38(%esp),%ebx
>   1912cf:	89 54 24 40          	mov    %edx,0x40(%esp)
>   1912d3:	89 4c 24 2c          	mov    %ecx,0x2c(%esp)
>   1912d7:	89 5c 24 30          	mov    %ebx,0x30(%esp)
>   1912db:	8b 5c 24 2c          	mov    0x2c(%esp),%ebx
>   1912df:	8b 4c 24 30          	mov    0x30(%esp),%ecx
>   1912e3:	f0 0f c7 0e          	lock cmpxchg8b (%esi)
>   1912e7:	0f 85 f3 02 00 00    	jne    1915e0 <...>
> 
> Signed-off-by: Uros Bizjak <ubizjak@...il.com>
> Cc: Thomas Gleixner <tglx@...utronix.de>
> Cc: Ingo Molnar <mingo@...nel.org>
> Cc: Borislav Petkov <bp@...en8.de>
> Cc: Dave Hansen <dave.hansen@...ux.intel.com>
> Cc: "H. Peter Anvin" <hpa@...or.com>
> Cc: Peter Zijlstra <peterz@...radead.org>
> ---
>  arch/x86/include/asm/atomic64_32.h | 44 ++++++++++++------------------
>  1 file changed, 18 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
> index 11e817dab44a..84affd7a5d1c 100644
> --- a/arch/x86/include/asm/atomic64_32.h
> +++ b/arch/x86/include/asm/atomic64_32.h
> @@ -201,69 +201,61 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
>  
>  static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);

I reckon it's worth placing this in a helper with a big comment, e.g.

static __always_inline s64 arch_atomic64_read_tearable(atomic64_t *v)
{
	/*
	 * TODO: explain that this might be torn, but it occurs *once*, and can
	 * safely be consumed by atomic64_try_cmpxchg().
	 *
	 * TODO: point to the existing commentary regarding why we use
	 * __READ_ONCE() for KASAN reasons.
	 */
	return __READ_ONCE(v->counter);
}

.. and then use that in each of the instances below.

That way the subtlety is clearly documented, and it'd more clearly align with
the x86_64 verions.

Mark.

>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
>  }
>  
>  static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
>  
> -	return old;
> +	return val;
>  }
>  #define arch_atomic64_fetch_and arch_atomic64_fetch_and
>  
>  static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
>  }
>  
>  static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
>  
> -	return old;
> +	return val;
>  }
>  #define arch_atomic64_fetch_or arch_atomic64_fetch_or
>  
>  static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
>  }
>  
>  static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
> -		c = old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
>  
> -	return old;
> +	return val;
>  }
>  #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
>  
>  static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
>  {
> -	s64 old, c = 0;
> +	s64 val = __READ_ONCE(v->counter);
>  
> -	while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
> -		c = old;
> -
> -	return old;
> +	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));
> +	return val;
>  }
>  #define arch_atomic64_fetch_add arch_atomic64_fetch_add
>  
> -- 
> 2.44.0
> 
>