lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <YbIB7aJU5ngCcaNj@FVFF77S0Q05N>
Date:   Thu, 9 Dec 2021 13:17:33 +0000
From:   Mark Rutland <mark.rutland@....com>
To:     Peter Zijlstra <peterz@...radead.org>
Cc:     will@...nel.org, boqun.feng@...il.com,
        linux-kernel@...r.kernel.org, x86@...nel.org, elver@...gle.com,
        keescook@...omium.org, hch@...radead.org,
        torvalds@...ux-foundation.org
Subject: Re: [RFC][PATCH 2/5] refcount: Use atomic_*_ofl()

On Wed, Dec 08, 2021 at 07:36:57PM +0100, Peter Zijlstra wrote:
> Use the shiny new atomic_*_ofl() functions in order to have better
> code-gen.
> 
> Notably refcount_inc() case no longer distinguishes between
> inc-from-zero and inc-negative in the fast path, this improves
> code-gen:
> 
>     4b88:       b8 01 00 00 00          mov    $0x1,%eax
>     4b8d:       f0 0f c1 43 28          lock xadd %eax,0x28(%rbx)
>     4b92:       85 c0                   test   %eax,%eax
>     4b94:       74 1b                   je     4bb1 <alloc_perf_context+0xf1>
>     4b96:       8d 50 01                lea    0x1(%rax),%edx
>     4b99:       09 c2                   or     %eax,%edx
>     4b9b:       78 20                   js     4bbd <alloc_perf_context+0xfd>
> 
> to:
> 
>     4768:       b8 01 00 00 00          mov    $0x1,%eax
>     476d:       f0 0f c1 43 28          lock xadd %eax,0x28(%rbx)
>     4772:       85 c0                   test   %eax,%eax
>     4774:       7e 14                   jle    478a <alloc_perf_context+0xea>

For comparison, I generated the same for arm64 below with kernel.org crosstool
GCC 11.1.0 and defconfig.

For arm64 there's an existing sub-optimiality for inc/dec where the register
for `1` or `-1` gets generated with a `MOV;MOV` chain or `MOV;NEG` chain rather
than a single `MOV` in either case. I think taht's due to the way we build
LSE/LL-SC variants of add() and build a common inc() atop, and the compiler
just loses the opportunity to constant-fold. I'll take a look at how to make
that neater.

Regardless, the code goes from:

    51f4:       52800024        mov     w4, #0x1                        // #1
    ...
    5224:       2a0403e1        mov     w1, w4
    5228:       b8210001        ldadd   w1, w1, [x0]
    522c:       34000261        cbz     w1, 5278 <alloc_perf_context+0xf8>
    5230:       11000422        add     w2, w1, #0x1
    5234:       2a010041        orr     w1, w2, w1
    5238:       37f80181        tbnz    w1, #31, 5268 <alloc_perf_context+0xe8>

to:

    40e8:       52800024        mov     w4, #0x1                        // #1
    ...
    4118:       2a0403e1        mov     w1, w4
    411c:       b8e10001        ldaddal w1, w1, [x0]
    4120:       7100003f        cmp     w1, #0x0
    4124:       5400018d        b.le    4154 <alloc_perf_context+0xe0>

> without sacrificing on functionality; the only thing that suffers is
> the reported error condition, which might now 'spuriously' report
> 'saturated' instead of 'inc-from-zero'.
> 
> refcount_dec_and_test() is also improved:
> 
>     aa40:       b8 ff ff ff ff          mov    $0xffffffff,%eax
>     aa45:       f0 0f c1 07             lock xadd %eax,(%rdi)
>     aa49:       83 f8 01                cmp    $0x1,%eax
>     aa4c:       74 05                   je     aa53 <ring_buffer_put+0x13>
>     aa4e:       85 c0                   test   %eax,%eax
>     aa50:       7e 1e                   jle    aa70 <ring_buffer_put+0x30>
>     aa52:       c3                      ret
> 
> to:
> 
>     a980:       b8 ff ff ff ff          mov    $0xffffffff,%eax
>     a985:       f0 0f c1 07             lock xadd %eax,(%rdi)
>     a989:       83 e8 01                sub    $0x1,%eax
>     a98c:       78 20                   js     a9ae <ring_buffer_put+0x2e>
>     a98e:       74 01                   je     a991 <ring_buffer_put+0x11>
>     a990:       c3                      ret

For arm64 (with your fixlet applied) that goes from:

    c42c:       52800021        mov     w1, #0x1                        // #1
    c430:       4b0103e1        neg     w1, w1
    c434:       b8610001        ldaddl  w1, w1, [x0]
    c438:       7100043f        cmp     w1, #0x1
    c43c:       54000140        b.eq    c464 <ring_buffer_put+0x50>  // b.none
    c440:       7100003f        cmp     w1, #0x0
    c444:       5400028d        b.le    c494 <ring_buffer_put+0x80>

to:

    c3dc:       52800021        mov     w1, #0x1                        // #1
    c3e0:       4b0103e1        neg     w1, w1
    c3e4:       b8e10002        ldaddal w1, w2, [x0]
    c3e8:       0b020021        add     w1, w1, w2
    c3ec:       7100003f        cmp     w1, #0x0
    c3f0:       5400012b        b.lt    c414 <ring_buffer_put+0x50>  // b.tstop
    c3f4:       540001a0        b.eq    c428 <ring_buffer_put+0x64>  // b.none

I think the add here is due to  the change in your fixlet:

-       if (unlikely(old <= 1))                                                                                                                                                                             
+       if (unlikely(old - 1 <= 0)) 

> XXX atomic_dec_and_test_ofl() is strictly stronger ordered than
> refcount_dec_and_test() is defined -- Power and Arrghh64 ?

I'll leave the ordering to Will.

Thanks,
Mark.

> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> ---
>  include/linux/refcount.h |   15 ++++++++++++---
>  lib/refcount.c           |    5 +++++
>  2 files changed, 17 insertions(+), 3 deletions(-)
> 
> --- a/include/linux/refcount.h
> +++ b/include/linux/refcount.h
> @@ -264,7 +264,10 @@ static inline void __refcount_inc(refcou
>   */
>  static inline void refcount_inc(refcount_t *r)
>  {
> -	__refcount_inc(r, NULL);
> +	atomic_inc_ofl(&r->refs, Eoverflow);
> +	return;
> +Eoverflow:
> +	refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
>  }
>  
>  static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
> @@ -330,7 +333,10 @@ static inline __must_check bool __refcou
>   */
>  static inline __must_check bool refcount_dec_and_test(refcount_t *r)
>  {
> -	return __refcount_dec_and_test(r, NULL);
> +	return atomic_dec_and_test_ofl(&r->refs, Eoverflow);
> +Eoverflow:
> +	refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
> +	return false;
>  }
>  
>  static inline void __refcount_dec(refcount_t *r, int *oldp)
> @@ -356,7 +362,10 @@ static inline void __refcount_dec(refcou
>   */
>  static inline void refcount_dec(refcount_t *r)
>  {
> -	__refcount_dec(r, NULL);
> +	atomic_dec_ofl(&r->refs, Eoverflow);
> +	return;
> +Eoverflow:
> +	refcount_warn_saturate(r, REFCOUNT_DEC_LEAK);
>  }
>  
>  extern __must_check bool refcount_dec_if_one(refcount_t *r);
> --- a/lib/refcount.c
> +++ b/lib/refcount.c
> @@ -12,8 +12,13 @@
>  
>  void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t)
>  {
> +	int old = refcount_read(r);
>  	refcount_set(r, REFCOUNT_SATURATED);
>  
> +	/* racy; who cares */
> +	if (old == 1 && t == REFCOUNT_ADD_OVF)
> +		t = REFCOUNT_ADD_UAF;
> +
>  	switch (t) {
>  	case REFCOUNT_ADD_NOT_ZERO_OVF:
>  		REFCOUNT_WARN("saturated; leaking memory");
> 
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ