Message-ID: <Ya9hdlBuWYUWRQzs@hirez.programming.kicks-ass.net>
Date:   Tue, 7 Dec 2021 14:28:22 +0100
From:   Peter Zijlstra <peterz@...radead.org>
To:     Christoph Hellwig <hch@...radead.org>
Cc:     Jens Axboe <axboe@...nel.dk>,
        "linux-block@...r.kernel.org" <linux-block@...r.kernel.org>,
        keescook@...omium.org,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH] block: switch to atomic_t for request references

On Tue, Dec 07, 2021 at 12:26:24PM +0100, Peter Zijlstra wrote:
> On Sun, Dec 05, 2021 at 10:53:49PM -0800, Christoph Hellwig wrote:
> 
> > > +#define req_ref_zero_or_close_to_overflow(req)	\
> > > +	((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)
> > > +
> > > +static inline bool req_ref_inc_not_zero(struct request *req)
> > > +{
> > > +	return atomic_inc_not_zero(&req->ref);
> > > +}
> > > +
> > > +static inline bool req_ref_put_and_test(struct request *req)
> > > +{
> > > +	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
> > > +	return atomic_dec_and_test(&req->ref);
> > > +}
> 
> So it's just about these two ops, right?
> 
> Now, afaict refcount_inc_not_zero() doesn't actually generate terrible
> code; here's the fast-path of kernel/events/core.c:ring_buffer_get():
> 
> refcount_inc_not_zero():
> 
>     a9d0:       41 54                   push   %r12
>     a9d2:       49 89 fc                mov    %rdi,%r12
>     a9d5:       e8 00 00 00 00          call   a9da <ring_buffer_get+0xa>       a9d6: R_X86_64_PLT32    __rcu_read_lock-0x4
>     a9da:       4d 8b a4 24 c8 02 00 00         mov    0x2c8(%r12),%r12
>     a9e2:       4d 85 e4                test   %r12,%r12
>     a9e5:       74 24                   je     aa0b <ring_buffer_get+0x3b>
>     a9e7:       41 8b 14 24             mov    (%r12),%edx
>     a9eb:       85 d2                   test   %edx,%edx
>     a9ed:       74 1c                   je     aa0b <ring_buffer_get+0x3b>
>     a9ef:       8d 4a 01                lea    0x1(%rdx),%ecx
> *   a9f2:       89 d0                   mov    %edx,%eax
>     a9f4:       f0 41 0f b1 0c 24       lock cmpxchg %ecx,(%r12)
>     a9fa:       75 32                   jne    aa2e <ring_buffer_get+0x5e>
> *   a9fc:       09 ca                   or     %ecx,%edx
> *   a9fe:       78 19                   js     aa19 <ring_buffer_get+0x49>
>     aa00:       e8 00 00 00 00          call   aa05 <ring_buffer_get+0x35>      aa01: R_X86_64_PLT32    __rcu_read_unlock-0x4
>     aa05:       4c 89 e0                mov    %r12,%rax
>     aa08:       41 5c                   pop    %r12
>     aa0a:       c3                      ret
> 
> The *-marked instructions are the difference vs atomic_inc_not_zero():
> 
>     a9d0:       41 54                   push   %r12
>     a9d2:       49 89 fc                mov    %rdi,%r12
>     a9d5:       e8 00 00 00 00          call   a9da <ring_buffer_get+0xa>       a9d6: R_X86_64_PLT32    __rcu_read_lock-0x4
>     a9da:       4d 8b a4 24 c8 02 00 00         mov    0x2c8(%r12),%r12
>     a9e2:       4d 85 e4                test   %r12,%r12
>     a9e5:       74 1e                   je     aa05 <ring_buffer_get+0x35>
>     a9e7:       41 8b 04 24             mov    (%r12),%eax
>     a9eb:       85 c0                   test   %eax,%eax
>     a9ed:       74 16                   je     aa05 <ring_buffer_get+0x35>
>     a9ef:       8d 50 01                lea    0x1(%rax),%edx
>     a9f2:       f0 41 0f b1 14 24       lock cmpxchg %edx,(%r12)
>     a9f8:       75 f1                   jne    a9eb <ring_buffer_get+0x1b>
>     a9fa:       e8 00 00 00 00          call   a9ff <ring_buffer_get+0x2f>      a9fb: R_X86_64_PLT32    __rcu_read_unlock-0x4
>     a9ff:       4c 89 e0                mov    %r12,%rax
>     aa02:       41 5c                   pop    %r12
>     aa04:       c3                      ret
> 
> 
> Now, ring_buffer_put(), which uses refcount_dec_and_test():
> 
> refcount_dec_and_test()
> 
>     aa40:       b8 ff ff ff ff          mov    $0xffffffff,%eax
>     aa45:       f0 0f c1 07             lock xadd %eax,(%rdi)
>     aa49:       83 f8 01                cmp    $0x1,%eax
>     aa4c:       74 05                   je     aa53 <ring_buffer_put+0x13>
>     aa4e:       85 c0                   test   %eax,%eax
>     aa50:       7e 1e                   jle    aa70 <ring_buffer_put+0x30>
>     aa52:       c3                      ret
> 
> atomic_dec_and_test():
> 
>     aa40:       f0 ff 0f                lock decl (%rdi)
>     aa43:       75 1d                   jne    aa62 <ring_buffer_put+0x22>
> 
>     ...
> 
>     aa62:       c3                      ret
> 
> That has a larger difference, which is fixable with the below patch,
> leading to:
> 
> 
>     a9f0:       f0 ff 0f                lock decl (%rdi)
>     a9f3:       74 03                   je     a9f8 <ring_buffer_put+0x8>
>     a9f5:       7c 1e                   jl     aa15 <ring_buffer_put+0x25>
>     a9f7:       c3                      ret
> 
> 
> So where exactly is the performance fail? Is it purely the mess made of
> refcount_dec_and_test()?
> 

For refcount_inc(), as extracted from alloc_perf_context(), I get:

    4b68:       b8 01 00 00 00          mov    $0x1,%eax
    4b6d:       f0 0f c1 43 28          lock xadd %eax,0x28(%rbx)
    4b72:       85 c0                   test   %eax,%eax
    4b74:       74 1b                   je     4b91 <alloc_perf_context+0xf1>
    4b76:       8d 50 01                lea    0x1(%rax),%edx
    4b79:       09 c2                   or     %eax,%edx
    4b7b:       78 20                   js     4b9d <alloc_perf_context+0xfd>

The best I can seem to find is: https://godbolt.org/z/ne5o6eEEW

    4b68:	b8 01 00 00 00       	mov    $0x1,%eax
    4b6d:	f0 0f c1 07          	lock xadd %eax,(%rdi)
    4b71:	83 c0 01             	add    $0x1,%eax
    4b74:	74 16                	je     4b8c <alloc_perf_context+0xec>
    4b76:	78 20                	js     4b98 <alloc_perf_context+0xf8>

per the updated patch below. Still not really pretty, but loads better,
I'd say.
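
(For reference, the semantics both variants have to preserve boil down to
roughly the following -- a simplified sketch, not the actual
include/linux/refcount.h code, and ignoring the acquire/release ordering
details:)

static inline bool sketch_refcount_dec_and_test(refcount_t *r)
{
	int old = atomic_fetch_sub_release(1, &r->refs);

	if (old == 1)
		return true;		/* dropped the last reference */
	if (old <= 0)			/* underflow: use-after-free */
		refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
	return false;
}

static inline void sketch_refcount_inc(refcount_t *r)
{
	int old = atomic_fetch_add_relaxed(1, &r->refs);

	if (old == 0)			/* incremented a freed object */
		refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
	else if (old < 0 || old + 1 < 0)	/* went negative: saturate */
		refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
}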


---
 arch/x86/include/asm/refcount.h | 48 +++++++++++++++++++++++++++++++++++++++++
 include/linux/refcount.h        |  6 ++++++
 2 files changed, 54 insertions(+)

diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000000..b32b4a5e0f37
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_REFCOUNT_H
+#define _ASM_X86_REFCOUNT_H
+
+#define refcount_dec_and_test refcount_dec_and_test
+static inline bool refcount_dec_and_test(refcount_t *r)
+{
+	asm_volatile_goto (LOCK_PREFIX "decl %[var]\n\t"
+			   "jz %l[cc_zero]\n\t"
+			   "jl %l[cc_error]"
+			   : : [var] "m" (r->refs.counter)
+			   : "memory"
+			   : cc_zero, cc_error);
+	return false;
+
+cc_zero:
+	return true;
+
+cc_error:
+	refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
+	return false;
+}
+
+#define refcount_inc refcount_inc
+static inline void refcount_inc(refcount_t *r)
+{
+	int one = 1;
+
+	asm_volatile_goto (LOCK_PREFIX "xaddl %%eax, %[var]\n\t"
+			   "addl $1, %%eax\n\t"
+			   "je %l[cc_zero]\n\t"
+			   "js %l[cc_error]"
+			   : : [var] "m" (r->refs.counter), "a" (one)
+			   : "memory"
+			   : cc_zero, cc_error);
+
+	return;
+
+cc_zero:
+	refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
+	return;
+
+cc_error:
+	refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
+	return;
+}
+
+#endif
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index b8a6e387f8f9..3ea5757c0b35 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -147,6 +147,8 @@ static inline unsigned int refcount_read(const refcount_t *r)
 	return atomic_read(&r->refs);
 }
 
+#include <asm/refcount.h>
+
 static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
 {
 	int old = refcount_read(r);
@@ -262,10 +264,12 @@ static inline void __refcount_inc(refcount_t *r, int *oldp)
  * Will WARN if the refcount is 0, as this represents a possible use-after-free
  * condition.
  */
+#ifndef refcount_inc
 static inline void refcount_inc(refcount_t *r)
 {
 	__refcount_inc(r, NULL);
 }
+#endif
 
 static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
 {
@@ -328,10 +332,12 @@ static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp
  *
  * Return: true if the resulting refcount is 0, false otherwise
  */
+#ifndef refcount_dec_and_test
 static inline __must_check bool refcount_dec_and_test(refcount_t *r)
 {
 	return __refcount_dec_and_test(r, NULL);
 }
+#endif
 
 static inline void __refcount_dec(refcount_t *r, int *oldp)
 {

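(In case the asm goto construct in asm/refcount.h above is unfamiliar: the
point is that the C labels become direct branch targets, so the flags set
by the locked instruction are consumed in place instead of being
materialised into a return value. A stripped-down, standalone illustration
of the pattern -- not kernel code, names made up:)

#include <stdbool.h>

/* x86-64, GCC/Clang only: decrement *v and branch straight off the
 * flags from the locked op; no boolean ever lives in a register. */
static inline bool dec_and_test_sketch(int *v)
{
	asm goto ("lock decl %[var]\n\t"
		  "jz %l[is_zero]"
		  : /* no output operands */
		  : [var] "m" (*v)
		  : "memory", "cc"
		  : is_zero);
	return false;

is_zero:
	return true;
}

The jz/jl pair in refcount_dec_and_test() above is the same thing with a
second label for the underflow warning.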