Message-ID: <Ya9hdlBuWYUWRQzs@hirez.programming.kicks-ass.net>
Date: Tue, 7 Dec 2021 14:28:22 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Christoph Hellwig <hch@...radead.org>
Cc: Jens Axboe <axboe@...nel.dk>,
"linux-block@...r.kernel.org" <linux-block@...r.kernel.org>,
keescook@...omium.org,
Linus Torvalds <torvalds@...ux-foundation.org>,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH] block: switch to atomic_t for request references

On Tue, Dec 07, 2021 at 12:26:24PM +0100, Peter Zijlstra wrote:
> On Sun, Dec 05, 2021 at 10:53:49PM -0800, Christoph Hellwig wrote:
>
> > > +#define req_ref_zero_or_close_to_overflow(req) \
> > > + ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)
> > > +
> > > +static inline bool req_ref_inc_not_zero(struct request *req)
> > > +{
> > > + return atomic_inc_not_zero(&req->ref);
> > > +}
> > > +
> > > +static inline bool req_ref_put_and_test(struct request *req)
> > > +{
> > > + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
> > > + return atomic_dec_and_test(&req->ref);
> > > +}
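>
> (Aside: that macro relies on unsigned wraparound; adding 127u maps any
> count in [-127, 0] into [0, 127], e.g. (unsigned int)-1 + 127u == 126u,
> so a single unsigned compare catches both a put on a zero count and a
> count that has already wrapped negative.)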
>
> So it's just about these two ops, right?
>
> Now, afaict refcount_inc_not_zero() doesn't actually generate terrible
> code; here's the fast-path of kernel/events/core.c:ring_buffer_get():
>
> refcount_inc_not_zero():
>
> a9d0: 41 54 push %r12
> a9d2: 49 89 fc mov %rdi,%r12
> a9d5: e8 00 00 00 00 call a9da <ring_buffer_get+0xa>
>                        a9d6: R_X86_64_PLT32 __rcu_read_lock-0x4
> a9da: 4d 8b a4 24 c8 02 00 00 mov 0x2c8(%r12),%r12
> a9e2: 4d 85 e4 test %r12,%r12
> a9e5: 74 24 je aa0b <ring_buffer_get+0x3b>
> a9e7: 41 8b 14 24 mov (%r12),%edx
> a9eb: 85 d2 test %edx,%edx
> a9ed: 74 1c je aa0b <ring_buffer_get+0x3b>
> a9ef: 8d 4a 01 lea 0x1(%rdx),%ecx
> * a9f2: 89 d0 mov %edx,%eax
> a9f4: f0 41 0f b1 0c 24 lock cmpxchg %ecx,(%r12)
> a9fa: 75 32 jne aa2e <ring_buffer_get+0x5e>
> * a9fc: 09 ca or %ecx,%edx
> * a9fe: 78 19 js aa19 <ring_buffer_get+0x49>
> aa00: e8 00 00 00 00 call aa05 <ring_buffer_get+0x35>
>                        aa01: R_X86_64_PLT32 __rcu_read_unlock-0x4
> aa05: 4c 89 e0 mov %r12,%rax
> aa08: 41 5c pop %r12
> aa0a: c3 ret
>
> The * marked instructions are the difference, vs atomic_inc_not_zero():
>
> a9d0: 41 54 push %r12
> a9d2: 49 89 fc mov %rdi,%r12
> a9d5: e8 00 00 00 00 call a9da <ring_buffer_get+0xa>
>                        a9d6: R_X86_64_PLT32 __rcu_read_lock-0x4
> a9da: 4d 8b a4 24 c8 02 00 00 mov 0x2c8(%r12),%r12
> a9e2: 4d 85 e4 test %r12,%r12
> a9e5: 74 1e je aa05 <ring_buffer_get+0x35>
> a9e7: 41 8b 04 24 mov (%r12),%eax
> a9eb: 85 c0 test %eax,%eax
> a9ed: 74 16 je aa05 <ring_buffer_get+0x35>
> a9ef: 8d 50 01 lea 0x1(%rax),%edx
> a9f2: f0 41 0f b1 14 24 lock cmpxchg %edx,(%r12)
> a9f8: 75 f1 jne a9eb <ring_buffer_get+0x1b>
> a9fa: e8 00 00 00 00 call a9ff <ring_buffer_get+0x2f>
>                        a9fb: R_X86_64_PLT32 __rcu_read_unlock-0x4
> a9ff: 4c 89 e0 mov %r12,%rax
> aa02: 41 5c pop %r12
> aa04: c3 ret
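>
> For reference, what gets compiled in the first listing is roughly this
> (refcount_inc_not_zero() is __refcount_add_not_zero(1, r, NULL)
> underneath, from include/linux/refcount.h); the starred or/js pair is
> the compiler folding the saturation test into (old | new) < 0:
>
>        static inline __must_check
>        bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
>        {
>                int old = refcount_read(r);
>
>                do {
>                        if (!old)
>                                break;
>                } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i));
>
>                if (oldp)
>                        *oldp = old;
>
>                /* the starred instructions come from this check */
>                if (unlikely(old < 0 || old + i < 0))
>                        refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);
>
>                return old;
>        }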
>
>
> Now, ring_buffer_put(), which uses refcount_dec_and_test():
>
> refcount_dec_and_test()
>
> aa40: b8 ff ff ff ff mov $0xffffffff,%eax
> aa45: f0 0f c1 07 lock xadd %eax,(%rdi)
> aa49: 83 f8 01 cmp $0x1,%eax
> aa4c: 74 05 je aa53 <ring_buffer_put+0x13>
> aa4e: 85 c0 test %eax,%eax
> aa50: 7e 1e jle aa70 <ring_buffer_put+0x30>
> aa52: c3 ret
>
> atomic_dec_and_test():
>
> aa40: f0 ff 0f lock decl (%rdi)
> aa43: 75 1d jne aa62 <ring_buffer_put+0x22>
>
> ...
>
> aa62: c3 ret
>
> That one has a larger difference, which is fixable with the below
> patch, leading to:
>
>
> a9f0: f0 ff 0f lock decl (%rdi)
> a9f3: 74 03 je a9f8 <ring_buffer_put+0x8>
> a9f5: 7c 1e jl aa15 <ring_buffer_put+0x25>
> a9f7: c3 ret
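>
> The xadd in the unpatched version is there because the generic code
> wants the old value back to tell the cases apart; roughly, from
> include/linux/refcount.h:
>
>        static inline __must_check
>        bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
>        {
>                int old = atomic_fetch_sub_release(i, &r->refs);
>
>                if (oldp)
>                        *oldp = old;
>
>                if (old == i) {
>                        smp_acquire__after_ctrl_dep();
>                        return true;
>                }
>
>                if (unlikely(old < 0 || old - i < 0))
>                        refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
>
>                return false;
>        }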
>
>
> So where exactly is the performance fail? Is it purely the mess made of
> refcount_dec_and_test()?
>
For refcount_inc(), as extracted from alloc_perf_context(), I get:
4b68: b8 01 00 00 00 mov $0x1,%eax
4b6d: f0 0f c1 43 28 lock xadd %eax,0x28(%rbx)
4b72: 85 c0 test %eax,%eax
4b74: 74 1b je 4b91 <alloc_perf_context+0xf1>
4b76: 8d 50 01 lea 0x1(%rax),%edx
4b79: 09 c2 or %eax,%edx
4b7b: 78 20 js 4b9d <alloc_perf_context+0xfd>
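That is the generic __refcount_add() doing both of its checks on the old
value returned by the xadd, roughly:

        static inline void __refcount_add(int i, refcount_t *r, int *oldp)
        {
                int old = atomic_fetch_add_relaxed(i, &r->refs);

                if (oldp)
                        *oldp = old;

                if (unlikely(!old))                /* the test/je above */
                        refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
                else if (unlikely(old < 0 || old + i < 0)) /* lea/or/js */
                        refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
        }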
The best I can seem to find is (https://godbolt.org/z/ne5o6eEEW):
4b68: b8 01 00 00 00 mov $0x1,%eax
4b6d: f0 0f c1 07 lock xadd %eax,(%rdi)
4b71: 83 c0 01 add $0x1,%eax
4b74: 74 16 je 4b8c <alloc_perf_context+0xec>
4b76: 78 20 js 4b98 <alloc_perf_context+0xf8>
per the updated patch below. Still not really pretty, but loads better,
I'd say.
---
arch/x86/include/asm/refcount.h | 60 +++++++++++++++++++++++++++++++++++++++++
include/linux/refcount.h | 6 ++++++
2 files changed, 66 insertions(+)
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000000..b32b4a5e0f37
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_REFCOUNT_H
+#define _ASM_X86_REFCOUNT_H
+
+#define refcount_dec_and_test refcount_dec_and_test
+static inline bool refcount_dec_and_test(refcount_t *r)
+{
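+ /*
+  * ZF set by the "lock decl" means the count hit zero: we just
+  * dropped the last reference.  SF != OF ("jl") means the count
+  * went negative, i.e. an underflow / use-after-free.
+  */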
+ asm_volatile_goto (LOCK_PREFIX "decl %[var]\n\t"
+ "jz %l[cc_zero]\n\t"
+ "jl %l[cc_error]"
+ : : [var] "m" (r->refs.counter)
+ : "memory"
+ : cc_zero, cc_error);
+ return false;
+
+cc_zero:
+ return true;
+
+cc_error:
+ refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
+ return false;
+}
+
+#define refcount_inc refcount_inc
+static inline void refcount_inc(refcount_t *r)
+{
+ int one = 1;
+
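+ /*
+  * xadd leaves the old count in %eax and the addl turns that into
+  * the new count: ZF means the new count is zero, SF means it is
+  * negative (saturated).  Note asm goto cannot have output operands,
+  * so %eax is modified behind the compiler's back; "one" must not
+  * be used again after the asm.
+  */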
+ asm_volatile_goto (LOCK_PREFIX "xaddl %%eax, %[var]\n\t"
+ "addl $1, %%eax\n\t"
+ "je %l[cc_zero]\n\t"
+ "js %l[cc_error]"
+ : : [var] "m" (r->refs.counter), "a" (one)
+ : "memory"
+ : cc_zero, cc_error);
+
+ return;
+
+cc_zero:
+ refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
+ return;
+
+cc_error:
+ refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
+ return;
+}
+
+#endif
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index b8a6e387f8f9..3ea5757c0b35 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -147,6 +147,8 @@ static inline unsigned int refcount_read(const refcount_t *r)
return atomic_read(&r->refs);
}
+#include <asm/refcount.h>
+
static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
{
int old = refcount_read(r);
@@ -262,10 +264,12 @@ static inline void __refcount_inc(refcount_t *r, int *oldp)
* Will WARN if the refcount is 0, as this represents a possible use-after-free
* condition.
*/
+#ifndef refcount_inc
static inline void refcount_inc(refcount_t *r)
{
__refcount_inc(r, NULL);
}
+#endif
static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
{
@@ -328,10 +332,12 @@ static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp
*
* Return: true if the resulting refcount is 0, false otherwise
*/
+#ifndef refcount_dec_and_test
static inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
return __refcount_dec_and_test(r, NULL);
}
+#endif
static inline void __refcount_dec(refcount_t *r, int *oldp)
{