linux-kernel - Re: [RFC PATCH 1/4] hazptr: Add initial implementation of hazard pointers

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <bba2e656-4c3b-46db-b308-483de440b922@efficios.com>
Date: Fri, 27 Sep 2024 03:20:40 +0200
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Linus Torvalds <torvalds@...ux-foundation.org>,
 Jonas Oberhauser <jonas.oberhauser@...weicloud.com>
Cc: Boqun Feng <boqun.feng@...il.com>, linux-kernel@...r.kernel.org,
 rcu@...r.kernel.org, linux-mm@...ck.org, lkmm@...ts.linux.dev,
 "Paul E. McKenney" <paulmck@...nel.org>,
 Frederic Weisbecker <frederic@...nel.org>,
 Neeraj Upadhyay <neeraj.upadhyay@...nel.org>,
 Joel Fernandes <joel@...lfernandes.org>,
 Josh Triplett <josh@...htriplett.org>, Uladzislau Rezki <urezki@...il.com>,
 Steven Rostedt <rostedt@...dmis.org>, Lai Jiangshan
 <jiangshanlai@...il.com>, Zqiang <qiang.zhang1211@...il.com>,
 Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
 Will Deacon <will@...nel.org>, Waiman Long <longman@...hat.com>,
 Mark Rutland <mark.rutland@....com>, Thomas Gleixner <tglx@...utronix.de>,
 Kent Overstreet <kent.overstreet@...il.com>, Vlastimil Babka
 <vbabka@...e.cz>, maged.michael@...il.com,
 Neeraj Upadhyay <neeraj.upadhyay@....com>
Subject: Re: [RFC PATCH 1/4] hazptr: Add initial implementation of hazard
 pointers

On 2024-09-26 18:12, Linus Torvalds wrote:
> On Thu, 26 Sept 2024 at 08:54, Jonas Oberhauser
> <jonas.oberhauser@...weicloud.com> wrote:
>>
>> No, the issue introduced by the compiler optimization (or by your
>> original patch) is that the CPU can speculatively load from the first
>> pointer as soon as it has completed the load of that pointer:
> 
> You mean the compiler can do it. The inline asm has no impact on what
> the CPU does. The conditional isn't a barrier for the actual hardware.
> But once the compiler doesn't try to do it, the data dependency on the
> address does end up being an ordering constraint on the hardware too
> (I'm happy to say that I haven't heard from the crazies that want
> value prediction in a long time).
> 
> Just use a barrier.  Or make sure to use the proper ordered memory
> accesses when possible. Don't use an inline asm for the compare - we
> don't even have anything insane like that as a portable helper, and we
> shouldn't have it.

How does the compiler barrier help in any way here ?

I am concerned about the compiler SSA GVN (Global Value Numbering)
optimizations, and I don't think a compiler barrier solves anything.
(or I'm missing something obvious)

I was concerned about the suggestion from Jonas to use "node2"
rather than "node" after the equality check as a way to ensure
the intended register is used to return the pointer, because after
the SSA GVN optimization pass, AFAIU this won't help in any way.
I have a set of examples below that show gcc use the result of the
first load, and clang use the result of the second load (on
both x86-64 and aarch64). Likewise when a load-acquire is used as
second load, which I find odd. Hopefully mixing this optimization
from gcc with speculation still abide by the memory model.

Only the asm goto approach ensures that gcc uses the result from
the second load.

#include <stdbool.h>

#define READ_ONCE(x)   \
     (*(__volatile__  __typeof__(x) *)&(x))

static inline
bool same_ptr(void *a, void *b)
{
     asm goto (
#ifdef __x86_64__
         "cmpq %[a], %[b]\n\t"
         "jne %l[ne]\n\t"
#elif defined(__aarch64__)
         "cmp %[a], %[b]\n\t"
         "bne %l[ne]\n\t"
#else
# error "unimplemented"
#endif
         : : [a] "r" (a), [b] "r" (b)
         : : ne);
     return true;
ne:
     return false;
}

int *p;

int fct_2_volatile(void)
{
     int *a, *b;

     do {
         a = READ_ONCE(p);
         asm volatile ("" : : : "memory");
         b = READ_ONCE(p);
     } while (a != b);
     return *b;
}

int fct_volatile_acquire(void)
{
     int *a, *b;

     do {
         a = READ_ONCE(p);
         asm volatile ("" : : : "memory");
         b = __atomic_load_n(&p, __ATOMIC_ACQUIRE);
     } while (a != b);
     return *b;
}

int fct_asm_compare(void)
{
     int *a, *b;

     do {
         a = READ_ONCE(p);
         asm volatile ("" : : : "memory");
         b = READ_ONCE(p);
     } while (!same_ptr(a, b));
     return *b;
}

x86-64 gcc 14.2:

fct_2_volatile:
  mov    rax,QWORD PTR [rip+0x0]        # 7 <fct_2_volatile+0x7>
  mov    rdx,QWORD PTR [rip+0x0]        # e <fct_2_volatile+0xe>
  cmp    rax,rdx
  jne    0 <fct_2_volatile>
  mov    eax,DWORD PTR [rax]
  ret
  cs nop WORD PTR [rax+rax*1+0x0]
fct_volatile_acquire:
  mov    rax,QWORD PTR [rip+0x0]        # 27 <fct_volatile_acquire+0x7>
  mov    rdx,QWORD PTR [rip+0x0]        # 2e <fct_volatile_acquire+0xe>
  cmp    rax,rdx
  jne    20 <fct_volatile_acquire>
  mov    eax,DWORD PTR [rax]
  ret
  cs nop WORD PTR [rax+rax*1+0x0]
fct_asm_compare:
  mov    rdx,QWORD PTR [rip+0x0]        # 47 <fct_asm_compare+0x7>
  mov    rax,QWORD PTR [rip+0x0]        # 4e <fct_asm_compare+0xe>
  cmp    rax,rdx
  jne    40 <fct_asm_compare>
  mov    eax,DWORD PTR [rax]
  ret
main:
  xor    eax,eax
  ret

x86-64 clang 19.1.0:

fct_2_volatile:
  mov    rcx,QWORD PTR [rip+0x0]        # 7 <fct_2_volatile+0x7>
  mov    rax,QWORD PTR [rip+0x0]        # e <fct_2_volatile+0xe>
  cmp    rcx,rax
  jne    0 <fct_2_volatile>
  mov    eax,DWORD PTR [rax]
  ret
  cs nop WORD PTR [rax+rax*1+0x0]
fct_volatile_acquire:
  mov    rcx,QWORD PTR [rip+0x0]        # 27 <fct_volatile_acquire+0x7>
  mov    rax,QWORD PTR [rip+0x0]        # 2e <fct_volatile_acquire+0xe>
  cmp    rcx,rax
  jne    20 <fct_volatile_acquire>
  mov    eax,DWORD PTR [rax]
  ret
  cs nop WORD PTR [rax+rax*1+0x0]
fct_asm_compare:
  mov    rcx,QWORD PTR [rip+0x0]        # 47 <fct_asm_compare+0x7>
  mov    rax,QWORD PTR [rip+0x0]        # 4e <fct_asm_compare+0xe>
  cmp    rax,rcx
  jne    40 <fct_asm_compare>
  mov    eax,DWORD PTR [rax]
  ret
  cs nop WORD PTR [rax+rax*1+0x0]
main:
  xor    eax,eax
  ret

ARM64 gcc 14.2.0:

fct_2_volatile:
         adrp    x0, .LANCHOR0
         add     x0, x0, :lo12:.LANCHOR0
.L2:
         ldr     x1, [x0]
         ldr     x2, [x0]
         cmp     x1, x2
         bne     .L2
         ldr     w0, [x1]
         ret
fct_volatile_acquire:
         adrp    x0, .LANCHOR0
         add     x0, x0, :lo12:.LANCHOR0
.L6:
         ldr     x1, [x0]
         ldar    x2, [x0]
         cmp     x1, x2
         bne     .L6
         ldr     w0, [x1]
         ret
fct_asm_compare:
         adrp    x1, .LANCHOR0
         add     x1, x1, :lo12:.LANCHOR0
.L9:
         ldr     x2, [x1]
         ldr     x0, [x1]
         cmp x2, x0
         bne .L9

         ldr     w0, [x0]
         ret
p:
         .zero   8

armv8-a clang (trunk):

fct_2_volatile:
  adrp	x8, 0 <fct_2_volatile>
  ldr	x10, [x8]
  ldr	x9, [x8]
  cmp	x10, x9
  b.ne	4 <fct_2_volatile+0x4>  // b.any
  ldr	w0, [x9]
  ret
fct_volatile_acquire:
  adrp	x8, 0 <fct_2_volatile>
  add	x8, x8, #0x0
  ldr	x10, [x8]
  ldar	x9, [x8]
  cmp	x10, x9
  b.ne	24 <fct_volatile_acquire+0x8>  // b.any
  ldr	w0, [x9]
  ret
fct_asm_compare:
  adrp	x8, 0 <fct_2_volatile>
  ldr	x9, [x8]
  ldr	x8, [x8]
  cmp	x9, x8
  b.ne	3c <fct_asm_compare>  // b.any
  ldr	w0, [x8]
  ret
main:
  mov	w0, wzr

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com