linux-kernel - Re: [PATCH] x86/tsc: use real seqcount_latch in cyc2ns_read

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20181011084047.GA9885@hirez.programming.kicks-ass.net>
Date:   Thu, 11 Oct 2018 10:40:47 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Eric Dumazet <edumazet@...gle.com>
Cc:     linux-kernel <linux-kernel@...r.kernel.org>,
        Eric Dumazet <eric.dumazet@...il.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>,
        "H. Peter Anvin" <hpa@...or.com>, Borislav Petkov <bp@...en8.de>
Subject: Re: [PATCH] x86/tsc: use real seqcount_latch in cyc2ns_read_begin()

On Thu, Oct 11, 2018 at 09:31:33AM +0200, Peter Zijlstra wrote:
> On Wed, Oct 10, 2018 at 05:33:36PM -0700, Eric Dumazet wrote:
> > While looking at native_sched_clock() disassembly I had
> > the surprise to see the compiler (gcc 7.3 here) had
> > optimized out the loop, meaning the code is broken.
> > 
> > Using the documented and approved API not only fixes the bug,
> > it also makes the code more readable.
> > 
> > Replacing five this_cpu_read() by one this_cpu_ptr() makes
> > the generated code smaller.
> 
> Does not for me, that is, the resulting asm is actually larger
> 
> You're quite right the loop went missing; no idea wth that compiler is
> smoking (gcc-8.2 for me). In order to eliminate that loop it needs to
> think that two consecutive loads of this_cpu_read(cyc2ns.seq.sequence)
> will return the same value. But this_cpu_read() is an asm() statement,
> it _should_ not assume such.
> 
> We assume that this_cpu_read() implies READ_ONCE() in a number of
> locations, this really should not happen.
> 
> The reason it was written using this_cpu_read() is so that it can use
> %gs: prefixed instructions and avoid ever loading that percpu offset and
> doing manual address computation.
> 
> Let me prod at this with a sharp stick.

OK, so apart from the inlining being crap, which is fixed by:

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6490f618e096..638491062fea 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -57,7 +57,7 @@ struct cyc2ns {
 
 static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
 
-void cyc2ns_read_begin(struct cyc2ns_data *data)
+void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data)
 {
 	int seq, idx;
 
@@ -74,7 +74,7 @@ void cyc2ns_read_begin(struct cyc2ns_data *data)
 	} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
 }
 
-void cyc2ns_read_end(void)
+void __always_inline cyc2ns_read_end(void)
 {
 	preempt_enable_notrace();
 }
@@ -103,7 +103,7 @@ void cyc2ns_read_end(void)
  *                      -johnstul@...ibm.com "math is hard, lets go shopping!"
  */
 
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
 	struct cyc2ns_data data;
 	unsigned long long ns;


That gets us:

native_sched_clock:
	pushq   %rbp    #
	movq    %rsp, %rbp      #,

 ... jump label ...

	rdtsc
	salq    $32, %rdx       #, tmp110
	orq     %rax, %rdx      # low, tmp110
	movl %gs:cyc2ns+32(%rip),%ecx   # cyc2ns.seq.sequence, pfo_ret__
	andl    $1, %ecx        #, idx
	salq    $4, %rcx        #, tmp116
	movl %gs:cyc2ns(%rcx),%esi      # cyc2ns.data[idx_14].cyc2ns_mul, pfo_ret__
	movl    %esi, %esi      # pfo_ret__, pfo_ret__
	movq    %rsi, %rax      # pfo_ret__, tmp133
	mulq    %rdx    # _23
	movq %gs:cyc2ns+8(%rcx),%rdi    # cyc2ns.data[idx_14].cyc2ns_offset, pfo_ret__
	addq    $cyc2ns, %rcx   #, tmp117
	movl %gs:4(%rcx),%ecx   # cyc2ns.data[idx_14].cyc2ns_shift, pfo_ret__
	shrdq   %rdx, %rax      # pfo_ret__,, tmp134
	shrq    %cl, %rdx       # pfo_ret__,
	testb   $64, %cl        #, pfo_ret__
	cmovne  %rdx, %rax      #,, tmp134
	addq    %rdi, %rax      # pfo_ret__, <retval>
	popq    %rbp    #
	ret


If we then fix the percpu mess, with:

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e9202a0de8f0..1a19d11cfbbd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -185,22 +185,22 @@ do {									\
 	typeof(var) pfo_ret__;				\
 	switch (sizeof(var)) {				\
 	case 1:						\
-		asm(op "b "__percpu_arg(1)",%0"		\
+		asm volatile(op "b "__percpu_arg(1)",%0"\
 		    : "=q" (pfo_ret__)			\
 		    : "m" (var));			\
 		break;					\
 	case 2:						\
-		asm(op "w "__percpu_arg(1)",%0"		\
+		asm volatile(op "w "__percpu_arg(1)",%0"\
 		    : "=r" (pfo_ret__)			\
 		    : "m" (var));			\
 		break;					\
 	case 4:						\
-		asm(op "l "__percpu_arg(1)",%0"		\
+		asm volatile(op "l "__percpu_arg(1)",%0"\
 		    : "=r" (pfo_ret__)			\
 		    : "m" (var));			\
 		break;					\
 	case 8:						\
-		asm(op "q "__percpu_arg(1)",%0"		\
+		asm volatile(op "q "__percpu_arg(1)",%0"\
 		    : "=r" (pfo_ret__)			\
 		    : "m" (var));			\
 		break;					\


That turns into:

native_sched_clock:
	pushq   %rbp    #
	movq    %rsp, %rbp      #,

 ... jump label ...

	rdtsc
	salq    $32, %rdx       #, tmp110
	orq     %rax, %rdx      # low, tmp110
	movq    %rdx, %r10      # tmp110, _23
	movq    $cyc2ns, %r9    #, tmp136
.L235:
	movl %gs:cyc2ns+32(%rip),%eax   # cyc2ns.seq.sequence, pfo_ret__
	movl    %eax, %ecx      # pfo_ret__, idx
	andl    $1, %ecx        #, idx
	salq    $4, %rcx        #, tmp116
	addq    %r9, %rcx       # tmp136, tmp117
	movq %gs:8(%rcx),%rdi   # cyc2ns.data[idx_14].cyc2ns_offset, pfo_ret__
	movl %gs:(%rcx),%esi    # cyc2ns.data[idx_14].cyc2ns_mul, pfo_ret__
	movl %gs:4(%rcx),%ecx   # cyc2ns.data[idx_14].cyc2ns_shift, pfo_ret__
	movl %gs:cyc2ns+32(%rip),%r8d   # cyc2ns.seq.sequence, pfo_ret__
	cmpl    %r8d, %eax      # pfo_ret__, pfo_ret__
	jne     .L235   #,
	movl    %esi, %esi      # pfo_ret__, pfo_ret__
	movq    %rsi, %rax      # pfo_ret__, tmp133
	mulq    %r10    # _23
	shrdq   %rdx, %rax      # pfo_ret__,, tmp134
	shrq    %cl, %rdx       # pfo_ret__,
	testb   $64, %cl        #, pfo_ret__
	cmovne  %rdx, %rax      #,, tmp134
	addq    %rdi, %rax      # pfo_ret__, <retval>
	popq    %rbp    #
	ret

which is exactly right. Except perhaps for the mess that
mul_u64_u32_shr() turns into.