Date:   Thu, 11 Oct 2018 17:24:38 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Eric Dumazet <edumazet@...gle.com>
Cc:     LKML <linux-kernel@...r.kernel.org>,
        Eric Dumazet <eric.dumazet@...il.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>,
        "H. Peter Anvin" <hpa@...or.com>, Borislav Petkov <bp@...en8.de>
Subject: Re: [PATCH] x86/tsc: use real seqcount_latch in cyc2ns_read_begin()

On Thu, Oct 11, 2018 at 08:00:42AM -0700, Eric Dumazet wrote:
> Yes, but the code size is bigger (I have looked at the disassembly)
> 
> All these %gs plus offset add up

> Total length : 0xA7 bytes

effective length: 0x78 bytes

> 00000000000002a0 <native_sched_clock>:
>      2a0: 4c 8d 54 24 08        lea    0x8(%rsp),%r10
>      2a5: 48 83 e4 f0          and    $0xfffffffffffffff0,%rsp
>      2a9: 41 ff 72 f8          pushq  -0x8(%r10)
>      2ad: 55                    push   %rbp
>      2ae: 48 89 e5              mov    %rsp,%rbp
>      2b1: 41 52                push   %r10

       2b3: 66 66 66 66 90	k8_nop5_atomic

>      2b8: 0f 31                rdtsc
>      2ba: 48 c1 e2 20          shl    $0x20,%rdx
>      2be: 48 09 c2              or     %rax,%rdx
>      2c1: 49 89 d2              mov    %rdx,%r10
>      2c4: 49 c7 c1 00 00 00 00 mov    $0x0,%r9
> 		2c7: R_X86_64_32S .data..percpu..shared_aligned
>      2cb: 65 8b 05 00 00 00 00 mov    %gs:0x0(%rip),%eax        # 2d2 <native_sched_clock+0x32>
> 		2ce: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c
>      2d2: 89 c1                mov    %eax,%ecx
>      2d4: 83 e1 01              and    $0x1,%ecx
>      2d7: 48 c1 e1 04          shl    $0x4,%rcx
>      2db: 4c 01 c9              add    %r9,%rcx
>      2de: 65 48 8b 79 08        mov    %gs:0x8(%rcx),%rdi
>      2e3: 65 8b 31              mov    %gs:(%rcx),%esi
>      2e6: 65 8b 49 04          mov    %gs:0x4(%rcx),%ecx
>      2ea: 65 44 8b 05 00 00 00 mov    %gs:0x0(%rip),%r8d        # 2f2 <native_sched_clock+0x52>
>      2f1: 00
> 		2ee: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c
>      2f2: 44 39 c0              cmp    %r8d,%eax
>      2f5: 75 d4                jne    2cb <native_sched_clock+0x2b>
>      2f7: 89 f6                mov    %esi,%esi
>      2f9: 48 89 f0              mov    %rsi,%rax
>      2fc: 49 f7 e2              mul    %r10
>      2ff: 48 0f ad d0          shrd   %cl,%rdx,%rax
>      303: 48 d3 ea              shr    %cl,%rdx
>      306: f6 c1 40              test   $0x40,%cl
>      309: 48 0f 45 c2          cmovne %rdx,%rax
>      30d: 48 01 f8              add    %rdi,%rax
>      310: 41 5a                pop    %r10
>      312: 5d                    pop    %rbp
>      313: 49 8d 62 f8          lea    -0x8(%r10),%rsp
>      317: c3                    retq
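
(The 0x78 counts 0x2a0 up to and including the retq at 0x317; the 0xA7
total presumably also includes the alignment padding up to the next
symbol.)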
> 
> New version :
> 
> Total length = 0x91 bytes

effective: 0x71

> 
> 00000000000002a0 <native_sched_clock>:
>      2a0: 4c 8d 54 24 08        lea    0x8(%rsp),%r10
>      2a5: 48 83 e4 f0          and    $0xfffffffffffffff0,%rsp
>      2a9: 41 ff 72 f8          pushq  -0x8(%r10)
>      2ad: 55                    push   %rbp
>      2ae: 48 89 e5              mov    %rsp,%rbp
>      2b1: 41 52                push   %r10

       2b3: 66 66 66 66 90	k8_nop5_atomic

>      2b8: 0f 31                rdtsc
>      2ba: 48 c1 e2 20          shl    $0x20,%rdx
>      2be: 48 09 c2              or     %rax,%rdx
>      2c1: 49 89 d1              mov    %rdx,%r9
>      2c4: 49 c7 c0 00 00 00 00 mov    $0x0,%r8
>		2c7: R_X86_64_32S .data..percpu..shared_aligned
>      2cb: 65 4c 03 05 00 00 00 add    %gs:0x0(%rip),%r8        # 2d3 <native_sched_clock+0x33>
>      2d2: 00
>		2cf: R_X86_64_PC32 this_cpu_off-0x4
>      2d3: 41 8b 40 20          mov    0x20(%r8),%eax
>      2d7: 89 c6                mov    %eax,%esi
>      2d9: 83 e6 01              and    $0x1,%esi
>      2dc: 48 c1 e6 04          shl    $0x4,%rsi
>      2e0: 4c 01 c6              add    %r8,%rsi
>      2e3: 8b 3e                mov    (%rsi),%edi
>      2e5: 8b 4e 04              mov    0x4(%rsi),%ecx
>      2e8: 48 8b 76 08          mov    0x8(%rsi),%rsi
>      2ec: 41 3b 40 20          cmp    0x20(%r8),%eax
>      2f0: 75 e1                jne    2d3 <native_sched_clock+0x33>
>      2f2: 48 89 f8              mov    %rdi,%rax
>      2f5: 49 f7 e1              mul    %r9
>      2f8: 48 0f ad d0          shrd   %cl,%rdx,%rax
>      2fc: 48 d3 ea              shr    %cl,%rdx
>      2ff: f6 c1 40              test   $0x40,%cl
>      302: 48 0f 45 c2          cmovne %rdx,%rax
>      306: 48 01 f0              add    %rsi,%rax
>      309: 41 5a                pop    %r10
>      30b: 5d                    pop    %rbp
>      30c: 49 8d 62 f8          lea    -0x8(%r10),%rsp
>      310: c3                    retq
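
(Counting 0x2a0 through the retq at 0x310 gives the 0x71. The
per-access savings show up directly above: the 7-byte %gs:0x0(%rip)
sequence load becomes a 4-byte mov 0x20(%r8),%eax, the 8-byte re-read
plus 3-byte cmp collapse into a single 4-byte cmp 0x20(%r8),%eax, the
three %gs data loads (5+3+4 bytes) become plain loads off %rsi (2+3+4
bytes), and the zero-extending mov %esi,%esi is gone; the price is the
one extra 8-byte add %gs:0x0(%rip),%r8 that picks up the per-cpu base,
for a net 7 bytes saved.)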

Ah, right you are. But my version only touches the one cacheline,
whereas yours will do that extra cpu offset load, which might or might
not be hot.

Difficult..
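
For reference, the two variants boil down to roughly the sketches
below. This is a simplified sketch only, not the exact tsc.c code:
struct layout and field names are inferred from the listings, the
caller is assumed to have preemption disabled, and barriers/ordering
are elided.

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/compiler.h>

/* Layout as in the second listing: two 16-byte slots, seq at +0x20. */
struct cyc2ns_data {
        u32 cyc2ns_mul;         /* +0x0 */
        u32 cyc2ns_shift;       /* +0x4 */
        u64 cyc2ns_offset;      /* +0x8 */
};

struct cyc2ns {
        struct cyc2ns_data data[2];
        unsigned int seq;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct cyc2ns, cyc2ns);

/*
 * Variant A (first listing): every access is %gs-relative, so the
 * function only ever touches the cyc2ns cacheline itself.
 */
static __always_inline void cyc2ns_read_a(struct cyc2ns_data *out)
{
        unsigned int seq, idx;

        do {
                seq = this_cpu_read(cyc2ns.seq);
                idx = seq & 1;
                out->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
                out->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
                out->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
        } while (seq != this_cpu_read(cyc2ns.seq));
}

/*
 * Variant B (second listing): resolve the per-cpu base once, which is
 * the extra add %gs:0x0(%rip),%r8 (this_cpu_off) above, then use short
 * plain loads for everything else.
 */
static __always_inline void cyc2ns_read_b(struct cyc2ns_data *out)
{
        struct cyc2ns *c = this_cpu_ptr(&cyc2ns);
        unsigned int seq, idx;

        do {
                seq = READ_ONCE(c->seq);
                idx = seq & 1;
                *out = c->data[idx];
        } while (seq != READ_ONCE(c->seq));
}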

You have some weird stack setup though.. mine doesn't have that:

$ objdump -dr defconfig-build/arch/x86/kernel/tsc.o | awk '/<native_sched_clock>:$/ {p=1} /^$/ {p=0} {if (p) print $0}'
0000000000000b00 <native_sched_clock>:
b00:   55                      push   %rbp
b01:   48 89 e5                mov    %rsp,%rbp
b04:   66 66 66 66 90          k8_nop5_atomic
b09:   0f 31                   rdtsc
b0b:   48 c1 e2 20             shl    $0x20,%rdx
b0f:   48 09 c2                or     %rax,%rdx
b12:   49 89 d2                mov    %rdx,%r10
b15:   49 c7 c1 00 00 00 00    mov    $0x0,%r9
		b18: R_X86_64_32S       .data..percpu..shared_aligned
b1c:   65 8b 05 00 00 00 00    mov    %gs:0x0(%rip),%eax        # b23 <native_sched_clock+0x23>
		b1f: R_X86_64_PC32      .data..percpu..shared_aligned+0x1c
b23:   89 c1                   mov    %eax,%ecx
b25:   83 e1 01                and    $0x1,%ecx
b28:   48 c1 e1 04             shl    $0x4,%rcx
b2c:   4c 01 c9                add    %r9,%rcx
b2f:   65 48 8b 79 08          mov    %gs:0x8(%rcx),%rdi
b34:   65 8b 31                mov    %gs:(%rcx),%esi
b37:   65 8b 49 04             mov    %gs:0x4(%rcx),%ecx
b3b:   65 44 8b 05 00 00 00    mov    %gs:0x0(%rip),%r8d        # b43 <native_sched_clock+0x43>
b42:   00
		b3f: R_X86_64_PC32      .data..percpu..shared_aligned+0x1c
b43:   44 39 c0                cmp    %r8d,%eax
b46:   75 d4                   jne    b1c <native_sched_clock+0x1c>
b48:   89 f6                   mov    %esi,%esi
b4a:   48 89 f0                mov    %rsi,%rax
b4d:   49 f7 e2                mul    %r10
b50:   48 0f ad d0             shrd   %cl,%rdx,%rax
b54:   48 d3 ea                shr    %cl,%rdx
b57:   f6 c1 40                test   $0x40,%cl
b5a:   48 0f 45 c2             cmovne %rdx,%rax
b5e:   48 01 f8                add    %rdi,%rax
b61:   5d                      pop    %rbp
b62:   c3                      retq

Which gets me to 0x63 effective bytes.
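
(The extra prologue in the quoted listings, i.e. the
lea 0x8(%rsp),%r10 / and $0xfffffffffffffff0,%rsp / pushq -0x8(%r10) /
push %r10 sequence with the matching pop %r10 / lea -0x8(%r10),%rsp on
exit, looks like GCC's dynamic 16-byte stack realignment code,
presumably a compiler or config difference rather than anything the
patch changes. It costs 0x15 bytes over the plain push %rbp frame,
which is exactly the gap to the 0x78-byte first listing: apart from the
frame setup, the two functions are instruction-for-instruction
identical.)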


