Date:   Thu, 11 Oct 2018 17:24:38 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Eric Dumazet <edumazet@...gle.com>
Cc:     LKML <linux-kernel@...r.kernel.org>,
        Eric Dumazet <eric.dumazet@...il.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>,
        "H. Peter Anvin" <hpa@...or.com>, Borislav Petkov <bp@...en8.de>
Subject: Re: [PATCH] x86/tsc: use real seqcount_latch in cyc2ns_read_begin()

On Thu, Oct 11, 2018 at 08:00:42AM -0700, Eric Dumazet wrote:
> Yes, but the code size is bigger (I have looked at the disassembly)
> 
> All these %gs plus offset add up

> Total length : 0xA7 bytes

effective length: 0x78 bytes

> 00000000000002a0 <native_sched_clock>:
>      2a0: 4c 8d 54 24 08        lea    0x8(%rsp),%r10
>      2a5: 48 83 e4 f0          and    $0xfffffffffffffff0,%rsp
>      2a9: 41 ff 72 f8          pushq  -0x8(%r10)
>      2ad: 55                    push   %rbp
>      2ae: 48 89 e5              mov    %rsp,%rbp
>      2b1: 41 52                push   %r10

       2b3: 66 66 66 66 90	k8_nop5_atomic

>      2b8: 0f 31                rdtsc
>      2ba: 48 c1 e2 20          shl    $0x20,%rdx
>      2be: 48 09 c2              or     %rax,%rdx
>      2c1: 49 89 d2              mov    %rdx,%r10
>      2c4: 49 c7 c1 00 00 00 00 mov    $0x0,%r9
> 		2c7: R_X86_64_32S .data..percpu..shared_aligned
>      2cb: 65 8b 05 00 00 00 00 mov    %gs:0x0(%rip),%eax        # 2d2 <native_sched_clock+0x32>
> 		2ce: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c
>      2d2: 89 c1                mov    %eax,%ecx
>      2d4: 83 e1 01              and    $0x1,%ecx
>      2d7: 48 c1 e1 04          shl    $0x4,%rcx
>      2db: 4c 01 c9              add    %r9,%rcx
>      2de: 65 48 8b 79 08        mov    %gs:0x8(%rcx),%rdi
>      2e3: 65 8b 31              mov    %gs:(%rcx),%esi
>      2e6: 65 8b 49 04          mov    %gs:0x4(%rcx),%ecx
>      2ea: 65 44 8b 05 00 00 00 mov    %gs:0x0(%rip),%r8d        # 2f2 <native_sched_clock+0x52>
>      2f1: 00
> 		2ee: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c
>      2f2: 44 39 c0              cmp    %r8d,%eax
>      2f5: 75 d4                jne    2cb <native_sched_clock+0x2b>
>      2f7: 89 f6                mov    %esi,%esi
>      2f9: 48 89 f0              mov    %rsi,%rax
>      2fc: 49 f7 e2              mul    %r10
>      2ff: 48 0f ad d0          shrd   %cl,%rdx,%rax
>      303: 48 d3 ea              shr    %cl,%rdx
>      306: f6 c1 40              test   $0x40,%cl
>      309: 48 0f 45 c2          cmovne %rdx,%rax
>      30d: 48 01 f8              add    %rdi,%rax
>      310: 41 5a                pop    %r10
>      312: 5d                    pop    %rbp
>      313: 49 8d 62 f8          lea    -0x8(%r10),%rsp
>      317: c3                    retq
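
(The 0x78 counts 0x2a0 up to and including the retq at 0x317; the 0xA7
total presumably also includes the alignment padding up to the next
symbol.)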
> 
> New version :
> 
> Total length = 0x91 bytes

effective: 0x71

> 
> 00000000000002a0 <native_sched_clock>:
>      2a0: 4c 8d 54 24 08        lea    0x8(%rsp),%r10
>      2a5: 48 83 e4 f0          and    $0xfffffffffffffff0,%rsp
>      2a9: 41 ff 72 f8          pushq  -0x8(%r10)
>      2ad: 55                    push   %rbp
>      2ae: 48 89 e5              mov    %rsp,%rbp
>      2b1: 41 52                push   %r10

       2b3: 66 66 66 66 90	k8_nop5_atomic

>      2b8: 0f 31                rdtsc
>      2ba: 48 c1 e2 20          shl    $0x20,%rdx
>      2be: 48 09 c2              or     %rax,%rdx
>      2c1: 49 89 d1              mov    %rdx,%r9
>      2c4: 49 c7 c0 00 00 00 00 mov    $0x0,%r8
>		2c7: R_X86_64_32S .data..percpu..shared_aligned
>      2cb: 65 4c 03 05 00 00 00 add    %gs:0x0(%rip),%r8        # 2d3 <native_sched_clock+0x33>
>      2d2: 00
>		2cf: R_X86_64_PC32 this_cpu_off-0x4
>      2d3: 41 8b 40 20          mov    0x20(%r8),%eax
>      2d7: 89 c6                mov    %eax,%esi
>      2d9: 83 e6 01              and    $0x1,%esi
>      2dc: 48 c1 e6 04          shl    $0x4,%rsi
>      2e0: 4c 01 c6              add    %r8,%rsi
>      2e3: 8b 3e                mov    (%rsi),%edi
>      2e5: 8b 4e 04              mov    0x4(%rsi),%ecx
>      2e8: 48 8b 76 08          mov    0x8(%rsi),%rsi
>      2ec: 41 3b 40 20          cmp    0x20(%r8),%eax
>      2f0: 75 e1                jne    2d3 <native_sched_clock+0x33>
>      2f2: 48 89 f8              mov    %rdi,%rax
>      2f5: 49 f7 e1              mul    %r9
>      2f8: 48 0f ad d0          shrd   %cl,%rdx,%rax
>      2fc: 48 d3 ea              shr    %cl,%rdx
>      2ff: f6 c1 40              test   $0x40,%cl
>      302: 48 0f 45 c2          cmovne %rdx,%rax
>      306: 48 01 f0              add    %rsi,%rax
>      309: 41 5a                pop    %r10
>      30b: 5d                    pop    %rbp
>      30c: 49 8d 62 f8          lea    -0x8(%r10),%rsp
>      310: c3                    retq
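
(Counting 0x2a0 through the retq at 0x310 gives the 0x71. The
per-access savings show up directly above: the 7-byte %gs:0x0(%rip)
sequence load becomes a 4-byte mov 0x20(%r8),%eax, the 8-byte re-read
plus 3-byte cmp collapse into a single 4-byte cmp 0x20(%r8),%eax, the
three %gs data loads (5+3+4 bytes) become plain loads off %rsi (2+3+4
bytes), and the zero-extending mov %esi,%esi is gone; the price is the
one extra 8-byte add %gs:0x0(%rip),%r8 that picks up the per-cpu base,
for a net 7 bytes saved.)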

Ah, right you are. But my version only touches the one cacheline,
whereas yours will do that extra cpu offset load, which might or might
not be hot.

Difficult..
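
For reference, the two variants boil down to roughly the sketches
below. This is a simplified sketch only, not the exact tsc.c code:
struct layout and field names are inferred from the listings, the
caller is assumed to have preemption disabled, and barriers/ordering
are elided.

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/compiler.h>

/* Layout as in the second listing: two 16-byte slots, seq at +0x20. */
struct cyc2ns_data {
        u32 cyc2ns_mul;         /* +0x0 */
        u32 cyc2ns_shift;       /* +0x4 */
        u64 cyc2ns_offset;      /* +0x8 */
};

struct cyc2ns {
        struct cyc2ns_data data[2];
        unsigned int seq;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct cyc2ns, cyc2ns);

/*
 * Variant A (first listing): every access is %gs-relative, so the
 * function only ever touches the cyc2ns cacheline itself.
 */
static __always_inline void cyc2ns_read_a(struct cyc2ns_data *out)
{
        unsigned int seq, idx;

        do {
                seq = this_cpu_read(cyc2ns.seq);
                idx = seq & 1;
                out->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
                out->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
                out->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
        } while (seq != this_cpu_read(cyc2ns.seq));
}

/*
 * Variant B (second listing): resolve the per-cpu base once, which is
 * the extra add %gs:0x0(%rip),%r8 (this_cpu_off) above, then use short
 * plain loads for everything else.
 */
static __always_inline void cyc2ns_read_b(struct cyc2ns_data *out)
{
        struct cyc2ns *c = this_cpu_ptr(&cyc2ns);
        unsigned int seq, idx;

        do {
                seq = READ_ONCE(c->seq);
                idx = seq & 1;
                *out = c->data[idx];
        } while (seq != READ_ONCE(c->seq));
}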

You have some weird stack setup though.. mine doesn't have that:

$ objdump -dr defconfig-build/arch/x86/kernel/tsc.o | awk '/<native_sched_clock>:$/ {p=1} /^$/ {p=0} {if (p) print $0}'
0000000000000b00 <native_sched_clock>:
b00:   55                      push   %rbp
b01:   48 89 e5                mov    %rsp,%rbp
b04:   66 66 66 66 90          k8_nop5_atomic
b09:   0f 31                   rdtsc
b0b:   48 c1 e2 20             shl    $0x20,%rdx
b0f:   48 09 c2                or     %rax,%rdx
b12:   49 89 d2                mov    %rdx,%r10
b15:   49 c7 c1 00 00 00 00    mov    $0x0,%r9
		b18: R_X86_64_32S       .data..percpu..shared_aligned
b1c:   65 8b 05 00 00 00 00    mov    %gs:0x0(%rip),%eax        # b23 <native_sched_clock+0x23>
		b1f: R_X86_64_PC32      .data..percpu..shared_aligned+0x1c
b23:   89 c1                   mov    %eax,%ecx
b25:   83 e1 01                and    $0x1,%ecx
b28:   48 c1 e1 04             shl    $0x4,%rcx
b2c:   4c 01 c9                add    %r9,%rcx
b2f:   65 48 8b 79 08          mov    %gs:0x8(%rcx),%rdi
b34:   65 8b 31                mov    %gs:(%rcx),%esi
b37:   65 8b 49 04             mov    %gs:0x4(%rcx),%ecx
b3b:   65 44 8b 05 00 00 00    mov    %gs:0x0(%rip),%r8d        # b43 <native_sched_clock+0x43>
b42:   00
		b3f: R_X86_64_PC32      .data..percpu..shared_aligned+0x1c
b43:   44 39 c0                cmp    %r8d,%eax
b46:   75 d4                   jne    b1c <native_sched_clock+0x1c>
b48:   89 f6                   mov    %esi,%esi
b4a:   48 89 f0                mov    %rsi,%rax
b4d:   49 f7 e2                mul    %r10
b50:   48 0f ad d0             shrd   %cl,%rdx,%rax
b54:   48 d3 ea                shr    %cl,%rdx
b57:   f6 c1 40                test   $0x40,%cl
b5a:   48 0f 45 c2             cmovne %rdx,%rax
b5e:   48 01 f8                add    %rdi,%rax
b61:   5d                      pop    %rbp
b62:   c3                      retq

Which gets me to 0x63 effective bytes.
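
(The extra prologue in the quoted listings, i.e. the
lea 0x8(%rsp),%r10 / and $0xfffffffffffffff0,%rsp / pushq -0x8(%r10) /
push %r10 sequence with the matching pop %r10 / lea -0x8(%r10),%rsp on
exit, looks like GCC's dynamic 16-byte stack realignment code,
presumably a compiler or config difference rather than anything the
patch changes. It costs 0x15 bytes over the plain push %rbp frame,
which is exactly the gap to the 0x78-byte first listing: apart from the
frame setup, the two functions are instruction-for-instruction
identical.)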


