lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Thu, 18 Aug 2016 11:21:48 +0200
From:	Denys Vlasenko <dvlasenk@...hat.com>
To:	Linus Torvalds <torvalds@...ux-foundation.org>
Cc:	Andy Lutomirski <luto@...capital.net>,
	Sara Sharon <sara.sharon@...el.com>,
	Dan Williams <dan.j.williams@...el.com>,
	Christian König <christian.koenig@....com>,
	Vinod Koul <vinod.koul@...el.com>,
	Alex Deucher <alexander.deucher@....com>,
	Johannes Berg <johannes.berg@...el.com>,
	"Rafael J. Wysocki" <rafael.j.wysocki@...el.com>,
	Andy Lutomirski <luto@...nel.org>,
	the arch/x86 maintainers <x86@...nel.org>,
	Ingo Molnar <mingo@...nel.org>,
	LKML <linux-kernel@...r.kernel.org>,
	Adrian Hunter <adrian.hunter@...el.com>
Subject: Re: RFC: Petition Intel/AMD to add POPF_IF insn

On 08/17/2016 11:35 PM, Linus Torvalds wrote:
> On Wed, Aug 17, 2016 at 2:26 PM, Linus Torvalds
> <torvalds@...ux-foundation.org> wrote:
>>
>> Of course, if somebody uses native_restore_fl() to actually *disable*
>> interrupts (when they weren't already disabled), then this untested
>> patch will just not work. But why would you do somethign so stupid?
>> Famous last words...
>
> Looking around, the vsmp code actually uses "native_restore_fl()" to
> not just set the interrupt flag, but AC as well.
>
> And the PV spinlock case has that "push;popf" sequence encoded in an alternate.
>
> So that trivial patch may (or may not - still not tested) work for
> some quick testing, but needs more effort for any *real* use.

I'm going to test the attached patch.

PV and debug maze is daunting... I discovered that Fedora kernels
compile with the set of .config options which result in
spin_unlock_irqrestore which looks like this:

ffffffff818d0f40 <_raw_spin_unlock_irqrestore>:
ffffffff818d0f40:       e8 1b 31 00 00          callq  ffffffff818d4060 <__fentry__>    ffffffff818d0f41: R_X86_64_PC32 __fentry__-0x4
ffffffff818d0f45:       55                      push   %rbp
ffffffff818d0f46:       48 89 e5                mov    %rsp,%rbp
ffffffff818d0f49:       41 54                   push   %r12
ffffffff818d0f4b:       53                      push   %rbx
ffffffff818d0f4c:       48 8b 55 08             mov    0x8(%rbp),%rdx
ffffffff818d0f50:       49 89 fc                mov    %rdi,%r12
ffffffff818d0f53:       48 89 f3                mov    %rsi,%rbx
ffffffff818d0f56:       48 83 c7 18             add    $0x18,%rdi
ffffffff818d0f5a:       be 01 00 00 00          mov    $0x1,%esi
ffffffff818d0f5f:       e8 8c b8 83 ff          callq  ffffffff8110c7f0 <lock_release>  ffffffff818d0f60: R_X86_64_PC32 lock_release-0x4
ffffffff818d0f64:       4c 89 e7                mov    %r12,%rdi
ffffffff818d0f67:       e8 d4 fb 83 ff          callq  ffffffff81110b40 <do_raw_spin_unlock>    ffffffff818d0f68: R_X86_64_PC32 do_raw_spin_unlock-0x4
ffffffff818d0f6c:       f6 c7 02                test   $0x2,%bh
ffffffff818d0f6f:       74 1b                   je     ffffffff818d0f8c <_raw_spin_unlock_irqrestore+0x4c>
ffffffff818d0f71:       e8 aa 9d 83 ff          callq  ffffffff8110ad20 <trace_hardirqs_on>     ffffffff818d0f72: R_X86_64_PC32 trace_hardirqs_on-0x4
ffffffff818d0f76:       48 89 df                mov    %rbx,%rdi
ffffffff818d0f79:       ff 14 25 48 32 e2 81    callq  *0xffffffff81e23248      ffffffff818d0f7c: R_X86_64_32S  pv_irq_ops+0x8
ffffffff818d0f80:       65 ff 0d c9 c4 73 7e    decl   %gs:0x7e73c4c9(%rip)        # d450 <__preempt_count>     ffffffff818d0f83: R_X86_64_PC32 __preempt_count-0x4
ffffffff818d0f87:       5b                      pop    %rbx
ffffffff818d0f88:       41 5c                   pop    %r12
ffffffff818d0f8a:       5d                      pop    %rbp
ffffffff818d0f8b:       c3                      retq
ffffffff818d0f8c:       48 89 df                mov    %rbx,%rdi
ffffffff818d0f8f:       ff 14 25 48 32 e2 81    callq  *0xffffffff81e23248      ffffffff818d0f92: R_X86_64_32S  pv_irq_ops+0x8
ffffffff818d0f96:       e8 35 5b 83 ff          callq  ffffffff81106ad0 <trace_hardirqs_off>    ffffffff818d0f97: R_X86_64_PC32 trace_hardirqs_off-0x4
ffffffff818d0f9b:       eb e3                   jmp    ffffffff818d0f80 <_raw_spin_unlock_irqrestore+0x40>
ffffffff818d0f9d:       0f 1f 00                nopl   (%rax)

Gawd... really Fedora? All this is needed?

Testing with _this_ is not going to show any differences, I need to weed
all the cruft out and test a fast, non-debug .config.

Changed it like this:

+# CONFIG_UPROBES is not set
+# CONFIG_SCHED_OMIT_FRAME_POINTER is not set
+# CONFIG_HYPERVISOR_GUEST is not set
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_KMEMCHECK is not set
+# CONFIG_DEBUG_LOCK_ALLOC is not set
+# CONFIG_PROVE_LOCKING is not set
+# CONFIG_LOCK_STAT is not set
+# CONFIG_PROVE_RCU is not set
+# CONFIG_LATENCYTOP is not set
+# CONFIG_FTRACE is not set
+# CONFIG_BINARY_PRINTF is not set

Looks better (it's with the patch already):

00000000000000f0 <_raw_spin_unlock_irqrestore>:
   f0:   53                      push   %rbx
   f1:   48 89 f3                mov    %rsi,%rbx
   f4:   e8 00 00 00 00          callq  f9 <_raw_spin_unlock_irqrestore+0x9>     f5: R_X86_64_PC32       do_raw_spin_unlock-0x4
   f9:   80 e7 02                and    $0x2,%bh
   fc:   74 01                   je     ff <_raw_spin_unlock_irqrestore+0xf>
   fe:   fb                      sti
   ff:   65 ff 0d 00 00 00 00    decl   %gs:0x0(%rip)        # 106 <_raw_spin_unlock_irqrestore+0x16>    102: R_X86_64_PC32      __preempt_count-0x4
  106:   5b                      pop    %rbx
  107:   c3                      retq

This also raises questions. Such as "why do_raw_spin_unlock() is not inlined here?"

Anyway... on to more kernel builds and testing.
Please take a look at the below patch. If it's obviously buggy, let me know.


diff -urp linux-4.7.1.org/arch/x86/include/asm/irqflags.h linux-4.7.1.obj/arch/x86/include/asm/irqflags.h
--- linux-4.7.1.org/arch/x86/include/asm/irqflags.h	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/include/asm/irqflags.h	2016-08-18 10:01:09.514219644 +0200
@@ -44,6 +44,16 @@ static inline void native_irq_enable(voi
  	asm volatile("sti": : :"memory");
  }
  
+/*
+ * This produces a "test; jz; sti" insn sequence.
+ * It's faster than "push reg; popf". If sti is skipped, it's much faster.
+ */
+static inline void native_cond_irq_enable(unsigned long flags)
+{
+	if (flags & X86_EFLAGS_IF)
+		native_irq_enable();
+}
+
  static inline void native_safe_halt(void)
  {
  	asm volatile("sti; hlt": : :"memory");
@@ -69,7 +79,8 @@ static inline notrace unsigned long arch
  
  static inline notrace void arch_local_irq_restore(unsigned long flags)
  {
-	native_restore_fl(flags);
+	if (flags & X86_EFLAGS_IF)
+		native_irq_enable();
  }
  
  static inline notrace void arch_local_irq_disable(void)
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt.c linux-4.7.1.obj/arch/x86/kernel/paravirt.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt.c	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt.c	2016-08-18 10:01:24.797155109 +0200
@@ -313,7 +313,7 @@ struct pv_time_ops pv_time_ops = {
  
  __visible struct pv_irq_ops pv_irq_ops = {
  	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
-	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+	.restore_fl = __PV_IS_CALLEE_SAVE(native_cond_irq_enable),
  	.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
  	.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
  	.safe_halt = native_safe_halt,
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt_patch_32.c linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_32.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt_patch_32.c	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_32.c	2016-08-18 04:43:19.388102755 +0200
@@ -2,7 +2,7 @@
  
  DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
  DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
+DEF_NATIVE(pv_irq_ops, restore_fl, "testb $((1<<9)>>8),%ah; jz 1f; sti; 1: ;");
  DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
  DEF_NATIVE(pv_cpu_ops, iret, "iret");
  DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt_patch_64.c linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_64.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt_patch_64.c	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_64.c	2016-08-18 04:42:56.364545509 +0200
@@ -4,7 +4,7 @@
  
  DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
  DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
+DEF_NATIVE(pv_irq_ops, restore_fl, "testw $(1<<9),%di; jz 1f; sti; 1: ;");
  DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
  DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
  DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ