Message-ID: <1ae46402-4cd0-6afc-3dd3-e155ec59f64e@redhat.com>
Date: Thu, 18 Aug 2016 11:21:48 +0200
From: Denys Vlasenko <dvlasenk@...hat.com>
To: Linus Torvalds <torvalds@...ux-foundation.org>
Cc: Andy Lutomirski <luto@...capital.net>,
Sara Sharon <sara.sharon@...el.com>,
Dan Williams <dan.j.williams@...el.com>,
Christian König <christian.koenig@....com>,
Vinod Koul <vinod.koul@...el.com>,
Alex Deucher <alexander.deucher@....com>,
Johannes Berg <johannes.berg@...el.com>,
"Rafael J. Wysocki" <rafael.j.wysocki@...el.com>,
Andy Lutomirski <luto@...nel.org>,
the arch/x86 maintainers <x86@...nel.org>,
Ingo Molnar <mingo@...nel.org>,
LKML <linux-kernel@...r.kernel.org>,
Adrian Hunter <adrian.hunter@...el.com>
Subject: Re: RFC: Petition Intel/AMD to add POPF_IF insn
On 08/17/2016 11:35 PM, Linus Torvalds wrote:
> On Wed, Aug 17, 2016 at 2:26 PM, Linus Torvalds
> <torvalds@...ux-foundation.org> wrote:
>>
>> Of course, if somebody uses native_restore_fl() to actually *disable*
>> interrupts (when they weren't already disabled), then this untested
>> patch will just not work. But why would you do something so stupid?
>> Famous last words...
>
> Looking around, the vsmp code actually uses "native_restore_fl()" to
> not just set the interrupt flag, but AC as well.
>
> And the PV spinlock case has that "push;popf" sequence encoded in an alternate.
>
> So that trivial patch may (or may not - still not tested) work for
> some quick testing, but needs more effort for any *real* use.
I'm going to test the attached patch.
The PV and debug maze is daunting... I discovered that Fedora kernels
are compiled with a set of .config options which results in a
spin_unlock_irqrestore that looks like this:
ffffffff818d0f40 <_raw_spin_unlock_irqrestore>:
ffffffff818d0f40: e8 1b 31 00 00 callq ffffffff818d4060 <__fentry__>
                  ffffffff818d0f41: R_X86_64_PC32 __fentry__-0x4
ffffffff818d0f45: 55 push %rbp
ffffffff818d0f46: 48 89 e5 mov %rsp,%rbp
ffffffff818d0f49: 41 54 push %r12
ffffffff818d0f4b: 53 push %rbx
ffffffff818d0f4c: 48 8b 55 08 mov 0x8(%rbp),%rdx
ffffffff818d0f50: 49 89 fc mov %rdi,%r12
ffffffff818d0f53: 48 89 f3 mov %rsi,%rbx
ffffffff818d0f56: 48 83 c7 18 add $0x18,%rdi
ffffffff818d0f5a: be 01 00 00 00 mov $0x1,%esi
ffffffff818d0f5f: e8 8c b8 83 ff callq ffffffff8110c7f0 <lock_release>
                  ffffffff818d0f60: R_X86_64_PC32 lock_release-0x4
ffffffff818d0f64: 4c 89 e7 mov %r12,%rdi
ffffffff818d0f67: e8 d4 fb 83 ff callq ffffffff81110b40 <do_raw_spin_unlock>
                  ffffffff818d0f68: R_X86_64_PC32 do_raw_spin_unlock-0x4
ffffffff818d0f6c: f6 c7 02 test $0x2,%bh
ffffffff818d0f6f: 74 1b je ffffffff818d0f8c <_raw_spin_unlock_irqrestore+0x4c>
ffffffff818d0f71: e8 aa 9d 83 ff callq ffffffff8110ad20 <trace_hardirqs_on>
                  ffffffff818d0f72: R_X86_64_PC32 trace_hardirqs_on-0x4
ffffffff818d0f76: 48 89 df mov %rbx,%rdi
ffffffff818d0f79: ff 14 25 48 32 e2 81 callq *0xffffffff81e23248
                  ffffffff818d0f7c: R_X86_64_32S pv_irq_ops+0x8
ffffffff818d0f80: 65 ff 0d c9 c4 73 7e decl %gs:0x7e73c4c9(%rip) # d450 <__preempt_count>
                  ffffffff818d0f83: R_X86_64_PC32 __preempt_count-0x4
ffffffff818d0f87: 5b pop %rbx
ffffffff818d0f88: 41 5c pop %r12
ffffffff818d0f8a: 5d pop %rbp
ffffffff818d0f8b: c3 retq
ffffffff818d0f8c: 48 89 df mov %rbx,%rdi
ffffffff818d0f8f: ff 14 25 48 32 e2 81 callq *0xffffffff81e23248
                  ffffffff818d0f92: R_X86_64_32S pv_irq_ops+0x8
ffffffff818d0f96: e8 35 5b 83 ff callq ffffffff81106ad0 <trace_hardirqs_off>
                  ffffffff818d0f97: R_X86_64_PC32 trace_hardirqs_off-0x4
ffffffff818d0f9b: eb e3 jmp ffffffff818d0f80 <_raw_spin_unlock_irqrestore+0x40>
ffffffff818d0f9d: 0f 1f 00 nopl (%rax)
Gawd... really Fedora? All this is needed?
Testing with _this_ is not going to show any differences; I need to weed
all the cruft out and test a fast, non-debug .config.
Changed it like this:
+# CONFIG_UPROBES is not set
+# CONFIG_SCHED_OMIT_FRAME_POINTER is not set
+# CONFIG_HYPERVISOR_GUEST is not set
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_KMEMCHECK is not set
+# CONFIG_DEBUG_LOCK_ALLOC is not set
+# CONFIG_PROVE_LOCKING is not set
+# CONFIG_LOCK_STAT is not set
+# CONFIG_PROVE_RCU is not set
+# CONFIG_LATENCYTOP is not set
+# CONFIG_FTRACE is not set
+# CONFIG_BINARY_PRINTF is not set
Looks better (this is already with the patch applied):
00000000000000f0 <_raw_spin_unlock_irqrestore>:
f0: 53 push %rbx
f1: 48 89 f3 mov %rsi,%rbx
f4: e8 00 00 00 00 callq f9 <_raw_spin_unlock_irqrestore+0x9>
                   f5: R_X86_64_PC32 do_raw_spin_unlock-0x4
f9: 80 e7 02 and $0x2,%bh
fc: 74 01 je ff <_raw_spin_unlock_irqrestore+0xf>
fe: fb sti
ff: 65 ff 0d 00 00 00 00 decl %gs:0x0(%rip) # 106 <_raw_spin_unlock_irqrestore+0x16>
                   102: R_X86_64_PC32 __preempt_count-0x4
106: 5b pop %rbx
107: c3 retq
This also raises questions, such as "why is do_raw_spin_unlock() not inlined here?"
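(Presumably because CONFIG_DEBUG_SPINLOCK is still enabled -- it's not in the
list above -- so do_raw_spin_unlock() is the out-of-line debug version from
kernel/locking/spinlock_debug.c. With that option off, the include/linux/spinlock.h
variant is a trivial inline, roughly:

static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
{
	/* no debug checks, just the arch unlock */
	arch_spin_unlock(&lock->raw_lock);
	__release(lock);
}

and should fold into the caller.)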
Anyway... on to more kernel builds and testing.
Please take a look at the below patch. If it's obviously buggy, let me know.
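For reference, the native_restore_fl() being replaced is the unconditional
push+popf from arch/x86/include/asm/irqflags.h (quoting roughly from memory):

static inline void native_restore_fl(unsigned long flags)
{
	asm volatile("push %0 ; popf"
		     : /* no output */
		     : "g" (flags)
		     : "memory", "cc");
}

The patch leaves it in place and only changes who calls it.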
diff -urp linux-4.7.1.org/arch/x86/include/asm/irqflags.h linux-4.7.1.obj/arch/x86/include/asm/irqflags.h
--- linux-4.7.1.org/arch/x86/include/asm/irqflags.h 2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/include/asm/irqflags.h 2016-08-18 10:01:09.514219644 +0200
@@ -44,6 +44,16 @@ static inline void native_irq_enable(voi
asm volatile("sti": : :"memory");
}
+/*
+ * This produces a "test; jz; sti" insn sequence.
+ * It's faster than "push reg; popf". If sti is skipped, it's much faster.
+ */
+static inline void native_cond_irq_enable(unsigned long flags)
+{
+ if (flags & X86_EFLAGS_IF)
+ native_irq_enable();
+}
+
static inline void native_safe_halt(void)
{
asm volatile("sti; hlt": : :"memory");
@@ -69,7 +79,8 @@ static inline notrace unsigned long arch
static inline notrace void arch_local_irq_restore(unsigned long flags)
{
- native_restore_fl(flags);
+ if (flags & X86_EFLAGS_IF)
+ native_irq_enable();
}
static inline notrace void arch_local_irq_disable(void)
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt.c linux-4.7.1.obj/arch/x86/kernel/paravirt.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt.c 2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt.c 2016-08-18 10:01:24.797155109 +0200
@@ -313,7 +313,7 @@ struct pv_time_ops pv_time_ops = {
__visible struct pv_irq_ops pv_irq_ops = {
.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+ .restore_fl = __PV_IS_CALLEE_SAVE(native_cond_irq_enable),
.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.safe_halt = native_safe_halt,
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt_patch_32.c linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_32.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt_patch_32.c 2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_32.c 2016-08-18 04:43:19.388102755 +0200
@@ -2,7 +2,7 @@
DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
+DEF_NATIVE(pv_irq_ops, restore_fl, "testb $((1<<9)>>8),%ah; jz 1f; sti; 1: ;");
DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
DEF_NATIVE(pv_cpu_ops, iret, "iret");
DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt_patch_64.c linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_64.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt_patch_64.c 2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_64.c 2016-08-18 04:42:56.364545509 +0200
@@ -4,7 +4,7 @@
DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
+DEF_NATIVE(pv_irq_ops, restore_fl, "testw $(1<<9),%di; jz 1f; sti; 1: ;");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
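FWIW, the magic constants in the DEF_NATIVE strings: X86_EFLAGS_IF is bit 9,
i.e. 0x200. The 64-bit variant gets the flags in %rdi, so "testw $(1<<9),%di"
checks bit 9 directly; the 32-bit variant gets them in %eax and tests the high
byte %ah, so the mask is shifted down by 8, hence "$((1<<9)>>8)" == 0x2 (same
trick as the "test $0x2,%bh" in the disassembly above). A throwaway user-space
sanity check of the same logic (purely illustrative, not part of the patch):

#include <assert.h>

#define X86_EFLAGS_IF (1UL << 9)

/* mirrors the patched restore_fl: re-enable only if IF was set */
static int would_sti(unsigned long flags)
{
	return (flags & X86_EFLAGS_IF) != 0;
}

int main(void)
{
	assert(would_sti(0x246));           /* IF set: sti executes */
	assert(!would_sti(0x046));          /* IF clear: sti skipped */
	assert(((0x246 >> 8) & 0x2) != 0);  /* same check via the high byte (%ah/%bh) */
	return 0;
}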