[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20140809105742.GA5910@pd.tnic>
Date: Sat, 9 Aug 2014 12:57:42 +0200
From: Borislav Petkov <bp@...en8.de>
To: x86-ml <x86@...nel.org>, lkml <linux-kernel@...r.kernel.org>
Cc: Peter Zijlstra <peterz@...radead.org>,
Steven Rostedt <rostedt@...dmis.org>,
Thomas Gleixner <tglx@...utronix.de>
Subject: [RFC PATCH] Flipped jump labels
Hi dudes,
with the current impl. of jump labels, people can't really do the
following:
---
JMP unlikely_code
likely_code
unlikely_code:
unlikely code
---
and after some initialization queries overwrite the JMP with a NOP so
that the likely code gets executed at 0 cost.
The issue is that jump labels unconditionally add NOPs by default
(see arch_static_branch). For example, native_sched_clock() gets the
following code layout here:
--
NOP
unlikely code (which computes time in ns from jiffies)
likely code (which does RDTSC)
--
Yes, unlikely code comes first.
when the jump labels get initialized and all checks done, at runtime we
have this:
0xffffffff8100ce40 <sched_clock>: push %rbp
0xffffffff8100ce41 <sched_clock+1>: mov %rsp,%rbp
0xffffffff8100ce44 <sched_clock+4>: and $0xfffffffffffffff0,%rsp
unconditional JMP!!!
0xffffffff8100ce48 <sched_clock+8>: jmpq 0xffffffff8100ce70 <sched_clock+48>
unlikely code using jiffies
0xffffffff8100ce4d <sched_clock+13>: mov 0x9a71ac(%rip),%r8 # 0xffffffff819b4000 <jiffies_64>
0xffffffff8100ce54 <sched_clock+20>: movabs $0xffc2f745d964b800,%rax
0xffffffff8100ce5e <sched_clock+30>: leaveq
0xffffffff8100ce5f <sched_clock+31>: imul $0x3d0900,%r8,%rdx
0xffffffff8100ce66 <sched_clock+38>: add %rdx,%rax
0xffffffff8100ce69 <sched_clock+41>: retq
0xffffffff8100ce6a <sched_clock+42>: nopw 0x0(%rax,%rax,1)
likely code using RDTSC, see JMP target address.
0xffffffff8100ce70 <sched_clock+48>: rdtsc
so what we end up getting is not really helping because we always get to
pay for that JMP on all modern systems which sport RDTSC even though we
shouldn't really.
And remember this is not something cheap: sched_clock uses the TSC
even if it is unstable so we're always jumping like insane and
unconditionally.
So, long story short, we want this, instead:
0xffffffff8100cf10 <sched_clock>: push %rbp
0xffffffff8100cf11 <sched_clock+1>: mov %rsp,%rbp
0xffffffff8100cf14 <sched_clock+4>: and $0xfffffffffffffff0,%rsp
unconditional JMP is nopped out
0xffffffff8100cf18 <sched_clock+8>: data32 data32 data32 xchg %ax,%ax
likely code which comes first in the function so all the advantages from
it to front end, branch pred, yadda yadda, get to be enjoyed :)
0xffffffff8100cf1d <sched_clock+13>: rdtsc
0xffffffff8100cf1f <sched_clock+15>: mov %eax,%esi
0xffffffff8100cf21 <sched_clock+17>: mov %rdx,%rax
0xffffffff8100cf24 <sched_clock+20>: shl $0x20,%rax
0xffffffff8100cf28 <sched_clock+24>: or %rsi,%rax
0xffffffff8100cf2b <sched_clock+27>: mov %rax,%rcx
0xffffffff8100cf2e <sched_clock+30>: incl %gs:0xb8e0
0xffffffff8100cf36 <sched_clock+38>: mov %gs:0x1d0c30,%rsi
0xffffffff8100cf3f <sched_clock+47>: mov %gs:0x1d0c38,%rax
0xffffffff8100cf48 <sched_clock+56>: cmp %rax,%rsi
0xffffffff8100cf4b <sched_clock+59>: jne 0xffffffff8100cf90 <sched_clock+128>
0xffffffff8100cf4d <sched_clock+61>: mov (%rsi),%eax
0xffffffff8100cf4f <sched_clock+63>: mul %rcx
0xffffffff8100cf52 <sched_clock+66>: shrd $0xa,%rdx,%rax
0xffffffff8100cf57 <sched_clock+71>: add 0x8(%rsi),%rax
0xffffffff8100cf5b <sched_clock+75>: decl %gs:0xb8e0
0xffffffff8100cf63 <sched_clock+83>: je 0xffffffff8100cf88 <sched_clock+120>
0xffffffff8100cf65 <sched_clock+85>: leaveq
0xffffffff8100cf66 <sched_clock+86>: retq
Done, we return here.
0xffffffff8100cf67 <sched_clock+87>: nop
unlikely code which does the jiffies math.
0xffffffff8100cf68 <sched_clock+88>: mov 0x9a7091(%rip),%rax # 0xffffffff819b4000 <jiffies_64>
0xffffffff8100cf6f <sched_clock+95>: leaveq
0xffffffff8100cf70 <sched_clock+96>: imul $0x3d0900,%rax,%rdx
...
So basically what I'm proposing is a jump label type which is
initialized by default to jump to the unlikely code and once
initialization has happened, JMP gets overwritten.
The things to pay attention here is
* this label should be used in places where it is very likely for the
jump to get overwritten. Basically the opposite to tracepoints for which
the jump labels were created initially, to be mostly off.
* It must be used in places where JMP gets overwritten only after some
initialization done first.
Anyway, below is a concept patch for myself to try the idea first - it
seems to work here. Constructive ideas and suggestions are welcome, as
always.
---
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 6a2cefb4395a..2d963c6489a8 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -30,6 +30,22 @@ l_yes:
return true;
}
+static __always_inline bool arch_static_branch_active(struct static_key *key)
+{
+ asm_volatile_goto("1:"
+ "jmp %l[l_yes]\n\t"
+ ".byte " __stringify(GENERIC_NOP3) "\n\t"
+ ".pushsection __jump_table, \"aw\" \n\t"
+ _ASM_ALIGN "\n\t"
+ _ASM_PTR "1b, %l[l_yes], %c0 \n\t"
+ ".popsection \n\t"
+ : : "i" (key) : : l_yes);
+ return false;
+l_yes:
+ return true;
+}
+
+
#endif /* __KERNEL__ */
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 4ca327e900ae..81bc2c4a7eab 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(tsc_khz);
*/
static int __read_mostly tsc_unstable;
-static struct static_key __use_tsc = STATIC_KEY_INIT;
+static struct static_key __use_tsc = STATIC_KEY_INIT_FALSE;
int tsc_clocksource_reliable;
@@ -280,7 +280,7 @@ u64 native_sched_clock(void)
* very important for it to be as fast as the platform
* can achieve it. )
*/
- if (!static_key_false(&__use_tsc)) {
+ if (arch_static_branch_active(&__use_tsc)) {
/* No locking but a rare wrong value is not a big deal: */
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
}
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists