lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <6fc6093ec1acb82860cb941d63b2b9aa288bae4c.1302137785.git.luto@mit.edu>
Date:	Wed,  6 Apr 2011 22:04:02 -0400
From:	Andy Lutomirski <luto@....EDU>
To:	x86@...nel.org
Cc:	Thomas Gleixner <tglx@...utronix.de>, Ingo Molnar <mingo@...e.hu>,
	Andi Kleen <andi@...stfloor.org>, linux-kernel@...r.kernel.org,
	Andy Lutomirski <luto@....edu>
Subject: [RFT/PATCH v2 5/6] x86-64: Move vread_tsc into a new file with sensible options

vread_tsc is short and hot, and it's userspace code so the usual
reasons to keep frame pointers around, enable -pg, and turn off
sibling calls don't apply.

(OK, turning off sibling calls has no effect.  But it might
someday...)

As an added benefit, tsc.c no longer has -pg stripped from its build flags, so it can be profiled now.

Signed-off-by: Andy Lutomirski <luto@....edu>
---
 arch/x86/include/asm/tsc.h     |    4 +++
 arch/x86/kernel/Makefile       |    8 +++--
 arch/x86/kernel/tsc.c          |   53 --------------------------------------
 arch/x86/kernel/vread_tsc_64.c |   55 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+), 56 deletions(-)
 create mode 100644 arch/x86/kernel/vread_tsc_64.c

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 1ca132f..8f2b3c6 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,10 @@ extern int unsynchronized_tsc(void);
 extern int check_tsc_unstable(void);
 extern unsigned long native_calibrate_tsc(void);
 
+#ifdef CONFIG_X86_64
+extern cycles_t vread_tsc(void);
+#endif
+
 /*
  * Boot-time check whether the TSCs are synchronized across
  * all CPUs/cores:
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34244b2..7626fb8 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -8,7 +8,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not profile debug and lowlevel utilities
-CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
 CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 CFLAGS_REMOVE_pvclock.o = -pg
@@ -24,13 +23,16 @@ endif
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o	:= $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o		:= $(nostackp)
-CFLAGS_tsc.o		:= $(nostackp)
+CFLAGS_vread_tsc_64.o	:= $(nostackp)
 CFLAGS_paravirt.o	:= $(nostackp)
 GCOV_PROFILE_vsyscall_64.o	:= n
 GCOV_PROFILE_hpet.o		:= n
 GCOV_PROFILE_tsc.o		:= n
 GCOV_PROFILE_paravirt.o		:= n
 
+# vread_tsc_64 is hot and should be fully optimized:
+CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-omit-frame-pointer -fno-optimize-sibling-calls
+
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o ldt.o dumpstack.o
@@ -39,7 +41,7 @@ obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
+obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o vread_tsc_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 69ff619..5346381 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,59 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
 		ret : clocksource_tsc.cycle_last;
 }
 
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
-	cycle_t ret;
-	u64 zero, last;
-
-	/*
-	 * rdtsc is unordered, and we want it to be ordered like
-	 * a load with respect to other CPUs (and we don't want
-	 * it to execute absurdly early wrt code on this CPU).
-	 * rdtsc_barrier() is a barrier that provides this ordering
-	 * with respect to *earlier* loads.  (Which barrier to use
-	 * depends on the CPU.)
-	 */
-	rdtsc_barrier();
-
-	asm volatile ("rdtsc\n\t"
-		      "shl $0x20,%%rdx\n\t"
-		      "or %%rdx,%%rax\n\t"
-		      "shl $0x20,%%rdx"
-		      : "=a" (ret), "=d" (zero) : : "cc");
-
-	/*
-	 * zero == 0, but as far as the processor is concerned, zero
-	 * depends on the output of rdtsc.  So we can use it as a
-	 * load barrier by loading something that depends on it.
-	 * x86-64 keeps all loads in order wrt each other, so this
-	 * ensures that rdtsc is ordered wrt all later loads.
-	 */
-
-	/*
-	 * This doesn't multiply 'zero' by anything, which generates
-	 * very slightly nicer code than multiplying it by 8.
-	 */
-	last = *( (cycle_t *)
-		  ((char *)&VVAR(vsyscall_gtod_data).clock.cycle_last + zero) );
-
-	if (likely(ret >= last))
-		return ret;
-
-	/*
-	 * GCC likes to generate cmov here, but this branch is extremely
-	 * predictable (it's just a funciton of time and the likely is
-	 * very likely) and there's a data dependence, so force GCC
-	 * to generate a branch instead.  I don't barrier() because
-	 * we don't actually need a barrier, and if this function
-	 * ever gets inlined it will generate worse code.
-	 */
-	asm volatile ("");
-	return last;
-}
-#endif
-
 static void resume_tsc(struct clocksource *cs)
 {
 	clocksource_tsc.cycle_last = 0;
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
new file mode 100644
index 0000000..856330e
--- /dev/null
+++ b/arch/x86/kernel/vread_tsc_64.c
@@ -0,0 +1,55 @@
+/* This code runs in userspace. */
+
+#define DISABLE_BRANCH_PROFILING
+#include <asm/vgtod.h>
+
+notrace cycle_t __vsyscall_fn vread_tsc(void)
+{
+	cycle_t ret;
+	u64 zero, last;
+
+	/*
+	 * rdtsc is unordered, and we want it to be ordered like
+	 * a load with respect to other CPUs (and we don't want
+	 * it to execute absurdly early wrt code on this CPU).
+	 * rdtsc_barrier() is a barrier that provides this ordering
+	 * with respect to *earlier* loads.  (Which barrier to use
+	 * depends on the CPU.)
+	 */
+	rdtsc_barrier();
+
+	asm volatile ("rdtsc\n\t"
+		      "shl $0x20,%%rdx\n\t"
+		      "or %%rdx,%%rax\n\t"
+		      "shl $0x20,%%rdx"
+		      : "=a" (ret), "=d" (zero) : : "cc");
+
+	/*
+	 * zero == 0, but as far as the processor is concerned, zero
+	 * depends on the output of rdtsc.  So we can use it as a
+	 * load barrier by loading something that depends on it.
+	 * x86-64 keeps all loads in order wrt each other, so this
+	 * ensures that rdtsc is ordered wrt all later loads.
+	 */
+
+	/*
+	 * This doesn't multiply 'zero' by anything, which generates
+	 * very slightly nicer code than multiplying it by 8.
+	 */
+	last = *( (cycle_t *)
+		  ((char *)&VVAR(vsyscall_gtod_data).clock.cycle_last + zero) );
+
+	if (likely(ret >= last))
+		return ret;
+
+	/*
+	 * GCC likes to generate cmov here, but this branch is extremely
+	 * predictable (it's just a function of time and the likely is
+	 * very likely) and there's a data dependence, so force GCC
+	 * to generate a branch instead.  I don't barrier() because
+	 * we don't actually need a barrier, and if this function
+	 * ever gets inlined it will generate worse code.
+	 */
+	asm volatile ("");
+	return last;
+}
-- 
1.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ