lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1324324293-10390-3-git-send-email-asharma@fb.com>
Date:	Mon, 19 Dec 2011 11:51:33 -0800
From:	Arun Sharma <asharma@...com>
To:	linux-kernel@...r.kernel.org
Cc:	Kumar Sundararajan <kumar@...com>, Arun Sharma <asharma@...com>,
	Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...e.hu>,
	Thomas Gleixner <tglx@...utronix.de>,
	john stultz <johnstul@...ibm.com>,
	Andy Lutomirski <luto@....EDU>,
	Eric Dumazet <eric.dumazet@...il.com>
Subject: [PATCH 2/2] Add a thread cpu time implementation to vDSO

From: Kumar Sundararajan <kumar@...com>

This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..)
We use the following method to compute the thread cpu time:

t0 = process start
t1 = most recent context switch time
t2 = time at which the vsyscall is invoked

thread_cpu_time = sum(time slices between t0 to t1) + (t2 - t1)
		= current->se.sum_exec_runtime + now - sched_clock()

At context switch time, we stash away

adj_sched_time = sum_exec_runtime - sched_clock()

in a per-cpu struct in the VVAR page (which has now been extended
to two pages) and then compute

thread_cpu_time = adj_sched_time + now

All computations are done in nanoseconds on systems where the TSC is stable.
If the TSC is unstable, we fall back to a regular syscall.

Benchmark data:

Baseline:

        for (i = 0; i < 100000000; i++) {
                clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
                sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
        }

vclock_gettime:

        vclock_gettime = dlsym(vdso, "__vdso_clock_gettime");
        for (i = 0; i < 100000000; i++) {
                (*vclock_gettime)(CLOCK_THREAD_CPUTIME_ID, &ts);
                sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
        }

Baseline: 		19.34 secs
vclock_gettime: 	4.74 secs

This should speed up profilers that query thread CPU time frequently
to produce fine-grained timestamps.

No statistically significant regression was detected on x86_64
context switch code. Most archs that don't support vsyscalls
will have this code disabled via jump labels.

Signed-off-by: Kumar Sundararajan <kumar@...com>
Signed-off-by: Arun Sharma <asharma@...com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Ingo Molnar <mingo@...e.hu>
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: john stultz <johnstul@...ibm.com>
Cc: Andy Lutomirski <luto@....EDU>
Cc: Eric Dumazet <eric.dumazet@...il.com>
Cc: linux-kernel@...r.kernel.org
---
 arch/x86/include/asm/timer.h   |   18 +++++++++++++-----
 arch/x86/include/asm/vvar.h    |    1 +
 arch/x86/kernel/tsc.c          |   16 ++++++++++++++++
 arch/x86/kernel/vsyscall_64.c  |    1 +
 arch/x86/vdso/vclock_gettime.c |   36 ++++++++++++++++++++++++++++++++++--
 arch/x86/vdso/vma.c            |    5 +++++
 include/linux/jiffies.h        |   16 ++++++++++++++++
 kernel/sched.c                 |    8 ++++++++
 8 files changed, 94 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 431793e..99a3670 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -55,19 +55,27 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
 
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
-static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+static inline unsigned long long ___cycles_2_ns(unsigned long long cyc,
+						unsigned long long scale,
+						unsigned long long offset)
 {
 	unsigned long long quot;
 	unsigned long long rem;
-	int cpu = smp_processor_id();
-	unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
+	unsigned long long ns = offset;
 	quot = (cyc >> CYC2NS_SCALE_FACTOR);
 	rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
-	ns += quot * per_cpu(cyc2ns, cpu) +
-		((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
+	ns += quot * scale + ((rem * scale) >> CYC2NS_SCALE_FACTOR);
 	return ns;
 }
 
+static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+{
+	int cpu = smp_processor_id();
+	unsigned long long offset = per_cpu(cyc2ns_offset, cpu);
+	unsigned long long scale = per_cpu(cyc2ns, cpu);
+	return ___cycles_2_ns(cyc, scale, offset);
+}
+
 static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
 	unsigned long long ns;
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 0fd7a4a..6710e2a 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -47,5 +47,6 @@
 DECLARE_VVAR(0, volatile unsigned long, jiffies)
 DECLARE_VVAR(16, int, vgetcpu_mode)
 DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+DECLARE_VVAR(256, struct vcpu_data, vcpu_data)
 
 #undef DECLARE_VVAR
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db48336..8e9f52c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -623,6 +623,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	if (cpu_khz) {
 		*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
 		*offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
+		vcpu_data.vpercpu[cpu].cyc2ns = *scale;
+		vcpu_data.vpercpu[cpu].cyc2ns_offset = *offset;
 	}
 
 	sched_clock_idle_wakeup_event(0);
@@ -786,6 +788,8 @@ void mark_tsc_unstable(char *reason)
 		tsc_unstable = 1;
 		sched_clock_stable = 0;
 		disable_sched_clock_irqtime();
+		vcpu_data.thread_cputime_disabled = 1;
+		jump_label_dec(&vcpu_data_enabled);
 		printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
 		/* Change only the rating, when not registered */
 		if (clocksource_tsc.mult)
@@ -943,6 +947,15 @@ static int __init init_tsc_clocksource(void)
  */
 device_initcall(init_tsc_clocksource);
 
+/* Should be optimized away at compile time */
+static noinline int check_vvar_overflow(void)
+{
+	long vcpu_data_offset =  (long) vvaraddr_vcpu_data - VVAR_ADDRESS;
+	size_t size = (vcpu_data_offset + sizeof(vcpu_data)
+			+ sizeof(struct vpercpu_data) * CONFIG_NR_CPUS);
+	return (size > VVAR_NUM_PAGES * PAGE_SIZE);
+}
+
 void __init tsc_init(void)
 {
 	u64 lpj;
@@ -979,6 +992,9 @@ void __init tsc_init(void)
 
 	/* now allow native_sched_clock() to use rdtsc */
 	tsc_disabled = 0;
+	vcpu_data.tsc_khz = tsc_khz;
+	vcpu_data.thread_cputime_disabled = check_vvar_overflow()
+		|| !cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP);
 
 	if (!no_sched_irq_time)
 		enable_sched_clock_irqtime();
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 7960d3a..cdfcedf 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -56,6 +56,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
 {
 	.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
 };
+DEFINE_VVAR(struct vcpu_data, vcpu_data);
 
 static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
 
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6bc0e72..14d466a 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -18,6 +18,7 @@
 #include <asm/vsyscall.h>
 #include <asm/fixmap.h>
 #include <asm/vgtod.h>
+#include <asm/timer.h>
 #include <asm/timex.h>
 #include <asm/hpet.h>
 #include <asm/unistd.h>
@@ -154,8 +155,32 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
 	return 0;
 }
 
+notrace static noinline unsigned long do_thread_cpu_time(void)
+{
+	unsigned int p;
+	u_int64_t tscval;
+	unsigned long long adj_sched_time, scale, offset;
+	const struct vcpu_data *vp = &VVAR(vcpu_data);
+	int cpu;
+
+	do {
+		native_read_tscp(&p);
+		cpu = p & 0xfff;
+		adj_sched_time = vp->vpercpu[cpu].adj_sched_time;
+		scale = vp->vpercpu[cpu].cyc2ns;
+		offset = vp->vpercpu[cpu].cyc2ns_offset;
+		rdtscpll(tscval, p);
+		cpu = p & 0xfff;
+	} while (unlikely(adj_sched_time != vp->vpercpu[cpu].adj_sched_time));
+
+	return  ___cycles_2_ns(tscval, scale, offset) + adj_sched_time;
+}
+
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
+	unsigned long ns;
+	const struct vcpu_data *vp = &VVAR(vcpu_data);
+
 	switch (clock) {
 	case CLOCK_REALTIME:
 		if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
@@ -169,6 +194,13 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 		return do_realtime_coarse(ts);
 	case CLOCK_MONOTONIC_COARSE:
 		return do_monotonic_coarse(ts);
+	case CLOCK_THREAD_CPUTIME_ID:
+		if (vp->thread_cputime_disabled)
+			break;
+		ns = do_thread_cpu_time();
+		ts->tv_sec = 0;
+		timespec_add_ns(ts, ns);
+		return 0;
 	}
 
 	return vdso_fallback_gettime(clock, ts);
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 153407c..8b7630e 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -10,6 +10,7 @@
 #include <linux/init.h>
 #include <linux/random.h>
 #include <linux/elf.h>
+#include <linux/jump_label.h>
 #include <asm/vsyscall.h>
 #include <asm/vgtod.h>
 #include <asm/proto.h>
@@ -24,6 +25,8 @@ extern unsigned short vdso_sync_cpuid;
 extern struct page *vdso_pages[];
 static unsigned vdso_size;
 
+struct jump_label_key vcpu_data_enabled;
+
 static void __init patch_vdso(void *vdso, size_t len)
 {
 	Elf64_Ehdr *hdr = vdso;
@@ -66,6 +69,8 @@ static int __init init_vdso(void)
 	for (i = 0; i < npages; i++)
 		vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE);
 
+	if (!vcpu_data.thread_cputime_disabled)
+		jump_label_inc(&vcpu_data_enabled);
 	return 0;
 }
 subsys_initcall(init_vdso);
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 265e2c3..d44e6aa 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -312,4 +312,20 @@ extern unsigned long nsecs_to_jiffies(u64 n);
 
 #define TIMESTAMP_SIZE	30
 
+struct vpercpu_data {
+	unsigned long long adj_sched_time;
+	unsigned long long cyc2ns_offset;
+	unsigned long cyc2ns;
+} ____cacheline_aligned;
+
+struct vcpu_data {
+	unsigned int tsc_khz;
+	unsigned int thread_cputime_disabled;
+	struct vpercpu_data vpercpu[0];
+};
+extern struct vcpu_data vcpu_data;
+
+struct jump_label_key;
+extern struct jump_label_key vcpu_data_enabled;
+
 #endif
diff --git a/kernel/sched.c b/kernel/sched.c
index d6b149c..b738dd6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3216,6 +3216,14 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 		kprobe_flush_task(prev);
 		put_task_struct(prev);
 	}
+
+#if CONFIG_NR_CPUS <= 64
+	if (static_branch(&vcpu_data_enabled)) {
+		int cpu = smp_processor_id();
+		vcpu_data.vpercpu[cpu].adj_sched_time =
+                  current->se.sum_exec_runtime - sched_clock();
+	}
+#endif
 }
 
 #ifdef CONFIG_SMP
-- 
1.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ