Date:   Tue, 6 Mar 2018 23:56:53 +0000
From:   Jason Vas Dias <jason.vas.dias@...il.com>
To:     x86@...nel.org, linux-kernel@...r.kernel.org,
        andi <andi@...stfloor.org>, tglx@...utronix.de
Subject: [PATCH v4.15.7 1/1] x86/vdso: handle clock_gettime(CLOCK_MONOTONIC_RAW,
 &ts) in VDSO

Handling clock_gettime(CLOCK_MONOTONIC_RAW, &timespec)
by calling vdso_fallback_gettime(), i.e. with a syscall, is too slow -
latencies of 300-700ns are common on Haswell (06:3C) CPUs.

This patch against the 4.15.7 stable branch makes the VDSO handle
clock_gettime(CLOCK_MONOTONIC_RAW, &ts)
by issuing rdtscp in userspace, if and only if the clock source is the TSC,
and converting it to nanoseconds using the vsyscall_gtod_data 'mult' and
'shift' fields:

      volatile u32 tsc_lo, tsc_hi, tsc_cpu;
      u64 ns;
      asm volatile( "rdtscp" : "=a" (tsc_lo), "=d" (tsc_hi), "=c" (tsc_cpu) );
      u64 tsc = (((u64)tsc_hi)<<32) | ((u64)tsc_lo);
      tsc *= gtod->mult;
      tsc >>= gtod->shift;
      /* tsc is now a number of nanoseconds */
      ts->tv_sec  = __iter_div_u64_rem(tsc, NSEC_PER_SEC, &ns);
      ts->tv_nsec = ns;
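As a purely illustrative example (hypothetical numbers, not values taken
from a real calibration): a 3.0 GHz TSC could be described by shift = 24 and
mult = 5592405, since 5592405 / 2^24 is roughly 1/3 ns per cycle; a reading
of 3,000,000,000 cycles then converts to (3000000000 * 5592405) >> 24, which
is about 999,999,940 ns, i.e. very nearly one second.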

Use of the "open coded asm" style here actually forces the compiler to
always choose the 32-bit version of rdtscp, which sets only %eax,
%edx, and %ecx and does not clear the high bits of %rax, %rdx, and
%rdx , because the
variables are declared 32-bit  - so the same 32-bit version is used whether
the code is compiled with -m32 or -m64 ( tested using gcc 5.4.0, gcc 6.4.1 ) .
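
For reference, here is a minimal stand-alone user-space version of that asm
form (an illustration, not code from the patch; it uses <stdint.h> types in
place of the kernel's u32 and assumes a CPU with RDTSCP):

  /* Build with "gcc -O2 -m64" or "gcc -O2 -m32" and compare the
   * generated code with "objdump -d".
   */
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          volatile uint32_t tsc_lo, tsc_hi, tsc_cpu;

          /* 32-bit outputs: only %eax, %edx and %ecx are needed */
          asm volatile("rdtscp"
                       : "=a" (tsc_lo), "=d" (tsc_hi), "=c" (tsc_cpu));

          printf("TSC: %llu  IA32_TSC_AUX: %u\n",
                 (unsigned long long)((((uint64_t)tsc_hi) << 32) | tsc_lo),
                 tsc_cpu);
          return 0;
  }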

The full story and test programs are in Bug #198961:
    https://bugzilla.kernel.org/show_bug.cgi?id=198961

The patched VDSO now handles clock_gettime(CLOCK_MONOTONIC_RAW, &ts) on the
same machine with a latency (the minimum time that can be measured) of
around 100ns, compared with 300-700ns before the patch.
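
(The actual test programs are in the bug report above; the following is only
a rough sketch of that kind of latency measurement, assuming glibc's
clock_gettime and a kernel that accepts CLOCK_MONOTONIC_RAW:)

  /* Take many back-to-back CLOCK_MONOTONIC_RAW readings and report the
   * smallest non-zero difference, i.e. the minimum interval that can be
   * measured.  Compile with: gcc -O2 lat.c  (add -lrt on older glibc).
   */
  #include <stdio.h>
  #include <time.h>

  int main(void)
  {
          struct timespec a, b;
          long best = -1, d;
          int i;

          for (i = 0; i < 1000000; i++) {
                  clock_gettime(CLOCK_MONOTONIC_RAW, &a);
                  clock_gettime(CLOCK_MONOTONIC_RAW, &b);
                  d = (b.tv_sec - a.tv_sec) * 1000000000L
                      + (b.tv_nsec - a.tv_nsec);
                  if (d > 0 && (best < 0 || d < best))
                          best = d;
          }
          printf("minimum measurable interval: ~%ld ns\n", best);
          return 0;
  }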

I also think it makes sense to expose pointers to the live, updated
gtod->mult and gtod->shift values to userspace somehow. A userspace TSC
reader could then re-use previous values, avoid the long division in most
cases, and obtain latencies of 10-20ns.
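
A sketch of what such a reader could do with those values (my illustration,
not part of the patch; it assumes the 'struct linux_tsc_calibration'
proposed below and a TSC delta small enough that delta * mult does not
overflow 64 bits):

  /* Convert a TSC *delta* to nanoseconds with one multiply and one shift,
   * re-using the kernel's live calibration.
   */
  #include <asm/vdso_tsc_calibration.h>     /* header added by this patch */

  static inline unsigned long long
  tsc_delta_to_ns(const struct linux_tsc_calibration *cal,
                  unsigned long long tsc_delta)
  {
          return (tsc_delta * cal->mult) >> cal->shift;
  }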

Hence there is now a new method in the VDSO:
   __vdso_linux_tsc_calibration()
which returns a pointer to a 'struct linux_tsc_calibration'
declared in a new header:
   arch/x86/include/uapi/asm/vdso_tsc_calibration.h

If the clock source is NOT the TSC, this function returns NULL.
The pointer is only valid while the system clock source is the TSC.
User-space TSC readers can already detect when the TSC is modified with
events, and with this function they can now also detect when the clock
source changes from or to the TSC.
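
A user-space caller could then resolve and use the new symbol roughly as
follows (a sketch only; vdso_init_from_sysinfo_ehdr() and vdso_sym() are the
helpers from tools/testing/selftests/vDSO/parse_vdso.c, compiled and linked
alongside):

  #include <stdio.h>
  #include <stdint.h>
  #include <sys/auxv.h>
  #include <asm/vdso_tsc_calibration.h>     /* header added by this patch */

  /* from tools/testing/selftests/vDSO/parse_vdso.c */
  extern void  vdso_init_from_sysinfo_ehdr(uintptr_t base);
  extern void *vdso_sym(const char *version, const char *name);

  int main(void)
  {
          const struct linux_tsc_calibration *(*tsc_cal)(void);
          const struct linux_tsc_calibration *cal;

          vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR));
          tsc_cal = (const struct linux_tsc_calibration *(*)(void))
                  vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
          if (!tsc_cal) {
                  fprintf(stderr, "VDSO does not export __vdso_linux_tsc_calibration\n");
                  return 1;
          }
          cal = tsc_cal();
          if (!cal) {
                  fprintf(stderr, "current clocksource is not the TSC\n");
                  return 1;
          }
          printf("mult=%u shift=%u\n", cal->mult, cal->shift);
          return 0;
  }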

The patch:

---
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index f19856d..e840600 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -21,6 +21,7 @@
 #include <linux/math64.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <uapi/asm/vdso_tsc_calibration.h>

 #define gtod (&VVAR(vsyscall_gtod_data))

@@ -246,6 +247,29 @@ notrace static int __always_inline do_monotonic(struct timespec *ts)
 	return mode;
 }

+notrace static int __always_inline do_monotonic_raw( struct timespec *ts)
+{
+        volatile u32 tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs generated for 64-bit as for 32-bit builds
+        u64 ns;
+        register u64 tsc=0;
+        if (gtod->vclock_mode == VCLOCK_TSC)
+        {
+                asm volatile
+                ( "rdtscp"
+                : "=a" (tsc_lo)
+                , "=d" (tsc_hi)
+                , "=c" (tsc_cpu)
+                ); // : eax, edx, ecx used - NOT rax, rdx, rcx
+                tsc     = ((((u64)tsc_hi) & 0xffffffffUL) << 32) | (((u64)tsc_lo) & 0xffffffffUL);
+                tsc    *= gtod->mult;
+                tsc   >>= gtod->shift;
+                ts->tv_sec  = __iter_div_u64_rem(tsc, NSEC_PER_SEC, &ns);
+                ts->tv_nsec = ns;
+                return VCLOCK_TSC;
+        }
+        return VCLOCK_NONE;
+}
+
 notrace static void do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
@@ -277,6 +301,10 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 		if (do_monotonic(ts) == VCLOCK_NONE)
 			goto fallback;
 		break;
+	case CLOCK_MONOTONIC_RAW:
+		if (do_monotonic_raw(ts) == VCLOCK_NONE)
+			goto fallback;
+		break;
 	case CLOCK_REALTIME_COARSE:
 		do_realtime_coarse(ts);
 		break;
@@ -326,3 +354,18 @@ notrace time_t __vdso_time(time_t *t)
 }
 time_t time(time_t *t)
 	__attribute__((weak, alias("__vdso_time")));
+
+extern const struct linux_tsc_calibration *
+        __vdso_linux_tsc_calibration(void);
+
+notrace  const struct linux_tsc_calibration *
+  __vdso_linux_tsc_calibration(void)
+{
+        if( gtod->vclock_mode == VCLOCK_TSC )
+                return ((const struct linux_tsc_calibration*) &gtod->mult);
+        return 0UL;
+}
+
+const struct linux_tsc_calibration * linux_tsc_calibration(void)
+        __attribute__((weak, alias("__vdso_linux_tsc_calibration")));
+
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index d3a2dce..41a2ca5 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -24,7 +24,9 @@ VERSION {
 		getcpu;
 		__vdso_getcpu;
 		time;
 		__vdso_time;
+		linux_tsc_calibration;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 422764a..d53bd73 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -25,7 +25,8 @@ VERSION
 	global:
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
 		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	};

 	LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
index 05cd1c5..fb13b16 100644
--- a/arch/x86/entry/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -20,7 +20,8 @@ VERSION {
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
 		__vdso_getcpu;
 		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff --git a/arch/x86/include/uapi/asm/vdso_tsc_calibration.h b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
new file mode 100644
index 0000000..6febb84
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_VDSO_TSC_CALIBRATION_H
+#define _ASM_X86_VDSO_TSC_CALIBRATION_H
+/*
+ * Programs that want to use rdtsc / rdtscp instructions
+ * from user-space can make use of the Linux kernel TSC calibration
+ * by calling :
+ *    __vdso_linux_tsc_calibration(void);
+ * ( one has to resolve this symbol as in
+ *   tools/testing/selftests/vDSO/parse_vdso.c
+ * )
+ * which returns a pointer into the read-only
+ * vvar_page (the vsyscall_gtod_data structure),
+ * with the following layout :
+ */
+
+struct linux_tsc_calibration
+{
+        unsigned int mult;   /* amount to multiply 64-bit TSC value by */
+
+        unsigned int shift;  /* the right shift to apply to (mult*TSC) yielding nanoseconds */
+};
+
+/* To use:
+ *
+ *  static const struct linux_tsc_calibration *
+ *  (*linux_tsc_cal)(void) = vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
+ *  if( linux_tsc_cal == 0UL )
+ *  { fprintf(stderr,"the patch providing __vdso_linux_tsc_calibration is not applied to the kernel.\n");
+ *    return ERROR;
+ *  }
+ *  static const struct linux_tsc_calibration *clock_source = (*linux_tsc_cal)();
+ *  if( clock_source == ((void*)0UL))
+ *    fprintf(stderr,"TSC is not the system clocksource.\n");
+ *  volatile unsigned int tsc_lo, tsc_hi, tsc_cpu;
+ *  asm volatile
+ *  ( "rdtscp" : (=a) tsc_hi,  (=d) tsc_lo, (=c) tsc_cpu );
+ *  unsigned long tsc = (((unsigned long)tsc_hi) << 32) | tsc_lo;
+ *  unsigned long nanoseconds =
+ *   (( clock_source -> mult ) * tsc ) >> (clock_source -> shift);
+ *
+ *  nanoseconds is now TSC value converted to nanoseconds,
+ *  according to Linux' clocksource calibration values.
+ *
+ */
+
+#endif
---

