// To be unaffected by random cacheline placement, use generous "align": // // i686-gcc -O2 -Wall -falign-loops=32 -falign-jumps=32 -falign-labels=32 -static // x86_64-gcc -O2 -Wall -falign-loops=32 -falign-jumps=32 -falign-labels=32 -static #include #include #include #include #include #include #include #include #if !defined(__i386__) #define get_sysenter_addr() 0 #else #include long sysenter_addr; long get_sysenter_addr(char **envp) { Elf32_auxv_t *auxv; while (*envp++ != NULL) continue; for (auxv = (void *)envp; auxv->a_type != AT_NULL; auxv++) if( auxv->a_type == AT_SYSINFO) return (sysenter_addr = auxv->a_un.a_val); fprintf(stderr, "AT_SYSINFO not supplied, can't test\n"); exit(0); /* this is not a failure */ } void sysenter_getpid(void) { asm volatile( "\n" " mov $20,%eax" // GETPID "\n" " call *sysenter_addr" ); } #endif #if defined(__i386__) #define L_or_Q "l" #define E_or_R "e" #else #define L_or_Q "q" #define E_or_R "r" #endif static int memvar; asm ( "\n" " .text" "\n" "ret__: ret" ); int main(int argc, char **argv, char **envp) { struct timespec start, end; unsigned long long duration; size_t loops, i; const char *mode; if (argc < 2) { printf("Usage: timing_test [MILLIONS_OF_ITERATIONS] MODE\n"); return 1; } mode = argv[2]; if (!mode) { mode = argv[1]; loops = 10*1000; } else { loops = (size_t)atol(argv[1]) * 1000000; } again: if (!strcmp(mode, "nothing")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ("# nothing"); } } else if (!strcmp(mode, "nop")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ("nop"); } } else if (!strcmp(mode, "rdtsc")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { unsigned int a, d; asm volatile ("rdtsc" : "=a" (a), "=d" (d)); } } else if (!strcmp(mode, "lfence_rdtsc")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { unsigned int a, d; asm volatile ("lfence;rdtsc" : "=a" (a), "=d" (d)); } } else if (!strcmp(mode, "lfence_rdtsc_lfence")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { unsigned int a, d; asm volatile (""); asm volatile ("lfence;rdtsc;lfence" : "=a" (a), "=d" (d)); } } else if (!strcmp(mode, "mfence_rdtsc_mfence")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { unsigned int a, d; asm volatile ("mfence;rdtsc;mfence" : "=a" (a), "=d" (d)); } } else if (!strcmp(mode, "rdtscp")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { unsigned int a, c, d; asm volatile ("rdtscp" : "=a" (a), "=c" (c), "=d" (d)); } } else if (!strcmp(mode, "gettimeofday")) { struct timeval tv; clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) gettimeofday(&tv, 0); } else if (!strcmp(mode, "getpid")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) syscall(SYS_getpid); #if defined(__i386__) } else if (!strcmp(mode, "sysenter_getpid")) { get_sysenter_addr(envp); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) sysenter_getpid(); } else if (!strcmp(mode, "iret")) { /* "push cs" is itself a bit expensive, moving it out of loop */ long saved_cs; asm volatile ("mov %%cs,%0" : "=r" (saved_cs)); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " push $0" // flags "\n" " push %0" // cs "\n" " push $1f" // ip "\n" " iret" "\n" "1:" : : "r" (saved_cs) ); } #endif #if defined(__x86_64__) } else if (!strcmp(mode, "iret")) { long saved_cs; long saved_ss; asm volatile ("mov %%cs,%0" : "=r" (saved_cs)); asm volatile ("mov %%ss,%0" : "=r" (saved_ss)); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " mov %%rsp,%%rax" "\n" " push %0" // ss "\n" " push %%rax" // sp "\n" " push $0" // flags "\n" " push %1" // cs "\n" " push $1f" // ip "\n" " iretq" "\n" "1:" : : "r" (saved_ss), "r" (saved_cs) : "ax" ); } #endif } else if (!strcmp(mode, "lret")) { /* "push cs" is itself a bit expensive, moving it out of loop */ long saved_cs; asm volatile ("mov %%cs,%0" : "=r" (saved_cs)); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " push %0" "\n" " push $1f" "\n" " lret"L_or_Q "\n" "1:" : : "r" (saved_cs) ); } } else if (!strcmp(mode, "callret")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ("call ret__"); } } else if (!strcmp(mode, "ret")) { /* This is useful to measure delays due to * return stack branch prediction not working * (we aren't using paired call/rets here, as CPU expects). * I observed "callret" test above being 4 times faster than this: */ clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " push $1f" "\n" " ret" "\n" "1:" ); } } else if (!strcmp(mode, "loadss")) { long saved_ss; asm volatile ("mov %%ss,%0" : "=r" (saved_ss)); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ("mov %0,%%ss" : : "r" (saved_ss)); } } else if (!strcmp(mode, "readss")) { long saved_ss; clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ("mov %%ss,%0" : "=r" (saved_ss)); } } else if (!strcmp(mode, "leave")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " push %"E_or_R"bp" "\n" " mov %"E_or_R"sp,%"E_or_R"bp" "\n" " leave" ); } } else if (!strcmp(mode, "noleave")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " push %"E_or_R"bp" "\n" " mov %"E_or_R"sp,%"E_or_R"bp" "\n" " mov %"E_or_R"bp,%"E_or_R"sp" "\n" " pop %"E_or_R"bp" ); } } else if (!strcmp(mode, "pushf")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " pushf" "\n" " pop %%"E_or_R"ax" : : : "ax" ); } } else if (!strcmp(mode, "popf")) { long flags; asm volatile ( "\n" " pushf" "\n" " pop %0" : "=r" (flags) ); clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " push %0" "\n" " popf" : : "r" (flags) : "ax" ); } } else if (!strcmp(mode, "or")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " orl $1,%0" : : "m" (memvar) ); } } else if (!strcmp(mode, "lock_or")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " lock orl $1,%0" : : "m" (memvar) ); } } else if (!strcmp(mode, "lock_xadd")) { clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) { asm volatile ( "\n" " lock xaddl %0,%1" : : "r" (0), "m" (memvar) ); } } else if (!strcmp(mode, "rdpmc")) { // Unlikely to work. unsigned int eax, edx; unsigned int ecx = 0; clock_gettime(CLOCK_MONOTONIC, &start); for (i = loops; i != 0; i--) asm volatile ("rdpmc" : "=a" (eax), "=d" (edx) : "c" (ecx)); } else { printf("Unknown mode %s\n", mode); return 1; } clock_gettime(CLOCK_MONOTONIC, &end); duration = (1000*1000*1000ULL * end.tv_sec + end.tv_nsec) - (1000*1000*1000ULL * start.tv_sec + start.tv_nsec); printf("%lu loops in %.5fs = %.2f nsec/loop for %s\n", (unsigned long)loops, (double)duration * 1e-9, (double)duration / loops, mode ); if (!argv[2]) { if (duration < 90*1000*1000) { loops *= 10; goto again; } if (duration < 490*1000*1000) { loops *= 2; goto again; } } return 0; }