/*
 * Throughput micro-benchmark comparing two hand-written x86-64 memcpy
 * variants:
 *
 *   memcpy_orig - 64-bytes-per-iteration unrolled loop plus scalar tails.
 *   memcpy_new  - the same code for short copies, but "rep movsq" /
 *                 "rep movsb" once the length reaches 0x400 bytes.
 *
 * Each variant is timed with rdtsc over MAXSAMPLESTPT copies and the mean
 * tick count per copy is printed.
 *
 * Build requirements: x86-64 target, GNU-style inline asm, and support for
 * __attribute__((naked)) (GCC >= 8 or Clang).
 */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long int hp_timing_t;

#define MAXSAMPLESTPT 100000      /* timed copies per measurement */
#define MAXCOPYSIZE (1024 * 32)   /* size of each test buffer, in bytes */
#define BUF_ALIGNMENT 4096        /* alignment requested for the test buffers */
#define ORIG 0
#define NEW 1

/* aligned_alloc() requires the size to be a multiple of the alignment. */
_Static_assert(MAXCOPYSIZE % BUF_ALIGNMENT == 0,
               "MAXCOPYSIZE must be a multiple of BUF_ALIGNMENT");

static char *buf1 = NULL;
static char *buf2 = NULL;

/* Ticks added to every start timestamp; never assigned here, so it is 0. */
hp_timing_t _dl_hp_timing_overhead;

/* Store the current time-stamp counter (rdtsc, edx:eax) in Var. */
#define HP_TIMING_NOW(Var) \
    do { \
        unsigned long long _hi, _lo; \
        __asm__ __volatile__ ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
        (Var) = _hi << 32 | _lo; \
    } while (0)

#define HP_TIMING_DIFF(Diff, Start, End) (Diff) = ((End) - (Start))

/* Add (end - (start + overhead)) to total_time. */
#define HP_TIMING_TOTAL(total_time, start, end) \
    do { \
        hp_timing_t tmptime; \
        HP_TIMING_DIFF (tmptime, (start) + _dl_hp_timing_overhead, (end)); \
        (total_time) += tmptime; \
    } while (0)

void memcpy_orig(char *dst, char *src, int len);
void memcpy_new(char *dst, char *src, int len);

/* Copy routine currently under test; set by do_tpt_test(). */
void (*do_memcpy)(char *dst, char *src, int len);

/*
 * Execute cpuid (leaf 0) as a serializing instruction so that earlier
 * instructions have retired before timing starts.
 */
static void serialize_cpu(void)
{
    unsigned int eax = 0;

    __asm__ __volatile__ ("cpuid"
                          : "+a" (eax)
                          :
                          : "ebx", "ecx", "edx", "memory");
    (void) eax;
}

/*
 * Time MAXSAMPLESTPT calls of do_memcpy(dst, src, len) and print the mean
 * number of time-stamp-counter ticks per call, preceded by a tab.
 *
 * The copy uses the dst/src pointers supplied by the caller so that the
 * requested alignment is what actually gets measured.
 */
static void do_one_throughput(char *dst, char *src, size_t len)
{
    hp_timing_t start;
    hp_timing_t stop;
    hp_timing_t total_time = 0;
    size_t i;

    serialize_cpu();

    for (i = 0; i < MAXSAMPLESTPT; ++i) {
        HP_TIMING_NOW (start);
        do_memcpy(dst, src, (int) len);
        HP_TIMING_NOW (stop);
        HP_TIMING_TOTAL (total_time, start, stop);
    }

    printf("\t%zu", (size_t) (total_time / MAXSAMPLESTPT));
}

/*
 * Print one result row: time a copy of len bytes from buf1 + align1 to
 * buf2 + align2, first with memcpy_orig and then with memcpy_new.
 *
 * The caller must keep align + len within MAXCOPYSIZE for both buffers.
 */
static void do_tpt_test(size_t align1, size_t align2, size_t len)
{
    char *s1 = buf1 + align1;
    char *s2 = buf2 + align2;

    printf("TPT: Len %4zu, alignment %2zu/%2zu:", len, align1, align2);

    do_memcpy = memcpy_orig;
    do_one_throughput(s2, s1, len);

    do_memcpy = memcpy_new;
    do_one_throughput(s2, s1, len);

    putchar('\n');
}

/*
 * Allocate the two MAXCOPYSIZE-byte test buffers and write one byte in
 * every 64-byte stride of each. Exits the program if allocation fails.
 */
static void test_init(void)
{
    int i;

    buf1 = aligned_alloc(BUF_ALIGNMENT, MAXCOPYSIZE);
    buf2 = aligned_alloc(BUF_ALIGNMENT, MAXCOPYSIZE);
    if (buf1 == NULL || buf2 == NULL) {
        fputs("memcpy benchmark: buffer allocation failed\n", stderr);
        free(buf1);
        free(buf2);
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < MAXCOPYSIZE; i = i + 64) {
        buf1[i] = buf2[i] = (char) (i & 0xff);
    }
}

/*
 * Shared assembly fragments. Register contract (System V AMD64 calling
 * convention): %rdi = dst, %rsi = src, %edx = len. Inside basic asm a
 * single '%' is a literal register prefix.
 */

/*
 * Label 1: copy 64 bytes per iteration; %ecx holds the iteration count and
 * must be non-zero on entry. The closing jnz tests the flags left by decl:
 * movq and leaq do not modify flags.
 */
#define MEMCPY_ASM_LOOP64 \
    ".p2align 4\n" \
    "1:\n\t" \
    "decl %ecx\n\t" \
    "movq 0*8(%rsi), %r11\n\t" \
    "movq 1*8(%rsi), %r8\n\t" \
    "movq %r11, 0*8(%rdi)\n\t" \
    "movq %r8, 1*8(%rdi)\n\t" \
    "movq 2*8(%rsi), %r9\n\t" \
    "movq 3*8(%rsi), %r10\n\t" \
    "movq %r9, 2*8(%rdi)\n\t" \
    "movq %r10, 3*8(%rdi)\n\t" \
    "movq 4*8(%rsi), %r11\n\t" \
    "movq 5*8(%rsi), %r8\n\t" \
    "movq %r11, 4*8(%rdi)\n\t" \
    "movq %r8, 5*8(%rdi)\n\t" \
    "movq 6*8(%rsi), %r9\n\t" \
    "movq 7*8(%rsi), %r10\n\t" \
    "movq %r9, 6*8(%rdi)\n\t" \
    "movq %r10, 7*8(%rdi)\n\t" \
    "leaq 64(%rsi), %rsi\n\t" \
    "leaq 64(%rdi), %rdi\n\t" \
    "jnz 1b\n\t"

/*
 * Labels 2-7: copy the remaining (len & 63) bytes, first as
 * (len & 63) >> 3 quadwords, then as (len & 7) single bytes, and return.
 */
#define MEMCPY_ASM_TAIL \
    "2:\n\t" \
    "movl %edx, %ecx\n\t" \
    "andl $63, %ecx\n\t" \
    "shrl $3, %ecx\n\t" \
    "jz 5f\n\t" \
    "3:\n\t" \
    "decl %ecx\n\t" \
    "movq (%rsi), %r8\n\t" \
    "movq %r8, (%rdi)\n\t" \
    "leaq 8(%rdi), %rdi\n\t" \
    "leaq 8(%rsi), %rsi\n\t" \
    "jnz 3b\n\t" \
    "5:\n\t" \
    "movl %edx, %ecx\n\t" \
    "andl $7, %ecx\n\t" \
    "jz 7f\n\t" \
    "6:\n\t" \
    "movb (%rsi), %r8b\n\t" \
    "movb %r8b, (%rdi)\n\t" \
    "incq %rdi\n\t" \
    "incq %rsi\n\t" \
    "decl %ecx\n\t" \
    "jnz 6b\n\t" \
    "7:\n\t" \
    "ret\n\t"

/*
 * Copy len bytes from src to dst (regions must not overlap).
 *
 * Lengths below 0x400 use the unrolled 64-byte loop and scalar tails;
 * lengths of 0x400 or more use "rep movsq" for the quadwords followed by
 * "rep movsb" for the last (len & 7) bytes.
 *
 * The function is naked: the compiler emits no prologue or epilogue, so the
 * arguments are still in their argument registers at the first instruction
 * and every path must end in an explicit ret. %rax is loaded with dst even
 * though the C prototype returns void.
 */
__attribute__((naked))
void memcpy_new(char *dst, char *src, int len)
{
    __asm__ (
        "movq %rdi, %rax\n\t"
        "movl %edx, %ecx\n\t"
        "shrl $6, %ecx\n\t"
        "jz 2f\n\t"
        /* len is a 32-bit int, so compare %edx rather than %rdx. */
        "cmpl $0x400, %edx\n\t"
        "jae 8f\n\t"
        MEMCPY_ASM_LOOP64
        MEMCPY_ASM_TAIL
        "8:\n\t"
        "movl %edx, %ecx\n\t"
        "shrl $3, %ecx\n\t"
        "andl $7, %edx\n\t"
        "rep movsq\n\t"
        /* ZF still reflects the andl above: rep movsq leaves flags alone. */
        "jz 9f\n\t"
        "movl %edx, %ecx\n\t"
        "rep movsb\n\t"
        "9:\n\t"
        "ret\n\t"
    );
}

/*
 * Copy len bytes from src to dst (regions must not overlap) using only the
 * unrolled 64-byte loop and the scalar tails.
 *
 * Naked for the same reason as memcpy_new; %rax is loaded with dst.
 *
 * NOTE(review): the previous version executed a dead "mov $0x80, %r8d"
 * before the loop, commented as aligning loop 1. Loop alignment is now done
 * with .p2align inside MEMCPY_ASM_LOOP64 for both variants - confirm this
 * matches the intended measurement setup.
 */
__attribute__((naked))
void memcpy_orig(char *dst, char *src, int len)
{
    __asm__ (
        "movq %rdi, %rax\n\t"
        "movl %edx, %ecx\n\t"
        "shrl $6, %ecx\n\t"
        "jz 2f\n\t"
        MEMCPY_ASM_LOOP64
        MEMCPY_ASM_TAIL
    );
}

/*
 * Print a header row, then one result row for every copy length from 1 KiB
 * to 15 KiB in 1 KiB steps, with source offset 8 and destination offset 0.
 */
int main(void)
{
    int len;

    test_init();

    printf("%23s", "");
    printf("\t%s\t%s\n", "memcpy_orig", "memcpy_new");

    for (len = 1024; len < 1024 * 16; len = len + 1024) {
        do_tpt_test(8, 0, (size_t) len);
    }

    free(buf1);
    buf1 = NULL;
    free(buf2);
    buf2 = NULL;

    return 0;
}