/*
 * Micro-benchmark comparing three x86-64 memcpy implementations written as
 * inline assembly.  For each length/alignment combination the best (lowest)
 * rdtsc cycle count over repeat_one_test runs is printed per implementation.
 *
 * The assembly routines contain no operands: they read their arguments
 * directly from %rdi, %rsi and %rdx, so they only work when the compiler
 * passes (dst, src, len) in those registers and leaves them untouched until
 * the first asm statement executes.
 */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long int hp_timing_t;

#define MAXSAMPLESTPT 1000
#define MAXCOPYSIZE (1024 * 1024 * 100)
#define ORIG 0
#define NEW 1

/* Alignment requested for the benchmark buffers (x86-64 base page size). */
#define BUF_ALIGNMENT 4096

/* aligned_alloc() in C11 requires the size to be a multiple of the alignment. */
_Static_assert(MAXCOPYSIZE % BUF_ALIGNMENT == 0,
               "MAXCOPYSIZE must be a multiple of BUF_ALIGNMENT");

static char *buf1 = NULL;
static char *buf2 = NULL;

/* Number of timed runs per measurement; the best run is reported. */
static int repeat_one_test = 32;

/* Subtracted from every measurement; never assigned here, so it stays 0. */
hp_timing_t _dl_hp_timing_overhead;

/* Read the time-stamp counter into Var (EDX:EAX combined into 64 bits). */
#define HP_TIMING_NOW(Var) \
    ({ unsigned long long _hi, _lo; \
       __asm__ __volatile__ ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
       (Var) = _hi << 32 | _lo; })

#define HP_TIMING_DIFF(Diff, Start, End) (Diff) = ((End) - (Start))

/* Add the overhead-corrected interval [start, end] to total_time. */
#define HP_TIMING_TOTAL(total_time, start, end) \
    do \
    { \
        hp_timing_t tmptime; \
        HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
        total_time += tmptime; \
    } \
    while (0)

/* Lower best_time to the overhead-corrected interval if that is smaller. */
#define HP_TIMING_BEST(best_time, start, end) \
    do \
    { \
        hp_timing_t tmptime; \
        HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
        if (best_time > tmptime) \
            best_time = tmptime; \
    } \
    while (0)

void memcpy_new(char *dst, char *src, int len);
void memcpy_c(char *dst, char *src, int len);
void memcpy_c_e(char *dst, char *src, int len);

/* Implementation currently under test; set by do_test(). */
void (*do_memcpy)(char *dst, char *src, int len);

/*
 * Time do_memcpy(dst, src, len) repeat_one_test times and print the smallest
 * cycle count, preceded by a tab.
 */
static void do_one_test(char *dst, char *src, size_t len)
{
    hp_timing_t start;
    hp_timing_t stop;
    hp_timing_t best_time = ~(hp_timing_t) 0;

    for (int run = 0; run < repeat_one_test; ++run) {
        HP_TIMING_NOW(start);
        do_memcpy(dst, src, (int) len);
        HP_TIMING_NOW(stop);
        HP_TIMING_BEST(best_time, start, stop);
    }

    printf("\t%zu", (size_t) best_time);
}

/*
 * Print one result row: copy len bytes from buf1 + align1 to buf2 + align2
 * with each of the three implementations in turn.
 */
static void do_test(size_t align1, size_t align2, size_t len)
{
    char *s1 = buf1 + align1;
    char *s2 = buf2 + align2;

    printf("TPT: Len %4zu, alignment %2zu/%2zu:", len, align1, align2);

    do_memcpy = memcpy_new;
    do_one_test(s2, s1, len);
    do_memcpy = memcpy_c;
    do_one_test(s2, s1, len);
    do_memcpy = memcpy_c_e;
    do_one_test(s2, s1, len);

    putchar('\n');
}

/*
 * Allocate both MAXCOPYSIZE-byte buffers and write one byte every 64 bytes
 * in each of them.  Exits the process if either allocation fails.
 */
static void test_init(void)
{
    buf1 = aligned_alloc(BUF_ALIGNMENT, MAXCOPYSIZE);
    buf2 = aligned_alloc(BUF_ALIGNMENT, MAXCOPYSIZE);
    if (buf1 == NULL || buf2 == NULL) {
        perror("aligned_alloc");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < MAXCOPYSIZE; i = i + 64) {
        buf1[i] = buf2[i] = (char) (i & 0xff);
    }
}

/*
 * Not called anywhere in this file.  Stores %rax to (%rdi) %rdx/8 times with
 * "rep stosq".
 * NOTE(review): %rax is never loaded here, so the stored value is whatever
 * the register happens to hold -- confirm intent before using.
 */
void memset_c(char *dst, char *src, int len)
{
    __asm__("mov %rdx, %rcx");
    __asm__("shr $3, %rcx");
    __asm__("rep stosq");
}

/*
 * Not called anywhere in this file.  Loops storing %xmm0 to the 128 bytes at
 * (%rdi), subtracting 128 from %rdx each pass until the subtraction borrows.
 * NOTE(review): %xmm0 is never loaded and %rdi is never advanced, so every
 * pass rewrites the same 128 bytes -- looks unfinished; confirm before use.
 */
void memset_2(char *dst, char *src, int len)
{
    __asm__("sub $128, %rdx");
    __asm__("1:");
    __asm__("sub $128, %rdx");
    __asm__("movdqa %xmm0, (%rdi)");
    __asm__("movdqa %xmm0, 16(%rdi)");
    __asm__("movdqa %xmm0, 32(%rdi)");
    __asm__("movdqa %xmm0, 48(%rdi)");
    __asm__("movdqa %xmm0, 64(%rdi)");
    __asm__("movdqa %xmm0, 80(%rdi)");
    __asm__("movdqa %xmm0, 96(%rdi)");
    __asm__("movdqa %xmm0, 112(%rdi)");
    __asm__("jae 1b");
}

/*
 * Copy with string instructions: len / 8 quadwords via "rep movsq", then the
 * remaining len % 8 bytes via "rep movsb".
 */
void memcpy_c(char *dst, char *src, int len)
{
    __asm__("mov %rdi, %rax");
    __asm__("movl %edx, %ecx");
    __asm__("shrl $3, %ecx");      /* quadword count */
    __asm__("andl $7, %edx");      /* leftover byte count */
    __asm__("rep movsq");
    __asm__("movl %edx, %ecx");
    __asm__("rep movsb");
    __asm__("1:");
}

/* Copy all len bytes with a single "rep movsb". */
void memcpy_c_e(char *dst, char *src, int len)
{
    __asm__("movq %rdi, %rax");
    __asm__("movq %rdx, %rcx");
    __asm__("rep movsb");
}

/*
 * Copy in 32-byte blocks through %r8-%r11, forward or backward, then finish
 * the last 0-31 bytes with (possibly overlapping) head and tail moves.
 *
 * The labels are plain assembler labels, so this function must be emitted
 * exactly once in the object file.
 */
void memcpy_new(char *dst, char *src, int len)
{
    __asm__("movq %rdi, %rax");
    __asm__("cmpq $0x20, %rdx");
    __asm__("jb .Lhandle_tail");

    /*
     * We check whether memory false dependence could occur,
     * then jump to corresponding copy mode.
     * (Compares only the low bytes of the two pointers, as signed values.)
     */
    __asm__("cmp %dil, %sil");
    __asm__("jl .Lcopy_backward");

    /*
     * Pre-bias the count by 32; the loop's own subq then borrows once fewer
     * than 32 bytes remain.  The mov/lea instructions in between leave the
     * flags alone, so the jae at the bottom still sees that subq's carry.
     */
    __asm__("subq $0x20, %rdx");
    __asm__(".Lcopy_forward_loop:");
    __asm__("subq $0x20, %rdx");

    /*
     * Move in blocks of 4x8 bytes:
     */
    __asm__("movq 0*8(%rsi), %r8");
    __asm__("movq 1*8(%rsi), %r9");
    __asm__("movq 2*8(%rsi), %r10");
    __asm__("movq 3*8(%rsi), %r11");
    __asm__("leaq 4*8(%rsi), %rsi");

    __asm__("movq %r8, 0*8(%rdi)");
    __asm__("movq %r9, 1*8(%rdi)");
    __asm__("movq %r10, 2*8(%rdi)");
    __asm__("movq %r11, 3*8(%rdi)");
    __asm__("leaq 4*8(%rdi), %rdi");
    __asm__("jae .Lcopy_forward_loop");
    __asm__("addl $0x20, %edx");   /* undo the bias: %edx = bytes left */
    __asm__("jmp .Lhandle_tail");

    __asm__(".Lcopy_backward:");
    /*
     * Calculate copy position to tail.
     */
    __asm__("addq %rdx, %rsi");
    __asm__("addq %rdx, %rdi");
    __asm__("subq $0x20, %rdx");

    /*
     * At most 3 ALU operations in one cycle,
     * so pad with NOPs within the same 16-byte chunk.
     */
    __asm__(".p2align 4");
    __asm__(".Lcopy_backward_loop:");
    __asm__("subq $0x20, %rdx");
    __asm__("movq -1*8(%rsi), %r8");
    __asm__("movq -2*8(%rsi), %r9");
    __asm__("movq -3*8(%rsi), %r10");
    __asm__("movq -4*8(%rsi), %r11");
    __asm__("leaq -4*8(%rsi), %rsi");
    __asm__("movq %r8, -1*8(%rdi)");
    __asm__("movq %r9, -2*8(%rdi)");
    __asm__("movq %r10, -3*8(%rdi)");
    __asm__("movq %r11, -4*8(%rdi)");
    __asm__("leaq -4*8(%rdi), %rdi");
    __asm__("jae .Lcopy_backward_loop");

    /*
     * Calculate copy position to head.
     */
    __asm__("addl $0x20, %edx");
    __asm__("subq %rdx, %rsi");
    __asm__("subq %rdx, %rdi");

    __asm__(".Lhandle_tail:");
    __asm__("cmpl $16, %edx");
    __asm__("jb .Lless_16bytes");

    /*
     * Move data from 16 bytes to 31 bytes:
     * the first 16 and the last 16 bytes, which may overlap.
     */
    __asm__("movq 0*8(%rsi), %r8");
    __asm__("movq 1*8(%rsi), %r9");
    __asm__("movq -2*8(%rsi, %rdx), %r10");
    __asm__("movq -1*8(%rsi, %rdx), %r11");
    __asm__("movq %r8, 0*8(%rdi)");
    __asm__("movq %r9, 1*8(%rdi)");
    __asm__("movq %r10, -2*8(%rdi, %rdx)");
    __asm__("movq %r11, -1*8(%rdi, %rdx)");
    __asm__("jmp .Lend");

    __asm__(".p2align 4");
    __asm__(".Lless_16bytes:");
    __asm__("cmpl $8, %edx");
    __asm__("jb .Lless_8bytes");

    /*
     * Move data from 8 bytes to 15 bytes.
     */
    __asm__("movq 0*8(%rsi), %r8");
    __asm__("movq -1*8(%rsi, %rdx), %r9");
    __asm__("movq %r8, 0*8(%rdi)");
    __asm__("movq %r9, -1*8(%rdi, %rdx)");
    __asm__("jmp .Lend");

    __asm__(".p2align 4");
    __asm__(".Lless_8bytes:");
    __asm__("cmpl $4, %edx");
    __asm__("jb .Lless_3bytes");

    /*
     * Move data from 4 bytes to 7 bytes.
     */
    __asm__("movl (%rsi), %ecx");
    __asm__("movl -4(%rsi, %rdx), %r8d");
    __asm__("movl %ecx, (%rdi)");
    __asm__("movl %r8d, -4(%rdi, %rdx)");
    __asm__("jmp .Lend");

    __asm__(".p2align 4");
    __asm__(".Lless_3bytes:");
    __asm__("subl $1, %edx");
    __asm__("jb .Lend");           /* length was 0: nothing to copy */

    /*
     * Move data from 1 bytes to 3 bytes.
     * movzbl leaves the flags alone, so jz still tests the subl above
     * (zero means the length was exactly 1).
     */
    __asm__("movzbl (%rsi), %ecx");
    __asm__("jz .Lstore_1byte");
    __asm__("movzbq 1(%rsi), %r8");
    __asm__("movzbq (%rsi, %rdx), %r9");
    __asm__("movb %r8b, 1(%rdi)");
    __asm__("movb %r9b, (%rdi, %rdx)");
    __asm__(".Lstore_1byte:");
    __asm__("movb %cl, (%rdi)");

    __asm__(".Lend:");
}

/*
 * Print the header row, then benchmark lengths 0..60 (step 4) with aligned
 * buffers and lengths 0..512 (step 64) across seven alignment combinations.
 */
int main(void)
{
    int i;

    test_init();

    printf("%23s", "");
    printf("\t%s\t%s\t%s\n", "memcpy_new", "memcpy_c", "memcpy_c_e");

    for (i = 0; i < 64; i += 4) {
        do_test(0, 0, i);
    }

    for (i = 0; i < 576; i += 64) {
        do_test(4, 0, i);
        do_test(0, 4, i);
        do_test(0, 0, i);
        do_test(0, 8, i);
        do_test(8, 0, i);
        do_test(0, 8 * 2, i);
        do_test(8 * 2, 0, i);
    }

    return 0;
}