#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long int hp_timing_t;

#define MAXSAMPLESTPT	1000
#define MAXCOPYSIZE	(1024 * 1024 * 100)
#define ORIG		0
#define NEW		1

static char *buf1 = NULL;
static char *buf2 = NULL;
static int repeat_one_test = 32;

hp_timing_t _dl_hp_timing_overhead;

#define HP_TIMING_NOW(Var) \
	({ unsigned long long _hi, _lo; \
	   asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
	   (Var) = _hi << 32 | _lo; })

#define HP_TIMING_DIFF(Diff, Start, End)	(Diff) = ((End) - (Start))

#define HP_TIMING_TOTAL(total_time, start, end) \
	do \
	{ \
		hp_timing_t tmptime; \
		HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
		total_time += tmptime; \
	} \
	while (0)

#define HP_TIMING_BEST(best_time, start, end) \
	do \
	{ \
		hp_timing_t tmptime; \
		HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
		if (best_time > tmptime) \
			best_time = tmptime; \
	} \
	while (0)

void memcpy_orig(char *dst, char *src, int len);
void memcpy_new(char *dst, char *src, int len);
void memcpy_c(char *dst, char *src, int len);
void (*do_memcpy)(char *dst, char *src, int len);

static void
do_one_test (char *dst, char *src, size_t len)
{
	hp_timing_t start __attribute__ ((unused));
	hp_timing_t stop __attribute__ ((unused));
	hp_timing_t best_time = ~ (hp_timing_t) 0;
	size_t i;

	for (i = 0; i < repeat_one_test; ++i)
	{
		HP_TIMING_NOW (start);
		do_memcpy (dst, src, len);
		HP_TIMING_NOW (stop);
		HP_TIMING_BEST (best_time, start, stop);
	}

	printf ("\t%zd", (size_t) best_time);
}

static void
do_test (size_t align1, size_t align2, size_t len)
{
	char *s1, *s2;

	s1 = (char *) (buf1 + align1);
	s2 = (char *) (buf2 + align2);

	printf ("LAT: Len %4zd, alignment %2zd/%2zd:", len, align1, align2);

	do_memcpy = memcpy_orig;
	do_one_test (s2, s1, len);

	do_memcpy = memcpy_new;
	do_one_test (s2, s1, len);

	putchar ('\n');
}

static void
test_init(void)
{
	int i;

	buf1 = valloc(MAXCOPYSIZE);
	buf2 = valloc(MAXCOPYSIZE);

	for (i = 0; i < MAXCOPYSIZE; i = i + 64)
		buf1[i] = buf2[i] = i & 0xff;
}

void memcpy_new(char *dst, char *src, int len)
{
	__asm__("movq %rdi, %rax");
	__asm__("cmp $0x28, %rdx");
	__asm__("jb 1f");

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	__asm__("cmp %dil, %sil");
	__asm__("jl 2f");

	/*
	 * Copy the first 8 bytes and round the destination up to an
	 * 8-byte boundary so that stores do not cross a cache line.
	 */
	__asm__("movq (%rsi), %rcx");
	__asm__("movq %rdi, %r8");
	__asm__("addq $8, %rdi");
	__asm__("andq $-8, %rdi");
	__asm__("movq %rcx, (%r8)");
	__asm__("subq %rdi, %r8");
	__asm__("addq %r8, %rdx");
	__asm__("subq %r8, %rsi");

	__asm__("subq $0x20, %rdx");
	__asm__("3:");
	__asm__("subq $0x20, %rdx");

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	__asm__("movq 0*8(%rsi), %r8");
	__asm__("movq 1*8(%rsi), %r9");
	__asm__("movq 2*8(%rsi), %r10");
	__asm__("movq 3*8(%rsi), %r11");
	__asm__("leaq 4*8(%rsi), %rsi");

	__asm__("movq %r8, 0*8(%rdi)");
	__asm__("movq %r9, 1*8(%rdi)");
	__asm__("movq %r10, 2*8(%rdi)");
	__asm__("movq %r11, 3*8(%rdi)");
	__asm__("leaq 4*8(%rdi), %rdi");

	__asm__("jae 3b");
	__asm__("addq $0x20, %rdx");
	__asm__("jmp 10f");

	__asm__("2:");
	/*
	 * Calculate the copy position from the tail.
	 */
	__asm__("addq %rdx, %rsi");
	__asm__("addq %rdx, %rdi");

	/*
	 * Copy the last 8 bytes and round the destination end down to an
	 * 8-byte boundary so that stores do not cross a cache line.
	 */
*/ __asm__("movq -8(%rsi), %rcx"); __asm__("movq %rdi, %r8"); __asm__("andq $-8, %rdi"); __asm__("movq %rcx, -8(%r8)"); __asm__("subq %rdi, %r8"); __asm__("subq %r8, %rdx"); __asm__("subq %r8, %rsi"); __asm__("subq $0x20, %rdx"); __asm__(".p2align 4"); __asm__("4:"); __asm__("subq $0x20, %rdx"); __asm__("movq -1*8(%rsi), %r8"); __asm__("movq -2*8(%rsi), %r9"); __asm__("movq -3*8(%rsi), %r10"); __asm__("movq -4*8(%rsi), %r11"); __asm__("leaq -4*8(%rsi), %rsi"); __asm__("movq %r8, -1*8(%rdi)"); __asm__("movq %r9, -2*8(%rdi)"); __asm__("movq %r10, -3*8(%rdi)"); __asm__("movq %r11, -4*8(%rdi)"); __asm__("leaq -4*8(%rdi), %rdi"); __asm__("jae 4b"); /* * Calculate copy position to head. */ __asm__("addq $0x20, %rdx"); __asm__("subq %rdx, %rsi"); __asm__("subq %rdx, %rdi"); __asm__("jmp 10f"); __asm__("1:"); __asm__("cmpq $32, %rdx"); __asm__("jb 10f"); /* * Move data from 32 bytes to 39 bytes. */ __asm__("movq 0*8(%rsi), %rcx"); __asm__("movq 1*8(%rsi), %r8"); __asm__("movq -3*8(%rsi, %rdx), %r9"); __asm__("movq -2*8(%rsi, %rdx), %r10"); __asm__("movq -1*8(%rsi, %rdx), %r11"); __asm__("movq %rcx, 0*8(%rdi)"); __asm__("movq %r8, 1*8(%rdi)"); __asm__("movq %r9, -3*8(%rdi, %rdx)"); __asm__("movq %r10, -2*8(%rdi, %rdx)"); __asm__("movq %r11, -1*8(%rdi, %rdx)"); __asm__("retq"); /* * Move data from 16 bytes to 31 bytes. */ __asm__("10:"); __asm__("cmpq $16, %rdx"); __asm__("jb 5f"); __asm__("movq 0*8(%rsi), %r8"); __asm__("movq 1*8(%rsi), %r9"); __asm__("movq -2*8(%rsi, %rdx), %r10"); __asm__("movq -1*8(%rsi, %rdx), %r11"); __asm__("movq %r8, 0*8(%rdi)"); __asm__("movq %r9, 1*8(%rdi)"); __asm__("movq %r10, -2*8(%rdi, %rdx)"); __asm__("movq %r11, -1*8(%rdi, %rdx)"); __asm__("retq"); __asm__(".p2align 4"); __asm__("5:"); __asm__("cmpq $8, %rdx"); __asm__("jb 6f"); /* * Move data from 8 bytes to 15 bytes. */ __asm__("movq 0*8(%rsi), %r8"); __asm__("movq -1*8(%rsi, %rdx), %r9"); __asm__("movq %r8, 0*8(%rdi)"); __asm__("movq %r9, -1*8(%rdi, %rdx)"); __asm__("retq"); __asm__(".p2align 4"); __asm__("6:"); __asm__("cmpq $4, %rdx"); __asm__("jb 7f"); /* * Move data from 4 bytes to 7 bytes. */ __asm__("movl (%rsi), %ecx"); __asm__("movl -4(%rsi, %rdx), %r8d"); __asm__("movl %ecx, (%rdi)"); __asm__("movl %r8d, -4(%rdi, %rdx)"); __asm__("retq"); __asm__(".p2align 4"); __asm__("7:"); __asm__("cmpl $0, %edx"); __asm__("je 8f"); /* * Move data from 1 bytes to 3 bytes. */ __asm__("9:"); __asm__("movb (%rsi), %r8b"); __asm__("movb %r8b, (%rdi)"); __asm__("incq %rdi"); __asm__("incq %rsi"); __asm__("decl %edx"); __asm__("jnz 9b"); __asm__("8:"); } void memcpy_orig(char *dst, char *src, int len) { __asm("movq %rdi, %rax"); /* * Use 32bit CMP here to avoid long NOP padding. */ __asm("cmp $0x20, %edx"); __asm("jbe 1f"); /* * the code for unaligned copy is good for large-size copy(>100), * so if the size is small, we needn't check dst and src is aligned * or not. */ __asm("cmp $100, %edx"); __asm("jb 2f"); /* * unaligned access always leads to bad performance, so in order to * avoid unaligned access, we align the address(both src and dest) * first, and then copy from a aligned src to an aligned dst by using * shifts. * But we found if src is aligned, although dest is unaligned, the * performance of generic memory copy (That is reading data aligned * from the source and writing data unaligned to the dest) is better * than the one that uses shifts to avoid unaligned access. 
	 * So when src is aligned we skip the dst alignment check and
	 * jump straight to 2:.
	 */
	__asm("test $7, %esi");		/* src align check */
	__asm("jz 2f");

	/* if both dst and src are unaligned, go to the unaligned copy */
	__asm("test $7, %edi");
	__asm("jnz 3f");

	__asm("jmp 4f");

	__asm("2:");
	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	__asm("cmp %dil, %sil");
	__asm("jl 5f");
	__asm("subl $0x20, %edx");

	__asm("6:");
	__asm("subq $0x20, %rdx");

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	__asm("movq 0*8(%rsi), %r8");
	__asm("movq 1*8(%rsi), %r9");
	__asm("movq 2*8(%rsi), %r10");
	__asm("movq 3*8(%rsi), %r11");
	__asm("leaq 4*8(%rsi), %rsi");

	__asm("movq %r8, 0*8(%rdi)");
	__asm("movq %r9, 1*8(%rdi)");
	__asm("movq %r10, 2*8(%rdi)");
	__asm("movq %r11, 3*8(%rdi)");
	__asm("leaq 4*8(%rdi), %rdi");
	__asm("jae 6b");
	__asm("addq $0x20, %rdx");
	__asm("jmp 1f");

	__asm("5:");
	/*
	 * Calculate the copy position from the tail.
	 */
	__asm("addq %rdx, %rsi");
	__asm("addq %rdx, %rdi");
	__asm("subq $0x20, %rdx");

	/*
	 * At most 3 ALU operations issue per cycle, so keep the loop head
	 * within the same 16-byte chunk by padding with NOPs.
	 */
	__asm(".p2align 4");
	__asm("6:");
	__asm("subq $0x20, %rdx");
	__asm("movq -1*8(%rsi), %r8");
	__asm("movq -2*8(%rsi), %r9");
	__asm("movq -3*8(%rsi), %r10");
	__asm("movq -4*8(%rsi), %r11");
	__asm("leaq -4*8(%rsi), %rsi");

	__asm("movq %r8, -1*8(%rdi)");
	__asm("movq %r9, -2*8(%rdi)");
	__asm("movq %r10, -3*8(%rdi)");
	__asm("movq %r11, -4*8(%rdi)");
	__asm("leaq -4*8(%rdi), %rdi");
	__asm("jae 6b");

	/*
	 * Calculate the copy position back to the head.
	 */
	__asm("addq $0x20, %rdx");
	__asm("subq %rdx, %rsi");
	__asm("subq %rdx, %rdi");

	__asm("1:");
	__asm("cmpq $16, %rdx");
	__asm("jb 7f");
	/*
	 * Move 16 to 31 bytes of data.
	 */
	__asm("movq 0*8(%rsi), %r8");
	__asm("movq 1*8(%rsi), %r9");
	__asm("movq -2*8(%rsi, %rdx), %r10");
	__asm("movq -1*8(%rsi, %rdx), %r11");
	__asm("movq %r8, 0*8(%rdi)");
	__asm("movq %r9, 1*8(%rdi)");
	__asm("movq %r10, -2*8(%rdi, %rdx)");
	__asm("movq %r11, -1*8(%rdi, %rdx)");
	__asm("retq");

	__asm(".p2align 4");
	__asm("7:");
	__asm("cmpq $8, %rdx");
	__asm("jb 8f");
	/*
	 * Move 8 to 15 bytes of data.
	 */
	__asm("movq 0*8(%rsi), %r8");
	__asm("movq -1*8(%rsi, %rdx), %r9");
	__asm("movq %r8, 0*8(%rdi)");
	__asm("movq %r9, -1*8(%rdi, %rdx)");
	__asm("retq");

	__asm(".p2align 4");
	__asm("8:");
	__asm("cmpq $4, %rdx");
	__asm("jb 9f");
	/*
	 * Move 4 to 7 bytes of data.
	 */
	__asm("movl (%rsi), %ecx");
	__asm("movl -4(%rsi, %rdx), %r8d");
	__asm("movl %ecx, (%rdi)");
	__asm("movl %r8d, -4(%rdi, %rdx)");
	__asm("retq");

	__asm(".p2align 4");
	__asm("9:");
	__asm("cmpl $0, %edx");
	__asm("je 10f");
	/*
	 * Move 1 to 3 bytes of data.
	 */
	__asm("11:");
	__asm("movb (%rsi), %r8b");
	__asm("movb %r8b, (%rdi)");
	__asm("incq %rdi");
	__asm("incq %rsi");
	__asm("decl %edx");
	__asm("jnz 11b");

	__asm("10:");
	__asm("retq");

	__asm(".p2align 4");
	__asm("3:");
	__asm("movq %rdi, %rcx");
	__asm("andq $7, %rcx");		/* Align the destination */
	__asm("negq %rcx");
	__asm("andq $7, %rcx");
	__asm("subq %rcx, %rdx");	/* tune dst address */
	__asm("movq (%rsi), %r8");
	__asm("movq %r8, (%rdi)");
	__asm("addq %rcx, %rdi");
	__asm("addq %rcx, %rsi");
	__asm("test $7, %esi");		/* src align check */
	__asm("jz 2b");

	__asm(".p2align 4");
	__asm("4:");
	__asm("push %rbx");
	__asm("push %r12");
	__asm("push %r13");
	__asm("push %r14");
	__asm("push %r15");

	/*
	 * Calculate how far to shift a word read from the aligned src
	 * pointer so it lines up for the copy.
	 */
*/ __asm("movq %rsi, %r14"); __asm("andq $7, %r14"); __asm("shlq $3, %r14"); __asm("movq $64, %r15"); __asm("subq %r14, %r15"); __asm("andq $-8, %rsi"); /* src aligned */ __asm("movq 0*8(%rsi), %r8"); __asm("movq %rdx, %rbx"); __asm("shrq $5, %rbx"); __asm("jz 12f"); /* * %r8 : store src[0] * %r9 : store src[1] * %r10: store src[2] * %r11: store src[3] * %r12: store src[4] * %r13: store the tmp data */ __asm(".p2align 4"); __asm("13:"); __asm("movq 1*8(%rsi), %r9"); __asm("movq 2*8(%rsi), %r10"); __asm("movq 3*8(%rsi), %r11"); __asm("movq 4*8(%rsi), %r12"); __asm("movq %r9, %r13"); __asm("movb %r14b, %cl"); __asm("shrq %cl, %r8"); __asm("shrq %cl, %r13"); __asm("movb %r15b, %cl"); __asm("shlq %cl, %r9"); __asm("orq %r8, %r9"); __asm("movq %r10, %r8"); __asm("shlq %cl, %r10"); __asm("orq %r13, %r10"); __asm("movq %r11, %r13"); __asm("movb %r14b, %cl"); __asm("shrq %cl, %r8"); __asm("shrq %cl, %r13"); __asm("movb %r15b, %cl"); __asm("shlq %cl, %r11"); __asm("orq %r8, %r11"); __asm("movq %r12, %r8"); __asm("shlq %cl, %r12"); __asm("orq %r13, %r12"); __asm("movq %r9, 0*8(%rdi)"); __asm("movq %r10, 1*8(%rdi)"); __asm("movq %r11, 2*8(%rdi)"); __asm("movq %r12, 3*8(%rdi)"); __asm("leaq 4*8(%rdi), %rdi"); __asm("leaq 4*8(%rsi), %rsi"); __asm("decq %rbx"); __asm("jnz 13b"); __asm(".p2align 4"); __asm("12:"); __asm("shrq $3, %r14"); __asm("addq %r14, %rsi"); __asm("pop %r15"); __asm("pop %r14"); __asm("pop %r13"); __asm("pop %r12"); __asm("pop %rbx"); __asm("andq $31, %rdx"); __asm("jnz 1b"); __asm("retq"); } void main(void) { int i; test_init(); printf ("%23s", ""); printf ("\t%s\t%s\n", "memcpy_orig", "memcpy_new"); for (i = 0; i <= 12; ++i) { do_test (i, 0, 1 << i); do_test (0, i, 1 << i); } for (i = 0; i < 32; ++i) { do_test (i, 0, i); do_test (0, i, i); } for (i = 0; i < 48; ++i) { do_test (0, 8, i); do_test (1, 8, i); do_test (4, 8, i); } for (i = 3; i < 32; ++i) { if ((i & (i - 1)) == 0) continue; do_test (i, 0, 16 * i); do_test (0, i, 16 * i); } }