/*
 * Micro-benchmark comparing two x86-64 assembly routines, copy_page_org and
 * copy_page_new, that each copy one 4096-byte page.  Every routine is timed
 * with RDTSC and the lowest cycle count seen over repeat_one_test runs is
 * printed.
 *
 * Requires x86-64 and a GNU-compatible compiler (extended inline asm and
 * statement expressions are used).
 */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long int hp_timing_t;

#define MAXSAMPLESTPT 1000
#define MAXCOPYSIZE (1024 * 1024)
#define ORIG 0
#define NEW 1

/* Number of bytes the copy routines move; also the buffer alignment. */
#define COPY_PAGE_SIZE 4096

/* Byte offset into both buffers used for the second routine in do_test. */
#define SECOND_TEST_OFFSET (1 << 16)

static char *buf1 = NULL;
static char *buf2 = NULL;
static int repeat_one_test = 32;

/* Subtracted from every measurement; never assigned in this file, so 0. */
hp_timing_t _dl_hp_timing_overhead;

/* Read the time-stamp counter into Var (EDX:EAX combined into 64 bits). */
#define HP_TIMING_NOW(Var) \
  ({ unsigned long long _hi, _lo; \
     __asm__ __volatile__ ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = _hi << 32 | _lo; })

#define HP_TIMING_DIFF(Diff, Start, End) (Diff) = ((End) - (Start))

/* Add the (overhead-corrected) elapsed time to total_time. */
#define HP_TIMING_TOTAL(total_time, start, end) \
  do \
    { \
      hp_timing_t tmptime; \
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
      total_time += tmptime; \
    } \
  while (0)

/* Lower best_time to the (overhead-corrected) elapsed time if it is smaller. */
#define HP_TIMING_BEST(best_time, start, end) \
  do \
    { \
      hp_timing_t tmptime; \
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
      if (best_time > tmptime) \
        best_time = tmptime; \
    } \
  while (0)

void copy_page_org(char *dst, char *src, int len);
void copy_page_new(char *dst, char *src, int len);
void memcpy_c(char *dst, char *src, int len);

/* Routine currently under test; set by do_test before each measurement. */
void (*do_memcpy)(char *dst, char *src, int len);

/*
 * Time do_memcpy(dst, src, len) repeat_one_test times and print the best
 * (smallest) cycle count, preceded by a tab.
 */
static void do_one_test(char *dst, char *src, size_t len)
{
    hp_timing_t start __attribute__ ((unused));
    hp_timing_t stop __attribute__ ((unused));
    hp_timing_t best_time = ~(hp_timing_t) 0;
    int i;

    /* The counter is an int like repeat_one_test, so a non-positive repeat
       count simply runs zero iterations instead of wrapping to a huge
       unsigned bound. */
    for (i = 0; i < repeat_one_test; ++i) {
        HP_TIMING_NOW(start);
        do_memcpy(dst, src, (int) len);
        HP_TIMING_NOW(stop);
        HP_TIMING_BEST(best_time, start, stop);
    }

    printf("\t%llu", best_time);
}

/*
 * Print one result row: time copy_page_org on buf1+align1 -> buf2+align2,
 * then copy_page_new on the same addresses shifted by SECOND_TEST_OFFSET.
 *
 * NOTE(review): the shifted addresses presumably keep the second routine
 * from profiting from cache lines the first one touched -- confirm.
 * Callers must keep align + SECOND_TEST_OFFSET + COPY_PAGE_SIZE within
 * MAXCOPYSIZE; the copy routines always move COPY_PAGE_SIZE bytes and
 * ignore len, which is only reported in the output.
 */
static void do_test(size_t align1, size_t align2, size_t len)
{
    char *s1 = buf1 + align1;
    char *s2 = buf2 + align2;

    printf("TPT: Len %4zu, alignment %2zu/%2zu:", len, align1, align2);

    do_memcpy = copy_page_org;
    do_one_test(s2, s1, len);

    do_memcpy = copy_page_new;
    do_one_test(s2 + SECOND_TEST_OFFSET, s1 + SECOND_TEST_OFFSET, len);

    putchar('\n');
}

/*
 * Allocate the two page-aligned MAXCOPYSIZE-byte buffers and write one byte
 * in every 64-byte stride of each.  Exits the process if allocation fails.
 */
static void test_init(void)
{
    int i;

    /* aligned_alloc is the standard C11 replacement for the obsolete valloc;
       MAXCOPYSIZE is a multiple of COPY_PAGE_SIZE as aligned_alloc requires. */
    buf1 = aligned_alloc(COPY_PAGE_SIZE, MAXCOPYSIZE);
    buf2 = aligned_alloc(COPY_PAGE_SIZE, MAXCOPYSIZE);
    if (buf1 == NULL || buf2 == NULL) {
        fprintf(stderr, "cannot allocate two %d-byte test buffers\n",
                MAXCOPYSIZE);
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < MAXCOPYSIZE; i += 64) {
        buf1[i] = buf2[i] = (char) (i & 0xff);
    }
}

/*
 * Copy 4096 bytes from src to dst, 64 bytes per iteration, using four
 * scratch registers.  The first 59 iterations prefetch the source line five
 * iterations ahead; the last 5 iterations do not prefetch.  len is ignored.
 *
 * A single extended asm statement is used so the compiler knows which
 * registers carry the pointers and which registers are overwritten.  The
 * loop branch relies on the flags set by the dec at the top of each
 * iteration: mov, lea and prefetch leave the flags untouched.
 */
void copy_page_new(char *dst, char *src, int len)
{
    (void) len;

    __asm__ __volatile__ (
        "movl $(4096/64)-5, %%ecx\n"
        "1:\n\t"
        "prefetcht0 5*64(%%rsi)\n\t"
        "decb %%cl\n\t"
        "movq 0x8*0(%%rsi), %%r10\n\t"
        "movq 0x8*1(%%rsi), %%rax\n\t"
        "movq 0x8*2(%%rsi), %%r8\n\t"
        "movq 0x8*3(%%rsi), %%r9\n\t"
        "movq %%r10, 0x8*0(%%rdi)\n\t"
        "movq %%rax, 0x8*1(%%rdi)\n\t"
        "movq %%r8, 0x8*2(%%rdi)\n\t"
        "movq %%r9, 0x8*3(%%rdi)\n\t"
        "movq 0x8*4(%%rsi), %%r10\n\t"
        "movq 0x8*5(%%rsi), %%rax\n\t"
        "movq 0x8*6(%%rsi), %%r8\n\t"
        "movq 0x8*7(%%rsi), %%r9\n\t"
        "leaq 64(%%rsi), %%rsi\n\t"
        "movq %%r10, 0x8*4(%%rdi)\n\t"
        "movq %%rax, 0x8*5(%%rdi)\n\t"
        "movq %%r8, 0x8*6(%%rdi)\n\t"
        "movq %%r9, 0x8*7(%%rdi)\n\t"
        "leaq 64(%%rdi), %%rdi\n\t"
        "jnz 1b\n\t"
        "movb $5, %%dl\n"
        "2:\n\t"
        "decb %%dl\n\t"
        "movq 0x8*0(%%rsi), %%r10\n\t"
        "movq 0x8*1(%%rsi), %%rax\n\t"
        "movq 0x8*2(%%rsi), %%r8\n\t"
        "movq 0x8*3(%%rsi), %%r9\n\t"
        "movq %%r10, 0x8*0(%%rdi)\n\t"
        "movq %%rax, 0x8*1(%%rdi)\n\t"
        "movq %%r8, 0x8*2(%%rdi)\n\t"
        "movq %%r9, 0x8*3(%%rdi)\n\t"
        "movq 0x8*4(%%rsi), %%r10\n\t"
        "movq 0x8*5(%%rsi), %%rax\n\t"
        "movq 0x8*6(%%rsi), %%r8\n\t"
        "movq 0x8*7(%%rsi), %%r9\n\t"
        "leaq 64(%%rsi), %%rsi\n\t"
        "movq %%r10, 0x8*4(%%rdi)\n\t"
        "movq %%rax, 0x8*5(%%rdi)\n\t"
        "movq %%r8, 0x8*6(%%rdi)\n\t"
        "movq %%r9, 0x8*7(%%rdi)\n\t"
        "leaq 64(%%rdi), %%rdi\n\t"
        "jnz 2b"
        : "+D" (dst), "+S" (src)
        :
        : "rax", "rcx", "rdx", "r8", "r9", "r10", "cc", "memory");
}

/*
 * Copy 4096 bytes from src to dst, 64 bytes per iteration, loading all
 * eight quadwords into registers before storing them.  The first 59
 * iterations prefetch the source line five iterations ahead; the last 5
 * iterations do not prefetch.  len is ignored.
 *
 * %rbx and %r12 are callee-saved; they are listed as clobbers so the
 * compiler preserves them, rather than the asm adjusting %rsp itself.
 */
void copy_page_org(char *dst, char *src, int len)
{
    (void) len;

    __asm__ __volatile__ (
        "movl $(4096/64)-5, %%ecx\n\t"
        ".p2align 4\n"
        "1:\n\t"
        "dec %%rcx\n\t"
        "movq (%%rsi), %%rax\n\t"
        "movq 8(%%rsi), %%rbx\n\t"
        "movq 16(%%rsi), %%rdx\n\t"
        "movq 24(%%rsi), %%r8\n\t"
        "movq 32(%%rsi), %%r9\n\t"
        "movq 40(%%rsi), %%r10\n\t"
        "movq 48(%%rsi), %%r11\n\t"
        "movq 56(%%rsi), %%r12\n\t"
        "prefetcht0 5*64(%%rsi)\n\t"
        "movq %%rax, (%%rdi)\n\t"
        "movq %%rbx, 8(%%rdi)\n\t"
        "movq %%rdx, 16(%%rdi)\n\t"
        "movq %%r8, 24(%%rdi)\n\t"
        "movq %%r9, 32(%%rdi)\n\t"
        "movq %%r10, 40(%%rdi)\n\t"
        "movq %%r11, 48(%%rdi)\n\t"
        "movq %%r12, 56(%%rdi)\n\t"
        "leaq 64(%%rsi), %%rsi\n\t"
        "leaq 64(%%rdi), %%rdi\n\t"
        "jnz 1b\n\t"
        "movl $5, %%ecx\n\t"
        ".p2align 4\n"
        "2:\n\t"
        "decl %%ecx\n\t"
        "movq (%%rsi), %%rax\n\t"
        "movq 8(%%rsi), %%rbx\n\t"
        "movq 16(%%rsi), %%rdx\n\t"
        "movq 24(%%rsi), %%r8\n\t"
        "movq 32(%%rsi), %%r9\n\t"
        "movq 40(%%rsi), %%r10\n\t"
        "movq 48(%%rsi), %%r11\n\t"
        "movq 56(%%rsi), %%r12\n\t"
        "movq %%rax, (%%rdi)\n\t"
        "movq %%rbx, 8(%%rdi)\n\t"
        "movq %%rdx, 16(%%rdi)\n\t"
        "movq %%r8, 24(%%rdi)\n\t"
        "movq %%r9, 32(%%rdi)\n\t"
        "movq %%r10, 40(%%rdi)\n\t"
        "movq %%r11, 48(%%rdi)\n\t"
        "movq %%r12, 56(%%rdi)\n\t"
        "leaq 64(%%rdi), %%rdi\n\t"
        "leaq 64(%%rsi), %%rsi\n\t"
        "jnz 2b"
        : "+D" (dst), "+S" (src)
        :
        : "rax", "rbx", "rcx", "rdx", "r8", "r9", "r10", "r11", "r12",
          "cc", "memory");
}

/*
 * Set up the buffers, print the column header, and run the page-copy
 * comparison five times.  Returns 0 on success; test_init exits with
 * EXIT_FAILURE if the buffers cannot be allocated.
 */
int main(void)
{
    int run;

    test_init();

    /* Pad the header so the column names sit to the right of the row
       label that do_test prints. */
    printf("%23s", "");
    printf("\t%s\t%s\n", "copy_page_org", "copy_page_new");

    for (run = 0; run < 5; ++run) {
        do_test(0, 0, COPY_PAGE_SIZE);
    }

    free(buf1);
    free(buf2);
    buf1 = NULL;
    buf2 = NULL;
    return 0;
}