#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#ifndef PASSES
#define PASSES 10
#endif

static int init_pmc(void)
{
	static struct perf_event_attr perf_attr = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.pinned = 1,
	};
	struct perf_event_mmap_page *pc;
	int perf_fd;

	perf_fd = syscall(__NR_perf_event_open, &perf_attr, 0, -1, -1, 0);
	if (perf_fd < 0) {
		fprintf(stderr, "perf_event_open failed: errno %d\n", errno);
		exit(1);
	}
	pc = mmap(NULL, 4096, PROT_READ, MAP_SHARED, perf_fd, 0);
	if (pc == MAP_FAILED) {
		fprintf(stderr, "perf_event mmap() failed: errno %d\n", errno);
		exit(1);
	}
	return pc->index - 1;
}

static inline unsigned int rdpmc(unsigned int counter)
{
	unsigned int low, high;

#ifndef NOFENCE
	asm volatile("mfence");
#endif
	asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
#ifndef NOFENCE
	asm volatile("mfence");
#endif

	// Return the low bits; the counter might be 32 or 40 bits wide.
	return low;
}

__attribute__((noinline)) unsigned long overhead(const unsigned char *buff, unsigned long len)
{
	return 0;
}

// Existing kernel loop.
// Simple 'adc' loop, abs max 8 bytes/clock.
// Will be very slow on old (pre Ivy Bridge) cpus where 'dec'
// has a false dependency against the carry flag,
// and also on pre-Broadwell cpus where 'adc' takes two clocks.
__attribute__((noinline)) unsigned long adc_dec_8(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;
	long count = len/64;

	asm("1: addq 0(%[buff]), %[csum]\n"
	    " adcq 8(%[buff]), %[csum]\n"
	    " adcq 16(%[buff]), %[csum]\n"
	    " adcq 24(%[buff]), %[csum]\n"
	    " adcq 32(%[buff]), %[csum]\n"
	    " adcq 40(%[buff]), %[csum]\n"
	    " adcq 48(%[buff]), %[csum]\n"
	    " adcq 56(%[buff]), %[csum]\n"
	    " adcq $0, %[csum]\n"
	    " lea 64(%[buff]), %[buff]\n"
	    " dec %[count]\n"
	    " jnz 1b"
	    : [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// As above but only 4 adc
__attribute__((noinline)) unsigned long adc_dec_4(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;
	long count = len/32;

	asm(" clc\n"
	    "1: adcq (%[buff]), %[csum]\n"
	    " adcq 8(%[buff]), %[csum]\n"
	    " adcq 16(%[buff]), %[csum]\n"
	    " adcq 24(%[buff]), %[csum]\n"
	    " lea 32(%[buff]), %[buff]\n"
	    " dec %[count]\n"
	    " jnz 1b\n"
	    " adcq $0, %[csum]\n"
	    : [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// As above but only 2 adc
__attribute__((noinline)) unsigned long adc_dec_2(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;
	long count = len/16;

	asm(" clc\n"
	    "1: adcq (%[buff]), %[csum]\n"
	    " adcq 8(%[buff]), %[csum]\n"
	    " lea 16(%[buff]), %[buff]\n"
	    " dec %[count]\n"
	    " jnz 1b\n"
	    " adcq $0, %[csum]\n"
	    : [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// Alternate 'adc' loop that avoids using the zero flag.
__attribute__((noinline)) unsigned long adc_jcxz_4(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	buff += len;
	len = -len;
	asm(" clc\n"
	    "1: jrcxz 2f\n"
	    " adcq 0(%[buff], %[len]), %[csum]\n"
	    " adcq 8(%[buff], %[len]), %[csum]\n"
	    " adcq 16(%[buff], %[len]), %[csum]\n"
	    " adcq 24(%[buff], %[len]), %[csum]\n"
	    " lea 32(%[len]), %[len]\n"
	    " jmp 1b\n"
	    "2: adcq $0, %[csum]\n"
	    : [csum] "+&r" (csum), [len] "+&c" (len)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}
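// Why 'jrcxz' + 'lea': the carry flag stays live across the whole adc
// chain, so the loop control must not clobber it.  'jrcxz' tests %rcx
// without modifying any flags and 'lea' advances the index without an
// arithmetic instruction, so the single carry chain survives from one
// iteration of the loop to the next.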
// Shorter alternate 'adc' loop that avoids using the zero flag.
__attribute__((noinline)) unsigned long adc_jcxz_2(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	buff += len;
	len = -len;
	asm(" clc\n"
	    "1: jrcxz 2f\n"
	    " adcq 0(%[buff], %[len]), %[csum]\n"
	    " adcq 8(%[buff], %[len]), %[csum]\n"
	    " lea 16(%[len]), %[len]\n"
	    " jmp 1b\n"
	    "2: adcq $0, %[csum]\n"
	    : [csum] "+&r" (csum), [len] "+&c" (len)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// Proposed kernel loop.
// This splits the adc chain in two.
__attribute__((noinline)) unsigned long adc_4_pair(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;
	long count = len/64;
	unsigned long csum1;

	asm("1: movq 0(%[buff]), %[csum1]\n"
	    " adcq 8(%[buff]), %[csum1]\n"
	    " adcq 16(%[buff]), %[csum1]\n"
	    " adcq 24(%[buff]), %[csum1]\n"
	    " adcq 32(%[buff]), %[csum1]\n"
	    " adcq $0, %[csum1]\n"
	    " lea 64(%[buff]), %[buff]\n"
	    " addq 40-64(%[buff]), %[csum]\n"
	    " adcq 48-64(%[buff]), %[csum]\n"
	    " adcq 56-64(%[buff]), %[csum]\n"
	    " adcq %[csum1], %[csum]\n"
	    " adcq $0, %[csum]\n"
	    " dec %[count]\n"
	    " jnz 1b"
	    : [csum] "+&r" (csum), [csum1] "=&r" (csum1), [len] "+&r" (len), [count] "+&r" (count)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// Less unrolled version of previous.
__attribute__((noinline)) unsigned long adc_2_pair(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;
	long count = len;
	unsigned long csum1;

	asm("1: movq 0(%[buff]), %[csum1]\n"
	    " adcq 8(%[buff]), %[csum1]\n"
	    " adcq $0, %[csum1]\n"
	    " lea 32(%[buff]), %[buff]\n"
	    " addq 16-32(%[buff]), %[csum]\n"
	    " adcq 24-32(%[buff]), %[csum]\n"
	    " adcq %[csum1], %[csum]\n"
	    " adcq $0, %[csum]\n"
	    " sub $32, %[count]\n"
	    " jnz 1b"
	    : [csum] "+&r" (csum), [csum1] "=&r" (csum1), [len] "+&r" (len), [count] "+&r" (count)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// adcx/adox loop.
// This is the only loop that exceeds 8 bytes/clock for 1k buffers.
__attribute__((noinline)) unsigned long adcx_adox(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;
	unsigned long csum_odd;

	buff += len;
	len = -len;
	asm(" xor %[csum_odd], %[csum_odd]\n"	// Also clears carry and overflow
	    "10: jrcxz 20f\n"
	    " adcx (%[buff], %[len]), %[csum]\n"
	    " adox 8(%[buff], %[len]), %[csum_odd]\n"
	    " adcx 16(%[buff], %[len]), %[csum]\n"
	    " adox 24(%[buff], %[len]), %[csum_odd]\n"
	    " adcx 32(%[buff], %[len]), %[csum]\n"
	    " adox 40(%[buff], %[len]), %[csum_odd]\n"
	    " adcx 48(%[buff], %[len]), %[csum]\n"
	    " adox 56(%[buff], %[len]), %[csum_odd]\n"
	    " lea 64(%[len]), %[len]\n"
	    " jmp 10b\n"
	    "20: adox %[len], %[csum_odd]\n"	// [len] is zero
	    " adcx %[csum_odd], %[csum]\n"
	    " adcx %[len], %[csum]"
	    : [csum] "+&r" (csum), [len] "+&c" (len), [csum_odd] "=&r" (csum_odd)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

#define OPTIMISER_HIDE_VAR(x) asm volatile("" : "+r" (x))
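// OPTIMISER_HIDE_VAR() passes the variable through an empty asm statement
// with a "+r" constraint: the compiler has to keep the value in a register
// and assume the asm may have changed it.  That stops gcc re-associating
// the independent adds in the C loops below into one serial dependency
// chain (the same trick as the kernel's OPTIMIZER_HIDE_VAR()).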
// In principle the cpu can do 2 reads every clock.
// So combining with adds to different registers it should
// be possible to get this C code to run near to 8 bytes/clock.
// But the compiler doesn't want to 'play ball'.
// The OPTIMISER_HIDE_VAR() helps a bit.
__attribute__((noinline)) unsigned long add_c_16(const unsigned char *buff, unsigned long len)
{
	unsigned long csum_odd = 0;
	unsigned long csum = 0;

	buff += len;
	len = -len;
	do {
		csum += *(unsigned int *)(buff + len + 0);
		csum_odd += *(unsigned int *)(buff + len + 4);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 8);
		csum_odd += *(unsigned int *)(buff + len + 12);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
	} while (len += 16);
	csum += csum_odd;

	return csum;
}

// As above but unrolled further.
__attribute__((noinline)) unsigned long add_c_32(const unsigned char *buff, unsigned long len)
{
	unsigned long csum_odd = 0;
	unsigned long csum = 0;

	buff += len;
	len = -len;
	do {
		csum += *(unsigned int *)(buff + len + 0);
		csum_odd += *(unsigned int *)(buff + len + 4);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 8);
		csum_odd += *(unsigned int *)(buff + len + 12);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 16);
		csum_odd += *(unsigned int *)(buff + len + 20);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 24);
		csum_odd += *(unsigned int *)(buff + len + 28);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
	} while (len += 32);
	csum += csum_odd;

	return csum;
}

// Alternative C loop that separately sums 'val >> 32' to recover the carries.
// Might be better on arm64 than x86.
__attribute__((noinline)) unsigned long add_c_high(const unsigned char *buff, unsigned long len)
{
	unsigned long sum_hi_odd = 0, sum_lo_odd = 0;
	unsigned long sum_hi = 0, sum_lo = 0;
	unsigned long val;

	buff += len;
	len = -len;
	do {
		val = *(unsigned long *)(buff + len + 0);
		sum_lo += val;
		sum_hi += val >> 32;
		val = *(unsigned long *)(buff + len + 8);
		sum_lo_odd += val;
		sum_hi_odd += val >> 32;
	} while (len += 16);
	OPTIMISER_HIDE_VAR(sum_lo);
	OPTIMISER_HIDE_VAR(sum_hi);
	sum_lo += sum_lo_odd;
	sum_hi += sum_hi_odd;
	sum_hi += (sum_lo >> 32) - (sum_hi & 0xffffffffu);

	return sum_hi + (sum_lo & 0xffffffff);
}

unsigned char buffer[8192] __attribute__((aligned(4096))) = {
	0x46,0x56,0x20,0x04,0x10,0x02,0x50,0x07,0x72,0x4d,0xc6,0x3d,0x31,0x85,0x2d,0xbd,
	0xe2,0xe0,0x9d,0x3e,0x3b,0x7a,0x70,0x3d,0xd2,0xfb,0x8c,0xbf,0x95,0x10,0xa9,0xbe,
	0xeb,0xfd,0x29,0x40,0xd5,0x7a,0x61,0x40,0xde,0xcd,0x14,0xbf,0x81,0x1b,0xf6,0x3f,
	0xbc,0xff,0x17,0x3f,0x67,0x1c,0x6e,0xbe,0xf4,0xc2,0x05,0x40,0x0b,0x13,0x78,0x3f,
	0xfe,0x47,0xa7,0xbd,0x59,0xc2,0x15,0x3f,0x07,0xd0,0xea,0xbf,0x97,0xf1,0x3c,0x3f,
	0xcc,0xfa,0x6b,0x40,0x72,0x6a,0x4f,0xbe,0x0b,0xe3,0x75,0x3e,0x3c,0x9b,0x0e,0xbf,
	0xa9,0xeb,0xb7,0x3f,0xeb,0x4a,0xec,0x3e,0x33,0x8c,0x0c,0x3f,0x6a,0xf2,0xf3,0x3e,
	0x2b,0x45,0x86,0x3f,0x83,0xce,0x8a,0x3f,0xf6,0x01,0x16,0x40,0x9c,0x17,0x47,0x3e,
	0x44,0x83,0x61,0x40,0x74,0xc7,0x5c,0x3f,0xec,0xe7,0x95,0x3f,0xee,0x19,0xb5,0xbf,
	0xb5,0xf0,0x03,0xbf,0xd1,0x02,0x1c,0x3e,0xa3,0x55,0x90,0xbe,0x1e,0x0b,0xa1,0xbf,
	0xa4,0xa8,0xb4,0x3f,0xc6,0x68,0x91,0x3f,0xd1,0xc5,0xab,0x3f,0xb9,0x14,0x62,0x3f,
	0x7c,0xe0,0xb9,0xbf,0xc0,0xa4,0xb5,0x3d,0x6f,0xd9,0xa7,0x3f,0x8f,0xc4,0xb0,0x3d,
	0x48,0x2c,0x7a,0x3e,0x83,0xb2,0x3c,0x40,0x36,0xd3,0x18,0x40,0xb7,0xa9,0x57,0x40,
	0xda,0xd3,0x95,0x3f,0x74,0x95,0xc0,0xbe,0xbb,0xce,0x71,0x3e,0x95,0xec,0x18,0xbf,
	0x94,0x17,0xdd,0x3f,0x98,0xa5,0x02,0x3f,0xbb,0xfb,0xbb,0x3e,0xd0,0x5a,0x9c,0x3f,
	0xd4,0x70,0x9b,0xbf,0x3b,0x9f,0x20,0xc0,0x84,0x5b,0x0f,0x40,0x5e,0x48,0x2c,0xbf,
};
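// print_csum() folds the 64-bit sum down to 16 bits (the same repeated
// "add the halves" folding the Internet checksum uses) and prints the
// cycle count of each pass.  The counts are printed relative to 'offset',
// which is set to the fastest pass of the first call (the "overhead"
// function), so later lines show cycles above the measurement overhead.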
static __attribute__((noinline)) void
print_csum(const char *msg, unsigned long csum, unsigned int *ticks)
{
	static unsigned int offset;
	unsigned int i;

	csum = (csum & 0xffffffff) + (csum >> 32);
	csum = (csum & 0xffffffff) + (csum >> 32);
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	printf(" %4lx", csum);
	for (i = 0; i < PASSES; i++)
		printf(" %5d", ticks[i] - offset);
	printf(" %s\n", msg);

	if (!offset) {
		offset = ticks[0];
		for (i = 1; i < PASSES; i++)
			if (offset > ticks[i])
				offset = ticks[i];
	}
}

static inline __attribute__((always_inline)) void
do_csum(int pmc_id, const char *msg,
	unsigned long (*fn)(const unsigned char *, unsigned long),
	const unsigned char *buf, unsigned long len)
{
	unsigned int ticks[PASSES];
	unsigned long csum;
	unsigned int tick;
	unsigned int i;

	for (i = 0; i < PASSES; i++) {
		tick = rdpmc(pmc_id);
		csum = fn(buf, len);
		ticks[i] = rdpmc(pmc_id) - tick;
	}

	print_csum(msg, csum, ticks);
}

int main(int argc, char **argv)
{
	unsigned int len, off;
	unsigned int pmc_id = init_pmc();

	len = 128;
	off = 0;
	for (;;) {
		switch (getopt(argc, argv, "l:o:")) {
		case -1:
			break;
		default:
			exit(1);
		case 'l':
			len = atoi(optarg);
			continue;
		case 'o':
			off = atoi(optarg);
			continue;
		}
		break;
	}

	/* Solving partial blocks is a different problem... */
	len = (len + 63) & ~63;
	if (!len)
		len = 64;

	do_csum(pmc_id, "overhead",   overhead,   buffer + off, len);
	do_csum(pmc_id, "adc_dec_2",  adc_dec_2,  buffer + off, len);
	do_csum(pmc_id, "adc_dec_4",  adc_dec_4,  buffer + off, len);
	do_csum(pmc_id, "adc_dec_8",  adc_dec_8,  buffer + off, len);
	do_csum(pmc_id, "adc_jcxz_2", adc_jcxz_2, buffer + off, len);
	do_csum(pmc_id, "adc_jcxz_4", adc_jcxz_4, buffer + off, len);
	do_csum(pmc_id, "adc_2_pair", adc_2_pair, buffer + off, len);
	do_csum(pmc_id, "adc_4_pair", adc_4_pair, buffer + off, len);
	do_csum(pmc_id, "adcx_adox",  adcx_adox,  buffer + off, len);
	do_csum(pmc_id, "add_c_16",   add_c_16,   buffer + off, len);
	do_csum(pmc_id, "add_c_32",   add_c_32,   buffer + off, len);
	do_csum(pmc_id, "add_c_high", add_c_high, buffer + off, len);

	return 0;
}
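/*
 * Rough build/run sketch (assumed invocation, file names are arbitrary):
 *
 *	gcc -O2 -o csum_bench csum_bench.c
 *	./csum_bench -l 1024 -o 0
 *
 * The adcx/adox loop needs a cpu with the ADX extension (Broadwell or
 * later).  User-space rdpmc only works when the kernel allows it: the
 * mmap()ed perf page has to report a non-zero pc->index, which on many
 * x86 systems means /sys/devices/cpu/rdpmc is 1 or 2, and a restrictive
 * /proc/sys/kernel/perf_event_paranoid can make perf_event_open() fail.
 */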