 arch/x86/lib/csum-partial_64.c | 66 ++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 38 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index cea25ca8b8cf..9f1fa47ccbe0 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -33,6 +33,20 @@ static inline __wsum csum_tail(u64 temp64, int odd)
 	return (__force __wsum)result;
 }
 
+static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5])
+{
+	asm volatile("addq %1,%0\n\t"
+		     "adcq %2,%0\n\t"
+		     "adcq %3,%0\n\t"
+		     "adcq %4,%0\n\t"
+		     "adcq %5,%0\n\t"
+		     "adcq $0,%0"
+			:"=r" (sum)
+			:"m" (m[0]), "m" (m[1]), "m" (m[2]),
+			 "m" (m[3]), "m" (m[4]), "0" (sum));
+	return sum;
+}
+
 /*
  * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
@@ -64,47 +78,23 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 	 * has noticeable negative affect on codegen for all other cases with
 	 * minimal performance benefit here.
 	 */
-	if (len == 40) {
-		asm("addq 0*8(%[src]),%[res]\n\t"
-		    "adcq 1*8(%[src]),%[res]\n\t"
-		    "adcq 2*8(%[src]),%[res]\n\t"
-		    "adcq 3*8(%[src]),%[res]\n\t"
-		    "adcq 4*8(%[src]),%[res]\n\t"
-		    "adcq $0,%[res]"
-		    : [res] "+r"(temp64)
-		    : [src] "r"(buff), "m"(*(const char(*)[40])buff));
+	if (likely(len == 40)) {
+		temp64 = update_csum_40b(temp64, buff);
 		return csum_tail(temp64, odd);
 	}
 
-	if (unlikely(len >= 64)) {
-		/*
-		 * Extra accumulators for better ILP in the loop.
-		 */
-		u64 tmp_accum, tmp_carries;
-		asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t"
-		    "xorl %k[tmp_carries],%k[tmp_carries]\n\t"
-		    "subl $64, %[len]\n\t"
-		    "1:\n\t"
-		    "addq 0*8(%[src]),%[res]\n\t"
-		    "adcq 1*8(%[src]),%[res]\n\t"
-		    "adcq 2*8(%[src]),%[res]\n\t"
-		    "adcq 3*8(%[src]),%[res]\n\t"
-		    "adcl $0,%k[tmp_carries]\n\t"
-		    "addq 4*8(%[src]),%[tmp_accum]\n\t"
-		    "adcq 5*8(%[src]),%[tmp_accum]\n\t"
-		    "adcq 6*8(%[src]),%[tmp_accum]\n\t"
-		    "adcq 7*8(%[src]),%[tmp_accum]\n\t"
-		    "adcl $0,%k[tmp_carries]\n\t"
-		    "addq $64, %[src]\n\t"
-		    "subl $64, %[len]\n\t"
-		    "jge 1b\n\t"
-		    "addq %[tmp_accum],%[res]\n\t"
-		    "adcq %[tmp_carries],%[res]\n\t"
-		    "adcq $0,%[res]"
-		    : [tmp_accum] "=&r"(tmp_accum),
-		      [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64),
-		      [len] "+r"(len), [src] "+r"(buff)
-		    : "m"(*(const char *)buff));
+	/* Do two 40-byte chunks in parallel to get better ILP */
+	if (likely(len >= 80)) {
+		u64 temp64_2 = 0;
+		do {
+			temp64 = update_csum_40b(temp64, buff);
+			temp64_2 = update_csum_40b(temp64_2, buff + 40);
+			buff += 80;
+			len -= 80;
+		} while (len >= 80);
+		asm("addq %1,%0\n\t"
+		    "adcq $0,%0"
+		    :"+r" (temp64): "r" (temp64_2));
 	}
 
 	if (len & 32) {
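
For reference, below is a stand-alone user-space sketch (not part of the patch) of the structure the new code uses: the one's-complement sum is built in two independent 64-bit accumulators, one per 40-byte chunk, and the accumulators are merged with an end-around carry at the end. The helper names (add64, csum_40b, fold16) are made up for illustration, the carry is folded with the GCC/Clang __builtin_add_overflow() builtin instead of ADDQ/ADCQ, and only lengths that are a multiple of 80 bytes are handled to keep it short. The comparison against a plain 16-bit word sum works because 2^16 - 1 divides 2^64 - 1, so the wide accumulation folds to the same 16-bit value (RFC 1071), and because one's-complement addition is commutative, so splitting the stream across two accumulators does not change the result.

/*
 * csum40_sketch.c - illustrative model only; names and layout are not
 * kernel APIs.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* One's-complement 64-bit add: add, then fold the carry back in. */
static uint64_t add64(uint64_t sum, uint64_t v)
{
	unsigned long long r;
	int carry = __builtin_add_overflow((unsigned long long)sum, v, &r);

	return (uint64_t)r + (uint64_t)carry;
}

/* Model of update_csum_40b(): fold five 64-bit words into @sum. */
static uint64_t csum_40b(uint64_t sum, const unsigned char *p)
{
	for (int i = 0; i < 5; i++) {
		uint64_t w;

		memcpy(&w, p + 8 * i, sizeof(w));
		sum = add64(sum, w);
	}
	return sum;
}

/* Fold a 64-bit accumulator down to a 16-bit checksum value. */
static uint16_t fold16(uint64_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

int main(void)
{
	unsigned char buf[160];		/* length is a multiple of 80 here */
	uint64_t a = 0, b = 0, ref = 0;

	for (size_t i = 0; i < sizeof(buf); i++)
		buf[i] = (unsigned char)(i * 37 + 1);

	/* Two 40-byte chunks per iteration, as in the patched loop. */
	for (size_t off = 0; off < sizeof(buf); off += 80) {
		a = csum_40b(a, buf + off);
		b = csum_40b(b, buf + off + 40);
	}
	a = add64(a, b);	/* merge, like the trailing ADDQ/ADCQ pair */

	/* Reference: plain 16-bit word sum per RFC 1071. */
	for (size_t i = 0; i < sizeof(buf); i += 2) {
		uint16_t w;

		memcpy(&w, buf + i, sizeof(w));
		ref += w;
	}

	assert(fold16(a) == fold16(ref));
	printf("folded checksum: 0x%04x\n", fold16(a));
	return 0;
}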