lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Fri, 26 Feb 2016 12:03:15 -0800
From:	Tom Herbert <tom@...bertland.com>
To:	<davem@...emloft.net>, <netdev@...r.kernel.org>
CC:	<torvalds@...ux-foundation.org>, <tglx@...utronix.de>,
	<mingo@...hat.com>, <hpa@...or.com>, <x86@...nel.org>,
	<kernel-team@...com>
Subject: [PATCH v4 net-next] net: Implement fast csum_partial for x86_64

This patch implements performant csum_partial for x86_64. The intent is
to speed up checksum calculation, particularly for smaller lengths such
as those that are present when doing skb_postpull_rcsum when getting
CHECKSUM_COMPLETE from device or after CHECKSUM_UNNECESSARY conversion.

- v4
   - went back to C code with inline assembly for critical routines
   - implemented suggestion from Linus to deal with lengths < 8

Testing:

Correctness:

Verified correctness by testing arbitrary length buffer filled with
random data. For each buffer I compared the computed checksum
using the original algorithm for each possible alignment (0-7 bytes).

Performance:

Isolating old and new implementation for some common cases:

             Old      New     %
    Len/Aln  nsecs    nsecs   Improv
    --------+-------+--------+-------
    1400/0    195.6    181.7   7%     (Big packet)
    40/0      11.4     6.2     45%    (Ipv6 hdr cmn case)
    8/4       7.9      3.2     59%    (UDP, VXLAN in IPv4)
    14/0      8.9      5.9     33%    (Eth hdr)
    14/4      9.2      5.9     35%    (Eth hdr in IPv4)
    14/3      9.6      5.9     38%    (Eth with odd align)
    20/0      9.0      6.2     31%    (IP hdr without options)
    7/1       8.9      4.2     52%    (buffer in one quad)
    100/0    17.4     13.9     20%    (medium-sized pkt)
    100/2    17.8     14.2     20%    (medium-sized pkt w/ alignment)

Results from: Intel(R) Xeon(R) CPU X5650 @ 2.67GHz

Also tested on these with similar results:

Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz
Intel(R) Xeon(R) CPU E5-2680 v2 @ 2.80GHz

Branch prediction:

To test the effects of poor branch prediction in the jump tables I
tested checksum performance with runs for two combinations of length
and alignment. As the baseline I performed the test by doing half of
calls with the first combination, followed by using the second
combination for the second half. In the test case, I interleave the
two combinations so that in every call the length and alignment are
different to defeat the effects of branch prediction. Running several
cases, I did not see any material performance difference between the
two scenarios (perf stat output is below), neither does either case
show a significant number of branch misses.

Interleave lengths case:

perf stat --repeat 10 -e '{instructions, branches, branch-misses}' \
    ./csum -M new-thrash -I -l 100 -S 24 -a 1 -c 100000000

 Performance counter stats for './csum -M new-thrash -I -l 100 -S 24 -a 1 -c 100000000' (10 runs):

     9,556,693,202      instructions               ( +-  0.00% )
     1,176,208,640       branches                                                     ( +-  0.00% )
            19,487       branch-misses            #    0.00% of all branches          ( +-  6.07% )

       2.049732539 seconds time elapsed

Non-interleave case:

perf stat --repeat 10 -e '{instructions, branches, branch-misses}' \
     ./csum -M new-thrash -l 100 -S 24 -a 1 -c 100000000

Performance counter stats for './csum -M new-thrash -l 100 -S 24 -a 1 -c 100000000' (10 runs):

     9,782,188,310      instructions               ( +-  0.00% )
     1,251,286,958       branches                                                     ( +-  0.01% )
            18,950       branch-misses            #    0.00% of all branches          ( +- 12.74% )

       2.271789046 seconds time elapsed

Signed-off-by: Tom Herbert <tom@...bertland.com>
---
 arch/x86/include/asm/checksum_64.h |  21 ++++
 arch/x86/lib/csum-partial_64.c     | 225 ++++++++++++++++++++-----------------
 2 files changed, 143 insertions(+), 103 deletions(-)

diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index cd00e17..e20c35b 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -188,6 +188,27 @@ static inline unsigned add32_with_carry(unsigned a, unsigned b)
 	return a;
 }
 
+static inline unsigned long add64_with_carry(unsigned long a, unsigned long b)
+{
+	asm("addq %2,%0\n\t"
+	    "adcq $0,%0"
+	    : "=r" (a)
+	    : "0" (a), "rm" (b));
+	return a;
+}
+
+static inline unsigned int add32_with_carry3(unsigned int a, unsigned int b,
+					     unsigned int c)
+{
+	asm("addl %2,%0\n\t"
+	    "adcl %3,%0\n\t"
+	    "adcl $0,%0"
+	    : "=r" (a)
+	    : "0" (a), "rm" (b), "rm" (c));
+
+	return a;
+}
+
 #define HAVE_ARCH_CSUM_ADD
 static inline __wsum csum_add(__wsum csum, __wsum addend)
 {
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 9845371..df82c9b 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -8,114 +8,66 @@
 #include <linux/compiler.h>
 #include <linux/module.h>
 #include <asm/checksum.h>
+#include <asm/word-at-a-time.h>
 
-static inline unsigned short from32to16(unsigned a) 
+static inline unsigned long rotate_by8_if_odd(unsigned long sum, bool aligned)
 {
-	unsigned short b = a >> 16; 
-	asm("addw %w2,%w0\n\t"
-	    "adcw $0,%w0\n" 
-	    : "=r" (b)
-	    : "0" (b), "r" (a));
-	return b;
+	if (unlikely(aligned))
+		asm("rorq $0x8,%0"
+			: "=r" (sum)
+			: "0" (sum));
+	return sum;
 }
 
-/*
- * Do a 64-bit checksum on an arbitrary memory area.
- * Returns a 32bit checksum.
- *
- * This isn't as time critical as it used to be because many NICs
- * do hardware checksumming these days.
- * 
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
- */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+/* Extract the first "len" (high order in nbo) bytes of quad "val". */
+static inline unsigned long csum_partial_lt8_head(unsigned long val, int len)
 {
-	unsigned odd, count;
-	unsigned long result = 0;
+	return val & ((1ul << len * 8) - 1);
+}
 
-	if (unlikely(len == 0))
-		return result; 
-	odd = 1 & (unsigned long) buff;
-	if (unlikely(odd)) {
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
-
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) { 
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
-
-			/* last up to 7 8byte blocks */
-			count %= 8; 
-			while (count) { 
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n" 
-					    : "=r" (result)
-				    : "m" (*(unsigned long *)buff), 
-				    "r" (zero),  "0" (result));
-				--count; 
-					buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff); 
-
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
-	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff); 
-	if (unlikely(odd)) { 
-		result = from32to16(result);
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+/* Extract the last "len" (low order in nbo) bytes of quad "val". */
+static inline unsigned long csum_partial_lt8_tail(unsigned long val, int len)
+{
+	return val >> (8 * (8 - len));
+}
+
+/* Sum over buffer up to 64 bytes. */
+static unsigned long csum_partial_le64(const void *buff, unsigned int len,
+				       unsigned long sum)
+{
+	asm("lea 40f(, %[slen], 4), %%r11\n\t"
+	    "clc\n\t"
+	    "jmpq *%%r11\n\t"
+	    "adcq 7*8(%[src]),%[res]\n\t"
+	    "adcq 6*8(%[src]),%[res]\n\t"
+	    "adcq 5*8(%[src]),%[res]\n\t"
+	    "adcq 4*8(%[src]),%[res]\n\t"
+	    "adcq 3*8(%[src]),%[res]\n\t"
+	    "adcq 2*8(%[src]),%[res]\n\t"
+	    "adcq 1*8(%[src]),%[res]\n\t"
+	    "adcq 0*8(%[src]),%[res]\n\t"
+	    "nop\n\t"
+	    "40: adcq $0,%[res]"
+		    : [res] "=r" (sum)
+		    : [src] "r" (buff),
+		      [slen] "r" (-((unsigned long)(len >> 3))), "[res]" (sum)
+		    : "r11");
+
+	if (len & 0x7) {
+		unsigned long val;
+		/*
+		 * Backtrack in the buffer to load the outstanding bytes into
+		 * the low order bytes of a quad and sum those in
+		 * csum_partial_lt8_tail. The buffer extends at least eight
+		 * bytes before buff + len, avoiding load_unaligned_zeropad.
+		 */
+
+		val = csum_partial_lt8_tail(*(unsigned long *)(buff + len - 8),
+					    len & 0x7);
+		sum = add64_with_carry(val, sum);
 	}
-	return result;
+
+	return sum;
 }
 
 /*
@@ -130,10 +82,77 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
  *
  * it's best to have buff aligned on a 64-bit boundary
  */
+static unsigned int do_csum(const void *buff, int len, unsigned int sum)
+{
+	bool aligned = false;
+	unsigned long result = 0;
+
+	/*
+	 * For "small" lengths don't bother with alignment, x86 handles
+	 * this pretty well.
+	 */
+	if (len <= 8) {
+		unsigned long val;
+
+		/* Optimize trivial cases */
+		if (len == 8)
+			return add32_with_carry3(sum,
+				       *(unsigned int *)buff,
+				       *(unsigned int *)(buff + 4));
+		else if (len == 0)
+			return sum;
+
+		val = load_unaligned_zeropad(buff);
+		result = csum_partial_lt8_head(val, len);
+
+		return add32_with_carry3(sum, result >> 32,
+					 result & 0xffffffff);
+	} else if (len <= 64) {
+		result = csum_partial_le64(buff, len, result);
+
+		return add32_with_carry3(sum, result >> 32,
+					 result & 0xffffffff);
+	}
+
+	/*
+	 * Length is greater than 64. If the buffer starts at an odd
+	 * address, sum the leading bytes first so the byte-lane rotation
+	 * can be applied; even misalignment is handled fine by x86 loads.
+	 */
+	aligned = !!((unsigned long)buff & 0x1);
+	if (aligned) {
+		unsigned int align = 7 & -(unsigned long)buff;
+
+		result = csum_partial_lt8_head(*(unsigned long *)buff, align);
+		buff += align;
+		len -= align;
+		result = rotate_by8_if_odd(result, align);
+	}
+
+	/* main loop using 64byte blocks */
+	for (; len >= 64; len -= 64, buff += 64) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq 4*8(%[src]),%[res]\n\t"
+		    "adcq 5*8(%[src]),%[res]\n\t"
+		    "adcq 6*8(%[src]),%[res]\n\t"
+		    "adcq 7*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "=r" (result)
+		    : [src] "r" (buff),
+		    "[res]" (result));
+	}
+
+	result = csum_partial_le64(buff, len, result);
+	result = rotate_by8_if_odd(result, aligned);
+
+	return add32_with_carry3(sum, result >> 32, result & 0xffffffff);
+}
+
 __wsum csum_partial(const void *buff, int len, __wsum sum)
 {
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
+	return (__force __wsum)do_csum(buff, len, (__force u32)sum);
 }
 
 /*
-- 
2.6.5

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ