netdev - [PATCH v2] x86: Optimise x86 IP checksum code

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <d7d167a576934b149a90e46e2247b91b@AcuMS.aculab.com>
Date:   Sun, 10 May 2020 19:03:00 +0000
From:   David Laight <David.Laight@...LAB.COM>
To:     'linux-kernel' <linux-kernel@...r.kernel.org>,
        "'x86@...nel.org'" <x86@...nel.org>,
        'Thomas Gleixner' <tglx@...utronix.de>
CC:     "Peter Zijlstra (Intel)" <peterz@...radead.org>,
        "netdev@...r.kernel.org" <netdev@...r.kernel.org>
Subject: [PATCH v2] x86: Optimise x86 IP checksum code

Performance improvements to the amd64 IP checksum code.
Summing to alternate registers almost doubles the performace
(probably from 4 to 6.2 bytes/clock) on Ivy Bridge cpu.
Loop carrying the carry flag improves Haswell from 7 to 8 bytes/clock.
Older cpu will still approach 4 bytes/clock.
All achieved with a less loop unrolling - improving the performance
for small buffers that are not a multiple of the loop span.

Signed-off-by: David Laight <david.laight@...lab.com>
--

Changes since v1:
- Added cast to result.

I spent far too long looking at this for some code that has to calculate the
UDP checksum before sending packets through a raw IPv4 socket.

Prior to Ivy (maybe Sandy) bridge adc always took two clocks, so the adc chain
can only run at 4 bytes/clock (the same as 32bit adds to a 64 bit register).
In Ivy bridge the 'carry' flag is available a clock earlier so 8 bytes/clock
is technically possible.

In order to 'loop carry' the carry flag some ingenuity is needed.
Although 'dec reg' doesn't change carry, it has a partial dependency against
the flags register - which makes the loop very slow.
The loop instruction (dec %cx, jump non-zero) is far too slow on Intel cpu.
But jcxz (jump if %cx zero) is about the same as a normal conditional jump.

I did get about 12 bytes/clock using adox/adcx but that would need run-time
patching and some cpu that support the instructions may run them very slowly.

arch/x86/lib/csum-partial_64.c | 192 +++++++++++++++++++++--------------------
 1 file changed, 98 insertions(+), 94 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d6..7f25ab2 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -10,113 +10,118 @@
 #include <linux/export.h>
 #include <asm/checksum.h>
 
-static inline unsigned short from32to16(unsigned a) 
-{
-	unsigned short b = a >> 16; 
-	asm("addw %w2,%w0\n\t"
-	    "adcw $0,%w0\n" 
-	    : "=r" (b)
-	    : "0" (b), "r" (a));
-	return b;
-}
-
 /*
  * Do a 64-bit checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
  *
  * This isn't as time critical as it used to be because many NICs
  * do hardware checksumming these days.
+ *
+ * All Intel cpus prior to Ivy Bridge (mayby Sandy Bridge) have a 2 clock
+ * latency on the result of adc.
+ * This limits any adc loop to 4 bytes/clock - the same as a C loop
+ * that adds 32 bit values to a 64 bit register.
+ * On Ivy bridge the adc result latency is still 2 clocks, but the carry
+ * latency is reduced to 1 clock.
+ * So summing to alternate registers increases the throughput to approaching
+ * 8 bytes/clock.
+ * Older cpu (eg core 2) have a 6+ clock delay accessing any of the flags
+ * after a partial update (eg adc after inc).
+ * The stange 'jecxz' loop avoids this.
+ * The Ivy bridge then needs the loop unrolling once more to approach
+ * 8 bytes per clock.
  * 
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
+ * On 7th gen cpu using adox/adoc can get 12 bytes/clock (maybe 16?)
+ * provided the loop is unrolled further than the one below.
+ * But some other cpu that support the adx take 6 clocks for each.
+ *
+ * The 'sum' value on entry is added in, it can exceed 32 bits but
+ * must not get to 56 bits.
  */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+static unsigned do_csum(const unsigned char *buff, long len, u64 sum)
 {
-	unsigned odd, count;
-	unsigned long result = 0;
+	unsigned int src_align;
+	u64 sum_0 = 0, sum_1;
+	long len_tmp;
+	bool odd = false;
 
-	if (unlikely(len == 0))
-		return result; 
-	odd = 1 & (unsigned long) buff;
-	if (unlikely(odd)) {
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
+	/* 64bit align the base address */
+	src_align = (unsigned long)buff & 7;
+	if (src_align) {
+		if (unlikely(src_align & 1)) {
+			sum <<= 8;
+			/* The extra flag generates better code! */
+			odd = true;
 		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
+		buff -= src_align;
+		len += src_align;
+		if (likely(src_align == 4))
+			sum_0 = *(u32 *)(buff + 4);
+		else
+			/* Mask off unwanted low bytes from full 64bit word */
+			sum_0 = *(u64 *)buff & (~0ull << (src_align * 8));
+		if (unlikely(len < 8)) {
+			/* Mask off the unwanted high bytes */
+			sum += sum_0 & ~(~0ull << (len * 8));
+			goto reduce_32;
+		}
+		len -= 8;
+		buff += 8;
+	}
 
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) { 
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
+	/* Read first 8 bytes to 16 byte align the loop below */
+	sum_1 = len & 8 ? *(u64 *)buff : 0;
 
-			/* last up to 7 8byte blocks */
-			count %= 8; 
-			while (count) { 
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n" 
-					    : "=r" (result)
-				    : "m" (*(unsigned long *)buff), 
-				    "r" (zero),  "0" (result));
-				--count; 
-				buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff); 
+	/* The main loop uses negative offsets from the end of the buffer */
+	buff += len;
 
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
+	/* Add in trailing bytes to 64bit align the length */
+	if (len & 7) {
+		unsigned int tail_len = len & 7;
+		buff -= tail_len;
+		if (likely(tail_len == 4))
+			sum += *(u32 *)buff;
+		else
+			/* Mask off the unwanted high bytes */
+			sum += *(u64 *)buff & ~(~0ull << (tail_len * 8));
 	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff); 
-	if (unlikely(odd)) { 
-		result = from32to16(result);
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-	}
-	return result;
+
+	/* Align and negate len so that we need to sum [buff[len]..buf[0]) */
+	len = -(len & ~15);
+
+	/*
+	 * Align the byte count to a multiple of 16 then
+	 * add 64 bit words to alternating registers.
+	 * Finally reduce to 64 bits.
+	 */
+	asm(	"	bt    $4, %[len]\n"
+		"	jnc   10f\n"
+		"	add   (%[buff], %[len]), %[sum_0]\n"
+		"	adc   8(%[buff], %[len]), %[sum_1]\n"
+		"	lea   16(%[len]), %[len]\n"
+		"10:	jecxz 20f\n"
+		"	adc   (%[buff], %[len]), %[sum_0]\n"
+		"	adc   8(%[buff], %[len]), %[sum_1]\n"
+		"	lea   32(%[len]), %[len_tmp]\n"
+		"	adc   16(%[buff], %[len]), %[sum_0]\n"
+		"	adc   24(%[buff], %[len]), %[sum_1]\n"
+		"	mov   %[len_tmp], %[len]\n"
+		"	jmp   10b\n"
+		"20:	adc   %[sum_0], %[sum]\n"
+		"	adc   %[sum_1], %[sum]\n"
+		"	adc   $0, %[sum]\n"
+	    : [sum] "+&r" (sum), [sum_0] "+&r" (sum_0), [sum_1] "+&r" (sum_1),
+	    	[len] "+&c" (len), [len_tmp] "=&r" (len_tmp)
+	    : [buff] "r" (buff)
+	    : "memory" );
+
+reduce_32:
+	sum = add32_with_carry(sum>>32, sum & 0xffffffff); 
+
+	if (unlikely(odd))
+		return __builtin_bswap32(sum);
+
+	return sum;
 }
 
 /*
@@ -133,8 +138,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
  */
 __wsum csum_partial(const void *buff, int len, __wsum sum)
 {
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
+	return (__force __wsum)do_csum(buff, len, (__force u32)sum);
 }
 EXPORT_SYMBOL(csum_partial);
 
-- 
1.8.1.2

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)