[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Thu, 11 Nov 2021 10:44:19 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Eric Dumazet <eric.dumazet@...il.com>
Cc: "David S . Miller" <davem@...emloft.net>,
Jakub Kicinski <kuba@...nel.org>,
netdev <netdev@...r.kernel.org>,
Eric Dumazet <edumazet@...gle.com>, x86@...nel.org,
Alexander Duyck <alexander.duyck@...il.com>
Subject: Re: [RFC] x86/csum: rewrite csum_partial()
On Thu, Nov 11, 2021 at 10:10:19AM +0100, Peter Zijlstra wrote:
> On Wed, Nov 10, 2021 at 10:53:22PM -0800, Eric Dumazet wrote:
> > + /*
> > + * This implements an optimized version of
> > + * switch (dwords) {
> > + * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
> > + * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
> > + * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
> > + * ...
> > + * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
> > + * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
> > + * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
> > + * }
> > + *
> > + * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
> > + */
> > + asm(" call 1f\n"
> > + "1: pop %[dest]\n"
>
> That's terrible. I think on x86_64 we can do: lea (%%rip), %[dest], not
> sure what would be the best way on i386.
>
> > + " lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
> > + " clc\n"
> > + " jmp *%[dest]\n .align 4\n"
>
> That's an indirect branch, you can't do that these days. This would need
> to use JMP_NOSPEC (except we don't have a !ASSEMBLER version of that).
> But that would also completely and utterly destroy performance.
>
> Also, objtool would complain about this if it hadn't tripped over that
> first instruction:
>
> arch/x86/lib/csum-partial_64.o: warning: objtool: do_csum()+0x84: indirect jump found in RETPOLINE build
>
> I'm not sure what the best way is to unroll loops without using computed
> gotos/jump-tables though :/
>
> > + "2:\n"
> > + " adcl 14*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 13*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 12*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 11*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 10*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 9*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 8*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 7*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 6*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 5*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 4*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 3*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 2*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 1*4(%[src]),%[res]\n .align 4\n"
> > + " adcl 0*4(%[src]),%[res]\n"
> > + " adcl $0,%[res]"
>
> If only the CPU would accept: REP ADCL (%%rsi), %[res] :/
>
> > + : [res] "=r" (result), [dest] "=&r" (dest)
> > + : [src] "r" (buff), "[res]" (result),
> > + [skip] "r" (dwords ^ 15)
> > + : "memory");
> > + }
This is the best I could come up with ...
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -32,7 +32,6 @@ static inline unsigned short from32to16(
*/
static unsigned do_csum(const unsigned char *buff, unsigned len)
{
- unsigned long dwords;
unsigned odd, result = 0;
odd = 1 & (unsigned long) buff;
@@ -64,50 +63,54 @@ static unsigned do_csum(const unsigned c
result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
}
- dwords = len >> 2;
- if (dwords) { /* dwords is in [1..15] */
- unsigned long dest;
-
- /*
- * This implements an optimized version of
- * switch (dwords) {
- * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
- * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
- * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
- * ...
- * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
- * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
- * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
- * }
- *
- * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
- */
- asm(" call 1f\n"
- "1: pop %[dest]\n"
- " lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
- " clc\n"
- " jmp *%[dest]\n .align 4\n"
- "2:\n"
- " adcl 14*4(%[src]),%[res]\n .align 4\n"
- " adcl 13*4(%[src]),%[res]\n .align 4\n"
- " adcl 12*4(%[src]),%[res]\n .align 4\n"
- " adcl 11*4(%[src]),%[res]\n .align 4\n"
- " adcl 10*4(%[src]),%[res]\n .align 4\n"
- " adcl 9*4(%[src]),%[res]\n .align 4\n"
- " adcl 8*4(%[src]),%[res]\n .align 4\n"
- " adcl 7*4(%[src]),%[res]\n .align 4\n"
- " adcl 6*4(%[src]),%[res]\n .align 4\n"
- " adcl 5*4(%[src]),%[res]\n .align 4\n"
- " adcl 4*4(%[src]),%[res]\n .align 4\n"
- " adcl 3*4(%[src]),%[res]\n .align 4\n"
- " adcl 2*4(%[src]),%[res]\n .align 4\n"
- " adcl 1*4(%[src]),%[res]\n .align 4\n"
- " adcl 0*4(%[src]),%[res]\n"
- " adcl $0,%[res]"
- : [res] "=r" (result), [dest] "=&r" (dest)
- : [src] "r" (buff), "[res]" (result),
- [skip] "r" (dwords ^ 15)
- : "memory");
+ if (len >> 2) { /* dwords is in [1..15] */
+ if (len >= 32) {
+ asm(" addl 0*4(%[src]),%[res]\n"
+ " adcl 1*4(%[src]),%[res]\n"
+ " adcl 2*4(%[src]),%[res]\n"
+ " adcl 3*4(%[src]),%[res]\n"
+ " adcl 4*4(%[src]),%[res]\n"
+ " adcl 5*4(%[src]),%[res]\n"
+ " adcl 6*4(%[src]),%[res]\n"
+ " adcl 7*4(%[src]),%[res]\n"
+ " adcl $0,%[res]"
+ : [res] "=r" (result)
+ : [src] "r" (buff), "[res]" (result)
+ : "memory");
+ buff += 32;
+ len -= 32;
+ }
+ if (len >= 16) {
+ asm(" addl 0*4(%[src]),%[res]\n"
+ " adcl 1*4(%[src]),%[res]\n"
+ " adcl 2*4(%[src]),%[res]\n"
+ " adcl 3*4(%[src]),%[res]\n"
+ " adcl $0,%[res]"
+ : [res] "=r" (result)
+ : [src] "r" (buff), "[res]" (result)
+ : "memory");
+ buff += 16;
+ len -= 16;
+ }
+ if (len >= 8) {
+ asm(" addl 0*4(%[src]),%[res]\n"
+ " adcl 1*4(%[src]),%[res]\n"
+ " adcl $0,%[res]"
+ : [res] "=r" (result)
+ : [src] "r" (buff), "[res]" (result)
+ : "memory");
+ buff += 8;
+ len -= 8;
+ }
+ if (len >= 4) {
+ asm(" addl 0*4(%[src]),%[res]\n"
+ " adcl $0,%[res]"
+ : [res] "=r" (result)
+ : [src] "r" (buff), "[res]" (result)
+ : "memory");
+ buff += 4;
+ len -= 4;
+ }
}
if (len & 3U) {
@@ -120,7 +123,7 @@ static unsigned do_csum(const unsigned c
if (len & 1)
result += *buff;
}
- if (unlikely(odd)) {
+ if (unlikely(odd)) {
result = from32to16(result);
result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
}
Powered by blists - more mailing lists