Date: Thu, 11 Nov 2021 08:02:07 -0800
From: Eric Dumazet <edumazet@...gle.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: Eric Dumazet <eric.dumazet@...il.com>,
"David S . Miller" <davem@...emloft.net>,
Jakub Kicinski <kuba@...nel.org>,
netdev <netdev@...r.kernel.org>, x86@...nel.org,
Alexander Duyck <alexander.duyck@...il.com>
Subject: Re: [RFC] x86/csum: rewrite csum_partial()
On Thu, Nov 11, 2021 at 1:44 AM Peter Zijlstra <peterz@...radead.org> wrote:
>
> On Thu, Nov 11, 2021 at 10:10:19AM +0100, Peter Zijlstra wrote:
> > On Wed, Nov 10, 2021 at 10:53:22PM -0800, Eric Dumazet wrote:
> > > + /*
> > > + * This implements an optimized version of
> > > + * switch (dwords) {
> > > + * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
> > > + * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
> > > + * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
> > > + * ...
> > > + * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
> > > + * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
> > > + * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
> > > + * }
> > > + *
> > > + * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
> > > + */
> > > + asm(" call 1f\n"
> > > + "1: pop %[dest]\n"
> >
> > That's terrible. I think on x86_64 we can do: lea (%%rip), %[dest];
> > not sure what the best way would be on i386.
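> >
> > That is, the call/pop/lea triple collapses to (untested; RIP-relative
> > addressing takes no index register, so the scaled skip still needs
> > its own lea):
> >
> >	" lea 2f(%%rip), %[dest]\n"
> >	" lea (%[dest],%[skip],4), %[dest]\n"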
> >
> > > + " lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
> > > + " clc\n"
> > > + " jmp *%[dest]\n .align 4\n"
> >
> > That's an indirect branch; you can't do that these days. This would
> > need to use JMP_NOSPEC (except we don't have a !ASSEMBLER version of
> > that). But that would also completely and utterly destroy performance.
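> >
> > (With retpolines the indirect jmp gets rewritten to jump through a
> > thunk that traps speculation, roughly:
> >
> >	call 1f
> > 0:	pause
> >	lfence
> >	jmp 0b
> > 1:	mov %[dest], (%rsp)
> >	ret
> >
> > so every dispatch eats a guaranteed-mispredicted ret.)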
> >
> > Also, objtool would complain about this if it hadn't tripped over that
> > first instruction:
> >
> > arch/x86/lib/csum-partial_64.o: warning: objtool: do_csum()+0x84: indirect jump found in RETPOLINE build
> >
> > I'm not sure what the best way is to unroll loops without using computed
> > gotos/jump-tables though :/
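> >
> > (The C-level spelling would be a GNU computed goto -- untested,
> > names made up, shown for 4 dwords only:
> >
> >	static unsigned csum_tail(const u32 *buf32, unsigned dwords,
> >				  unsigned result)
> >	{
> >		static void *const jtbl[] = { &&d0, &&d1, &&d2, &&d3 };
> >
> >		goto *jtbl[dwords & 3];
> >	d3:	result = add32_with_carry(result, buf32[2]);
> >	d2:	result = add32_with_carry(result, buf32[1]);
> >	d1:	result = add32_with_carry(result, buf32[0]);
> >	d0:	return result;
> >	}
> >
> > but gcc lowers that goto to the very same indirect jmp, so a
> > retpoline build is no better off.)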
> >
> > > + "2:\n"
> > > + " adcl 14*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 13*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 12*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 11*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 10*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 9*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 8*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 7*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 6*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 5*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 4*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 3*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 2*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 1*4(%[src]),%[res]\n .align 4\n"
> > > + " adcl 0*4(%[src]),%[res]\n"
> > > + " adcl $0,%[res]"
> >
> > If only the CPU would accept: REP ADCL (%%rsi), %[res] :/
> >
> > > + : [res] "=r" (result), [dest] "=&r" (dest)
> > > + : [src] "r" (buff), "[res]" (result),
> > > + [skip] "r" (dwords ^ 15)
> > > + : "memory");
> > > + }
>
> This is the best I could come up with ...
>
> --- a/arch/x86/lib/csum-partial_64.c
> +++ b/arch/x86/lib/csum-partial_64.c
> @@ -32,7 +32,6 @@ static inline unsigned short from32to16(
> */
> static unsigned do_csum(const unsigned char *buff, unsigned len)
> {
> - unsigned long dwords;
> unsigned odd, result = 0;
>
> odd = 1 & (unsigned long) buff;
> @@ -64,50 +63,54 @@ static unsigned do_csum(const unsigned c
> result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
> }
>
> - dwords = len >> 2;
> - if (dwords) { /* dwords is in [1..15] */
> - unsigned long dest;
> -
> - /*
> - * This implements an optimized version of
> - * switch (dwords) {
> - * case 15: res = add_with_carry(res, buf32[14]); fallthrough;
> - * case 14: res = add_with_carry(res, buf32[13]); fallthrough;
> - * case 13: res = add_with_carry(res, buf32[12]); fallthrough;
> - * ...
> - * case 3: res = add_with_carry(res, buf32[2]); fallthrough;
> - * case 2: res = add_with_carry(res, buf32[1]); fallthrough;
> - * case 1: res = add_with_carry(res, buf32[0]); fallthrough;
> - * }
> - *
> - * "adcl 8byteoff(%reg1),%reg2" are using either 3 or 4 bytes.
> - */
> - asm(" call 1f\n"
> - "1: pop %[dest]\n"
> - " lea (2f-1b)(%[dest],%[skip],4),%[dest]\n"
> - " clc\n"
> - " jmp *%[dest]\n .align 4\n"
> - "2:\n"
> - " adcl 14*4(%[src]),%[res]\n .align 4\n"
> - " adcl 13*4(%[src]),%[res]\n .align 4\n"
> - " adcl 12*4(%[src]),%[res]\n .align 4\n"
> - " adcl 11*4(%[src]),%[res]\n .align 4\n"
> - " adcl 10*4(%[src]),%[res]\n .align 4\n"
> - " adcl 9*4(%[src]),%[res]\n .align 4\n"
> - " adcl 8*4(%[src]),%[res]\n .align 4\n"
> - " adcl 7*4(%[src]),%[res]\n .align 4\n"
> - " adcl 6*4(%[src]),%[res]\n .align 4\n"
> - " adcl 5*4(%[src]),%[res]\n .align 4\n"
> - " adcl 4*4(%[src]),%[res]\n .align 4\n"
> - " adcl 3*4(%[src]),%[res]\n .align 4\n"
> - " adcl 2*4(%[src]),%[res]\n .align 4\n"
> - " adcl 1*4(%[src]),%[res]\n .align 4\n"
> - " adcl 0*4(%[src]),%[res]\n"
> - " adcl $0,%[res]"
> - : [res] "=r" (result), [dest] "=&r" (dest)
> - : [src] "r" (buff), "[res]" (result),
> - [skip] "r" (dwords ^ 15)
> - : "memory");
> + if (len >> 2) { /* dwords is in [1..15] */
> + if (len >= 32) {
> + asm(" addl 0*4(%[src]),%[res]\n"
> + " adcl 1*4(%[src]),%[res]\n"
> + " adcl 2*4(%[src]),%[res]\n"
> + " adcl 3*4(%[src]),%[res]\n"
> + " adcl 4*4(%[src]),%[res]\n"
> + " adcl 5*4(%[src]),%[res]\n"
> + " adcl 6*4(%[src]),%[res]\n"
> + " adcl 7*4(%[src]),%[res]\n"
> + " adcl $0,%[res]"
> + : [res] "=r" (result)
> + : [src] "r" (buff), "[res]" (result)
> + : "memory");
> + buff += 32;
> + len -= 32;
> + }
> + if (len >= 16) {
> + asm(" addl 0*4(%[src]),%[res]\n"
> + " adcl 1*4(%[src]),%[res]\n"
> + " adcl 2*4(%[src]),%[res]\n"
> + " adcl 3*4(%[src]),%[res]\n"
> + " adcl $0,%[res]"
> + : [res] "=r" (result)
> + : [src] "r" (buff), "[res]" (result)
> + : "memory");
> + buff += 16;
> + len -= 16;
> + }
> + if (len >= 8) {
> + asm(" addl 0*4(%[src]),%[res]\n"
> + " adcl 1*4(%[src]),%[res]\n"
> + " adcl $0,%[res]"
> + : [res] "=r" (result)
> + : [src] "r" (buff), "[res]" (result)
> + : "memory");
> + buff += 8;
> + len -= 8;
> + }
> + if (len >= 4) {
> + asm(" addl 0*4(%[src]),%[res]\n"
> + " adcl $0,%[res]"
> + : [res] "=r" (result)
> + : [src] "r" (buff), "[res]" (result)
> + : "memory");
> + buff += 4;
> + len -= 4;
> + }
> }
>
> if (len & 3U) {
> @@ -120,7 +123,7 @@ static unsigned do_csum(const unsigned c
> if (len & 1)
> result += *buff;
> }
> - if (unlikely(odd)) {
> + if (unlikely(odd)) {
> result = from32to16(result);
> result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
> }
Thanks, Peter!
This is more or less the first version I wrote. (I was testing
(len & 32), (len & 16), ... so that I would not have to update len in
these blocks.)
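That variant looked roughly like this (one block per bit of len; only
buff advances, len is never rewritten since len < 64 at this point):

	if (len & 16) {
		asm("addl 0*4(%[src]),%[res]\n"
		    "adcl 1*4(%[src]),%[res]\n"
		    "adcl 2*4(%[src]),%[res]\n"
		    "adcl 3*4(%[src]),%[res]\n"
		    "adcl $0,%[res]"
		    : [res] "=r" (result)
		    : [src] "r" (buff), "[res]" (result)
		    : "memory");
		buff += 16;
	}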
Then, I tried to add an inline version, a la ip_fast_csum() but for IPv6.
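Something along these lines, summing the fixed 40-byte IPv6 header
with 64-bit adds (hypothetical sketch, helper name made up):

	static inline unsigned int ipv6_hdr_csum(const struct ipv6hdr *h)
	{
		unsigned long sum;

		asm("movq 0*8(%[h]),%[sum]\n"
		    "addq 1*8(%[h]),%[sum]\n"
		    "adcq 2*8(%[h]),%[sum]\n"
		    "adcq 3*8(%[h]),%[sum]\n"
		    "adcq 4*8(%[h]),%[sum]\n"
		    "adcq $0,%[sum]"
		    : [sum] "=r" (sum)
		    : [h] "r" (h)
		    : "memory");
		return add32_with_carry(sum >> 32, sum & 0xffffffff);
	}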
Then I came up with the version I sent; for some reason my .config
temporarily had CONFIG_RETPOLINE disabled.
Thanks for reminding me of this!
I had also missed this warning:
arch/x86/lib/csum-partial_64.o: warning: objtool: csum_partial()+0x2f:
unannotated intra-function call
I will spend a bit more time on this before sending a V2. Thanks again!