lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Mon, 13 Dec 2021 10:45:13 -0800
From:   Eric Dumazet <edumazet@...gle.com>
To:     David Laight <David.Laight@...lab.com>
Cc:     Noah Goldstein <goldstein.w.n@...il.com>,
        "tglx@...utronix.de" <tglx@...utronix.de>,
        "mingo@...hat.com" <mingo@...hat.com>,
        Borislav Petkov <bp@...en8.de>,
        "dave.hansen@...ux.intel.com" <dave.hansen@...ux.intel.com>,
        X86 ML <x86@...nel.org>, "hpa@...or.com" <hpa@...or.com>,
        "peterz@...radead.org" <peterz@...radead.org>,
        "alexanderduyck@...com" <alexanderduyck@...com>,
        open list <linux-kernel@...r.kernel.org>,
        netdev <netdev@...r.kernel.org>
Subject: Re: [PATCH] lib/x86: Optimise csum_partial of buffers that are not
 multiples of 8 bytes.

On Mon, Dec 13, 2021 at 10:00 AM David Laight <David.Laight@...lab.com> wrote:
>
>
> Add in the trailing bytes first so that there is no need to worry
> about the sum exceeding 64 bits.
>
> Signed-off-by: David Laight <david.laight@...lab.com>
> ---
>
> This ought to be faster - because of all the removed 'adc $0'.
> Guessing how fast x86 code will run is hard!
> There are other ways of handling buffers that are shorter than 8 bytes,
> but I'd rather hope they don't happen in any hot paths.
>
> Note - I've not even compile tested it.
> (But have tested an equivalent change before.)
>
>  arch/x86/lib/csum-partial_64.c | 55 ++++++++++++----------------------
>  1 file changed, 19 insertions(+), 36 deletions(-)
>
> diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> index abf819dd8525..fbcc073fc2b5 100644
> --- a/arch/x86/lib/csum-partial_64.c
> +++ b/arch/x86/lib/csum-partial_64.c
> @@ -37,6 +37,24 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
>         u64 temp64 = (__force u64)sum;
>         unsigned result;
>
> +       if (len & 7) {
> +               if (unlikely(len < 8)) {
> +                       /* Avoid falling off the start of the buffer */
> +                       if (len & 4) {
> +                               temp64 += *(u32 *)buff;
> +                               buff += 4;
> +                       }
> +                       if (len & 2) {
> +                               temp64 += *(u16 *)buff;
> +                               buff += 2;
> +                       }
> +                       if (len & 1)
> +                               temp64 += *(u8 *)buff;
> +                       goto reduce_to32;
> +               }
> +               temp64 += *(u64 *)(buff + len - 8) << (8 - (len & 7)) * 8;

This is reading far away (end of buffer).

Maybe instead read the first bytes and adjust @buff, to allow for
better hardware prefetching ?



> +       }
> +
>         while (unlikely(len >= 64)) {
>                 asm("addq 0*8(%[src]),%[res]\n\t"
>                     "adcq 1*8(%[src]),%[res]\n\t"
> @@ -82,43 +100,8 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
>                         : "memory");
>                 buff += 8;
>         }
> -       if (len & 7) {
> -#ifdef CONFIG_DCACHE_WORD_ACCESS
> -               unsigned int shift = (8 - (len & 7)) * 8;
> -               unsigned long trail;
> -
> -               trail = (load_unaligned_zeropad(buff) << shift) >> shift;
>
> -               asm("addq %[trail],%[res]\n\t"
> -                   "adcq $0,%[res]"
> -                       : [res] "+r" (temp64)
> -                       : [trail] "r" (trail));
> -#else
> -               if (len & 4) {
> -                       asm("addq %[val],%[res]\n\t"
> -                           "adcq $0,%[res]"
> -                               : [res] "+r" (temp64)
> -                               : [val] "r" ((u64)*(u32 *)buff)
> -                               : "memory");
> -                       buff += 4;
> -               }
> -               if (len & 2) {
> -                       asm("addq %[val],%[res]\n\t"
> -                           "adcq $0,%[res]"
> -                               : [res] "+r" (temp64)
> -                               : [val] "r" ((u64)*(u16 *)buff)
> -                               : "memory");
> -                       buff += 2;
> -               }
> -               if (len & 1) {
> -                       asm("addq %[val],%[res]\n\t"
> -                           "adcq $0,%[res]"
> -                               : [res] "+r" (temp64)
> -                               : [val] "r" ((u64)*(u8 *)buff)
> -                               : "memory");
> -               }
> -#endif
> -       }
> +reduce_to32:
>         result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
>         return (__force __wsum)result;
>  }
> --
> 2.17.1
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ