linux-kernel - Re: [PATCH v3] LoongArch: add checksum optimization for 64-bit system

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAAhV-H6T_F4=D1i6GkJTczHY5i-FSa6o_oXiRdZFGOXBU=pwXg@mail.gmail.com>
Date:   Wed, 1 Mar 2023 20:54:04 +0800
From:   Huacai Chen <chenhuacai@...nel.org>
To:     Bibo Mao <maobibo@...ngson.cn>
Cc:     WANG Xuerui <kernel@...0n.name>,
        David Laight <David.Laight@...lab.com>,
        Jiaxun Yang <jiaxun.yang@...goat.com>,
        loongarch@...ts.linux.dev, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3] LoongArch: add checksum optimization for 64-bit system

Hi, Bibo,

I found that the version here [1] provides more functions than this one.
So would it be possible to combine the advantages of both versions?

[1] https://github.com/loongson/linux/commit/92a6df48ccb73dd2c3dc1799add08adf0e0b0deb

Huacai

On Thu, Feb 16, 2023 at 9:09 PM Bibo Mao <maobibo@...ngson.cn> wrote:
>
> LoongArch is a 64-bit platform that supports 8-byte memory accesses,
> while the generic checksum function uses 4-byte memory accesses.
> This patch adds an 8-byte memory access optimization for the checksum
> functions on LoongArch. The code is derived from the arm64 implementation.
>
> When network hardware checksumming is disabled, iperf performance
> improves by about 10% with this patch.
>
> Signed-off-by: Bibo Mao <maobibo@...ngson.cn>
> ---
> Changelog:
> v3: modify accumulate() so that it is handled better on the LoongArch
>     platform; the difference may be a compiler optimization issue.
> v2: use the rotation API in csum_fold() to save one instruction.
> ---
>  arch/loongarch/include/asm/checksum.h |  65 ++++++++++++
>  arch/loongarch/lib/Makefile           |   2 +-
>  arch/loongarch/lib/csum.c             | 141 ++++++++++++++++++++++++++
>  3 files changed, 207 insertions(+), 1 deletion(-)
>  create mode 100644 arch/loongarch/include/asm/checksum.h
>  create mode 100644 arch/loongarch/lib/csum.c
>
> diff --git a/arch/loongarch/include/asm/checksum.h b/arch/loongarch/include/asm/checksum.h
> new file mode 100644
> index 000000000000..8a7d368d801d
> --- /dev/null
> +++ b/arch/loongarch/include/asm/checksum.h
> @@ -0,0 +1,65 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2016 ARM Ltd.
> + * Copyright (C) 2023 Loongson Technology Corporation Limited
> + */
> +#ifndef __ASM_CHECKSUM_H
> +#define __ASM_CHECKSUM_H
> +
> +#include <linux/in6.h>
> +
> +#define _HAVE_ARCH_IPV6_CSUM
> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
> +                       const struct in6_addr *daddr,
> +                       __u32 len, __u8 proto, __wsum sum);
> +
> +/*
> + * Fold a 32-bit partial checksum (e.g. the result of csum_partial) down
> + * to 16 bits and return the one's complement of the folded value.
> + */
> +static inline __sum16 csum_fold(__wsum sum)
> +{
> +       u32 val = (__force u32)sum;
> +       u32 folded;
> +
> +       /*
> +        * Adding the value to itself rotated by 16 bits leaves hi + lo in
> +        * the upper half; a carry out of the lower half lands there too.
> +        */
> +       folded = val + rol32(val, 16);
> +       return (__force __sum16)(~folded >> 16);
> +}
> +#define csum_fold csum_fold
> +
> +/*
> + * A version of ip_compute_csum() optimized for IP headers, which always
> + * checksum on 4-octet boundaries.  @ihl is the header length in 32-bit
> + * words and is always >= 5: the do/while below reads at least one word
> + * after the first 16 bytes.
> + */
> +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> +{
> +       __uint128_t head = *(const __uint128_t *)iph;
> +       const u32 *word = iph + 16;
> +       int left = (int)ihl - 4; /* signed, so the loop test can go negative */
> +       u64 acc;
> +
> +       /* add the two 64-bit halves; the carry-folded sum is the top half */
> +       head += (head >> 64) | (head << 64);
> +       acc = head >> 64;
> +       do {
> +               acc += *word++;
> +       } while (--left > 0);
> +
> +       /* fold 64 bits to 32 the same way, then let csum_fold() finish */
> +       acc += ror64(acc, 32);
> +       return csum_fold((__force __wsum)(acc >> 32));
> +}
> +#define ip_fast_csum ip_fast_csum
> +
> +extern unsigned int do_csum(const unsigned char *buff, int len);
> +#define do_csum do_csum
> +
> +#include <asm-generic/checksum.h>
> +
> +#endif /* __ASM_CHECKSUM_H */
> diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile
> index 40bde632900f..6ba6df411f90 100644
> --- a/arch/loongarch/lib/Makefile
> +++ b/arch/loongarch/lib/Makefile
> @@ -4,4 +4,4 @@
>  #
>
>  lib-y  += delay.o memset.o memcpy.o memmove.o \
> -          clear_user.o copy_user.o dump_tlb.o unaligned.o
> +          clear_user.o copy_user.o dump_tlb.o unaligned.o csum.o
> diff --git a/arch/loongarch/lib/csum.c b/arch/loongarch/lib/csum.c
> new file mode 100644
> index 000000000000..a5e84b403c3b
> --- /dev/null
> +++ b/arch/loongarch/lib/csum.c
> @@ -0,0 +1,141 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +// Copyright (C) 2019-2020 Arm Ltd.
> +
> +#include <linux/compiler.h>
> +#include <linux/kasan-checks.h>
> +#include <linux/kernel.h>
> +
> +#include <net/checksum.h>
> +
> +/* One's complement add: sum + data with the carry-out wrapped back in. */
> +static u64 accumulate(u64 sum, u64 data)
> +{
> +       u64 total = sum + data;
> +
> +       return total + (total < data);
> +}
> +
> +/*
> + * Sum @len bytes at @buff; @len <= 0 returns 0 before any access. We over-read
> + * the buffer, so KASAN instrumentation is disabled and kasan called explicitly.
> + */
> +unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len)
> +{
> +       unsigned int offset, shift, sum;
> +       const u64 *ptr;
> +       u64 data, sum64 = 0;
> +
> +       if (unlikely(len <= 0))
> +               return 0;
> +
> +       offset = (unsigned long)buff & 7;
> +       /*
> +        * This is to all intents and purposes safe, since rounding down cannot
> +        * result in a different page or cache line being accessed, and @buff
> +        * should absolutely not be pointing to anything read-sensitive. We do,
> +        * however, have to be careful not to upset KASAN, which means using
> +        * unchecked reads to accommodate the head and tail, for which we'll
> +        * compensate with an explicit check up-front.
> +        */
> +       kasan_check_read(buff, len);
> +       ptr = (u64 *)(buff - offset);
> +       len = len + offset - 8;
> +
> +       /*
> +        * Head: zero out any excess leading bytes. Shifting back by the same
> +        * amount should be at least as fast as any other way of handling the
> +        * odd/even alignment, and means we can ignore it until the very end.
> +        */
> +       shift = offset * 8;
> +       data = *ptr++;
> +       data = (data >> shift) << shift;
> +
> +       /*
> +        * Body: straightforward aligned loads from here on (the paired loads
> +        * underlying the quadword type still only need dword alignment). The
> +        * main loop strictly excludes the tail: once entered it exits with
> +        * len > 0, so one of the two steps after it always loads again.
> +        */
> +       while (unlikely(len > 64)) {
> +               __uint128_t tmp1, tmp2, tmp3, tmp4;
> +
> +               tmp1 = *(__uint128_t *)ptr;
> +               tmp2 = *(__uint128_t *)(ptr + 2);
> +               tmp3 = *(__uint128_t *)(ptr + 4);
> +               tmp4 = *(__uint128_t *)(ptr + 6);
> +
> +               len -= 64;
> +               ptr += 8;
> +
> +               /* x += rot64(x): top half becomes lo + hi with the carry wrapped in */
> +               tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> +               tmp2 += (tmp2 >> 64) | (tmp2 << 64);
> +               tmp3 += (tmp3 >> 64) | (tmp3 << 64);
> +               tmp4 += (tmp4 >> 64) | (tmp4 << 64);
> +               tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
> +               tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> +               tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
> +               tmp3 += (tmp3 >> 64) | (tmp3 << 64);
> +               tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
> +               tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> +               tmp1 = ((tmp1 >> 64) << 64) | sum64;
> +               tmp1 += (tmp1 >> 64) | (tmp1 << 64);
> +               sum64 = tmp1 >> 64;
> +       }
> +       while (len > 8) {
> +               __uint128_t tmp;
> +
> +               sum64 = accumulate(sum64, data);
> +               tmp = *(__uint128_t *)ptr;
> +
> +               len -= 16;
> +               ptr += 2;
> +
> +               data = tmp >> 64;
> +               sum64 = accumulate(sum64, tmp);
> +       }
> +       if (len > 0) {
> +               sum64 = accumulate(sum64, data);
> +               data = *ptr;
> +               len -= 8;
> +       }
> +       /*
> +        * Tail: zero any over-read bytes similarly to the head, again
> +        * preserving odd/even alignment.
> +        */
> +       shift = len * -8;
> +       data = (data << shift) >> shift;
> +       sum64 = accumulate(sum64, data);
> +
> +       /* Finally, folding; an odd start address needs the result byte-swapped */
> +       sum64 += (sum64 >> 32) | (sum64 << 32);
> +       sum = sum64 >> 32;
> +       sum += (sum >> 16) | (sum << 16);
> +       if (offset & 1)
> +               return (u16)swab32(sum);
> +
> +       return sum >> 16;
> +}
> +
> +/* Fold both addresses, @len and @proto into @csum; returns csum_fold()'s result. */
> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
> +                       const struct in6_addr *daddr,
> +                       __u32 len, __u8 proto, __wsum csum)
> +{
> +       __uint128_t src = *(const __uint128_t *)saddr->s6_addr;
> +       __uint128_t dst = *(const __uint128_t *)daddr->s6_addr;
> +       u64 acc = (__force u64)csum;
> +
> +       acc += (__force u32)htonl(len);
> +       acc += (u32)proto << 24;
> +
> +       /* fold each address to 64 bits (kept in the top half), then add */
> +       src += (src >> 64) | (src << 64);
> +       dst += (dst >> 64) | (dst << 64);
> +       acc = accumulate(acc, src >> 64);
> +       acc = accumulate(acc, dst >> 64);
> +
> +       acc += (acc >> 32) | (acc << 32);
> +       return csum_fold((__force __wsum)(acc >> 32));
> +}
> +EXPORT_SYMBOL(csum_ipv6_magic);
> --
> 2.27.0
>