Date:   Tue, 19 Feb 2019 17:08:48 +0200
From:   Ilias Apalodimas <ilias.apalodimas@...aro.org>
To:     Ard Biesheuvel <ard.biesheuvel@...aro.org>
Cc:     linux-arm-kernel@...ts.infradead.org, will.deacon@....com,
        steve.capper@....com, netdev@...r.kernel.org,
        "huanglingyan (A)" <huanglingyan2@...wei.com>
Subject: Re: [PATCH] arm64: do_csum: implement accelerated scalar version

On Tue, Feb 19, 2019 at 12:08:42AM +0100, Ard Biesheuvel wrote:
> It turns out that the IP checksumming code is still exercised often,
> even though one might expect that modern NICs with checksum offload
> have no use for it. However, as Lingyan points out, there are
> combinations of features where the network stack may still fall back
> to software checksumming, and so it makes sense to provide an
> optimized implementation in software as well.
> 
> So provide an implementation of do_csum() in scalar assembler, which,
> unlike C, gives direct access to the carry flag, making the code run
> substantially faster. The routine uses overlapping 64 byte loads for
> all input size > 64 bytes, in order to reduce the number of branches
> and improve performance on cores with deep pipelines.
> 
> On Cortex-A57, this implementation is on par with Lingyan's NEON
> implementation, and roughly 7x as fast as the generic C code.
> 
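For anyone comparing this against the generic lib/checksum.c code: in C,
the end-around carry has to be recovered explicitly after every addition,
e.g. (rough sketch, names mine, not from this patch):

	static inline u64 accumulate(u64 sum, u64 data)
	{
		sum += data;
		return sum + (sum < data);	/* recover the carry by hand */
	}

while the adcs chains below get the carry propagation for free, one
instruction per 64-bit word, which is presumably where most of the ~7x
comes from.
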
> Cc: "huanglingyan (A)" <huanglingyan2@...wei.com>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@...aro.org>
> ---
> Test code after the patch.
> 
>  arch/arm64/include/asm/checksum.h |   3 +
>  arch/arm64/lib/Makefile           |   2 +-
>  arch/arm64/lib/csum.S             | 127 ++++++++++++++++++++
>  3 files changed, 131 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
> index 0b6f5a7d4027..e906b956c1fc 100644
> --- a/arch/arm64/include/asm/checksum.h
> +++ b/arch/arm64/include/asm/checksum.h
> @@ -46,6 +46,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
>  }
>  #define ip_fast_csum ip_fast_csum
>  
> +extern unsigned int do_csum(const unsigned char *buff, int len);
> +#define do_csum do_csum
> +
>  #include <asm-generic/checksum.h>
>  
>  #endif	/* __ASM_CHECKSUM_H */
> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index 5540a1638baf..a7606007a749 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -3,7 +3,7 @@ lib-y		:= clear_user.o delay.o copy_from_user.o		\
>  		   copy_to_user.o copy_in_user.o copy_page.o		\
>  		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
>  		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
> -		   strchr.o strrchr.o tishift.o
> +		   strchr.o strrchr.o tishift.o csum.o
>  
>  ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
>  obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
> diff --git a/arch/arm64/lib/csum.S b/arch/arm64/lib/csum.S
> new file mode 100644
> index 000000000000..534e2ebdc426
> --- /dev/null
> +++ b/arch/arm64/lib/csum.S
> @@ -0,0 +1,127 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@...aro.org>
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +
> +ENTRY(do_csum)
> +	adds		x2, xzr, xzr		// clear x2 and C flag
> +
> +	// 64 bytes at a time
> +	lsr		x3, x1, #6
> +	and		x1, x1, #63
> +	cbz		x3, 1f
> +
> +	// Eight 64-bit adds per iteration
> +0:	ldp		x4, x5, [x0], #64
> +	ldp		x6, x7, [x0, #-48]
> +	ldp		x8, x9, [x0, #-32]
> +	ldp		x10, x11, [x0, #-16]
> +	adcs		x2, x2, x4
> +	sub		x3, x3, #1
> +	adcs		x2, x2, x5
> +	adcs		x2, x2, x6
> +	adcs		x2, x2, x7
> +	adcs		x2, x2, x8
> +	adcs		x2, x2, x9
> +	adcs		x2, x2, x10
> +	adcs		x2, x2, x11
> +	cbnz		x3, 0b
> +	adc		x2, x2, xzr
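One detail worth calling out for other reviewers: the loop counter is
decremented with sub rather than subs, so the decrement does not clobber
the C flag, and the carry chain survives from one iteration into the adcs
at the top of the next; the trailing adc then folds the last carry back
in. A rough C model of what one 64-bit accumulation step computes (my
names, not the patch's):

	static inline u64 step(u64 sum, u64 word)
	{
		__uint128_t t = (__uint128_t)sum + word;

		return (u64)t + (u64)(t >> 64);	/* end-around carry */
	}
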
> +
> +	cbz		x1, 7f
> +	bic		x3, x1, #1
> +	add		x12, x0, x1
> +	add		x0, x0, x3
> +	neg		x3, x3
> +	add		x3, x3, #64
> +	lsl		x3, x3, #3
> +
> +	// Handle remaining 63 bytes or less using an overlapping 64-byte load
> +	// and a branchless code path to complete the calculation
> +	ldp		x4, x5, [x0, #-64]
> +	ldp		x6, x7, [x0, #-48]
> +	ldp		x8, x9, [x0, #-32]
> +	ldp		x10, x11, [x0, #-16]
> +	ldrb		w12, [x12, #-1]
> +
> +	.irp		reg, x4, x5, x6, x7, x8, x9, x10, x11
> +	cmp		x3, #64
> +	csel		\reg, \reg, xzr, lt
> +	ccmp		x3, xzr, #0, lt
> +	csel		x13, x3, xzr, gt
> +	sub		x3, x3, #64
> +CPU_LE(	lsr		\reg, \reg, x13		)
> +CPU_BE(	lsl		\reg, \reg, x13		)
> +	.endr
> +
> +	adds		x2, x2, x4
> +	adcs		x2, x2, x5
> +	adcs		x2, x2, x6
> +	adcs		x2, x2, x7
> +	adcs		x2, x2, x8
> +	adcs		x2, x2, x9
> +	adcs		x2, x2, x10
> +	adcs		x2, x2, x11
> +	adc		x2, x2, xzr
> +
> +CPU_LE(	adds		x12, x2, x12		)
> +CPU_BE(	adds		x12, x2, x12, lsl #8	)
> +	adc		x12, x12, xzr
> +	tst		x1, #1
> +	csel		x2, x2, x12, eq
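To make sure I read the .irp block right: x3 starts out as the number of
bits in the 64-byte window that the main loop already covered, and drops
by 64 per lane. Lanes lying entirely in already-summed data are zeroed,
the straddling lane is shifted to discard the re-read bytes, and the
remaining lanes pass through untouched; the odd trailing byte is fetched
separately with the ldrb and only merged when len is odd (the tst/csel
pair). In little-endian C, roughly (hypothetical names, signed discard):

	int i, discard = (64 - (tail & ~1)) * 8;

	for (i = 0; i < 8; i++) {
		u64 lane = window[i];

		if (discard >= 64)
			lane = 0;		/* fully re-read, drop it  */
		else if (discard > 0)
			lane >>= discard;	/* drop re-read low bytes  */
		sum = step(sum, lane);
		discard -= 64;
	}
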
> +
> +7:	lsr		x1, x2, #32
> +	adds		w2, w2, w1
> +	adc		w2, w2, wzr
> +
> +	lsr		w1, w2, #16
> +	uxth		w2, w2
> +	add		w2, w2, w1
> +
> +	lsr		w1, w2, #16		// handle the carry by hand
> +	add		w2, w2, w1
> +
> +	uxth		w0, w2
> +	ret
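The fold back down to 16 bits at label 7, in C, for reference (again
just a sketch):

	static inline u16 fold(u64 sum)
	{
		u32 s = (u32)sum + (u32)(sum >> 32);

		s += s < (u32)sum;		/* 64 -> 32, end-around carry */
		s = (s & 0xffff) + (s >> 16);	/* 32 -> 17 bits              */
		s = (s & 0xffff) + (s >> 16);	/* fold the carry by hand     */
		return s;
	}
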
> +
> +	// Handle 63 bytes or less
> +1:	tbz		x1, #5, 2f
> +	ldp		x4, x5, [x0], #32
> +	ldp		x6, x7, [x0, #-16]
> +	adds		x2, x2, x4
> +	adcs		x2, x2, x5
> +	adcs		x2, x2, x6
> +	adcs		x2, x2, x7
> +	adc		x2, x2, xzr
> +
> +2:	tbz		x1, #4, 3f
> +	ldp		x4, x5, [x0], #16
> +	adds		x2, x2, x4
> +	adcs		x2, x2, x5
> +	adc		x2, x2, xzr
> +
> +3:	tbz		x1, #3, 4f
> +	ldr		x4, [x0], #8
> +	adds		x2, x2, x4
> +	adc		x2, x2, xzr
> +
> +4:	tbz		x1, #2, 5f
> +	ldr		w4, [x0], #4
> +	adds		x2, x2, x4
> +	adc		x2, x2, xzr
> +
> +5:	tbz		x1, #1, 6f
> +	ldrh		w4, [x0], #2
> +	adds		x2, x2, x4
> +	adc		x2, x2, xzr
> +
> +6:	tbz		x1, #0, 7b
> +	ldrb		w4, [x0]
> +CPU_LE(	adds		x2, x2, x4		)
> +CPU_BE(	adds		x2, x2, x4, lsl #8	)
> +	adc		x2, x2, xzr
> +	b		7b
> +ENDPROC(do_csum)
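And the small-input path at labels 1 to 6 simply walks the length bits
from 32 down to 1, consuming one power-of-two chunk per set bit, so each
branch is taken at most once. Shaped like this in C (sketch; get64() and
friends are hypothetical unaligned loads):

	if (len & 32) {
		sum = step(sum, get64(p)); sum = step(sum, get64(p + 8));
		sum = step(sum, get64(p + 16)); sum = step(sum, get64(p + 24));
		p += 32;
	}
	if (len & 16) {
		sum = step(sum, get64(p)); sum = step(sum, get64(p + 8));
		p += 16;
	}
	if (len & 8) { sum = step(sum, get64(p)); p += 8; }
	if (len & 4) { sum = step(sum, get32(p)); p += 4; }
	if (len & 2) { sum = step(sum, get16(p)); p += 2; }
	if (len & 1) { sum = step(sum, *p); }	/* lsl #8 on big-endian */
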
> -- 
> 2.20.1
> 
>   diff --git a/lib/checksum.c b/lib/checksum.c
>   index d3ec93f9e5f3..7711f1186f71 100644
>   --- a/lib/checksum.c
>   +++ b/lib/checksum.c
>   @@ -37,7 +37,7 @@
>    
>    #include <asm/byteorder.h>
>    
>   -#ifndef do_csum
>   +#if 1 //ndef do_csum
>    static inline unsigned short from32to16(unsigned int x)
>    {
>           /* add up 16-bit and 16-bit for 16+c bit */
>   @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
>           return x;
>    }
>    
>   -static unsigned int do_csum(const unsigned char *buff, int len)
>   +static unsigned int __do_csum(const unsigned char *buff, int len)
>    {
>           int odd;
>           unsigned int result = 0;
>   @@ -206,3 +206,23 @@ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
>    }
>    EXPORT_SYMBOL(csum_tcpudp_nofold);
>    #endif
>   +
>   +extern u8 crypto_ft_tab[];
>   +
>   +static int __init do_selftest(void)
>   +{
>   +       int i, j;
>   +       u16 c1, c2;
>   +
>   +       for (i = 0; i < 1024; i++) {
>   +               for (j = i + 1; j <= 1024; j++) {
>   +                       c1 = __do_csum(crypto_ft_tab + i, j - i);
>   +                       c2 = do_csum(crypto_ft_tab + i, j - i);
>   +
>   +                       if (c1 != c2)
>   +                               pr_err("######### %d %d %x %x\n", i, j, c1, c2);
>   +               }
>   +       }
>   +       return 0;
>   +}
>   +late_initcall(do_selftest);


Acked-by: Ilias Apalodimas <ilias.apalodimas@...aro.org>
