Message-ID: <87ledtxi0q.fsf@mail.lhotse>
Date:   Wed, 30 Aug 2023 14:37:25 +1000
From:   Michael Ellerman <mpe@...erman.id.au>
To:     Danny Tsen <dtsen@...ux.ibm.com>, linux-crypto@...r.kernel.org
Cc:     herbert@...dor.apana.org.au, leitao@...ian.org,
        nayna@...ux.ibm.com, appro@...ptogams.org,
        linux-kernel@...r.kernel.org, linuxppc-dev@...ts.ozlabs.org,
        ltcgcw@...ux.vnet.ibm.com, dtsen@...ibm.com,
        Danny Tsen <dtsen@...ux.ibm.com>
Subject: Re: [PATCH] crypto: vmx: Improved AES/XTS performance of 6-way
 unrolling for ppc.

Danny Tsen <dtsen@...ux.ibm.com> writes:
> Improve AES/XTS performance of the 6-way unrolling path for PowerPC by
> up to 17%, as measured with tcrypt.  This is done by using a single
> instruction, vpermxor, to replace the vxor and vsldoi pair.
>
> This patch has been tested with the kernel crypto module tcrypt.ko and
> has passed the selftest.  It was also tested with
> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
>
> Signed-off-by: Danny Tsen <dtsen@...ux.ibm.com>
> ---
>  drivers/crypto/vmx/aesp8-ppc.pl | 141 +++++++++++++++++++++-----------
>  1 file changed, 92 insertions(+), 49 deletions(-)

That's CRYPTOGAMS code, and so far it is largely unchanged from the
original. I see you've sent the same change to OpenSSL, but it's not
merged yet. Please document that in the change log; we want to keep the
code in sync with upstream as much as possible, and any divergences
should be documented.

cheers
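
[Editor's note: for readers following the patch, the sequence being
rewritten is the per-block XTS tweak update -- multiplying the 128-bit
tweak by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.  Below is a
minimal C model of that math; the byte order and names are
illustrative, not the register layout used by aesp8-ppc.pl.]

	#include <stdint.h>

	/*
	 * Multiply a 128-bit XTS tweak by x in GF(2^128), reducing by
	 * x^128 + x^7 + x^2 + x + 1 (0x87).  t[0] is the least-significant
	 * byte.  The vector code computes the same thing for all 16 bytes
	 * at once: vaddubm is the per-byte "<< 1", and vsrab + vand build
	 * a per-byte carry mask that the old code rotated into place with
	 * vsldoi and applied with vxor -- the two instructions this patch
	 * folds into one vpermxor.
	 */
	static void xts_double(uint8_t t[16])
	{
		uint8_t carry = t[15] >> 7;      /* bit shifted out of bit 127 */
		for (int i = 15; i > 0; i--)
			t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
		t[0] = (uint8_t)(t[0] << 1);
		if (carry)
			t[0] ^= 0x87;            /* fold back the reduction */
	}
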

> diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
> index 50a0a18f35da..f729589d792e 100644
> --- a/drivers/crypto/vmx/aesp8-ppc.pl
> +++ b/drivers/crypto/vmx/aesp8-ppc.pl
> @@ -132,11 +132,12 @@ rcon:
>  .long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
>  .long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
>  .long	0,0,0,0						?asis
> +.long	0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
>  Lconsts:
>  	mflr	r0
>  	bcl	20,31,\$+4
>  	mflr	$ptr	 #vvvvv "distance between . and rcon
> -	addi	$ptr,$ptr,-0x48
> +	addi	$ptr,$ptr,-0x58
>  	mtlr	r0
>  	blr
>  	.long	0
> @@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
>  	li		$x70,0x70
>  	mtspr		256,r0
>  
> +	xxlor		2, 32+$eighty7, 32+$eighty7
> +	vsldoi		$eighty7,$tmp,$eighty7,1        # 0x010101..87
> +	xxlor		1, 32+$eighty7, 32+$eighty7
> +
> +	# Load XOR Lconsts.
> +	mr		$x70, r6
> +	bl		Lconsts
> +	lxvw4x		0, $x40, r6		# load XOR contents
> +	mr		r6, $x70
> +	li		$x70,0x70
> +
>  	subi		$rounds,$rounds,3	# -4 in total
>  
>  	lvx		$rndkey0,$x00,$key1	# load key schedule
> @@ -2537,69 +2549,77 @@ Load_xts_enc_key:
>  	?vperm		v31,v31,$twk5,$keyperm
>  	lvx		v25,$x10,$key_		# pre-load round[2]
>  
> +	# Switch to use the following codes with 0x010101..87 to generate tweak.
> +	#     eighty7 = 0x010101..87
> +	# vsrab         tmp, tweak, seven       # next tweak value, right shift 7 bits
> +	# vand          tmp, tmp, eighty7       # last byte with carry
> +	# vaddubm       tweak, tweak, tweak     # left shift 1 bit (x2)
> +	# xxlor         vsx, 0, 0
> +	# vpermxor      tweak, tweak, tmp, vsx
> +
>  	 vperm		$in0,$inout,$inptail,$inpperm
>  	 subi		$inp,$inp,31		# undo "caller"
>  	vxor		$twk0,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out0,$in0,$twk0
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in1, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in1
>  
>  	 lvx_u		$in1,$x10,$inp
>  	vxor		$twk1,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	 le?vperm	$in1,$in1,$in1,$leperm
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out1,$in1,$twk1
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in2, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in2
>  
>  	 lvx_u		$in2,$x20,$inp
>  	 andi.		$taillen,$len,15
>  	vxor		$twk2,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	 le?vperm	$in2,$in2,$in2,$leperm
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out2,$in2,$twk2
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in3, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in3
>  
>  	 lvx_u		$in3,$x30,$inp
>  	 sub		$len,$len,$taillen
>  	vxor		$twk3,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	 le?vperm	$in3,$in3,$in3,$leperm
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out3,$in3,$twk3
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in4, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in4
>  
>  	 lvx_u		$in4,$x40,$inp
>  	 subi		$len,$len,0x60
>  	vxor		$twk4,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	 le?vperm	$in4,$in4,$in4,$leperm
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out4,$in4,$twk4
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in5, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in5
>  
>  	 lvx_u		$in5,$x50,$inp
>  	 addi		$inp,$inp,0x60
>  	vxor		$twk5,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	 le?vperm	$in5,$in5,$in5,$leperm
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out5,$in5,$twk5
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in0, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in0
>  
>  	vxor		v31,v31,$rndkey0
>  	mtctr		$rounds
> @@ -2625,6 +2645,8 @@ Loop_xts_enc6x:
>  	lvx		v25,$x10,$key_		# round[4]
>  	bdnz		Loop_xts_enc6x
>  
> +	xxlor		32+$eighty7, 1, 1	# 0x010101..87
> +
>  	subic		$len,$len,96		# $len-=96
>  	 vxor		$in0,$twk0,v31		# xor with last round key
>  	vcipher		$out0,$out0,v24
> @@ -2634,7 +2656,6 @@ Loop_xts_enc6x:
>  	 vaddubm	$tweak,$tweak,$tweak
>  	vcipher		$out2,$out2,v24
>  	vcipher		$out3,$out3,v24
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  	vcipher		$out4,$out4,v24
>  	vcipher		$out5,$out5,v24
>  
> @@ -2642,7 +2663,8 @@ Loop_xts_enc6x:
>  	 vand		$tmp,$tmp,$eighty7
>  	vcipher		$out0,$out0,v25
>  	vcipher		$out1,$out1,v25
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		32+$in1, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in1
>  	vcipher		$out2,$out2,v25
>  	vcipher		$out3,$out3,v25
>  	 vxor		$in1,$twk1,v31
> @@ -2653,13 +2675,13 @@ Loop_xts_enc6x:
>  
>  	and		r0,r0,$len
>  	 vaddubm	$tweak,$tweak,$tweak
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  	vcipher		$out0,$out0,v26
>  	vcipher		$out1,$out1,v26
>  	 vand		$tmp,$tmp,$eighty7
>  	vcipher		$out2,$out2,v26
>  	vcipher		$out3,$out3,v26
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		32+$in2, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in2
>  	vcipher		$out4,$out4,v26
>  	vcipher		$out5,$out5,v26
>  
> @@ -2673,7 +2695,6 @@ Loop_xts_enc6x:
>  	 vaddubm	$tweak,$tweak,$tweak
>  	vcipher		$out0,$out0,v27
>  	vcipher		$out1,$out1,v27
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  	vcipher		$out2,$out2,v27
>  	vcipher		$out3,$out3,v27
>  	 vand		$tmp,$tmp,$eighty7
> @@ -2681,7 +2702,8 @@ Loop_xts_enc6x:
>  	vcipher		$out5,$out5,v27
>  
>  	addi		$key_,$sp,$FRAME+15	# rewind $key_
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		32+$in3, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in3
>  	vcipher		$out0,$out0,v28
>  	vcipher		$out1,$out1,v28
>  	 vxor		$in3,$twk3,v31
> @@ -2690,7 +2712,6 @@ Loop_xts_enc6x:
>  	vcipher		$out2,$out2,v28
>  	vcipher		$out3,$out3,v28
>  	 vaddubm	$tweak,$tweak,$tweak
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  	vcipher		$out4,$out4,v28
>  	vcipher		$out5,$out5,v28
>  	lvx		v24,$x00,$key_		# re-pre-load round[1]
> @@ -2698,7 +2719,8 @@ Loop_xts_enc6x:
>  
>  	vcipher		$out0,$out0,v29
>  	vcipher		$out1,$out1,v29
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		32+$in4, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in4
>  	vcipher		$out2,$out2,v29
>  	vcipher		$out3,$out3,v29
>  	 vxor		$in4,$twk4,v31
> @@ -2708,14 +2730,14 @@ Loop_xts_enc6x:
>  	vcipher		$out5,$out5,v29
>  	lvx		v25,$x10,$key_		# re-pre-load round[2]
>  	 vaddubm	$tweak,$tweak,$tweak
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  
>  	vcipher		$out0,$out0,v30
>  	vcipher		$out1,$out1,v30
>  	 vand		$tmp,$tmp,$eighty7
>  	vcipher		$out2,$out2,v30
>  	vcipher		$out3,$out3,v30
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		32+$in5, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in5
>  	vcipher		$out4,$out4,v30
>  	vcipher		$out5,$out5,v30
>  	 vxor		$in5,$twk5,v31
> @@ -2725,7 +2747,6 @@ Loop_xts_enc6x:
>  	vcipherlast	$out0,$out0,$in0
>  	 lvx_u		$in0,$x00,$inp		# load next input block
>  	 vaddubm	$tweak,$tweak,$tweak
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  	vcipherlast	$out1,$out1,$in1
>  	 lvx_u		$in1,$x10,$inp
>  	vcipherlast	$out2,$out2,$in2
> @@ -2738,7 +2759,10 @@ Loop_xts_enc6x:
>  	vcipherlast	$out4,$out4,$in4
>  	 le?vperm	$in2,$in2,$in2,$leperm
>  	 lvx_u		$in4,$x40,$inp
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		10, 32+$in0, 32+$in0
> +	 xxlor		32+$in0, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in0
> +	 xxlor		32+$in0, 10, 10
>  	vcipherlast	$tmp,$out5,$in5		# last block might be needed
>  						# in stealing mode
>  	 le?vperm	$in3,$in3,$in3,$leperm
> @@ -2771,6 +2795,8 @@ Loop_xts_enc6x:
>  	mtctr		$rounds
>  	beq		Loop_xts_enc6x		# did $len-=96 borrow?
>  
> +	xxlor		32+$eighty7, 2, 2	# 0x010101..87
> +
>  	addic.		$len,$len,0x60
>  	beq		Lxts_enc6x_zero
>  	cmpwi		$len,0x20
> @@ -3147,6 +3173,17 @@ _aesp8_xts_decrypt6x:
>  	li		$x70,0x70
>  	mtspr		256,r0
>  
> +	xxlor		2, 32+$eighty7, 32+$eighty7
> +	vsldoi		$eighty7,$tmp,$eighty7,1        # 0x010101..87
> +	xxlor		1, 32+$eighty7, 32+$eighty7
> +
> +	# Load XOR Lconsts.
> +	mr		$x70, r6
> +	bl		Lconsts
> +	lxvw4x		0, $x40, r6		# load XOR contents
> +	mr		r6, $x70
> +	li		$x70,0x70
> +
>  	subi		$rounds,$rounds,3	# -4 in total
>  
>  	lvx		$rndkey0,$x00,$key1	# load key schedule
> @@ -3194,64 +3231,64 @@ Load_xts_dec_key:
>  	vxor		$twk0,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out0,$in0,$twk0
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in1, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in1
>  
>  	 lvx_u		$in1,$x10,$inp
>  	vxor		$twk1,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	 le?vperm	$in1,$in1,$in1,$leperm
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out1,$in1,$twk1
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in2, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in2
>  
>  	 lvx_u		$in2,$x20,$inp
>  	 andi.		$taillen,$len,15
>  	vxor		$twk2,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	 le?vperm	$in2,$in2,$in2,$leperm
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out2,$in2,$twk2
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in3, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in3
>  
>  	 lvx_u		$in3,$x30,$inp
>  	 sub		$len,$len,$taillen
>  	vxor		$twk3,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	 le?vperm	$in3,$in3,$in3,$leperm
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out3,$in3,$twk3
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in4, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in4
>  
>  	 lvx_u		$in4,$x40,$inp
>  	 subi		$len,$len,0x60
>  	vxor		$twk4,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	 le?vperm	$in4,$in4,$in4,$leperm
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out4,$in4,$twk4
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in5, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in5
>  
>  	 lvx_u		$in5,$x50,$inp
>  	 addi		$inp,$inp,0x60
>  	vxor		$twk5,$tweak,$rndkey0
>  	vsrab		$tmp,$tweak,$seven	# next tweak value
>  	vaddubm		$tweak,$tweak,$tweak
> -	vsldoi		$tmp,$tmp,$tmp,15
>  	 le?vperm	$in5,$in5,$in5,$leperm
>  	vand		$tmp,$tmp,$eighty7
>  	 vxor		$out5,$in5,$twk5
> -	vxor		$tweak,$tweak,$tmp
> +	xxlor		32+$in0, 0, 0
> +	vpermxor	$tweak, $tweak, $tmp, $in0
>  
>  	vxor		v31,v31,$rndkey0
>  	mtctr		$rounds
> @@ -3277,6 +3314,8 @@ Loop_xts_dec6x:
>  	lvx		v25,$x10,$key_		# round[4]
>  	bdnz		Loop_xts_dec6x
>  
> +	xxlor		32+$eighty7, 1, 1	# 0x010101..87
> +
>  	subic		$len,$len,96		# $len-=96
>  	 vxor		$in0,$twk0,v31		# xor with last round key
>  	vncipher	$out0,$out0,v24
> @@ -3286,7 +3325,6 @@ Loop_xts_dec6x:
>  	 vaddubm	$tweak,$tweak,$tweak
>  	vncipher	$out2,$out2,v24
>  	vncipher	$out3,$out3,v24
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  	vncipher	$out4,$out4,v24
>  	vncipher	$out5,$out5,v24
>  
> @@ -3294,7 +3332,8 @@ Loop_xts_dec6x:
>  	 vand		$tmp,$tmp,$eighty7
>  	vncipher	$out0,$out0,v25
>  	vncipher	$out1,$out1,v25
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		32+$in1, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in1
>  	vncipher	$out2,$out2,v25
>  	vncipher	$out3,$out3,v25
>  	 vxor		$in1,$twk1,v31
> @@ -3305,13 +3344,13 @@ Loop_xts_dec6x:
>  
>  	and		r0,r0,$len
>  	 vaddubm	$tweak,$tweak,$tweak
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  	vncipher	$out0,$out0,v26
>  	vncipher	$out1,$out1,v26
>  	 vand		$tmp,$tmp,$eighty7
>  	vncipher	$out2,$out2,v26
>  	vncipher	$out3,$out3,v26
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		32+$in2, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in2
>  	vncipher	$out4,$out4,v26
>  	vncipher	$out5,$out5,v26
>  
> @@ -3325,7 +3364,6 @@ Loop_xts_dec6x:
>  	 vaddubm	$tweak,$tweak,$tweak
>  	vncipher	$out0,$out0,v27
>  	vncipher	$out1,$out1,v27
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  	vncipher	$out2,$out2,v27
>  	vncipher	$out3,$out3,v27
>  	 vand		$tmp,$tmp,$eighty7
> @@ -3333,7 +3371,8 @@ Loop_xts_dec6x:
>  	vncipher	$out5,$out5,v27
>  
>  	addi		$key_,$sp,$FRAME+15	# rewind $key_
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		32+$in3, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in3
>  	vncipher	$out0,$out0,v28
>  	vncipher	$out1,$out1,v28
>  	 vxor		$in3,$twk3,v31
> @@ -3342,7 +3381,6 @@ Loop_xts_dec6x:
>  	vncipher	$out2,$out2,v28
>  	vncipher	$out3,$out3,v28
>  	 vaddubm	$tweak,$tweak,$tweak
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  	vncipher	$out4,$out4,v28
>  	vncipher	$out5,$out5,v28
>  	lvx		v24,$x00,$key_		# re-pre-load round[1]
> @@ -3350,7 +3388,8 @@ Loop_xts_dec6x:
>  
>  	vncipher	$out0,$out0,v29
>  	vncipher	$out1,$out1,v29
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		32+$in4, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in4
>  	vncipher	$out2,$out2,v29
>  	vncipher	$out3,$out3,v29
>  	 vxor		$in4,$twk4,v31
> @@ -3360,14 +3399,14 @@ Loop_xts_dec6x:
>  	vncipher	$out5,$out5,v29
>  	lvx		v25,$x10,$key_		# re-pre-load round[2]
>  	 vaddubm	$tweak,$tweak,$tweak
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  
>  	vncipher	$out0,$out0,v30
>  	vncipher	$out1,$out1,v30
>  	 vand		$tmp,$tmp,$eighty7
>  	vncipher	$out2,$out2,v30
>  	vncipher	$out3,$out3,v30
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		32+$in5, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in5
>  	vncipher	$out4,$out4,v30
>  	vncipher	$out5,$out5,v30
>  	 vxor		$in5,$twk5,v31
> @@ -3377,7 +3416,6 @@ Loop_xts_dec6x:
>  	vncipherlast	$out0,$out0,$in0
>  	 lvx_u		$in0,$x00,$inp		# load next input block
>  	 vaddubm	$tweak,$tweak,$tweak
> -	 vsldoi		$tmp,$tmp,$tmp,15
>  	vncipherlast	$out1,$out1,$in1
>  	 lvx_u		$in1,$x10,$inp
>  	vncipherlast	$out2,$out2,$in2
> @@ -3390,7 +3428,10 @@ Loop_xts_dec6x:
>  	vncipherlast	$out4,$out4,$in4
>  	 le?vperm	$in2,$in2,$in2,$leperm
>  	 lvx_u		$in4,$x40,$inp
> -	 vxor		$tweak,$tweak,$tmp
> +	 xxlor		10, 32+$in0, 32+$in0
> +	 xxlor		32+$in0, 0, 0
> +	 vpermxor	$tweak, $tweak, $tmp, $in0
> +	 xxlor		32+$in0, 10, 10
>  	vncipherlast	$out5,$out5,$in5
>  	 le?vperm	$in3,$in3,$in3,$leperm
>  	 lvx_u		$in5,$x50,$inp
> @@ -3421,6 +3462,8 @@ Loop_xts_dec6x:
>  	mtctr		$rounds
>  	beq		Loop_xts_dec6x		# did $len-=96 borrow?
>  
> +	xxlor		32+$eighty7, 2, 2	# 0x010101..87
> +
>  	addic.		$len,$len,0x60
>  	beq		Lxts_dec6x_zero
>  	cmpwi		$len,0x20
> -- 
> 2.31.1
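
[Editor's note: the equivalence the patch relies on can be checked with
a byte-level model of vpermxor.  Per the Power ISA, each byte of the
control vector selects one byte of each source operand by nibble and
XORs them.  With the new Lconsts entry 0x0f102132...cbdcedfe as the
control, vpermxor XORs the tweak with $tmp rotated by one byte -- the
same result as the removed vsldoi $tmp,$tmp,$tmp,15 / vxor pair.  A
self-contained sketch follows; it assumes big-endian lane numbering and
uses arbitrary test values, and is a model for checking the identity,
not the kernel's code.]

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Model of vpermxor: r[i] = a[high nibble of c[i]] ^ b[low nibble]. */
	static void vpermxor(uint8_t r[16], const uint8_t a[16],
			     const uint8_t b[16], const uint8_t c[16])
	{
		for (int i = 0; i < 16; i++)
			r[i] = a[c[i] >> 4] ^ b[c[i] & 0x0f];
	}

	int main(void)
	{
		/* The control constant added to Lconsts by this patch. */
		const uint8_t ctrl[16] = {
			0x0f, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65, 0x76,
			0x87, 0x98, 0xa9, 0xba, 0xcb, 0xdc, 0xed, 0xfe,
		};
		uint8_t tweak[16], tmp[16], out[16], ref[16];

		for (int i = 0; i < 16; i++) {   /* arbitrary test values */
			tweak[i] = (uint8_t)(i * 37 + 1);
			tmp[i]   = (uint8_t)(i * 91 + 5);
		}

		/* Old sequence: vsldoi tmp,tmp,tmp,15 rotates tmp so that
		 * result[i] = tmp[(i + 15) % 16], then vxor with the tweak. */
		for (int i = 0; i < 16; i++)
			ref[i] = tweak[i] ^ tmp[(i + 15) % 16];

		vpermxor(out, tweak, tmp, ctrl);
		printf("%s\n", memcmp(out, ref, 16) ? "mismatch" : "match");
		return 0;
	}
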
