lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20231214130206.21219-1-xry111@xry111.site>
Date:   Thu, 14 Dec 2023 21:02:06 +0800
From:   Xi Ruoyao <xry111@...111.site>
To:     Huacai Chen <chenhuacai@...nel.org>,
        WANG Xuerui <kernel@...0n.name>
Cc:     loongarch@...ts.linux.dev, linux-kernel@...r.kernel.org,
        Xi Ruoyao <xry111@...111.site>
Subject: [PATCH] LoongArch: Micro-optimize sc_save_fcc and sc_restore_fcc for LA464

On LA464 movcf2gr is 7 times slower than movcf2fr + movfr2gr, and
movgr2cf is 15 times (!) slower than movgr2fr + movfr2cf.

On LA664 movcf2fr + movfr2gr has similar performance to movcf2gr, and
movgr2fr + movfr2cf has similar performance to movgr2cf.

To use FP registers in sc_save_fcc and sc_restore_fcc we need to save
FP/LSX/LASX registers before sc_save_fcc, and restore FP/LSX/LASX
registers after sc_restore_fcc.

Signed-off-by: Xi Ruoyao <xry111@...111.site>
---
 arch/loongarch/kernel/fpu.S | 94 +++++++++++++++++++++----------------
 1 file changed, 54 insertions(+), 40 deletions(-)

diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S
index d53ab10f4644..ecb127f9a673 100644
--- a/arch/loongarch/kernel/fpu.S
+++ b/arch/loongarch/kernel/fpu.S
@@ -96,43 +96,57 @@
 	.endm
 
 	.macro sc_save_fcc base, tmp0, tmp1
-	movcf2gr	\tmp0, $fcc0
-	move		\tmp1, \tmp0
-	movcf2gr	\tmp0, $fcc1
-	bstrins.d	\tmp1, \tmp0, 15, 8
-	movcf2gr	\tmp0, $fcc2
-	bstrins.d	\tmp1, \tmp0, 23, 16
-	movcf2gr	\tmp0, $fcc3
-	bstrins.d	\tmp1, \tmp0, 31, 24
-	movcf2gr	\tmp0, $fcc4
-	bstrins.d	\tmp1, \tmp0, 39, 32
-	movcf2gr	\tmp0, $fcc5
-	bstrins.d	\tmp1, \tmp0, 47, 40
-	movcf2gr	\tmp0, $fcc6
-	bstrins.d	\tmp1, \tmp0, 55, 48
-	movcf2gr	\tmp0, $fcc7
-	bstrins.d	\tmp1, \tmp0, 63, 56
-	EX	st.d	\tmp1, \base, 0
+	movcf2fr	$ft0, $fcc0		# FPR detour: movcf2fr + movfr2gr beats movcf2gr on LA464
+	movcf2fr	$ft1, $fcc1		# clobbers $ft0/$ft1: caller must save FPRs first
+	movfr2gr.s	\tmp0, $ft0
+	movfr2gr.s	\tmp1, $ft1
+	EX	st.b	\tmp0, \base, 0		# $fcc[n] -> byte n: same little-endian layout
+	EX	st.b	\tmp1, \base, 1		# as the old packed 64-bit st.d
+	movcf2fr	$ft0, $fcc2
+	movcf2fr	$ft1, $fcc3
+	movfr2gr.s	\tmp0, $ft0
+	movfr2gr.s	\tmp1, $ft1
+	EX	st.b	\tmp0, \base, 2
+	EX	st.b	\tmp1, \base, 3
+	movcf2fr	$ft0, $fcc4
+	movcf2fr	$ft1, $fcc5
+	movfr2gr.s	\tmp0, $ft0
+	movfr2gr.s	\tmp1, $ft1
+	EX	st.b	\tmp0, \base, 4
+	EX	st.b	\tmp1, \base, 5
+	movcf2fr	$ft0, $fcc6
+	movcf2fr	$ft1, $fcc7
+	movfr2gr.s	\tmp0, $ft0
+	movfr2gr.s	\tmp1, $ft1
+	EX	st.b	\tmp0, \base, 6
+	EX	st.b	\tmp1, \base, 7
 	.endm
 
 	.macro sc_restore_fcc base, tmp0, tmp1
-	EX	ld.d	\tmp0, \base, 0
-	bstrpick.d	\tmp1, \tmp0, 7, 0
-	movgr2cf	$fcc0, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 15, 8
-	movgr2cf	$fcc1, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 23, 16
-	movgr2cf	$fcc2, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 31, 24
-	movgr2cf	$fcc3, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 39, 32
-	movgr2cf	$fcc4, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 47, 40
-	movgr2cf	$fcc5, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 55, 48
-	movgr2cf	$fcc6, \tmp1
-	bstrpick.d	\tmp1, \tmp0, 63, 56
-	movgr2cf	$fcc7, \tmp1
+	EX	ld.b	\tmp0, \base, 0		# byte n holds $fcc[n] (little endian),
+	EX	ld.b	\tmp1, \base, 1		# matching the old packed 64-bit layout
+	movgr2fr.w	$ft0, \tmp0		# FPR detour: movgr2fr + movfr2cf beats movgr2cf on LA464
+	movgr2fr.w	$ft1, \tmp1		# clobbers $ft0/$ft1: caller must restore FPRs afterwards
+	movfr2cf	$fcc0, $ft0
+	movfr2cf	$fcc1, $ft1
+	EX	ld.b	\tmp0, \base, 2
+	EX	ld.b	\tmp1, \base, 3
+	movgr2fr.w	$ft0, \tmp0
+	movgr2fr.w	$ft1, \tmp1
+	movfr2cf	$fcc2, $ft0
+	movfr2cf	$fcc3, $ft1
+	EX	ld.b	\tmp0, \base, 4
+	EX	ld.b	\tmp1, \base, 5
+	movgr2fr.w	$ft0, \tmp0
+	movgr2fr.w	$ft1, \tmp1
+	movfr2cf	$fcc4, $ft0
+	movfr2cf	$fcc5, $ft1
+	EX	ld.b	\tmp0, \base, 6
+	EX	ld.b	\tmp1, \base, 7
+	movgr2fr.w	$ft0, \tmp0
+	movgr2fr.w	$ft1, \tmp1
+	movfr2cf	$fcc6, $ft0
+	movfr2cf	$fcc7, $ft1
 	.endm
 
 	.macro sc_save_fcsr base, tmp0
@@ -449,9 +463,9 @@ SYM_FUNC_END(_init_fpu)
  * a2: fcsr
  */
 SYM_FUNC_START(_save_fp_context)
 	sc_save_fcsr	a2 t1
 	sc_save_fp	a0
+	sc_save_fcc	a1 t1 t2		# last: sc_save_fcc clobbers FPRs, now already saved
 	li.w		a0, 0				# success
 	jr		ra
 SYM_FUNC_END(_save_fp_context)
@@ -462,8 +476,8 @@ SYM_FUNC_END(_save_fp_context)
  * a2: fcsr
  */
 SYM_FUNC_START(_restore_fp_context)
 	sc_restore_fcc	a1 t1 t2
+	sc_restore_fp	a0			# after sc_restore_fcc, which clobbers FPRs
 	sc_restore_fcsr	a2 t1
 	li.w		a0, 0				# success
 	jr		ra
 SYM_FUNC_END(_restore_fp_context)
@@ -475,9 +489,9 @@ SYM_FUNC_END(_restore_fp_context)
  * a2: fcsr
  */
 SYM_FUNC_START(_save_lsx_context)
 	sc_save_fcsr a2, t0
 	sc_save_lsx a0
+	sc_save_fcc a1, t0, t1			# last: sc_save_fcc clobbers FPRs, now already saved
 	li.w	a0, 0					# success
 	jr	ra
 SYM_FUNC_END(_save_lsx_context)
@@ -488,8 +502,8 @@ SYM_FUNC_END(_save_lsx_context)
  * a2: fcsr
  */
 SYM_FUNC_START(_restore_lsx_context)
 	sc_restore_fcc a1, t1, t2
+	sc_restore_lsx a0			# after sc_restore_fcc, which clobbers FPRs
 	sc_restore_fcsr a2, t1
 	li.w	a0, 0					# success
 	jr	ra
 SYM_FUNC_END(_restore_lsx_context)
@@ -501,9 +515,9 @@ SYM_FUNC_END(_restore_lsx_context)
  * a2: fcsr
  */
 SYM_FUNC_START(_save_lasx_context)
 	sc_save_fcsr a2, t0
 	sc_save_lasx a0
+	sc_save_fcc a1, t0, t1			# last: sc_save_fcc clobbers FPRs, now already saved
 	li.w	a0, 0					# success
 	jr	ra
 SYM_FUNC_END(_save_lasx_context)
@@ -514,8 +528,8 @@ SYM_FUNC_END(_save_lasx_context)
  * a2: fcsr
  */
 SYM_FUNC_START(_restore_lasx_context)
 	sc_restore_fcc a1, t1, t2
+	sc_restore_lasx a0			# after sc_restore_fcc, which clobbers FPRs
 	sc_restore_fcsr a2, t1
 	li.w	a0, 0					# success
 	jr	ra
 SYM_FUNC_END(_restore_lasx_context)
-- 
2.43.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ