[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <1207723262.18313.37.camel@caritas-dev.intel.com>
Date: Wed, 09 Apr 2008 14:41:02 +0800
From: "Huang, Ying" <ying.huang@...el.com>
To: Herbert Xu <herbert@...dor.apana.org.au>,
"Adam J. Richter" <adam@...drasil.com>,
Alexander Kjeldaas <astor@...t.no>,
Sebastian Siewior <linux-crypto@...breakpoint.cc>,
akpm@...ux-foundation.org
Cc: linux-kernel@...r.kernel.org, linux-crypto@...r.kernel.org
Subject: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
This patch improves the performance of the AES x86-64 asm implementation. The
average improvement is more than 6.3% and the maximum improvement is
more than 10.2% on an Intel Core 2 CPU. The performance improvement is
gained via the following methods:
- Two additional temporary registers are used to hold a subset of
the state, so that dependencies between instructions are reduced.
- The expanded key is loaded via two 64-bit loads instead of four 32-bit loads.
This patch is based on 2.6.25-rc8-mm1.
The attached files contain the test data obtained via: modprobe tcrypt mode=200
- dmesg_1_core-stockn: stock kernel data
- dmesg_1_core-op4n: patched kernel data
- percent.txt: (time_patched - time_stock) / time_stock * 100
Signed-off-by: Huang Ying <ying.huang@...el.com>
---
arch/x86/crypto/aes-x86_64-asm_64.S | 101 ++++++++++++++++++++----------------
include/crypto/aes.h | 1
2 files changed, 58 insertions(+), 44 deletions(-)
--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -46,70 +46,81 @@
#define R7 %rbp
#define R7E %ebp
#define R8 %r8
+#define R8E %r8d
#define R9 %r9
+#define R9E %r9d
#define R10 %r10
#define R11 %r11
+#define R12 %r12
+#define R12E %r12d
+#define R16 %rsp
#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
.global FUNC; \
.type FUNC,@function; \
.align 8; \
-FUNC: movq r1,r2; \
- movq r3,r4; \
- leaq BASE+KEY+48+4(r8),r9; \
- movq r10,r11; \
- movl (r7),r5 ## E; \
- movl 4(r7),r1 ## E; \
- movl 8(r7),r6 ## E; \
- movl 12(r7),r7 ## E; \
- movl BASE+0(r8),r10 ## E; \
- xorl -48(r9),r5 ## E; \
- xorl -44(r9),r1 ## E; \
- xorl -40(r9),r6 ## E; \
- xorl -36(r9),r7 ## E; \
- cmpl $24,r10 ## E; \
+FUNC: subq $24,r11; \
+ movl (r6),r4 ## E; \
+ leaq BASE+KEY+48+8(r7),r8; \
+ movq r1,(r11); \
+ movq r9,r10; \
+ movl 4(r6),r1 ## E; \
+ movq r2,8(r11); \
+ movl 8(r6),r5 ## E; \
+ movq r3,16(r11); \
+ movl 12(r6),r6 ## E; \
+ movl BASE+0(r7),r9 ## E; \
+ xorl -48(r8),r4 ## E; \
+ xorl -44(r8),r1 ## E; \
+ xorl -40(r8),r5 ## E; \
+ xorl -36(r8),r6 ## E; \
+ cmpl $24,r9 ## E; \
jb B128; \
- leaq 32(r9),r9; \
+ leaq 32(r8),r8; \
je B192; \
- leaq 32(r9),r9;
+ leaq 32(r8),r8;
#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
- movq r1,r2; \
- movq r3,r4; \
- movl r5 ## E,(r9); \
- movl r6 ## E,4(r9); \
- movl r7 ## E,8(r9); \
- movl r8 ## E,12(r9); \
+ movq (r9),r1; \
+ movl r4 ## E,(r8); \
+ movq 8(r9),r2; \
+ movl r5 ## E,4(r8); \
+ movq 16(r9),r3; \
+ movl r6 ## E,8(r8); \
+ addq $24,r9; \
+ movl r7 ## E,12(r8); \
ret;
-#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
+#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,ra,rb,rc,rd) \
movzbl r2 ## H,r5 ## E; \
movzbl r2 ## L,r6 ## E; \
+ movl r4 ## E,r8 ## E; \
+ shrl $16,r4 ## E; \
movl TAB+1024(,r5,4),r5 ## E;\
- movw r4 ## X,r2 ## X; \
movl TAB(,r6,4),r6 ## E; \
- roll $16,r2 ## E; \
- shrl $16,r4 ## E; \
movzbl r4 ## H,r7 ## E; \
movzbl r4 ## L,r4 ## E; \
- xorl OFFSET(r8),ra ## E; \
- xorl OFFSET+4(r8),rb ## E; \
+ movq OFFSET(r11),r10; \
+ shrl $16,r2 ## E; \
+ movl r3 ## E,r9 ## E; \
xorl TAB+3072(,r7,4),r5 ## E;\
xorl TAB+2048(,r4,4),r6 ## E;\
- movzbl r1 ## L,r7 ## E; \
movzbl r1 ## H,r4 ## E; \
- movl TAB+1024(,r4,4),r4 ## E;\
- movw r3 ## X,r1 ## X; \
- roll $16,r1 ## E; \
+ movzbl r1 ## L,r7 ## E; \
shrl $16,r3 ## E; \
+ movl TAB+1024(,r4,4),r4 ## E;\
xorl TAB(,r7,4),r5 ## E; \
+ shrl $16,r1 ## E; \
movzbl r3 ## H,r7 ## E; \
movzbl r3 ## L,r3 ## E; \
xorl TAB+3072(,r7,4),r4 ## E;\
xorl TAB+2048(,r3,4),r5 ## E;\
movzbl r1 ## H,r7 ## E; \
movzbl r1 ## L,r3 ## E; \
- shrl $16,r1 ## E; \
+ xorl r10 ## E,ra ## E; \
+ movl r9 ## E,r1 ## E; \
+ movq OFFSET+8(r11),r9; \
+ shrq $32,r10; \
xorl TAB+3072(,r7,4),r6 ## E;\
movl TAB+2048(,r3,4),r3 ## E;\
movzbl r1 ## H,r7 ## E; \
@@ -118,38 +129,40 @@ FUNC: movq r1,r2; \
xorl TAB(,r1,4),r3 ## E; \
movzbl r2 ## H,r1 ## E; \
movzbl r2 ## L,r7 ## E; \
- shrl $16,r2 ## E; \
+ xorl r9 ## E, rc ## E; \
+ movl r8 ## E,r2 ## E; \
+ shrq $32,r9; \
+ xorl r10 ## E,rb ## E; \
xorl TAB+3072(,r1,4),r3 ## E;\
xorl TAB+2048(,r7,4),r4 ## E;\
movzbl r2 ## H,r1 ## E; \
+ xorl r9 ## E, rd ## E; \
movzbl r2 ## L,r2 ## E; \
- xorl OFFSET+8(r8),rc ## E; \
- xorl OFFSET+12(r8),rd ## E; \
- xorl TAB+1024(,r1,4),r3 ## E;\
- xorl TAB(,r2,4),r4 ## E;
+ xorl TAB(,r2,4),r4 ## E; \
+ xorl TAB+1024(,r1,4),r3 ## E;
#define move_regs(r1,r2,r3,r4) \
movl r3 ## E,r1 ## E; \
movl r4 ## E,r2 ## E;
#define entry(FUNC,KEY,B128,B192) \
- prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
+ prologue(FUNC,KEY,B128,B192,R2,R7,R12,R1,R3,R4,R6,R10,R5,R11,R16)
-#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
+#define return epilogue(R2,R7,R12,R5,R6,R3,R4,R11,R16)
#define encrypt_round(TAB,OFFSET) \
- round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
+ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R8,R9,R12,R10,R5,R6,R3,R4) \
move_regs(R1,R2,R5,R6)
#define encrypt_final(TAB,OFFSET) \
- round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
+ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R8,R9,R12,R10,R5,R6,R3,R4)
#define decrypt_round(TAB,OFFSET) \
- round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
+ round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R8,R9,R12,R10,R5,R6,R3,R4) \
move_regs(R1,R2,R5,R6)
#define decrypt_final(TAB,OFFSET) \
- round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
+ round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R8,R9,R12,R10,R5,R6,R3,R4)
/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
--- a/include/crypto/aes.h
+++ b/include/crypto/aes.h
@@ -19,6 +19,7 @@
struct crypto_aes_ctx {
u32 key_length;
+ u32 _pad1;
u32 key_enc[AES_MAX_KEYLENGTH_U32];
u32 key_dec[AES_MAX_KEYLENGTH_U32];
};
View attachment "dmesg_1_core-stockn" of type "text/plain" (9946 bytes)
View attachment "dmesg_1_core-op4n" of type "text/plain" (9946 bytes)
View attachment "percent.txt" of type "text/plain" (2570 bytes)
Powered by blists - more mailing lists