[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <68af0be6019ae1f9611fefbf16dcbbf544bfca21.1523950415.git.christophe.leroy@c-s.fr>
Date:   Tue, 17 Apr 2018 09:38:43 +0200 (CEST)
From:   Christophe Leroy <christophe.leroy@....fr>
To:     Benjamin Herrenschmidt <benh@...nel.crashing.org>,
        Paul Mackerras <paulus@...ba.org>,
        Michael Ellerman <mpe@...erman.id.au>
Cc:     linux-kernel@...r.kernel.org, linuxppc-dev@...ts.ozlabs.org
Subject: [PATCH  5/7] powerpc/lib: optimise 32 bits __clear_user()
Rewrite clear_user() on the same principle as memset(0), making use
of dcbz to clear complete cache lines.
This code is a copy/paste of memset(), with some modifications
in order to retrieve remaining number of bytes to be cleared,
as it needs to be returned in case of error.
On a MPC885, throughput is almost doubled:
Before:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 18.990779 seconds, 52.7MB/s
After:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 9.611468 seconds, 104.0MB/s
On a MPC8321, throughput is multiplied by 2.12:
Before:
root@...ippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 6.844352 seconds, 146.1MB/s
After:
root@...ippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 3.218854 seconds, 310.7MB/s
Signed-off-by: Christophe Leroy <christophe.leroy@....fr>
---
 arch/powerpc/lib/string_32.S | 85 +++++++++++++++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 25 deletions(-)
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 5b2a73fb07be..31fc92b0aae6 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -11,6 +11,7 @@
 #include <asm/errno.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/cache.h>
 
 	.text
 
@@ -61,44 +62,78 @@ _GLOBAL(memcmp)
 #endif
 EXPORT_SYMBOL(memcmp)
 
+CACHELINE_BYTES = L1_CACHE_BYTES
+LG_CACHELINE_BYTES = L1_CACHE_SHIFT
+CACHELINE_MASK = (L1_CACHE_BYTES-1)
+
 _GLOBAL(__clear_user)
-	addi	r6,r3,-4
-	li	r3,0
-	li	r5,0
-	cmplwi	0,r4,4
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero.  This requires that the destination
+ * area is cacheable.
+ */
+	cmplwi	cr0, r4, 4
+	mr	r10, r3
+	li	r3, 0
 	blt	7f
-	/* clear a single word */
-11:	stwu	r5,4(r6)
+
+11:	stw	r3, 0(r10)
 	beqlr
-	/* clear word sized chunks */
-	andi.	r0,r6,3
-	add	r4,r0,r4
-	subf	r6,r0,r6
-	srwi	r0,r4,2
-	andi.	r4,r4,3
+	andi.	r0, r10, 3
+	add	r11, r0, r4
+	subf	r6, r0, r10
+
+	clrlwi	r7, r6, 32 - LG_CACHELINE_BYTES
+	add	r8, r7, r11
+	srwi	r9, r8, LG_CACHELINE_BYTES
+	addic.	r9, r9, -1	/* total number of complete cachelines */
+	ble	2f
+	xori	r0, r7, CACHELINE_MASK & ~3
+	srwi.	r0, r0, 2
+	beq	3f
+	mtctr	r0
+4:	stwu	r3, 4(r6)
+	bdnz	4b
+3:	mtctr	r9
+	li	r7, 4
+10:	dcbz	r7, r6
+	addi	r6, r6, CACHELINE_BYTES
+	bdnz	10b
+	clrlwi	r11, r8, 32 - LG_CACHELINE_BYTES
+	addi	r11, r11, 4
+
+2:	srwi	r0 ,r11 ,2
 	mtctr	r0
-	bdz	7f
-1:	stwu	r5,4(r6)
+	bdz	6f
+1:	stwu	r3, 4(r6)
 	bdnz	1b
-	/* clear byte sized chunks */
-7:	cmpwi	0,r4,0
+6:	andi.	r11, r11, 3
 	beqlr
-	mtctr	r4
-	addi	r6,r6,3
-8:	stbu	r5,1(r6)
+	mtctr	r11
+	addi	r6, r6, 3
+8:	stbu	r3, 1(r6)
 	bdnz	8b
 	blr
-90:	mr	r3,r4
+
+7:	cmpwi	cr0, r4, 0
+	beqlr
+	mtctr	r4
+	addi	r6, r10, -1
+9:	stbu	r3, 1(r6)
+	bdnz	9b
 	blr
-91:	mfctr	r3
-	slwi	r3,r3,2
-	add	r3,r3,r4
+
+90:	mr	r3, r4
 	blr
-92:	mfctr	r3
+91:	add	r3, r10, r4
+	subf	r3, r6, r3
 	blr
 
 	EX_TABLE(11b, 90b)
+	EX_TABLE(4b, 91b)
+	EX_TABLE(10b, 91b)
 	EX_TABLE(1b, 91b)
-	EX_TABLE(8b, 92b)
+	EX_TABLE(8b, 91b)
+	EX_TABLE(9b, 91b)
 
 EXPORT_SYMBOL(__clear_user)
-- 
2.13.3
Powered by blists - more mailing lists
 
