lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Sat, 28 May 2022 16:12:35 +0800
From:   Yu-Jen Chang <arthurchang09@...il.com>
To:     ak@...ux.intel.com, jdike@...ux.intel.com
Cc:     tglx@...utronix.de, mingo@...hat.com, bp@...en8.de,
        dave.hansen@...ux.intel.com, x86@...nel.org, hpa@...or.com,
        keescook@...omium.org, linux-kernel@...r.kernel.org,
        linux-hardening@...r.kernel.org, richard@....at,
        anton.ivanov@...bridgegreys.com, johannes@...solutions.net,
        linux-um@...ts.infradead.org, jserv@...s.ncku.edu.tw,
        Yu-Jen Chang <arthurchang09@...il.com>
Subject: [PATCH 1/2] x86/lib: Optimize memchr()

The original assembly version of memchr() is implemented with
the byte-wise comparing technique, which does not fully
use 64-bit registers in x86_64 CPUs. We use word-wise
comparing so that 8 characters can be compared at the same time
on x86_64 CPUs. First we align the input and then use word-wise
comparing to find the first 64-bit word that contains the target.
Secondly, we compare every byte in the word and get the output.

We create two files to measure the performance. The first file
contains on average 10 characters ahead of the target character.
The second file contains at least 1000 characters ahead of the
target character. Our implementation of memchr() is slightly
better in the first test and nearly 4x faster than the original
implementation in the second test.

Signed-off-by: Yu-Jen Chang <arthurchang09@...il.com>
Signed-off-by: Ching-Chun (Jim) Huang <jserv@...s.ncku.edu.tw>
---
 arch/x86/include/asm/string_64.h |  3 ++
 arch/x86/lib/Makefile            |  1 +
 arch/x86/lib/string_64.c         | 78 ++++++++++++++++++++++++++++++++
 3 files changed, 82 insertions(+)
 create mode 100644 arch/x86/lib/string_64.c

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 6e450827f..edce657e0 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -14,6 +14,9 @@
 extern void *memcpy(void *to, const void *from, size_t len);
 extern void *__memcpy(void *to, const void *from, size_t len);
 
+#define __HAVE_ARCH_MEMCHR
+extern void *memchr(const void *cs, int c, size_t length);
+
 #define __HAVE_ARCH_MEMSET
 void *memset(void *s, int c, size_t n);
 void *__memset(void *s, int c, size_t n);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f76747862..4d530e559 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -69,5 +69,6 @@ else
         lib-y += clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o
+        lib-y += string_64.o
 	lib-y += cmpxchg16b_emu.o
 endif
diff --git a/arch/x86/lib/string_64.c b/arch/x86/lib/string_64.c
new file mode 100644
index 000000000..4e067d5be
--- /dev/null
+++ b/arch/x86/lib/string_64.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/align.h>
+
+/* How many bytes are loaded each iteration of the word copy loop */
+#define LBLOCKSIZE (sizeof(long))
+
+#ifdef __HAVE_ARCH_MEMCHR
+
+/**
+ * memchr - find the first occurrence of a byte in a memory area
+ * @cs: memory area to search
+ * @c: byte to search for (converted to an unsigned char)
+ * @length: length of the area
+ */
+void *memchr(const void *cs, int c, size_t length)
+{
+	const unsigned char *src = (const unsigned char *)cs, d = c;
+
+	/* Search byte-wise until @src is aligned on a long boundary. */
+	while (!IS_ALIGNED((unsigned long)src, sizeof(long))) {
+		if (!length--)
+			return NULL;
+		if (*src == d)
+			return (void *)src;
+		src++;
+	}
+	if (length >= LBLOCKSIZE) {
+		unsigned long mask = d << 8 | d;
+		unsigned long xor, data;
+		unsigned int i;
+		/* Unsigned: 0xFEFE...FF does not fit in a signed long. */
+		const unsigned long consta = 0xFEFEFEFEFEFEFEFF,
+			   constb = 0x8080808080808080;
+		/*
+		 * Create an 8-byte mask for word-wise comparing.
+		 * For example, the mask for 'a' is 0x6161616161616161.
+		 */
+		mask |= mask << 16;
+		for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
+			mask |= mask << i;
+		/*
+		 * Word-wise compare: @xor = word ^ @mask turns each
+		 * target byte into 0x00; (@xor + @consta) & ~@xor &
+		 * @constb is non-zero iff @xor has a 0x00 byte.
+		 *
+		 * @xor and @data are write-only (early-clobbered):
+		 * listing them as inputs would read them uninitialized.
+		 */
+		asm volatile("1:\n\t"
+			     "movq (%0),%1\n\t"
+			     "xorq %6,%1\n\t"
+			     "lea (%1,%4), %2\n\t"
+			     "notq %1\n\t"
+			     "andq %5,%1\n\t"
+			     "testq %1,%2\n\t"
+			     "jne 2f\n\t"
+			     "add $8,%0\n\t"
+			     "sub $8,%3\n\t"
+			     "cmp $7,%3\n\t"
+			     "ja 1b\n\t"
+			     "2:\n\t"
+			     : "=r"(src), "=&r"(xor), "=&r"(data), "=r"(length)
+			     : "r"(consta), "r"(constb), "r"(mask),
+			       "0"(src), "3"(length)
+			     : "memory", "cc");
+	}
+
+	while (length--) {
+		if (*src == d)
+			return (void *)src;
+		src++;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(memchr);
+#endif
-- 
2.25.1

Powered by blists - more mailing lists