lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <1461744350-24156-1-git-send-email-zengzhaoxiu@163.com>
Date:	Wed, 27 Apr 2016 16:05:50 +0800
From:	zengzhaoxiu@....com
To:	akpm@...ux-foundation.org, linux@...izon.com, peterz@...radead.org
Cc:	linux-kernel@...r.kernel.org, Zhaoxiu Zeng <zhaoxiu.zeng@...il.com>
Subject: [patch V2] lib: GCD: add binary GCD algorithm

From: Zhaoxiu Zeng <zhaoxiu.zeng@...il.com>

Because some architectures (alpha, armv6, etc.) don't provide hardware division,
the mod operation is slow! Binary GCD algorithm uses simple arithmetic operations,
it replaces division with arithmetic shifts, comparisons, and subtraction.

I have compiled successfully with x86_64_defconfig and i386_defconfig.

I use the following code to test:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <time.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		for (;;) {

			a >>= __builtin_ctzl(a);
			if (a == b)
				break;

			if (a < b)
				swap(a, b);
			a -= b;
		}

		b <<= __builtin_ctzl(r);
		return b;
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r ^= (r - 1);

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				break;

			if (a < b)
				swap(a, b);

			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
		}

		return b;
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))
	#define TEST_TIMES 100
	static unsigned long res[TEST_ENTRIES][TEST_TIMES];

	#define TIME_T struct timespec

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static uint64_t diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000UL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000UL + temp.tv_nsec;
	}

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		unsigned int i, j;
		TIME_T start, end;

		for (i = 0; i < TEST_ENTRIES; i++) {
			srand(seed);
			start = read_time();
			for (j = 0; j < TEST_TIMES; j++)
				res[i][j] = gcd_func[i](get_rand(), get_rand());
			end = read_time();
			printf("gcd%d: elapsed %lld\n", i, diff_time(start, end));
			sleep(1);
		}

		for (j = 0; j < TEST_TIMES; j++) {
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[i][j] != res[0][j])
					break;
			}
			if (i < TEST_ENTRIES) {
				fprintf(stderr, "Error: ");
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "res%d %ld%s", i, res[i][j], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		return 0;
	}

Compiled with "-O2", on "VirtualBox 4.2.0-35-generic #40-Ubuntu x86_64" got:

zhaoxiuzeng@...oxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 92281
gcd1: elapsed 55005
gcd2: elapsed 91088
zhaoxiuzeng@...oxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 115546
gcd1: elapsed 55928
gcd2: elapsed 91938
zhaoxiuzeng@...oxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 91189
gcd1: elapsed 55493
gcd2: elapsed 90078
zhaoxiuzeng@...oxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 157364
gcd1: elapsed 55204
gcd2: elapsed 90058
zhaoxiuzeng@...oxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 91667
gcd1: elapsed 54641
gcd2: elapsed 91364

Changes to V1:
- Don't touch Kconfig, remove the Euclidean algorithm implementation
- Don't use the "even-odd" variant
- Use __ffs if the CPU has efficient __ffs

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@...il.com>
---
 lib/gcd.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 72 insertions(+), 9 deletions(-)

diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f12..460b414 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,19 +2,82 @@
 #include <linux/gcd.h>
 #include <linux/export.h>
 
-/* Greatest common divisor */
+/*
+ * use __ffs if the CPU has efficient __ffs
+ */
+#if (defined(CONFIG_ALPHA) && defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67)) || \
+	defined(CONFIG_ARC) || \
+	(defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 5) || defined(CONFIG_ARM64) || \
+	defined(CONFIG_AVR32) || \
+	defined(CONFIG_BLACKFIN) || \
+	defined(CONFIG_C6X) || \
+	defined(CONFIG_CRIS) || \
+	defined(CONFIG_FRV) || \
+	defined(CONFIG_HEXAGON) || \
+	defined(CONFIG_IA64) || \
+	(defined(CONFIG_M68K) && \
+	 (!defined(CONFIG_CPU_HAS_NO_BITFIELDS) || \
+	  ((defined(__mcfisaaplus__) || defined(__mcfisac__)) && \
+	   !defined(CONFIG_M68000) && !defined(CONFIG_MCPU32)))) || \
+	defined(CONFIG_MN10300) || \
+	defined(CONFIG_OPENRISC) || \
+	defined(CONFIG_POWERPC) || \
+	defined(CONFIG_S390) || \
+	defined(CONFIG_TILE) || \
+	defined(CONFIG_UNICORE32) || \
+	defined(CONFIG_X86) || \
+	defined(CONFIG_XTENSA)
+# define USE_FFS 1
+#elif defined(CONFIG_MIPS)
+# define USE_FFS (__builtin_constant_p(cpu_has_clo_clz) && cpu_has_clo_clz)
+#else
+# define USE_FFS 0
+#endif
+
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knith has noted, appears a first-century Chinese math text.)
+ */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
-	unsigned long r;
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	if (USE_FFS) {
+		b >>= __ffs(b);
+	} else {
+		/* least-significant mask, equ to "(1UL << ffs(r)) - 1" */
+		r ^= (r - 1);
+
+		while (!(b & r))
+			b >>= 1;
+	}
+
+	for (;;) {
+		if (USE_FFS) {
+			a >>= __ffs(a);
+		} else {
+			while (!(a & r))
+				a >>= 1;
+		}
+		if (a == b)
+			break;
 
-	if (a < b)
-		swap(a, b);
+		if (a < b)
+			swap(a, b);
+
+		a -= b;
+		if (!USE_FFS) {
+			a >>= 1;
+			if (a & r)
+				a += b;
+		}
+	}
 
-	if (!b)
-		return a;
-	while ((r = a % b) != 0) {
-		a = b;
-		b = r;
+	if (USE_FFS) {
+		b <<= __ffs(r);
 	}
 	return b;
 }
-- 
2.5.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ