lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <1425838655-24775-1-git-send-email-yury.norov@gmail.com>
Date:	Sun,  8 Mar 2015 21:17:35 +0300
From:	Yury Norov <yury.norov@...il.com>
To:	linux@...izon.com, klimov.linux@...il.com,
	linux@...musvillemoes.dk, akpm@...ux-foundation.org
Cc:	davem@...emloft.net, dborkman@...hat.com,
	hannes@...essinduktion.org, laijs@...fujitsu.com,
	msalter@...hat.com, takahiro.akashi@...aro.org, tgraf@...g.ch,
	valentinrothberg@...il.com, yury.norov@...il.com,
	linux-kernel@...r.kernel.org, chris@...is-wilson.co.uk
Subject: Re: [PATCH v5 0/3] lib: find_*_bit reimplementation

It looks I screwed up with performance measurements.

At first, I measured performance of 'find_next_zero_bit', while wrote about
'find_next_bit'. I was really thinking it's not matter, so I'm sorry.

At second, to run test on userspace I got ffs and ffz from kernel.  And it
looks like all performance difference is caused by them.  Simple measurement
of performance of current 'find_next_bit' and 'find_next_zero_bit' shows the
same 35% difference, while the code is virtually identical. The trick is that
new implementation doesn't use ffz, that is slow for me (because I got generic
implementation), and so 'find_next_zero_bit' looks faster, while it's not.
In kernel, ffs and ffz are both fast.  I'm sorry again.

When I ran this test in kernel mode, I found measurements weird. I tested it on
Core i7-3770@...0GHz and Core i7-2630QM@...0GHz. First machine did show noizly
but relatively equal results (I cannot attach it now), second one is more stable,
and it looks like performance degrade for about 20%.

	find_next_bit	find_next_zero_bit
        old	new	old	new
        25	30	24	29
        25	30      24	29
        25	30      25	29
        26	29      24	29
        25	30      24	29
        26	30      24	29
        26	30      24	30
        25	30      25	29
        26	30      25	29
        25	30      25	30

The module I use for testing is here.

To Alexey:
Could you be so kind to run this test on your Odroid? It's also interesting, is
there real difference between generic and platform implementations. 

---
 lib/Kconfig.debug   |   8 +++
 lib/Makefile        |   1 +
 lib/test_find_bit.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 198 insertions(+)
 create mode 100644 lib/test_find_bit.c

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c5cefb3..5d78dbb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1669,6 +1669,14 @@ config DMA_API_DEBUG
 
 	  If unsure, say N.
 
+config TEST_FIND_BIT
+	tristate "Test find_bit family performance"
+	default n
+	help
+	  This builds the "test_find_bit".
+
+	  If unsure, say N.
+
 config TEST_LKM
 	tristate "Test module loading with 'hello world' module"
 	default n
diff --git a/lib/Makefile b/lib/Makefile
index d857965..9186da4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test-hexdump.o
 obj-y += kstrtox.o
 obj-$(CONFIG_TEST_BPF) += test_bpf.o
 obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
+obj-$(CONFIG_TEST_FIND_BIT) += test_find_bit.o
 obj-$(CONFIG_TEST_KASAN) += test_kasan.o
 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
 obj-$(CONFIG_TEST_LKM) += test_module.o
diff --git a/lib/test_find_bit.c b/lib/test_find_bit.c
new file mode 100644
index 0000000..0c04022
--- /dev/null
+++ b/lib/test_find_bit.c
@@ -0,0 +1,189 @@
+/*
+ * This module tests 'find_next_bit' performance.
+ */
+
+#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+
+#define BITOP_WORD(nr)         ((nr) / BITS_PER_LONG)
+
+unsigned long find_next_bit_old(const unsigned long *addr, unsigned long size,
+			    unsigned long offset)
+{
+	const unsigned long *p = addr + BITOP_WORD(offset);
+	unsigned long result = offset & ~(BITS_PER_LONG-1);
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset %= BITS_PER_LONG;
+	if (offset) {
+		tmp = *(p++);
+		tmp &= (~0UL << offset);
+		if (size < BITS_PER_LONG)
+			goto found_first;
+		if (tmp)
+			goto found_middle;
+		size -= BITS_PER_LONG;
+		result += BITS_PER_LONG;
+	}
+	while (size & ~(BITS_PER_LONG-1)) {
+		if ((tmp = *(p++)))
+			goto found_middle;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp &= (~0UL >> (BITS_PER_LONG - size));
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size;	/* Nope. */
+found_middle:
+	return result + __ffs(tmp);
+}
+
+unsigned long find_next_zero_bit_old(const unsigned long *addr, unsigned long size,
+				 unsigned long offset)
+{
+	const unsigned long *p = addr + BITOP_WORD(offset);
+	unsigned long result = offset & ~(BITS_PER_LONG-1);
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset %= BITS_PER_LONG;
+	if (offset) {
+		tmp = *(p++);
+		tmp |= ~0UL >> (BITS_PER_LONG - offset);
+		if (size < BITS_PER_LONG)
+			goto found_first;
+		if (~tmp)
+			goto found_middle;
+		size -= BITS_PER_LONG;
+		result += BITS_PER_LONG;
+	}
+	while (size & ~(BITS_PER_LONG-1)) {
+		if (~(tmp = *(p++)))
+			goto found_middle;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp |= ~0UL << size;
+	if (tmp == ~0UL)	/* Are any bits zero? */
+		return result + size;	/* Nope. */
+found_middle:
+	return result + ffz(tmp);
+ }
+static unsigned long _find_next_bit(const unsigned long *addr,
+		unsigned long nbits, unsigned long start, unsigned long invert)
+{
+	unsigned long tmp;
+
+	if (!nbits || start >= nbits)
+		return nbits;
+
+	tmp = addr[start / BITS_PER_LONG] ^ invert;
+
+	/* Handle 1st word. */
+	tmp &= BITMAP_FIRST_WORD_MASK(start);
+	start = round_down(start, BITS_PER_LONG);
+
+	while (!tmp) {
+		start += BITS_PER_LONG;
+		if (start >= nbits)
+			return nbits;
+
+		tmp = addr[start / BITS_PER_LONG] ^ invert;
+	}
+
+	return min(start + __ffs(tmp), nbits);
+}
+
+unsigned long find_next_bit_new(const unsigned long *addr, unsigned long size,
+			    unsigned long offset)
+{
+	return _find_next_bit(addr, size, offset, 0UL);
+}
+
+unsigned long find_next_zero_bit_new(const unsigned long *addr, unsigned long size,
+			    unsigned long offset)
+{
+	return _find_next_bit(addr, size, offset, ~0UL);
+}
+
+static int __init test_module_init(void)
+{
+	long long start, old, new;
+	unsigned long *addr;
+	unsigned long ret, i, nbits = 30*1024*1024;
+
+	addr = alloc_pages_exact(nbits/8, GFP_KERNEL);
+	if (!addr) {
+		pr_warn("No mem\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < 10; i++) {
+		get_random_bytes(addr, nbits/8);
+
+		ret = 0;
+		start = jiffies;
+		while (ret < nbits)
+			ret = find_next_zero_bit_old(addr, nbits, ret + 1);
+
+		old = jiffies - start;
+
+		ret = 0;
+		start = jiffies;
+		while (ret < nbits)
+			ret = find_next_zero_bit_new(addr, nbits, ret + 1);
+
+		new = jiffies - start;
+		pr_warn("%lld\t%lld\n", old, new);
+	}
+
+	pr_warn("\n");
+
+	for (i = 0; i < 10; i++) {
+		get_random_bytes(addr, nbits/8);
+
+		ret = 0;
+		start = jiffies;
+		while (ret < nbits)
+			ret = find_next_bit_old(addr, nbits, ret + 1);
+
+		old = jiffies - start;
+
+		ret = 0;
+		start = jiffies;
+		while (ret < nbits)
+			ret = find_next_bit_new(addr, nbits, ret + 1);
+
+		new = jiffies - start;
+
+		pr_warn("%lld\t%lld\n", old, new);
+	}
+
+	free_pages_exact(addr, nbits/8);
+	return 0;
+}
+module_init(test_module_init);
+
+static void __exit test_module_exit(void) {}
+module_exit(test_module_exit);
+
+MODULE_AUTHOR("Yury Norov <yury.norov@...il.com>");
+MODULE_LICENSE("GPL");
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ