lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240121153649.2733274-4-visitorckw@gmail.com>
Date: Sun, 21 Jan 2024 23:36:47 +0800
From: Kuan-Wei Chiu <visitorckw@...il.com>
To: colyli@...e.de,
	kent.overstreet@...ux.dev
Cc: bfoster@...hat.com,
	jserv@...s.ncku.edu.tw,
	linux-bcache@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	linux-bcachefs@...r.kernel.org,
	Kuan-Wei Chiu <visitorckw@...il.com>
Subject: [PATCH 3/5] bcachefs: Optimize sort_cmp_size() using bottom-up heapsort

This optimization reduces the average number of comparisons required
from 2*n*log2(n) - 3*n + o(n) to n*log2(n) + 0.37*n + o(n). When n is
sufficiently large, it results in approximately 50% fewer comparisons.

Currently, sort_cmp_size employs the textbook version of heapsort,
where during the heapify process, each level requires two comparisons
to determine the maximum among three elements. In contrast, the
bottom-up heapsort, during heapify, only compares two children at each
level until reaching a leaf node. Then, it backtracks from the leaf
node to find the correct position. Since heapify typically continues
until very close to the leaf node, the standard heapify requires about
2*log2(n) comparisons, while the bottom-up variant only needs log2(n)
comparisons.

Note: This patch depends on patch "bcachefs: Introduce parent function
for sort_cmp_size()".

The experimental data presented below is based on an array generated
by get_random_u32().

|   N   | comparisons(old) | comparisons(new) | time(old) | time(new) |
|-------|------------------|------------------|-----------|-----------|
| 10000 |     235498       |     136592       |  17954 us |  14834 us |
| 20000 |     510666       |     293254       |  23549 us |  18364 us |
| 30000 |     800461       |     457351       |  33361 us |  19493 us |
| 40000 |    1101317       |     626661       |  33672 us |  26810 us |
| 50000 |    1409743       |     799745       |  42634 us |  34694 us |
| 60000 |    1721165       |     974737       |  51414 us |  41367 us |
| 70000 |    2037972       |    1152111       |  63084 us |  49146 us |
| 80000 |    2362590       |    1333270       |  73802 us |  54715 us |
| 90000 |    2690155       |    1516148       |  82693 us |  63070 us |
| 100000|    3019730       |    1699757       |  88981 us |  70367 us |

Refs:
  BOTTOM-UP-HEAPSORT, a new variant of HEAPSORT beating, on an average,
  QUICKSORT (if n is not very small)
  Ingo Wegener
  Theoretical Computer Science, 118(1); Pages 81-98, 13 September 1993
  https://doi.org/10.1016/0304-3975(93)90364-Y

Signed-off-by: Kuan-Wei Chiu <visitorckw@...il.com>
---
This patch has undergone unit testing and micro benchmarking using the
following code [1].

[1]:
static long long int cmp_count = 0;

static int mycmp(const void *a, const void *b, size_t size)
{
    u32 _a = *(u32 *)a;
	u32 _b = *(u32 *)b;

	cmp_count++;
	if (_a < _b)
		return -1;
	else if (_a > _b)
		return 1;
	else
		return 0;
}

static int test(void)
{
    size_t N, i;
	ktime_t start, end;
	s64 delta;
	u32 *arr;

	for (N = 10000; N <= 100000; N += 10000) {
		arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL);
		cmp_count = 0;

		for (i = 0; i < N; i++)
			arr[i] = get_random_u32();
		
		start = ktime_get();
		sort_cmp_size(arr, N, sizeof(u32), mycmp, NULL);
		end = ktime_get();

		delta = ktime_us_delta(end, start);
        printk(KERN_INFO "time: %lld\n", delta);
		printk(KERN_INFO "comparisons: %lld\n", cmp_count);

		for (i = 0; i < N - 1; i++) {
			if (arr[i] > arr[i+1]) {
				kfree(arr);
				return -1;
			}
		}

		kfree(arr);
	}
    return 0;
}

 fs/bcachefs/util.c | 52 +++++++++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index f5bbf96df2ce..4dacb2b9a0a5 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -979,7 +979,8 @@ void sort_cmp_size(void *base, size_t num, size_t size,
 	  void (*swap_func)(void *, void *, size_t size))
 {
 	/* pre-scale counters for performance */
-	int i = (num/2 - 1) * size, n = num * size, c, r;
+	int i = (num / 2 - 1) * size, n = num * size, j, k;
+	const size_t lsbit = size & -size;
 
 	if (!swap_func) {
 		if (size == 4 && alignment_ok(base, 4))
@@ -992,28 +993,45 @@ void sort_cmp_size(void *base, size_t num, size_t size,
 
 	/* heapify */
 	for ( ; i >= 0; i -= size) {
-		for (r = i; r * 2 + size < n; r  = c) {
-			c = r * 2 + size;
-			if (c < n - size &&
-			    cmp_func(base + c, base + c + size, size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c, size) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
+		/* Find the sift-down path all the way to the leaves. */
+		for (j = i; k = j * 2 + size, k + size < n;)
+			j = cmp_func(base + k, base + k + size, size) > 0 ? k : k + size;
+
+		/* Special case for the last leaf with no sibling. */
+		if (j * 2 + size * 2 == n)
+			j = j * 2 + size;
+
+		/* Backtrack to the correct location. */
+		while (j != i && cmp_func(base + i, base + j, size) >= 0)
+			j = parent(j, lsbit, size);
+
+		/* Shift the element into its correct place. */
+		for (k = j; j != i;) {
+			j = parent(j, lsbit, size);
+			swap_func(base + j, base + k, size);
 		}
 	}
 
 	/* sort */
 	for (i = n - size; i > 0; i -= size) {
 		swap_func(base, base + i, size);
-		for (r = 0; r * 2 + size < i; r = c) {
-			c = r * 2 + size;
-			if (c < i - size &&
-			    cmp_func(base + c, base + c + size, size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c, size) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
+
+		/* Find the sift-down path all the way to the leaves. */
+		for (j = 0; k = j * 2 + size, k + size < i;)
+			j = cmp_func(base + k, base + k + size, size) > 0 ? k : k + size;
+
+		/* Special case for the last leaf with no sibling. */
+		if (j * 2 + size * 2 == i)
+			j = j * 2 + size;
+
+		/* Backtrack to the correct location. */
+		while (j && cmp_func(base, base + j, size) >= 0)
+			j = parent(j, lsbit, size);
+
+		/* Shift the element into its correct place. */
+		for (k = j; j;) {
+			j = parent(j, lsbit, size);
+			swap_func(base + j, base + k, size);
 		}
 	}
 }
-- 
2.25.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ