Message-Id: <20240121153649.2733274-5-visitorckw@gmail.com>
Date: Sun, 21 Jan 2024 23:36:48 +0800
From: Kuan-Wei Chiu <visitorckw@...il.com>
To: colyli@...e.de,
	kent.overstreet@...ux.dev
Cc: bfoster@...hat.com,
	jserv@...s.ncku.edu.tw,
	linux-bcache@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	linux-bcachefs@...r.kernel.org,
	Kuan-Wei Chiu <visitorckw@...il.com>
Subject: [PATCH 4/5] bcachefs: Optimize number of comparisons in heap_sift_down

Optimize the heap_sift_down() macro, reducing the number of comparisons
by roughly 50% for large random inputs while producing identical
results.

The current implementation performs two comparisons per level to select
the appropriate element among the parent and its two children. The
proposed bottom-up variation instead spends only one comparison per
level, comparing the two children, while descending to a leaf, and then
sifts the element back up until its correct position is found.

Since the element being sifted down typically belongs near the leaf
level, the sift-up phase usually needs only O(1) additional comparisons
instead of log2(n). This significantly reduces the number of costly
indirect calls to the comparison function and improves overall
performance.
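
For illustration, the same bottom-up scheme written out as a standalone
helper on a plain u32 array holding a min-heap (parent <= children, the
convention mycmp()/check_heap() below use) looks roughly like this; the
helper and its names are only a sketch and are not part of the patch:

static void sift_down_bottom_up(u32 *data, size_t used, size_t i)
{
    size_t j = i, k;

    /* Descend to a leaf, one comparison per level: follow the smaller child. */
    while ((k = 2 * j + 1) + 1 < used) {
        if (data[k] >= data[k + 1])
            k++;
        j = k;
    }
    if (2 * j + 2 == used)      /* bottom level: node with only a left child */
        j = 2 * j + 1;

    /* Climb back up until data[i] may sit at j; typically only O(1) steps. */
    while (j != i && data[i] <= data[j])
        j = (j - 1) / 2;

    /* Rotate the path: each element from i down to j moves up one level,
     * and data[i] lands at position j. */
    for (k = j; j != i;) {
        u32 tmp;

        j = (j - 1) / 2;
        tmp = data[j];
        data[j] = data[k];
        data[k] = tmp;
    }
}

The descent costs one comparison per level; because a sifted-down
element usually belongs near the bottom of the heap, the climb back up
rarely takes more than a few steps.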

The experimental data below is derived from first adding N elements
generated by get_random_u32() to the heap, and then performing heap_pop
until the heap is empty.

|    N   | comparisons (old) | comparisons (new) | time (old) | time (new) |
|--------|-------------------|-------------------|------------|------------|
|  10000 |            239233 |            142673 |    1219 us |    1000 us |
|  20000 |            518498 |            305394 |    2564 us |    1904 us |
|  30000 |            812864 |            476594 |    4197 us |    3203 us |
|  40000 |           1117553 |            651737 |    5666 us |    4290 us |
|  50000 |           1430128 |            830600 |    7156 us |    5574 us |
|  60000 |           1746128 |           1012201 |    8862 us |    7036 us |
|  70000 |           2066099 |           1196653 |   10454 us |    8111 us |
|  80000 |           2394593 |           1383311 |   11993 us |    9602 us |
|  90000 |           2727097 |           1572381 |   13501 us |   10718 us |
| 100000 |           3059841 |           1763083 |   15325 us |   11776 us |

Signed-off-by: Kuan-Wei Chiu <visitorckw@...il.com>
---
This patch has been unit tested and micro-benchmarked using the
following code [1].

[1]:
static long long int cmp_count = 0;

typedef HEAP(u32) heap;

int mycmp(heap *h, u32 a, u32 b)
{
    cmp_count++;
    if (a < b)
        return -1;
    if (a > b)
        return 1;
    return 0;
}

int check_heap(heap *h, int (*cmp)(heap *, u32, u32))
{
    size_t i;

    for (i = 0; i < h->used / 2; i++) {
        if (i * 2 + 1 < h->used)
            if (cmp(h, h->data[i], h->data[i * 2 + 1]) > 0)
                return -1;
        if (i * 2 + 2 < h->used)
            if (cmp(h, h->data[i], h->data[i * 2 + 2]) > 0)
                return -1;
    }
    return 0;
}

static int test(void)
{
    size_t N, i;
    u32 x;
    heap myheap;
    ktime_t start, end;
    s64 delta;

    /* Test for correctness. */
    for (N = 1000; N <= 10000; N += 1000) {
        init_heap(&myheap, N, GFP_KERNEL);
        for (i = 0; i < N; i++) {
            heap_add(&myheap, get_random_u32(), mycmp, NULL);
            if (check_heap(&myheap, mycmp))
                return -1;
        }
        for (i = 0; i < N; i++) {
            heap_pop(&myheap, x, mycmp, NULL);
            if (check_heap(&myheap, mycmp))
                return -1;
        }
        free_heap(&myheap);
    }

    /* Micro-benchmark. */
    for (N = 10000; N <= 100000; N += 10000) {
        cmp_count = 0;
        init_heap(&myheap, N, GFP_KERNEL);

        start = ktime_get();
        for (i = 0; i < N; i++)
            heap_add(&myheap, get_random_u32(), mycmp, NULL);
        for (i = 0; i < N; i++)
            heap_pop(&myheap, x, mycmp, NULL);
        end = ktime_get();
        delta = ktime_us_delta(end, start);
        printk(KERN_INFO "time: %lld\n", delta);
        printk(KERN_INFO "comparisons: %lld\n", cmp_count);
        free_heap(&myheap);
    }

    return 0;
}
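
If it helps with reproduction, the snippet above can be wrapped in a
small out-of-tree module along these lines; the heap_test_* names and
module boilerplate below are only a sketch and are not part of the
patch (the snippet itself additionally needs the usual headers such as
<linux/ktime.h> and <linux/random.h> plus the bcachefs heap
definitions):

#include <linux/errno.h>
#include <linux/module.h>

static int __init heap_test_init(void)
{
    /* Fail the module load if any heap-property check above failed. */
    return test() ? -EINVAL : 0;
}

static void __exit heap_test_exit(void)
{
}

module_init(heap_test_init);
module_exit(heap_test_exit);
MODULE_LICENSE("GPL");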

 fs/bcachefs/util.h | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index c75fc31915d3..be22eb63084b 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -131,17 +131,20 @@ do {									\
 
 #define heap_sift_down(h, i, cmp, set_backpointer)			\
 do {									\
-	size_t _c, _j = i;						\
+	size_t _j, _k;							\
 									\
-	for (; _j * 2 + 1 < (h)->used; _j = _c) {			\
-		_c = _j * 2 + 1;					\
-		if (_c + 1 < (h)->used &&				\
-		    cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0)	\
-			_c++;						\
-									\
-		if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0)		\
-			break;						\
-		heap_swap(h, _c, _j, set_backpointer);			\
+	for (_j = i; _k = _j * 2 + 1, _k + 1 < (h)->used;) {		\
+		if (cmp(h, (h)->data[_k], (h)->data[_k + 1]) >= 0)	\
+			_k++;						\
+		_j = _k;						\
+	}								\
+	if (_j * 2 + 2 == (h)->used)					\
+		_j = _j * 2 + 1;					\
+	while (_j != i && cmp(h, (h)->data[i], (h)->data[_j]) <= 0)	\
+		_j = (_j - 1) / 2;					\
+	for (_k = _j; _j != i;) {					\
+		_j = (_j - 1) / 2;					\
+		heap_swap(h, _j, _k, set_backpointer);			\
 	}								\
 } while (0)
 
-- 
2.25.1

