Message-Id: <20210721135926.602840-1-nborisov@suse.com>
Date: Wed, 21 Jul 2021 16:59:26 +0300
From: Nikolay Borisov <nborisov@...e.com>
To: linux-kernel@...r.kernel.org
Cc: ndesaulniers@...gle.com, torvalds@...ux-foundation.org,
linux-fsdevel@...r.kernel.org, david@...morbit.com,
Nikolay Borisov <nborisov@...e.com>
Subject: [PATCH] lib/string: Bring optimized memcmp from glibc

This is glibc's memcmp version. The upside is that for architectures
which don't have an optimized version of their own, the kernel can
provide some solace in the form of a generic, word-at-a-time memcmp
instead of the current byte-at-a-time loop. I tested this with a heavy
FIDEDUPERANGE (ioctl_fideduperange(2)) workload (a rough sketch of it
follows the numbers below), and here are the results I got:

           UNPATCHED      PATCHED
  real     1m13.127s      1m10.611s
  user     0m0.030s       0m0.033s
  sys      0m5.890s       0m4.001s   (32% decrease)

Comparing the two perf profiles of the workload yields:

   30.44%   -29.39%   [kernel.vmlinux]   [k] memcmp
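
The workload itself boils down to repeatedly deduplicating identical
file ranges: before sharing extents the kernel memcmp()s the source
range against the destination range, which is where the memcmp cycles
in the profile come from. Roughly, each iteration of the test does
something like the sketch below (illustration only, not the exact test
program; the file names and the 1 MiB length are made up):

/*
 * Minimal FIDEDUPERANGE driver: dedupe the first LEN bytes of "src"
 * into "dst". Error handling trimmed for brevity.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

#define LEN (1024 * 1024)

int main(void)
{
        struct file_dedupe_range *range;
        int src = open("src", O_RDONLY);
        int dst = open("dst", O_RDWR);

        range = calloc(1, sizeof(*range) +
                          sizeof(struct file_dedupe_range_info));
        range->src_offset = 0;
        range->src_length = LEN;
        range->dest_count = 1;
        range->info[0].dest_fd = dst;
        range->info[0].dest_offset = 0;

        /* The dedupe path compares src vs dst contents with memcmp(). */
        if (ioctl(src, FIDEDUPERANGE, range) < 0)
                perror("FIDEDUPERANGE");
        else
                printf("deduped %llu bytes, status %d\n",
                       (unsigned long long)range->info[0].bytes_deduped,
                       range->info[0].status);

        free(range);
        close(src);
        close(dst);
        return 0;
}
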
Signed-off-by: Nikolay Borisov <nborisov@...e.com>
---
lib/string.c | 303 +++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 293 insertions(+), 10 deletions(-)
diff --git a/lib/string.c b/lib/string.c
index 7548eb715ddb..915db7e49804 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -923,22 +923,305 @@ EXPORT_SYMBOL(memmove);
#endif
#ifndef __HAVE_ARCH_MEMCMP
+
+/*
+ * Threshold value for when to enter the unrolled word loops; anything
+ * shorter is handled entirely by the byte-at-a-time tail loop in memcmp().
+ */
+#define MEMCMP_THRESH 16
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+# define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))
+# define CMP_LT_OR_GT(a, b) memcmp_bytes((a), (b))
+/*
+ * Compare A and B bytewise in the byte order of the machine; A and B are
+ * known to be different. This is needed only on little-endian machines,
+ * where the numeric significance of a word's bytes is the reverse of their
+ * memory order, so the two differing words cannot simply be compared as
+ * integers.
+ */
+static inline int memcmp_bytes(unsigned long a, unsigned long b)
+{
+ long srcp1 = (long) &a;
+ long srcp2 = (long) &b;
+ unsigned long a0, b0;
+
+ do {
+ a0 = ((uint8_t *) srcp1)[0];
+ b0 = ((uint8_t *) srcp2)[0];
+ srcp1 += 1;
+ srcp2 += 1;
+ } while (a0 == b0);
+ return a0 - b0;
+}
+
+#else
+# define MERGE(w0, sh_1, w1, sh_2) (((w0) << (sh_1)) | ((w1) >> (sh_2)))
+# define CMP_LT_OR_GT(a, b) ((a) > (b) ? 1 : -1)
+#endif
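+
+/*
+ * MERGE(w0, sh_1, w1, sh_2) reconstructs the word that starts sh_1/8 bytes
+ * past the start of w0, from the two aligned words w0 and w1 that straddle
+ * that address. It is used by memcmp_not_common_alignment() below when the
+ * two buffers have different alignment.
+ */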
+
+/*
+ * The strategy of this memcmp is:
+ *
+ * 1. Compare bytes until one of the block pointers is aligned.
+ *
+ * 2. Compare using memcmp_common_alignment or memcmp_not_common_alignment,
+ * depending on the alignment of the other block after the initial byte
+ * operations. The maximum possible number of full words (of type
+ * unsigned long) is compared this way.
+ *
+ * 3. Compare the few remaining bytes.
+ */
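+
+/*
+ * For example, memcmp(s1, s2, 100) with 8-byte words and s2 three bytes past
+ * an 8-byte boundary compares 5 single bytes, then 11 whole words (88 bytes)
+ * in one of the helpers below, and finally the remaining 7 bytes one at a
+ * time.
+ */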
+
+/*
+ * Compare blocks at SRCP1 and SRCP2 with LEN `unsigned long' objects (not LEN
+ * bytes!). Both SRCP1 and SRCP2 should be aligned for memory operations on
+ * `unsigned long's.
+ */
+static int memcmp_common_alignment(long srcp1, long srcp2, size_t len)
+{
+ unsigned long a0, a1;
+ unsigned long b0, b1;
+
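+ /*
+ * LEN % 4 selects where the 4-way unrolled loop below is entered; the
+ * pointer and LEN adjustments in each case compensate for the fixed
+ * [0..3] word indices used inside the loop body.
+ */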
+ switch (len % 4) {
+ default: /* Avoid warning about uninitialized local variables. */
+ case 2:
+ a0 = ((unsigned long *) srcp1)[0];
+ b0 = ((unsigned long *) srcp2)[0];
+ srcp1 -= 2 * sizeof(unsigned long);
+ srcp2 -= 2 * sizeof(unsigned long);
+ len += 2;
+ goto do1;
+ case 3:
+ a1 = ((unsigned long *) srcp1)[0];
+ b1 = ((unsigned long *) srcp2)[0];
+ srcp1 -= sizeof(unsigned long);
+ srcp2 -= sizeof(unsigned long);
+ len += 1;
+ goto do2;
+ case 0:
+ if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+ return 0;
+ a0 = ((unsigned long *) srcp1)[0];
+ b0 = ((unsigned long *) srcp2)[0];
+ goto do3;
+ case 1:
+ a1 = ((unsigned long *) srcp1)[0];
+ b1 = ((unsigned long *) srcp2)[0];
+ srcp1 += sizeof(unsigned long);
+ srcp2 += sizeof(unsigned long);
+ len -= 1;
+ if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+ goto do0;
+ /* Fallthrough */
+ }
+
+ do {
+ a0 = ((unsigned long *) srcp1)[0];
+ b0 = ((unsigned long *) srcp2)[0];
+ if (a1 != b1)
+ return CMP_LT_OR_GT(a1, b1);
+
+do3:
+ a1 = ((unsigned long *) srcp1)[1];
+ b1 = ((unsigned long *) srcp2)[1];
+ if (a0 != b0)
+ return CMP_LT_OR_GT(a0, b0);
+
+do2:
+ a0 = ((unsigned long *) srcp1)[2];
+ b0 = ((unsigned long *) srcp2)[2];
+ if (a1 != b1)
+ return CMP_LT_OR_GT(a1, b1);
+
+do1:
+ a1 = ((unsigned long *) srcp1)[3];
+ b1 = ((unsigned long *) srcp2)[3];
+ if (a0 != b0)
+ return CMP_LT_OR_GT(a0, b0);
+
+ srcp1 += 4 * sizeof(unsigned long);
+ srcp2 += 4 * sizeof(unsigned long);
+ len -= 4;
+ } while (len != 0);
+
+ /*
+ * This is the right position for do0. Please don't move it into the
+ * loop.
+ */
+do0:
+ if (a1 != b1)
+ return CMP_LT_OR_GT(a1, b1);
+ return 0;
+}
+
+/*
+ * Compare blocks at SRCP1 and SRCP2 with LEN `unsigned long' objects (not LEN
+ * bytes!). SRCP2 should be aligned for memory operations on `unsigned long',
+ * but SRCP1 *should be unaligned*.
+ */
+static int memcmp_not_common_alignment(long srcp1, long srcp2, size_t len)
+{
+ unsigned long a0, a1, a2, a3;
+ unsigned long b0, b1, b2, b3;
+ unsigned long x;
+ int shl, shr;
+
+ /*
+ * Calculate how to shift a word read at the memory-operation-aligned
+ * srcp1 so that it lines up with srcp2's words for the comparison.
+ */
+
+ shl = 8 * (srcp1 % sizeof(unsigned long));
+ shr = 8 * sizeof(unsigned long) - shl;
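+ /*
+ * E.g. with 8-byte words and srcp1 % 8 == 3: shl = 24, shr = 40, so
+ * MERGE() drops the 3 low-address bytes of the aligned word containing
+ * srcp1 and pulls in the first 3 bytes of the next word, yielding the
+ * 8 bytes that start at the unaligned srcp1.
+ */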
+
+ /*
+ * Make SRCP1 aligned by rounding it down to the beginning of the
+ * `unsigned long' it points in the middle of.
+ */
+ srcp1 &= -sizeof(unsigned long);
+
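+ /*
+ * As in memcmp_common_alignment(), LEN % 4 picks the entry point into
+ * the 4-way unrolled loop; the extra SRCP1 loads in each case keep two
+ * adjacent aligned words live so that MERGE() always has both halves
+ * of the unaligned word it needs.
+ */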
+ switch (len % 4) {
+ default: /* Avoid warning about uninitialized local variables. */
+ case 2:
+ a1 = ((unsigned long *) srcp1)[0];
+ a2 = ((unsigned long *) srcp1)[1];
+ b2 = ((unsigned long *) srcp2)[0];
+ srcp1 -= 1 * sizeof(unsigned long);
+ srcp2 -= 2 * sizeof(unsigned long);
+ len += 2;
+ goto do1;
+ case 3:
+ a0 = ((unsigned long *) srcp1)[0];
+ a1 = ((unsigned long *) srcp1)[1];
+ b1 = ((unsigned long *) srcp2)[0];
+ srcp2 -= 1 * sizeof(unsigned long);
+ len += 1;
+ goto do2;
+ case 0:
+ if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+ return 0;
+ a3 = ((unsigned long *) srcp1)[0];
+ a0 = ((unsigned long *) srcp1)[1];
+ b0 = ((unsigned long *) srcp2)[0];
+ srcp1 += 1 * sizeof(unsigned long);
+ goto do3;
+ case 1:
+ a2 = ((unsigned long *) srcp1)[0];
+ a3 = ((unsigned long *) srcp1)[1];
+ b3 = ((unsigned long *) srcp2)[0];
+ srcp1 += 2 * sizeof(unsigned long);
+ srcp2 += 1 * sizeof(unsigned long);
+ len -= 1;
+ if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+ goto do0;
+ /* Fallthrough */
+ }
+
+ do {
+ a0 = ((unsigned long *) srcp1)[0];
+ b0 = ((unsigned long *) srcp2)[0];
+ x = MERGE(a2, shl, a3, shr);
+ if (x != b3)
+ return CMP_LT_OR_GT(x, b3);
+
+do3:
+ a1 = ((unsigned long *) srcp1)[1];
+ b1 = ((unsigned long *) srcp2)[1];
+ x = MERGE(a3, shl, a0, shr);
+ if (x != b0)
+ return CMP_LT_OR_GT(x, b0);
+
+do2:
+ a2 = ((unsigned long *) srcp1)[2];
+ b2 = ((unsigned long *) srcp2)[2];
+ x = MERGE(a0, shl, a1, shr);
+ if (x != b1)
+ return CMP_LT_OR_GT(x, b1);
+
+do1:
+ a3 = ((unsigned long *) srcp1)[3];
+ b3 = ((unsigned long *) srcp2)[3];
+ x = MERGE(a1, shl, a2, shr);
+ if (x != b2)
+ return CMP_LT_OR_GT(x, b2);
+
+ srcp1 += 4 * sizeof(unsigned long);
+ srcp2 += 4 * sizeof(unsigned long);
+ len -= 4;
+ } while (len != 0);
+
+ /*
+ * This is the right position for do0. Please don't move it into
+ * the loop.
+ */
+do0:
+ x = MERGE(a2, shl, a3, shr);
+ if (x != b3)
+ return CMP_LT_OR_GT(x, b3);
+ return 0;
+}
+
+#undef memcmp
/**
* memcmp - Compare two areas of memory
- * @cs: One area of memory
- * @ct: Another area of memory
+ * @s1: One area of memory
+ * @s2: Another area of memory
- * @count: The size of the area.
+ * @len: The size of the area.
*/
-#undef memcmp
-__visible int memcmp(const void *cs, const void *ct, size_t count)
+__visible int memcmp(const void *s1, const void *s2, size_t len)
{
- const unsigned char *su1, *su2;
- int res = 0;
+ unsigned long a0;
+ unsigned long b0;
+ long srcp1 = (long) s1;
+ long srcp2 = (long) s2;
+ unsigned long res;
+
+ if (len >= MEMCMP_THRESH) {
+ /*
+ * There are at least MEMCMP_THRESH bytes to compare, and aligning
+ * SRCP2 consumes at most sizeof(unsigned long) - 1 of them, so there
+ * is no need to test for LEN == 0 in this alignment loop.
+ */
+ while (!IS_ALIGNED(srcp2, sizeof(unsigned long))) {
+ a0 = ((uint8_t *) srcp1)[0];
+ b0 = ((uint8_t *) srcp2)[0];
+ srcp1 += 1;
+ srcp2 += 1;
+ res = a0 - b0;
+ if (res != 0)
+ return res;
+ len -= 1;
+ }
- for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--)
- if ((res = *su1 - *su2) != 0)
- break;
- return res;
+ /*
+ * SRCP2 is now aligned for memory operations on `unsigned long'.
+ * SRCP1 alignment determines if we can do a simple, aligned
+ * compare or need to shuffle bits.
+ */
+
+ if (IS_ALIGNED(srcp1, sizeof(unsigned long)))
+ res = memcmp_common_alignment(srcp1, srcp2,
+ len / sizeof(unsigned long));
+ else
+ res = memcmp_not_common_alignment(srcp1, srcp2,
+ len / sizeof(unsigned long));
+
+ if (res != 0)
+ return res;
+
+ /*
+ * Step over the whole words just compared; the number of bytes
+ * remaining is in the interval [0 .. sizeof(unsigned long) - 1].
+ */
+ srcp1 += len & -sizeof(unsigned long);
+ srcp2 += len & -sizeof(unsigned long);
+ len %= sizeof(unsigned long);
+ }
+
+ /* There are just a few bytes to compare. Use byte memory operations. */
+ while (len != 0) {
+ a0 = ((uint8_t *) srcp1)[0];
+ b0 = ((uint8_t *) srcp2)[0];
+ srcp1 += 1;
+ srcp2 += 1;
+ res = a0 - b0;
+ if (res != 0)
+ return res;
+ len -= 1;
+ }
+
+ return 0;
}
EXPORT_SYMBOL(memcmp);
#endif
--
2.25.1