Message-ID: <20241125041129.192999-7-ebiggers@kernel.org>
Date: Sun, 24 Nov 2024 20:11:29 -0800
From: Eric Biggers <ebiggers@...nel.org>
To: linux-kernel@...r.kernel.org
Cc: linux-crypto@...r.kernel.org,
	x86@...nel.org,
	Ard Biesheuvel <ardb@...nel.org>
Subject: [PATCH 6/6] x86/crc32: implement crc32_be using new template

From: Eric Biggers <ebiggers@...gle.com>

crc32_be was previously unoptimized on x86. Optimize it using the new
template. This improves performance by over 25x in some cases.

Benchmark results on AMD Ryzen 9 9950X (Zen 5) using crc_kunit:

	Length (bytes)      Before       After
	--------------      ------       -----
	             1    389 MB/s    325 MB/s
	            16   2845 MB/s   2911 MB/s
	            64   3012 MB/s   6513 MB/s
	           127   2567 MB/s   9057 MB/s
	           128   3048 MB/s  11589 MB/s
	           200   3070 MB/s  14042 MB/s
	           256   3067 MB/s  20454 MB/s
	           511   2938 MB/s  26245 MB/s
	           512   3081 MB/s  36926 MB/s
	          1024   3090 MB/s  61914 MB/s
	          3173   3065 MB/s  76201 MB/s
	          4096   3084 MB/s  82547 MB/s
	         16384   3084 MB/s  89333 MB/s
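
For reference, the CRC in question is the plain MSB-first CRC-32
(generator polynomial 0x04c11db7), i.e. the crc32_be() flavor exported
by lib/crc32. A minimal bit-at-a-time userspace sketch, handy for
sanity-checking the PCLMULQDQ output (the test harness and its check
value are illustrative, not part of this patch):

	/* Bit-at-a-time MSB-first CRC-32 with G(x) = 0x04c11db7,
	 * matching the semantics of the kernel's crc32_be(). */
	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint32_t crc32_be_ref(uint32_t crc, const uint8_t *p,
				     size_t len)
	{
		while (len--) {
			crc ^= (uint32_t)*p++ << 24;
			for (int i = 0; i < 8; i++)
				crc = (crc << 1) ^
				      ((crc & 0x80000000) ? 0x04c11db7 : 0);
		}
		return crc;
	}

	int main(void)
	{
		/* CRC-32/MPEG-2 check value: seed 0xffffffff, no final
		 * xor applied. */
		uint32_t crc = crc32_be_ref(0xffffffff,
					    (const uint8_t *)"123456789", 9);

		printf("%08x (expect 0376e6e7)\n", crc);
		return 0;
	}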

Signed-off-by: Eric Biggers <ebiggers@...gle.com>
---
 arch/x86/lib/crc-pclmul-consts.h | 49 +++++++++++++++++++++++++++++++-
 arch/x86/lib/crc32-glue.c        |  4 +++
 arch/x86/lib/crc32-pclmul.S      |  1 +
 3 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h
index c3ca689eae3b8..f8af6e9278c83 100644
--- a/arch/x86/lib/crc-pclmul-consts.h
+++ b/arch/x86/lib/crc-pclmul-consts.h
@@ -1,10 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * CRC constants generated by:
  *
- *	./scripts/crc/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320
+ *	./scripts/crc/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_msb_0x04c11db7
  *
  * Do not edit manually.
  */
 
 /*
@@ -97,5 +97,52 @@ static const struct {
 		0xb4e5b025f7011641,	/* floor(x^95 / G(x)) */
 		0x1db710641,	/* G(x) */
 	},
 	.extract_crc_mask = {0, 0xffffffff},
 };
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ *        x^5 + x^4 + x^2 + x + 1
+ */
+static const struct {
+	u8 bswap_mask[16];
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc32_msb_0x04c11db7_consts __cacheline_aligned __maybe_unused = {
+	.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+	.fold_across_2048_bits_consts = {
+		0x88fe2237,	/* x^(2048+0) mod G(x) */
+		0xcbcf3bcb,	/* x^(2048+64) mod G(x) */
+	},
+	.fold_across_1024_bits_consts = {
+		0x567fddeb,	/* x^(1024+0) mod G(x) */
+		0x10bd4d7c,	/* x^(1024+64) mod G(x) */
+	},
+	.fold_across_512_bits_consts = {
+		0xe6228b11,	/* x^(512+0) mod G(x) */
+		0x8833794c,	/* x^(512+64) mod G(x) */
+	},
+	.fold_across_256_bits_consts = {
+		0x75be46b7,	/* x^(256+0) mod G(x) */
+		0x569700e5,	/* x^(256+64) mod G(x) */
+	},
+	.fold_across_128_bits_consts = {
+		0xe8a45605,	/* x^(128+0) mod G(x) */
+		0xc5b9cd4c,	/* x^(128+64) mod G(x) */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0x04d101df481b4e5a,	/* floor(x^96 / G(x)) - x^64 */
+		0x104c11db7,	/* G(x) */
+	},
+};
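
(Aside, not part of the patch: the x^N mod G(x) constants above are easy
to spot-check outside of gen-crc-consts.py. The standalone C sketch
below computes x^n mod G(x) over GF(2) by repeated multiplication by x
followed by reduction; it should reproduce the
fold_across_128_bits_consts values when run with n = 128 and n = 192.)

	#include <stdint.h>
	#include <stdio.h>

	/* x^n mod G(x) for the MSB-first CRC-32 generator polynomial.
	 * G(x) = x^32 + ... + 1, whose low 32 bits are 0x04c11db7. */
	static uint32_t xn_mod_g(unsigned int n)
	{
		uint32_t r = 1;	/* x^0 mod G(x) */

		while (n--) {
			uint32_t carry = r & 0x80000000;

			r <<= 1;	/* multiply by x */
			if (carry)	/* x^32 == 0x04c11db7 (mod G(x)) */
				r ^= 0x04c11db7;
		}
		return r;
	}

	int main(void)
	{
		/* Expect 0xe8a45605 and 0xc5b9cd4c per the table above. */
		printf("x^128 mod G(x) = 0x%08x\n", xn_mod_g(128));
		printf("x^192 mod G(x) = 0x%08x\n", xn_mod_g(192));
		return 0;
	}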
diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
index afcdeee429664..326261e503b42 100644
--- a/arch/x86/lib/crc32-glue.c
+++ b/arch/x86/lib/crc32-glue.c
@@ -18,10 +18,11 @@
 
 static DEFINE_STATIC_KEY_FALSE(have_crc32);
 static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 
 DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
+DECLARE_CRC_PCLMUL_FUNCS(crc32_msb, u32);
 
 u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 {
 	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
 		   have_pclmulqdq, IS_ENABLED(CONFIG_CRC32_SLICEBY8));
@@ -69,10 +70,12 @@ u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len)
 }
 EXPORT_SYMBOL(crc32c_le_arch);
 
 u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
 {
+	CRC_PCLMUL(crc, p, len, crc32_msb, crc32_msb_0x04c11db7_consts,
+		   have_pclmulqdq, IS_ENABLED(CONFIG_CRC32_SLICEBY8));
 	return crc32_be_base(crc, p, len);
 }
 EXPORT_SYMBOL(crc32_be_arch);
 
 static int __init crc32_x86_init(void)
@@ -80,10 +83,11 @@ static int __init crc32_x86_init(void)
 	if (boot_cpu_has(X86_FEATURE_XMM4_2))
 		static_branch_enable(&have_crc32);
 	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
 		static_branch_enable(&have_pclmulqdq);
 		INIT_CRC_PCLMUL(crc32_lsb);
+		INIT_CRC_PCLMUL(crc32_msb);
 	}
 	return 0;
 }
 
 arch_initcall(crc32_x86_init);
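
(Aside, not part of the patch: CRC_PCLMUL is the dispatch macro from the
template added earlier in this series; its real definition is in the
crc-pclmul template header from patch 1/6. The sketch below is only a
plausible shape of what each call site does -- the length cutoff, the
helper name prefix##_pclmul, and the FPU-section details are
assumptions, not the actual macro.)

	/* Hypothetical sketch of a CRC_PCLMUL() call site: use the
	 * PCLMULQDQ helper when the buffer is long enough and the FPU
	 * is usable here, otherwise fall through to the generic code. */
	#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq,	\
			   skip_sliceby8)				\
	do {								\
		if ((len) >= 16 &&					\
		    static_branch_likely(&(have_pclmulqdq)) &&		\
		    irq_fpu_usable()) {					\
			kernel_fpu_begin();				\
			(crc) = prefix##_pclmul((crc), (p), (len),	\
						&(consts));		\
			kernel_fpu_end();				\
			return (crc);	/* skip the generic fallback */	\
		}							\
	} while (0)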
diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S
index cf07d571ae864..d562944211d4d 100644
--- a/arch/x86/lib/crc32-pclmul.S
+++ b/arch/x86/lib/crc32-pclmul.S
@@ -2,5 +2,6 @@
 // Copyright 2024 Google LLC
 
 #include "crc-pclmul-template.S"
 
 DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
+DEFINE_CRC_PCLMUL_FUNCS(crc32_msb, /* bits= */ 32, /* lsb= */ 0)
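
(Note: lsb=0 instantiates the MSB-first variant of the template. Since
x86 loads are little-endian, this is presumably why
crc32_msb_0x04c11db7_consts carries a bswap_mask: the data bytes must be
byte-reversed into most-significant-bit-first polynomial order before
folding.)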
--
2.47.0