lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20221012215931.3896-6-elliott@hpe.com>
Date:   Wed, 12 Oct 2022 16:59:17 -0500
From:   Robert Elliott <elliott@....com>
To:     herbert@...dor.apana.org.au, davem@...emloft.net,
        tim.c.chen@...ux.intel.com, ap420073@...il.com, ardb@...nel.org,
        linux-crypto@...r.kernel.org, linux-kernel@...r.kernel.org
Cc:     Robert Elliott <elliott@....com>
Subject: [PATCH v2 05/19] crypto: x86/crc - limit FPU preemption

As done by the ECB and CBC helpers in arch/x86/crypto/ecb_cbc_helpers.h,
limit the number of bytes processed between kernel_fpu_begin() and
kernel_fpu_end() calls (to FPU_BYTES, 4096 bytes per call pair).

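kernel_fpu_begin() calls preempt_disable() and kernel_fpu_end() calls
preempt_enable(), so the CPU core is unavailable for scheduling for the
entire time spent between them. Processing a large buffer in a single
pass can therefore lead to: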
    rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: ...

Fixes: 78c37d191dd6 ("crypto: crc32 - add crc32 pclmulqdq implementation and wrappers for table implementation")
Fixes: 6a8ce1ef3940 ("crypto: crc32c - Optimize CRC32C calculation with PCLMULQDQ instruction")
Fixes: 0b95a7f85718 ("crypto: crct10dif - Glue code to cast accelerated CRCT10DIF assembly as a crypto transform")
Suggested-by: Herbert Xu <herbert@...dor.apana.org.au>
Signed-off-by: Robert Elliott <elliott@....com>
---
 arch/x86/crypto/crc32-pclmul_asm.S      |  6 ++--
 arch/x86/crypto/crc32-pclmul_glue.c     | 19 ++++++++----
 arch/x86/crypto/crc32c-intel_glue.c     | 29 ++++++++++++++----
 arch/x86/crypto/crct10dif-pclmul_glue.c | 39 ++++++++++++++++++++-----
 4 files changed, 71 insertions(+), 22 deletions(-)

diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
index ca53e96996ac..9abd861636c3 100644
--- a/arch/x86/crypto/crc32-pclmul_asm.S
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -72,15 +72,15 @@
 .text
 /**
  *      Calculate crc32
- *      BUF - buffer (16 bytes aligned)
- *      LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63
+ *      BUF - buffer - must be 16 bytes aligned
+ *      LEN - sizeof buffer - must be multiple of 16 bytes and greater than 63
  *      CRC - initial crc32
  *      return %eax crc32
  *      uint crc32_pclmul_le_16(unsigned char const *buffer,
  *	                     size_t len, uint crc32)
  */
 
-SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
+SYM_FUNC_START(crc32_pclmul_le_16)
 	movdqa  (BUF), %xmm1
 	movdqa  0x10(BUF), %xmm2
 	movdqa  0x20(BUF), %xmm3
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 98cf3b4e4c9f..38539c6edfe5 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -46,6 +46,8 @@
 #define SCALE_F			16L	/* size of xmm register */
 #define SCALE_F_MASK		(SCALE_F - 1)
 
+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);
 
 static u32 __attribute__((pure))
@@ -70,12 +72,19 @@ static u32 __attribute__((pure))
 	iquotient = len & (~SCALE_F_MASK);
 	iremainder = len & SCALE_F_MASK;
 
-	kernel_fpu_begin();
-	crc = crc32_pclmul_le_16(p, iquotient, crc);
-	kernel_fpu_end();
+	do {
+		unsigned int chunk = min(iquotient, FPU_BYTES);
+
+		kernel_fpu_begin();
+		crc = crc32_pclmul_le_16(p, chunk, crc);
+		kernel_fpu_end();
+
+		iquotient -= chunk;
+		p += chunk;
+	} while (iquotient >= PCLMUL_MIN_LEN);
 
-	if (iremainder)
-		crc = crc32_le(crc, p + iquotient, iremainder);
+	if (iquotient || iremainder)
+		crc = crc32_le(crc, p, iquotient + iremainder);
 
 	return crc;
 }
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index feccb5254c7e..ece620227057 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -41,6 +41,8 @@
  */
 #define CRC32C_PCL_BREAKEVEN	512
 
+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
 				unsigned int crc_init);
 #endif /* CONFIG_X86_64 */
@@ -158,9 +160,16 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
 	 * overcome kernel fpu state save/restore overhead
 	 */
 	if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		*crcp = crc_pcl(data, len, *crcp);
-		kernel_fpu_end();
+		do {
+			unsigned int chunk = min(len, FPU_BYTES);
+
+			kernel_fpu_begin();
+			*crcp = crc_pcl(data, chunk, *crcp);
+			kernel_fpu_end();
+
+			len -= chunk;
+			data += chunk;
+		} while (len);
 	} else
 		*crcp = crc32c_intel_le_hw(*crcp, data, len);
 	return 0;
@@ -170,9 +179,17 @@ static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
 				u8 *out)
 {
 	if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
-		kernel_fpu_end();
+		do {
+			unsigned int chunk = min(len, FPU_BYTES);
+
+			kernel_fpu_begin();
+			*crcp = crc_pcl(data, chunk, *crcp);
+			kernel_fpu_end();
+
+			len -= chunk;
+			data += chunk;
+		} while (len);
+		*(__le32 *)out = ~cpu_to_le32(*crcp);
 	} else
 		*(__le32 *)out =
 			~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index 71291d5af9f4..54a537fc88ee 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -34,6 +34,10 @@
 #include <asm/cpu_device_id.h>
 #include <asm/simd.h>
 
+#define PCLMUL_MIN_LEN 16U /* minimum size of buffer for crc_t10dif_pcl */
+
+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
 
 struct chksum_desc_ctx {
@@ -54,10 +58,19 @@ static int chksum_update(struct shash_desc *desc, const u8 *data,
 {
 	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
 
-	if (length >= 16 && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
-		kernel_fpu_end();
+	if (length >= PCLMUL_MIN_LEN && crypto_simd_usable()) {
+		do {
+			unsigned int chunk = min(length, FPU_BYTES);
+
+			kernel_fpu_begin();
+			ctx->crc = crc_t10dif_pcl(ctx->crc, data, chunk);
+			kernel_fpu_end();
+
+			length -= chunk;
+			data += chunk;
+		} while (length >= PCLMUL_MIN_LEN);
+		if (length)
+			ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
 	} else
 		ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
 	return 0;
@@ -73,10 +86,20 @@ static int chksum_final(struct shash_desc *desc, u8 *out)
 
 static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
 {
-	if (len >= 16 && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		*(__u16 *)out = crc_t10dif_pcl(crc, data, len);
-		kernel_fpu_end();
+	if (len >= PCLMUL_MIN_LEN && crypto_simd_usable()) {
+		do {
+			unsigned int chunk = min(len, FPU_BYTES);
+
+			kernel_fpu_begin();
+			crc = crc_t10dif_pcl(crc, data, chunk);
+			kernel_fpu_end();
+
+			len -= chunk;
+			data += chunk;
+		} while (len >= PCLMUL_MIN_LEN);
+		if (len)
+			crc = crc_t10dif_generic(crc, data, len);
+		*(__u16 *)out = crc;
 	} else
 		*(__u16 *)out = crc_t10dif_generic(crc, data, len);
 	return 0;
-- 
2.37.3

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ