Message-Id: <20260121101923.64657-1-zhangchunyan@iscas.ac.cn>
Date: Wed, 21 Jan 2026 18:19:23 +0800
From: Chunyan Zhang <zhangchunyan@...as.ac.cn>
To: Paul Walmsley <paul.walmsley@...ive.com>,
	Palmer Dabbelt <palmer@...belt.com>,
	Albert Ou <aou@...s.berkeley.edu>,
	Alexandre Ghiti <alex@...ti.fr>,
	Herbert Xu <herbert@...dor.apana.org.au>,
	"David S . Miller" <davem@...emloft.net>
Cc: linux-riscv@...ts.infradead.org,
	linux-crypto@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	Chunyan Zhang <zhang.lyra@...il.com>
Subject: [PATCH] crypto: aegis128: Add RISC-V vector SIMD implementation

Add a RISC-V vector-accelerated implementation of aegis128 by
wiring it into the generic SIMD hooks.

This implementation supports VLEN values of 512, 256, and 128 bits.
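
For reference, the generic code in crypto/aegis128-core.c probes these
hooks once at registration time and dispatches per request roughly as
follows (simplified sketch, not part of this patch):

	static bool aegis128_do_simd(void)
	{
	#ifdef CONFIG_CRYPTO_AEGIS128_SIMD
		if (have_simd)	/* cached crypto_aegis128_have_simd() */
			return crypto_simd_usable();
	#endif
		return false;
	}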

Signed-off-by: Chunyan Zhang <zhangchunyan@...as.ac.cn>
---
 crypto/Kconfig              |   4 +-
 crypto/Makefile             |   4 +
 crypto/aegis-rvv.h          |  21 +
 crypto/aegis128-rvv-inner.c | 752 ++++++++++++++++++++++++++++++++++++
 crypto/aegis128-rvv.c       |  63 +++
 5 files changed, 842 insertions(+), 2 deletions(-)
 create mode 100644 crypto/aegis-rvv.h
 create mode 100644 crypto/aegis128-rvv-inner.c
 create mode 100644 crypto/aegis128-rvv.c

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 2e5b195b1b06..c4901610aac0 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -777,8 +777,8 @@ config CRYPTO_AEGIS128
 	  AEGIS-128 AEAD algorithm
 
 config CRYPTO_AEGIS128_SIMD
-	bool "AEGIS-128 (arm NEON, arm64 NEON)"
-	depends on CRYPTO_AEGIS128 && ((ARM || ARM64) && KERNEL_MODE_NEON)
+	bool "AEGIS-128 (arm NEON, arm64 NEON, RISC-V vector)"
+	depends on CRYPTO_AEGIS128 && (((ARM || ARM64) && KERNEL_MODE_NEON) || RISCV)
 	default y
 	help
 	  AEGIS-128 AEAD algorithm
diff --git a/crypto/Makefile b/crypto/Makefile
index 16a35649dd91..3d94cae9eeba 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -121,6 +121,10 @@ endif
 # Enable <arm_neon.h>
 CFLAGS_aegis128-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include)
 
+ifeq ($(ARCH),riscv)
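+# The RVV code is plain inline assembly (".option arch,+v"), so it needs
+# no extra include path, unlike the NEON rule above.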
+aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-rvv.o aegis128-rvv-inner.o
+endif
+
 obj-$(CONFIG_CRYPTO_PCRYPT) += pcrypt.o
 obj-$(CONFIG_CRYPTO_CRYPTD) += cryptd.o
 obj-$(CONFIG_CRYPTO_DES) += des_generic.o
diff --git a/crypto/aegis-rvv.h b/crypto/aegis-rvv.h
new file mode 100644
index 000000000000..2fae8c93b02a
--- /dev/null
+++ b/crypto/aegis-rvv.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2026 Institute of Software, CAS
+ */
+
+#ifndef _AEGIS_RVV_H
+#define _AEGIS_RVV_H
+
+extern const u8 crypto_aes_sbox[];
+
+void crypto_aegis128_init_rvv(void *state, const void *key, const void *iv);
+void crypto_aegis128_update_rvv(void *state, const void *msg);
+void crypto_aegis128_encrypt_chunk_rvv(void *state, void *dst, const void *src,
+					unsigned int size);
+void crypto_aegis128_decrypt_chunk_rvv(void *state, void *dst, const void *src,
+					unsigned int size);
+int crypto_aegis128_final_rvv(void *state, void *tag_xor,
+			       unsigned int assoclen,
+			       unsigned int cryptlen,
+			       unsigned int authsize);
+#endif
diff --git a/crypto/aegis128-rvv-inner.c b/crypto/aegis128-rvv-inner.c
new file mode 100644
index 000000000000..ee2f206eea22
--- /dev/null
+++ b/crypto/aegis128-rvv-inner.c
@@ -0,0 +1,752 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2026 Institute of Software, CAS
+ * Author: Chunyan Zhang <zhangchunyan@...as.ac.cn>
+ *
+ * Based on aegis128-neon-inner.c:
+ *	Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@...aro.org>
+ */
+
+#include <asm/vector.h>
+#include <linux/types.h>
+
+#include "aegis-rvv.h"
+#include "aegis.h"
+
+#define AEGIS128_STATE_BLOCKS 5
+#define RVV_VLEN	riscv_vector_vlen()
+
+typedef u8 aegis128_block_bytes[AEGIS_BLOCK_SIZE];
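+/*
+ * AEGIS-128 state: five 128-bit blocks. One update step XORs the AES
+ * round function of each block into its successor (wrapping around),
+ * and additionally folds the message block into blocks[0]; see
+ * aegis128_update_rvv() below.
+ */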
+struct aegis_state {
+	aegis128_block_bytes blocks[AEGIS128_STATE_BLOCKS];
+};
+
+/* Load all 256 S-box bytes with one load into the vector register group starting at r0 */
+#define preload_sbox_1(m, r0)  do {				\
+	unsigned long vl;					\
+	asm volatile (".option	push\n"				\
+		      ".option	arch,+v\n"			\
+		      "vsetvli	%0, x0, e8, "m", ta, ma\n"	\
+		      "vle8.v	"r0", (%1)\n"			\
+		      ".option	pop\n"				\
+		      : "=&r" (vl)				\
+		      :						\
+		      "r" (crypto_aes_sbox)			\
+	:);							\
+} while (0)
+
+/* Load the 256 S-box bytes with two loads, into the register groups starting at r0 and r1 */
+#define preload_sbox_2(m, r0, r1)  do {				\
+	unsigned long vl;					\
+	asm volatile (".option	push\n"				\
+		      ".option	arch,+v\n"			\
+		      "vsetvli	%0, x0, e8, "m", ta, ma\n"	\
+		      "vle8.v	"r0", (%1)\n"			\
+		      "vle8.v	"r1", (%2)\n"			\
+		      ".option	pop\n"				\
+		      : "=&r" (vl)				\
+		      :						\
+		      "r" (crypto_aes_sbox),			\
+		      "r" (crypto_aes_sbox + 0x80)		\
+	:);							\
+} while (0)
+
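+/*
+ * How much of the register file the 256 S-box bytes occupy depends on
+ * VLEN: at 512 one m4 group (4 regs x 64 bytes) holds the whole table,
+ * at 256 one m8 group (8 regs x 32 bytes) does, and at 128 an m8 group
+ * covers only half of it, so two loads (into v16 and v24) are needed.
+ */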
+/* v16 - v31: crypto_aes_sbox[0-255] */
+#define preload_sbox_128() preload_sbox_2("m8", "v16", "v24")
+
+/* v16 - v23: crypto_aes_sbox[0-255] */
+#define preload_sbox_256() preload_sbox_1("m8", "v16")
+
+/* v16 - v19: crypto_aes_sbox[0-255] */
+#define preload_sbox_512() preload_sbox_1("m4", "v16")
+
+static __always_inline
+void preload_round_data(void)
+{
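+	/*
+	 * The index tables below feed vrgather.vv and stand in for NEON
+	 * permutations: rev32qu16 swaps the 16-bit halves of each 32-bit
+	 * lane (vrev32q_u16), shift_rows is the AES ShiftRows byte
+	 * permutation, and ror32by8 rotates each 32-bit lane by one byte.
+	 * They stay resident in v13-v15 for aegis128_round().
+	 */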
+	static const u8 rev32qu16[] = {
+		0x2, 0x3, 0x0, 0x1, 0x6, 0x7, 0x4, 0x5,
+		0xa, 0xb, 0x8, 0x9, 0xe, 0xf, 0xc, 0xd,
+	};
+
+	static const u8 shift_rows[] = {
+		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+	};
+
+	static const u8 ror32by8[] = {
+		0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+		0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+	};
+
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v13, (%[rev32qu16])\n"
+		      "vle8.v	v14, (%[shift_rows])\n"
+		      "vle8.v	v15, (%[ror32by8])\n"
+		      ".option	pop\n"
+		      : :
+		      [rev32qu16]"r"(rev32qu16),
+		      [shift_rows]"r"(shift_rows),
+		      [ror32by8]"r"(ror32by8)
+	:);
+
+	switch (RVV_VLEN) {
+	case 128:
+		preload_sbox_128();
+		break;
+	case 256:
+		preload_sbox_256();
+		break;
+	case 512:
+		preload_sbox_512();
+		break;
+	default:
+		pr_err("ERROR: %d is not supported vector length!", RVV_VLEN);
+	}
+}
+
+
+#define AEGIS128_ROUND_PART1				\
+	".option	push\n"				\
+	".option	arch,+v\n"			\
+	"vsetivli	zero, 0x10, e8, m1, ta, ma\n"	\
+	/* s = vqtbl1q_u8(s, vld1q_u8(shift_rows)) */	\
+	"vle8.v		v0, (%[s])\n"			\
+	"vrgather.vv	v1, v0, v14\n" /* v14: shift_rows */
+
+#define AEGIS128_ROUND_PART3						\
+	/* s= (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b) */	\
+	"vsetivli	zero, 0x10, e8, m1, ta, ma\n"			\
+	"vsra.vi	v3, v2, 7\n" /* ((int8x16_t)v >> 7) */		\
+	"vand.vx	v3, v3, %[x1b]\n"				\
+	"vsll.vi	v4, v2, 1\n" /* (v << 1) */			\
+	"vxor.vv	v3, v4, v3\n"					\
+	/* s ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v) */		\
+	"vrgather.vv	v4, v2, v13\n" /* v13: rev32qu16 */		\
+	"vxor.vv	v3, v3, v4\n"					\
+	/* s ^= vqtbl1q_u8(v ^ s, vld1q_u8(ror32by8)); */		\
+	"vxor.vv	v4, v3, v2\n" /* v ^ s */			\
+	"vrgather.vv	v5, v4, v15\n" /* v15: ror32by8 */		\
+	"vxor.vv	v3, v3, v5\n"					\
+	"vle8.v		v4, (%[d])\n"					\
+	"vxor.vv	v3, v3, v4\n" /* dst ^= v3 */			\
+	"vse8.v		v3, (%[d])\n"					\
+	".option	pop\n"
+
+/*
+ * v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + step), s - step);
+ * r: vector register which stores sbox array
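+ *
+ * vrgather.vv yields 0 for any index >= VLMAX, so after the vsub.vx
+ * only the lanes whose original index falls inside this register's
+ * window select a byte; OR-ing the partial gathers accumulates the
+ * complete 256-entry S-box lookup across the register group.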
+ */
+#define gather_sbox(r)				\
+	"vsub.vx	v1, v1, %[step]\n"	\
+	"vrgather.vv	v3, "r", v1\n"		\
+	"vor.vv		v2, v2, v3\n"
+
+static __always_inline
+void aegis128_round_128(u8 *dst, const u8 *src)
+{
+	unsigned long vl;
+
+	/* v16 - v31: crypto_aes_sbox[0-255] */
+	asm volatile (AEGIS128_ROUND_PART1
+		      /* v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), s); */
+		      "vsetvli		%0, x0, e8, m1, ta, ma\n"
+		      "vrgather.vv	v2, v16, v1\n"
+		      /* v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x10), s - 0x10); */
+		      gather_sbox("v17")
+		      gather_sbox("v18")
+		      gather_sbox("v19")
+		      gather_sbox("v20")
+		      gather_sbox("v21")
+		      gather_sbox("v22")
+		      gather_sbox("v23")
+		      gather_sbox("v24")
+		      gather_sbox("v25")
+		      gather_sbox("v26")
+		      gather_sbox("v27")
+		      gather_sbox("v28")
+		      gather_sbox("v29")
+		      gather_sbox("v30")
+		      gather_sbox("v31")
+		      AEGIS128_ROUND_PART3
+		      : "=&r" (vl) :
+		      [s]"r"(src),
+		      [step]"r"(0x10),
+		      [x1b]"r"(0x1b),
+		      [d]"r"(dst)
+		      : "memory"
+	);
+}
+
+static __always_inline
+void aegis128_round_256(u8 *dst, const u8 *src)
+{
+	unsigned long vl;
+
+	/* v16 - v23: crypto_aes_sbox[0-255] */
+	asm volatile (AEGIS128_ROUND_PART1
+		      /* v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), s); */
+		      "vsetvli		%0, x0, e8, m1, ta, ma\n"
+		      "vrgather.vv	v2, v16, v1\n"
+		      /* v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x20), s - 0x20); */
+		      gather_sbox("v17")
+		      gather_sbox("v18")
+		      gather_sbox("v19")
+		      gather_sbox("v20")
+		      gather_sbox("v21")
+		      gather_sbox("v22")
+		      gather_sbox("v23")
+		      AEGIS128_ROUND_PART3
+		      : "=&r" (vl) :
+		      [s]"r"(src),
+		      [step]"r"(0x20),
+		      [x1b]"r"(0x1b),
+		      [d]"r"(dst)
+		      : "memory"
+	);
+}
+
+static __always_inline
+void aegis128_round_512(u8 *dst, const u8 *src)
+{
+	unsigned long vl;
+
+	/* v16 - v19: crypto_aes_sbox[0-255] */
+	asm volatile (AEGIS128_ROUND_PART1
+		      /* v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), s); */
+		      "vsetvli		%0, x0, e8, m1, ta, ma\n"
+		      "vrgather.vv	v2, v16, v1\n"
+		      /* v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), s - 0x40); */
+		      gather_sbox("v17")
+		      /* v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), s - 0x80); */
+		      gather_sbox("v18")
+		      /* v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), s - 0xc0); */
+		      gather_sbox("v19")
+		      AEGIS128_ROUND_PART3
+		      : "=&r" (vl) :
+		      [s]"r"(src),
+		      [step]"r"(0x40),
+		      [x1b]"r"(0x1b),
+		      [d]"r"(dst)
+		      : "memory"
+	);
+}
+
+static __always_inline
+void aegis128_round(u8 *dst, const u8 *src)
+{
+	switch (RVV_VLEN) {
+	case 128:
+		aegis128_round_128(dst, src);
+		break;
+	case 256:
+		aegis128_round_256(dst, src);
+		break;
+	case 512:
+		aegis128_round_512(dst, src);
+		break;
+	default:
+		pr_err("ERROR: %d is not supported vector length!", RVV_VLEN);
+	}
+}
+
+static __always_inline
+void aegis128_update_rvv(struct aegis_state *state, const void *key)
+{
+	u8 k[AEGIS_BLOCK_SIZE];
+
+	/* copy the 16-byte key into the local buffer k */
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v1, (%[key])\n"
+		      "vse8.v	v1, (%[k])\n"
+		      ".option	pop\n"
+		      : :
+		      [key]"r"(key),
+		      [k]"r"(k)
+	:);
+
+	aegis128_round(k, state->blocks[4]);
+	aegis128_round(state->blocks[4], state->blocks[3]);
+	aegis128_round(state->blocks[3], state->blocks[2]);
+	aegis128_round(state->blocks[2], state->blocks[1]);
+	aegis128_round(state->blocks[1], state->blocks[0]);
+
+	/* state->blocks[0] ^= key */
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v1, (%[k])\n"
+		      "vle8.v	v2, (%[block0])\n"
+		      "vxor.vv	v2, v2, v1\n"
+		      "vse8.v	v2, (%[block0])\n"
+		      ".option	pop\n"
+		      : :
+		      [k]"r"(k),
+		      [block0]"r"(state->blocks[0])
+	:);
+}
+
+void crypto_aegis128_init_rvv(void *state, const void *key, const void *iv)
+{
+	struct aegis_state *st = state;
+	u8 kiv[AEGIS_BLOCK_SIZE];
+
+	static const u8 const0[] = {
+		0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d,
+		0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62,
+	};
+	static const u8 const1[] = {
+		0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1,
+		0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd,
+	};
+
+	/*
+	 * kiv = key^iv
+	 * struct aegis128_state st = {{
+		kiv,
+		vld1q_u8(const1),
+		vld1q_u8(const0),
+		key ^ vld1q_u8(const0),
+		key ^ vld1q_u8(const1),
+	   }};
+	 */
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v0, (%[const0])\n"
+		      "vle8.v	v1, (%[const1])\n"
+		      "vse8.v	v0, (%[block2])\n"
+		      "vse8.v	v1, (%[block1])\n"
+		      "vle8.v	v2, (%[iv])\n"
+		      "vle8.v	v3, (%[key])\n"
+		      "vxor.vv	v0, v0, v3\n"
+		      "vxor.vv	v1, v1, v3\n"
+		      "vxor.vv	v2, v2, v3\n"
+		      "vse8.v	v2, (%[block0])\n"
+		      "vse8.v	v2, (%[kiv])\n"
+		      "vse8.v	v0, (%[block3])\n"
+		      "vse8.v	v1, (%[block4])\n"
+		      ".option	pop\n"
+		      : :
+		      [const0]"r"(const0),
+		      [const1]"r"(const1),
+		      [block1]"r"(st->blocks[1]),
+		      [block2]"r"(st->blocks[2]),
+		      [iv]"r"(iv),
+		      [key]"r"(key),
+		      [block0]"r"(st->blocks[0]),
+		      [kiv]"r"(kiv),
+		      [block3]"r"(st->blocks[3]),
+		      [block4]"r"(st->blocks[4])
+	:);
+
+	preload_round_data();
+
+	for (int i = 0; i < 5; i++) {
+		aegis128_update_rvv(st, key);
+		aegis128_update_rvv(st, kiv);
+	}
+}
+
+void crypto_aegis128_update_rvv(void *state, const void *msg)
+{
+	struct aegis_state *st = state;
+
+	preload_round_data();
+
+	aegis128_update_rvv(st, msg);
+}
+
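+/*
+ * Sliding index window for partial blocks: gathering with
+ * 'permute + 32 - size' maps the size tail bytes to lanes 0..size-1 and
+ * zero-fills the rest (0xff indices are out of range for vrgather),
+ * left-aligning a short block; 'permute + size' undoes that shift on
+ * output, and 'permute + authsize' masks the tag comparison to
+ * authsize bytes.
+ */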
+static const u8 permute[] __aligned(64) = {
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+void crypto_aegis128_encrypt_chunk_rvv(void *state, void *dst, const void *src,
+					unsigned int size)
+{
+	struct aegis_state *st = state;
+	const int short_input = size < AEGIS_BLOCK_SIZE;
+	u8 s[AEGIS_BLOCK_SIZE];
+	u8 msg[AEGIS_BLOCK_SIZE];
+
+	preload_round_data();
+
+	while (size >= AEGIS_BLOCK_SIZE) {
+		/* s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; */
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v	v1, (%[block1])\n"
+			      "vle8.v	v2, (%[block2])\n"
+			      "vle8.v	v3, (%[block3])\n"
+			      "vle8.v	v4, (%[block4])\n"
+			      "vxor.vv	v1, v1, v4\n"
+			      "vand.vv	v2, v2, v3\n"
+			      "vxor.vv	v1, v1, v2\n"
+			      "vse8.v	v1, (%[s])\n"
+			      ".option	pop\n"
+			      : :
+			      [block1]"r"(st->blocks[1]),
+			      [block2]"r"(st->blocks[2]),
+			      [block3]"r"(st->blocks[3]),
+			      [block4]"r"(st->blocks[4]),
+			      [s]"r"(s)
+		:);
+
+		aegis128_update_rvv(st, src);
+		/* dst = s ^ src */
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vle8.v	v1, (%[s])\n"
+			      "vle8.v	v2, (%[src])\n"
+			      "vxor.vv	v1, v1, v2\n"
+			      "vse8.v	v1, (%[dst])\n"
+			      "vse8.v	v1, (%[msg])\n"
+			      ".option	pop\n"
+			      : :
+			      [s]"r"(s),
+			      [src]"r"(src),
+			      [dst]"r"(dst),
+			      [msg]"r"(msg)
+		:);
+
+		size -= AEGIS_BLOCK_SIZE;
+		src += AEGIS_BLOCK_SIZE;
+		dst += AEGIS_BLOCK_SIZE;
+	}
+
+	if (size > 0) {
+		u8 buf[AEGIS_BLOCK_SIZE];
+		const void *in = src;
+		void *out = dst;
+		u8 m[AEGIS_BLOCK_SIZE];
+
+		/* s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; */
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v	v1, (%[block1])\n"
+			      "vle8.v	v2, (%[block2])\n"
+			      "vle8.v	v3, (%[block3])\n"
+			      "vle8.v	v4, (%[block4])\n"
+			      "vxor.vv	v1, v1, v4\n" /* st.v[1] ^ st.v[4] */
+			      "vand.vv	v2, v2, v3\n" /* st.v[2] & st.v[3] */
+			      "vxor.vv	v1, v1, v2\n"
+			      "vse8.v	v1, (%[s])\n"
+			      ".option	pop\n"
+			      : :
+			      [block1]"r"(st->blocks[1]),
+			      [block2]"r"(st->blocks[2]),
+			      [block3]"r"(st->blocks[3]),
+			      [block4]"r"(st->blocks[4]),
+			      [s]"r"(s)
+		:);
+
+		if (__builtin_expect(short_input, 0))
+			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
+
+		/*
+		 * m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
+		 *		  vld1q_u8(permute + 32 - size));
+		 */
+		asm volatile (".option		push\n"
+			      ".option		arch,+v\n"
+			      "vle8.v		v1, (%[in])\n"
+			      "vle8.v		v2, (%[p])\n"
+			      "vrgather.vv	v3, v1, v2\n"
+			      "vse8.v		v3, (%[m])\n"
+			      ".option		pop\n"
+			      : :
+			      [in]"r"(in + size - AEGIS_BLOCK_SIZE),
+			      [p]"r"(permute + 32 - size),
+			      [m]"r"(m)
+		:);
+
+		aegis128_update_rvv(st, m);
+
+		/*
+		 * vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
+		 *			vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));
+		 */
+		asm volatile (".option		push\n"
+			      ".option		arch,+v\n"
+			      "vsetivli		zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v		v1, (%[m])\n"
+			      "vle8.v		v2, (%[s])\n"
+			      "vxor.vv		v1, v1, v2\n"
+			      "vle8.v		v2, (%[p])\n"
+			      "vrgather.vv	v3, v1, v2\n"
+			      "vse8.v		v3, (%[out])\n"
+			      ".option		pop\n"
+			      : :
+			      [m]"r"(m),
+			      [s]"r"(s),
+			      [p]"r"(permute + size),
+			      [out]"r"(out + size - AEGIS_BLOCK_SIZE)
+		:);
+
+		if (__builtin_expect(short_input, 0)) {
+			memcpy(dst, out, size);
+		} else {
+			/* vst1q_u8(out - AEGIS_BLOCK_SIZE, msg); */
+			asm volatile (".option	push\n"
+				      ".option	arch,+v\n"
+				      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+				      "vle8.v	v1, (%[msg])\n"
+				      "vse8.v	v1, (%[out])\n"
+				      ".option	pop\n"
+				      : :
+				      [msg]"r"(msg),
+				      [out]"r"(out - AEGIS_BLOCK_SIZE)
+			:);
+		}
+	}
+}
+
+void crypto_aegis128_decrypt_chunk_rvv(void *state, void *dst, const void *src,
+					unsigned int size)
+{
+	struct aegis_state *st = state;
+	const int short_input = size < AEGIS_BLOCK_SIZE;
+	u8 s[AEGIS_BLOCK_SIZE];
+	u8 msg[AEGIS_BLOCK_SIZE];
+
+	preload_round_data();
+
+	while (size >= AEGIS_BLOCK_SIZE) {
+		/* s = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; */
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v	v1, (%[block1])\n"
+			      "vle8.v	v2, (%[block2])\n"
+			      "vle8.v	v3, (%[block3])\n"
+			      "vle8.v	v4, (%[block4])\n"
+			      "vle8.v	v5, (%[src])\n"
+			      "vxor.vv	v1, v1, v4\n"
+			      "vand.vv	v2, v2, v3\n"
+			      "vxor.vv	v1, v1, v2\n"
+			      "vxor.vv	v1, v1, v5\n"
+			      "vse8.v	v1, (%[msg])\n"
+			      ".option	pop\n"
+			      : :
+			      [block1]"r"(st->blocks[1]),
+			      [block2]"r"(st->blocks[2]),
+			      [block3]"r"(st->blocks[3]),
+			      [block4]"r"(st->blocks[4]),
+			      [src]"r"(src),
+			      [msg]"r"(msg)
+		:);
+
+		aegis128_update_rvv(st, msg);
+		/* dst = s */
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vle8.v	v1, (%[msg])\n"
+			      "vse8.v	v1, (%[dst])\n"
+			      ".option	pop\n"
+			      : :
+			      [msg]"r"(msg),
+			      [dst]"r"(dst)
+		:);
+
+		size -= AEGIS_BLOCK_SIZE;
+		src += AEGIS_BLOCK_SIZE;
+		dst += AEGIS_BLOCK_SIZE;
+	}
+
+	if (size > 0) {
+		u8 buf[AEGIS_BLOCK_SIZE];
+		const void *in = src;
+		void *out = dst;
+		u8 m[AEGIS_BLOCK_SIZE];
+
+		/* s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; */
+		asm volatile (".option	push\n"
+			      ".option	arch,+v\n"
+			      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v	v1, (%[block1])\n"
+			      "vle8.v	v2, (%[block2])\n"
+			      "vle8.v	v3, (%[block3])\n"
+			      "vle8.v	v4, (%[block4])\n"
+			      "vxor.vv	v1, v1, v4\n" /* st.v[1] ^ st.v[4] */
+			      "vand.vv	v2, v2, v3\n" /* st.v[2] & st.v[3] */
+			      "vxor.vv	v1, v1, v2\n"
+			      "vse8.v	v1, (%[s])\n"
+			      ".option	pop\n"
+			      : :
+			      [block1]"r"(st->blocks[1]),
+			      [block2]"r"(st->blocks[2]),
+			      [block3]"r"(st->blocks[3]),
+			      [block4]"r"(st->blocks[4]),
+			      [s]"r"(s)
+		:);
+
+		if (__builtin_expect(short_input, 0))
+			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
+
+		/*
+		 * m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
+		 *		      vld1q_u8(permute + 32 - size));
+		 */
+		asm volatile (".option		push\n"
+			      ".option		arch,+v\n"
+			      "vle8.v		v1, (%[in])\n"
+			      "vle8.v		v2, (%[p])\n"
+			      "vrgather.vv	v3, v1, v2\n"
+			      "vle8.v		v4, (%[s])\n"
+			      "vmsltu.vx	v0, v2, %[x10]\n" /* set if less then 0x10 */
+			      "vmerge.vvm	v3, v4, v3, v0\n"
+			      "vxor.vv		v3, v4, v3\n"
+			      "vse8.v		v3, (%[m])\n"
+			      ".option		pop\n"
+			      : :
+			      [in]"r"(in + size - AEGIS_BLOCK_SIZE),
+			      [p]"r"(permute + 32 - size),
+			      [s]"r"(s),
+			      [x10]"r"(0x10),
+			      [m]"r"(m)
+		:);
+
+		aegis128_update_rvv(st, m);
+
+		/*
+		 * vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
+		 *	    vqtbl1q_u8(m, vld1q_u8(permute + size)));
+		 */
+		asm volatile (".option		push\n"
+			      ".option		arch,+v\n"
+			      "vsetivli		zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v		v1, (%[m])\n"
+			      "vle8.v		v2, (%[p])\n"
+			      "vrgather.vv	v3, v1, v2\n"
+			      "vse8.v		v3, (%[out])\n"
+			      ".option		pop\n"
+			      : :
+			      [m]"r"(m),
+			      [p]"r"(permute + size),
+			      [out]"r"(out + size - AEGIS_BLOCK_SIZE)
+		:);
+
+		if (__builtin_expect(short_input, 0)) {
+			memcpy(dst, out, size);
+		} else {
+			/* vst1q_u8(out - AEGIS_BLOCK_SIZE, msg); */
+			asm volatile (".option	push\n"
+				      ".option	arch,+v\n"
+				      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+				      "vle8.v	v1, (%[msg])\n"
+				      "vse8.v	v1, (%[out])\n"
+				      ".option	pop\n"
+				      : :
+				      [msg]"r"(msg),
+				      [out]"r"(out - AEGIS_BLOCK_SIZE)
+			:);
+		}
+	}
+}
+
+int crypto_aegis128_final_rvv(void *state, void *tag_xor,
+			       unsigned int assoclen,
+			       unsigned int cryptlen,
+			       unsigned int authsize)
+{
+	struct aegis_state *st = state;
+	u64 v[2];
+	int i;
+
+	preload_round_data();
+
+	/*
+	 * v = st.v[3] ^ (uint8x16_t)vcombine_u64(vmov_n_u64(8ULL * assoclen),
+	 *					   vmov_n_u64(8ULL * cryptlen));
+	 */
+	v[0] = 8ULL * assoclen;
+	v[1] = 8ULL * cryptlen;
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v0, (%[v])\n"
+		      "vle8.v	v1, (%[block3])\n"
+		      "vxor.vv	v0, v0, v1\n"
+		      "vse8.v	v0, (%[v])\n"
+		      ".option	pop\n"
+		      : :
+		      [v]"r"(v),
+		      [block3]"r"(st->blocks[3])
+	:);
+
+	for (i = 0; i < 7; i++)
+		aegis128_update_rvv(st, v);
+
+	/* v = st.v[0] ^ st.v[1] ^ st.v[2] ^ st.v[3] ^ st.v[4]; */
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v0, (%[block0])\n"
+		      "vle8.v	v1, (%[block1])\n"
+		      "vle8.v	v2, (%[block2])\n"
+		      "vle8.v	v3, (%[block3])\n"
+		      "vle8.v	v4, (%[block4])\n"
+		      "vxor.vv	v0, v0, v1\n"
+		      "vxor.vv	v2, v2, v3\n"
+		      "vxor.vv	v0, v0, v2\n"
+		      "vxor.vv	v0, v0, v4\n"
+		      "vse8.v	v0, (%[v])\n"
+		      ".option	pop\n"
+		      : :
+		      [block0]"r"(st->blocks[0]),
+		      [block1]"r"(st->blocks[1]),
+		      [block2]"r"(st->blocks[2]),
+		      [block3]"r"(st->blocks[3]),
+		      [block4]"r"(st->blocks[4]),
+		      [v]"r"(v)
+	:);
+
+	if (authsize > 0) {
+		/*
+		 * v = vqtbl1q_u8(~vceqq_u8(v, vld1q_u8(tag_xor)),
+		 *			    vld1q_u8(permute + authsize));
+		 */
+		asm volatile (".option		push\n"
+			      ".option		arch,+v\n"
+			      "vsetivli		zero, 0x10, e8, m1, ta, ma\n"
+			      "vle8.v		v0, (%[v])\n"
+			      "vle8.v		v1, (%[tag_xor])\n"
+			      "vmseq.vv		v0, v0, v1\n" /* vceqq_u8(v, vld1q_u8(tag_xor) */
+			      "vmv.v.i		v1, 0\n" /* set v1 = 0 */
+			      "vmerge.vxm	v1, v1, %[xff], v0\n"
+			      "vxor.vi		v1, v1, -1\n" /* vnot.v v0, v0 */
+			      "vle8.v		v0, (%[pa])\n"
+			      "vrgather.vv	v2, v1, v0\n"
+			      "vredmin.vs	v2, v2, v2\n" /* vminvq_s8((int8x16_t)v) */
+			      "vse8.v		v2, (%[v])\n"
+			      ".option		pop\n"
+			      : :
+			      [v]"r"(v),
+			      [tag_xor]"r"(tag_xor),
+			      [xff]"r"(0xff),
+			      [pa]"r"(permute + authsize)
+		:);
+
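+		/*
+		 * After the vredmin.vs reduction, element 0 of v2 is 0 when
+		 * all authsize tag bytes matched and -1 otherwise, so any
+		 * nonzero return value signals an authentication failure to
+		 * the generic wrapper.
+		 */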
+		return *((s8 *)v);
+	}
+
+	asm volatile (".option	push\n"
+		      ".option	arch,+v\n"
+		      "vsetivli	zero, 0x10, e8, m1, ta, ma\n"
+		      "vle8.v	v0, (%[v])\n"
+		      "vse8.v	v0, (%[tag_xor])\n"
+		      ".option	pop\n"
+		      : :
+		      [v]"r"(v),
+		      [tag_xor]"r"(tag_xor)
+	:);
+
+	return 0;
+}
diff --git a/crypto/aegis128-rvv.c b/crypto/aegis128-rvv.c
new file mode 100644
index 000000000000..ce0536121ec3
--- /dev/null
+++ b/crypto/aegis128-rvv.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2026 Institute of Software, CAS
+ * Author: Chunyan Zhang <zhangchunyan@...as.ac.cn>
+ */
+
+#include <asm/vector.h>
+
+#include "aegis.h"
+#include "aegis-rvv.h"
+
+bool crypto_aegis128_have_simd(void)
+{
+	/* require runtime vector support, not just the Kconfig option */
+	return IS_ENABLED(CONFIG_RISCV_ISA_V) && has_vector();
+}
+
+void crypto_aegis128_init_simd(struct aegis_state *state,
+			       const union aegis_block *key,
+			       const u8 *iv)
+{
+	kernel_vector_begin();
+	crypto_aegis128_init_rvv(state, key, iv);
+	kernel_vector_end();
+}
+
+void crypto_aegis128_update_simd(struct aegis_state *state, const void *msg)
+{
+	kernel_vector_begin();
+	crypto_aegis128_update_rvv(state, msg);
+	kernel_vector_end();
+}
+
+void crypto_aegis128_encrypt_chunk_simd(struct aegis_state *state, u8 *dst,
+					const u8 *src, unsigned int size)
+{
+	kernel_vector_begin();
+	crypto_aegis128_encrypt_chunk_rvv(state, dst, src, size);
+	kernel_vector_end();
+}
+
+void crypto_aegis128_decrypt_chunk_simd(struct aegis_state *state, u8 *dst,
+					const u8 *src, unsigned int size)
+{
+	kernel_vector_begin();
+	crypto_aegis128_decrypt_chunk_rvv(state, dst, src, size);
+	kernel_vector_end();
+}
+
+int crypto_aegis128_final_simd(struct aegis_state *state,
+			       union aegis_block *tag_xor,
+			       unsigned int assoclen,
+			       unsigned int cryptlen,
+			       unsigned int authsize)
+{
+	int ret;
+
+	kernel_vector_begin();
+	ret = crypto_aegis128_final_rvv(state, tag_xor, assoclen, cryptlen,
+					 authsize);
+	kernel_vector_end();
+
+	return ret;
+}
-- 
2.34.1

