linux-kernel - [POC 11/12] x86-64: implement _rai_bucket_shift

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20181017223332.11964-11-linux@rasmusvillemoes.dk>
Date:   Thu, 18 Oct 2018 00:33:31 +0200
From:   Rasmus Villemoes <linux@...musvillemoes.dk>
To:     linux-kernel@...r.kernel.org
Cc:     x86@...nel.org, "H . Peter Anvin" <hpa@...or.com>,
        Ingo Molnar <mingo@...nel.org>,
        "Kirill A . Shutemov" <kirill.shutemov@...ux.intel.com>,
        Rasmus Villemoes <linux@...musvillemoes.dk>
Subject: [POC 11/12] x86-64: implement _rai_bucket_shift

The only slightly tricky issue is that for implementing the thunk, we
need some temporary registers (with %ecx being one of them, since a
variable shr takes its count in %cl), and we don't know whether the
hash input and/or destination register collide with whichever we
choose. One _could_ attempt text parsing in asm in order to find a safe
set of temps, but whichever temps were chosen would need to be saved
and restored anyway.

So instead, just pick %rdx and %rcx (used as %edx and %ecx), and start
by pushing them on the stack. Then compute the result we need, push that
to the stack, restore %rdx and %rcx from their saved slots, and finally
pop the result into the destination register (which may be %rdx or %rcx
or any other) and adjust the stack pointer to drop the two saved slots.

The patched code does need to do a shr, so I don't think there's a way
around the cc clobber.

Signed-off-by: Rasmus Villemoes <linux@...musvillemoes.dk>
---
 arch/x86/include/asm/rai.S | 59 ++++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/rai.h | 21 +++++++++++++-
 arch/x86/kernel/rai.c      | 13 +++++++++
 3 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/rai.S b/arch/x86/include/asm/rai.S
index f42cdd8db876..144697e146b6 100644
--- a/arch/x86/include/asm/rai.S
+++ b/arch/x86/include/asm/rai.S
@@ -54,5 +54,64 @@
 	.popsection
 .endm /* rai_load */
 
+	/*
+	 * For convenience, and because it should not cause that much
+	 * worse code gen, we tie the hash to an output register, to
+	 * avoid it being given in the same register where we must
+	 * place the actual output. Since the hash output is unused,
+	 * gcc is free to pick that register for anything immediately
+	 * afterwards.
+	 */
+.macro rai_bucket_shift dst, hash, hashq, base, shift
+	.pushsection .rai_templ, "aw"
+10:	movabs $0x1234567812345678, \dst
+	/*
+	 * The movabs immediate above and the $6 below are placeholders
+	 * (see RAI_BUCKET_SHIFT_8_4_4 in rai_patch_one()). After the shr,
+	 * the hash register contains the shifted hash value. But I don't
+	 * think there's a way to inform gcc about that, and I don't know
+	 * how useful it would be anyway. So the thunk below does not
+	 * guarantee the same property, though it would be doable.
+	 */
+	shr $6, \hash
+	lea (\dst, \hashq, 8), \dst
+11:
+	.popsection

+	.pushsection .text.rai_thunk, "ax"
+20:	/* Thunk, reached via the jmp at 30: below. dst and hash are registers, we can clobber hash */
+	push %rdx
+	push %rcx
+	mov \hash, %edx
+	mov \shift(%rip), %ecx
+	shr %cl,%edx
+	/* move the shifted value back into \hash, so the lea below works regardless of whether \dst is %rdx or not */
+	mov %edx, \hash
+	mov \base(%rip), \dst
+	lea (\dst, \hashq, 8), \dst
+	/* We have our final value; park it on the stack while the temporaries are restored. */
+	push \dst
+	/* Now reload %rcx and %rdx from their saved slots, then pop the result into \dst and drop the two saved slots */
+	mov 0x8(%rsp), %rcx
+	mov 0x10(%rsp), %rdx
+	pop \dst
+	add $0x10, %rsp
+	jmp 32f
+21:
+	.popsection
+	/* The part that goes into .text: a jmp to the thunk, padded with 0x90 when the template is longer than the jmp */
+30:	jmp 20b
+31:	.skip -(((11b - 10b)-(31b - 30b)) > 0)*((11b - 10b)-(31b - 30b)), 0x90
+32:

+	.pushsection .rai_data, "a"
+40:
+	rai_entry RAI_BUCKET_SHIFT_8_4_4 30b 32b 10b 11b 20b
+	.quad \base   /* .bucket_shift.base_addr */
+	.quad \shift  /* .bucket_shift.shift_addr */
+41:
+	rai_entry_pad 40b 41b
+	.popsection
+.endm /* rai_bucket_shift */
+
 
 #endif
diff --git a/arch/x86/include/asm/rai.h b/arch/x86/include/asm/rai.h
index b57494c98d0f..c9726d1e40ed 100644
--- a/arch/x86/include/asm/rai.h
+++ b/arch/x86/include/asm/rai.h
@@ -3,8 +3,9 @@
 
 #define RAI_LOAD_4 0
 #define RAI_LOAD_8 1
+#define RAI_BUCKET_SHIFT_8_4_4 2
 
-#define STRUCT_RAI_ENTRY_SIZE 32
+#define STRUCT_RAI_ENTRY_SIZE 40
 
 /* Put the asm macros in a separate file for easier editing. */
 #include <asm/rai.S>
@@ -23,6 +24,10 @@ struct rai_entry {
 		struct {
 			void *addr;
 		} load;
+		struct {
+			void *base_addr;
+			void *shift_addr;
+		} bucket_shift;
 	};
 };
 _Static_assert(sizeof(struct rai_entry) == STRUCT_RAI_ENTRY_SIZE,
@@ -48,6 +53,20 @@ _Static_assert(sizeof(struct rai_entry) == STRUCT_RAI_ENTRY_SIZE,
 		ret__;							\
 	})
 
+#define _rai_bucket_shift(base, shift, hash) ({				\
+		typeof(base) ret__;					\
+		typeof(hash) unused__;	/* dummy output; shares hash's register */ \
+		if (sizeof(*(base)) == 8 && sizeof(shift) == 4		\
+		    && sizeof(hash) == 4) /* sizes the asm macro handles */ \
+			asm("rai_bucket_shift %0 %1 %q1 %c3 %c4"	\
+			    : "=r" (ret__), "=r" (unused__)		\
+			    : "1" (hash), "i" (&(base)), "i" (&(shift))	\
+			    : "cc"); /* the shr modifies the flags */	\
+		else							\
+			ret__ = _rai_bucket_shift_fallback(base, shift, hash); \
+		ret__;							\
+	})
+
 #endif /* !__ASSEMBLY */
 
 #endif /* _ASM_X86_RAI_H */
diff --git a/arch/x86/kernel/rai.c b/arch/x86/kernel/rai.c
index c4284ce7478f..3aa2e3b2c31b 100644
--- a/arch/x86/kernel/rai.c
+++ b/arch/x86/kernel/rai.c
@@ -32,6 +32,19 @@ rai_patch_one(const struct rai_entry *r)
 		memcpy(templ + r->templ_len - sizeof(*imm), imm, sizeof(*imm));
 		break;
 	}
+	case RAI_BUCKET_SHIFT_8_4_4: {
+		const u32 *shiftp = r->bucket_shift.shift_addr;
+		const u64 *basep = r->bucket_shift.base_addr;
+		/*
+		 * This should be made more robust. For now, assume we
+		 * have a 10-byte movabs followed by a 3-byte shr. And
+		 * while *shiftp is 4 bytes wide, we just need the
+		 * LSB.
+		 */
+		memcpy(templ + 2, basep, sizeof(*basep));
+		memcpy(templ + 12, shiftp, 1);
+		break;
+	}
 	default:
 		WARN_ONCE(1, "unhandled RAI type %d\n", r->type);
 		return;
-- 
2.19.1.6.gbde171bbf5