#define SIMD 4 #define P 4 #define ROUNDS 4 #define SBITS 8 #define SN 3 /* 2, 3, 4 */ #define OR 0 /* 0, 1, 3 */ //#define FEISTEL #define MIX_IN #define MIX_OUT //#define ROUNDKEY //#define SLIDING_S #define TEST_N 1024 #define TEST_R 8 #define TEST_SKIP_OUT 3 //#define TEST_STRESS #define S1SIZE (1 << SBITS) #define SMASK (S1SIZE - 1) #define SNSIZE (SN * S1SIZE) #include #include #include #if defined(MIX_IN) || defined(MIX_OUT) /* Actual implementation would use a few rounds of Salsa or ChaCha */ #include #endif #include /* for analyze() */ #include /* for main() */ static void blockmix(const uint32_t * S, const uint32_t * Bin, uint32_t * Bout, size_t r) { #ifdef FEISTEL uint32_t X[P][2][SIMD]; #else uint32_t X[P][SIMD]; #endif r *= 128; /* so that r is input in same units as scrypt's */ assert(r % sizeof(X) == 0); r /= sizeof(X); assert(r > 0); assert(SBITS * SN <= 32); memcpy(X, &Bin[(r - 1) * (sizeof(X) / sizeof(uint32_t))], sizeof(X)); #ifdef MIX_IN { unsigned char *p = (unsigned char *)X; do { uint32_t n; MD5_CTX ctx; n = sizeof(X) - (p - (unsigned char *)X); if (n > MD5_DIGEST_LENGTH) n = MD5_DIGEST_LENGTH; MD5_Init(&ctx); if (p > (unsigned char *)X) MD5_Update(&ctx, p - MD5_DIGEST_LENGTH, MD5_DIGEST_LENGTH); MD5_Update(&ctx, p, n); MD5_Final(p, &ctx); p += n; } while (p < (unsigned char *)X + sizeof(X)); } #endif do { uint32_t i, j, round; for (i = 0; i < P; i++) { for (j = 0; j < SIMD; j++) { #ifdef TEST_STRESS #ifdef FEISTEL X[i][0][j] ^= *Bin++; X[i][1][j] ^= *Bin++; #else X[i][j] ^= *Bin++; #endif #else #ifdef FEISTEL X[i][0][j] += *Bin++; X[i][1][j] += *Bin++; #else X[i][j] += *Bin++; #endif #endif } } for (round = 0; round < ROUNDS; round++) { for (i = 0; i < P; i++) { #ifdef FEISTEL uint32_t half = X[i][round & 1][0]; #else uint32_t half = X[i][0]; #endif uint32_t k; const uint32_t *sk, *sp[SN]; sk = S; for (k = 0; k < SN; k++) { sp[k] = &sk[(half & SMASK) * SIMD]; sk += S1SIZE * SIMD; half >>= SBITS; } for (j = 0; j < SIMD; j++) { #ifdef FEISTEL #define otherhalf [~round & 1] #else #define otherhalf #endif #ifdef ROUNDKEY /* "Round key" or actually just id */ X[i]otherhalf[j] ^= j | (round << 16); #endif #if 1 && SN == 2 /* FMA */ X[i]otherhalf[j] += sp[0][j] * (sp[1][j] | OR); #elif SN == 3 /* classic Feistel (with FMA) */ X[i]otherhalf[j] ^= sp[0][j] * (sp[1][j] | OR) + sp[2][j]; #elif SN == 4 /* classic Feistel (Blowfish-like) */ X[i]otherhalf[j] ^= ((sp[0][j] + sp[1][j]) ^ sp[2][j]) + sp[3][j]; #elif SN == 2 /* classic Feistel */ X[i]otherhalf[j] ^= sp[0][j] + sp[1][j]; #else #error Unsupported S-box count (SN) #endif #undef otherhalf } } } #ifdef MIX_OUT if (r == 1) { /* for last block only */ unsigned char *p = (unsigned char *)X; do { uint32_t n; MD5_CTX ctx; n = sizeof(X) - (p - (unsigned char *)X); if (n > MD5_DIGEST_LENGTH) n = MD5_DIGEST_LENGTH; MD5_Init(&ctx); if (p > (unsigned char *)X) MD5_Update(&ctx, p - MD5_DIGEST_LENGTH, MD5_DIGEST_LENGTH); MD5_Update(&ctx, p, n); MD5_Final(p, &ctx); p += n; } while (p < (unsigned char *)X + sizeof(X)); } #endif for (i = 0; i < P; i++) { for (j = 0; j < SIMD; j++) { #ifdef FEISTEL *Bout++ = X[i][0][j]; *Bout++ = X[i][1][j]; #else *Bout++ = X[i][j]; #endif } } #ifdef SLIDING_S S += P * SIMD; #endif } while (--r); } static uint32_t Vmap1[0x10000 / 32], Vmap2[0x10000 / 32]; static void analyze(unsigned char *Vc, uint32_t n) { uint32_t *Vi = (uint32_t *)Vc; uint32_t i; uint32_t counts[0x100], min, max, diff1, diff2; memset(counts, 0, sizeof(counts)); for (i = 0; i < n; i++) counts[Vc[i]]++; min = ~0UL; max = 0; for (i = 0; i < 0x100; i++) { if (min > counts[i]) min = counts[i]; if (max < counts[i]) max = counts[i]; } fprintf(stderr, "min = %u max = %u" " (min+max)/2 = %.1f (expected %.1f)\n", min, max, (double)(min + max) / 2.0, (double)n / 0x100); n /= sizeof(uint32_t); memset(Vmap1, 0, sizeof(Vmap1)); memset(Vmap2, 0, sizeof(Vmap2)); diff1 = diff2 = 0; for (i = 0; i < n; i++) { uint32_t x = Vi[i] & 0xffff; uint32_t y = Vi[i] >> 16; uint32_t * v = &Vmap1[x / 32]; if (!(*v & (1 << (x % 32)))) diff1++; *v |= 1 << (x % 32); v = &Vmap2[y / 32]; if (!(*v & (1 << (y % 32)))) diff2++; *v |= 1 << (y % 32); } fprintf(stderr, "diff1 = %u diff2 = %u (expected %.1f)\n", diff1, diff2, 0x10000 * (1.0 - pow((0x10000 - 1.0) / 0x10000, n))); } static uint32_t Sinit[SNSIZE * SIMD + TEST_R * 32], V[TEST_N + 1][TEST_R * 32]; int main(void) { uint32_t i; fprintf(stderr, "S-box size = %u\n", SNSIZE * SIMD * 4); /* Start with extremely poor inputs (but not as bad as all-zero S) */ memset(Sinit, 'x', sizeof(Sinit)); memset(V[0], 0, sizeof(V[0])); memcpy(V, "test", 4); /* This is similar to SMix's first loop */ for (i = 0; i < TEST_N; i++) { const uint32_t * S; if ((uint32_t *)&V[i + 1] - (uint32_t *)&V[1] >= SNSIZE * SIMD) S = (uint32_t *)&V[i + 1] - SNSIZE * SIMD; else S = Sinit; blockmix(S, V[i], V[i + 1], TEST_R); } fwrite(&V[TEST_SKIP_OUT], sizeof(V) - TEST_SKIP_OUT * sizeof(V[0]), 1, stdout); analyze((unsigned char *)V[TEST_SKIP_OUT], sizeof(V) - TEST_SKIP_OUT * sizeof(V[0])); return 0; }