[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20140612032248.GA2437@thunk.org>
Date: Wed, 11 Jun 2014 23:22:48 -0400
From: Theodore Ts'o <tytso@....edu>
To: George Spelvin <linux@...izon.com>
Cc: hpa@...ux.intel.com, linux-kernel@...r.kernel.org,
mingo@...nel.org, price@....edu
Subject: Re: drivers/char/random.c: More futzing about
On Wed, Jun 11, 2014 at 08:32:49PM -0400, George Spelvin wrote:
> Comparable, but slightly slower. Clearly, I need to do better.
> And you can see the first-iteration effects clearly. Still,
> noting *remotely* like 7x!
I redid my numbers, and I can no longer reproduce the 7x slowdown. I
do see that if you compile w/o -O2, fast_mix2 is twice as slow. But
it's not 7x slower.
When compiling w/o -O2:
fast_mix fast_mix2
task-clock 221.3 ms 460.7 ms
When compiling with -O2 -Os:
fast_mix fast_mix2
task-clock 115.4 ms 71.5 ms
And here's the numbers I got with a single iteration using rdtsc:
fast_mix: 164 fast_mix2: 237
fast_mix: 168 fast_mix2: 230
fast_mix: 166 fast_mix2: 228
fast_mix: 164 fast_mix2: 230
fast_mix: 166 fast_mix2: 230
fast_mix: 168 fast_mix2: 232
fast_mix: 166 fast_mix2: 228
fast_mix: 164 fast_mix2: 228
fast_mix: 166 fast_mix2: 234
fast_mix: 166 fast_mix2: 230
- Ted
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
typedef unsigned int __u32;
struct fast_pool {
__u32 pool[4];
unsigned long last;
unsigned short count;
unsigned char rotate;
unsigned char last_timer_intr;
};
/**
* rol32 - rotate a 32-bit value left
* @word: value to rotate
* @shift: bits to roll
*/
static inline __u32 rol32(__u32 word, unsigned int shift)
{
return (word << shift) | (word >> (32 - shift));
}
static __u32 const twist_table[8] = {
0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
/*
* This is a fast mixing routine used by the interrupt randomness
* collector. It's hardcoded for an 128 bit pool and assumes that any
* locks that might be needed are taken by the caller.
*/
extern void fast_mix(struct fast_pool *f, __u32 input[4])
{
__u32 w;
unsigned input_rotate = f->rotate;
w = rol32(input[0], input_rotate) ^ f->pool[0] ^ f->pool[3];
f->pool[0] = (w >> 3) ^ twist_table[w & 7];
input_rotate = (input_rotate + 14) & 31;
w = rol32(input[1], input_rotate) ^ f->pool[1] ^ f->pool[0];
f->pool[1] = (w >> 3) ^ twist_table[w & 7];
input_rotate = (input_rotate + 7) & 31;
w = rol32(input[2], input_rotate) ^ f->pool[2] ^ f->pool[1];
f->pool[2] = (w >> 3) ^ twist_table[w & 7];
input_rotate = (input_rotate + 7) & 31;
w = rol32(input[3], input_rotate) ^ f->pool[3] ^ f->pool[2];
f->pool[3] = (w >> 3) ^ twist_table[w & 7];
input_rotate = (input_rotate + 7) & 31;
f->rotate = input_rotate;
f->count++;
}
extern fast_mix2(struct fast_pool *f, __u32 const input[4])
{
__u32 a = f->pool[0] ^ input[0], b = f->pool[1] ^ input[1];
__u32 c = f->pool[2] ^ input[2], d = f->pool[3] ^ input[3];
int i;
for (i = 0; i < 3; i++) {
/*
* Inspired by ChaCha's QuarterRound, but
* modified for much greater parallelism.
* Surprisingly, rotating a and c seems to work
* better than b and d. And it runs faster.
*/
a += b; c += d;
d ^= a; b ^= c;
a = rol32(a, 15); c = rol32(c, 21);
a += b; c += d;
d ^= a; b ^= c;
a = rol32(a, 3); c = rol32(c, 7);
}
f->pool[0] = a; f->pool[1] = b;
f->pool[2] = c; f->pool[3] = d;
f->count++;
}
static __inline__ volatile unsigned long long rdtsc(void)
{
unsigned long long int x;
__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
return x;
}
int main(int argc, char **argv)
{
struct fast_pool f;
int i;
__u32 input[4];
unsigned volatile long long start_time, end_time;
memset(&f, 0, sizeof(f));
memset(&input, 0, sizeof(input));
f.pool[0] = 1;
#if !defined(BENCH_FASTMIX) && !defined(BENCH_FASTMIX2)
for (i=0; i < 10; i++) {
usleep(50000);
start_time = rdtsc();
fast_mix(&f, input);
end_time = rdtsc();
printf("fast_mix: %llu\t", end_time - start_time);
usleep(50000);
start_time = rdtsc();
fast_mix2(&f, input);
end_time = rdtsc();
printf("fast_mix2: %llu\n", end_time - start_time);
}
#endif
#ifdef BENCH_FASTMIX
for (i=0; i < 10240000; i++) {
fast_mix(&f, input);
}
#endif
#ifdef BENCH_FASTMIX2
for (i=0; i < 10240000; i++) {
fast_mix2(&f, input);
}
#endif
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists