lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Fri, 18 Apr 2008 22:18:09 +0200
From:	Alexander van Heukelum <heukelum@...lshack.com>
To:	Alexander van Heukelum <heukelum@...tmail.fm>
Cc:	dean gaudet <dean@...tic.org>, Ingo Molnar <mingo@...e.hu>,
	Andi Kleen <andi@...stfloor.org>,
	LKML <linux-kernel@...r.kernel.org>
Subject: Alternative implementation of the generic __ffs

On Mon, Apr 07, 2008 at 12:25:50PM +0200, Alexander van Heukelum wrote:
> On Sun, 6 Apr 2008 13:22:58 -0700 (PDT), "dean gaudet" <dean@...tic.org> said:
> > On Sun, 6 Apr 2008, Alexander van Heukelum wrote:
> > > The current generic implementation of ffz is O(lg(n)) already
> > 
> > it's O(lg(n)) time... the operations all depend on each other.
> > 
> > the implementation i pointed to is O(lg(n)) code space... and the time 
> > depends on how parallel the machine is, they're not dependent on each 
> > other.
> 
> Indeed. The worst dependencies are in the sum of all the partial
> results in this implementation. And addition is associative, so
> partial results can be written as ((a+b)+(c+d))+(e+f). Assuming
> perfect parallel execution this would lead to O(ln(ln(n))). Good.

Hello all,

I've implemented ffs (find first set bit) like it is shown
in http://www.hackersdelight.org/ (see revisions, page 21).
I would be interested to see how it compares with the generic
implementation of __ffs in the linux kernel, in particular
on architectures that do not have an obvious fast way of
determining the first set bit of a word.

I have included a simple benchmark program that should give
a reasonable estimate of the performance ratio of the two
implementations. Please test and report :).

Is this implementation suitable to replace the current one?

Greetings,
	Alexander

P.S. Some results for some machines I could test on:

-----------

On a 2.1 GHz Athlon XP:
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original:        396 tics,   756 tics
New:             378 tics,   851 tics
Assembly:        262 tics,   383 tics
Empty loop:      128 tics,   203 tics

real    0m33.862s
user    0m33.026s
sys     0m0.344s

-----------

On a 2.33 GHz Xeon:
$ gcc -m64 -Os ffs.c && time ./a.out 
Original:        277 tics,   324 tics
New:             230 tics,   236 tics
Assembly:         90 tics,    84 tics
Empty loop:       90 tics,    82 tics

real    0m14.490s
user    0m14.270s
sys     0m0.220s
$ gcc -m32 -Os -fomit-frame-pointer ffs.c && time ./a.out
Original:        303 tics,   449 tics
New:             231 tics,   394 tics
Assembly:         90 tics,   144 tics
Empty loop:      102 tics,   116 tics

real    0m18.521s
user    0m18.380s
sys     0m0.140s

-----------

On an alpha EV6.7 (21264A) operating at 667 MHz:
$ gcc -Os ffs.c && time ./a.out
Original:        622 tics,   633 tics
New:             431 tics,   465 tics
Assembly:        235 tics,   210 tics
Empty loop:      199 tics,   218 tics
50.358u 0.259s 1:14.28 68.1%    0+1k 2+0io 2pf+0w

-----------

#include <stdio.h>
#include <sys/times.h>

#define LOOPS32 (1<<(30-5-1))
#define LOOPS64 (1<<(30-6-1))
#define ATTR __attribute__((noinline))

static ATTR int __ffs32_orig(unsigned int word)
{
	int num = 0;

	if ((word & 0xffff) == 0) {
		num += 16;
		word >>= 16;
	}
	if ((word & 0xff) == 0) {
		num += 8;
		word >>= 8;
	}
	if ((word & 0xf) == 0) {
		num += 4;
		word >>= 4;
	}
	if ((word & 0x3) == 0) {
		num += 2;
		word >>= 2;
	}
	if ((word & 0x1) == 0)
		num += 1;

	return num;
}

static ATTR int __ffs64_orig(unsigned long long word)
{
	int num = 0;

	if ((word & 0xffffffff) == 0) {
		num += 32;
		word >>= 32;
	}
	if ((word & 0xffff) == 0) {
		num += 16;
		word >>= 16;
	}
	if ((word & 0xff) == 0) {
		num += 8;
		word >>= 8;
	}
	if ((word & 0xf) == 0) {
		num += 4;
		word >>= 4;
	}
	if ((word & 0x3) == 0) {
		num += 2;
		word >>= 2;
	}
	if ((word & 0x1) == 0)
		num += 1;

	return num;
}

static ATTR int __ffs32_new(unsigned int value)
{
	int x0, x1, x2, x3, x4;

	value &= -value;
	x0 = (value & 0x55555555) ? 0 : 1;
	x1 = (value & 0x33333333) ? 0 : 2;
	x2 = (value & 0x0f0f0f0f) ? 0 : 4;
	x3 = (value & 0x00ff00ff) ? 0 : 8;
	x4 = (value & 0x0000ffff) ? 0 : 16;

	return x0 | x1 | x2 | x3 | x4;
}

static ATTR int __ffs64_new(unsigned long long value)
{
	int x0, x1, x2, x3, x4, x5;

	value &= -value;
	x0 = (value & 0x5555555555555555ull) ? 0 : 1;
	x1 = (value & 0x3333333333333333ull) ? 0 : 2;
	x2 = (value & 0x0f0f0f0f0f0f0f0full) ? 0 : 4;
	x3 = (value & 0x00ff00ff00ff00ffull) ? 0 : 8;
	x4 = (value & 0x0000ffff0000ffffull) ? 0 : 16;
	x5 = (value & 0x00000000ffffffffull) ? 0 : 32;

	return x0 | x1 | x2 | x3 | x4 | x5;
}

#ifdef __x86_64__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
	int res;

	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}

static ATTR int __ffs64_asm(unsigned long long value)
{
	long res;

	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}
#endif

#ifdef __i386__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
	int res;

	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}

static ATTR int __ffs64_asm(unsigned long long value)
{
	unsigned int low, high;
	int res;

	low = value;
	if (low) {
		asm volatile("bsf %[value], %[res]"
			: [res] "=r" (res)
			: [value] "r" (low)
		);
		return res;
	}

	high = value >> 32;
	asm volatile("bsf %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (high)
	);

	return res | 32;
}
#endif

#ifdef __alpha__
#define ASSEMBLYVERSION
static ATTR int __ffs32_asm(unsigned int value)
{
	int res;

	asm volatile("cttz %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}

static ATTR int __ffs64_asm(unsigned long long value)
{
	long res;

	asm volatile("cttz %[value], %[res]"
		: [res] "=r" (res)
		: [value] "r" (value)
	);	

	return res;
}
#endif

static ATTR int __nothing32(unsigned int value)
{
	return value;
}

static ATTR int __nothing64(unsigned long long value)
{
	return (int)value;
}

/* Random numbers: modified version of libc mrand48 */
static unsigned long long myrand_next = 0x330eull;
static const unsigned long long myrand_a = 0x5deece66dull;
static const unsigned long long myrand_b = 0xbull;
unsigned int myrand(void)
{
	myrand_next = myrand_a * myrand_next + myrand_b;
	return (unsigned int)(myrand_next >> 16);
}

unsigned long long myrand64(void)
{
	return ((unsigned long long)myrand() << 32) | myrand();
}

void myrand_seed(unsigned int seed)
{
	int n;
	myrand_next = ((unsigned long long)seed << 16) + 0x330eull;
	for (n=0; n<100; n++) myrand(); /* warm it up a bit */
}

/* tic: wait for next clock tick and save current tick */
clock_t tictime;
void tic(void)
{
	struct tms t;
	times(&t);
	tictime = t.tms_utime;
	do {
		times(&t);
	} while (tictime == t.tms_utime);
	tictime = t.tms_utime;
}

/* toc: report number of ticks since tic() */
long toc(void)
{
	struct tms t;
	times(&t);
	return (long)(t.tms_utime - tictime);
}

__attribute__((noinline))
int bench_orig32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __ffs32_orig(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_orig64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __ffs64_orig(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_new32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __ffs32_new(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_new64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __ffs64_new(i);
			i &= i - 1;	
		}	
	}
	return res;
}

#ifdef ASSEMBLYVERSION
__attribute__((noinline))
int bench_asm32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __ffs32_asm(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_asm64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __ffs64_asm(i);
			i &= i - 1;	
		}	
	}
	return res;
}
#endif

__attribute__((noinline))
int bench_none32(void)
{
	unsigned int i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS32; n++) {
		i = myrand();
		while (i) {
			res ^= __nothing32(i);
			i &= i - 1;	
		}	
	}
	return res;
}

__attribute__((noinline))
int bench_none64(void)
{
	unsigned long long i;
	int n, res = 0;

	myrand_seed(0);
	for (n=0; n<LOOPS64; n++) {
		i = myrand64();
		while (i) {
			res ^= __nothing64(i);
			i &= i - 1;
		}	
	}
	return res;
}

void test(void)
{
	unsigned long long int i;
	int n, res1, res2;

	myrand_seed(0);

	for (n=0; n<1000; n++) {
		i = myrand64();
		while (i) {
			res1 = __ffs64_orig(i);
			res2 = __ffs64_new(i);
			if (res1 != res2) {
				printf("%llu %i %i\n", i,
						res1, res2);
			}
			i &= i - 1;
		}
	}
}

int main(void)
{
	long tics;

	setvbuf(stdout, NULL, _IONBF, 0);
	test();

	printf("%-14s", "Original:");
	tic(); bench_orig32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_orig64(); tics = toc();
	printf("%6lu tics\n", tics);

	printf("%-14s", "New:");
	tic(); bench_new32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_new64(); tics = toc();
	printf("%6lu tics\n", tics);

#ifdef ASSEMBLYVERSION
	printf("%-14s", "Assembly:");
	tic(); bench_asm32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_asm64(); tics = toc();
	printf("%6lu tics\n", tics);
#endif

	printf("%-14s", "Empty loop:");
	tic(); bench_none32(); tics = toc();
	printf("%6lu tics,", tics);
	tic(); bench_none64(); tics = toc();
	printf("%6lu tics\n", tics);

	return 0;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ