linux-kernel - Re: [PATCH v2 1/3] lib: find_*

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <54D2A2B9.1020206@gmail.com>
Date:	Thu, 05 Feb 2015 01:52:41 +0300
From:	Yury <yury.norov@...il.com>
To:	Rasmus Villemoes <linux@...musvillemoes.dk>
CC:	klimov.linux@...il.com, davem@...emloft.net,
	akpm@...ux-foundation.org, hannes@...essinduktion.org,
	dborkman@...hat.com, laijs@...fujitsu.com,
	takahiro.akashi@...aro.org, valentinrothberg@...il.com,
	linux@...izon.com, msalter@...hat.com, chris@...is-wilson.co.uk,
	tgraf@...g.ch, linux-kernel@...r.kernel.org,
	Yury Norov <y.norov@...sung.com>
Subject: Re: [PATCH v2 1/3] lib: find_*_bit reimplementation


On 02.02.2015 13:43, Rasmus Villemoes wrote:
> On Sat, Jan 31 2015, yury.norov@...il.com wrote:
>
>> From: Yury Norov <y.norov@...sung.com>
>>
>> New implementations takes less space in source file (see diffstat)
>> and in object. For me it's 710 vs 453 bytes of text.
>>
> New version generally looks good. Please include a summary of the
> changes between the versions either below the --- line or in a 0/n cover
> letter, especially since you've now expanded the scope of the series.
>
> Comments below.
>
>> Patch was boot-tested on x86_64 and MIPS (big-endian) machines.
>> Performance tests were ran on userspace with code like this:
>>
>> 	/* addr[] is filled from /dev/urandom */
>> 	start = clock();
>> 	while (ret < nbits)
>> 		ret = find_next_bit(addr, nbits, ret + 1);
>>
>> 	end = clock();
>> 	printf("%ld\t", (unsigned long) end - start);
>>
>> On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz rezults are next:
>> (for find_next_bit, nbits is 8M, for find_first_bit - 80K)
>>
>> 	find_next_bit:		find_first_bit:
>> 	new	current		new	current
>> 	26932	43151		14777	14925
>> 	26947	43182		14521	15423
>> 	26507	43824		15053	14705
>> 	27329	43759		14473	14777
>> 	26895	43367		14847	15023
>> 	26990	43693		15103	15163
>> 	26775	43299		15067	15232
>> 	27282	42752		14544	15121
>> 	27504	43088		14644	14858
>> 	26761	43856		14699	15193
>> 	26692	43075		14781	14681
>> 	27137	42969		14451	15061
>> 	...			...
>>
>> find_next_bit performance gain is 35-40%;
>> find_first_bit - no measurable difference.
>>
>> Signed-off-by: Yury Norov <yury.norov@...il.com>
>> ---
>>  lib/find_last_bit.c |  31 ++-----
>>  lib/find_next_bit.c | 254 +++++++++++++++-------------------------------------
>>  2 files changed, 79 insertions(+), 206 deletions(-)
>>
>> diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
>> index 91ca09f..e67e970 100644
>> --- a/lib/find_last_bit.c
>> +++ b/lib/find_last_bit.c
>> @@ -4,44 +4,29 @@
>>   * Written by Rusty Russell <rusty@...tcorp.com.au>
>>   * (Inspired by David Howell's find_next_bit implementation)
>>   *
>> + * Rewritten by Yury Norov <yury.norov@...il.com> to decrease
>> + * size and improve performance, 2015.
>> + *
>>   * This program is free software; you can redistribute it and/or
>>   * modify it under the terms of the GNU General Public License
>>   * as published by the Free Software Foundation; either version
>>   * 2 of the License, or (at your option) any later version.
>>   */
>>  
>> -#include <linux/bitops.h>
> Why do you remove that #include? It is rather important that the header
> and implementation don't get out of sync. I know that kernel.h includes
> bitops.h, but please don't rely on such things. Quoting SubmitChecklist:
>
> 1: If you use a facility then #include the file that defines/declares
>    that facility.  Don't depend on other header files pulling in ones
>    that you use.
>
>
>>  #include <linux/export.h>
>> -#include <asm/types.h>
>> -#include <asm/byteorder.h>
> However, getting rid of includes that are no longer needed is certainly
> a good thing.
Yes, linux/bitops.h are to get back.
>> +#include <linux/kernel.h>
>>  
>>  #ifndef find_last_bit
>>  
>>  unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
>>  {
>> -	unsigned long words;
>> -	unsigned long tmp;
>> -
>> -	/* Start at final word. */
>> -	words = size / BITS_PER_LONG;
>> -
>> -	/* Partial final word? */
>> -	if (size & (BITS_PER_LONG-1)) {
>> -		tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
>> -					 - (size & (BITS_PER_LONG-1)))));
>> -		if (tmp)
>> -			goto found;
>> -	}
>> +	unsigned long idx = DIV_ROUND_UP(size, BITS_PER_LONG);
>>  
>> -	while (words) {
>> -		tmp = addr[--words];
>> -		if (tmp) {
>> -found:
>> -			return words * BITS_PER_LONG + __fls(tmp);
>> -		}
>> +	while (idx--) {
>> +		if (addr[idx])
>> +			return min(idx * BITS_PER_LONG + __fls(addr[idx]), size);
>>  	}
>>  
>> -	/* Not found */
>>  	return size;
>>  }
>>  EXPORT_SYMBOL(find_last_bit);
>> diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
>> index 0cbfc0b..ebfb3dc 100644
>> --- a/lib/find_next_bit.c
>> +++ b/lib/find_next_bit.c
>> @@ -3,18 +3,45 @@
>>   * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
>>   * Written by David Howells (dhowells@...hat.com)
>>   *
>> + * Rewritten by Yury Norov <yury.norov@...il.com> to decrease
>> + * size and improve performance, 2015.
>> + *
>>   * This program is free software; you can redistribute it and/or
>>   * modify it under the terms of the GNU General Public License
>>   * as published by the Free Software Foundation; either version
>>   * 2 of the License, or (at your option) any later version.
>>   */
>>  
>> -#include <linux/bitops.h>
>>  #include <linux/export.h>
>> -#include <asm/types.h>
>> -#include <asm/byteorder.h>
>> +#include <linux/kernel.h>
> Same as above.
>
>> +#define HIGH_BITS_MASK(nr)		(ULONG_MAX << (nr))
>> +
>> +#if !defined(find_next_bit) || !defined(find_next_zero_bit)
>> +static unsigned long _find_next_bit(const unsigned long *addr,
>> +		unsigned long nbits, unsigned long start, bool set)
>> +{
>> +	unsigned long tmp = set ? addr[start / BITS_PER_LONG]
>> +			: ~addr[start / BITS_PER_LONG];
>> +
>> +	/* Handle 1st word. */
>> +	if (!IS_ALIGNED(start, BITS_PER_LONG)) {
>> +		tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
>> +		start = round_down(start, BITS_PER_LONG);
>> +	}
>>  
>> -#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
>> +	while (!tmp) {
>> +		start += BITS_PER_LONG;
>> +		if (start >= nbits)
>> +			return nbits;
>> +
>> +		tmp = set ? addr[start / BITS_PER_LONG]
>> +			: ~addr[start / BITS_PER_LONG];
>> +	}
>> +
>> +	return start + __ffs(tmp);
>> +}
>> +#endif
>>  
>>  #ifndef find_next_bit
>>  /*
>> @@ -23,86 +50,22 @@
>>  unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
>>  			    unsigned long offset)
>>  {
>> -	const unsigned long *p = addr + BITOP_WORD(offset);
>> -	unsigned long result = offset & ~(BITS_PER_LONG-1);
>> -	unsigned long tmp;
>> -
>>  	if (offset >= size)
>>  		return size;
> Why can't this ...
>
>
>> -	size -= result;
>> -	offset %= BITS_PER_LONG;
>> -	if (offset) {
>> -		tmp = *(p++);
>> -		tmp &= (~0UL << offset);
>> -		if (size < BITS_PER_LONG)
>> -			goto found_first;
>> -		if (tmp)
>> -			goto found_middle;
>> -		size -= BITS_PER_LONG;
>> -		result += BITS_PER_LONG;
>> -	}
>> -	while (size & ~(BITS_PER_LONG-1)) {
>> -		if ((tmp = *(p++)))
>> -			goto found_middle;
>> -		result += BITS_PER_LONG;
>> -		size -= BITS_PER_LONG;
>> -	}
>> -	if (!size)
>> -		return result;
>> -	tmp = *p;
>>  
>> -found_first:
>> -	tmp &= (~0UL >> (BITS_PER_LONG - size));
>> -	if (tmp == 0UL)		/* Are any bits set? */
>> -		return result + size;	/* Nope. */
>> -found_middle:
>> -	return result + __ffs(tmp);
>> +	return min(_find_next_bit(addr, size, offset, 1), size);
> ... and this be part of _find_next_bit? Can find_next_bit not be simply
> 'return _find_next_bit(addr, size, offset, 1);', and similarly for
> find_next_zero_bit? Btw., passing true and false for the boolean
> parameter may be a little clearer.
I moved size checkers out of '_find_next_bit' to let user call it from his code
if he knows for sure that size/offset pair is valid. This may help save a couple
of clocks. I think, I'll walk over the code to find how many such places we have.
If not too much / not in critical paths, checks may be moved into the function.

Or maybe leave as is and place some comment for future?...
>
>>  }
>>  EXPORT_SYMBOL(find_next_bit);
>>  #endif
>>  
>>  #ifndef find_next_zero_bit
>> -/*
>> - * This implementation of find_{first,next}_zero_bit was stolen from
>> - * Linus' asm-alpha/bitops.h.
>> - */
>>  unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
>>  				 unsigned long offset)
>>  {
>> -	const unsigned long *p = addr + BITOP_WORD(offset);
>> -	unsigned long result = offset & ~(BITS_PER_LONG-1);
>> -	unsigned long tmp;
>> -
>>  	if (offset >= size)
>>  		return size;
> See above.
>
>> -	size -= result;
>> -	offset %= BITS_PER_LONG;
>> -	if (offset) {
>> -		tmp = *(p++);
>> -		tmp |= ~0UL >> (BITS_PER_LONG - offset);
>> -		if (size < BITS_PER_LONG)
>> -			goto found_first;
>> -		if (~tmp)
>> -			goto found_middle;
>> -		size -= BITS_PER_LONG;
>> -		result += BITS_PER_LONG;
>> -	}
>> -	while (size & ~(BITS_PER_LONG-1)) {
>> -		if (~(tmp = *(p++)))
>> -			goto found_middle;
>> -		result += BITS_PER_LONG;
>> -		size -= BITS_PER_LONG;
>> -	}
>> -	if (!size)
>> -		return result;
>> -	tmp = *p;
>>  
>> -found_first:
>> -	tmp |= ~0UL << size;
>> -	if (tmp == ~0UL)	/* Are any bits zero? */
>> -		return result + size;	/* Nope. */
>> -found_middle:
>> -	return result + ffz(tmp);
>> +	return min(_find_next_bit(addr, size, offset, 0), size);
> See above.
>
>>  }
>>  EXPORT_SYMBOL(find_next_zero_bit);
>>  #endif
>> @@ -113,24 +76,14 @@ EXPORT_SYMBOL(find_next_zero_bit);
>>   */
>>  unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
>>  {
>> -	const unsigned long *p = addr;
>> -	unsigned long result = 0;
>> -	unsigned long tmp;
>> +	unsigned long idx;
>>  
>> -	while (size & ~(BITS_PER_LONG-1)) {
>> -		if ((tmp = *(p++)))
>> -			goto found;
>> -		result += BITS_PER_LONG;
>> -		size -= BITS_PER_LONG;
>> +	for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
>> +		if (addr[idx])
>> +			return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
>>  	}
>> -	if (!size)
>> -		return result;
>>  
>> -	tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
>> -	if (tmp == 0UL)		/* Are any bits set? */
>> -		return result + size;	/* Nope. */
>> -found:
>> -	return result + __ffs(tmp);
>> +	return size;
>>  }
>>  EXPORT_SYMBOL(find_first_bit);
>>  #endif
>> @@ -141,24 +94,14 @@ EXPORT_SYMBOL(find_first_bit);
>>   */
>>  unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
>>  {
>> -	const unsigned long *p = addr;
>> -	unsigned long result = 0;
>> -	unsigned long tmp;
>> +	unsigned long idx;
>>  
>> -	while (size & ~(BITS_PER_LONG-1)) {
>> -		if (~(tmp = *(p++)))
>> -			goto found;
>> -		result += BITS_PER_LONG;
>> -		size -= BITS_PER_LONG;
>> +	for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
>> +		if (addr[idx] != ULONG_MAX)
>> +			return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
>>  	}
>> -	if (!size)
>> -		return result;
>>  
>> -	tmp = (*p) | (~0UL << size);
>> -	if (tmp == ~0UL)	/* Are any bits zero? */
>> -		return result + size;	/* Nope. */
>> -found:
>> -	return result + ffz(tmp);
>> +	return size;
>>  }
>>  EXPORT_SYMBOL(find_first_zero_bit);
>>  #endif
>> @@ -166,18 +109,6 @@ EXPORT_SYMBOL(find_first_zero_bit);
>>  #ifdef __BIG_ENDIAN
>>  
>>  /* include/linux/byteorder does not support "unsigned long" type */
>> -static inline unsigned long ext2_swabp(const unsigned long * x)
>> -{
>> -#if BITS_PER_LONG == 64
>> -	return (unsigned long) __swab64p((u64 *) x);
>> -#elif BITS_PER_LONG == 32
>> -	return (unsigned long) __swab32p((u32 *) x);
>> -#else
>> -#error BITS_PER_LONG not defined
>> -#endif
>> -}
>> -
>> -/* include/linux/byteorder doesn't support "unsigned long" type */
>>  static inline unsigned long ext2_swab(const unsigned long y)
>>  {
>>  #if BITS_PER_LONG == 64
>> @@ -189,48 +120,40 @@ static inline unsigned long ext2_swab(const unsigned long y)
>>  #endif
>>  }
>>  
>> +#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
>> +static unsigned long _find_next_bit_le(const unsigned long *addr,
>> +		unsigned long nbits, unsigned long start, bool set)
>> +{
>> +	unsigned long tmp = set ? addr[start / BITS_PER_LONG]
>> +			: ~addr[start / BITS_PER_LONG];
>> +
>> +	/* Handle 1st word. */
>> +	if (!IS_ALIGNED(start, BITS_PER_LONG)) {
>> +		tmp &= ext2_swab(HIGH_BITS_MASK(start % BITS_PER_LONG));
>> +		start = round_down(start, BITS_PER_LONG);
>> +	}
>> +
>> +	while (!tmp) {
>> +		start += BITS_PER_LONG;
>> +		if (start >= nbits)
>> +			return nbits;
>> +
>> +		tmp = set ? addr[start / BITS_PER_LONG]
>> +			: ~addr[start / BITS_PER_LONG];
>> +	}
>> +
>> +	return start + __ffs(ext2_swab(tmp));
>> +}
>> +#endif
>> +
>>  #ifndef find_next_zero_bit_le
>>  unsigned long find_next_zero_bit_le(const void *addr, unsigned
>>  		long size, unsigned long offset)
>>  {
>> -	const unsigned long *p = addr;
>> -	unsigned long result = offset & ~(BITS_PER_LONG - 1);
>> -	unsigned long tmp;
>> -
>>  	if (offset >= size)
>>  		return size;
> Again, I think this should be moved to the common implementation in
> _find_next_bit_le, and similarly for find_next_bit_le below.
>
>> -	p += BITOP_WORD(offset);
>> -	size -= result;
>> -	offset &= (BITS_PER_LONG - 1UL);
>> -	if (offset) {
>> -		tmp = ext2_swabp(p++);
>> -		tmp |= (~0UL >> (BITS_PER_LONG - offset));
>> -		if (size < BITS_PER_LONG)
>> -			goto found_first;
>> -		if (~tmp)
>> -			goto found_middle;
>> -		size -= BITS_PER_LONG;
>> -		result += BITS_PER_LONG;
>> -	}
>>  
>> -	while (size & ~(BITS_PER_LONG - 1)) {
>> -		if (~(tmp = *(p++)))
>> -			goto found_middle_swap;
>> -		result += BITS_PER_LONG;
>> -		size -= BITS_PER_LONG;
>> -	}
>> -	if (!size)
>> -		return result;
>> -	tmp = ext2_swabp(p);
>> -found_first:
>> -	tmp |= ~0UL << size;
>> -	if (tmp == ~0UL)	/* Are any bits zero? */
>> -		return result + size; /* Nope. Skip ffz */
>> -found_middle:
>> -	return result + ffz(tmp);
>> -
>> -found_middle_swap:
>> -	return result + ffz(ext2_swab(tmp));
>> +	return min(_find_next_bit_le(addr, size, offset, 0), size);
>>  }
>>  EXPORT_SYMBOL(find_next_zero_bit_le);
>>  #endif
>> @@ -239,45 +162,10 @@ EXPORT_SYMBOL(find_next_zero_bit_le);
>>  unsigned long find_next_bit_le(const void *addr, unsigned
>>  		long size, unsigned long offset)
>>  {
>> -	const unsigned long *p = addr;
>> -	unsigned long result = offset & ~(BITS_PER_LONG - 1);
>> -	unsigned long tmp;
>> -
>>  	if (offset >= size)
>>  		return size;
>> -	p += BITOP_WORD(offset);
>> -	size -= result;
>> -	offset &= (BITS_PER_LONG - 1UL);
>> -	if (offset) {
>> -		tmp = ext2_swabp(p++);
>> -		tmp &= (~0UL << offset);
>> -		if (size < BITS_PER_LONG)
>> -			goto found_first;
>> -		if (tmp)
>> -			goto found_middle;
>> -		size -= BITS_PER_LONG;
>> -		result += BITS_PER_LONG;
>> -	}
>> -
>> -	while (size & ~(BITS_PER_LONG - 1)) {
>> -		tmp = *(p++);
>> -		if (tmp)
>> -			goto found_middle_swap;
>> -		result += BITS_PER_LONG;
>> -		size -= BITS_PER_LONG;
>> -	}
>> -	if (!size)
>> -		return result;
>> -	tmp = ext2_swabp(p);
>> -found_first:
>> -	tmp &= (~0UL >> (BITS_PER_LONG - size));
>> -	if (tmp == 0UL)		/* Are any bits set? */
>> -		return result + size; /* Nope. */
>> -found_middle:
>> -	return result + __ffs(tmp);
>>  
>> -found_middle_swap:
>> -	return result + __ffs(ext2_swab(tmp));
>> +	return min(_find_next_bit_le(addr, size, offset, 1), size);
>>  }
>>  EXPORT_SYMBOL(find_next_bit_le);
>>  #endif

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/