linux-kernel - Re: [PATCH] lib/int_sqrt.c: Optimize square root function

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170720112449.6xvc2ghaj3jh6w7l@hirez.programming.kicks-ass.net>
Date:   Thu, 20 Jul 2017 13:24:49 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Linus Torvalds <torvalds@...ux-foundation.org>
Cc:     Anshul Garg <aksgarg1989@...il.com>,
        Davidlohr Bueso <dave@...olabs.net>,
        Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
        anshul.g@...sung.com, Thomas Gleixner <tglx@...utronix.de>,
        joe@...ches.com
Subject: Re: [PATCH] lib/int_sqrt.c: Optimize square root function

On Mon, Feb 02, 2015 at 11:13:44AM -0800, Linus Torvalds wrote:
> On Mon, Feb 2, 2015 at 11:00 AM, Linus Torvalds
> <torvalds@...ux-foundation.org> wrote:
> >
> > (I'm also not entirely sure what uses int_sqrt() that ends up being so
> > performance-critical, so it would be good to document that too, since
> > that probably also matters for the "what's the normal argument range"
> > question..)
> 
> ... it's also not entirely clear that we need a whole new loop. We
> might just instead start off with a better guess for 'm' using some
> calculation that might be doable with a single conditional move
> instruction instead of a loop. Because I suspect that the inevitable
> branch misprediction of a new loop is likely as expensive as a few
> iterations through the core one.
> 
> IOW, instead of
> 
>   m = 1UL << (BITS_PER_LONG - 2);
> 
> perhaps something like
> 
>    m = 1UL << (BITS_PER_LONG/2- 2);
>    if (m < x)
>        m <<= BITS_PER_LONG/2;
> 
> (assuming gcc can change that code into a "cmov") might cut down the
> "lots of empty loops" case in half for small values of 'x'.
> 
> There's probably some other better cheap initial guess value estimator.

So I had a wee look at int_sqrt(), and yes Davidlohr's patch makes it
worse for my machine (Xeon E3v5 aka. fancy Skylake).

As in _MUCH_ worse..

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1  ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       828,233,295      cycles:u                                                      ( +-  0.04% )
     2,480,095,771      instructions:u            #    2.99  insn per cycle           ( +-  0.00% )

       0.220202758 seconds time elapsed                                          ( +-  0.18% )

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000   ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       603,809,551      cycles:u                                                      ( +-  1.30% )
     1,677,726,591      instructions:u            #    2.78  insn per cycle           ( +-  0.00% )

       0.160801044 seconds time elapsed  

That is, we went from ~60 cycles to ~82 cycles per invocation.


Now, the patches proposed in this thread do indeed improve matters but
seem to not have merged up until this day:


~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DANSHUL=1  ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       598,827,196      cycles:u                                                      ( +-  0.18% )
     1,697,726,381      instructions:u            #    2.84  insn per cycle           ( +-  0.00% )

       0.159880738 seconds time elapsed                                          ( +-  0.36% )

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DLINUS=1  ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       601,463,184      cycles:u                                                      ( +-  0.07% )
     1,380,095,976      instructions:u            #    2.29  insn per cycle           ( +-  0.00% )

       0.160788095 seconds time elapsed                                          ( +-  0.55% )

Which basically bring us back to the old performance. So I would
strongly suggest we merge one.


Now, the thing is, Intel CPUs are actually fairly good at fls(), so I
gave Joe's suggestions a spin too..

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1  ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       390,420,004      cycles:u                                                      ( +-  0.07% )
     1,141,802,493      instructions:u            #    2.92  insn per cycle           ( +-  0.00% )

       0.105480000 seconds time elapsed                                          ( +-  0.64% )

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1 -DANSHUL=1  ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       328,415,775      cycles:u                                                      ( +-  0.15% )
     1,138,579,704      instructions:u            #    3.47  insn per cycle           ( +-  0.00% )

       0.088703205 seconds time elapsed  

Which gets us a real improvement...

Now even with the software fls() fallback from
include/asm-generic/bitops/fls.h there is a significant improvement:


~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1 -DANSHUL=1 -DSOFTFLS=1  ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       375,633,931      cycles:u                                                      ( +-  0.13% )
     1,289,700,013      instructions:u            #    3.43  insn per cycle           ( +-  0.00% )

       0.100596211 seconds time elapsed 


Also, I ran with -DVALIDATE=1 and while the comment says its a 'rough'
estimate of the sqrt() I did not in fact find any value for which we
deviate < INT_MAX.




----------------------[ sqrt.c ]---------------------------

#include <math.h>
#include <stdio.h>

#define BITS_PER_LONG	(sizeof(long) * 8)

#ifndef LOOPS
#define LOOPS 1000000
#endif

#ifdef SOFTFLS

static __always_inline unsigned long fls(unsigned long x)
{
	int r = 32;

	if (!x)
		return 0;

	if (!(x & 0xffff0000u)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xff000000u)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xf0000000u)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xc0000000u)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000u)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

#else

static __always_inline unsigned long fls(unsigned long word)
{
	asm("rep; bsr %1,%0"
		: "=r" (word)
		: "rm" (word));
	return BITS_PER_LONG - 1 - word;
}

#endif


#ifdef NEW

unsigned long __attribute__((noinline)) int_sqrt(unsigned long x)
{
	unsigned long b, m, y = 0;

	if (x <= 1)
		return x;

#ifdef LINUS
	m = 1UL << (BITS_PER_LONG/2 - 2);
	if (m < x)
		m <<= BITS_PER_LONG/2;

#else

#ifdef FLS
	m = 1UL << ((fls(x) + 1) & ~1UL);
#else
	m = 1UL << (BITS_PER_LONG - 2);
#endif

#ifdef ANSHUL
	while (m > x)
		m >>= 2;
#endif
#endif

	while (m != 0) {
		b = y + m;
		y >>= 1;

		if (x >= b) {
			x -= b;
			y += m;
		}
		m >>= 2;
	}

	return y;
}

#else

unsigned long __attribute__((noinline)) int_sqrt(unsigned long x)
{
	unsigned long op, res, one;

	op = x;
	res = 0;

	one = 1UL << (BITS_PER_LONG - 2);
	while (one > op)
		one >>= 2;

	while (one != 0) {
		if (op >= res + one) {
			op = op - (res + one);
			res = res +  2 * one;
		}
		res /= 2;
		one /= 4;
	}
	return res;
}

#endif

void main(void)
{
	unsigned long i;

	for (i=0; i<LOOPS; i++) {
		unsigned long a = int_sqrt(i);
		asm volatile("" : : "r" (a));

#ifdef VALIDATE
		{
			unsigned long b = floor(sqrt(i));
			if (a != b)
				printf("%ld %ld %ld\n", i, a, b);
		}
#endif

	}
}

--------------

The only caveat seems to be that our fls() is only defined for 'int' not
long :/


---
 lib/int_sqrt.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c
index 1ef4cc344977..87c3aa360441 100644
--- a/lib/int_sqrt.c
+++ b/lib/int_sqrt.c
@@ -7,12 +7,11 @@
 
 #include <linux/kernel.h>
 #include <linux/export.h>
+#include <linux/bitops.h>
 
 /**
- * int_sqrt - rough approximation to sqrt
+ * int_sqrt - approximation to sqrt
  * @x: integer of which to calculate the sqrt
- *
- * A very rough approximation to the sqrt() function.
  */
 unsigned long int_sqrt(unsigned long x)
 {
@@ -21,7 +20,11 @@ unsigned long int_sqrt(unsigned long x)
 	if (x <= 1)
 		return x;
 
-	m = 1UL << (BITS_PER_LONG - 2);
+	m = 1UL << ((fls(x) + 1) & ~1UL);
+
+	while (m > x)
+		m >>= 2;
+
 	while (m != 0) {
 		b = y + m;
 		y >>= 1;