[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170720112449.6xvc2ghaj3jh6w7l@hirez.programming.kicks-ass.net>
Date: Thu, 20 Jul 2017 13:24:49 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Linus Torvalds <torvalds@...ux-foundation.org>
Cc: Anshul Garg <aksgarg1989@...il.com>,
Davidlohr Bueso <dave@...olabs.net>,
Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
anshul.g@...sung.com, Thomas Gleixner <tglx@...utronix.de>,
joe@...ches.com
Subject: Re: [PATCH] lib/int_sqrt.c: Optimize square root function
On Mon, Feb 02, 2015 at 11:13:44AM -0800, Linus Torvalds wrote:
> On Mon, Feb 2, 2015 at 11:00 AM, Linus Torvalds
> <torvalds@...ux-foundation.org> wrote:
> >
> > (I'm also not entirely sure what uses int_sqrt() that ends up being so
> > performance-critical, so it would be good to document that too, since
> > that probably also matters for the "what's the normal argument range"
> > question..)
>
> ... it's also not entirely clear that we need a whole new loop. We
> might just instead start off with a better guess for 'm' using some
> calculation that might be doable with a single conditional move
> instruction instead of a loop. Because I suspect that the inevitable
> branch misprediction of a new loop is likely as expensive as a few
> iterations through the core one.
>
> IOW, instead of
>
> m = 1UL << (BITS_PER_LONG - 2);
>
> perhaps something like
>
> m = 1UL << (BITS_PER_LONG/2- 2);
> if (m < x)
> m <<= BITS_PER_LONG/2;
>
> (assuming gcc can change that code into a "cmov") might cut down the
> "lots of empty loops" case in half for small values of 'x'.
>
> There's probably some other better cheap initial guess value estimator.
So I had a wee look at int_sqrt(), and yes Davidlohr's patch makes it
worse for my machine (Xeon E3v5 aka. fancy Skylake).
As in _MUCH_ worse..
~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt
Performance counter stats for './sqrt' (10 runs):
828,233,295 cycles:u ( +- 0.04% )
2,480,095,771 instructions:u # 2.99 insn per cycle ( +- 0.00% )
0.220202758 seconds time elapsed ( +- 0.18% )
~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt
Performance counter stats for './sqrt' (10 runs):
603,809,551 cycles:u ( +- 1.30% )
1,677,726,591 instructions:u # 2.78 insn per cycle ( +- 0.00% )
0.160801044 seconds time elapsed
That is, we went from ~60 cycles to ~82 cycles per invocation.
Now, the patches proposed in this thread do indeed improve matters but
seem to not have merged up until this day:
~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DANSHUL=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt
Performance counter stats for './sqrt' (10 runs):
598,827,196 cycles:u ( +- 0.18% )
1,697,726,381 instructions:u # 2.84 insn per cycle ( +- 0.00% )
0.159880738 seconds time elapsed ( +- 0.36% )
~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DLINUS=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt
Performance counter stats for './sqrt' (10 runs):
601,463,184 cycles:u ( +- 0.07% )
1,380,095,976 instructions:u # 2.29 insn per cycle ( +- 0.00% )
0.160788095 seconds time elapsed ( +- 0.55% )
Which basically bring us back to the old performance. So I would
strongly suggest we merge one.
Now, the thing is, Intel CPUs are actually fairly good at fls(), so I
gave Joe's suggestions a spin too..
~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt
Performance counter stats for './sqrt' (10 runs):
390,420,004 cycles:u ( +- 0.07% )
1,141,802,493 instructions:u # 2.92 insn per cycle ( +- 0.00% )
0.105480000 seconds time elapsed ( +- 0.64% )
~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1 -DANSHUL=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt
Performance counter stats for './sqrt' (10 runs):
328,415,775 cycles:u ( +- 0.15% )
1,138,579,704 instructions:u # 3.47 insn per cycle ( +- 0.00% )
0.088703205 seconds time elapsed
Which gets us a real improvement...
Now even with the software fls() fallback from
include/asm-generic/bitops/fls.h there is a significant improvement:
~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1 -DANSHUL=1 -DSOFTFLS=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt
Performance counter stats for './sqrt' (10 runs):
375,633,931 cycles:u ( +- 0.13% )
1,289,700,013 instructions:u # 3.43 insn per cycle ( +- 0.00% )
0.100596211 seconds time elapsed
Also, I ran with -DVALIDATE=1 and while the comment says its a 'rough'
estimate of the sqrt() I did not in fact find any value for which we
deviate < INT_MAX.
----------------------[ sqrt.c ]---------------------------
#include <math.h>
#include <stdio.h>
#define BITS_PER_LONG (sizeof(long) * 8)
#ifndef LOOPS
#define LOOPS 1000000
#endif
#ifdef SOFTFLS
static __always_inline unsigned long fls(unsigned long x)
{
int r = 32;
if (!x)
return 0;
if (!(x & 0xffff0000u)) {
x <<= 16;
r -= 16;
}
if (!(x & 0xff000000u)) {
x <<= 8;
r -= 8;
}
if (!(x & 0xf0000000u)) {
x <<= 4;
r -= 4;
}
if (!(x & 0xc0000000u)) {
x <<= 2;
r -= 2;
}
if (!(x & 0x80000000u)) {
x <<= 1;
r -= 1;
}
return r;
}
#else
static __always_inline unsigned long fls(unsigned long word)
{
asm("rep; bsr %1,%0"
: "=r" (word)
: "rm" (word));
return BITS_PER_LONG - 1 - word;
}
#endif
#ifdef NEW
unsigned long __attribute__((noinline)) int_sqrt(unsigned long x)
{
unsigned long b, m, y = 0;
if (x <= 1)
return x;
#ifdef LINUS
m = 1UL << (BITS_PER_LONG/2 - 2);
if (m < x)
m <<= BITS_PER_LONG/2;
#else
#ifdef FLS
m = 1UL << ((fls(x) + 1) & ~1UL);
#else
m = 1UL << (BITS_PER_LONG - 2);
#endif
#ifdef ANSHUL
while (m > x)
m >>= 2;
#endif
#endif
while (m != 0) {
b = y + m;
y >>= 1;
if (x >= b) {
x -= b;
y += m;
}
m >>= 2;
}
return y;
}
#else
unsigned long __attribute__((noinline)) int_sqrt(unsigned long x)
{
unsigned long op, res, one;
op = x;
res = 0;
one = 1UL << (BITS_PER_LONG - 2);
while (one > op)
one >>= 2;
while (one != 0) {
if (op >= res + one) {
op = op - (res + one);
res = res + 2 * one;
}
res /= 2;
one /= 4;
}
return res;
}
#endif
void main(void)
{
unsigned long i;
for (i=0; i<LOOPS; i++) {
unsigned long a = int_sqrt(i);
asm volatile("" : : "r" (a));
#ifdef VALIDATE
{
unsigned long b = floor(sqrt(i));
if (a != b)
printf("%ld %ld %ld\n", i, a, b);
}
#endif
}
}
--------------
The only caveat seems to be that our fls() is only defined for 'int' not
long :/
---
lib/int_sqrt.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c
index 1ef4cc344977..87c3aa360441 100644
--- a/lib/int_sqrt.c
+++ b/lib/int_sqrt.c
@@ -7,12 +7,11 @@
#include <linux/kernel.h>
#include <linux/export.h>
+#include <linux/bitops.h>
/**
- * int_sqrt - rough approximation to sqrt
+ * int_sqrt - approximation to sqrt
* @x: integer of which to calculate the sqrt
- *
- * A very rough approximation to the sqrt() function.
*/
unsigned long int_sqrt(unsigned long x)
{
@@ -21,7 +20,11 @@ unsigned long int_sqrt(unsigned long x)
if (x <= 1)
return x;
- m = 1UL << (BITS_PER_LONG - 2);
+ m = 1UL << ((fls(x) + 1) & ~1UL);
+
+ while (m > x)
+ m >>= 2;
+
while (m != 0) {
b = y + m;
y >>= 1;
Powered by blists - more mailing lists