Message-ID: <20200123130541.GA30620@redhat.com>
Date: Thu, 23 Jan 2020 14:05:41 +0100
From: Oleg Nesterov <oleg@...hat.com>
To: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>
Cc: Andrew Fox <afox@...hat.com>,
Stephen Johnston <sjohnsto@...hat.com>,
linux-kernel@...r.kernel.org,
Stanislaw Gruszka <sgruszka@...hat.com>
Subject: Re: [PATCH] sched/cputime: make scale_stime() more precise
On 01/22, Oleg Nesterov wrote:
>
> But there is another reason why I think it makes more sense. It is also
> faster on x86-64, much faster when the numbers are big. See the naive
> test code below. For example,
>
> $ ./test 553407856289849 18446744066259977121 1660223568869547
> 553407856289849 * 18446744066259977121 / 1660223568869547 =
> (128) 6148914688753325707
> (asm) 6148914688753325707
> (new) 6148914691236512239
> (old) 9067034312525142184
>
> ticks:
> asm: 7183908591
> new: 4891383871
> old: 23585547775
Just for completeness, see the updated code below, which can also be compiled with -m32.
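(Assuming the listing below is saved as test.c, something like "gcc -O2 -o test test.c" should build the 64-bit version and "gcc -O2 -m32 -o test test.c" the 32-bit one.)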
As expected, my version is slower on 32-bit when the numbers are small,
$ ./test 1 3 2
1 * 3 / 2 =
(new) 1
(old) 1
ticks:
new: 3624344961
old: 2514403456
But still faster when rtime is big enough:
$ ./test 1 68719476736 2
1 * 68719476736 / 2 =
(new) 34359738368
(old) 34359738368
ticks:
new: 5044284834
old: 5347969883
$ ./test 553407856289849 18446744066259977121 1660223568869547
553407856289849 * 18446744066259977121 / 1660223568869547 =
(new) 6148914691236512239
(old) 9067034312525142184
ticks:
new: 11496181242
old: 33622910386
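(For comparison, against the exact 128-bit result quoted above, 6148914688753325707, the new version is off by about 2.5e9, roughly 4e-10 relative error, while the old one is off by about 2.9e18, roughly 47%.)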
Oleg.
------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#define noinline __attribute__((__noinline__))

typedef unsigned long long u64;
typedef unsigned int u32;
#ifdef __x86_64__
typedef unsigned __int128 u128;

/* Reference: mulq computes rdx:rax = a * b, divq divides rdx:rax by c. */
u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
{
	u64 q;

	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");
	return q;
}
static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}

static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}

static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	return dividend / divisor;
}

static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}
static inline int fls64(u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));
	return bitpos + 1;
}
#else // 32-bit
static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	union {
		u64 v64;
		u32 v32[2];
	} d = { dividend };
	u32 upper;

	upper = d.v32[1];
	d.v32[1] = 0;
	if (upper >= divisor) {
		d.v32[1] = upper / divisor;
		upper %= divisor;
	}
	asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
		"rm" (divisor), "0" (d.v32[0]), "1" (upper));
	return d.v64;
}

static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}
static inline int fls(unsigned int x)
{
	int r;

	asm("bsrl %1,%0\n\t"
	    "cmovzl %2,%0"
	    : "=&r" (r) : "rm" (x), "rm" (-1));
	return r + 1;
}

static inline int fls64(u64 x)
{
	u32 h = x >> 32;
	if (h)
		return fls(h) + 32;
	return fls(x);
}
u64 noinline div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		u32 rem32;
		quot = div_u64_rem(dividend, divisor, &rem32);
		*remainder = rem32;
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;

		*remainder = dividend - quot * divisor;
		if (*remainder >= divisor) {
			quot++;
			*remainder -= divisor;
		}
	}

	return quot;
}
u64 noinline div64_u64(u64 dividend, u64 divisor)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		quot = div_u64(dividend, divisor);
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;
		if ((dividend - quot * divisor) >= divisor)
			quot++;
	}

	return quot;
}
#endif
static inline int ilog2(u64 n)
{
	return fls64(n) - 1;
}

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return scaled;
}
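
/*
 * stime * rtime / total without dropping bits up front: if the product
 * could overflow 64 bits, split rtime as div * total + rem, so that
 * stime * rtime / total == div * stime + stime * rem / total; if
 * stime * rem can still overflow, shift rem and total down until it
 * cannot.
 */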
u64 new_scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	if (ilog2(stime) + ilog2(rtime) > 62) {
		div = div64_u64_rem(rtime, total, &rem);
		res = div * stime;
		rtime = rem;

		int shift = ilog2(stime) + ilog2(rtime) - 62;
		if (shift > 0) {
			rtime >>= shift;
			total >>= shift;
			if (!total)
				return res;
		}
	}

	return res + div64_u64(stime * rtime, total);
}
static inline u64 rdtsc(void)
{
	unsigned low, high;

	asm volatile("rdtsc" : "=a" (low), "=d" (high));
	return ((low) | ((u64)(high) << 32));
}
u64 S, R, T;
u64 noinline profile(u64 (*f)(u64,u64,u64))
{
//	u64 s = S, r = R, t = T;
	u64 tsc1, tsc2;
	int i;

	tsc1 = rdtsc();
	for (i = 0; i < 100*1000*1000; ++i)
//		f(s++, r++, t++);
		f(S,R,T);
	tsc2 = rdtsc();

	return tsc2 - tsc1;
}
int main(int argc, char **argv)
{
	if (argc != 4) {
		printf("usage: %s stime rtime total\n", argv[0]);
		return 1;
	}

	S = strtoull(argv[1], NULL, 0);
	R = strtoull(argv[2], NULL, 0);
	T = strtoull(argv[3], NULL, 0);

	assert(S < T);
	assert(T < R);

	if (1) {
		printf("%llu * %llu / %llu =\n", S,R,T);
#ifdef __x86_64__
		printf("\t(128) %lld\n", (u64)( ((u128)S)*((u128)R)/((u128)T) ));
		printf("\t(asm) %lld\n", mul_u64_u64_div_u64(S,R,T));
#endif
		printf("\t(new) %lld\n", new_scale_stime(S,R,T));
		printf("\t(old) %lld\n", scale_stime(S,R,T));
		printf("\n");
	}

	printf("ticks:\n");
#ifdef __x86_64__
	printf("\tasm: %lld\n", profile(mul_u64_u64_div_u64));
#endif
	printf("\tnew: %lld\n", profile(new_scale_stime));
	printf("\told: %lld\n", profile(scale_stime));

	return 0;
}