Message-ID: <20200122164612.GA19818@redhat.com>
Date: Wed, 22 Jan 2020 17:46:12 +0100
From: Oleg Nesterov <oleg@...hat.com>
To: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>
Cc: Andrew Fox <afox@...hat.com>,
Stephen Johnston <sjohnsto@...hat.com>,
linux-kernel@...r.kernel.org,
Stanislaw Gruszka <sgruszka@...hat.com>
Subject: Re: [PATCH] sched/cputime: make scale_stime() more precise

Hi Peter, et al.

On 07/18, Oleg Nesterov wrote:
>
> People report that utime and stime from /proc/<pid>/stat become very wrong
> when the numbers are big enough. In particular, the monitored application
> can run all the time in user-space but only stime grows.

Let me revive this discussion and try again to convince you that
scale_stime() needs changes and this patch makes sense.

As a reminder, scale_stime(stime, rtime, total) is not precise, to say the
least. For example:

	stime = -1ul/33333; total = stime*3; rtime = total*5555555555;

scale_stime() returns 9067034312525142184 while the correct result is
6148914688753325707 (since total == stime*3, the exact result is simply
rtime/3).

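If you want to double-check that value, 128-bit arithmetic is enough; a
quick standalone check (the same computation as the "(128)" line in the
test output below, using the 64-bit values these expressions evaluate to):

/* Exact stime * rtime / total via gcc's unsigned __int128.
 * Constants are the 64-bit values from the example above. */
#include <stdio.h>

int main(void)
{
        unsigned long long stime = 553407856289849ULL;       /* -1ul/33333 */
        unsigned long long total = 1660223568869547ULL;      /* stime*3 */
        unsigned long long rtime = 18446744066259977121ULL;  /* total*5555555555, truncated to 64 bits */

        unsigned __int128 exact = (unsigned __int128)stime * rtime / total;
        printf("%llu\n", (unsigned long long)exact);          /* 6148914688753325707 */
        return 0;
}
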
OK, these random numbers are not realistic; usually the relative error
is small enough.

However, even if the relative error is small, the absolute error can be
huge. And this means that if you watch /proc/$pid/stat incrementally
to see how stime/utime grow, you can get completely wrong numbers.

Say, utime (or stime) can be frozen for an unpredictably long time, as if
the monitored application "hangs" in kernel mode, while the real split
is 50/50. The test-case from the changelog tries to demonstrate this,
see https://lore.kernel.org/lkml/20190718131834.GA22211@redhat.com/

Yes, yes, there are no 'real' stime and utime numbers. But still, why
can't we improve scale_stime()?

-------------------------------------------------------------------------------

Let's look at the simplified (but less accurate) version I mentioned in
https://lore.kernel.org/lkml/20190718145549.GA22902@redhat.com

u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	/* Can stime * rtime overflow 64 bits? */
	if (ilog2(stime) + ilog2(rtime) > 62) {
		/* rtime = div * total + rem; fold the divisible part first */
		div = div64_u64_rem(rtime, total, &rem);
		res = div * stime;
		rtime = rem;

		int shift = ilog2(stime) + ilog2(rtime) - 62;
		if (shift > 0) {
			/* still too big, drop precision from the remainder */
			rtime >>= shift;
			total >>= shift;
			if (!total)
				return res;
		}
	}

	return res + div64_u64(stime * rtime, total);
}

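To spell out why this works: div64_u64_rem() splits rtime as
div * total + rem, and

	stime * rtime / total == stime * div + stime * rem / total

holds exactly, so the shift only ever drops bits from the second, much
smaller term. A standalone check of that identity with the numbers from
the example above:

/* Check the decomposition the simplified scale_stime() relies on,
 * using 128-bit arithmetic so nothing overflows. */
#include <stdio.h>

int main(void)
{
        unsigned long long stime = 553407856289849ULL;
        unsigned long long rtime = 18446744066259977121ULL;
        unsigned long long total = 1660223568869547ULL;
        unsigned long long div = rtime / total, rem = rtime % total;

        unsigned __int128 lhs = (unsigned __int128)stime * rtime / total;
        unsigned __int128 rhs = (unsigned __int128)stime * div +
                                (unsigned __int128)stime * rem / total;

        /* both print 6148914688753325707 */
        printf("%llu == %llu\n", (unsigned long long)lhs, (unsigned long long)rhs);
        return 0;
}
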
It is much more accurate than the current version; in fact, I think it
should be 100% accurate "in practice".

But there is another reason why I think it makes more sense: it is also
faster on x86-64, and much faster when the numbers are big. See the naive
test code below. For example,

$ ./test 553407856289849 18446744066259977121 1660223568869547
553407856289849 * 18446744066259977121 / 1660223568869547 =
        (128) 6148914688753325707
        (asm) 6148914688753325707
        (new) 6148914691236512239
        (old) 9067034312525142184

ticks:
        asm: 7183908591
        new: 4891383871
        old: 23585547775

(I am surprised it is faster than the asm mul_u64_u64_div_u64() version
on my machine; I don't understand why divq is so slow when rdx != 0.)

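(For scale: profile() makes 100*1000*1000 calls, so the ticks above work
out to roughly 72, 49 and 236 TSC ticks per call for asm/new/old, loop
overhead included.)
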
I am going to send V2; I need to rewrite the changelog and comments.
But if you still think this isn't worth fixing, please tell me right
now.

What do you think?

Oleg.

-------------------------------------------------------------------------------

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#define noinline __attribute__((__noinline__))

typedef unsigned __int128 u128;
typedef unsigned long long u64;
typedef unsigned int u32;

/* a * b / c with a 128-bit intermediate; divq faults if the quotient
 * does not fit in 64 bits */
u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
{
	u64 q;

	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");
	return q;
}

static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}

static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}

static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	return dividend / divisor;
}

static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static inline int fls64(u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));
	return bitpos + 1;
}

static inline int ilog2(u64 n)
{
	return fls64(n) - 1;
}

#define swap(a, b) \
do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return scaled;
}

u64 new_scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	if (ilog2(stime) + ilog2(rtime) > 62) {
		div = div64_u64_rem(rtime, total, &rem);
		res = div * stime;
		rtime = rem;

		int shift = ilog2(stime) + ilog2(rtime) - 62;
		if (shift > 0) {
			rtime >>= shift;
			total >>= shift;
			if (!total)
				return res;
		}
	}

	return res + div64_u64(stime * rtime, total);
}

static inline u64 rdtsc(void)
{
	unsigned low, high;

	asm volatile("rdtsc" : "=a" (low), "=d" (high));
	return ((low) | ((u64)(high) << 32));
}

u64 S, R, T;

u64 noinline profile(u64 (*f)(u64,u64,u64))
{
//	u64 s = S, r = R, t = T;
	u64 tsc1, tsc2;
	int i;

	tsc1 = rdtsc();
	for (i = 0; i < 100*1000*1000; ++i)
//		f(s++, r++, t++);
		f(S,R,T);
	tsc2 = rdtsc();

	return tsc2 - tsc1;
}

int main(int argc, char **argv)
{
	if (argc != 4) {
		printf("usage: %s stime rtime total\n", argv[0]);
		return 1;
	}

	S = strtoull(argv[1], NULL, 0);
	R = strtoull(argv[2], NULL, 0);
	T = strtoull(argv[3], NULL, 0);
	assert(S < T);
	assert(T < R);

	if (1) {
		printf("%llu * %llu / %llu =\n", S,R,T);
		printf("\t(128) %lld\n", (u64)( ((u128)S)*((u128)R)/((u128)T) ));
		printf("\t(asm) %lld\n", mul_u64_u64_div_u64(S,R,T));
		printf("\t(new) %lld\n", new_scale_stime(S,R,T));
		printf("\t(old) %lld\n", scale_stime(S,R,T));
		printf("\n");
	}

	printf("ticks:\n");
	printf("\tasm: %lld\n", profile(mul_u64_u64_div_u64));
	printf("\tnew: %lld\n", profile(new_scale_stime));
	printf("\told: %lld\n", profile(scale_stime));

	return 0;
}
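
In case anyone wants to reproduce the numbers: the test is plain C plus
x86-64 inline asm, so something along the lines of

	$ gcc -O2 -o test test.c
	$ ./test 553407856289849 18446744066259977121 1660223568869547

should do (the file name and -O2 are arbitrary; any reasonably recent gcc
on x86-64 should work).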