Message-ID: <20190719110349.GG3419@hirez.programming.kicks-ass.net>
Date: Fri, 19 Jul 2019 13:03:49 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Oleg Nesterov <oleg@...hat.com>
Cc: Ingo Molnar <mingo@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Andrew Fox <afox@...hat.com>,
Stephen Johnston <sjohnsto@...hat.com>,
linux-kernel@...r.kernel.org,
Linus Torvalds <torvalds@...ux-foundation.org>,
Stanislaw Gruszka <sgruszka@...hat.com>
Subject: Re: [PATCH] sched/cputime: make scale_stime() more precise
On Thu, Jul 18, 2019 at 03:18:34PM +0200, Oleg Nesterov wrote:
> People report that utime and stime from /proc/<pid>/stat become very wrong
> when the numbers are big enough. In particular, the monitored application
> can run all the time in user-space but only stime grows.
>
> This is because scale_stime() is very inaccurate. It tries to minimize the
> relative error, but the absolute error can be huge.
>
> Andrew wrote the test-case:
>
> int main(int argc, char **argv)
> {
> 	struct task_cputime c;
> 	struct prev_cputime p;
> 	u64 st, pst, cst;
> 	u64 ut, put, cut;
> 	u64 x;
> 	int i = -1; // one step not printed
>
> 	if (argc != 2)
> 	{
> 		printf("usage: %s <start_in_seconds>\n", argv[0]);
> 		return 1;
> 	}
> 	x = strtoull(argv[1], NULL, 0) * SEC;
> 	printf("start=%lld\n", x);
>
> 	p.stime = 0;
> 	p.utime = 0;
>
> 	while (i++ < NSTEPS)
> 	{
> 		x += STEP;
> 		c.stime = x;
> 		c.utime = x;
> 		c.sum_exec_runtime = x + x;
> 		pst = cputime_to_clock_t(p.stime);
> 		put = cputime_to_clock_t(p.utime);
> 		cputime_adjust(&c, &p, &ut, &st);
> 		cst = cputime_to_clock_t(st);
> 		cut = cputime_to_clock_t(ut);
> 		if (i)
> 			printf("ut(diff)/st(diff): %20lld (%4lld) %20lld (%4lld)\n",
> 				cut, cut - put, cst, cst - pst);
> 	}
> }
>
> For example,
>
> $ ./stime 300000
> start=300000000000000
> ut(diff)/st(diff):            299994875 (   0)            300009124 (2000)
> ut(diff)/st(diff):            299994875 (   0)            300011124 (2000)
> ut(diff)/st(diff):            299994875 (   0)            300013124 (2000)
> ut(diff)/st(diff):            299994875 (   0)            300015124 (2000)
> ut(diff)/st(diff):            299994875 (   0)            300017124 (2000)
> ut(diff)/st(diff):            299994875 (   0)            300019124 (2000)
> ut(diff)/st(diff):            299994875 (   0)            300021124 (2000)
> ut(diff)/st(diff):            299994875 (   0)            300023124 (2000)
> ut(diff)/st(diff):            299994875 (   0)            300025124 (2000)
> ut(diff)/st(diff):            299994875 (   0)            300027124 (2000)
> ut(diff)/st(diff):            299994875 (   0)            300029124 (2000)
> ut(diff)/st(diff):            299996875 (2000)            300029124 (   0)
> ut(diff)/st(diff):            299998875 (2000)            300029124 (   0)
> ut(diff)/st(diff):            300000875 (2000)            300029124 (   0)
> ut(diff)/st(diff):            300002875 (2000)            300029124 (   0)
> ut(diff)/st(diff):            300004875 (2000)            300029124 (   0)
> ut(diff)/st(diff):            300006875 (2000)            300029124 (   0)
> ut(diff)/st(diff):            300008875 (2000)            300029124 (   0)
> ut(diff)/st(diff):            300010875 (2000)            300029124 (   0)
> ut(diff)/st(diff):            300012055 (1180)            300029944 ( 820)
> ut(diff)/st(diff):            300012055 (   0)            300031944 (2000)
> ut(diff)/st(diff):            300012055 (   0)            300033944 (2000)
> ut(diff)/st(diff):            300012055 (   0)            300035944 (2000)
> ut(diff)/st(diff):            300012055 (   0)            300037944 (2000)
>
> shows the problem even when sum_exec_runtime is not that big: 300000 secs.
>
> The new implementation of scale_stime() does the additional div64_u64_rem()
> in a loop, but, see the comment: as long as it is used by cputime_adjust(),
> this can happen only once.

That only shows something after long long staring :/ There are no words
on what the output actually means or what would've been expected.

Also, your example is incomplete; included below is a test for
scale_stime() itself. From it we can see that the division results in
too large a number, but, importantly for our use-case in
cputime_adjust(), it is a step function (due to the loss in precision),
and for every plateau we shift runtime into the wrong bucket.
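
To make that concrete, here is a toy sketch (made-up numbers, not the
kernel code): drop the low bits of both dividend and divisor before
dividing, and the quotient comes out too large and only moves in steps:

	#include <stdio.h>

	typedef unsigned long long u64;

	int main(void)
	{
		u64 b = 333, a;

		/* dropping the low 4 bits of both a and b biases the
		 * quotient upward and turns it into a step function of a */
		for (a = 1000000; a <= 1001120; a += 160)
			printf("a=%llu exact=%llu truncated=%llu\n",
			       a, a / b, (a >> 4) / (b >> 4));
		return 0;
	}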

Your proposed function works, but is atrocious, esp. on 32bit. That
said, before we 'fixed' it, it had similarly horrible divisions in it;
see commit 55eaa7c1f511 ("sched: Avoid cputime scaling overflow").

Included below is also an x86_64 implementation in 2 instructions.

I'm still trying to see if there's anything saner we can do...
---
#include <stdio.h>
#include <stdlib.h>

#define noinline __attribute__((__noinline__))

typedef unsigned long long u64;
typedef unsigned int u32;

/*
 * x86_64: mulq computes the full 128bit product in rdx:rax; divq then
 * divides that 128bit value by c, leaving the quotient in rax. Note
 * that divq #DEs when the quotient does not fit in 64bit.
 */
static noinline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
{
	u64 q;

	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");

	return q;
}
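
/*
 * A portable reference for the asm above (my addition, not part of the
 * original test), assuming the compiler provides unsigned __int128, as
 * GCC/Clang on 64bit targets do. Unlike divq, this won't #DE when the
 * quotient exceeds 64bit; it silently truncates instead. Unused unless
 * swapped in for the asm version in main().
 */
static noinline u64 mul_u64_u64_div_u64_ref(u64 a, u64 b, u64 c)
{
	return (unsigned __int128)a * b / c;
}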

static u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder);

static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static __always_inline int fls(unsigned int x)
{
	return x ? sizeof(x) * 8 - __builtin_clz(x) : 0;
}

#if 0 /* kernel-style division helpers, a la 32bit; disabled in favour
	 of the native 64bit division in the #else branch */

static u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	union {
		u64 v64;
		u32 v32[2];
	} d = { dividend };
	u32 upper;

	upper = d.v32[1];
	d.v32[1] = 0;
	if (upper >= divisor) {
		d.v32[1] = upper / divisor;
		upper %= divisor;
	}
	asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
		"rm" (divisor), "0" (d.v32[0]), "1" (upper));
	return d.v64;
}

static u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		u32 rem32;
		quot = div_u64_rem(dividend, divisor, &rem32);
		*remainder = rem32;
	} else {
		/* divide with both values shifted down, then correct
		 * the at-most-one-off quotient estimate */
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;

		*remainder = dividend - quot * divisor;
		if (*remainder >= divisor) {
			quot++;
			*remainder -= divisor;
		}
	}

	return quot;
}

static u64 div64_u64(u64 dividend, u64 divisor)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		quot = div_u64(dividend, divisor);
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;
		if ((dividend - quot * divisor) >= divisor)
			quot++;
	}

	return quot;
}

#else /* native 64bit division */

static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}

static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}

static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	return dividend / divisor;
}

#endif

static __always_inline int fls64(u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));

	return bitpos + 1;
}

static inline int ilog2(u64 n)
{
	return fls64(n) - 1;
}

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

static noinline u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((stime * rtime), total);
	return scaled;
}

static noinline u64 oleg(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	/* can stime * rtime overflow ? */
	while (ilog2(stime) + ilog2(rtime) > 62) {
		if (stime > rtime)
			swap(rtime, stime);

		if (rtime >= total) {
			/*
			 * (rtime * stime) / total is equal to
			 *
			 *	(rtime / total) * stime +
			 *	(rtime % total) * stime / total
			 *
			 * if nothing overflows. Can the 1st multiplication
			 * overflow? Yes, but we do not care: this can only
			 * happen if the end result can't fit in u64 anyway.
			 *
			 * So the code below does
			 *
			 *	res += (rtime / total) * stime;
			 *	rtime = rtime % total;
			 */
			div = div64_u64_rem(rtime, total, &rem);
			res += div * stime;
			rtime = rem;
			continue;
		}

		/* drop precision */
		rtime >>= 1;
		total >>= 1;
		if (!total)
			return res;
	}

	return res + div64_u64(stime * rtime, total);
}
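
/*
 * Sanity check of the decomposition above, with made-up small numbers
 * (rtime=23, total=7, stime=10): 23*10/7 = 230/7 = 32, and indeed
 * (23/7)*10 + ((23%7)*10)/7 = 3*10 + 20/7 = 30 + 2 = 32.
 */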

#define SEC 1000000000ULL

int main(int argc, char **argv)
{
	u64 x;
	int i;

	if (argc != 2) {
		printf("usage: %s <start_in_seconds>\n", argv[0]);
		return 1;
	}

	x = strtoull(argv[1], NULL, 0) * SEC;
	printf("start=%llu\n", x);

	for (i = 0; i < 50; i++, x += 2000) {
		printf("%llu = %llu * %llu / %llu\n", mul_u64_u64_div_u64(x, x+x, x+x), x, x+x, x+x);
		printf("%llu = %llu * %llu / %llu\n", scale_stime(x, x+x, x+x), x, x+x, x+x);
		printf("%llu = %llu * %llu / %llu\n", oleg(x, x+x, x+x), x, x+x, x+x);
		printf("---\n");
	}

	return 0;
}
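
FWIW, assuming x86_64 (the asm bits require it) and a file name of
stime-test.c (made up), this should build and run with something like:

	$ gcc -O2 -o stime-test stime-test.c
	$ ./stime-test 300000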