linux-kernel - Re: [PATCH] sched/cputime: make scale

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20200124154215.GA14714@redhat.com>
Date:   Fri, 24 Jan 2020 16:42:15 +0100
From:   Oleg Nesterov <oleg@...hat.com>
To:     Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Thomas Gleixner <tglx@...utronix.de>
Cc:     Andrew Fox <afox@...hat.com>,
        Stephen Johnston <sjohnsto@...hat.com>,
        linux-kernel@...r.kernel.org,
        Stanislaw Gruszka <sgruszka@...hat.com>
Subject: Re: [PATCH] sched/cputime: make scale_stime() more precise

On 01/22, Oleg Nesterov wrote:
>
> To remind, scale_stime(stime, rtime, total) is not precise, to say at
> least. For example:
>
> 	stime = -1ul/33333; total = stime*3; rtime = total*5555555555;
>
> scale_stime() returns 9067034312525142184 while the correct result is
> 6148914688753325707.
>
> OK, these random numbers are not realistic, usually the relative error
> is small enough.
>
> However, even if the relative error is small, the absolute error can be
> huge. And this means that if you watch /proc/$pid/status incrementally
> to see how stime/utime grow, you can get the completely wrong numbers.
>
> Say, utime (or stime) can be frozen for unpredictably long time, as if
> the monitored application "hangs" in kernel mode, while the real split
> is 50/50.

See another test-case below. Arguments:

	start_time start_utime_percent inc_time inc_utime_percent

For example,

	$ ./test 8640000 50 600 50 | head

simulates process which runs 100 days 50/50 in user/kernel mode, then it
starts to check utime/stime every 600 seconds and print the difference.

The output:

               old               	               new
               0:600000000000    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
    600000000000:0               	    300000000000:300000000000
    499469920248:100530079752    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
    600000000000:0               	    300000000000:300000000000
    499490181588:100509818412    	    300000000000:300000000000


it looks as if this process can spend 20 minutes entirely in kernel mode.

Oleg.

-------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#define   noinline                      __attribute__((__noinline__))

typedef unsigned long long u64;
typedef unsigned int u32;
typedef unsigned __int128 u128;

static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	return dividend / divisor;
}
static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static inline int fls64(u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));
	return bitpos + 1;
}

static inline int ilog2(u64 n)
{
	return fls64(n) - 1;
}

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return scaled;
}

u64 new_scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	if (ilog2(stime) + ilog2(rtime) > 62) {
		div = div64_u64_rem(rtime, total, &rem);
		res = div * stime;
		rtime = rem;

		int shift = ilog2(stime) + ilog2(rtime) - 62;
		if (shift > 0) {
			rtime >>= shift;
			total >>= shift;
			if (!total)
				return res;
		}
	}

	return res + div64_u64(stime * rtime, total);
}

struct task_cputime {
	u64				stime;
	u64				utime;
	unsigned long long		sum_exec_runtime;
};
struct prev_cputime {
	u64				utime;
	u64				stime;
};

void cputime_adjust(int new, struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	u64 rtime, stime, utime;

	rtime = curr->sum_exec_runtime;

	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	if (utime == 0) {
		stime = rtime;
		goto update;
	}

	stime = (new ? new_scale_stime : scale_stime)(stime, rtime, stime + utime);

update:
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
}

void prdiff(int new, struct task_cputime *curr, struct prev_cputime *prev)
{
	struct prev_cputime __prev = *prev;
	u64 ut, st, ud, sd;

	cputime_adjust(new, curr, prev, &ut, &st);
	ud = ut - __prev.utime;
	sd = st - __prev.stime;

	printf("%16llu:%-16llu", ud, sd);
}

#define SEC	1000000000ULL

void parse_cputime(struct task_cputime *t, char **argv)
{
	double total = strtod(argv[0], NULL) * SEC;
	double utime = strtod(argv[1], NULL) / 100;

	utime *= total;
	t->utime = utime;
	t->stime = total - utime;
}

int main(int argc, char **argv)
{
	struct prev_cputime old_prev = {};
	struct prev_cputime new_prev = {};
	struct task_cputime curr, diff;
	u64 tmp;

	if (argc != 5) {
		printf("usage: %s start_time utime_percent inc_time utime_percent\n", argv[0]);
		return 0;
	}

	parse_cputime(&curr, argv+1);
	parse_cputime(&diff, argv+3);

	curr.sum_exec_runtime = curr.utime + curr.stime;
	cputime_adjust(0, &curr, &old_prev, &tmp, &tmp);
	cputime_adjust(1, &curr, &new_prev, &tmp, &tmp);

	printf("%18s%15s\t%18s\n", "old", "", "new");
	for (;;) {
		curr.utime += diff.utime;
		curr.stime += diff.stime;
		curr.sum_exec_runtime = curr.utime + curr.stime;

		prdiff(0, &curr, &old_prev);
		printf("\t");
		prdiff(1, &curr, &new_prev);
		printf("\n");
	}

	return 0;
}