Message-ID: <adjv6ypwuvlugbx3uqd6q4r5hyj3v3gx4bs6cwcvj6h7llbo6v@jr4p35pkkzyx>
Date: Thu, 4 Jul 2024 19:26:45 +0200
From: Uwe Kleine-König <u.kleine-koenig@...libre.com>
To: Nicolas Pitre <nico@...xnic.net>
Cc: Andrew Morton <akpm@...ux-foundation.org>, 
	Nicolas Pitre <npitre@...libre.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 1/2] mul_u64_u64_div_u64: make it precise always

Hello Nico,

On Tue, Jul 02, 2024 at 11:34:08PM -0400, Nicolas Pitre wrote:
> diff --git a/lib/math/div64.c b/lib/math/div64.c
> index 191761b1b6..dd461b3973 100644
> --- a/lib/math/div64.c
> +++ b/lib/math/div64.c
> @@ -186,55 +186,92 @@ EXPORT_SYMBOL(iter_div_u64_rem);
>  #ifndef mul_u64_u64_div_u64
>  u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
>  {
> -	u64 res = 0, div, rem;
> -	int shift;
> +	if (ilog2(a) + ilog2(b) <= 62)
> +		return div64_u64(a * b, c);
>  
> -	/* can a * b overflow ? */
> -	if (ilog2(a) + ilog2(b) > 62) {
> -		/*
> -		 * Note that the algorithm after the if block below might lose
> -		 * some precision and the result is more exact for b > a. So
> -		 * exchange a and b if a is bigger than b.
> -		 *
> -		 * For example with a = 43980465100800, b = 100000000, c = 1000000000
> -		 * the below calculation doesn't modify b at all because div == 0
> -		 * and then shift becomes 45 + 26 - 62 = 9 and so the result
> -		 * becomes 4398035251080. However with a and b swapped the exact
> -		 * result is calculated (i.e. 4398046510080).
> -		 */
> -		if (a > b)
> -			swap(a, b);
> +#if defined(__SIZEOF_INT128__)
> +
> +	/* native 64x64=128 bits multiplication */
> +	u128 prod = (u128)a * b;
> +	u64 n_lo = prod, n_hi = prod >> 64;
> +
> +#else
> +
> +	/* perform a 64x64=128 bits multiplication manually */
> +	union {
> +		u64 v;
> +		struct {
> +#if defined(CONFIG_CPU_LITTLE_ENDIAN)
> +			u32 l;
> +			u32 h;
> +#elif defined(CONFIG_CPU_BIG_ENDIAN)
> +			u32 h;
> +			u32 l;
> +#else
> +#error "unknown endianness"
> +#endif
> +		};
> +	} A, B, X, Y, Z;
> +
> +	A.v = a;
> +	B.v = b;
> +
> +	X.v = (u64)A.l * B.l;
> +	Y.v = (u64)A.l * B.h + X.h;
> +	Z.v = (u64)A.h * B.h + Y.h;
> +	Y.v = (u64)A.h * B.l + Y.l;
> +	X.h = Y.l;
> +	Z.v += Y.h;
> +
> +	u64 n_lo = X.v, n_hi = Z.v;

I tried to understand your patch. This part could really benefit from
some comments. With pen and paper I worked out your idea:

	The goal is:

		A * B == (Z << 64) + X

	With A = (A.h << 32) + A.l and similar identities for B, we have:

	A * B = ((A.h << 32) + A.l) * ((B.h << 32) + B.l)
	      = ((A.h * B.h) << 64) + ((A.h * B.l + A.l * B.h) << 32) + A.l * B.l

	The operations done here are only 32-bit multiplications and
	additions, and with U32_MAX = 0xffffffff we have:
	U32_MAX * U32_MAX + U32_MAX = (U32_MAX + 1) * U32_MAX =
	0xffffffff00000000, which fits into a u64.  Even when adding
	another U32_MAX (which happens with Z.v += Y.h) the result still
	fits into a u64, so none of the intermediate additions overflow.
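
Spelled out as a standalone userspace function (all names below are
mine, not from your patch), the multiplication part would look roughly
like this:

	#include <stdint.h>

	/*
	 * 64x64 -> 128 bit multiply built only from 32x32 -> 64 bit
	 * multiplications; *hi:*lo is the 128 bit product of a and b.
	 */
	void mul_64x64_128(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
	{
		uint32_t a_lo = a, a_hi = a >> 32;
		uint32_t b_lo = b, b_hi = b >> 32;

		uint64_t x = (uint64_t)a_lo * b_lo;              /* bits 0..63 */
		uint64_t y = (uint64_t)a_lo * b_hi + (x >> 32);  /* bits 32..95 */
		uint64_t z = (uint64_t)a_hi * b_hi + (y >> 32);  /* bits 64..127 */

		y = (uint64_t)a_hi * b_lo + (uint32_t)y;         /* bits 32..95 */
		z += y >> 32;          /* cannot overflow, see the argument above */

		*lo = ((uint64_t)(uint32_t)y << 32) | (uint32_t)x;
		*hi = z;
	}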

> +
> +#endif

So starting here we have

	(n_hi << 64) + n_lo == a * b

> +	int shift = __builtin_ctzll(c);
> +
> +	/* try reducing the fraction in case the dividend becomes <= 64 bits */
> +	if ((n_hi >> shift) == 0) {

The idea here is: with shift = ctz(c) we can write c == c_ << shift
(c_ odd), and so

	a * b / c == ((a * b) >> shift) / c_

In this if-body we handle the case where (a * b) >> shift fits into a u64.
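
A concrete example to make that tangible (the numbers are mine): with
a = b = 2^33 and c = 48 == 3 << 4 we get n_hi = 4, n_lo = 0 and
shift = 4, so n_hi >> shift == 0. Then

	n = (n_lo >> 4) | (n_hi << 60) == 1 << 62

and div64_u64(1 << 62, 3) is exactly (a * b) / c, i.e. no precision is
lost.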

> +		u64 n = (n_lo >> shift) | (n_hi << (64 - shift));
> +
> +		return div64_u64(n, c >> shift);
>  		/*
> -		 * (b * a) / c is equal to
> -		 *
> -		 *      (b / c) * a +
> -		 *      (b % c) * a / c
> -		 *
> -		 * if nothing overflows. Can the 1st multiplication
> -		 * overflow? Yes, but we do not care: this can only
> -		 * happen if the end result can't fit in u64 anyway.
> -		 *
> -		 * So the code below does
> -		 *
> -		 *      res = (b / c) * a;
> -		 *      b = b % c;
> +		 * The remainder value if needed would be:
> +		 *   res = div64_u64_rem(n, c >> shift, &rem);
> +		 *   rem = (rem << shift) + (n_lo - (n << shift));
>  		 */
> -		div = div64_u64_rem(b, c, &rem);
> -		res = div * a;
> -		b = rem;
> -
> -		shift = ilog2(a) + ilog2(b) - 62;
> -		if (shift > 0) {
> -			/* drop precision */
> -			b >>= shift;
> -			c >>= shift;
> -			if (!c)
> -				return res;
> -		}
>  	}
>  
> -	return res + div64_u64(a * b, c);
> +	if (n_hi >= c) {
> +		/* overflow: result is unrepresentable in a u64 */
> +		return -1;
> +	}
> +
> +	/* Do the full 128 by 64 bits division */

This is the point where I stop understanding your code :-)

Maybe stating the loop invariant in a comment would be helpful?
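
My attempt at one (please double-check): writing c_orig for the
divisor as passed in (i.e. before the c <<= shift normalization), the
loop seems to maintain at the top of every round

	n_hi:n_lo == (a * b - res * c_orig) << (64 + clz(c_orig) - p)

together with n_hi < c. Each subtraction then removes c_orig * 2^p
from the true remainder while setting bit p in res, i.e. the quotient
bits are produced from most to least significant.

Not a replacement for such a comment, but a throwaway userspace check
along these lines (everything below is mine) at least lets one compare
the loop against the compiler's native 128-by-64 division:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint64_t rng = 0x123456789abcdefULL;

	/* xorshift64, only used to get full-width pseudo-random test values */
	static uint64_t rnd64(void)
	{
		rng ^= rng << 13;
		rng ^= rng >> 7;
		rng ^= rng << 17;
		return rng;
	}

	/*
	 * Standalone copy of the shift-and-subtract loop, valid for
	 * c != 0, n_hi != 0 and n_hi < c (the cases that reach it).
	 */
	static uint64_t div_128_by_64(uint64_t n_hi, uint64_t n_lo, uint64_t c)
	{
		int shift = __builtin_clzll(c);
		int p = 64 + shift;
		uint64_t res = 0;
		int carry;

		c <<= shift;

		do {
			carry = n_hi >> 63;
			shift = carry ? 1 : __builtin_clzll(n_hi);
			if (p < shift)
				break;
			p -= shift;
			n_hi <<= shift;
			n_hi |= n_lo >> (64 - shift);
			n_lo <<= shift;
			if (carry || (n_hi >= c)) {
				n_hi -= c;
				res |= 1ULL << p;
			}
		} while (n_hi);

		return res;
	}

	int main(void)
	{
		for (long i = 0; i < 10000000; i++) {
			uint64_t a = rnd64(), b = rnd64(), c = rnd64();
			unsigned __int128 n = (unsigned __int128)a * b;
			uint64_t n_hi = n >> 64, n_lo = n;

			/* skip inputs the earlier branches already handle */
			if (c == 0 || n_hi == 0 || n_hi >= c)
				continue;

			assert(div_128_by_64(n_hi, n_lo, c) == (uint64_t)(n / c));
		}
		printf("no mismatch found\n");
		return 0;
	}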

> +	shift = __builtin_clzll(c);
> +	c <<= shift;
> +
> +	int p = 64 + shift;
> +	u64 res = 0;
> +	bool carry;
> +
> +	do {
> +		carry = n_hi >> 63;
> +		shift = carry ? 1 : __builtin_clzll(n_hi);
> +		if (p < shift)
> +			break;
> +		p -= shift;
> +		n_hi <<= shift;
> +		n_hi |= n_lo >> (64 - shift);
> +		n_lo <<= shift;
> +		if (carry || (n_hi >= c)) {
> +			n_hi -= c;
> +			res |= 1ULL << p;
> +		}
> +	} while (n_hi);
> +	/* The remainder value if needed would be n_hi << p */

I indeed need a variant of this function that rounds up. So maybe
creating a function

	u64 mul_u64_u64_div_u64_rem(u64 a, u64 b, u64 c, u64 *rem)

with the sophistication of your mul_u64_u64_div_u64 and then:

	u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
	{
		u64 rem, ret;

		ret = mul_u64_u64_div_u64_rem(a, b, c, &rem);
		return ret;
	}

(In the hope that the compiler optimizes out the calculation for the
remainder) and:

	u64 mul_u64_u64_div_u64_roundup(u64 a, u64 b, u64 c)
	{
		u64 rem, ret;

		ret = mul_u64_u64_div_u64_rem(a, b, c, &rem);

		if (rem)
			ret += 1;

		return ret;
	}

would be nice. (This could be done in a follow-up patch though.)
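
For illustration, a caller that converts a duration in nanoseconds to
clock cycles and must not round down could then simply do (names made
up):

	/* hypothetical caller, just to illustrate the round-up need */
	cycles = mul_u64_u64_div_u64_roundup(duration_ns, rate_hz, NSEC_PER_SEC);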

> +	return res;
>  }
>  EXPORT_SYMBOL(mul_u64_u64_div_u64);
>  #endif

Best regards
Uwe
