Message-ID: <20250402224652.1bb38f6b@pumpkin>
Date: Wed, 2 Apr 2025 22:46:52 +0100
From: David Laight <david.laight.linux@...il.com>
To: Uwe Kleine-König <u.kleine-koenig@...libre.com>
Cc: Nicolas Pitre <nico@...xnic.net>, Andrew Morton
 <akpm@...ux-foundation.org>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] math64: Provide an uprounding variant of
 mul_u64_u64_div_u64()

On Wed, 2 Apr 2025 17:01:49 +0200
Uwe Kleine-König <u.kleine-koenig@...libre.com> wrote:

How about (tab damaged):

Compile tested only, on x86-64 (once with the local asm definitions removed so
the generic version gets built).

Looking at the object code, if u128 is supported then doing the multiply and
checking n_hi is always going to be better than a pre-check on the operands.
Remember, the multiply is cheap.
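
To spell that out, a minimal userspace sketch of the two checks (illustrative
names, unsigned __int128 assumed available, and __builtin_clzll standing in
for the kernel's ilog2()):

#include <stdint.h>

/* Pre-check: bound the product from the operand widths before multiplying.
 * Conservative: it can reject products that would in fact fit in 64 bits.
 */
static int fits_u64_precheck(uint64_t a, uint64_t b)
{
	int bits_a = 63 - __builtin_clzll(a | 1);	/* ~ ilog2(a) */
	int bits_b = 63 - __builtin_clzll(b | 1);	/* ~ ilog2(b) */

	return bits_a + bits_b <= 62;
}

/* Post-check: do the (cheap) widening multiply first, then test the high
 * half of the 128-bit product.  Exact: n_hi == 0 iff the result fits.
 */
static int fits_u64_postcheck(uint64_t a, uint64_t b)
{
	unsigned __int128 prod = (unsigned __int128)a * b;

	return (uint64_t)(prod >> 64) == 0;
}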

	David

diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9931e4c7d73f..6115f3fcb975 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -84,21 +84,28 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
  * Will generate an #DE when the result doesn't fit u64, could fix with an
  * __ex_table[] entry when it becomes an issue.
  */
-static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
+static inline u64 mul_u64_add_u64_div_u64(u64 a, u64 mul, u64 add, u64 div)
 {
        u64 q;

-       asm ("mulq %2; divq %3" : "=a" (q)
-                               : "a" (a), "rm" (mul), "rm" (div)
-                               : "rdx");
+       if (statically_true(!add)) {
+               asm ("mulq %2; divq %3" : "=a" (q)
+                                       : "a" (a), "rm" (mul), "rm" (div)
+                                       : "rdx");
+       } else {
+               asm ("mulq %2; addq %4,%%rax; adcq $0,%%rdx; divq %3"
+                       : "=a" (q)
+                       : "a" (a), "rm" (mul), "rm" (div), "rm" (add)
+                       : "rdx");
+       }

        return q;
 }
-#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64

 static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
 {
-       return mul_u64_u64_div_u64(a, mul, div);
+       return mul_u64_add_u64_div_u64(a, mul, 0, div);
 }
 #define mul_u64_u32_div        mul_u64_u32_div

diff --git a/include/linux/math64.h b/include/linux/math64.h
index 6aaccc1626ab..1544dc37e317 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -282,7 +282,10 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
 }
 #endif /* mul_u64_u32_div */

-u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+u64 mul_u64_add_u64_div_u64(u64 a, u64 mul, u64 add, u64 div);
+#define mul_u64_u64_div_u64(a, mul, div) mul_u64_add_u64_div_u64(a, mul, 0, div)
+#define mul_u64_u64_div_u64_roundup(a, mul, div) \
+       ({ u64 _tmp = (div); mul_u64_add_u64_div_u64(a, mul, _tmp - 1, _tmp); })

 /**
  * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 5faa29208bdb..efcc8d729c74 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -183,16 +183,13 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
 }
 EXPORT_SYMBOL(iter_div_u64_rem);

-#ifndef mul_u64_u64_div_u64
-u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
+#ifndef mul_u64_add_u64_div_u64
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 add, u64 c)
 {
-       if (ilog2(a) + ilog2(b) <= 62)
-               return div64_u64(a * b, c);
-
 #if defined(__SIZEOF_INT128__)

        /* native 64x64=128 bits multiplication */
-       u128 prod = (u128)a * b;
+       u128 prod = (u128)a * b + add;
        u64 n_lo = prod, n_hi = prod >> 64;

 #else
@@ -201,6 +198,11 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
        u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
        u64 x, y, z;

+#if BITS_PER_LONG == 32
+       if (!(a_hi | b_hi))
+               return div64_u64((u64)a_lo * b_lo + add, c);
+#endif
+
        x = (u64)a_lo * b_lo;
        y = (u64)a_lo * b_hi + (u32)(x >> 32);
        z = (u64)a_hi * b_hi + (u32)(y >> 32);
@@ -208,10 +210,13 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
        z += (u32)(y >> 32);
        x = (y << 32) + (u32)x;

-       u64 n_lo = x, n_hi = z;
+       u64 n_lo = x + add, n_hi = z + (n_lo < x);

 #endif

+       if (!n_hi)
+               return div64_u64(n_lo, c);
+
        /* make sure c is not zero, trigger exception otherwise */
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdiv-by-zero"
@@ -265,5 +270,5 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)

        return res;
 }
-EXPORT_SYMBOL(mul_u64_u64_div_u64);
+EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
 #endif
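
For reference, the roundup variant works because adding div - 1 before a
truncating division rounds the quotient up: for d > 0,
(n + d - 1) / d == n / d + (n % d != 0), the same trick DIV64_U64_ROUND_UP()
uses, applied here to the 128-bit product.  The ({ ... }) form in the macro
evaluates div only once via _tmp.  A standalone sketch of the identity:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* For d > 0, adding d - 1 before a truncating divide rounds up. */
	for (uint64_t n = 0; n < 1000; n++)
		for (uint64_t d = 1; d < 50; d++)
			assert((n + d - 1) / d == n / d + (n % d != 0));
	return 0;
}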

