lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20211127042540.96249-1-goldstein.w.n@gmail.com>
Date:   Fri, 26 Nov 2021 22:25:40 -0600
From:   Noah Goldstein <goldstein.w.n@...il.com>
To:     unlisted-recipients:; (no To-header on input)
Cc:     tglx@...utronix.de, mingo@...hat.com, bp@...en8.de,
        dave.hansen@...ux.intel.com, x86@...nel.org, hpa@...or.com,
        peterz@...radead.org, alexanderduyck@...com,
        goldstein.w.n@...il.com, edumazet@...gle.com,
        linux-kernel@...r.kernel.org
Subject: [PATCH v2] x86/lib: Optimize 8x loop and memory clobbers in csum_partial.c

Modify the 8x loop so that it uses two independent
accumulators. Despite adding more instructions, the latency and
throughput of the loop are improved because the `adc` chains can now
take advantage of multiple execution units.

Make the memory clobbers more precise. 'buff' is read-only and the exact
range each asm block accesses is known, so there is no reason to
write-clobber all memory; pass the accessed range as an "m" input operand
instead.

Relative performance changes on Tigerlake:

Time Unit: Ref Cycles
Size Unit: Bytes

size, lat old, lat new,    tput old,    tput new
   0,   4.961,   4.901,       4.887,       4.951
   8,   5.590,   5.620,       4.227,       4.252
  16,   6.182,   6.202,       4.233,       4.278
  24,   7.392,   7.380,       4.256,       4.279
  32,   7.371,   7.390,       4.550,       4.537
  40,   8.621,   8.601,       4.862,       4.836
  48,   9.406,   9.374,       5.206,       5.234
  56,  10.535,  10.522,       5.416,       5.447
  64,  10.000,   7.590,       6.946,       6.989
 100,  14.218,  12.476,       9.429,       9.441
 200,  22.115,  16.937,      13.088,      12.852
 300,  31.826,  24.640,      19.383,      18.230
 400,  39.016,  28.133,      23.223,      21.304
 500,  48.815,  36.186,      30.331,      27.104
 600,  56.732,  40.120,      35.899,      30.363
 700,  66.623,  48.178,      43.044,      36.400
 800,  73.259,  51.171,      48.564,      39.173
 900,  82.821,  56.635,      58.592,      45.162
1000,  90.780,  63.703,      65.658,      48.718

Signed-off-by: Noah Goldstein <goldstein.w.n@...il.com>

tmp
---
 arch/x86/lib/csum-partial_64.c | 38 +++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index ded842cd1020..52540f148ebb 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -48,18 +48,21 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 	}
 
 	while (unlikely(len >= 64)) {
-		asm("addq 0*8(%[src]),%[res]\n\t"
-		    "adcq 1*8(%[src]),%[res]\n\t"
-		    "adcq 2*8(%[src]),%[res]\n\t"
-		    "adcq 3*8(%[src]),%[res]\n\t"
-		    "adcq 4*8(%[src]),%[res]\n\t"
-		    "adcq 5*8(%[src]),%[res]\n\t"
+		u64 temp_accum;
+
+		asm("movq 0*8(%[src]),%[res_tmp]\n\t"
+		    "addq 1*8(%[src]),%[res_tmp]\n\t"
+		    "adcq 2*8(%[src]),%[res_tmp]\n\t"
+		    "adcq 3*8(%[src]),%[res_tmp]\n\t"
+		    "adcq 4*8(%[src]),%[res_tmp]\n\t"
+		    "adcq $0,%[res_tmp]\n\t"
+		    "addq 5*8(%[src]),%[res]\n\t"
 		    "adcq 6*8(%[src]),%[res]\n\t"
 		    "adcq 7*8(%[src]),%[res]\n\t"
-		    "adcq $0,%[res]"
-		    : [res] "+r" (temp64)
-		    : [src] "r" (buff)
-		    : "memory");
+		    "adcq %[res_tmp], %[res]\n\t"
+		    "adcq $0,%[res]\n\t"
+		    : [res] "+r"(temp64), [res_tmp] "=&r"(temp_accum)
+		    : [src] "r"(buff), "m"(*(const char(*)[64])buff));
 		buff += 64;
 		len -= 64;
 	}
@@ -70,26 +73,23 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 		    "adcq 2*8(%[src]),%[res]\n\t"
 		    "adcq 3*8(%[src]),%[res]\n\t"
 		    "adcq $0,%[res]"
-			: [res] "+r" (temp64)
-			: [src] "r" (buff)
-			: "memory");
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[32])buff));
 		buff += 32;
 	}
 	if (len & 16) {
 		asm("addq 0*8(%[src]),%[res]\n\t"
 		    "adcq 1*8(%[src]),%[res]\n\t"
 		    "adcq $0,%[res]"
-			: [res] "+r" (temp64)
-			: [src] "r" (buff)
-			: "memory");
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[16])buff));
 		buff += 16;
 	}
 	if (len & 8) {
 		asm("addq 0*8(%[src]),%[res]\n\t"
 		    "adcq $0,%[res]"
-			: [res] "+r" (temp64)
-			: [src] "r" (buff)
-			: "memory");
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[8])buff));
 		buff += 8;
 	}
 	if (len & 7) {
-- 
2.25.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ