[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <29cf408370b749069f3b395781fe434c@AcuMS.aculab.com>
Date: Thu, 2 Dec 2021 14:24:47 +0000
From: David Laight <David.Laight@...LAB.COM>
To: 'Noah Goldstein' <goldstein.w.n@...il.com>
CC: Eric Dumazet <edumazet@...gle.com>,
"tglx@...utronix.de" <tglx@...utronix.de>,
"mingo@...hat.com" <mingo@...hat.com>,
Borislav Petkov <bp@...en8.de>,
"dave.hansen@...ux.intel.com" <dave.hansen@...ux.intel.com>,
X86 ML <x86@...nel.org>, "hpa@...or.com" <hpa@...or.com>,
"peterz@...radead.org" <peterz@...radead.org>,
"alexanderduyck@...com" <alexanderduyck@...com>,
open list <linux-kernel@...r.kernel.org>,
netdev <netdev@...r.kernel.org>
Subject: RE: [PATCH v1] x86/lib: Optimize 8x loop and memory clobbers in
csum_partial.c
I've dug out my test program and measured the performance of
various copied of the inner loop - usually 64 bytes/iteration.
Code is below.
It uses the hardware performance counter to get the number of
clocks the inner loop takes.
This is reasonable stable once the branch predictor has settled down.
So the different in clocks between a 64 byte buffer and a 128 byte
buffer is the number of clocks for 64 bytes.
(Unlike the TSC the pmc count doesn't depend on the cpu frequency.)
What is interesting is that even some of the trivial loops appear
to be doing 16 bytes per clock for short buffers - which is impossible.
Checksum 1k bytes and you get an entirely different answer.
The only loop that really exceeds 8 bytes/clock for long buffers
is the adxc/adoc one.
What is almost certainly happening is that all the memory reads and
the dependant add/adc instructions are all queued up in the 'out of
order' execution unit.
Since 'rdpmc' isn't a serialising instruction they can still be
outstanding when the function returns.
Uncomment the 'rdtsc' and you get much slower values for short buffers.
When testing the full checksum function the queued up memory
reads and adc are probably running in parallel with the logic
that is handling lengths that aren't multiples of 64.
I also found nothing consistently different for misaligned reads.
These were all tested on my i7-7700 cpu.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>
static int init_pmc(void)
{
static struct perf_event_attr perf_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
.pinned = 1,
};
struct perf_event_mmap_page *pc;
int perf_fd;
perf_fd = syscall(__NR_perf_event_open, &perf_attr, 0, -1, -1, 0);
if (perf_fd < 0) {
fprintf(stderr, "perf_event_open failed: errno %d\n", errno);
exit(1);
}
pc = mmap(NULL, 4096, PROT_READ, MAP_SHARED, perf_fd, 0);
if (pc == MAP_FAILED) {
fprintf(stderr, "perf_event mmap() failed: errno %d\n", errno);
exit(1);
}
return pc->index - 1;
}
static inline unsigned int rdpmc(unsigned int counter)
{
unsigned int low, high;
// asm volatile("rdtsc" : "=a" (low), "=d" (high));
asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
// return low bits, counter might to 32 or 40 bits wide.
return low;
}
#ifndef MODE
#define MODE 0
#endif
__attribute__((noinline))
unsigned long csum64(unsigned long csum, const unsigned char *buff, unsigned long len)
{
#if MODE == 0
// Simple 'adc' loop, abs max 8 bytes/clock.
// Will be very slow on old cpu (Pre Ivy bridge) cpu where 'dec'
// has a false dependency against the carry flag.
// Nearly as good if doing 32 bytes/iteration.
#define DFLT_OFFSET 40
long count = len/32;
asm(" clc\n"
"1: adcq (%[buff]), %[csum]\n"
" adcq 8(%[buff]), %[csum]\n"
" adcq 16(%[buff]), %[csum]\n"
" adcq 24(%[buff]), %[csum]\n"
" lea 32(%[buff]), %[buff]\n"
" dec %[count]\n"
" jnz 1b\n"
" adcq $0, %[csum]\n"
: [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
: [buff] "r" (buff)
: "memory");
#endif
#if MODE == 1
// Alternate 'adc' loop that avoids using the carry flag.
#define DFLT_OFFSET 40
buff += len;
len = -len;
asm(" clc\n"
"1: jrcxz 2f\n"
" adcq 0(%[buff], %[len]), %[csum]\n"
" adcq 8(%[buff], %[len]), %[csum]\n"
" adcq 16(%[buff], %[len]), %[csum]\n"
" adcq 24(%[buff], %[len]), %[csum]\n"
" lea 32(%[len]), %[len]\n"
" jmp 1b\n"
"2: adcq $0, %[csum]\n"
: [csum] "+&r" (csum), [len] "+&c" (len)
: [buff] "r" (buff)
: "memory");
#endif
#if MODE == 2
// Existing kernel loop.
// The adc chain limits it to about 8 bytes/clock.
#define DFLT_OFFSET 40
long count = len/64;
asm("1: addq 0(%[buff]), %[csum]\n"
" adcq 8(%[buff]), %[csum]\n"
" adcq 16(%[buff]), %[csum]\n"
" adcq 24(%[buff]), %[csum]\n"
" adcq 32(%[buff]), %[csum]\n"
" adcq 40(%[buff]), %[csum]\n"
" adcq 48(%[buff]), %[csum]\n"
" adcq 56(%[buff]), %[csum]\n"
" adcq $0, %[csum]\n"
" lea 64(%[buff]), %[buff]\n"
" dec %[count]\n"
" jnz 1b"
: [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
: [buff] "r" (buff)
: "memory");
#endif
#if MODE == 3
// Proposed kernel loop.
// This splits the adc chain in two.
// Faster than (2) for short buffers - not sure why.
#define DFLT_OFFSET 38
long count = len/64;
unsigned long csum1;
asm("1: movq 0(%[buff]), %[csum1]\n"
" adcq 8(%[buff]), %[csum1]\n"
" adcq 16(%[buff]), %[csum1]\n"
" adcq 24(%[buff]), %[csum1]\n"
" adcq 32(%[buff]), %[csum1]\n"
" adcq $0, %[csum1]\n"
" lea 64(%[buff]), %[buff]\n"
" addq 40-64(%[buff]), %[csum]\n"
" adcq 48-64(%[buff]), %[csum]\n"
" adcq 56-64(%[buff]), %[csum]\n"
" adcq %[csum1], %[csum]\n"
" adcq $0, %[csum]\n"
" dec %[count]\n"
" jnz 1b"
: [csum] "+&r" (csum), [csum1] "=&r" (csum1), [len] "+&r" (len), [count] "+&r" (count)
: [buff] "r" (buff)
: "memory");
#endif
#if MODE == 4
// Less unrolled version of (3).
// Maybe just as fast!
#define DFLT_OFFSET 43
long count = len;
unsigned long csum1;
asm("1: movq 0(%[buff]), %[csum1]\n"
" adcq 8(%[buff]), %[csum1]\n"
" adcq $0, %[csum1]\n"
" lea 32(%[buff]), %[buff]\n"
" addq 16-32(%[buff]), %[csum]\n"
" adcq 24-32(%[buff]), %[csum]\n"
" adcq %[csum1], %[csum]\n"
" adcq $0, %[csum]\n"
" sub $32, %[count]\n"
" jnz 1b"
: [csum] "+&r" (csum), [csum1] "=&r" (csum1), [len] "+&r" (len), [count] "+&r" (count)
: [buff] "r" (buff)
: "memory");
#endif
#if MODE == 5
// adxc/adxo loop.
// This is the only loop that exceeds 8 bytes/clock for 1k buffers.
#define DFLT_OFFSET 40
unsigned long csum_odd;
buff += len;
len = -len;
asm( " xor %[csum_odd], %[csum_odd]\n" // Also clears carry and overflow
"10: jrcxz 20f\n"
" adcx (%[buff], %[len]), %[csum]\n"
" adox 8(%[buff], %[len]), %[csum_odd]\n"
" adcx 16(%[buff], %[len]), %[csum]\n"
" adox 24(%[buff], %[len]), %[csum_odd]\n"
" adcx 32(%[buff], %[len]), %[csum]\n"
" adox 40(%[buff], %[len]), %[csum_odd]\n"
" adcx 48(%[buff], %[len]), %[csum]\n"
" adox 56(%[buff], %[len]), %[csum_odd]\n"
" lea 64(%[len]), %[len]\n"
" jmp 10b\n"
"20: adox %[len], %[csum_odd]\n" // [len] is zero
" adcx %[csum_odd], %[csum]\n"
" adcx %[len], %[csum]"
: [csum] "+&r" (csum), [len] "+&c" (len), [csum_odd] "=&r" (csum_odd)
: [buff] "r" (buff)
: "memory");
#endif
return csum;
}
unsigned char buff[8192] __attribute__((aligned(4096))) = {
0x46,0x56,0x20,0x04,0x10,0x02,0x50,0x07,0x72,0x4d,0xc6,0x3d,0x31,0x85,0x2d,0xbd,
0xe2,0xe0,0x9d,0x3e,0x3b,0x7a,0x70,0x3d,0xd2,0xfb,0x8c,0xbf,0x95,0x10,0xa9,0xbe,
0xeb,0xfd,0x29,0x40,0xd5,0x7a,0x61,0x40,0xde,0xcd,0x14,0xbf,0x81,0x1b,0xf6,0x3f,
0xbc,0xff,0x17,0x3f,0x67,0x1c,0x6e,0xbe,0xf4,0xc2,0x05,0x40,0x0b,0x13,0x78,0x3f,
0xfe,0x47,0xa7,0xbd,0x59,0xc2,0x15,0x3f,0x07,0xd0,0xea,0xbf,0x97,0xf1,0x3c,0x3f,
0xcc,0xfa,0x6b,0x40,0x72,0x6a,0x4f,0xbe,0x0b,0xe3,0x75,0x3e,0x3c,0x9b,0x0e,0xbf,
0xa9,0xeb,0xb7,0x3f,0xeb,0x4a,0xec,0x3e,0x33,0x8c,0x0c,0x3f,0x6a,0xf2,0xf3,0x3e,
0x2b,0x45,0x86,0x3f,0x83,0xce,0x8a,0x3f,0xf6,0x01,0x16,0x40,0x9c,0x17,0x47,0x3e,
0x44,0x83,0x61,0x40,0x74,0xc7,0x5c,0x3f,0xec,0xe7,0x95,0x3f,0xee,0x19,0xb5,0xbf,
0xb5,0xf0,0x03,0xbf,0xd1,0x02,0x1c,0x3e,0xa3,0x55,0x90,0xbe,0x1e,0x0b,0xa1,0xbf,
0xa4,0xa8,0xb4,0x3f,0xc6,0x68,0x91,0x3f,0xd1,0xc5,0xab,0x3f,0xb9,0x14,0x62,0x3f,
0x7c,0xe0,0xb9,0xbf,0xc0,0xa4,0xb5,0x3d,0x6f,0xd9,0xa7,0x3f,0x8f,0xc4,0xb0,0x3d,
0x48,0x2c,0x7a,0x3e,0x83,0xb2,0x3c,0x40,0x36,0xd3,0x18,0x40,0xb7,0xa9,0x57,0x40,
0xda,0xd3,0x95,0x3f,0x74,0x95,0xc0,0xbe,0xbb,0xce,0x71,0x3e,0x95,0xec,0x18,0xbf,
0x94,0x17,0xdd,0x3f,0x98,0xa5,0x02,0x3f,0xbb,0xfb,0xbb,0x3e,0xd0,0x5a,0x9c,0x3f,
0xd4,0x70,0x9b,0xbf,0x3b,0x9f,0x20,0xc0,0x84,0x5b,0x0f,0x40,0x5e,0x48,0x2c,0xbf,
};
#ifndef PASSES
#define PASSES 10
#endif
#ifndef OFFSET
#ifdef DFLT_OFFSET
#define OFFSET DFLT_OFFSET
#else
#define OFFSET 0
#endif
#endif
int main(int argc, char **argv)
{
unsigned long csum;
unsigned int tick;
unsigned int ticks[PASSES];
unsigned int len, off;
unsigned int i;
unsigned int id = init_pmc();
len = sizeof buff;
off = 0;
if (argv[1]) {
len = atoi(argv[1]);
len = (len + 63) & ~63;
if (argv[2])
off = atoi(argv[2]) & 7;
}
for (i = 0; i < PASSES; i++) {
tick = rdpmc(id);
csum = csum64(0, buff + off, len);
ticks[i] = rdpmc(id) - tick;
}
printf(" ticks rate %lx\n", csum);
for (i = 0; i < PASSES; i++)
printf(" %7u %7u\n", ticks[i], 100 * len / (ticks[i] - OFFSET));
return 0;
}
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Powered by blists - more mailing lists