My problem
I have 2 adjacent buffers of bytes of identical size (around 20 MB each). I just want to count the differences between them.
My question
How much time this loop should take to run on a 4.8GHz Intel I7 9700K with 3600MT RAM ?
How do we compute max theoretical speed ?
What I tried
uint64_t compareFunction(const char *const __restrict buffer, const uint64_t commonSize)
{
uint64_t diffFound = 0;
for(uint64_t byte = 0; byte < commonSize; ++byte)
diffFound += static_cast<uint64_t>(buffer[byte] != buffer[byte + commonSize]);
return diffFound;
}
It takes 11ms on my PC (9700K 4.8Ghz RAM 3600 Windows 10 Clang 14.0.6 -O3 MinGW ) and I feel it is too slow and that I am missing something.
40MB should take less than 2ms to be read on the CPU (my RAM bandwidth is between 20 and 30GB/s)
I don't know how to count cycles required to execute one iteration (especially because CPUs are superscalar nowadays). If I assume 1 cycle per operation and if I don't mess up my counting, it should be 10 ops per iteration -> 200 million ops -> at 4.8 Ghz with only one execution unit -> 40ms. Obviously I am wrong on how to compute the number of cycles per loop.
Fun fact: I tried on Linux PopOS GCC 11.2 -O3 and it ran at 4.5ms. Why such a difference?
Here are the dissassemblies vectorised and scalar produced by clang:
compareFunction(char const*, unsigned long): # @compareFunction(char const*, unsigned long)
test rsi, rsi
je .LBB0_1
lea r8, [rdi + rsi]
neg rsi
xor edx, edx
xor eax, eax
.LBB0_4: # =>This Inner Loop Header: Depth=1
movzx r9d, byte ptr [rdi + rdx]
xor ecx, ecx
cmp r9b, byte ptr [r8 + rdx]
setne cl
add rax, rcx
add rdx, 1
mov rcx, rsi
add rcx, rdx
jne .LBB0_4
ret
.LBB0_1:
xor eax, eax
ret
Clang14 O3:
.LCPI0_0:
.quad 1 # 0x1
.quad 1 # 0x1
compareFunction(char const*, unsigned long): # @compareFunction(char const*, unsigned long)
test rsi, rsi
je .LBB0_1
cmp rsi, 4
jae .LBB0_4
xor r9d, r9d
xor eax, eax
jmp .LBB0_11
.LBB0_1:
xor eax, eax
ret
.LBB0_4:
mov r9, rsi
and r9, -4
lea rax, [r9 - 4]
mov r8, rax
shr r8, 2
add r8, 1
test rax, rax
je .LBB0_5
mov rdx, r8
and rdx, -2
lea r10, [rdi + 6]
lea r11, [rdi + rsi]
add r11, 6
pxor xmm0, xmm0
xor eax, eax
pcmpeqd xmm2, xmm2
movdqa xmm3, xmmword ptr [rip + .LCPI0_0] # xmm3 = [1,1]
pxor xmm1, xmm1
.LBB0_7: # =>This Inner Loop Header: Depth=1
movzx ecx, word ptr [r10 + rax - 6]
movd xmm4, ecx
movzx ecx, word ptr [r10 + rax - 4]
movd xmm5, ecx
movzx ecx, word ptr [r11 + rax - 6]
movd xmm6, ecx
pcmpeqb xmm6, xmm4
movzx ecx, word ptr [r11 + rax - 4]
movd xmm7, ecx
pcmpeqb xmm7, xmm5
pxor xmm6, xmm2
punpcklbw xmm6, xmm6 # xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
pshuflw xmm4, xmm6, 212 # xmm4 = xmm6[0,1,1,3,4,5,6,7]
pshufd xmm4, xmm4, 212 # xmm4 = xmm4[0,1,1,3]
pand xmm4, xmm3
paddq xmm4, xmm0
pxor xmm7, xmm2
punpcklbw xmm7, xmm7 # xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
pshuflw xmm0, xmm7, 212 # xmm0 = xmm7[0,1,1,3,4,5,6,7]
pshufd xmm5, xmm0, 212 # xmm5 = xmm0[0,1,1,3]
pand xmm5, xmm3
paddq xmm5, xmm1
movzx ecx, word ptr [r10 + rax - 2]
movd xmm0, ecx
movzx ecx, word ptr [r10 + rax]
movd xmm1, ecx
movzx ecx, word ptr [r11 + rax - 2]
movd xmm6, ecx
pcmpeqb xmm6, xmm0
movzx ecx, word ptr [r11 + rax]
movd xmm7, ecx
pcmpeqb xmm7, xmm1
pxor xmm6, xmm2
punpcklbw xmm6, xmm6 # xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
pshuflw xmm0, xmm6, 212 # xmm0 = xmm6[0,1,1,3,4,5,6,7]
pshufd xmm0, xmm0, 212 # xmm0 = xmm0[0,1,1,3]
pand xmm0, xmm3
paddq xmm0, xmm4
pxor xmm7, xmm2
punpcklbw xmm7, xmm7 # xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
pshuflw xmm1, xmm7, 212 # xmm1 = xmm7[0,1,1,3,4,5,6,7]
pshufd xmm1, xmm1, 212 # xmm1 = xmm1[0,1,1,3]
pand xmm1, xmm3
paddq xmm1, xmm5
add rax, 8
add rdx, -2
jne .LBB0_7
test r8b, 1
je .LBB0_10
.LBB0_9:
movzx ecx, word ptr [rdi + rax]
movd xmm2, ecx
movzx ecx, word ptr [rdi + rax + 2]
movd xmm3, ecx
add rax, rsi
movzx ecx, word ptr [rdi + rax]
movd xmm4, ecx
pcmpeqb xmm4, xmm2
movzx eax, word ptr [rdi + rax + 2]
movd xmm2, eax
pcmpeqb xmm2, xmm3
pcmpeqd xmm3, xmm3
pxor xmm4, xmm3
punpcklbw xmm4, xmm4 # xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
pshuflw xmm4, xmm4, 212 # xmm4 = xmm4[0,1,1,3,4,5,6,7]
pshufd xmm4, xmm4, 212 # xmm4 = xmm4[0,1,1,3]
movdqa xmm5, xmmword ptr [rip + .LCPI0_0] # xmm5 = [1,1]
pand xmm4, xmm5
paddq xmm0, xmm4
pxor xmm2, xmm3
punpcklbw xmm2, xmm2 # xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
pshuflw xmm2, xmm2, 212 # xmm2 = xmm2[0,1,1,3,4,5,6,7]
pshufd xmm2, xmm2, 212 # xmm2 = xmm2[0,1,1,3]
pand xmm2, xmm5
paddq xmm1, xmm2
.LBB0_10:
paddq xmm0, xmm1
pshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
paddq xmm1, xmm0
movq rax, xmm1
cmp r9, rsi
je .LBB0_13
.LBB0_11:
lea r8, [r9 + rsi]
sub rsi, r9
add r8, rdi
add rdi, r9
xor edx, edx
.LBB0_12: # =>This Inner Loop Header: Depth=1
movzx r9d, byte ptr [rdi + rdx]
xor ecx, ecx
cmp r9b, byte ptr [r8 + rdx]
setne cl
add rax, rcx
add rdx, 1
cmp rsi, rdx
jne .LBB0_12
.LBB0_13:
ret
.LBB0_5:
pxor xmm0, xmm0
xor eax, eax
pxor xmm1, xmm1
test r8b, 1
jne .LBB0_9
jmp .LBB0_10
-march=native
to ensure you are using all the SIMD features your current CPU has available. The baseline is only SSE2. – Beastly-march=native
is default depends on how your compiler install was configured when it was built (by you or your package distributor). I think with the pure vanilla distribution, it isn't default. – Beastlyuint32_t diffFound
, as then you get twice as many adds per instruction. – Beastly