I was running some tests to compare C to Java and ran into something interesting. Running my exactly identical benchmark code with optimization level 1 (-O1) in a function called by main, rather than in main itself, resulted in roughly double performance. I'm printing out the size of test_t to verify beyond any doubt that the code is being compiled to x64.
I sent the executables to my friend who's running an i7-7700HQ and got similar results. I'm running an i7-6700.
Here's the slower code:
#include <stdio.h>
#include <time.h>
#include <stdint.h>
int main() {
printf("Size = %I64u\n", sizeof(size_t));
int start = clock();
for(int64_t i = 0; i < 10000000000L; i++) {
}
printf("%ld\n", clock() - start);
return 0;
}
And the faster:
#include <stdio.h>
#include <time.h>
#include <stdint.h>
void test() {
printf("Size = %I64u\n", sizeof(size_t));
int start = clock();
for(int64_t i = 0; i < 10000000000L; i++) {
}
printf("%ld\n", clock() - start);
}
int main() {
test();
return 0;
}
I'll also provide the assembly code for you to dig in to. I don't know assembly. Slower:
.file "dummy.c"
.text
.def __main; .scl 2; .type 32; .endef
.section .rdata,"dr"
.LC0:
.ascii "Size = %I64u\12\0"
.LC1:
.ascii "%ld\12\0"
.text
.globl main
.def main; .scl 2; .type 32; .endef
.seh_proc main
main:
pushq %rbx
.seh_pushreg %rbx
subq $32, %rsp
.seh_stackalloc 32
.seh_endprologue
call __main
movl $8, %edx
leaq .LC0(%rip), %rcx
call printf
call clock
movl %eax, %ebx
movabsq $10000000000, %rax
.L2:
subq $1, %rax
jne .L2
call clock
subl %ebx, %eax
movl %eax, %edx
leaq .LC1(%rip), %rcx
call printf
movl $0, %eax
addq $32, %rsp
popq %rbx
ret
.seh_endproc
.ident "GCC: (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 8.1.0"
.def printf; .scl 2; .type 32; .endef
.def clock; .scl 2; .type 32; .endef
Faster:
.file "dummy.c"
.text
.section .rdata,"dr"
.LC0:
.ascii "Size = %I64u\12\0"
.LC1:
.ascii "%ld\12\0"
.text
.globl test
.def test; .scl 2; .type 32; .endef
.seh_proc test
test:
pushq %rbx
.seh_pushreg %rbx
subq $32, %rsp
.seh_stackalloc 32
.seh_endprologue
movl $8, %edx
leaq .LC0(%rip), %rcx
call printf
call clock
movl %eax, %ebx
movabsq $10000000000, %rax
.L2:
subq $1, %rax
jne .L2
call clock
subl %ebx, %eax
movl %eax, %edx
leaq .LC1(%rip), %rcx
call printf
nop
addq $32, %rsp
popq %rbx
ret
.seh_endproc
.def __main; .scl 2; .type 32; .endef
.globl main
.def main; .scl 2; .type 32; .endef
.seh_proc main
main:
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
call __main
call test
movl $0, %eax
addq $40, %rsp
ret
.seh_endproc
.ident "GCC: (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 8.1.0"
.def printf; .scl 2; .type 32; .endef
.def clock; .scl 2; .type 32; .endef
Here's my batch script for compilation:
@echo off
set /p file= File to compile:
del compiled.exe
gcc -Wall -Wextra -std=c17 -O1 -o compiled.exe %file%.c
compiled.exe
PAUSE
And for compilation to assembly:
@echo off
set /p file= File to compile:
del %file%.s
gcc -S -Wall -Wextra -std=c17 -O1 %file%.c
PAUSE
L2
in both cases then that can be checked – Hydrate