In an answer, I've stated that unaligned access has almost the same speed as aligned access a long time (on x86/x86_64). I didn't have any numbers to back up this statement, so I've created a benchmark for it.
Do you see any flaws in this benchmark? Can you improve on it (I mean, to increase GB/sec, so it reflects the truth better)?
#include <sys/time.h>
#include <stdio.h>
template <int N>
__attribute__((noinline))
void loop32(const char *v) {
for (int i=0; i<N; i+=160) {
__asm__ ("mov (%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x04(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x08(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x0c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x10(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x14(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x18(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x1c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x20(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x24(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x28(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x2c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x30(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x34(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x38(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x3c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x40(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x44(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x48(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x4c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x50(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x54(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x58(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x5c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x60(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x64(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x68(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x6c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x70(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x74(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x78(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x7c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x80(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x84(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x88(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x8c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x90(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x94(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x98(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x9c(%0), %%eax" : : "r"(v) :"eax");
v += 160;
}
}
template <int N>
__attribute__((noinline))
void loop64(const char *v) {
for (int i=0; i<N; i+=160) {
__asm__ ("mov (%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x08(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x10(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x18(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x20(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x28(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x30(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x38(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x40(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x48(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x50(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x58(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x60(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x68(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x70(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x78(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x80(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x88(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x90(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x98(%0), %%rax" : : "r"(v) :"rax");
v += 160;
}
}
template <int N>
__attribute__((noinline))
void loop128a(const char *v) {
for (int i=0; i<N; i+=160) {
__asm__ ("movaps (%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x10(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x20(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x30(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x40(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x50(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x60(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x70(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x80(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x90(%0), %%xmm0" : : "r"(v) :"xmm0");
v += 160;
}
}
template <int N>
__attribute__((noinline))
void loop128u(const char *v) {
for (int i=0; i<N; i+=160) {
__asm__ ("movups (%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x10(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x20(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x30(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x40(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x50(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x60(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x70(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x80(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x90(%0), %%xmm0" : : "r"(v) :"xmm0");
v += 160;
}
}
long long int t() {
struct timeval tv;
gettimeofday(&tv, 0);
return (long long int)tv.tv_sec*1000000 + tv.tv_usec;
}
int main() {
const int ITER = 10;
const int N = 1600000000;
char *data = reinterpret_cast<char *>(((reinterpret_cast<unsigned long long>(new char[N+32])+15)&~15));
for (int i=0; i<N+16; i++) data[i] = 0;
{
long long int t0 = t();
for (int i=0; i<ITER*100000; i++) {
loop32<N/100000>(data);
}
long long int t1 = t();
for (int i=0; i<ITER*100000; i++) {
loop32<N/100000>(data+1);
}
long long int t2 = t();
for (int i=0; i<ITER; i++) {
loop32<N>(data);
}
long long int t3 = t();
for (int i=0; i<ITER; i++) {
loop32<N>(data+1);
}
long long int t4 = t();
printf(" 32-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
printf(" 32-bit, mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
}
{
long long int t0 = t();
for (int i=0; i<ITER*100000; i++) {
loop64<N/100000>(data);
}
long long int t1 = t();
for (int i=0; i<ITER*100000; i++) {
loop64<N/100000>(data+1);
}
long long int t2 = t();
for (int i=0; i<ITER; i++) {
loop64<N>(data);
}
long long int t3 = t();
for (int i=0; i<ITER; i++) {
loop64<N>(data+1);
}
long long int t4 = t();
printf(" 64-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
printf(" 64-bit, mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
}
{
long long int t0 = t();
for (int i=0; i<ITER*100000; i++) {
loop128a<N/100000>(data);
}
long long int t1 = t();
for (int i=0; i<ITER*100000; i++) {
loop128u<N/100000>(data+1);
}
long long int t2 = t();
for (int i=0; i<ITER; i++) {
loop128a<N>(data);
}
long long int t3 = t();
for (int i=0; i<ITER; i++) {
loop128u<N>(data+1);
}
long long int t4 = t();
printf("128-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
printf("128-bit, mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
}
}
perf
to just get counts for user-space cycles, with very good precision. Definitely enough to detect any unaligned penalty. – Swatter