The dot product of two arrays
for(int i=0; i<n; i++) {
sum += x[i]*y[i];
}
does not reuse data, so it should be a memory-bound operation. Therefore, I should be able to measure the memory bandwidth from the dot product.
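To put a number on "memory bound": each iteration loads two 4-byte floats (8 bytes) and does two flops (one multiply, one add), so the arithmetic intensity is 2/8 = 0.25 flops/byte, far below what even a single core can compute, which is why I expect the loop to be limited by how fast memory can be streamed in rather than by the FPU.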
Using the code at why-vectorizing-the-loop-does-not-have-performance-improvement I get a bandwidth of 9.3 GB/s for my system. However, when I attempt to calculate the bandwidth using the dot product I get over twice that rate for a single thread and over three times the rate using multiple threads (my system has four cores/eight hyper-threads). This makes no sense to me, since a memory-bound operation should not benefit from multiple threads. Here is the output from the code below:
Xeon E5-1620, GCC 4.9.0, Linux kernel 3.13
dot 1 thread: 1.0 GB, sum 191054.81, time 4.98 s, 21.56 GB/s, 5.39 GFLOPS
dot_avx 1 thread: 1.0 GB, sum 191043.33, time 5.16 s, 20.79 GB/s, 5.20 GFLOPS
dot_avx 2 threads: 1.0 GB, sum 191045.34, time 3.44 s, 31.24 GB/s, 7.81 GFLOPS
dot_avx 8 threads: 1.0 GB, sum 191043.34, time 3.26 s, 32.91 GB/s, 8.23 GFLOPS
Can somebody please explain to me why I get over twice the bandwidth for one thread and over three times the bandwidth using more than one thread?
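For reference, the test in that linked question is, as far as I understand it, essentially a timed write-only loop. A minimal sketch of that kind of measurement (my reconstruction, not the exact code from the link):

//g++ -O3 -fopenmp write_bw.cpp   (sketch; not the exact code from the link)
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
int main() {
    const size_t n = 1 << 28;               // 256M ints = 1 GB
    int *a = (int*)malloc(sizeof(int)*n);
    double t = omp_get_wtime();
    for(size_t i=0; i<n; i++) a[i] = 1;     // write-only traffic
    t = omp_get_wtime() - t;
    // Note: with write-allocate caches each store miss also reads the
    // cache line first, so the true bus traffic is roughly twice what
    // this counts.
    printf("%.2f GB/s\n", sizeof(int)*n/t*1E-9);
    free(a);
}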
Here is the code I used:
//g++ -O3 -fopenmp -mavx -ffast-math dot.cpp
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <x86intrin.h>
#include <omp.h>

// Reduce the eight floats of an AVX register to a single scalar sum.
extern "C" inline float horizontal_add(__m256 a) {
    __m256 t1 = _mm256_hadd_ps(a,a);
    __m256 t2 = _mm256_hadd_ps(t1,t1);
    __m128 t3 = _mm256_extractf128_ps(t2,1);
    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2),t3);
    return _mm_cvtss_f32(t4);
}

// AVX dot product, unrolled four times with four independent
// accumulators to hide the floating-point add latency.
extern "C" float dot_avx(float * __restrict x, float * __restrict y, const int n) {
    x = (float*)__builtin_assume_aligned(x, 32);
    y = (float*)__builtin_assume_aligned(y, 32);
    float sum = 0;
    #pragma omp parallel reduction(+:sum)
    {
        __m256 sum1 = _mm256_setzero_ps();
        __m256 sum2 = _mm256_setzero_ps();
        __m256 sum3 = _mm256_setzero_ps();
        __m256 sum4 = _mm256_setzero_ps();
        __m256 x8, y8;
        #pragma omp for
        for(int i=0; i<n; i+=32) {
            // Unaligned load intrinsics; the buffers are 64-byte
            // aligned, so aligned loads would also work here.
            x8 = _mm256_loadu_ps(&x[i]);
            y8 = _mm256_loadu_ps(&y[i]);
            sum1 = _mm256_add_ps(_mm256_mul_ps(x8,y8),sum1);
            x8 = _mm256_loadu_ps(&x[i+8]);
            y8 = _mm256_loadu_ps(&y[i+8]);
            sum2 = _mm256_add_ps(_mm256_mul_ps(x8,y8),sum2);
            x8 = _mm256_loadu_ps(&x[i+16]);
            y8 = _mm256_loadu_ps(&y[i+16]);
            sum3 = _mm256_add_ps(_mm256_mul_ps(x8,y8),sum3);
            x8 = _mm256_loadu_ps(&x[i+24]);
            y8 = _mm256_loadu_ps(&y[i+24]);
            sum4 = _mm256_add_ps(_mm256_mul_ps(x8,y8),sum4);
        }
        // Combine the four partial vectors, then reduce to a scalar.
        sum += horizontal_add(_mm256_add_ps(_mm256_add_ps(sum1,sum2),_mm256_add_ps(sum3,sum4)));
    }
    return sum;
}

// Plain scalar dot product; with -O3 -ffast-math GCC auto-vectorizes this.
extern "C" float dot(float * __restrict x, float * __restrict y, const int n) {
    x = (float*)__builtin_assume_aligned(x, 32);
    y = (float*)__builtin_assume_aligned(y, 32);
    float sum = 0;
    for(int i=0; i<n; i++) {
        sum += x[i]*y[i];
    }
    return sum;
}

int main(){
    uint64_t LEN = 1 << 27;  // 2^27 floats per array = 512 MB each
    float *x = (float*)_mm_malloc(sizeof(float)*LEN, 64);
    float *y = (float*)_mm_malloc(sizeof(float)*LEN, 64);
    for(uint64_t i=0; i<LEN; i++) {
        x[i] = 1.0*rand()/RAND_MAX - 0.5;
        y[i] = 1.0*rand()/RAND_MAX - 0.5;
    }

    uint64_t size = 2*sizeof(float)*LEN;  // bytes read per dot product (both arrays)
    volatile float sum = 0;               // volatile so the calls are not optimized away
    double dtime, rate, flops;
    int repeat = 100;

    dtime = omp_get_wtime();
    for(int i=0; i<repeat; i++) sum += dot(x,y,LEN);
    dtime = omp_get_wtime() - dtime;
    rate  = 1.0*repeat*size/dtime*1E-9;   // GB/s
    flops = 2.0*repeat*LEN/dtime*1E-9;    // GFLOPS: one mul + one add per element
    printf("%f GB, sum %f, time %f s, %.2f GB/s, %.2f GFLOPS\n", 1.0*size/1024/1024/1024, sum, dtime, rate, flops);

    sum = 0;
    dtime = omp_get_wtime();
    for(int i=0; i<repeat; i++) sum += dot_avx(x,y,LEN);
    dtime = omp_get_wtime() - dtime;
    rate  = 1.0*repeat*size/dtime*1E-9;
    flops = 2.0*repeat*LEN/dtime*1E-9;
    printf("%f GB, sum %f, time %f s, %.2f GB/s, %.2f GFLOPS\n", 1.0*size/1024/1024/1024, sum, dtime, rate, flops);

    _mm_free(x);
    _mm_free(y);
}
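The one-, two-, and eight-thread numbers in the output above come from separate runs of the same binary; the OpenMP thread count is controlled per run with the standard environment variable, e.g. OMP_NUM_THREADS=8 ./a.out, or in code with omp_set_num_threads(8) before the parallel region.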
I just downloaded, compiled, and ran STREAM as suggested by Jonathan Dursi and here are the results:
One thread
Function    Rate (MB/s)   Avg time (s)  Min time (s)  Max time (s)
Copy:       14292.1657    0.0023        0.0022        0.0023
Scale:      14286.0807    0.0023        0.0022        0.0023
Add:        14724.3906    0.0033        0.0033        0.0033
Triad:      15224.3339    0.0032        0.0032        0.0032

Eight threads
Function    Rate (MB/s)   Avg time (s)  Min time (s)  Max time (s)
Copy:       24501.2282    0.0014        0.0013        0.0021
Scale:      23121.0556    0.0014        0.0014        0.0015
Add:        25263.7209    0.0024        0.0019        0.0056
Triad:      25817.7215    0.0020        0.0019        0.0027
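For reference, the STREAM kernels are simple loops over arrays much larger than cache. Triad, for example, is essentially the following (a sketch of the kernel in the published stream.c, with a, b, c, and scalar as defined there):

// STREAM Triad: two reads and one write per element.
for (j = 0; j < STREAM_ARRAY_SIZE; j++)
    a[j] = b[j] + scalar * c[j];
// STREAM counts 3*sizeof(double) = 24 bytes of traffic per iteration
// when computing the MB/s figures above.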
I measured the bandwidth with memset at stackoverflow.com/questions/18159455/…. So either that code does not measure the bandwidth correctly and my real bandwidth is much higher, or I don't understand something. From Wikipedia the bandwidth for my memory is 12.8 GB/s, which is close to what the memset code gets. – Glandular

One difference between my code and the memset code is that my code only reads memory and the memset code only writes memory. So maybe the bandwidth to read is higher. Though that does not explain why multiple threads help, it could explain why reading 1 GB is faster than writing 1 GB. – Glandular
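To check the read-vs-write hypothesis directly, one could time a read-only reduction over a single 1 GB array. A minimal sketch (my own, not code from either linked question):

//g++ -O3 -fopenmp read_bw.cpp   (sketch)
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
int main() {
    const size_t n = 1 << 28;                  // 2^28 floats = 1 GB
    float *x = (float*)malloc(sizeof(float)*n);
    for(size_t i=0; i<n; i++) x[i] = 1.0f;     // touch pages before timing
    double t = omp_get_wtime();
    float sum = 0;
    #pragma omp parallel for reduction(+:sum)
    for(size_t i=0; i<n; i++) sum += x[i];     // read-only traffic
    t = omp_get_wtime() - t;
    // Printing sum keeps the compiler from deleting the loop.
    printf("sum %f, %.2f GB/s\n", sum, sizeof(float)*n/t*1E-9);
    free(x);
}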