Edit: The huge difference in performance was due to a bug in the test; when set up properly, Eigen is 2 to 3 times faster. (A corrected sketch of the buggy helper follows the C++ listing below.)
I noticed that sparse matrix multiplication using the C++ Eigen library is much slower than using the Python scipy.sparse library: what scipy.sparse computes in ~0.03 seconds takes ~25 seconds in Eigen. Am I doing something wrong in Eigen?

Here is the Python code:
from scipy import sparse
from time import time
import random as rn

N_VALUES = 200000
N_ROWS = 400000
N_COLS = 400000

# random (row, col, value) entries for two sparse matrices
rows_a = rn.sample(range(N_ROWS), N_VALUES)
cols_a = rn.sample(range(N_COLS), N_VALUES)
values_a = [rn.uniform(0, 1) for _ in range(N_VALUES)]

rows_b = rn.sample(range(N_ROWS), N_VALUES)
cols_b = rn.sample(range(N_COLS), N_VALUES)
values_b = [rn.uniform(0, 1) for _ in range(N_VALUES)]

big_a = sparse.coo_matrix((values_a, (rows_a, cols_a)), shape=(N_ROWS, N_COLS))
big_b = sparse.coo_matrix((values_b, (rows_b, cols_b)), shape=(N_ROWS, N_COLS))
big_a = big_a.tocsr()
big_b = big_b.tocsr()  # fixed: was big_a.tocsr()

start = time()
AB = big_a * big_b
end = time()

print('time taken : {}'.format(end - start))
And here is the C++ code:
#include <iostream>
#include <cstdlib>
#include <ctime>      // clock(), CLOCKS_PER_SEC
#include <random>     // std::uniform_real_distribution, std::default_random_engine
#include <vector>
#include <algorithm>
#include <Eigen/Dense>
#include <Eigen/Sparse>
using namespace Eigen;
std::vector<long> gen_random_sample(long min, long max, long sample_size);
double get_random_double(double min, double max);
std::vector<double> get_vector_of_rn_doubles(int length, double min, double max);
int main()
{
long N_COLS = 400000;
long N_ROWS = 400000;
long N_VALUES = 200000;
SparseMatrix<double> big_A(N_ROWS, N_COLS);
std::vector<long> rows_a = gen_random_sample(0, N_ROWS, N_VALUES);
std::vector<long> cols_a = gen_random_sample(0, N_COLS, N_VALUES);
std::vector<double> values_a = get_vector_of_rn_doubles(N_VALUES, 0, 1);
for (int i = 0; i < N_VALUES; i++)
    big_A.insert(rows_a[i], cols_a[i]) = values_a[i]; // fixed: was insert(cols_a[i], cols_a[i])
// big_A.makeCompressed(); // slows things down
SparseMatrix<double> big_B(N_ROWS, N_COLS);
std::vector<long> rows_b = gen_random_sample(0, N_ROWS, N_VALUES);
std::vector<long> cols_b = gen_random_sample(0, N_COLS, N_VALUES);
std::vector<double> values_b = get_vector_of_rn_doubles(N_VALUES, 0, 1);
for (int i = 0; i < N_VALUES; i++)
    big_B.insert(rows_b[i], cols_b[i]) = values_b[i]; // fixed: was insert(cols_b[i], cols_b[i])
// big_B.makeCompressed();
SparseMatrix<double> big_AB(N_ROWS, N_COLS);
clock_t begin = clock();
big_AB = (big_A * big_B); //.pruned();
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
std::cout << "Time taken : " << elapsed_secs << std::endl;
}
std::vector<long> gen_random_sample(long min, long max, long sample_size)
{
std::vector<long> my_vector(sample_size); // THE BUG: pre-sizing fills the vector with sample_size zeros; the fix is std::vector<long> my_vector;
for (long i = min; i != max; i++)
{
my_vector.push_back(i);
}
std::random_shuffle(my_vector.begin(), my_vector.end());
std::vector<long> new_vec = std::vector<long>(my_vector.begin(), my_vector.begin() + sample_size);
return new_vec;
}
double get_random_double(double min, double max)
{
    // static: seed the engine once instead of re-creating it (with the same
    // default seed) on every call, which would return the same value each time
    static std::default_random_engine re;
    std::uniform_real_distribution<double> unif(min, max);
    return unif(re); // the original was missing this return
}
std::vector<double> get_vector_of_rn_doubles(int length, double min, double max)
{
std::vector<double> my_vector(length);
for (int i=0; i < length; i++)
{
my_vector[i] = get_random_double(min, max);
}
return my_vector;
}
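For reference (per the edit at the top), a corrected sketch of gen_random_sample. This is my reconstruction of the fix, not code from the original post: the buggy version pre-sizes the vector, so the first sample_size slots are zeros that survive the shuffle and produce duplicate, mostly zero indices.

std::vector<long> gen_random_sample(long min, long max, long sample_size)
{
    std::vector<long> my_vector; // start empty: no leading zeros
    my_vector.reserve(max - min);
    for (long i = min; i != max; i++)
        my_vector.push_back(i);
    std::random_shuffle(my_vector.begin(), my_vector.end());
    // the first sample_size entries are now a uniform sample without replacement
    return std::vector<long>(my_vector.begin(), my_vector.begin() + sample_size);
}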
I compiled with: g++ -std=c++11 -I/usr/include/eigen3 time_eigen.cpp -o my_exec -O2 -DNDEBUG
Am I missing a way to do sparse multiplication fast using Eigen?
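For what it's worth, the Eigen documentation recommends batch construction via setFromTriplets over repeated insert calls for random fill-in patterns. A minimal sketch of that variant (my own rewrite, not the benchmark timed above):

#include <vector>
#include <Eigen/Sparse>

// Build a sparse matrix from (row, col, value) triplets in one pass;
// setFromTriplets sorts the entries, sums duplicates, and compresses internally.
Eigen::SparseMatrix<double> build_sparse(long n_rows, long n_cols,
                                         const std::vector<long> &rows,
                                         const std::vector<long> &cols,
                                         const std::vector<double> &values)
{
    std::vector<Eigen::Triplet<double>> triplets;
    triplets.reserve(values.size());
    for (std::size_t i = 0; i < values.size(); i++)
        triplets.emplace_back(rows[i], cols[i], values[i]);

    Eigen::SparseMatrix<double> m(n_rows, n_cols);
    m.setFromTriplets(triplets.begin(), triplets.end());
    return m;
}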
Comments:

Try compiling with the -fopenmp flag, to enable OpenMP parallelization in Eigen. – Greenness

MATLAB is like 10% slower (MATLAB btw is super optimized with Intel MKL). Are you sure that python is not doing some kind of lazy evaluation? You are talking here about 3 orders of magnitude slower. Your code runs in ~13 seconds on my MacBook Pro 2013. – Greenness

clang++ ignores the -fopenmp flag, so you may want to compile with g++ to enable multi-processing. But yes, it doesn't seem to make a difference for matrix multiplication. – Greenness

I can compute AB.sum() instantly, which means that all values in AB are calculated. My C++ code runs in ~13 seconds on your machine, right? – Petro

Yes, C++ runs in 13s, and I get the same ~0.03s for Python. – Greenness
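Regarding the -fopenmp discussion: one quick way to check whether Eigen actually picked up OpenMP is to print its thread count (a small sketch; Eigen::nbThreads() lives in Eigen/Core):

#include <iostream>
#include <Eigen/Core>

int main()
{
    // Reports how many threads Eigen will use for its parallelized kernels;
    // this stays at 1 unless the program is compiled with -fopenmp.
    std::cout << "Eigen threads: " << Eigen::nbThreads() << std::endl;
    return 0;
}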