You may find that reducing the data size gives you a considerable performance improvement if you are performing this calculation over a large number of matrices:
#include <cstdint>
#include <cstdlib>
using T = std::uint_fast8_t;
void mpy(T A[3][3], T B[3][3], T C[3][3])
{
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 3; ++k) {
C[i][j] = C[i][j] + A[i][k] * B[k][j];
}
}
}
}
The pentium can move and sign-extend an 8-bit value in one instruction.
This means you're getting 4 times as many matrices per cache line.
UPDATE: curiosity piqued, I wrote a test:
#include <random>
#include <utility>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <typeinfo>
template<class T>
struct matrix
{
static constexpr std::size_t rows = 3;
static constexpr std::size_t cols = 3;
static constexpr std::size_t size() { return rows * cols; }
template<class Engine, class U>
matrix(Engine& engine, std::uniform_int_distribution<U>& dist)
: matrix(std::make_index_sequence<size()>(), engine, dist)
{}
template<class U>
matrix(std::initializer_list<U> li)
: matrix(std::make_index_sequence<size()>(), li)
{
}
matrix()
: _data { 0 }
{}
const T* operator[](std::size_t i) const {
return std::addressof(_data[i * cols]);
}
T* operator[](std::size_t i) {
return std::addressof(_data[i * cols]);
}
private:
template<std::size_t...Is, class U, class Engine>
matrix(std::index_sequence<Is...>, Engine& eng, std::uniform_int_distribution<U>& dist)
: _data { (void(Is), dist(eng))... }
{}
template<std::size_t...Is, class U>
matrix(std::index_sequence<Is...>, std::initializer_list<U> li)
: _data { ((Is < li.size()) ? *(li.begin() + Is) : 0)... }
{}
T _data[rows * cols];
};
template<class T>
matrix<T> operator*(const matrix<T>& A, const matrix<T>& B)
{
matrix<T> C;
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 3; ++k) {
C[i][j] = C[i][j] + A[i][k] * B[k][j];
}
}
}
return C;
}
static constexpr std::size_t test_size = 1000000;
template<class T, class Engine>
void fill(std::vector<matrix<T>>& v, Engine& eng, std::uniform_int_distribution<T>& dist)
{
v.clear();
v.reserve(test_size);
generate_n(std::back_inserter(v), test_size,
[&] { return matrix<T>(eng, dist); });
}
template<class T>
void test(std::random_device& rd)
{
std::mt19937 eng(rd());
std::uniform_int_distribution<T> distr(0, 15);
std::vector<matrix<T>> As, Bs, Cs;
fill(As, eng, distr);
fill(Bs, eng, distr);
fill(Cs, eng, distr);
auto start = std::chrono::high_resolution_clock::now();
auto ia = As.cbegin();
auto ib = Bs.cbegin();
for (auto&m : Cs)
{
m = *ia++ * *ib++;
}
auto stop = std::chrono::high_resolution_clock::now();
auto diff = stop - start;
auto millis = std::chrono::duration_cast<std::chrono::microseconds>(diff).count();
std::cout << "for type " << typeid(T).name() << " time is " << millis << "us" << std::endl;
}
int main() {
//Random number generator
std::random_device rd;
test<std::uint64_t>(rd);
test<std::uint32_t>(rd);
test<std::uint16_t>(rd);
test<std::uint8_t>(rd);
}
Example output (recent macbook pro, 64-bit, compiled with -O3):
for type y
time is 32787 μs
for type j
time is 15323 μs
for type t
time is 14347 μs
for type h
time is 31550 μs
Summary:
on this platform, int32 and int16 proved to be as fast as each other. int64 and int8 were equally slow (the 8-bit result surprised me).
Conclusion:
As ever, express intent to the compiler and let the optimiser do its thing. If the program is running too slowly in production, take measurements and optimise the worst-offenders.
pshufb
). SSE/AVX aren't as powerful as BMI1/BMI2 (esppext
/pdep
) for manipulating elements narrower than a byte, of course. – Melar