I keep beating myself over the head with this. I have an SSE-based algorithm for multiplying matrix A by matrix B. I need to also implement the operations for where A, B, or both are transposed. I did a naive implementation of it, the 4x4 matrix code shown below (which is pretty standard SSE operations, I think), but the A*B^T operation takes about twice as long as A*B. The ATLAS implementation shows similar timings for A*B, and nearly identical timings when multiplying by a transpose, which suggests to me that there is an efficient way to do this.
MM-Multiplication:
m1 = (mat1.m_>>2)<<2;
n2 = (mat2.n_>>2)<<2;
n  = (mat1.n_>>2)<<2;
for (k=0; k<n; k+=4) {
    for (i=0; i<m1; i+=4) {
        // fetch: get 4x4 matrix from mat1
        // row-major storage, so get 4 rows
        Float* a0 = mat1.el_[i]+k;
        Float* a1 = mat1.el_[i+1]+k;
        Float* a2 = mat1.el_[i+2]+k;
        Float* a3 = mat1.el_[i+3]+k;
        for (j=0; j<n2; j+=4) {
            // fetch: get 4x4 matrix from mat2
            // row-major storage, so get 4 rows
            Float* b0 = mat2.el_[k]+j;
            Float* b1 = mat2.el_[k+1]+j;
            Float* b2 = mat2.el_[k+2]+j;
            Float* b3 = mat2.el_[k+3]+j;
            __m128 b0r = _mm_loadu_ps(b0);
            __m128 b1r = _mm_loadu_ps(b1);
            __m128 b2r = _mm_loadu_ps(b2);
            __m128 b3r = _mm_loadu_ps(b3);
            { // first row of result += first row of mat1 * 4x4 of mat2
                __m128 cX1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a0+0), b0r), _mm_mul_ps(_mm_load_ps1(a0+1), b1r));
                __m128 cX2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a0+2), b2r), _mm_mul_ps(_mm_load_ps1(a0+3), b3r));
                Float* c0 = this->el_[i]+j;
                _mm_storeu_ps(c0, _mm_add_ps(_mm_add_ps(cX1, cX2), _mm_loadu_ps(c0)));
            }
            { // second row of result += second row of mat1 * 4x4 of mat2
                __m128 cX1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a1+0), b0r), _mm_mul_ps(_mm_load_ps1(a1+1), b1r));
                __m128 cX2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a1+2), b2r), _mm_mul_ps(_mm_load_ps1(a1+3), b3r));
                Float* c1 = this->el_[i+1]+j;
                _mm_storeu_ps(c1, _mm_add_ps(_mm_add_ps(cX1, cX2), _mm_loadu_ps(c1)));
            }
            { // third row of result += third row of mat1 * 4x4 of mat2
                __m128 cX1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a2+0), b0r), _mm_mul_ps(_mm_load_ps1(a2+1), b1r));
                __m128 cX2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a2+2), b2r), _mm_mul_ps(_mm_load_ps1(a2+3), b3r));
                Float* c2 = this->el_[i+2]+j;
                _mm_storeu_ps(c2, _mm_add_ps(_mm_add_ps(cX1, cX2), _mm_loadu_ps(c2)));
            }
            { // fourth row of result += fourth row of mat1 * 4x4 of mat2
                __m128 cX1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a3+0), b0r), _mm_mul_ps(_mm_load_ps1(a3+1), b1r));
                __m128 cX2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a3+2), b2r), _mm_mul_ps(_mm_load_ps1(a3+3), b3r));
                Float* c3 = this->el_[i+3]+j;
                _mm_storeu_ps(c3, _mm_add_ps(_mm_add_ps(cX1, cX2), _mm_loadu_ps(c3)));
            }
        }
        // Code omitted to handle remaining rows and columns
    }
For the MT multiplication (matrix multiplied by transposed matrix), I instead built b0r through b3r with the following commands and changed the loop variables appropriately:
__m128 b0r = _mm_set_ps(b3[0], b2[0], b1[0], b0[0]);
__m128 b1r = _mm_set_ps(b3[1], b2[1], b1[1], b0[1]);
__m128 b2r = _mm_set_ps(b3[2], b2[2], b1[2], b0[2]);
__m128 b3r = _mm_set_ps(b3[3], b2[3], b1[3], b0[3]);
I suspect that the slowdown is partly due to the difference between pulling in a whole row at a time and having to gather 4 separate values each time to build a column, but I feel like the other way of going about this, pulling in rows of B and then multiplying by the columns of A, will just shift the cost over to storing 4 columns of results.
I have also tried pulling in B's rows as rows and then using _MM_TRANSPOSE4_PS(b0r, b1r, b2r, b3r); to do the transposition (I thought there might be some additional optimizations in that macro), but there's no real improvement.
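For reference, this is roughly the shape of that attempt (the standalone helper below is just for illustration; the name and signature are mine, not from my actual code):
#include <xmmintrin.h>

// Illustration only: load a 4x4 block of B one row at a time, then
// transpose it in registers so b0r..b3r end up holding columns.
static inline void load_block_transposed(const float* b0, const float* b1,
                                         const float* b2, const float* b3,
                                         __m128& b0r, __m128& b1r,
                                         __m128& b2r, __m128& b3r)
{
    b0r = _mm_loadu_ps(b0);
    b1r = _mm_loadu_ps(b1);
    b2r = _mm_loadu_ps(b2);
    b3r = _mm_loadu_ps(b3);
    _MM_TRANSPOSE4_PS(b0r, b1r, b2r, b3r); // in-register transpose via unpack/shuffle ops
}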
On the surface, I feel like this should be faster: the dot products involved are row against row, which seems inherently more efficient, but trying to do the dot products straight up just leaves me having to do the same work to store the results.
What am I missing here?
Added: Just to clarify, I'm trying not to transpose the matrices up front; I'd prefer to iterate over them in place. The problem, as best I can tell, is that the _mm_set_ps call is much slower than _mm_load_ps.
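To make that comparison concrete, here is a minimal sketch (the two helper functions are hypothetical, purely for illustration) of how a register gets filled in each kernel: one contiguous 4-float load for A*B versus four scalar loads gathered from four different rows for A*B^T:
#include <xmmintrin.h>

// A*B kernel: one contiguous load fills a register from a single row.
static inline __m128 load_row(const float* row) {
    return _mm_loadu_ps(row);
}

// A*B^T kernel: four scalar loads gathered from four different rows;
// this is what the _mm_set_ps version above boils down to.
static inline __m128 gather_column(const float* b0, const float* b1,
                                   const float* b2, const float* b3) {
    return _mm_set_ps(b3[0], b2[0], b1[0], b0[0]);
}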
I also tried a variation where I stored the 4 rows of the A matrix and then replaced the 4 curly-bracketed segments containing 1 load, 4 multiplies, and 2 adds with 4 multiplies and 3 hadds, but to little avail. The timing stays the same (and yes, I tried it with a debug statement to verify that the code had changed in my test compile; said debug statement was removed before profiling, of course):
{ // first row of result += first row of mat1 * 4x4 of mat2
    __m128 cX1 = _mm_hadd_ps(_mm_mul_ps(a0r, b0r), _mm_mul_ps(a0r, b1r));
    __m128 cX2 = _mm_hadd_ps(_mm_mul_ps(a0r, b2r), _mm_mul_ps(a0r, b3r));
    Float* c0 = this->el_[i]+j;
    _mm_storeu_ps(c0, _mm_add_ps(_mm_hadd_ps(cX1, cX2), _mm_loadu_ps(c0)));
}
{ // second row of result += second row of mat1 * 4x4 of mat2
    __m128 cX1 = _mm_hadd_ps(_mm_mul_ps(a1r, b0r), _mm_mul_ps(a1r, b1r));
    __m128 cX2 = _mm_hadd_ps(_mm_mul_ps(a1r, b2r), _mm_mul_ps(a1r, b3r));
    Float* c0 = this->el_[i+1]+j;
    _mm_storeu_ps(c0, _mm_add_ps(_mm_hadd_ps(cX1, cX2), _mm_loadu_ps(c0)));
}
{ // third row of result += third row of mat1 * 4x4 of mat2
    __m128 cX1 = _mm_hadd_ps(_mm_mul_ps(a2r, b0r), _mm_mul_ps(a2r, b1r));
    __m128 cX2 = _mm_hadd_ps(_mm_mul_ps(a2r, b2r), _mm_mul_ps(a2r, b3r));
    Float* c0 = this->el_[i+2]+j;
    _mm_storeu_ps(c0, _mm_add_ps(_mm_hadd_ps(cX1, cX2), _mm_loadu_ps(c0)));
}
{ // fourth row of result += fourth row of mat1 * 4x4 of mat2
    __m128 cX1 = _mm_hadd_ps(_mm_mul_ps(a3r, b0r), _mm_mul_ps(a3r, b1r));
    __m128 cX2 = _mm_hadd_ps(_mm_mul_ps(a3r, b2r), _mm_mul_ps(a3r, b3r));
    Float* c0 = this->el_[i+3]+j;
    _mm_storeu_ps(c0, _mm_add_ps(_mm_hadd_ps(cX1, cX2), _mm_loadu_ps(c0)));
}
Update: Right, and moving the loads of a0r through a3r into the curly braces, in an attempt to avoid register thrashing, failed as well.
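Concretely, that attempt just moved each row load of A inside its block; for the first row it looked roughly like this (a0, b0r through b3r, i, and j as in the hadd version above):
{ // first row of result += first row of mat1 * 4x4 of mat2
    __m128 a0r = _mm_loadu_ps(a0); // row load moved inside the block
    __m128 cX1 = _mm_hadd_ps(_mm_mul_ps(a0r, b0r), _mm_mul_ps(a0r, b1r));
    __m128 cX2 = _mm_hadd_ps(_mm_mul_ps(a0r, b2r), _mm_mul_ps(a0r, b3r));
    Float* c0 = this->el_[i]+j;
    _mm_storeu_ps(c0, _mm_add_ps(_mm_hadd_ps(cX1, cX2), _mm_loadu_ps(c0)));
}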