I made some benchmark between the two approaches:
- transpose + popcount
- update row by row
I wrote a naive version and an AVX2 one for both approaches. I used some functions (found on stackoverflow or elsewhere) for the AVX2 "transpose+popcount" approach.
In my test, I make the assumption that the input is a nbRowsx32
matrix in a bits packed format (nbRows itself being a multiple of 32); the matrix is therefore stored as an array of uint32_t
.
The code is the following:
#include <cinttypes>
#include <cstdio>
#include <cstring>
#include <cmath>
#include <cassert>
#include <chrono>
#include <immintrin.h>
#include <asmlib.h>
using namespace std;
using namespace std::chrono;
// see https://mcmap.net/q/14410/-fastest-way-to-unpack-32-bits-to-a-32-byte-simd-vector
static __m256i expand_bits_to_bytes (uint32_t x);
// see https://mischasan.wordpress.com/2011/10/03/the-full-sse2-bit-matrix-transpose-routine/
static void sse_trans(char const *inp, char *out);
static double deviation (double n, double sum2, double sum);
////////////////////////////////////////////////////////////////////////////////
// Naive approach (matrix transposition)
////////////////////////////////////////////////////////////////////////////////
void test_transpose_popcnt_naive (uint64_t nbRows, const uint32_t* bitmap, uint64_t* globalSums)
{
assert (nbRows%32==0);
uint8_t transpo[32][32]; memset (transpo, 0, sizeof(transpo));
for (uint64_t k=0; k<nbRows; k+=32)
{
// We unpack and transpose the input into a 32x32 bytes matrix
for (size_t row=0; row<32; row++)
{
for (size_t col=0; col<32; col++) { transpo[col][row] = (bitmap[k+row] >> col) & 1 ; }
}
for (size_t row=0; row<32; row++)
{
// We popcount the current row
u_int8_t sum=0;
for (size_t col=0; col<32; col++) { sum += transpo[row][col]; }
// We update the corresponding global sum
globalSums[row] += sum;
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Naive approach (row by row)
////////////////////////////////////////////////////////////////////////////////
void test_update_row_by_row_naive (uint64_t nbRows, const uint32_t* bitmap, uint64_t* globalSums)
{
for (uint64_t row=0; row<nbRows; row++)
{
for (size_t col=0; col<32; col++)
{
globalSums[col] += (bitmap[row] >> col) & 1;
}
}
}
////////////////////////////////////////////////////////////////////////////////
// AVX2 (matrix transposition + popcount)
////////////////////////////////////////////////////////////////////////////////
void test_transpose_popcnt_avx2 (uint64_t nbRows, const uint32_t* bitmap, uint64_t* globalSums)
{
assert (nbRows%32==0);
uint32_t transpo[32];
const uint32_t* loop = bitmap;
for (uint64_t k=0; k<nbRows; loop+=32, k+=32)
{
// We transpose the input as a 32x32 bytes matrix
sse_trans ((const char*)loop, (char*)transpo);
// We update the global sums
for (size_t i=0; i<32; i++)
{
globalSums[i] += __builtin_popcount (transpo[i]);
}
}
}
////////////////////////////////////////////////////////////////////////////////
// AVX2 approach (update totals row by row)
////////////////////////////////////////////////////////////////////////////////
// Note: we use template specialization to unroll some portions of a loop
template<int N>
void UpdateLocalSums (__m256i& localSums, const uint32_t* bitmap, uint64_t& k)
{
// We update the local sums with the current row
localSums = _mm256_sub_epi8 (localSums, expand_bits_to_bytes (bitmap[k++]));
// Go recursively
UpdateLocalSums<N-1>(localSums, bitmap, k);
}
template<>
void UpdateLocalSums<0> (__m256i& localSums, const uint32_t* bitmap, uint64_t& k)
{
}
// Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums with AVX2
#define USE_AVX2_FOR_GRAND_TOTALS 1
void test_update_row_by_row_avx2 (uint64_t nbRows, const uint32_t* bitmap, uint64_t* globalSums)
{
union U256i { __m256i v; uint8_t a[32]; uint32_t b[8]; };
// We use 1 register for updating local totals
__m256i localSums = _mm256_setzero_si256();
#ifdef USE_AVX2_FOR_GRAND_TOTALS
// Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums with AVX2
__m256i globalSumsReg[4]; for (size_t r=0; r<4; r++) { globalSumsReg[r] = _mm256_setzero_si256(); }
#endif
uint64_t steps = nbRows / 255;
uint64_t k=0;
const int divisorOf255 = 5;
// We iterate over all rows
for (uint64_t i=0; i<steps; i++)
{
// we update the local totals (255*32=8160 additions)
for (int j=0; j<255/divisorOf255; j++)
{
// unroll some portion of the 255 loop through template specialization
UpdateLocalSums<divisorOf255>(localSums, bitmap, k);
}
#ifdef USE_AVX2_FOR_GRAND_TOTALS
// Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums
// We take the 128 high bits of the local sums
__m256i localSums2 = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(localSums,1));
globalSumsReg[0] = _mm256_add_epi32 (globalSumsReg[0],
_mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums, 0)))
);
globalSumsReg[1] = _mm256_add_epi32 (globalSumsReg[1],
_mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums, 8)))
);
globalSumsReg[2] = _mm256_add_epi32 (globalSumsReg[2],
_mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums2, 0)))
);
globalSumsReg[3] = _mm256_add_epi32 (globalSumsReg[3],
_mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums2, 8)))
);
#else
// we update the global totals
U256i tmp = { localSums };
for (size_t k=0; k<32; k++) { globalSums[k] += tmp.a[k]; }
#endif
// we reset the local totals
localSums = _mm256_setzero_si256();
}
#ifdef USE_AVX2_FOR_GRAND_TOTALS
// We update the global totals into the final uint32_t array
for (size_t r=0; r<4; r++)
{
U256i tmp = { globalSumsReg[r] };
for (size_t k=0; k<8; k++) { globalSums[r*8+k] += tmp.b[k]; }
}
#endif
// we update the remaining local totals
for (uint64_t i=steps*255; i<nbRows; i++)
{
UpdateLocalSums<1>(localSums, bitmap, k);
}
// we update the global totals
U256i tmp = { localSums };
for (size_t k=0; k<32; k++) { globalSums[k] += tmp.a[k]; }
}
////////////////////////////////////////////////////////////////////////////////
void execute (
const char* name,
void (*fct)(uint64_t nbRows, const uint32_t* bitmap, uint64_t* globalSums),
size_t nbRuns,
uint64_t nbRows,
u_int32_t* bitmap
)
{
uint64_t sums[32];
double timeTotal=0;
double cycleTotal=0;
double timeTotal2=0;
double cycleTotal2=0;
uint64_t check=0;
for (size_t n=0; n<nbRuns; n++)
{
memset(sums,0,sizeof(sums));
// We want both time and cpu cycles information
milliseconds t0 = duration_cast< milliseconds >(system_clock::now().time_since_epoch());
uint64_t c0 = ReadTSC();
// We run the test
(*fct) (nbRows, bitmap, sums);
uint64_t c1 = ReadTSC();
milliseconds t1 = duration_cast< milliseconds >(system_clock::now().time_since_epoch());
timeTotal += (t1-t0).count();
cycleTotal += (double)(c1-c0) / nbRows;
timeTotal2 += (t1-t0).count() * (t1-t0).count();
cycleTotal2 += ((double)(c1-c0) / nbRows) * ((double)(c1-c0) / nbRows);
// We compute some dummy checksum
for (size_t k=0; k<32; k++) { check += sums[k]; }
}
printf ("%-21s | %5.0lf (%5.1lf) | %5.2lf (%4.2lf) | %.3lf | 0x%lx\n",
name,
timeTotal / nbRuns,
deviation (nbRuns, timeTotal2, timeTotal),
cycleTotal/nbRuns,
deviation (nbRuns, cycleTotal2, cycleTotal),
check,
nbRows * cycleTotal / timeTotal / 1000000.0
);
}
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
// We set rows number as 2^n where n is the provided argument
// For simplification, we assume that the rows number is a multiple of 32
uint64_t nbRows = 1ULL << (argc>1 ? atoi(argv[1]) : 28);
size_t nbRuns = argc>2 ? atoi(argv[2]) : 10;
// We build an bitmap of size nbRows*32
uint32_t* bitmap = new uint32_t[nbRows];
if (bitmap==nullptr)
{
fprintf(stderr, "unable to allocate the bitmap\n");
exit(1);
}
// We fill the bitmap with random values
srand(time(nullptr));
for (uint64_t i=0; i<nbRows; i++) { bitmap[i] = rand() & 0xFFFFFFFF; }
printf ("\n");
printf ("nbRows=%ld nbRuns=%ld\n", nbRows, nbRuns);
printf ("------------------------------------------------------------------------------------------------------------\n");
printf ("name | time in msec : mean (sd) | cycles/row : mean (sd) | frequency in GHz | checksum\n");
printf ("------------------------------------------------------------------------------------------------------------\n");
// We launch the benchmark
execute ("naive (transpo) ", test_transpose_popcnt_naive, nbRuns, nbRows, bitmap);
execute ("naive (row by row)", test_update_row_by_row_naive, nbRuns, nbRows, bitmap);
execute ("AVX2 (transpo) ", test_transpose_popcnt_avx2, nbRuns, nbRows, bitmap);
execute ("AVX2 (row by row)", test_update_row_by_row_avx2, nbRuns, nbRows, bitmap);
printf ("\n");
// Some clean up
delete[] bitmap;
return EXIT_SUCCESS;
}
////////////////////////////////////////////////////////////////////////////////
__m256i expand_bits_to_bytes(uint32_t x)
{
__m256i xbcast = _mm256_set1_epi32(x);
// Each byte gets the source byte containing the corresponding bit
__m256i shufmask = _mm256_set_epi64x(
0x0303030303030303, 0x0202020202020202,
0x0101010101010101, 0x0000000000000000);
__m256i shuf = _mm256_shuffle_epi8(xbcast, shufmask);
__m256i andmask = _mm256_set1_epi64x(0x8040201008040201); // every 8 bits -> 8 bytes, pattern repeats.
__m256i isolated_inverted = _mm256_and_si256(shuf, andmask);
// Avoid an _mm256_add_epi8 thanks to Peter Cordes's comment
return _mm256_cmpeq_epi8(isolated_inverted, andmask);
}
////////////////////////////////////////////////////////////////////////////////
void sse_trans(char const *inp, char *out)
{
#define INP(x,y) inp[(x)*4 + (y)/8]
#define OUT(x,y) out[(y)*4 + (x)/8]
int rr, cc, i, h;
union { __m256i x; uint8_t b[32]; } tmp;
for (cc = 0; cc < 32; cc += 8)
{
for (i = 0; i < 32; ++i)
tmp.b[i] = INP(i, cc);
for (i = 8; i--; tmp.x = _mm256_slli_epi64(tmp.x, 1))
*(uint32_t*)&OUT(0, cc + i) = _mm256_movemask_epi8(tmp.x);
}
}
////////////////////////////////////////////////////////////////////////////////
double deviation (double n, double sum2, double sum) { return sqrt (sum2/n - (sum/n)*(sum/n)); }
Some remarks:
- I used the Agner Fog's asmlib to have a function that returns CPU cycles
- The compilation command is
g++ -O3 -march=native ../Test.cpp -o ./Test -laelf64
- The gcc version is 7.3.1
- The CPU is Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
- I compute some dummy checksum to compare the results of the different tests
Now the results:
------------------------------------------------------------------------------------------------------------
name | time in msec : mean (sd) | cycles/row : mean (sd) | frequency in GHz | checksum
------------------------------------------------------------------------------------------------------------
naive (transpo) | 4548 ( 36.5) | 43.91 (0.35) | 2.592 | 0x9affeb5a6
naive (row by row) | 3033 ( 11.0) | 29.29 (0.11) | 2.592 | 0x9affeb5a6
AVX2 (transpo) | 767 ( 12.8) | 7.40 (0.12) | 2.592 | 0x9affeb5a6
AVX2 (row by row) | 130 ( 4.0) | 1.25 (0.04) | 2.591 | 0x9affeb5a6
So it seems that the "row by row" in AVX2 is the best so far.
Note that when I saw this result (less than 2 cycles per row), I made no more effort to optimize the AVX2 "transpose+popcount" method, which should be feasable by computing several popcounts in parallel (I may test it later).
_mm256_srli_si256
to shift the bytes 0, 64, 128, and 192 bytes,_mm256_castsi256_si128
to get the last 128 bits as a__mm128
,_mm256_cvtepu8_epi32
to convert the last 8 bytes of the__m128
to 0-extended 32-bit integers. Then proceed to add these to the grand total avx2 registers. Once you've handled all rows, then go ahead and update the uint32_t array values. – Steady__mm256
register. Normally you'd shift, mask, and then add. Instead, mask and add. The overflow would be preserved by the preceding bits for all the but the first of the 32 columns, and just shift everything right before updating the grand totals. The problem is handling that first column. – Steady