Would we slice as many of the vector that would fit in the SIMD vector and just manually process the remainder? Is there a better way to do this?
Pretty much this. A basic example that processes all number in batches of 8, and uses mask load/maskstore to handle the remainder.
void add(int* const r, const int* const a, const int* const b, const unsigned count) {
// how many blocks of 8, and how many left over
const unsigned c8 = count & ~0x7U;
const unsigned cr = count & 0x7U;
// process blocks of 8
for(unsigned i = 0; i < c8; i += 8) {
__m256i _a = _mm256_loadu_si256((__m256i*)(a + i));
__m256i _b = _mm256_loadu_si256((__m256i*)(b + i));
__m256i _c = _mm256_add_epi32(_a, _b);
_mm256_storeu_si256((__m256i*)(c + i), _c);
}
const __m128i temp[5] = {
_mm_setr_epi32(0, 0, 0, 0),
_mm_setr_epi32(-1, 0, 0, 0),
_mm_setr_epi32(-1, -1, 0, 0),
_mm_setr_epi32(-1, -1, -1, 0),
_mm_setr_epi32(-1, -1, -1, -1)
};
// I'm using mask load / mask store for the remainder here.
// (this is not the only approach)
__m256i mask;
if(cr >= 4) {
mask = _mm256_set_m128i(temp[cr&3], temp[4]);
} else {
mask = _mm256_set_m128i(temp[0], temp[cr]);
}
__m256i _a = _mm256_maskload_epi32((a + c8), mask);
__m256i _b = _mm256_maskload_epi32((b + c8), mask);
__m256i _c = _mm256_add_epi32(_a, _b);
_mm256_maskstore_epi32((c + c8), mask, _c);
}
Of course, if you happen to use your own containers (or provide your own allocators), then you can avoid most of this mess by simply ensuring all container allocations occur in multiples of 256bits.
// yes, this class is missing a lot...
class MyIntArray {
public:
MyIntArray(unsigned count, const int* data) {
// bump capacity to next multiple of 8
unsigned cap = count & 7;
if(cap) cap = 8 - cap;
capacity = cap + count;
// allocation is aligned to 256bit
alloc = new int[capacity];
size = count;
memcpy(alloc, data, sizeof(int) * size);
}
MyIntArray(unsigned count) {
// bump capacity to next multiple of 8
unsigned cap = count & 7;
if(cap) cap = 8 - cap;
capacity = cap + count;
// allocation is aligned to 256bit
alloc = new int[capacity];
size = count;
}
unsigned capacity;
unsigned size;
int* alloc;
int* begin() { return alloc; }
int* end() { return alloc + size; }
const int* begin() const { return alloc; }
const int* end() const { return alloc + size; }
};
void add(MyIntArray r, const MyIntArray a, const MyIntArray b) {
// process blocks of 8.
// we may be stamping beyond the end of the array, but not over the
// the end of the capacity allocation....
// (probably also want to check to see if the sizes match!).
for(unsigned i = 0; i < r.size; i += 8) {
__m256i _a = _mm256_loadu_si256((__m256i*)(a.alloc + i));
__m256i _b = _mm256_loadu_si256((__m256i*)(b.alloc + i));
__m256i _c = _mm256_add_epi32(_a, _b);
_mm256_storeu_si256((__m256i*)(c.alloc + i), _c);
}
}