#include <immintrin.h>

// lzcnt of each 16-bit element (result is 16 for an input of 0)
__m256i avx2_lzcnt_epi16(__m256i v) {
    const __m256i lut_lo = _mm256_set_epi8(
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 16,
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 16
    );
    const __m256i lut_hi = _mm256_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 16,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 16
    );
    const __m256i nibble_mask = _mm256_set1_epi8(0x0F);
    const __m256i byte_offset = _mm256_set1_epi16(0x0008);
    __m256i t;

    // lzcnt of each byte: look up both nibbles (16 is the "nibble is zero"
    // sentinel) and keep the smaller count
    t = _mm256_and_si256(nibble_mask, v);
    v = _mm256_and_si256(_mm256_srli_epi16(v, 4), nibble_mask);
    t = _mm256_shuffle_epi8(lut_lo, t);
    v = _mm256_shuffle_epi8(lut_hi, v);
    v = _mm256_min_epu8(v, t);

    // lzcnt of each word: add 8 to the low byte's count, then take the
    // minimum of the two byte counts
    t = _mm256_srli_epi16(v, 8);
    v = _mm256_or_si256(v, byte_offset);
    v = _mm256_min_epu8(v, t);
    return v;
}
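For reference, a minimal usage sketch (the wrapper name and array layout are mine): one call handles 16 elements, with the count coming back in each 16-bit lane.

#include <stdint.h>

// Hypothetical wrapper: lzcnt of 16 uint16 values through the vector routine.
void lzcnt16_array16(const uint16_t in[16], uint16_t out[16]) {
    __m256i v = _mm256_loadu_si256((const __m256i *)in);
    _mm256_storeu_si256((__m256i *)out, avx2_lzcnt_epi16(v));
}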
// 16 - lzcnt_u16(subwords): the 1-based index of the highest set bit,
// or 0 for an input of 0
__m256i avx2_ms1b_epi16(__m256i v) {
    const __m256i lut_lo = _mm256_set_epi8(
        12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 0,
        12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 0
    );
    const __m256i lut_hi = _mm256_set_epi8(
        16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 13, 0,
        16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 13, 0
    );
    const __m256i nibble_mask = _mm256_set1_epi8(0x0F);
    const __m256i adj = _mm256_set1_epi16(0x1F08);
    __m256i t;

    // ms1b of each byte: look up both nibbles (0 is the "nibble is zero"
    // sentinel) and keep the larger index
    t = _mm256_and_si256(nibble_mask, v);
    v = _mm256_and_si256(_mm256_srli_epi16(v, 4), nibble_mask);
    t = _mm256_shuffle_epi8(lut_lo, t);
    v = _mm256_shuffle_epi8(lut_hi, v);
    v = _mm256_max_epu8(v, t);

    // ms1b of each word: subtract 8 from each low byte (re-basing its index
    // to the word) and 0x1F from each high byte (forcing it negative), so
    // the signed max against the shifted-down high-byte result picks the
    // valid index; all-zero words yield 0
    t = _mm256_srli_epi16(v, 8);
    v = _mm256_sub_epi8(v, adj);
    v = _mm256_max_epi8(v, t);
    return v;
}
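As a sanity check, both routines can be compared against a scalar reference; for every input the two results should satisfy ms1b == 16 - lzcnt. A minimal harness sketch (test values and names are mine; compile with AVX2 enabled, e.g. gcc -mavx2):

#include <stdint.h>
#include <stdio.h>

// Scalar reference: leading zeros of a 16-bit value (16 for x == 0).
static unsigned scalar_lzcnt16(uint16_t x) {
    unsigned n = 0;
    for (int b = 15; b >= 0 && !((x >> b) & 1); --b) n++;
    return n;
}

int main(void) {
    uint16_t in[16], lz[16], ms[16];
    for (int i = 0; i < 16; i++) in[i] = (uint16_t)(i * 2579);  // arbitrary inputs, includes 0
    __m256i v = _mm256_loadu_si256((const __m256i *)in);
    _mm256_storeu_si256((__m256i *)lz, avx2_lzcnt_epi16(v));
    _mm256_storeu_si256((__m256i *)ms, avx2_ms1b_epi16(v));
    for (int i = 0; i < 16; i++)
        if (lz[i] != scalar_lzcnt16(in[i]) || ms[i] != 16 - lz[i])
            printf("mismatch at %d: in=0x%04x lz=%d ms=%d\n", i, in[i], lz[i], ms[i]);
    return 0;
}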
For results packed into uint8, use _mm256_packs_epi16(). For packed results in the correct order, also use _mm256_permute4x64_epi64().
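A minimal sketch of that packing step (the wrapper name is mine): the cross-lane permute is needed because _mm256_packs_epi16() interleaves within 128-bit lanes.

// Pack 32x 16-bit counts (two vectors) into 32x uint8 counts, in order.
__m256i pack_counts_u8(__m256i a, __m256i b) {
    __m256i p = _mm256_packs_epi16(a, b);  // qword order per pack: a0, b0, a1, b1
    return _mm256_permute4x64_epi64(p, _MM_SHUFFLE(3, 1, 2, 0));  // -> a0, a1, b0, b1
}

Signed saturation is harmless here because every count fits in 0..16.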
Solution from r/SIMD.
This solution was also described in the comments here.
You could use vpcmpeqb to simply search for a non-zero byte, then bitscan it. – Jarrell

Or vpcmpgtb to check if any bits above the low 4 are set, then the low 2, then the low 1. Hmm no, you need to be able to produce a unique result for all 8 positions within a byte. (And then something to combine results from pairs of bytes into words.) Even with 16x 16-bit elements per vector, scalar at 1 per clock may still come out ahead. Or maybe even better on Ryzen if you can shl eax, 16 / mov ax, cx to merge 2x 16-bit results into a 32-bit result to store both at once. No CPUs with AVX2 rename AX separately from RAX, so no stalls. – Jarrell

Use vpshufb as a 4-bit LUT after unpacking, then use pmaxub to merge results from the high/low halves of each byte. – Jarrell

Maybe vpmaxsb, so we can use negative to indicate that there are no set bits in this nibble. But yeah, similar to AVX2 popcnt: split into nibbles. Look up how that algorithm works. Except for this we probably want 2 separate LUTs, and we still have to vpaddb for the high bytes of each pair. – Jarrell

You could convert uint16 -> int32 -> float and extract the exponent (which needs to be adjusted, of course). Another problem here is handling the 0 case. If you don't use the result afterwards (in a vectorized way), I doubt that this is worth the effort... – Spotty

For which element size: uint8, uint16, uint32? And what shall be the result for 0 (or will 0 not happen as input)? – Spotty
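A minimal sketch of the float-exponent trick Spotty mentions, under my own assumptions (the helper name is mine, and the 16-bit values are assumed already zero-extended into 32-bit lanes, e.g. via _mm256_cvtepu16_epi32, so it only covers 8 elements per vector):

// Hypothetical: lzcnt of 16-bit values via the float exponent field.
// Conversion is exact because all inputs are < 2^24; the sign bit is 0.
__m256i lzcnt16_via_float(__m256i v) {
    __m256 f = _mm256_cvtepi32_ps(v);
    __m256i e = _mm256_srli_epi32(_mm256_castps_si256(f), 23);   // biased exponent
    __m256i ms1b = _mm256_sub_epi32(e, _mm256_set1_epi32(126));  // 1-based top-bit index
    ms1b = _mm256_max_epi32(ms1b, _mm256_setzero_si256());       // the 0 case: index 0
    return _mm256_sub_epi32(_mm256_set1_epi32(16), ms1b);        // lzcnt = 16 - index
}

Note this handles half as many elements per vector as the LUT version, which is part of why, as the comment says, it may not be worth the effort.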