I believe this should be the fastest. It uses a lookup table for all 256 possible byte values and avoids unnecessary shifts by exploiting the little-endian layout of the table, so each reversed byte is loaded directly into the byte lane where it belongs. The add with a memory source operand should micro-fuse nicely on Intel processors and be very fast. It also uses ARM's rbit instruction where available, in both MSVC and GCC/Clang.
#include <stdint.h>
#if defined(__clang__)
__attribute__((minsize))
#elif defined(__GNUC__)
__attribute__((optimize("Os")))
#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
unsigned int _arm_rbit(unsigned int _Rm);
#pragma intrinsic(_arm_rbit)
#endif
uint32_t fastBitReversal32(uint32_t in) {
#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
uint32_t out;
__asm__ inline ("rbit %0, %1" : "=r"(out) : "r"(in));
return out;
#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
return _arm_rbit(in);
#elif defined(__x86_64__) || defined(_M_AMD64)
// leverage unaligned 32-bit loads on x86-64
#ifdef __GNUC__
__attribute__((aligned(2048))) // avoid crossing a page boundary
#elif defined(_MSC_VER)
__declspec(align(2048))
#endif
static const uint32_t lookup[257] = {0,0,128,64,192,32,160,96,224,
16,144,80,208,48,176,112,240,8,136,72,200,40,168,104,232,24,
152,88,216,56,184,120,248,4,132,68,196,36,164,100,228,20,148,
84,212,52,180,116,244,12,140,76,204,44,172,108,236,28,156,92,
220,60,188,124,252,2,130,66,194,34,162,98,226,18,146,82,210,
50,178,114,242,10,138,74,202,42,170,106,234,26,154,90,218,58,
186,122,250,6,134,70,198,38,166,102,230,22,150,86,214,54,182,
118,246,14,142,78,206,46,174,110,238,30,158,94,222,62,190,126,
254,1,129,65,193,33,161,97,225,17,145,81,209,49,177,113,241,9,
137,73,201,41,169,105,233,25,153,89,217,57,185,121,249,5,133,
69,197,37,165,101,229,21,149,85,213,53,181,117,245,13,141,77,
205,45,173,109,237,29,157,93,221,61,189,125,253,3,131,67,195,
35,163,99,227,19,147,83,211,51,179,115,243,11,139,75,203,43,
171,107,235,27,155,91,219,59,187,123,251,7,135,71,199,39,167,
103,231,23,151,87,215,55,183,119,247,15,143,79,207,47,175,111,
239,31,159,95,223,63,191,127,255};
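// lookup[b+1] is the bit-reversed value of byte b. Every entry fits in one
// byte, so an unaligned 32-bit load at byte offset 4*b+k (k = 1..4) yields
// reverse(b) << (8*(4-k)) on a little-endian machine; the four per-byte
// results land in disjoint byte lanes and can simply be added together.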
#if defined(__GNUC__)
uint32_t out, tmp;
__asm__ inline (
"movzx{l %b[in], %[out]| %[out], %b[in]}\n\t"
"mov{l %p[lu]+1(,%[out],4), %[out]| %[out], DWORD PTR %p[lu][%[out]*4+1]}\n\t"
#ifdef __AVX2__
"rorx{l $8, %[in], %[tmp]| %[tmp], %[in], 8}\n\t"
#else
"mov{l %[in], %[tmp]| %[tmp], %[in]}\n\t"
"shr{l $8, %[tmp]| %[tmp], 8}\n\t"
#endif
"shr{l $24, %[in]| %[in], 24}\n\t"
"add{l %p[lu]+4(,%[in],4), %[out]| %[out], DWORD PTR %p[lu][%[in]*4+4]}\n\t"
"movzx{l %h[tmp], %[in]| %[in], %h[tmp]}\n\t"
"movzx{l %b[tmp], %[tmp]| %[tmp], %b[tmp]}\n\t"
"add{l %p[lu]+3(,%[in],4), %[out]| %[out], DWORD PTR %p[lu][%[in]*4+3]}\n\t"
"add{l %p[lu]+2(,%[tmp],4), %[out]| %[out], DWORD PTR %p[lu][%[tmp]*4+2]}"
: [out] "=r" (out), [tmp] "=Q" (tmp), [in] "+r" (in)
: [lu] "m" (lookup)
);
return out;
#else
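// Plain C version of the same trick (e.g. MSVC x64): the byte offset added
// to the table address selects how far left the reversed byte ends up.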
return *(uint32_t*)((char*)lookup + 1 + 4*(uint8_t)(in >> 0))
+ *(uint32_t*)((char*)lookup + 2 + 4*(uint8_t)(in >> 8))
+ *(uint32_t*)((char*)lookup + 3 + 4*(uint8_t)(in >> 16))
+ *(uint32_t*)((char*)lookup + 4 + 4*(uint8_t)(in >> 24));
#endif
#else
// fallback to regular slow path
uint32_t x = (in >> 16) | (in << 16);
x = ((x << 8) & UINT32_C(0xff00ff00)) | ((x & UINT32_C(0xff00ff00)) >> 8);
x = ((x << 4) & UINT32_C(0xf0f0f0f0)) | ((x & UINT32_C(0xf0f0f0f0)) >> 4);
x = ((x << 2) & UINT32_C(0xcccccccc)) | ((x & UINT32_C(0xcccccccc)) >> 2);
x = ((x << 1) & UINT32_C(0xaaaaaaaa)) | ((x & UINT32_C(0xaaaaaaaa)) >> 1);
return x;
#endif
}
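In case anyone wants to verify correctness, here is a minimal sanity check I would compile together with the function above. slowBitReversal32 is just the obvious bit-by-bit reference loop (my own name, not part of the code above or any library):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Reference implementation: shift each bit of 'in' out the bottom into 'out'.
static uint32_t slowBitReversal32(uint32_t in) {
    uint32_t out = 0;
    for (int i = 0; i < 32; i++) {
        out = (out << 1) | (in & 1);
        in >>= 1;
    }
    return out;
}

int main(void) {
    // Spot checks with known results.
    assert(fastBitReversal32(UINT32_C(0x00000001)) == UINT32_C(0x80000000));
    assert(fastBitReversal32(UINT32_C(0x12345678)) == UINT32_C(0x1E6A2C48));
    // Compare against the reference on a spread of scrambled values.
    for (uint32_t i = 0; i < (UINT32_C(1) << 20); i++) {
        uint32_t v = i * UINT32_C(2654435761);
        assert(fastBitReversal32(v) == slowBitReversal32(v));
    }
    puts("ok");
    return 0;
}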