Fast, portable 4-instruction solution for GCC/Clang and MSVC
This gives the best performance on common compilers and platforms, with fallbacks for everything else:
#include <stdint.h>
#include <limits.h>
#ifdef _MSC_VER
unsigned char _BitScanReverse64(unsigned long*Index,unsigned __int64 Mask);
#pragma intrinsic(_BitScanReverse64)
#endif
// nextPow2int32 - round a 32-bit value up to a power of two.
//
// Returns the smallest power of two that is >= in. The result is 0 when
// in == 0 and when the answer would not fit in 32 bits (in > 2^31).
//
// On targets whose intptr_t is wider than 32 bits the parameter is declared
// as uint64_t, but the value passed must still fit in 32 bits: the GCC/Clang
// paths mark larger values as unreachable, so passing one is undefined.
//
// All fast paths share one idea: the highest set bit of 2*in - 1 sits exactly
// at the position of the wanted power of two, so the result is that one bit.
// 2*in - 1 is always odd, hence never zero, so bit scans are always defined.
#if defined(INTPTR_MAX) && INTPTR_MAX > INT32_MAX
uint32_t nextPow2int32(uint64_t in) { // Returns next power of 2
#else
uint32_t nextPow2int32(uint32_t in) { // Returns next power of 2
#endif
#if defined(__GNUC__) && defined(__x86_64__)
    if (in > UINT32_MAX) __builtin_unreachable();
    // Widen explicitly before handing the value to the asm. The template
    // reads the full 64-bit register (%q), and on the x32 ABI `in` is only
    // 32 bits wide, so the upper half of its register is not guaranteed to
    // be zero. On LP64 this copy is a no-op.
    uint64_t wide = in;
    uint64_t out, tmp;
    // lea: tmp = 2*in - 1; bsr: tmp = index of highest set bit;
    // xor: out = 0;        btc: flip (i.e. set) bit `tmp` of out.
    // Each line carries both AT&T and Intel spellings ({att|intel}).
    __asm__("lea{q -1(,%q[in],2), %q[tmp]| %q[tmp], [%q[in]*2-1]}"
            "\n\tbsr{q %q[tmp], %q[tmp]| %q[tmp], %q[tmp]}"
            "\n\txor{l %k[out], %k[out]| %k[out], %k[out]}"
            "\n\tbtc{q %q[tmp], %q[out]| %q[out], %q[tmp]}"
            : [out]"=r"(out), [tmp]"=r"(tmp) : [in]"r"(wide));
    // Bit 32 and above (input 0 or input > 2^31) is truncated away, giving 0.
    return (uint32_t)out;
#elif defined(__GNUC__) && defined(__aarch64__)
    uint64_t x = (uint64_t)in * 2 - 1;
    // Reverse the bits so the highest set bit becomes the lowest one.
    __asm__("rbit %0, %1" : "=r"(x) : "r"(x));
    // Isolate the lowest set bit of the reversed value.
    uint64_t y = x & -x;
    // Reverse back: y is now only the highest set bit of 2*in - 1.
    __asm__("rbit %0, %1" : "=r"(y) : "r"(y));
    return (uint32_t)y;
#elif defined(__GNUC__)
    if (in > UINT32_MAX) __builtin_unreachable();
    unsigned long long inT2M1 = 2 * (unsigned long long)in - 1;
    // XOR with (width - 1) turns the leading-zero count into the index of
    // the highest set bit. inT2M1 is odd, so clzll never sees zero.
    unsigned msb = (unsigned)__builtin_clzll(inT2M1)
                 ^ (unsigned)(sizeof(unsigned long long) * CHAR_BIT - 1);
    return (uint32_t)(1ULL << msb);
#elif defined(_MSC_VER)
    unsigned __int64 inT2M1 = 2 * (unsigned __int64)in - 1;
    // NOTE(review): res is seeded from the mask rather than a constant,
    // presumably so the compiler can scan in place without an extra
    // initialising instruction - confirm before changing.
    unsigned long res = (unsigned long)inT2M1;
    _BitScanReverse64(&res, inT2M1);
    // Shift an unsigned 1: res can be 63, which would overflow a signed 1.
    return (uint32_t)((unsigned __int64)1 << res);
#else // otherwise, use the slow fallback
    // Smear the highest set bit of in - 1 into every lower position using
    // shifts of 27, then 9/18, then 3/6, then 1/2, and finally add one.
    in -= 1;
    if (in >> 27) {
        in |= in >> 27; // very unlikely, so free
    }
    in |= (in >> 9) | (in >> 18);
    in |= (in >> 3) | (in >> 6);
    return (in | (in >> 1) | (in >> 2)) + 1;
#endif
}
This yields the following ultra-compact 4-instruction assembly on x86_64 GCC/Clang:
# Compiler output for nextPow2int32 (Intel syntax); the argument arrives in rdi.
nextPow2int32:
lea rdx, [rdi*2-1]   # rdx = 2*in - 1
bsr rdx, rdx         # rdx = index of the highest set bit of rdx
xor eax, eax         # rax = 0 (a write to eax clears the upper half too)
btc rax, rdx         # complement bit rdx of rax; rax was 0, so the bit is set
ret                  # result is the low 32 bits of rax (eax)
For MSVC on x86_64:
; MSVC output for nextPow2int32; the argument arrives in rcx.
nextPow2int32 PROC
lea rcx, QWORD PTR [rcx*2-1]   ; rcx = 2*in - 1
mov eax, 1                     ; rax = 1, the bit that will be shifted into place
bsr rcx, rcx                   ; rcx = index of the highest set bit of rcx
shl rax, cl                    ; rax = 1 << index
ret 0                          ; result is the low 32 bits of rax (eax)
nextPow2int32 ENDP
Test:
#include <stdio.h>
#include <inttypes.h>
// Prints nextPow2int32() for the inputs 0 through 7. The trailing comment on
// each line is the value the function returns for that input.
int main(void) {
    printf("nextPow2int32(0) = %" PRIu32 "\n", nextPow2int32(0)); // 0
    printf("nextPow2int32(1) = %" PRIu32 "\n", nextPow2int32(1)); // 1
    printf("nextPow2int32(2) = %" PRIu32 "\n", nextPow2int32(2)); // 2
    printf("nextPow2int32(3) = %" PRIu32 "\n", nextPow2int32(3)); // 4
    printf("nextPow2int32(4) = %" PRIu32 "\n", nextPow2int32(4)); // 4
    printf("nextPow2int32(5) = %" PRIu32 "\n", nextPow2int32(5)); // 8
    printf("nextPow2int32(6) = %" PRIu32 "\n", nextPow2int32(6)); // 8
    printf("nextPow2int32(7) = %" PRIu32 "\n", nextPow2int32(7)); // 8
    return 0;
}