I have benchmarked several implementations of a 64 bit "highest bit". The most "branch free" code is not in fact the fastest.
This is my highest-bit.c
source file:
int highest_bit_unrolled(unsigned long long n)
{
if (n & 0xFFFFFFFF00000000) {
if (n & 0xFFFF000000000000) {
if (n & 0xFF00000000000000) {
if (n & 0xF000000000000000) {
if (n & 0xC000000000000000)
return (n & 0x8000000000000000) ? 64 : 63;
else
return (n & 0x2000000000000000) ? 62 : 61;
} else {
if (n & 0x0C00000000000000)
return (n & 0x0800000000000000) ? 60 : 59;
else
return (n & 0x0200000000000000) ? 58 : 57;
}
} else {
if (n & 0x00F0000000000000) {
if (n & 0x00C0000000000000)
return (n & 0x0080000000000000) ? 56 : 55;
else
return (n & 0x0020000000000000) ? 54 : 53;
} else {
if (n & 0x000C000000000000)
return (n & 0x0008000000000000) ? 52 : 51;
else
return (n & 0x0002000000000000) ? 50 : 49;
}
}
} else {
if (n & 0x0000FF0000000000) {
if (n & 0x0000F00000000000) {
if (n & 0x0000C00000000000)
return (n & 0x0000800000000000) ? 48 : 47;
else
return (n & 0x0000200000000000) ? 46 : 45;
} else {
if (n & 0x00000C0000000000)
return (n & 0x0000080000000000) ? 44 : 43;
else
return (n & 0x0000020000000000) ? 42 : 41;
}
} else {
if (n & 0x000000F000000000) {
if (n & 0x000000C000000000)
return (n & 0x0000008000000000) ? 40 : 39;
else
return (n & 0x0000002000000000) ? 38 : 37;
} else {
if (n & 0x0000000C00000000)
return (n & 0x0000000800000000) ? 36 : 35;
else
return (n & 0x0000000200000000) ? 34 : 33;
}
}
}
} else {
if (n & 0x00000000FFFF0000) {
if (n & 0x00000000FF000000) {
if (n & 0x00000000F0000000) {
if (n & 0x00000000C0000000)
return (n & 0x0000000080000000) ? 32 : 31;
else
return (n & 0x0000000020000000) ? 30 : 29;
} else {
if (n & 0x000000000C000000)
return (n & 0x0000000008000000) ? 28 : 27;
else
return (n & 0x0000000002000000) ? 26 : 25;
}
} else {
if (n & 0x0000000000F00000) {
if (n & 0x0000000000C00000)
return (n & 0x0000000000800000) ? 24 : 23;
else
return (n & 0x0000000000200000) ? 22 : 21;
} else {
if (n & 0x00000000000C0000)
return (n & 0x0000000000080000) ? 20 : 19;
else
return (n & 0x0000000000020000) ? 18 : 17;
}
}
} else {
if (n & 0x000000000000FF00) {
if (n & 0x000000000000F000) {
if (n & 0x000000000000C000)
return (n & 0x0000000000008000) ? 16 : 15;
else
return (n & 0x0000000000002000) ? 14 : 13;
} else {
if (n & 0x0000000000000C00)
return (n & 0x0000000000000800) ? 12 : 11;
else
return (n & 0x0000000000000200) ? 10 : 9;
}
} else {
if (n & 0x00000000000000F0) {
if (n & 0x00000000000000C0)
return (n & 0x0000000000000080) ? 8 : 7;
else
return (n & 0x0000000000000020) ? 6 : 5;
} else {
if (n & 0x000000000000000C)
return (n & 0x0000000000000008) ? 4 : 3;
else
return (n & 0x0000000000000002) ? 2 : (n ? 1 : 0);
}
}
}
}
}
int highest_bit_bs(unsigned long long n)
{
const unsigned long long mask[] = {
0x000000007FFFFFFF,
0x000000000000FFFF,
0x00000000000000FF,
0x000000000000000F,
0x0000000000000003,
0x0000000000000001
};
int hi = 64;
int lo = 0;
int i = 0;
if (n == 0)
return 0;
for (i = 0; i < sizeof mask / sizeof mask[0]; i++) {
int mi = lo + (hi - lo) / 2;
if ((n >> mi) != 0)
lo = mi;
else if ((n & (mask[i] << lo)) != 0)
hi = mi;
}
return lo + 1;
}
int highest_bit_shift(unsigned long long n)
{
int i = 0;
for (; n; n >>= 1, i++)
; /* empty */
return i;
}
static int count_ones(unsigned long long d)
{
d = ((d & 0xAAAAAAAAAAAAAAAA) >> 1) + (d & 0x5555555555555555);
d = ((d & 0xCCCCCCCCCCCCCCCC) >> 2) + (d & 0x3333333333333333);
d = ((d & 0xF0F0F0F0F0F0F0F0) >> 4) + (d & 0x0F0F0F0F0F0F0F0F);
d = ((d & 0xFF00FF00FF00FF00) >> 8) + (d & 0x00FF00FF00FF00FF);
d = ((d & 0xFFFF0000FFFF0000) >> 16) + (d & 0x0000FFFF0000FFFF);
d = ((d & 0xFFFFFFFF00000000) >> 32) + (d & 0x00000000FFFFFFFF);
return d;
}
int highest_bit_parallel(unsigned long long n)
{
n |= n >> 1;
n |= n >> 2;
n |= n >> 4;
n |= n >> 8;
n |= n >> 16;
n |= n >> 32;
return count_ones(n);
}
int highest_bit_so(unsigned long long x)
{
static const unsigned long long t[6] = {
0xFFFFFFFF00000000ull,
0x00000000FFFF0000ull,
0x000000000000FF00ull,
0x00000000000000F0ull,
0x000000000000000Cull,
0x0000000000000002ull
};
int y = (((x & (x - 1)) == 0) ? 0 : 1);
int j = 32;
int i;
for (i = 0; i < 6; i++) {
int k = (((x & t[i]) == 0) ? 0 : j);
y += k;
x >>= k;
j >>= 1;
}
return y;
}
int highest_bit_so2(unsigned long long value)
{
int pos = 0;
if (value & (value - 1ULL))
{
pos = 1;
}
if (value & 0xFFFFFFFF00000000ULL)
{
pos += 32;
value = value >> 32;
}
if (value & 0x00000000FFFF0000ULL)
{
pos += 16;
value = value >> 16;
}
if (value & 0x000000000000FF00ULL)
{
pos += 8;
value = value >> 8;
}
if (value & 0x00000000000000F0ULL)
{
pos += 4;
value = value >> 4;
}
if (value & 0x000000000000000CULL)
{
pos += 2;
value = value >> 2;
}
if (value & 0x0000000000000002ULL)
{
pos += 1;
value = value >> 1;
}
return pos;
}
This is highest-bit.h
:
int highest_bit_unrolled(unsigned long long n);
int highest_bit_bs(unsigned long long n);
int highest_bit_shift(unsigned long long n);
int highest_bit_parallel(unsigned long long n);
int highest_bit_so(unsigned long long n);
int highest_bit_so2(unsigned long long n);
And the main program (sorry about all the copy and paste):
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include "highest-bit.h"
double timedelta(clock_t start, clock_t end)
{
return (end - start)*1.0/CLOCKS_PER_SEC;
}
int main(int argc, char **argv)
{
int i;
volatile unsigned long long v;
clock_t start, end;
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_unrolled(v);
}
end = clock();
printf("highest_bit_unrolled = %6.3fs\n", timedelta(start, end));
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_parallel(v);
}
end = clock();
printf("highest_bit_parallel = %6.3fs\n", timedelta(start, end));
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_bs(v);
}
end = clock();
printf("highest_bit_bs = %6.3fs\n", timedelta(start, end));
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_shift(v);
}
end = clock();
printf("highest_bit_shift = %6.3fs\n", timedelta(start, end));
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_so(v);
}
end = clock();
printf("highest_bit_so = %6.3fs\n", timedelta(start, end));
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_so2(v);
}
end = clock();
printf("highest_bit_so2 = %6.3fs\n", timedelta(start, end));
return 0;
}
I've tried this various Intel x86 boxes, old and new.
The highest_bit_unrolled
(unrolled binary search) is consistently significantly faster than highest_bit_parallel
(branch-free bit ops). This is faster than than highest_bit_bs
(binary search loop), and that in turn is faster than highest_bit_shift
(naive shift and count loop).
highest_bit_unrolled
is also faster than the one in the accepted SO answer (highest_bit_so
) and also one given in another answer (highest_bit_so2
).
The benchmark cycles through a one-bit mask that covers successive bits. This is to try to defeat branch prediction in the unrolled binary search, which is realistic: in a real world program, the input cases are unlikely to exhibit locality of bit position.
Here are results on an old Intel(R) Core(TM)2 Duo CPU E4500 @ 2.20GHz
:
$ ./highest-bit
highest_bit_unrolled = 6.090s
highest_bit_parallel = 9.260s
highest_bit_bs = 19.910s
highest_bit_shift = 21.130s
highest_bit_so = 8.230s
highest_bit_so2 = 6.960s
On a newer model Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
:
highest_bit_unrolled = 1.555s
highest_bit_parallel = 3.420s
highest_bit_bs = 6.486s
highest_bit_shift = 9.505s
highest_bit_so = 4.127s
highest_bit_so2 = 1.645s
On the newer hardware, highest_bit_so2
comes closer to highest_bit_unrolled
on the newer hardware. The order is not quite the same; now highest_bit_so
really falls behind, and is slower than highest_bit_parallel
.
The fastest one, highest_bit_unrolled
contains the most code with the most branches. Every single return value reached by a different set of conditions with its own dedicated piece of code.
The intuition of "avoid all branches" (due to worries about branch mis-predictions) is not always right. Modern (and even not-so-modern any more) processors contain considerable cunning in order not to be hampered by branching.
P.S. the highest_bit_unrolled
was introduced in the TXR language in December 2011 (with mistakes, since debugged).
Recently, I started wondering whether some nicer, more compact code without branches might not be faster.
I'm somewhat surprised by the results.
Arguably, the code should really be #ifdef
-ing for GNU C and using some compiler primitives, but as far as portability goes, that version stays.
((x << 1) - 1)
. You'd need to special-casex == 0
, and you'll overflow if the top bit is set, but this method might be faster than some of the other rounding techniques presented. – Kymberlykymograph