*"rcl r8, 1" has entered the chat...* ;)
; diffClassifyU64(a = rdi, b = rsi) -> al
; Returns 0 if a == b, 1 if a < b, 2 if a > b (unsigned compare).
diffClassifyU64:
cmp rdi, rsi     ; CF = (a < b), ZF = (a == b)
seta al          ; al = (a > b), i.e. CF == 0 && ZF == 0; setcc preserves flags
rcl al, 1        ; al = al*2 + CF: equal -> 0, less -> 0*2+1 = 1, greater -> 1*2+0 = 2
ret              ; NOTE: bare "rcl al" lacked the count operand; Intel syntax needs "rcl al, 1"
`diffClassifyU64` above is just three x86 instructions (plus `ret`) that compare two unsigned 64-bit operands and return 0 if they are equal, 1 if the first is less than the second, and 2 if it is greater.
I wrote a portable wrapper that enables this bit trick where appropriate, with fallbacks for non-x86 targets and non-GCC-compatible compilers. Constant arguments are still evaluated at compile time, and the functions work in C++ `constexpr` contexts when the C++20 feature-test macro `__cpp_lib_is_constant_evaluated` is available (the `constexpr`-detection guard is simply omitted otherwise).
- Use
numClassifyI64
below to classify a single signed 64-bit integer.
// returns {0, 1, 2} if SIGNED 64-bit x is {zero, negative, positive}
/* Classify a SIGNED 64-bit value.
 * Returns 0 if x == 0, 1 if x < 0, 2 if x > 0.
 * On x86-64 GCC/Clang, a run-time (non-constant) argument takes the
 * three-instruction flag trick; otherwise the plain C expression is used. */
uint_least8_t numClassifyI64(int64_t x) {
    /* (0 < x) + (x != 0): positive -> 1+1 = 2, negative -> 0+1 = 1, zero -> 0. */
    uint_least8_t result = (uint_least8_t)((0 < x) + (x != 0));
#if defined(__GNUC__) && defined(__x86_64__)
#ifdef __cpp_lib_is_constant_evaluated
    if (!__builtin_is_constant_evaluated())
#endif
        if (!__builtin_constant_p(x))
            /* sal x,1: CF = old sign bit, ZF = (x<<1 == 0) (true for 0 and INT64_MIN).
             * setnbe: result = 1 iff CF == 0 && ZF == 0, i.e. x > 0.
             * rcl result,1: result = result*2 + CF -> {0, 1, 2}. */
            __asm__("sal{q %1| %1}\n\t"
                    "setnbe{ %0| %0}\n\t"
                    "rcl{b $1, %0| %0, 1}"
                    : "=r"(result), "+r"(x) :: "cc");
#endif
    return result;
}
- Use
diffClassifyU64
below to compare two UNSIGNED 64-bit integers
// returns {0, 1, 2} if UNSIGNED 64-bit {a == b, a < b, a > b}
/* Three-way UNSIGNED 64-bit compare.
 * Returns 0 if a == b, 1 if a < b, 2 if a > b.
 * Uses the cmp/setnbe/rcl flag trick on x86-64 GCC/Clang for run-time
 * arguments; constant arguments fold through the plain C expression. */
uint_least8_t diffClassifyU64(uint64_t a, uint64_t b) {
    /* (a > b) + (a != b): greater -> 2, less -> 1, equal -> 0. */
    uint_least8_t result = (uint_least8_t)((a > b) + (a != b));
#if defined(__GNUC__) && defined(__x86_64__)
#ifdef __cpp_lib_is_constant_evaluated
    if (!__builtin_is_constant_evaluated())
#endif
        if (!__builtin_constant_p(a - b))
            /* cmp: CF = (a < b), ZF = (a == b).
             * setnbe: result = (a > b); setcc leaves flags intact.
             * rcl result,1: result = result*2 + CF -> {0, 1, 2}. */
            __asm__("cmp{q %2, %1| %1, %2}\n\t"
                    "setnbe{ %0| %0}\n\t"
                    "rcl{b $1, %0| %0, 1}"
                    : "=r"(result) : "r"(a), "r"(b) : "cc");
#endif
    return result;
}
- Use
diffClassifyI64
below to compare two SIGNED 64-bit integers
// returns {0, 1, 2} if SIGNED 64-bit {a == b, a < b, a > b}
/* Three-way SIGNED 64-bit compare.
 * Returns 0 if a == b, 1 if a < b, 2 if a > b.
 * Signed compares cannot reuse the carry trick (CF reflects unsigned order),
 * so this variant uses setg/setnz plus an add instead. */
uint_least8_t diffClassifyI64(int64_t a, int64_t b) {
    uint_least8_t scratch = (uint_least8_t)(a != b);
    /* greater -> 1+1 = 2, less -> 0+1 = 1, equal -> 0. */
    uint_least8_t result = (uint_least8_t)((a > b) + scratch);
#if defined(__GNUC__) && defined(__x86_64__)
#ifdef __cpp_lib_is_constant_evaluated
    if (!__builtin_is_constant_evaluated())
#endif
        if (!__builtin_constant_p(a - b))
            /* cmp sets flags once; setg = (a > b signed), setnz = (a != b),
             * add combines them. scratch is earlyclobber ("=&r") because it is
             * written while the inputs may still be needed by the allocator. */
            __asm__("cmp{q %3, %2| %2, %3}\n\t"
                    "setg{ %0| %0}\n\t"
                    "setnz{ %1| %1}\n\t"
                    "add{b %1, %0| %0, %1}"
                    : "=r"(result), "=&r"(scratch) : "r"(a), "r"(b) : "cc");
#endif
    return result;
}
Below are tests I ran to ensure the correctness of both the assembly and the C fallback code:
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <stdint.h>
/* Common pointer type used to call any of the classify functions generically. */
typedef uint_least8_t(*test_f)(uint64_t,uint64_t);
/* noinline + opaque function pointer prevents constant propagation, forcing the
 * inline-asm path (__builtin_constant_p sees non-constant arguments).
 * NOTE(review): calling a function through a pointer of a different signature
 * (e.g. the one-argument numClassifyI64 via test_f) is formally UB in C; it is
 * a deliberate test-harness trick that works on the x86-64 SysV ABI, where the
 * extra register argument is simply ignored — confirm before porting. */
__attribute__((noinline)) uint_least8_t callAsmFn(test_f f, int64_t a,
int64_t b, ...) { return f(a, b); }
/* Prints each call twice: "exp" = direct call (compiler may constant-fold to
 * the C fallback), "asm" = via callAsmFn (forces the run-time asm path), so
 * the two outputs can be compared line by line. */
#define TESTF(f,...) printf("exp "#f"("#__VA_ARGS__") => %d\n",f(__VA_ARGS__));\
printf("asm "#f"("#__VA_ARGS__") => %d\n",callAsmFn((test_f)f,__VA_ARGS__,0))
#include <stdio.h>
/* Exercises all three classifiers at zero, sign boundaries, and the extreme
 * 64-bit values, printing both the compile-time-foldable and forced-asm
 * results for manual comparison. */
int main(void) {
/* numClassifyI64: expect 1 for negatives (incl. INT64_MIN), 0 for zero, 2 for positives. */
TESTF(numClassifyI64, INT64_MIN);
TESTF(numClassifyI64, -1);
TESTF(numClassifyI64, 0);
TESTF(numClassifyI64, 1);
TESTF(numClassifyI64, INT64_MAX);
printf("\n");
/* diffClassifyU64: UINT64_MAX is the largest unsigned value, so (1, UINT64_MAX) => 1. */
TESTF(diffClassifyU64, 0, 0);
TESTF(diffClassifyU64, 0, 1);
TESTF(diffClassifyU64, 1, 0);
TESTF(diffClassifyU64, 0, UINT64_MAX);
TESTF(diffClassifyU64, UINT64_MAX, 0);
TESTF(diffClassifyU64, 1, UINT64_MAX);
TESTF(diffClassifyU64, UINT64_MAX, 1);
printf("\n");
/* diffClassifyI64: signed order, so (-1, INT64_MIN) => 2 unlike the unsigned case. */
TESTF(diffClassifyI64, 0, 0);
TESTF(diffClassifyI64, 0, 1);
TESTF(diffClassifyI64, 1, 0);
TESTF(diffClassifyI64, 0, -1);
TESTF(diffClassifyI64, -1, 0);
TESTF(diffClassifyI64, 0, INT64_MAX);
TESTF(diffClassifyI64, 0, INT64_MIN);
TESTF(diffClassifyI64, 1, INT64_MAX);
TESTF(diffClassifyI64, 1, INT64_MIN);
TESTF(diffClassifyI64, -1, INT64_MAX);
TESTF(diffClassifyI64, -1, INT64_MIN);
return 0;
}
bool operator<(const Point& a, const Point& b){ return a.x < b.x || (a.x == b.x && a.y < b.y); }
I can't see how a look-up table is going to beat that: calculating the array indices appears to be at least as expensive as just calculating the answer directly. In a quick test, this sorted 1 million random pairs some 30% faster than the look-up table and Tom's Compare function. – Cabrera