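Expanding a little on others' answers, here are C versions of the functions which calculate 'shift-arithmetic-right' on 32- and 64-bit signed integers without branching. The final cast is problematic, however: converting an unsigned value that does not fit in the signed result type is implementation-defined in C.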
/*
 * Branch-free arithmetic right shift of a 32-bit signed integer.
 *
 * val: value to shift.
 * sh:  shift count; only the low five bits are used, so counts of 32 or
 *      more wrap around (33 behaves like 1).
 *
 * Returns val shifted right with the vacated high bits filled from the sign
 * bit. The conversion of the unsigned result back to int32_t is
 * implementation-defined when the top bit is set.
 */
int32_t sar32(int32_t val, uint8_t sh) {
    const unsigned count = sh & 0x1fu;
    const uint32_t bits = (uint32_t)val;

    /* The sign bit, moved to the position it occupies after the shift. */
    const uint32_t moved_sign = (bits & 0x80000000) >> count;

    /* Negating that single bit sets it and every bit above it (or gives 0
     * for a non-negative input), which is exactly the sign fill needed. */
    const uint32_t fill = -moved_sign;

    return (int32_t)((bits >> count) | fill);
}
/*
 * Branch-free arithmetic right shift of a 32-bit signed integer, done by
 * widening to 64 bits first.
 *
 * val: value to shift.
 * sh:  shift count; only the low five bits are used, so counts of 32 or
 *      more wrap around.
 *
 * Returns the low 32 bits of the shifted 64-bit value. Narrowing the
 * uint64_t back to int32_t is implementation-defined when the value does not
 * fit.
 */
int32_t sar32b(int32_t val, uint8_t sh) {
    /* Converting through int64_t sign-extends val, so the upper 32 bits are
     * copies of the sign bit and slide into the low half during the shift. */
    const uint64_t widened = (uint64_t)(int64_t)val;
    const unsigned count = sh & 0x1fu;

    return (int32_t)(widened >> count);
}
/*
 * Branch-free arithmetic right shift of a 64-bit signed integer.
 *
 * val: value to shift.
 * sh:  shift count; only the low six bits are used, so counts of 64 or
 *      more wrap around (65 behaves like 1).
 *
 * Returns val shifted right with the vacated high bits filled from the sign
 * bit. The conversion of the unsigned result back to int64_t is
 * implementation-defined when the top bit is set.
 */
int64_t sar64(int64_t val, uint8_t sh) {
    const unsigned count = sh & 0x3fu;
    const uint64_t bits = (uint64_t)val;

    /* The sign bit, moved to the position it occupies after the shift. */
    const uint64_t moved_sign = (bits & 0x8000000000000000UL) >> count;

    /* Negating that single bit sets it and every bit above it. */
    const uint64_t fill = -moved_sign;

    return (int64_t)((bits >> count) | fill);
}
These functions sanitise sh to make the shift safe, but they do so by masking, so a shift amount outside the permitted range wraps around (for the 32-bit versions, 33 behaves like 1). To avoid the wrapping, something like
sh = (sh >= 0x1f ? 0x1f : sh & 0x1f); // saturate: any count of 31 or more becomes 31, smaller counts are unchanged
could be used but this introduces branching. One way to avoid this would be to introduce another variable
uint8_t sh2 = ((sh >= 0x1f)*0x1f) | (sh & 0x1f); // the comparison yields 0 or 1, so 0x1f is ORed in only when sh is 31 or more
and shift with that instead.
It's worth mentioning, I think, that while the function below compiles with gcc (which ensures sign extension) and gives no warning even with the -Wall -fsanitize=undefined flags, it should not be used where strict compliance with the C standard is required, because right-shifting a negative integer value is implementation-defined behaviour in C.
/*
 * Right shift of a 32-bit signed integer using the native >> operator.
 *
 * val: value to shift.
 * sh:  shift count; only the low five bits are used.
 *
 * The result for negative val is implementation-defined in C, so this is
 * not portable even though common compilers sign-extend.
 */
int32_t sar(int32_t val, uint8_t sh)
{
    const int count = sh & 0x1f;

    return val >> count; // DO NOT USE IF val < 0!!!
}
Below are 32- and 64-bit non-branching functions which use union-based type-punning, which is behaviour that is (allegedly) not in any way undefined in 'modern' C, and which don't wrap the shift amount. This approach may not carry over to C++.
/*
 * Branch-free arithmetic right shift of a 32-bit signed integer, using a
 * union to view the sign-extended value as unsigned.
 *
 * val: value to shift.
 * sh:  shift count; counts of 31 or more are saturated to 31 rather than
 *      wrapped.
 *
 * Returns the low 32 bits of the shifted 64-bit value. Narrowing the int64_t
 * to int32_t is implementation-defined when the value does not fit.
 */
int32_t sar32(int32_t val, uint8_t sh) {
    /* (sh >= 0x1f) is 0 or 1, so 0x1f is ORed in only for large counts. */
    const uint8_t saturate = (uint8_t)((sh >= 0x1f) * 0x1f);
    const uint8_t count = saturate | (sh & 0x1f);

    /* Storing through .i sign-extends val to 64 bits; .u reads the same
     * bytes back as an unsigned value. */
    union {
        int64_t i;
        uint64_t u;
    } pun = { .i = val };

    /* The upper 32 sign-copy bits slide into the low half during the shift. */
    pun.u = pun.u >> count;

    return (int32_t)pun.i;
}
/*
 * Branch-free arithmetic right shift of a 64-bit signed integer, using a
 * union to move between the signed and unsigned representations.
 *
 * val: value to shift.
 * sh:  shift count; counts of 63 or more are saturated to 63 rather than
 *      wrapped.
 *
 * Returns val shifted right with the vacated high bits filled from the sign
 * bit.
 */
int64_t sar64(int64_t val, uint8_t sh) {
    /* (sh >= 0x3f) is 0 or 1, so 0x3f is ORed in only for large counts. */
    const uint8_t count = (uint8_t)(((sh >= 0x3f) * 0x3f) | (sh & 0x3f));

    union {
        int64_t i;
        uint64_t u;
    } pun = { .i = val };

    /* The sign bit, moved to the position it occupies after the shift;
     * negating it sets that bit and every bit above it. */
    const uint64_t moved_sign = (pun.u & 0x8000000000000000UL) >> count;

    pun.u = (pun.u >> count) | -moved_sign;

    return pun.i;
}
A somewhat laborious approach (which might convert better to other languages such as C++) would be to use memcpy.
/*
 * Branch-free arithmetic right shift of a 32-bit signed integer, using
 * memcpy to move between the signed and unsigned representations.
 *
 * val: value to shift.
 * sh:  shift count; counts of 31 or more are saturated to 31 rather than
 *      wrapped.
 *
 * Returns val shifted right with the vacated high bits filled from the sign
 * bit.
 */
int32_t sar32(int32_t val, uint8_t sh) {
    /* (sh >= 0x1f) is 0 or 1, so 0x1f is ORed in only for large counts. */
    const unsigned count = (unsigned)(((sh >= 0x1f) * 0x1f) | (sh & 0x1f));

    uint32_t bits;
    memcpy(&bits, &val, sizeof bits);

    /* All ones when the sign bit is set, zero otherwise. */
    const uint32_t fill = -(bits >> 31);

    /* Shifting the fill left by (31 - count) leaves ones in bit
     * (31 - count) and above; this includes the original sign position,
     * which the logical shift already preserved or cleared. */
    bits = (bits >> count) | (fill << (31 - count));

    int32_t out;
    memcpy(&out, &bits, sizeof out);
    return out;
}
/*
 * Branch-free arithmetic right shift of a 64-bit signed integer, using
 * memcpy to move between the signed and unsigned representations.
 *
 * val: value to shift.
 * sh:  shift count; counts of 63 or more are saturated to 63 rather than
 *      wrapped.
 *
 * Returns val shifted right with the vacated high bits filled from the sign
 * bit.
 */
int64_t sar64(int64_t val, uint8_t sh) {
    /* (sh >= 0x3f) is 0 or 1, so 0x3f is ORed in only for large counts. */
    const unsigned count = (unsigned)(((sh >= 0x3f) * 0x3f) | (sh & 0x3f));

    uint64_t bits;
    memcpy(&bits, &val, sizeof bits);

    /* All ones when the sign bit is set, zero otherwise. */
    const uint64_t fill = -(bits >> 63);

    /* Shifting the fill left by (63 - count) leaves ones in bit
     * (63 - count) and above. */
    bits = (bits >> count) | (fill << (63 - count));

    int64_t out;
    memcpy(&out, &bits, sizeof out);
    return out;
}
>>
. Do you know an implementation where this doesn't work? – Slipslop