The code below demonstrates that the desired functionality can be emulated using pre-existing SSE instructions by working from first principles: signed "less than" predicate being true is equivalent to the overflow flag differing from the sign flag after subtraction. Obviously we do not have flags, so the flags need to be emulated as well, leaving the resulting predicate in the most significant bit (MSB) of each quadword. In a second step we can then expand the MSB into a quadword-sized mask.
The resulting fairly lengthy code is a pretty good indication that this is probably not the best way of emulating the desired functionality from a performance perspective, even though it is functionally correct. Compiler Explorer (godbolt.org) shows twelve instructions when compiling with Clang 11 (please refer to the assembly listing below).
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "nmmintrin.h"
#define USE_SSE42_REF (0)
#define NBR_OF_TESTS (1000000000)
#if USE_SSE42_REF
/* Reference result: signed 64-bit "greater than" per lane, computed with the
   native SSE4.2 intrinsic. Selected when USE_SSE42_REF is non-zero. */
__m128i pcmpgtq_ref (__m128i a, __m128i b)
{
    return _mm_cmpgt_epi64 (a, b);
}
#else // USE_SSE42_REF
/* Reference result: signed 64-bit "greater than" per lane, computed with
   plain scalar code. Each lane of the result is all ones (-1) where the
   corresponding lane of a is greater than that of b, and zero otherwise. */
__m128i pcmpgtq_ref (__m128i a, __m128i b)
{
    __m128i r;
    struct {
        int64_t lo;  /* first 8 bytes of the vector */
        int64_t hi;  /* last 8 bytes of the vector */
    } hilo_a, hilo_b, hilo_r;

    /* memcpy is used for the vector <-> scalar conversion so that no
       pointer casts (and thus no aliasing questions) are involved */
    memcpy (&hilo_a, &a, sizeof hilo_a);
    memcpy (&hilo_b, &b, sizeof hilo_b);
    hilo_r.lo = hilo_a.lo > hilo_b.lo ? (-1LL) : 0LL;
    hilo_r.hi = hilo_a.hi > hilo_b.hi ? (-1LL) : 0LL;
    memcpy (&r, &hilo_r, sizeof r);
    return r;
}
#endif // USE_SSE42_REF
/* "signed less than" == (OF != SF); compute predicate in MSB of each quadword.

   Returns a vector in which bit 63 of each 64-bit lane is set when the
   corresponding lane of a is less than that of b (signed comparison).
   The remaining 63 bits of each lane are by-products of the computation
   and carry no meaning; callers must look at the MSB only. */
__m128i ltq_core (__m128i a, __m128i b)
{
    __m128i m = _mm_set1_epi64x (0x7fffffffffffffffLL); /* low 63 bits */
    __m128i c = _mm_and_si128 (b, m);     /* low 63 bits of b */
    __m128i d = _mm_andnot_si128 (a, m);  /* low 63 bits of ~a */
    /* adding them carries into bit 63 exactly when low63(b) > low63(a) */
    __m128i t = _mm_add_epi64 (c, d);
    __m128i s = _mm_xor_si128 (a, b);     /* MSB set where the signs differ */
    __m128i x = _mm_xor_si128 (a, t);
    __m128i y = _mm_and_si128 (x, s);
    /* MSB of r: the MSB of t where the signs agree, the sign of a otherwise */
    __m128i r = _mm_xor_si128 (y, t);
    return r;
}
/* extend sign bits into mask, quadword-wise

   For each 64-bit lane of a, returns all ones when bit 63 of the lane is
   set and zero when it is clear. The lower 63 input bits are ignored. */
__m128i q_sign_to_mask (__m128i a)
{
    __m128i q = _mm_set1_epi64x (0);
    __m128i s = _mm_srli_epi64 (a, 63);  /* logical shift: lane becomes 0 or 1 */
    __m128i r = _mm_sub_epi64 (q, s);    /* 0 - 1 wraps around to all ones */
    return r;
}
__m128i pcmpltq (__m128i a, __m128i b)
return q_sign_to_mask (ltq_core (a, b));
__m128i pcmpgtq (__m128i a, __m128i b)
return pcmpltq (b, a);
/*
  From: geo <[email protected]>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone. The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
/* Generator state. The seeds are compile-time constants, so every run of
   the program draws the same sequence of values. */
static uint64_t kiss64_x = 1234567890987654321ULL;  /* multiply-with-carry value */
static uint64_t kiss64_c = 123456123456123456ULL;   /* multiply-with-carry carry */
static uint64_t kiss64_y = 362436362436362436ULL;   /* xorshift state */
static uint64_t kiss64_z = 1066149217761810ULL;     /* congruential state */
static uint64_t kiss64_t;                           /* multiply-with-carry scratch */

/* Advance the multiply-with-carry component and return its new value. */
static inline uint64_t kiss64_mwc_step (void)
{
    kiss64_t = (kiss64_x << 58) + kiss64_c;
    kiss64_c = kiss64_x >> 6;
    kiss64_x = kiss64_x + kiss64_t;
    if (kiss64_x < kiss64_t) {
        kiss64_c = kiss64_c + 1;  /* the 64-bit addition wrapped around */
    }
    return kiss64_x;
}

/* Advance the xorshift component (shifts 13, 17, 43) and return its new value. */
static inline uint64_t kiss64_xsh_step (void)
{
    kiss64_y = kiss64_y ^ (kiss64_y << 13);
    kiss64_y = kiss64_y ^ (kiss64_y >> 17);
    kiss64_y = kiss64_y ^ (kiss64_y << 43);
    return kiss64_y;
}

/* Advance the congruential component and return its new value. */
static inline uint64_t kiss64_cng_step (void)
{
    kiss64_z = 1234567ULL + 6906969069ULL * kiss64_z;
    return kiss64_z;
}

/* Each macro advances one component; KISS64 advances all three (they use
   disjoint state) and yields the sum of their outputs modulo 2^64. */
#define MWC64  (kiss64_mwc_step ())
#define XSH64  (kiss64_xsh_step ())
#define CNG64  (kiss64_cng_step ())
#define KISS64 (MWC64 + XSH64 + CNG64)
int main (void)
struct {
uint64_t lo;
uint64_t hi;
} hilo_a, hilo_b, hilo_res, hilo_ref;
for (int i = 0; i < NBR_OF_TESTS; i++) {
uint64_t al = KISS64;
uint64_t ah = KISS64;
uint64_t bl = KISS64;
uint64_t bh = KISS64;
if ((i & 0xff) == 0x00) bl = al; // increase chance of equality
if ((i & 0xff) == 0xff) bh = ah; // increase chance of equality
__m128i a = _mm_set_epi64x (ah, al);
__m128i b = _mm_set_epi64x (bh, bl);
__m128i res = pcmpgtq (a, b);
__m128i ref = pcmpgtq_ref (a, b);
memcpy (&hilo_res, &res, sizeof hilo_res);
memcpy (&hilo_ref, &ref, sizeof hilo_ref);
if ((hilo_res.hi != hilo_ref.hi) || (hilo_res.lo != hilo_ref.lo)) {
memcpy (&hilo_a, &a, sizeof hilo_a);
memcpy (&hilo_b, &b, sizeof hilo_b);
printf ("error: a=%016llx_%016llx b=%016llx_%016llx res=%016llx_%016llx ref=%016llx_%016llx\n",
hilo_a.hi, hilo_a.lo, hilo_b.hi, hilo_b.lo,
hilo_res.hi, hilo_res.lo, hilo_ref.hi, hilo_ref.lo);
When compiling with `-msse2`, the output from Clang 11 looks as follows:
.quad 9223372036854775807 # 0x7fffffffffffffff
.quad 9223372036854775807 # 0x7fffffffffffffff
pcmpgtq: # @pcmpgtq
movdqa xmm2, xmmword ptr [rip + .LCPI4_0] # xmm2 = [9223372036854775807,9223372036854775807]
movdqa xmm3, xmm1
pxor xmm3, xmm0
pand xmm0, xmm2
movdqa xmm4, xmm1
pandn xmm4, xmm2
paddq xmm4, xmm0
pand xmm1, xmm3
pandn xmm3, xmm4
por xmm3, xmm1
psrad xmm3, 31
pshufd xmm0, xmm3, 245 # xmm0 = xmm3[1,1,3,3]