Struct Node {
Node *N[SIZE];
int value;
struct Trie {
Node *root;
Node* findNode(Key *key) {
Node *C = &root;
char u;
while (1) {
u = key->next();
if (u < 0) return C;
// if (C->N[0] == C->N[0]); // this line will speed up execution significantly
C = C->N[u];
if (C == 0) return 0;
void addNode(Key *key, int value){...};
In this implementation of a prefix tree (a.k.a. trie), I found that 90% of findNode()'s
execution time is taken by a single operation: C = C->N[u];
In an attempt to speed up this code, I randomly added the line that is commented out in the snippet above, and the code became 30% faster! Why is that?
Here is the complete program.
#include "stdio.h"
#include "sys/time.h"
// Returns the current wall-clock time in milliseconds.
// Only the low 16 bits of the seconds field are kept before scaling, so the
// result lies in [0, 65535999] and wraps roughly every 65536 seconds; it is
// meant for measuring short intervals by subtraction, not as an absolute time.
long time1000() {
    struct timeval val;
    gettimeofday(&val, 0);
    val.tv_sec &= 0xffff;  // keep seconds small so the * 1000 below cannot overflow a 32-bit long
    return val.tv_sec * 1000 + val.tv_usec / 1000;
}
// Reads a byte buffer as a stream of 4-bit symbols (nibbles).
// For each byte the low nibble is produced first, then the high nibble.
struct BitScanner {
    void *p;         // start of the buffer being scanned
    int count, pos;  // count: buffer length in bytes; pos: index of the next nibble

    // p     -- buffer to scan (not copied; must outlive the scanner)
    // count -- number of bytes available at p
    BitScanner (void *p, int count) {
        this->p = p;
        this->count = count;
        pos = 0;
    }

    // Returns the next nibble (0..15), or -1 once all `count` bytes have been
    // consumed. After the end is reached, every further call returns -1.
    int next() {
        int bpos = pos >> 1;  // two nibbles per byte
        if (bpos >= count) return -1;
        unsigned char b = ((unsigned char*)p)[bpos];
        if (pos++ & 1) return b >> 4;  // odd position: high nibble
        return b & 0xf;                // even position: low nibble
    }
};
// Node of the 16-way trie: one child link per possible 4-bit symbol.
struct Node {
    Node *N[16];      // children, indexed by nibble; null when absent
    __int64_t value;  // stored value; -1 marks "no value stored here"

    // Starts with all child links null and no value.
    Node() : N(), value(-1) { }
};
struct Trie16 {
Node root;
bool add(void *key, int count, __int64_t value) {
Node *C = &root;
BitScanner B(key, count);
while (true) {
int u =;
if (u < 0) {
if (C->value == -1) {
C->value = value;
return true; // value added
C->value = value;
return false; // value replaced
Node *Q = C->N[u];
if (Q) {
C = Q;
} else {
C = C->N[u] = new Node;
Node* findNode(void *key, int count) {
Node *C = &root;
BitScanner B(key, count);
while (true) {
char u =;
if (u < 0) return C;
// if (C->N[0] == C->N[1]);
C = C->N[0+u];
if (C == 0) return 0;
// Benchmark driver: inserts STEPS keys into a Trie16, then looks each one up
// again, printing the elapsed milliseconds of both phases and the number of
// failed lookups.
int main() {
    // long matches time1000()'s return type, so the timestamp is not truncated.
    long T = time1000();
    Trie16 trie;
    __int64_t STEPS = 100000, STEP = 500000000, key;
    key = 0;
    for (int i = 0; i < STEPS; i++) {
        key += STEP;
        // The 8 bytes of `key` itself are used as the trie key.
        trie.add(&key, 8, key+222);
    }
    // %li because the difference of two longs is a long.
    printf("insert time:%li\n",time1000() - T); T = time1000();
    int err = 0;
    key = 0;
    for (int i = 0; i < STEPS; i++) {
        key += STEP;
        Node *N = trie.findNode(&key, 8);
        if (N==0 || N->value != key+222) err++;
    }
    printf("find time:%li\n",time1000() - T);
    printf("errors:%i\n", err);
}
, they can be very expensive. – Grating
`dummy++`, but to `if (C->N[0] == C->N[1])`. I guess that this check causes the caches to be used, so the data for `C = C->N[u]` are read into the cache immediately. – Refraction
`SIZE`? – Furnishings
`N` when it sees it being accessed multiple times? – Harte
`dummy++` part is redundant; `if (C->N[0] == C->N[1]);` alone is enough to cause the effect. – Myocarditis
`__builtin_prefetch` on gcc? – Ewing
`__builtin_prefetch(C->N[u]);`, and it has the same effect. In fact, it is even 5% faster. – Myocarditis
`if (C->N[0] == C->N[0]);`, which tests a tautology and then does nothing. I'd expect the compiler to optimize that out (assuming no side effects from operator overloads). On the other hand, `if (C->N[0] == C->N[1]);` is not a tautology and could affect caching, branch prediction, speculative execution, etc. – Team