I wanted to verify Xiaoning Bian's answer, but unfortunately he didn't post his code. So I implemented a little test suite and ran different little hashing functions on the list of 466K English words to see number of collisions for each:
Hash function | Collisions | Time (words) | Time (file) | Avalanching
===============================================================================
CRC32 | 23 (0.005%) | 96.1 ms | 120.9 ms | 99.8%
MurmurOAAT | 26 (0.006%) | 17.2 ms | 19.4 ms | 100.0%
FNV hash | 32 (0.007%) | 16.4 ms | 14.0 ms | 98.6%
Jenkins OAAT | 36 (0.008%) | 19.9 ms | 18.5 ms | 100.0%
DJB2 hash | 344 (0.074%) | 18.2 ms | 12.7 ms | 92.9%
K&R V2 | 356 (0.076%) | 18.0 ms | 11.9 ms | 92.8%
Coffin | 763 (0.164%) | 16.3 ms | 11.7 ms | 91.4%
x17 hash | 2242 (0.481%) | 18.6 ms | 20.0 ms | 91.5%
-------------------------------------------------------------------------------
large_prime | 25 (0.005%) | 16.5 ms | 18.6 ms | 96.6%
MurmurHash3_x86_32 | 19 (0.004%) | 20.5 ms | 4.4 ms | 100.0%
I included time for both: hashing all words individually and hashing the entire file of all English words once. I also included a more complex MurmurHash3_x86_32
into my test for reference. Additionally, I also calculated "avalanching", which is a measure of how unpredictable the hashes of consecutive words are. Anything below 100% means "quite predictable" (which might be fine for hashing tables, but is bad for other uses).
Conclusions:
- there is almost no point in using the popular DJB2 hash function for strings on Intel x86-64 (or AArch64 for that matter) architecture. Because it has much more collisions than similar functions (FNV, Jenkins OAAT and MurmurOAAT) while having comparable throughput. Bernstein's DJB2 performs especially bad on short strings. Example collisions:
"Liz"/"MHz", "Bon"/"COM", "Rey"/"SEX".
- if you have to use "multiply by an odd number" method (such as DJB2 = x33, K&R V2 = x31 or SDBM = x65599), choose a large 32-bit prime number instead of a small 8-bit number. It makes a lot of difference for reducing the number of collisions. For example,
hash = 17000069 * hash + s[i]
produces only 25 collisions compared to DJB2's 344 collisions.
Test code (compiled with `gcc -O2`):
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#define MODE 1 // 1 = hash each word individually; 2 = hash the entire file
#define MAXLEN 50 // maximum length of a word
#define MAXWORDS 500000 // maximum number of words in a file
#define SEED 0x12345678 // initial state passed to the seeded hash functions
#if MODE == 1
// MODE 1: each word stored in its own row so it can be hashed individually
char words[MAXWORDS][MAXLEN];
#else
// MODE 2: one flat buffer holding the entire input file as a single string
char file[MAXWORDS * MAXLEN];
#endif
uint32_t DJB2_hash(const uint8_t *str)
{
    // Bernstein's classic x33 string hash: h = h*33 + byte, seeded with 5381.
    // The shift-add form (h << 5) + h is the traditional spelling of h * 33.
    uint32_t h = 5381;
    for (uint8_t ch; (ch = *str++) != 0; ) {
        h = (h << 5) + h + ch;
    }
    return h;
}
uint32_t FNV(const void* key, uint32_t h)
{
    // FNV-1a over a NUL-terminated string; the caller's seed is XORed with
    // the 32-bit FNV offset basis before mixing.
    // See: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
    h ^= 2166136261UL;                 // FNV-1 32-bit offset basis
    const uint8_t *p = (const uint8_t *)key;
    while (*p != '\0') {
        h ^= *p++;
        h *= 16777619;                 // FNV 32-bit prime
    }
    return h;
}
uint32_t MurmurOAAT_32(const char* str, uint32_t h)
{
    // One-byte-at-a-time hash based on Murmur's mix step.
    // Source: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
    //
    // Fix: read bytes through unsigned char. With plain char (signed on
    // x86/ARM Linux), bytes >= 0x80 sign-extend in the int promotion and the
    // XOR flips the upper 24 bits of h, so hashes of non-ASCII input differed
    // between signed-char and unsigned-char platforms. ASCII results are
    // unchanged.
    for (const unsigned char *p = (const unsigned char *)str; *p != '\0'; ++p) {
        h ^= *p;
        h *= 0x5bd1e995;   // MurmurHash2 multiplication constant
        h ^= h >> 15;
    }
    return h;
}
uint32_t KR_v2_hash(const char *s)
{
    // x31 multiplicative string hash from K&R (2nd edition) — the same
    // scheme Java's String.hashCode() uses.
    // Source: https://mcmap.net/q/23366/-hash-function-for-string
    uint32_t h = 0;
    while (*s != '\0') {
        h = 31 * h + (uint32_t)*s++;
    }
    return h;
}
uint32_t Jenkins_one_at_a_time_hash(const char *str)
{
    // Bob Jenkins' one-at-a-time hash: mix each byte into the state, then
    // run a final avalanche pass.
    uint32_t h = 0;
    while (*str != '\0') {
        h += (uint32_t)*str++;
        h += h << 10;
        h ^= h >> 6;
    }
    // Final avalanche.
    h += h << 3;
    h ^= h >> 11;
    h += h << 15;
    return h;
}
uint32_t crc32b(const uint8_t *str) {
    // Bit-at-a-time CRC-32 (IEEE 802.3, reflected polynomial 0xEDB88320)
    // over a NUL-terminated string.
    // Source: https://stackoverflow.com/a/21001712
    uint32_t crc = 0xFFFFFFFF;
    for (const uint8_t *p = str; *p != 0; ++p) {
        crc ^= *p;
        for (int bit = 0; bit < 8; ++bit) {
            // mask is all-ones when the low bit is set, else all-zeros.
            uint32_t mask = -(crc & 1u);
            crc = (crc >> 1) ^ (0xEDB88320 & mask);
        }
    }
    return ~crc;
}
static inline uint32_t _rotl32(uint32_t x, int32_t bits)
{
    // Rotate x left by `bits`; compilers recognize this idiom and emit a
    // single rotate instruction.
    //
    // Fixes vs. original:
    //  - `static inline`: a plain `inline` function at file scope provides
    //    no external definition in C99/C11, so any non-inlined call (e.g.
    //    a -O0 build) fails to link.
    //  - bits is masked to 0..31 and bits == 0 is special-cased; the bare
    //    x >> (32 - bits) was undefined behavior for bits == 0 (shift by 32).
    bits &= 31;
    return bits == 0 ? x : (x << bits) | (x >> (32 - bits));
}
uint32_t Coffin_hash(char const *input) {
    // XOR each byte into the state, then rotate the state left by 5 bits.
    // Source: https://mcmap.net/q/23366/-hash-function-for-string
    uint32_t h = 0x55555555;
    for (; *input != '\0'; ++input) {
        h ^= (uint32_t)*input;
        h = (h << 5) | (h >> 27);  // rotate left 5 (inlined _rotl32(h, 5))
    }
    return h;
}
uint32_t x17(const void * key, uint32_t h)
{
    // x17 multiplicative hash; input bytes are offset by ' ' so printable
    // ASCII maps to small values.
    // See: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
    const uint8_t *p = (const uint8_t *)key;
    while (*p != '\0') {
        h = 17 * h + (uint32_t)(*p++ - ' ');
    }
    // Final mix: fold the high half into the low half.
    return h ^ (h >> 16);
}
uint32_t large_prime(const uint8_t *s, uint32_t hash)
{
    // Same scheme as x33/x31, but multiplying by a large 32-bit prime
    // (17000069) instead of a small one — markedly fewer collisions than
    // DJB2 on English words.
    for (; *s != '\0'; ++s) {
        hash = 17000069 * hash + *s;
    }
    return hash;
}
void readfile(FILE* fp, char* buffer) {
    // Read the entire stream into `buffer` and NUL-terminate it.
    // NOTE(review): the caller must guarantee `buffer` holds size+1 bytes;
    // here that is the global `file` array — confirm for any other caller.
    //
    // Fixes vs. original: fseek/ftell/fread failures are no longer ignored.
    // Previously a failing ftell produced size == (size_t)-1 and a short or
    // failed fread left `buffer` partly uninitialized yet terminated at
    // `size`, past the valid data.
    buffer[0] = '\0';
    if (fseek(fp, 0, SEEK_END) != 0)
        return;
    long size = ftell(fp);
    if (size < 0)
        return;
    rewind(fp);
    // Read byte-wise so a partial read still reports how much arrived.
    size_t nread = fread(buffer, 1, (size_t)size, fp);
    buffer[nread] = '\0';  // terminate at what was actually read
}
// Shared timer state for the start/stop timing helpers below.
struct timeval stop, start;

// Record the start of a timed interval.
void timing_start() {
    gettimeofday(&start, NULL);
}

// Record the end of the interval and print the elapsed wall-clock time.
void timing_end() {
    gettimeofday(&stop, NULL);
    // Fix: tv_sec/tv_usec are signed (time_t/suseconds_t), so printing the
    // signed difference with %lu mismatched the format specifier. Compute
    // in long long and print with the matching %lld.
    long long us = (stop.tv_sec - start.tv_sec) * 1000000LL
                 + (stop.tv_usec - start.tv_usec);
    printf("took %lld us\n", us);
}
uint32_t apply_hash(int hash, const char* line)
{
    // Dispatch to the hash algorithm selected by `hash` (ids 1..9).
    // In MODE 2 the call itself is timed, since the file is hashed once.
    //
    // Fix: `result` was uninitialized and returned as-is when `hash` was
    // outside 1..9 — reading it was undefined behavior. Unknown ids now
    // return 0.
    uint32_t result = 0;
#if MODE == 2
    timing_start();
#endif
    switch (hash) {
    case 1: result = crc32b((const uint8_t*)line); break;
    case 2: result = large_prime((const uint8_t *)line, SEED); break;
    case 3: result = MurmurOAAT_32(line, SEED); break;
    case 4: result = FNV(line, SEED); break;
    case 5: result = Jenkins_one_at_a_time_hash(line); break;
    case 6: result = DJB2_hash((const uint8_t*)line); break;
    case 7: result = KR_v2_hash(line); break;
    case 8: result = Coffin_hash(line); break;
    case 9: result = x17(line, SEED); break;
    default: break;  // unknown algorithm id: leave result = 0
    }
#if MODE == 2
    timing_end();
#endif
    return result;
}
int main(int argc, char* argv[])
{
    // Usage: prog <algorithm id 1..9> <word file>
    //
    // Fixes vs. original: missing arguments and fopen failure are detected
    // instead of dereferencing argv out of bounds / a NULL FILE*; the word
    // count is bounded so an oversized file cannot overflow `words`; and
    // strlen (size_t) is printed with %zu instead of %lu.
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <algorithm 1-9> <file>\n", argv[0]);
        return 1;
    }
    const int hash_choice = atoi(argv[1]);
    char const* const fname = argv[2];
    printf("Algorithm: %d\n", hash_choice);
    printf("File name: %s\n", fname);
    // Open file
    FILE* f = fopen(fname, "r");
    if (f == NULL) {
        fprintf(stderr, "Cannot open file: %s\n", fname);
        return 1;
    }
#if MODE == 1
    char line[MAXLEN];
    size_t word_count = 0;
    static uint32_t hashes[MAXWORDS];  // static: ~2 MB, keep it off the stack
    // Read the file line by line, one word per line.
    while (word_count < MAXWORDS && fgets(line, sizeof(line), f)) {
        line[strcspn(line, "\n")] = '\0'; // strip trailing newline
        strcpy(words[word_count], line);
        word_count++;
    }
    fclose(f);
    // Hash every word, timing only the hashing loop.
    timing_start();
    for (size_t i = 0; i < word_count; i++) {
        hashes[i] = apply_hash(hash_choice, words[i]);
    }
    timing_end();
    // Emit the hashes so collisions can be counted externally.
    for (size_t i = 0; i < word_count; i++) {
        printf("%08x\n", hashes[i]);
    }
#else
    // Hash the whole file as one string; apply_hash times itself in MODE 2.
    readfile(f, file);
    fclose(f);
    printf("Len: %zu\n", strlen(file));
    uint32_t hash = apply_hash(hash_choice, file);
    printf("%08x\n", hash);
#endif
    return 0;
}
P.S. A more comprehensive review of speed and quality of modern hash functions can be found in SMHasher repository of Reini Urban (rurban). Notice the "Quality problems" column in the table.