Skip to content

Instantly share code, notes, and snippets.

@jrepp
Created March 1, 2022 16:55
Show Gist options
  • Save jrepp/d2e780c2ebfcddbf7e23062d081ad9b2 to your computer and use it in GitHub Desktop.
Save jrepp/d2e780c2ebfcddbf7e23062d081ad9b2 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <assert.h>
typedef uint32_t (*str_hash_func)(const char *a);
struct str_hash_func_desc {
const char *name;
str_hash_func h;
};
struct str_hash {
str_hash_func h;
uint32_t *bins;
uint32_t nbins;
uint32_t items;
uint32_t collisions;
};
void str_hash_init(struct str_hash *h, uint32_t nbins) {
memset(h, 0, sizeof(struct str_hash));
h->bins = calloc(nbins, sizeof(h->bins[0]));
h->nbins = nbins;
}
void str_hash_deinit(struct str_hash *h) {
free(h->bins);
}
void hash_one_str(struct str_hash *ht, const char *a) {
uint32_t hvalue = ht->h(a);
uint32_t bin = hvalue % ht->nbins;
if (ht->bins[bin]) {
ht->collisions++;
}
ht->bins[bin]++;
ht->items++;
}
uint32_t zhash_str(const char *a)
{
assert(a != NULL);
int32_t hash = 0;
while (*a != 0) {
hash = (hash << 7) + (hash >> 23);
hash += *a;
a++;
}
return (uint32_t) hash;
}
uint32_t fnv1_opt_str(const char *a)
{
uint32_t hash = 0;
while (*a) {
hash += (hash<<1) + (hash<<4) + (hash<<7) + (hash<<8) + (hash<<24);
hash ^= *a;
a++;
}
return hash;
}
uint32_t fnv1_mul_prime_str(const char *a)
{
uint32_t hash = 0;
while (*a) {
hash *= 0x01000193;
hash ^= *a;
a++;
}
return hash;
}
uint32_t djb2_str(const char *a)
{
uint32_t hash = 5381;
while (*a) {
hash = (hash << 5) + hash + *a;
++a;
}
return hash;
}
void process_dictionary(struct str_hash *ht) {
const char *filename = "/usr/share/dict/american-english";
FILE *f = fopen(filename, "r");
assert(f != NULL);
size_t len;
char *line = (char *)malloc(256);
ssize_t read;
while ((read = getline(&line, &len, f)) != -1) {
// printf("[%zd] %s", read, line);
hash_one_str(ht, line);
}
fclose(f);
if (line) {
free(line);
}
}
int main() {
struct str_hash ht;
struct str_hash_func_desc funcs[] = {
{ "zhash", zhash_str },
{ "fnv1_opt", fnv1_opt_str },
{ "fnv1_mul", fnv1_mul_prime_str },
{ "djb2", djb2_str },
};
uint32_t bin_sizes[] = {
// powers of two (zhash)
8,
256,
4096,
8192,
1024 * 1024,
// primes
7,
251,
4099,
8209,
1299827,
};
printf("---------------- string hashing ------------------------\n");
printf("hash, bins, items, collisions\n");
for (uint32_t i = 0; i < sizeof(funcs)/sizeof(funcs[0]); ++i) {
for (uint32_t j = 0; j < sizeof(bin_sizes)/sizeof(bin_sizes[0]); ++j) {
str_hash_init(&ht, bin_sizes[j]);
ht.h = funcs[i].h;
process_dictionary(&ht);
str_hash_deinit(&ht);
printf("%s, %u, %u, %u\n",
funcs[i].name,
bin_sizes[j],
ht.items,
ht.collisions);
}
}
return 0;
}
@jrepp
Copy link
Author

jrepp commented Mar 1, 2022

hash, bins, items, collisions
zhash, 8, 102401, 102393
zhash, 256, 102401, 102145
zhash, 4096, 102401, 98472
zhash, 8192, 102401, 95119
zhash, 1048576, 102401, 33422
zhash, 7, 102401, 102394
zhash, 251, 102401, 102150
zhash, 4099, 102401, 98302
zhash, 8209, 102401, 94192
zhash, 1299827, 102401, 5655
fnv1_opt, 8, 102401, 102393
fnv1_opt, 256, 102401, 102145
fnv1_opt, 4096, 102401, 98305
fnv1_opt, 8192, 102401, 94209
fnv1_opt, 1048576, 102401, 4724
fnv1_opt, 7, 102401, 102394
fnv1_opt, 251, 102401, 102150
fnv1_opt, 4099, 102401, 98302
fnv1_opt, 8209, 102401, 94192
fnv1_opt, 1299827, 102401, 3912
fnv1_mul, 8, 102401, 102393
fnv1_mul, 256, 102401, 102145
fnv1_mul, 4096, 102401, 98305
fnv1_mul, 8192, 102401, 94209
fnv1_mul, 1048576, 102401, 4724
fnv1_mul, 7, 102401, 102394
fnv1_mul, 251, 102401, 102150
fnv1_mul, 4099, 102401, 98302
fnv1_mul, 8209, 102401, 94192
fnv1_mul, 1299827, 102401, 3912
djb2, 8, 102401, 102393
djb2, 256, 102401, 102145
djb2, 4096, 102401, 98305
djb2, 8192, 102401, 94209
djb2, 1048576, 102401, 4828
djb2, 7, 102401, 102394
djb2, 251, 102401, 102150
djb2, 4099, 102401, 98302
djb2, 8209, 102401, 94193
djb2, 1299827, 102401, 3958

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment