Created
March 1, 2022 16:55
-
-
Save jrepp/d2e780c2ebfcddbf7e23062d081ad9b2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <unistd.h> | |
#include <assert.h> | |
typedef uint32_t (*str_hash_func)(const char *a); | |
struct str_hash_func_desc { | |
const char *name; | |
str_hash_func h; | |
}; | |
struct str_hash { | |
str_hash_func h; | |
uint32_t *bins; | |
uint32_t nbins; | |
uint32_t items; | |
uint32_t collisions; | |
}; | |
void str_hash_init(struct str_hash *h, uint32_t nbins) { | |
memset(h, 0, sizeof(struct str_hash)); | |
h->bins = calloc(nbins, sizeof(h->bins[0])); | |
h->nbins = nbins; | |
} | |
void str_hash_deinit(struct str_hash *h) { | |
free(h->bins); | |
} | |
void hash_one_str(struct str_hash *ht, const char *a) { | |
uint32_t hvalue = ht->h(a); | |
uint32_t bin = hvalue % ht->nbins; | |
if (ht->bins[bin]) { | |
ht->collisions++; | |
} | |
ht->bins[bin]++; | |
ht->items++; | |
} | |
uint32_t zhash_str(const char *a) | |
{ | |
assert(a != NULL); | |
int32_t hash = 0; | |
while (*a != 0) { | |
hash = (hash << 7) + (hash >> 23); | |
hash += *a; | |
a++; | |
} | |
return (uint32_t) hash; | |
} | |
uint32_t fnv1_opt_str(const char *a) | |
{ | |
uint32_t hash = 0; | |
while (*a) { | |
hash += (hash<<1) + (hash<<4) + (hash<<7) + (hash<<8) + (hash<<24); | |
hash ^= *a; | |
a++; | |
} | |
return hash; | |
} | |
uint32_t fnv1_mul_prime_str(const char *a) | |
{ | |
uint32_t hash = 0; | |
while (*a) { | |
hash *= 0x01000193; | |
hash ^= *a; | |
a++; | |
} | |
return hash; | |
} | |
uint32_t djb2_str(const char *a) | |
{ | |
uint32_t hash = 5381; | |
while (*a) { | |
hash = (hash << 5) + hash + *a; | |
++a; | |
} | |
return hash; | |
} | |
void process_dictionary(struct str_hash *ht) { | |
const char *filename = "/usr/share/dict/american-english"; | |
FILE *f = fopen(filename, "r"); | |
assert(f != NULL); | |
size_t len; | |
char *line = (char *)malloc(256); | |
ssize_t read; | |
while ((read = getline(&line, &len, f)) != -1) { | |
// printf("[%zd] %s", read, line); | |
hash_one_str(ht, line); | |
} | |
fclose(f); | |
if (line) { | |
free(line); | |
} | |
} | |
int main() { | |
struct str_hash ht; | |
struct str_hash_func_desc funcs[] = { | |
{ "zhash", zhash_str }, | |
{ "fnv1_opt", fnv1_opt_str }, | |
{ "fnv1_mul", fnv1_mul_prime_str }, | |
{ "djb2", djb2_str }, | |
}; | |
uint32_t bin_sizes[] = { | |
// powers of two (zhash) | |
8, | |
256, | |
4096, | |
8192, | |
1024 * 1024, | |
// primes | |
7, | |
251, | |
4099, | |
8209, | |
1299827, | |
}; | |
printf("---------------- string hashing ------------------------\n"); | |
printf("hash, bins, items, collisions\n"); | |
for (uint32_t i = 0; i < sizeof(funcs)/sizeof(funcs[0]); ++i) { | |
for (uint32_t j = 0; j < sizeof(bin_sizes)/sizeof(bin_sizes[0]); ++j) { | |
str_hash_init(&ht, bin_sizes[j]); | |
ht.h = funcs[i].h; | |
process_dictionary(&ht); | |
str_hash_deinit(&ht); | |
printf("%s, %u, %u, %u\n", | |
funcs[i].name, | |
bin_sizes[j], | |
ht.items, | |
ht.collisions); | |
} | |
} | |
return 0; | |
} |
Author
jrepp
commented
Mar 1, 2022
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment