Created
August 8, 2023 15:28
-
-
Save kprotty/67e591fbaa2d3f23764032d19315a0ad to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdint> | |
#include <cstddef> | |
#include <cstring> | |
#include <cassert> | |
#include <x86intrin.h> | |
typedef struct { | |
__m256i state[2]; | |
__m256i counter; | |
__m256i output; | |
} shishua_rng; | |
static inline void shishua_rng_init(shishua_rng* rng) { | |
uint64_t s0[4] = {0xb16739c6ab008a82ULL, 0xb9b1d61724fd10caULL, 0x700ce2d32bbd85f4ULL, 0x2517e0dbea93e94cULL}; | |
uint64_t s1[4] = {0x8fb19220c769a0f4ULL, 0xf8853f8d9e74c5a2ULL, 0x56f8e1aee7550718ULL, 0x71ccfaab44595151ULL}; | |
uint64_t count[4] = {28, 20, 12, 4}; | |
rng->state[0] = _mm256_loadu_si256((const __m256i*)&s0); | |
rng->state[1] = _mm256_loadu_si256((const __m256i*)&s1); | |
rng->counter = _mm256_loadu_si256((const __m256i*)&count); | |
rng->output = rng->state[1]; | |
} | |
static inline void shishua_rng_update(shishua_rng *rng) { | |
rng->state[1] = _mm256_add_epi64(rng->state[1], rng->counter); | |
rng->counter = _mm256_add_epi64(rng->counter, _mm256_set_epi64x(1, 3, 5, 7)); | |
__m256i u0 = _mm256_srli_epi64(rng->state[0], 1); | |
__m256i u1 = _mm256_srli_epi64(rng->state[0], 3); | |
__m256i t0 = _mm256_permutevar8x32_epi32(rng->state[0], _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5)); | |
__m256i t1 = _mm256_permutevar8x32_epi32(rng->state[1], _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3)); | |
rng->state[0] = _mm256_add_epi64(t0, u0); | |
rng->state[1] = _mm256_add_epi64(t1, u1); | |
rng->output = _mm256_xor_si256(t1, u0); | |
} | |
static inline uint32_t avalanche(uint64_t x) { | |
return (x * 11400714819323198485ULL) >> 32; | |
} | |
static inline void nh_round(__m256i *acc, __m256i m, __m256i r) { | |
__m256i mixed = _mm256_add_epi32(m, r); | |
__m256i hi = _mm256_srli_epi64(mixed, 32); | |
__m256i merged = _mm256_mul_epu32(mixed, hi); | |
*acc = _mm256_add_epi64(*acc, merged); | |
} | |
uint32_t nh_hash(const char *ptr, size_t len) { | |
shishua_rng rng; | |
shishua_rng_init(&rng); | |
if (__builtin_expect(len <= 16, 1)) { | |
if (__builtin_expect(len <= 3, 1)) { | |
if (len == 0) return 0; | |
uint32_t word; | |
char bytes[4] = {ptr[0], ptr[len / 2], ptr[len - 1], (char)len}; | |
memcpy(&word, &bytes, 4); | |
return avalanche(word); | |
} | |
uint32_t m[4]; | |
memcpy(&m[0], ptr, 4); | |
memcpy(&m[1], ptr + ((len / 8) * 2), 4); | |
memcpy(&m[2], ptr + len - 4, 4); | |
memcpy(&m[3], ptr + len - 4 - ((len / 8) * 2), 4); | |
uint32_t r[4]; | |
memcpy(&r, &rng.output, sizeof(r)); | |
uint64_t a = ((uint64_t)(m[0] + r[0])) * (m[1] + r[1]); | |
uint64_t b = ((uint64_t)(m[2] + r[2])) * (m[3] + r[3]); | |
return avalanche(a + b); | |
} | |
__m256i m, acc = _mm256_setzero_si256(); | |
while (len > 32) { | |
m = _mm256_loadu_si256((const __m256i*)ptr); | |
nh_round(&acc, m, rng.output); | |
shishua_rng_update(&rng); | |
ptr += 32; | |
len -= 32; | |
} | |
__m128i a = _mm_loadu_si128((const __m128i*)ptr); | |
__m128i b = _mm_loadu_si128((const __m128i*)(ptr + len - 16)); | |
nh_round(&acc, _mm256_set_m128i(a, b), rng.output); | |
// __builtin_reduce_add | |
a = _mm256_extracti128_si256(acc, 0); | |
b = _mm256_extracti128_si256(acc, 1); | |
a = _mm_add_epi64(a, b); | |
uint64_t result = 0; | |
result += _mm_extract_epi64(a, 0); | |
result += _mm_extract_epi64(a, 1); | |
return avalanche(result); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment