Created
May 26, 2022 14:40
-
-
Save magurosan/23a557cd1810cdf4e91ac44371d9b019 to your computer and use it in GitHub Desktop.
x8 xoshiro256 (AVX-512)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
#include <stdint.h> | |
/*
 * State for eight xoshiro256 generators that are advanced in lockstep.
 *
 * state512[i] holds 64-bit state word s[i] of all eight generators, one
 * generator per 64-bit lane.  state64 and state32 alias the same 256 bytes
 * as scalar arrays (useful for seeding or inspecting the state); on
 * little-endian x86, state64[8 * i + lane] is word s[i] of generator `lane`.
 *
 * The state transition in this file only XORs, shifts and rotates these
 * words, so a lane whose four words are all zero stays at zero forever:
 * give every lane at least one non-zero word when seeding.
 */
typedef union PARALLEL_XOSHIRO_AVX512_STATE {
    __m512i  state512[4];
    uint64_t state64[32];
    uint32_t state32[64];
} xoshiro256_x8_avx512_state_t;
/*
 * Advance all eight xoshiro256 generators in `state` by one step.
 *
 * This is the state transition shared by the output functions in this file;
 * it produces no output itself.  Per 64-bit lane it is equivalent to the
 * scalar update
 *
 *     t = s1 << 17;
 *     s2 ^= s0;  s3 ^= s1;  s1 ^= s2;  s0 ^= s3;
 *     s2 ^= t;   s3 = rotl(s3, 45);
 *
 * Declared `static inline`: in C a plain `inline` definition does not
 * provide an external definition, so any call the compiler decides not to
 * inline (for example at -O0) would fail to link.
 *
 * state: generator state, updated in place; must not be NULL.
 */
static inline void next_common_uint64x8_avx512(xoshiro256_x8_avx512_state_t* state) {
    const __m512i s0 = state->state512[0];
    const __m512i s1 = state->state512[1];
    const __m512i s2 = state->state512[2];
    /* s3 ^ s1: feeds both the new s0 and, after rotation, the new s3. */
    const __m512i s3 = _mm512_xor_epi64(s1, state->state512[3]);
    const __m512i t = _mm512_slli_epi64(s1, 17);

    /* Immediate 0x96 is the truth table of the three-input XOR A ^ B ^ C. */
    state->state512[1] = _mm512_ternarylogic_epi64(s1, s2, s0, 0x96);
    state->state512[2] = _mm512_ternarylogic_epi64(t, s2, s0, 0x96);
    state->state512[0] = _mm512_xor_epi64(s0, s3);
    state->state512[3] = _mm512_rol_epi64(s3, 45);
}
/*
 * 8x uint64 xoshiro256++.
 *
 * For each of the eight lanes, computes rotl(s0 + s3, 23) + s0 from the
 * current state, then advances the state by one step.
 *
 * state: generator state, updated in place; must not be NULL.
 * Returns the eight 64-bit outputs, one per lane.
 */
__m512i next_uint64x8(xoshiro256_x8_avx512_state_t* state) {
    const __m512i s0 = state->state512[0];
    const __m512i s3 = state->state512[3];
    const __m512i rotated = _mm512_rol_epi64(_mm512_add_epi64(s0, s3), 23);
    const __m512i result = _mm512_add_epi64(rotated, s0);

    /* The output is taken from the state before it is advanced. */
    next_common_uint64x8_avx512(state);
    return result;
}
/*
 * 8x uint64 xoshiro256**.
 *
 * For each of the eight lanes, computes rotl(s1 * 5, 7) * 9 from the
 * current state, then advances the state by one step.  The scrambler reads
 * state word s[1], as the reference xoshiro256** does (not s[0]).
 *
 * state: generator state, updated in place; must not be NULL.
 * Returns the eight 64-bit outputs, one per lane.
 */
__m512i next_uint64x8_ss(xoshiro256_x8_avx512_state_t* state) {
    __m512i v = state->state512[1];

    /* Multiplications by 5 and 9 are done as shift-and-add: x + (x << k). */
    v = _mm512_add_epi64(v, _mm512_slli_epi64(v, 2)); /* v * 5 */
    v = _mm512_rol_epi64(v, 7);
    v = _mm512_add_epi64(v, _mm512_slli_epi64(v, 3)); /* v * 9 */

    /* The output is taken from the state before it is advanced. */
    next_common_uint64x8_avx512(state);
    return v;
}
/*
 * 8x float64 xoshiro256+.
 *
 * For each of the eight lanes, takes the xoshiro256+ output s0 + s3, keeps
 * its upper 53 bits and scales them by 2^-53, giving a double in [0, 1).
 * The state is then advanced by one step.
 *
 * The sum is shifted right before conversion because _mm512_cvtepi64_pd
 * interprets its input as a signed integer: converting the raw 64-bit sum
 * would map every value with the top bit set to a negative result.  After
 * the shift the value fits in 53 bits, so the conversion is exact.
 *
 * state: generator state, updated in place; must not be NULL.
 * Returns eight doubles in [0, 1), one per lane.
 */
__m512d next_float64x8(xoshiro256_x8_avx512_state_t* state)
{
    __m512i v = _mm512_add_epi64(state->state512[0], state->state512[3]);
    v = _mm512_srli_epi64(v, 11); /* keep the upper 53 bits */
    __m512d vd = _mm512_mul_pd(_mm512_cvtepi64_pd(v),
                               _mm512_set1_pd(1.0 / 9007199254740992.0)); /* 2^-53 */
    /* NOTE: if using only the low 52 bits of the sum is acceptable, building
       the double directly from the bit pattern may be faster:
         v = _mm512_ternarylogic_epi64(v,
                 _mm512_set1_epi64(0x000FFFFFFFFFFFFFULL),
                 _mm512_set1_epi64(0x3FF0000000000000ULL),
                 0xEA);  // (A & B) | C
         vd = _mm512_sub_pd(_mm512_castsi512_pd(v), _mm512_set1_pd(1.0));
    */
    next_common_uint64x8_avx512(state);
    return vd;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment