Last active
October 17, 2022 11:02
-
-
Save gfoidl/c8e883c9432d994a4a4d7c30b501aef1 to your computer and use it in GitHub Desktop.
AVX2 random
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Cf. https://github.com/BenjaminAbt/SustainableCode/tree/main/csharp/random-string | |
// And see also https://github.com/BenjaminAbt/SustainableCode/blob/main/csharp/random-string-vector | |
//#define BENCH | |
using System.Diagnostics; | |
using System.Runtime.CompilerServices; | |
using System.Runtime.InteropServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.X86; | |
using BenchmarkDotNet.Attributes; | |
// Entry point. Which driver runs depends on the build configuration:
//   DEBUG        -> print one baseline string and ten vectorized strings,
//   BENCH        -> hand over to BenchmarkDotNet,
//   otherwise    -> call the vectorized path in a loop (handy for inspecting JIT output).
var bench = new Benchmark { CharLength = 82 };

#if DEBUG
Console.WriteLine(bench.StringCreate());
Console.WriteLine();

for (int run = 0; run < 10; run++)
{
    Console.WriteLine(bench.Vectorized());
}
#elif BENCH
BenchmarkDotNet.Running.BenchmarkRunner.Run<Benchmark>();
#else
for (int iteration = 0; iteration < 100; iteration++)
{
    // Pause on every tenth iteration before continuing.
    if (iteration % 10 == 0)
    {
        Thread.Sleep(100);
    }

    _ = bench.Vectorized();
}
#endif
/// <summary>
/// BenchmarkDotNet harness comparing the scalar <c>string.Create</c> sample
/// with the vectorized sample for several string lengths.
/// </summary>
public class Benchmark
{
    /// <summary>Number of characters in the generated string.</summary>
    [Params(10, 100, 1000)]
    public int CharLength { get; set; } = 100;

    /// <summary>Baseline: one <see cref="Random"/> call per character.</summary>
    [Benchmark(Baseline = true)]
    public string StringCreate()
    {
        return StringCreateSample.CreateRandomString(CharLength);
    }

    /// <summary>Candidate: the vectorized implementation.</summary>
    [Benchmark]
    public string Vectorized()
    {
        return VectorSample.CreateRandomString(CharLength);
    }
}
/// <summary>
/// Character sets shared by the random-string samples.
/// </summary>
public static class SampleConstants
{
    /// <summary>The 26 upper-case ASCII letters.</summary>
    public const string UpperChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

    /// <summary>The 26 lower-case ASCII letters.</summary>
    public const string LowerChars = "abcdefghijklmnopqrstuvwxyz";

    /// <summary>The 10 decimal digits.</summary>
    public const string Digits = "0123456789";

    /// <summary>Upper-case letters, then lower-case letters, then digits (62 characters).</summary>
    public const string AlphNum = UpperChars + LowerChars + Digits;
}
/// <summary>
/// Scalar reference implementation: fills the string one character at a time,
/// drawing each index from a fixed-seed <see cref="Random"/>.
/// </summary>
public static class StringCreateSample
{
    // Seeded with 0, so the sequence of generated strings is reproducible per process.
    private static readonly Random s_random = new(0);

    /// <summary>Creates a string of <paramref name="length"/> random alphanumeric characters.</summary>
    /// <param name="length">Number of characters to generate.</param>
    /// <returns>The newly created string.</returns>
    public static string CreateRandomString(int length)
        => string.Create<object?>(length, null, static (buffer, _) => CreateRandomString(buffer));

    // Writes one randomly chosen alphabet character into every slot of the buffer.
    private static void CreateRandomString(Span<char> buffer)
    {
        const string Chars = SampleConstants.AlphNum;

        foreach (ref char slot in buffer)
        {
            slot = Chars[s_random.Next(Chars.Length)];
        }
    }
}
/// <summary>
/// Creates random alphanumeric strings. When AVX2 is available and the string
/// has at least 32 characters, 32 characters are produced per step with a
/// vectorized xorshift generator; otherwise a scalar loop is used.
/// </summary>
public static class VectorSample
{
    // Fixed-seed generator, used only by the scalar fallback.
    private static readonly Random s_random = new(0);

    /// <summary>Creates a string of <paramref name="length"/> random alphanumeric characters.</summary>
    /// <param name="length">Number of characters to generate.</param>
    /// <returns>The newly created string.</returns>
    public static string CreateRandomString(int length)
    {
        return string.Create<object?>(length, null, static (buffer, _) => CreateRandomString(buffer));
    }

    // Dispatches between the vectorized and the scalar implementation.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static unsafe void CreateRandomString(Span<char> buffer)
    {
        if (Avx2.IsSupported && buffer.Length >= 2 * Vector256<ushort>.Count)
        {
#if DEBUG
            // Sentinel in the last slot to help detect writes beyond the allowed range.
            buffer[^1] = '=';
#endif
            // For JIT-TC to kick in no stackalloc must occur.
            // Thus it's hoisted to here. And that's why the scalar path is moved
            // into its own method.
            byte* seedChars = stackalloc byte[64];
            CreateRandomStringVectorized(buffer, seedChars);
        }
        else
        {
            CreateRandomStringScalar(buffer);
        }
    }

    // Scalar fallback: one Random.Next call per character.
    private static void CreateRandomStringScalar(Span<char> buffer)
    {
        const string Chars = SampleConstants.AlphNum;
        int charsLength = Chars.Length;

        for (int i = 0; i < buffer.Length; ++i)
        {
            buffer[i] = Chars[s_random.Next(charsLength)];
        }
    }

    // Vectorized implementation.
    //   buffer    - destination; the caller guarantees at least 32 chars.
    //   seedChars - 64 bytes of scratch memory that receive the alphabet narrowed to bytes.
    [SkipLocalsInit]
    private static unsafe void CreateRandomStringVectorized(Span<char> buffer, byte* seedChars)
    {
        const string Chars = SampleConstants.AlphNum;
        Debug.Assert(Chars.Length == 62);

        // Eight 32-bit xorshift states, filled from four 64-bit random values.
        Vector256<int> seed = Vector256.Create(
            Random.Shared.NextInt64(),
            Random.Shared.NextInt64(),
            Random.Shared.NextInt64(),
            Random.Shared.NextInt64()
        ).AsInt32();

        // seedChars could also be given as ROS<byte>, depending on use case.
        // Especially with C# 11's UTF-8 literals, e.g. "ABCD..."u8
        ref ushort chars = ref Unsafe.As<char, ushort>(ref Unsafe.AsRef(Chars.GetPinnableReference()));
        PackToBytes(ref chars, seedChars);

        Vector256<byte> seedVec0 = Vector256.Load(seedChars);
        Vector256<byte> seedVec1 = Vector256.Load(seedChars + Vector256<byte>.Count);

        // Random indices are generated in [0, 31).
        Vector256<float> upperForVec = Vector256.Create((float)(Vector256<byte>.Count - 1));
        Vector256<float> one = Vector256.Create(1f);
        Vector256<int> mantissaMask = Vector256.Create(0x7FFFFF);

        ref ushort dest = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(buffer));
        ref ushort twoVectorsAwayFromEnd = ref Unsafe.Add(ref dest, (uint)(buffer.Length - 2 * Vector256<ushort>.Count));

        // Each Core call writes 32 chars. The loop stops before it could run past
        // the end; the final call then fills the last 32 chars and may overlap
        // chars that were already written.
        do
        {
            Core(ref dest, seedVec0, seedVec1, ref seed, mantissaMask, one, upperForVec);
            dest = ref Unsafe.Add(ref dest, 2 * Vector256<ushort>.Count);
        }
        while (Unsafe.IsAddressLessThan(ref dest, ref twoVectorsAwayFromEnd));

        Core(ref twoVectorsAwayFromEnd, seedVec0, seedVec1, ref seed, mantissaMask, one, upperForVec);
        //---------------------------------------------------------------------
        // Narrows the 62 alphabet chars to bytes at seed[0..61] and fills
        // seed[62..63] with two randomly picked alphabet characters.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static void PackToBytes(ref ushort chars, byte* seed)
        {
            ref short charsAsInt16 = ref Unsafe.As<ushort, short>(ref chars);
#if DEBUG
            // Clear the seed (len = 32, i.e. Vector256<byte> size)
            Vector256<byte>.Zero.Store(seed);

            // Sentinels to help detect writes beyond the allowed range -- checked below.
            seed[62] = (byte)'=';
            seed[63] = (byte)'=';
#endif
            // We read 32 chars, pack them to 32 bytes
            // Then 30 chars remain
            //
            // Use hw-intrinsics as they don't perform additional AND like Vector256.Narrow does
            Vector256<byte> narrowed256 = Avx2.PackUnsignedSaturate(
                Vector256.LoadUnsafe(ref charsAsInt16),
                Vector256.LoadUnsafe(ref charsAsInt16, (uint)Vector256<ushort>.Count));
            // The 256-bit pack works per 128-bit lane, so restore the natural byte order.
            narrowed256 = Avx2.Permute4x64(narrowed256.AsInt64(), 0b_11_01_10_00).AsByte();
            narrowed256.Store(seed);

            nuint offset = 2 * (uint)Vector256<ushort>.Count;

            // We read 16 chars, pack them to 16 bytes
            // Then 14 chars remain
            Vector128<byte> narrowed128 = Sse2.PackUnsignedSaturate(
                Vector128.LoadUnsafe(ref charsAsInt16, offset),
                Vector128.LoadUnsafe(ref charsAsInt16, offset + (uint)Vector128<ushort>.Count));
            narrowed128.Store(seed + Vector256<byte>.Count);

            // For the remaining 14 chars we read 16 chars from the end, as the operation is idempotent.
            offset = 62 - 2 * (uint)Vector128<ushort>.Count;
            narrowed128 = Sse2.PackUnsignedSaturate(
                Vector128.LoadUnsafe(ref charsAsInt16, offset),
                Vector128.LoadUnsafe(ref charsAsInt16, offset + (uint)Vector128<ushort>.Count));
            narrowed128.Store(seed + offset);
#if DEBUG
            Debug.Assert(seed[62] == (byte)'=');
            Debug.Assert(seed[63] == (byte)'=');
#endif
            // The 62 Chars are narrowed to 62 bytes, so add another two random bytes (chars)
            // so the whole range of 64 bytes can be used. In regards to entropy it would be
            // better to leave them off, as 2/62 are more likely this way. But for speed it's
            // better.
            seed[62] = (byte)Unsafe.Add(ref chars, Random.Shared.Next(62));
            seed[63] = (byte)Unsafe.Add(ref chars, Random.Shared.Next(62));
        }
        //---------------------------------------------------------------------
        // Produces 32 random chars and writes them to dest[0..31].
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static void Core(
            ref ushort dest,
            Vector256<byte> seedVec0,
            Vector256<byte> seedVec1,
            ref Vector256<int> seed,
            Vector256<int> mantissaMask,
            Vector256<float> one,
            Vector256<float> upperForVector)
        {
            Vector256<byte> shuffleMask = NextRandomByteVector(ref seed, mantissaMask, one, upperForVector);

            //         seedVec0                             seedVec1
            // ABCDEFGHIJKLMNOP QRSTUVWXYZabcdef    ghijklmnopqrstuv wxyz0123456789<>
            //           vec0                                 vec1
            // ANBGJCKHNKIMKLDE ZUVXWRVSUbQWZVZR    gthmpiqntqosqrjk 50132x1y07w2515x
            Vector256<byte> vec0 = Avx2.Shuffle(seedVec0, shuffleMask);
            Vector256<byte> vec1 = Avx2.Shuffle(seedVec1, shuffleMask);

            Vector256<int> permuteMask = NextRandomVector(ref seed, mantissaMask, one, upperForVector);

            //                           vec0                                        vec1
            // before: ANBG JCKH NKIM KLDE ZUVX WRVS UbQW ZVZR    gthm piqn tqos qrjk 5013 2x1y 07w2 515x
            // after:  WRVS ZVZR JCKH KLDE KLDE ZVZR KLDE NKIM    2x1y 515x piqn qrjk qrjk 515x qrjk tqos
            vec0 = Avx2.PermuteVar8x32(vec0.AsInt32(), permuteMask).AsByte();
            vec1 = Avx2.PermuteVar8x32(vec1.AsInt32(), permuteMask).AsByte();

            // after blend: 2RVyZ15RJiqnqLDkKrDE5VZxqLjkNKIM
            // The lowest bit of each shuffle index decides which half of the alphabet a byte comes from.
            Vector256<byte> blendMask = Vector256.Equals(shuffleMask & Vector256.Create((byte)1), Vector256<byte>.Zero);
            Vector256<byte> res = Avx2.BlendVariable(vec0, vec1, blendMask);

            (Vector256<ushort> lower, Vector256<ushort> upper) = Vector256.Widen(res);
            lower.StoreUnsafe(ref dest);
            upper.StoreUnsafe(ref dest, (uint)Vector256<ushort>.Count);
        }
        //---------------------------------------------------------------------
        // Builds 32 random bytes by packing four random int vectors into the
        // four byte positions of each 32-bit lane.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static Vector256<byte> NextRandomByteVector(
            ref Vector256<int> seed,
            Vector256<int> mantissaMask,
            Vector256<float> one,
            Vector256<float> upper)
        {
            Vector256<int> rnd0 = NextRandomVector(ref seed, mantissaMask, one, upper);
            Vector256<int> rnd1 = NextRandomVector(ref seed, mantissaMask, one, upper);
            Vector256<int> rnd2 = NextRandomVector(ref seed, mantissaMask, one, upper);
            Vector256<int> rnd3 = NextRandomVector(ref seed, mantissaMask, one, upper);

            rnd1 = Vector256.ShiftLeft(rnd1, 8);
            rnd2 = Vector256.ShiftLeft(rnd2, 16);
            rnd3 = Vector256.ShiftLeft(rnd3, 24);

            Vector256<int> rnd = (rnd0 | rnd1) | (rnd2 | rnd3);
            return rnd.AsByte();
        }
        //---------------------------------------------------------------------
        // Advances the eight xorshift states in `seed` and returns eight random
        // ints out of [0, upper).
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static Vector256<int> NextRandomVector(
            ref Vector256<int> seed,
            Vector256<int> mantissaMask,
            Vector256<float> one,
            Vector256<float> upper)
        {
            // Xorshift (13, 17, 5). The right shift has to be a logical one: the lanes
            // are signed, and an arithmetic shift would smear the sign bit over the
            // upper bits, which makes this step non-invertible and collapses states.
            seed ^= Vector256.ShiftLeft(seed, 13);
            seed ^= Vector256.ShiftRightLogical(seed, 17);
            seed ^= Vector256.ShiftLeft(seed, 5);

            // Convert random ints to floats out of [1, 2), cf. https://stackoverflow.com/a/70565649/347870
            Vector256<int> mantissa = seed & mantissaMask;
            Vector256<float> val = mantissa.AsSingle() | one;
            val = Fma.MultiplySubtract(val, upper, upper);          // Scale from [1, 2) to [0, upper)
            Vector256<int> rnd = Vector256.ConvertToInt32(val);     // Convert back to int out of [0, upper) by truncation

            return rnd;
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Runtime.InteropServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.X86; | |
// Driver: draws random vectors, checks their range and prints a histogram of the values.
// The variables below are captured by the local functions further down.
Dictionary<int, long> histogram = new();

Vector256<float> one = Vector256.Create(1f);
Vector256<float> upper = Vector256.Create((float)(64 + 1));
Vector256<int> mantissaMask = Vector256.Create(0x7FFFFF);

// Random numbers ;-)
// Alternative seeding with eight 32-bit values:
//Vector256<int> seed = Vector256.Create(
//    Random.Shared.Next(),
//    Random.Shared.Next(),
//    Random.Shared.Next(),
//    Random.Shared.Next(),
//    Random.Shared.Next(),
//    Random.Shared.Next(),
//    Random.Shared.Next(),
//    Random.Shared.Next()
//);
Vector256<int> seed = Vector256
    .Create(
        Random.Shared.NextInt64(),
        Random.Shared.NextInt64(),
        Random.Shared.NextInt64(),
        Random.Shared.NextInt64())
    .AsInt32();

for (int iteration = 0; iteration < 1_000_000; iteration++)
{
    Vector256<int> rnd = NextRandomVector();

    Validate(rnd);
    AddToHistogram(rnd);

    // Show the vector and wait for a key press before drawing the next one.
    Console.WriteLine(rnd);
    Console.ReadKey();
}

PrintHistogram();
// Advances the eight xorshift states held in the captured `seed` and returns
// eight random ints out of [0, upper).
Vector256<int> NextRandomVector()
{
    // Xorshift (13, 17, 5). The right shift has to be a logical one: the lanes are
    // signed, and an arithmetic shift would smear the sign bit over the upper bits,
    // which makes this step non-invertible and collapses states.
    seed ^= Vector256.ShiftLeft(seed, 13);
    seed ^= Vector256.ShiftRightLogical(seed, 17);
    seed ^= Vector256.ShiftLeft(seed, 5);

    // Convert random ints to floats out of [1, 2), cf. https://stackoverflow.com/a/70565649/347870
    Vector256<int> mantissa = seed & mantissaMask;
    Vector256<float> val = mantissa.AsSingle() | one;
    val = Fma.MultiplySubtract(val, upper, upper);          // Scale from [1, 2) to [0, upper)
    Vector256<int> rnd = Vector256.ConvertToInt32(val);     // Convert back to int out of [0, upper) by truncation

    return rnd;
}
// Throws when any lane of the vector lies outside the inclusive range 0..64.
static void Validate(Vector256<int> rnd)
{
    for (int lane = 0; lane < Vector256<int>.Count; lane++)
    {
        if (rnd[lane] is < 0 or > 64)
        {
            throw new Exception("Out of range");
        }
    }
}
// Increments the counter in the captured `histogram` for every lane value of the vector.
void AddToHistogram(Vector256<int> rnd)
{
    for (int lane = 0; lane < Vector256<int>.Count; lane++)
    {
        // Returns a reference to the existing entry, or to a new zero-initialized one.
        CollectionsMarshal.GetValueRefOrAddDefault(histogram, rnd[lane], out _)++;
    }
}
// Prints one line per recorded value, ordered by value: the value, its count,
// and the count's deviation from the (integer) average count.
void PrintHistogram()
{
    // Nothing recorded: there is nothing to print, and the average below
    // would otherwise divide by zero.
    if (histogram.Count == 0)
    {
        return;
    }

    long total = 0;
    foreach (long value in histogram.Values)
    {
        total += value;
    }

    long avg = total / histogram.Count;

    foreach (KeyValuePair<int, long> kvp in histogram.OrderBy(h => h.Key))
    {
        Console.WriteLine($"{kvp.Key,3}\t{kvp.Value,5}\t{kvp.Value - avg,5}");
    }
}
Benchmark for random string:
| Method | CharLength | Mean | Error | StdDev | Ratio | RatioSD |
|------------- |----------- |------------:|----------:|----------:|------:|--------:|
| StringCreate | 10 | 151.2 ns | 3.10 ns | 5.60 ns | 1.00 | 0.00 |
| Vectorized | 10 | 143.1 ns | 2.94 ns | 5.67 ns | 0.95 | 0.05 |
| | | | | | | |
| StringCreate | 100 | 1,253.0 ns | 24.07 ns | 51.30 ns | 1.00 | 0.00 |
| Vectorized | 100 | 149.8 ns | 3.07 ns | 8.31 ns | 0.12 | 0.01 |
| | | | | | | |
| StringCreate | 1000 | 12,472.4 ns | 245.81 ns | 461.69 ns | 1.00 | 0.00 |
| Vectorized | 1000 | 710.7 ns | 16.74 ns | 49.35 ns | 0.06 | 0.00 |
The JIT generates compact x64 machine code for the vectorized method:
DOTNET_JitDisasm: "VectorSample:CreateRandomStringVectorized(Span`1,long)"
; Assembly listing for method VectorSample:CreateRandomStringVectorized(Span`1,long)
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; Tier-1 compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 28 single block inlinees; 0 inlinees without PGO data
G_M000_IG01: ;; offset=0000H
57 push rdi
56 push rsi
55 push rbp
53 push rbx
4881EC98000000 sub rsp, 152
C5F877 vzeroupper
C5F829B42480000000 vmovaps qword ptr [rsp+80H], xmm6
C5F8297C2470 vmovaps qword ptr [rsp+70H], xmm7
C57829442460 vmovaps qword ptr [rsp+60H], xmm8
C578294C2450 vmovaps qword ptr [rsp+50H], xmm9
C57829542440 vmovaps qword ptr [rsp+40H], xmm10
488BF9 mov rdi, rcx
488BF2 mov rsi, rdx
G_M000_IG02: ;; offset=0035H
48B96003C04480020000 mov rcx, 0x28044C00360
488B19 mov rbx, gword ptr [rcx]
488BCB mov rcx, rbx
FF1565191200 call [ThreadSafeRandom:NextInt64():long:this]
C4E1F96EC0 vmovd xmm0, rax
C5F911442430 vmovupd xmmword ptr [rsp+30H], xmm0
488BCB mov rcx, rbx
FF1551191200 call [ThreadSafeRandom:NextInt64():long:this]
C5F910442430 vmovupd xmm0, xmmword ptr [rsp+30H]
C4E3F922C001 vpinsrq xmm0, xmm0, rax, 1
C5F911442430 vmovupd xmmword ptr [rsp+30H], xmm0
488BCB mov rcx, rbx
FF1536191200 call [ThreadSafeRandom:NextInt64():long:this]
C4E1F96EC0 vmovd xmm0, rax
C5F911442420 vmovupd xmmword ptr [rsp+20H], xmm0
488BCB mov rcx, rbx
FF1522191200 call [ThreadSafeRandom:NextInt64():long:this]
C5F910442420 vmovupd xmm0, xmmword ptr [rsp+20H]
C4E3F922C001 vpinsrq xmm0, xmm0, rax, 1
C5F9104C2430 vmovupd xmm1, xmmword ptr [rsp+30H]
C4E37518F001 vinsertf128 ymm6, ymm1, xmm0, 1
48B9F820C04480020000 mov rcx, 0x28044C020F8
488B29 mov rbp, gword ptr [rcx]
4883C50C add rbp, 12
C5FE6F4500 vmovdqu ymm0, ymmword ptr[rbp]
C5FD674520 vpackuswb ymm0, ymm0, ymmword ptr[rbp+20H]
C4E3FD00C0D8 vpermq ymm0, ymm0, -40
C5FE7F06 vmovdqu ymmword ptr[rsi], ymm0
C5FA6F4540 vmovdqu xmm0, xmmword ptr [rbp+40H]
C5F9674550 vpackuswb xmm0, xmm0, xmmword ptr [rbp+50H]
C5FA7F4620 vmovdqu xmmword ptr [rsi+20H], xmm0
C5FA6F455C vmovdqu xmm0, xmmword ptr [rbp+5CH]
C5F967456C vpackuswb xmm0, xmm0, xmmword ptr [rbp+6CH]
C5FA7F462E vmovdqu xmmword ptr [rsi+2EH], xmm0
488BCB mov rcx, rbx
BA3E000000 mov edx, 62
C4E37D19F701 vextractf128 xmm7, ymm6, 1
FF15A3181200 call [ThreadSafeRandom:Next(int):int:this]
4863C8 movsxd rcx, eax
0FB64C4D00 movzx rcx, byte ptr [rbp+2*rcx]
884E3E mov byte ptr [rsi+3EH], cl
488BCB mov rcx, rbx
BA3E000000 mov edx, 62
FF158A181200 call [ThreadSafeRandom:Next(int):int:this]
4898 cdqe
0FB6444500 movzx rax, byte ptr [rbp+2*rax]
88463F mov byte ptr [rsi+3FH], al
C5FE6F06 vmovdqu ymm0, ymmword ptr[rsi]
C5FE6F4E20 vmovdqu ymm1, ymmword ptr[rsi+20H]
C5FD1015EF030000 vmovupd ymm2, ymmword ptr[reloc @RWD00]
C5FD101D07040000 vmovupd ymm3, ymmword ptr[reloc @RWD32]
C5FD10251F040000 vmovupd ymm4, ymmword ptr[reloc @RWD64]
488B07 mov rax, bword ptr [rdi]
8B5708 mov edx, dword ptr [rdi+08H]
G_M000_IG03: ;; offset=0147H
83C2E0 add edx, -32
488D1450 lea rdx, bword ptr [rax+2*rdx]
C5FD102D2A040000 vmovupd ymm5, ymmword ptr[reloc @RWD96]
C4E34D18F701 vinsertf128 ymm6, ymm6, xmm7, 1
align [0 bytes for IG04]
G_M000_IG04: ;; offset=015CH
C5C572F60D vpslld ymm7, ymm6, 13
C5CDEFF7 vpxor ymm6, ymm6, ymm7
C5C572E611 vpsrad ymm7, ymm6, 17
C5CDEFF7 vpxor ymm6, ymm6, ymm7
C5C572F605 vpslld ymm7, ymm6, 5
C5CDEFF7 vpxor ymm6, ymm6, ymm7
C5CDDBFC vpand ymm7, ymm6, ymm4
C5C456FB vorps ymm7, ymm7, ymm3
C4E26DAAFA vfmsub213ps ymm7, ymm2, ymm2
C5FE5BFF vcvttps2dq ymm7, ymm7
C5BD72F60D vpslld ymm8, ymm6, 13
C4C14DEFF0 vpxor ymm6, ymm6, ymm8
C5BD72E611 vpsrad ymm8, ymm6, 17
C4C14DEFF0 vpxor ymm6, ymm6, ymm8
C5BD72F605 vpslld ymm8, ymm6, 5
C4C14DEFF0 vpxor ymm6, ymm6, ymm8
C54DDBC4 vpand ymm8, ymm6, ymm4
C53C56C3 vorps ymm8, ymm8, ymm3
C4626DAAC2 vfmsub213ps ymm8, ymm2, ymm2
C4417E5BC0 vcvttps2dq ymm8, ymm8
C5B572F60D vpslld ymm9, ymm6, 13
C4C14DEFF1 vpxor ymm6, ymm6, ymm9
C5B572E611 vpsrad ymm9, ymm6, 17
C4C14DEFF1 vpxor ymm6, ymm6, ymm9
C5B572F605 vpslld ymm9, ymm6, 5
C4C14DEFF1 vpxor ymm6, ymm6, ymm9
C54DDBCC vpand ymm9, ymm6, ymm4
C53456CB vorps ymm9, ymm9, ymm3
C4626DAACA vfmsub213ps ymm9, ymm2, ymm2
C4417E5BC9 vcvttps2dq ymm9, ymm9
C5AD72F60D vpslld ymm10, ymm6, 13
C4C14DEFF2 vpxor ymm6, ymm6, ymm10
C5AD72E611 vpsrad ymm10, ymm6, 17
C4C14DEFF2 vpxor ymm6, ymm6, ymm10
C5AD72F605 vpslld ymm10, ymm6, 5
C4C14DEFF2 vpxor ymm6, ymm6, ymm10
C54DDBD4 vpand ymm10, ymm6, ymm4
C52C56D3 vorps ymm10, ymm10, ymm3
C4626DAAD2 vfmsub213ps ymm10, ymm2, ymm2
C4417E5BD2 vcvttps2dq ymm10, ymm10
C4C13D72F008 vpslld ymm8, ymm8, 8
C4C13572F110 vpslld ymm9, ymm9, 16
C4C12D72F218 vpslld ymm10, ymm10, 24
C4C145EBF8 vpor ymm7, ymm7, ymm8
C44135EBC2 vpor ymm8, ymm9, ymm10
C4C145EBF8 vpor ymm7, ymm7, ymm8
C4627500C7 vpshufb ymm8, ymm1, ymm7
C5B572F60D vpslld ymm9, ymm6, 13
C4C14DEFF1 vpxor ymm6, ymm6, ymm9
C5B572E611 vpsrad ymm9, ymm6, 17
C4C14DEFF1 vpxor ymm6, ymm6, ymm9
C5B572F605 vpslld ymm9, ymm6, 5
C4C14DEFF1 vpxor ymm6, ymm6, ymm9
C54DDBCC vpand ymm9, ymm6, ymm4
C53456CB vorps ymm9, ymm9, ymm3
C4626DAACA vfmsub213ps ymm9, ymm2, ymm2
C4417E5BC9 vcvttps2dq ymm9, ymm9
C4423536C0 vpermd ymm8, ymm9, ymm8
C4627D00D7 vpshufb ymm10, ymm0, ymm7
C4423536CA vpermd ymm9, ymm9, ymm10
C5C5DBFD vpand ymm7, ymm7, ymm5
C4412C57D2 vxorps ymm10, ymm10, ymm10
C4C14574FA vpcmpeqb ymm7, ymm7, ymm10
C4C3354CF870 vpblendvb ymm7, ymm9, ymm8, ymm7
C57C28C7 vmovaps ymm8, ymm7
C4427D30C0 vpmovzxbw ymm8, ymm8
C4E37D19FF01 vextractf128 xmm7, ymm7, 1
C4E27D30FF vpmovzxbw ymm7, ymm7
G_M000_IG05: ;; offset=02A5H
C57E7F00 vmovdqu ymmword ptr[rax], ymm8
C5FE7F7820 vmovdqu ymmword ptr[rax+20H], ymm7
4883C040 add rax, 64
483BC2 cmp rax, rdx
0F82A1FEFFFF jb G_M000_IG04
G_M000_IG06: ;; offset=02BBH
C5ED72F60D vpslld ymm2, ymm6, 13
C5CDEFF2 vpxor ymm6, ymm6, ymm2
C5ED72E611 vpsrad ymm2, ymm6, 17
C5CDEFF2 vpxor ymm6, ymm6, ymm2
C5ED72F605 vpslld ymm2, ymm6, 5
C5CDEFF2 vpxor ymm6, ymm6, ymm2
C5CDDB1582020000 vpand ymm2, ymm6, ymmword ptr[reloc @RWD64] ; https://github.com/dotnet/runtime/issues/76781
C5EC56155A020000 vorps ymm2, ymm2, ymmword ptr[reloc @RWD32]
C5FD101D32020000 vmovupd ymm3, ymmword ptr[reloc @RWD00]
C4E265AA1529020000 vfmsub213ps ymm2, ymm3, ymmword ptr[reloc @RWD00]
C5FE5BD2 vcvttps2dq ymm2, ymm2
C5E572F60D vpslld ymm3, ymm6, 13
C5CDEFF3 vpxor ymm6, ymm6, ymm3
C5E572E611 vpsrad ymm3, ymm6, 17
C5CDEFF3 vpxor ymm6, ymm6, ymm3
C5E572F605 vpslld ymm3, ymm6, 5
C5CDEFF3 vpxor ymm6, ymm6, ymm3
C5CDDB1D42020000 vpand ymm3, ymm6, ymmword ptr[reloc @RWD64]
C5E4561D1A020000 vorps ymm3, ymm3, ymmword ptr[reloc @RWD32]
C5FD1025F2010000 vmovupd ymm4, ymmword ptr[reloc @RWD00]
C4E25DAA1DE9010000 vfmsub213ps ymm3, ymm4, ymmword ptr[reloc @RWD00]
C5FE5BDB vcvttps2dq ymm3, ymm3
C5DD72F60D vpslld ymm4, ymm6, 13
C5CDEFF4 vpxor ymm6, ymm6, ymm4
C5DD72E611 vpsrad ymm4, ymm6, 17
C5CDEFF4 vpxor ymm6, ymm6, ymm4
C5DD72F605 vpslld ymm4, ymm6, 5
C5CDEFF4 vpxor ymm6, ymm6, ymm4
C5CDDB2502020000 vpand ymm4, ymm6, ymmword ptr[reloc @RWD64]
C5DC5625DA010000 vorps ymm4, ymm4, ymmword ptr[reloc @RWD32]
C5FD103DB2010000 vmovupd ymm7, ymmword ptr[reloc @RWD00]
C4E245AA25A9010000 vfmsub213ps ymm4, ymm7, ymmword ptr[reloc @RWD00]
C5FE5BE4 vcvttps2dq ymm4, ymm4
C5C572F60D vpslld ymm7, ymm6, 13
C5CDEFF7 vpxor ymm6, ymm6, ymm7
C5C572E611 vpsrad ymm7, ymm6, 17
C5CDEFF7 vpxor ymm6, ymm6, ymm7
C5C572F605 vpslld ymm7, ymm6, 5
C5CDEFF7 vpxor ymm6, ymm6, ymm7
C5CDDB3DC2010000 vpand ymm7, ymm6, ymmword ptr[reloc @RWD64]
C5C4563D9A010000 vorps ymm7, ymm7, ymmword ptr[reloc @RWD32]
C57D100572010000 vmovupd ymm8, ymmword ptr[reloc @RWD00]
C4E23DAA3D69010000 vfmsub213ps ymm7, ymm8, ymmword ptr[reloc @RWD00]
C5FE5BFF vcvttps2dq ymm7, ymm7
C5E572F308 vpslld ymm3, ymm3, 8
C5DD72F410 vpslld ymm4, ymm4, 16
C5C572F718 vpslld ymm7, ymm7, 24
C5EDEBD3 vpor ymm2, ymm2, ymm3
C5DDEBDF vpor ymm3, ymm4, ymm7
C5EDEBD3 vpor ymm2, ymm2, ymm3
C4E27500CA vpshufb ymm1, ymm1, ymm2
C5E572F60D vpslld ymm3, ymm6, 13
C5CDEFF3 vpxor ymm6, ymm6, ymm3
C5E572E611 vpsrad ymm3, ymm6, 17
C5CDEFF3 vpxor ymm6, ymm6, ymm3
C5E572F605 vpslld ymm3, ymm6, 5
C5CDEFF3 vpxor ymm6, ymm6, ymm3
C5CDDB1D62010000 vpand ymm3, ymm6, ymmword ptr[reloc @RWD64]
C5E4561D3A010000 vorps ymm3, ymm3, ymmword ptr[reloc @RWD32]
C5FD102512010000 vmovupd ymm4, ymmword ptr[reloc @RWD00]
C4E25DAA1D09010000 vfmsub213ps ymm3, ymm4, ymmword ptr[reloc @RWD00]
C5FE5BDB vcvttps2dq ymm3, ymm3
C4E26536C9 vpermd ymm1, ymm3, ymm1
C4E27D00C2 vpshufb ymm0, ymm0, ymm2
C4E26536C0 vpermd ymm0, ymm3, ymm0
C5EDDBD5 vpand ymm2, ymm2, ymm5
G_M000_IG07: ;; offset=042EH
C5E457DB vxorps ymm3, ymm3, ymm3
C5ED74D3 vpcmpeqb ymm2, ymm2, ymm3
C4E37D4CC120 vpblendvb ymm0, ymm0, ymm1, ymm2
C5FC28C8 vmovaps ymm1, ymm0
C4E27D30C9 vpmovzxbw ymm1, ymm1
C4E37D19C001 vextractf128 xmm0, ymm0, 1
C4E27D30C0 vpmovzxbw ymm0, ymm0
C5FE7F0A vmovdqu ymmword ptr[rdx], ymm1
C5FE7F4220 vmovdqu ymmword ptr[rdx+20H], ymm0
G_M000_IG08: ;; offset=0459H
C5F828B42480000000 vmovaps xmm6, qword ptr [rsp+80H]
C5F8287C2470 vmovaps xmm7, qword ptr [rsp+70H]
C57828442460 vmovaps xmm8, qword ptr [rsp+60H]
C578284C2450 vmovaps xmm9, qword ptr [rsp+50H]
C57828542440 vmovaps xmm10, qword ptr [rsp+40H]
C5F877 vzeroupper
4881C498000000 add rsp, 152
5B pop rbx
5D pop rbp
5E pop rsi
5F pop rdi
C3 ret
RWD00 dq 41F8000041F80000h, 41F8000041F80000h, 41F8000041F80000h, 41F8000041F80000h
RWD32 dq 3F8000003F800000h, 3F8000003F800000h, 3F8000003F800000h, 3F8000003F800000h
RWD64 dq 007FFFFF007FFFFFh, 007FFFFF007FFFFFh, 007FFFFF007FFFFFh, 007FFFFF007FFFFFh
RWD96 dq 0101010101010101h, 0101010101010101h, 0101010101010101h, 0101010101010101h
; Total bytes of code 1161
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To get a random byte-vector, e.g.