-
-
Save jedwardsol/b1061c366eaf4f750854fbb0113ce02d to your computer and use it in GitHub Desktop.
Simd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xmm 128-bit 4 floats | |
ymm 256-bit 8 floats | |
zmm 512-bit 16 floats | |
float data[4096]; { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p} 256 times | |
sum of all values | |
00007FF77F291000 lea rax,[data (07FF77F294640h)] | |
00007FF77F291007 mov ecx,100h | |
00007FF77F29100C vxorps xmm0,xmm0,xmm0 ymm0 = 0 | |
00007FF77F291010 vxorps xmm1,xmm1,xmm1 ymm1 = 0 | |
00007FF77F291014 nop dword ptr [rax] | |
00007FF77F291018 nop dword ptr [rax+rax] | |
00007FF77F291020 vaddps ymm0,ymm0,ymmword ptr [rax] ymm0 {A,B,C,D,E,F,G,H} | |
00007FF77F291024 vaddps ymm1,ymm1,ymmword ptr [rax+20h] ymm1 {I,J,K,L,M,N,O,P} | |
00007FF77F291029 add rax,40h | |
00007FF77F29102D sub rcx,1 | |
00007FF77F291031 jne sum+20h (07FF77F291020h) | |
00007FF77F291033 vaddps ymm0,ymm1,ymm0 ymm0 {A+I,B+J,C+K,D+L,E+M,F+N,G+O,H+P} | |
00007FF77F291037 vhaddps ymm1,ymm0,ymm0 ymm1 {A+I+B+J, C+K+D+L, A+I+B+J, C+K+D+L, | |
E+M+F+N, G+O+H+P, E+M+F+N, G+O+H+P} | |
00007FF77F29103B vhaddps ymm2,ymm1,ymm1 ymm2 {A+I+B+J+C+K+D+L, A+I+B+J+C+K+D+L, A+I+B+J+C+K+D+L, A+I+B+J+C+K+D+L, | |
E+M+F+N+G+O+H+P, E+M+F+N+G+O+H+P, E+M+F+N+G+O+H+P, E+M+F+N+G+O+H+P} | |
00007FF77F291046 vextractf128 xmm0,ymm2,1 xmm0 { E+M+F+N+G+O+H+P, E+M+F+N+G+O+H+P, E+M+F+N+G+O+H+P, E+M+F+N+G+O+H+P } | |
00007FF77F29104C vaddps xmm1,xmm0,xmm2 xmm1 { A+I+B+J+C+K+D+L+E+M+F+N+G+O+H+P, 4 times } | |
00007FF77F29106D vmovaps xmm0,xmm1 xmm0 = xmm1 | |
00007FF77F291071 vzeroupper zap ymm*.upper | |
00007FF77F291074 ret |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment