#ghc asm backend
_s1Mj_info:
# Disassembly of the code GHC's native ("asm") backend produced for the
# five-stage bit shuffle; the masks below are the ones used in outerShuffle64A.
# GHC's x86-64 STG register mapping: %r12 = Hp, %r13 = BaseReg, %rbx = R1.
# Only %rax, %rbx, %rcx and %rdx are used, and every mask is re-materialised
# into %rax immediately before the AND that consumes it.
#
# Heap check: bump %r12 by 16 bytes and compare with the limit stored at
# 0x90(%r13) (presumably HpLim -- confirm against the RTS register table).
0x00000000000018c0 4983C410 addq $0x10, %r12
0x00000000000018c4 4D3BA590000000 cmpq %ds:0x90(%r13), %r12
# Opcode 0F 87 is "ja rel32" (the disassembler prints it as "jal"). The target
# 0x1a0e lies past the last line of this listing, so the failure path is not shown.
0x00000000000018cb 0F873D010000 jal $0x1a0e
# Stage 16. The argument x is read from 0x7(%rbx) three separate times
# (presumably the payload word of a tag-1 pointer in R1 -- confirm).
#   rcx = x & 0xffff00000000ffff                (bits that stay in place)
0x00000000000018d1 48B8FFFF00000000FFFF movq $0xffff00000000ffff, %rax
0x00000000000018db 488B4B07 movq %ds:0x7(%rbx), %rcx
0x00000000000018df 4821C1 andq %rax, %rcx
#   rdx = ((x >> 16) & 0xffff0000) | rcx        (movl zero-extends into %rax)
0x00000000000018e2 B80000FFFF movl $0xffff0000, %eax
0x00000000000018e7 488B5307 movq %ds:0x7(%rbx), %rdx
0x00000000000018eb 48C1EA10 shrq $0x10, %rdx
0x00000000000018ef 4821C2 andq %rax, %rdx
0x00000000000018f2 4809CA orq %rcx, %rdx
#   rbx = ((x & 0xffff0000) << 16) | rdx        (stage result; overwrites R1)
0x00000000000018f5 B80000FFFF movl $0xffff0000, %eax
0x00000000000018fa 488B5B07 movq %ds:0x7(%rbx), %rbx
0x00000000000018fe 4821C3 andq %rax, %rbx
0x0000000000001901 48C1E310 shlq $0x10, %rbx
0x0000000000001905 4809D3 orq %rdx, %rbx
# Stage 8, same three-term shape applied to %rbx:
#   keep mask 0xff0000ffff0000ff, moving mask 0x0000ff000000ff00, shift 8.
0x0000000000001908 48B8FF0000FFFF0000FF movq $0xff0000ffff0000ff, %rax
0x0000000000001912 4889D9 movq %rbx, %rcx
0x0000000000001915 4821C1 andq %rax, %rcx
0x0000000000001918 48B800FF000000FF0000 movq $0xff000000ff00, %rax
0x0000000000001922 4889DA movq %rbx, %rdx
0x0000000000001925 48C1EA08 shrq $0x8, %rdx
0x0000000000001929 4821C2 andq %rax, %rdx
0x000000000000192c 4809CA orq %rcx, %rdx
0x000000000000192f 48B800FF000000FF0000 movq $0xff000000ff00, %rax
0x0000000000001939 4821C3 andq %rax, %rbx
0x000000000000193c 48C1E308 shlq $0x8, %rbx
0x0000000000001940 4809D3 orq %rdx, %rbx
# Stage 4: keep mask 0xf00ff00ff00ff00f, moving mask 0x00f000f000f000f0, shift 4.
0x0000000000001943 48B80FF00FF00FF00FF0 movq $0xf00ff00ff00ff00f, %rax
0x000000000000194d 4889D9 movq %rbx, %rcx
0x0000000000001950 4821C1 andq %rax, %rcx
0x0000000000001953 48B8F000F000F000F000 movq $0xf000f000f000f0, %rax
0x000000000000195d 4889DA movq %rbx, %rdx
0x0000000000001960 48C1EA04 shrq $0x4, %rdx
0x0000000000001964 4821C2 andq %rax, %rdx
0x0000000000001967 4809CA orq %rcx, %rdx
0x000000000000196a 48B8F000F000F000F000 movq $0xf000f000f000f0, %rax
0x0000000000001974 4821C3 andq %rax, %rbx
0x0000000000001977 48C1E304 shlq $0x4, %rbx
0x000000000000197b 4809D3 orq %rdx, %rbx
# Stage 2: keep mask 0xc3c3c3c3c3c3c3c3, moving mask 0x0c0c0c0c0c0c0c0c, shift 2.
0x000000000000197e 48B8C3C3C3C3C3C3C3C3 movq $0xc3c3c3c3c3c3c3c3, %rax
0x0000000000001988 4889D9 movq %rbx, %rcx
0x000000000000198b 4821C1 andq %rax, %rcx
0x000000000000198e 48B80C0C0C0C0C0C0C0C movq $0xc0c0c0c0c0c0c0c, %rax
0x0000000000001998 4889DA movq %rbx, %rdx
0x000000000000199b 48C1EA02 shrq $0x2, %rdx
0x000000000000199f 4821C2 andq %rax, %rdx
0x00000000000019a2 4809CA orq %rcx, %rdx
0x00000000000019a5 48B80C0C0C0C0C0C0C0C movq $0xc0c0c0c0c0c0c0c, %rax
0x00000000000019af 4821C3 andq %rax, %rbx
0x00000000000019b2 48C1E302 shlq $0x2, %rbx
0x00000000000019b6 4809D3 orq %rdx, %rbx
# RIP-relative lea with a zero displacement, so the disassembler resolves it to
# the next instruction (0x19c0); likely an unrelocated reference -- check the
# relocation table. The address is stored at -8(%r12), the first of the two
# words written into the 16 bytes claimed above (presumably the info pointer of
# the boxed result).
0x00000000000019b9 488D0500000000 leaq %ds:0x19c0, %rax
0x00000000000019c0 49894424F8 movq %rax, %ds:0xfffffffffffffff8(%r12) ; XREF=0x19b9
# Stage 1: keep mask 0x9999999999999999, moving mask 0x2222222222222222, shift 1.
0x00000000000019c5 48B89999999999999999 movq $0x9999999999999999, %rax
0x00000000000019cf 4889D9 movq %rbx, %rcx
0x00000000000019d2 4821C1 andq %rax, %rcx
0x00000000000019d5 48B82222222222222222 movq $0x2222222222222222, %rax
0x00000000000019df 4889DA movq %rbx, %rdx
0x00000000000019e2 48D1EA shrq $0x1, %rdx
0x00000000000019e5 4821C2 andq %rax, %rdx
0x00000000000019e8 4809CA orq %rcx, %rdx
0x00000000000019eb 48B82222222222222222 movq $0x2222222222222222, %rax
0x00000000000019f5 4821C3 andq %rax, %rbx
0x00000000000019f8 48D1E3 shlq $0x1, %rbx
0x00000000000019fb 4809D3 orq %rdx, %rbx
# Store the shuffled word at (%r12), the second word of the new heap object.
# The listing stops here: the return sequence and the 0x1a0e branch target
# are not part of this excerpt.
0x00000000000019fe 49891C24 movq %rbx, %ds:%r12
_s1yV_info:
# Second disassembly of the same five-stage shuffle -- presumably the LLVM
# backend's output (the notes after the Haskell source refer to "the LLVM code").
# Compared with the listing above, x is loaded once, the masks are spread over
# %rax, %rbx, %rdx, %rdi, %r8-%r11 and %r14, and the stages are interleaved.
#
# Heap check: %rax keeps the old %r12, %r12 is advanced by 16 bytes and compared
# with the limit at 0x90(%r13) (presumably HpLim -- confirm).
0x00000000000018c0 4C89E0 movq %r12, %rax
0x00000000000018c3 4C8D6010 leaq %ds:0x10(%rax), %r12
0x00000000000018c7 4D3BA590000000 cmpq %ds:0x90(%r13), %r12
# Opcode 76 is "jbe rel8" (printed as "jbel"): continue at 0x18e1 when within the limit.
0x00000000000018ce 7611 jbel $0x18e1
# Failure path: record the 16-byte request at 0xc0(%r13) and tail-jump through
# the pointer at -0x10(%r13) (presumably HpAlloc and the GC entry -- confirm).
0x00000000000018d0 49C785C000000010000000 movq $0x10, %ds:0xc0(%r13)
0x00000000000018db 498B45F0 movq %ds:0xfffffffffffffff0(%r13), %rax
0x00000000000018df FFE0 jmpq *%rax
# Single load of the argument x from 0x7(%rbx).
0x00000000000018e1 488B7B07 movq %ds:0x7(%rbx), %rdi ; XREF=0x18ce
# Stage 16:
#   rdx = x & 0xffff00000000ffff
#   rcx = ((x >> 16) & 0xffff0000) | rdx     (32-bit andl zero-extends)
#   rsi = (x << 16) & 0x0000ffff00000000     (mask applied after the shift)
0x00000000000018e5 48BAFFFF00000000FFFF movq $0xffff00000000ffff, %rdx
0x00000000000018ef 4821FA andq %rdi, %rdx
0x00000000000018f2 4889F9 movq %rdi, %rcx
0x00000000000018f5 48C1E910 shrq $0x10, %rcx
0x00000000000018f9 81E10000FFFF andl $0xffff0000, %ecx
0x00000000000018ff 48C1E710 shlq $0x10, %rdi
0x0000000000001903 48BE00000000FFFF0000 movq $0xffff00000000, %rsi
0x000000000000190d 4821FE andq %rdi, %rsi
0x0000000000001910 4809D1 orq %rdx, %rcx
# RIP-relative load with a zero displacement (resolved by the disassembler to
# the next instruction, 0x191a); likely unrelocated -- check the relocation
# table. The loaded word is stored below at 0x8(%rax), i.e. new %r12 - 8.
0x0000000000001913 488B3D00000000 movq %ds:0x191a, %rdi
# Masks for stages 8 and 4 loaded ahead of use. The left-shift masks are
# pre-shifted: 0x00ff000000ff0000 = 0x0000ff000000ff00 << 8 and
# 0x0f000f000f000f00 = 0x00f000f000f000f0 << 4.
0x000000000000191a 48BBFF0000FFFF0000FF movq $0xff0000ffff0000ff, %rbx ; XREF=0x1913
0x0000000000001924 48BA0000FF000000FF00 movq $0xff000000ff0000, %rdx
0x000000000000192e 49BA0FF00FF00FF00FF0 movq $0xf00ff00ff00ff00f, %r10
0x0000000000001938 49BB000F000F000F000F movq $0xf000f000f000f00, %r11
0x0000000000001942 49BE00FF000000FF0000 movq $0xff000000ff00, %r14
# First word of the new heap object (old %r12 + 8).
0x000000000000194c 48897808 movq %rdi, %ds:0x8(%rax)
# rsi = complete stage-16 result.
0x0000000000001950 4809CE orq %rcx, %rsi
# Stage 8. The right-shifted term is taken from %rcx, the stage-16 value
# without its left-shifted part; those bits fall outside the 0x0000ff000000ff00
# mask after the shift. The later stages reuse %rcx / %rdx in the same way.
0x0000000000001953 4889F7 movq %rsi, %rdi
0x0000000000001956 48C1E708 shlq $0x8, %rdi
0x000000000000195a 4821D7 andq %rdx, %rdi
0x000000000000195d 49B9C3C3C3C3C3C3C3C3 movq $0xc3c3c3c3c3c3c3c3, %r9
0x0000000000001967 4821DE andq %rbx, %rsi
0x000000000000196a 48C1E908 shrq $0x8, %rcx
0x000000000000196e 49B89999999999999999 movq $0x9999999999999999, %r8
0x0000000000001978 4C21F1 andq %r14, %rcx
0x000000000000197b 48B80C0C0C0C0C0C0C0C movq $0xc0c0c0c0c0c0c0c, %rax
0x0000000000001985 4809F1 orq %rsi, %rcx
# rdi = complete stage-8 result.
0x0000000000001988 4809CF orq %rcx, %rdi
# Stage 4: rsi = (rdi << 4) & r11, rdi &= r10 (keep mask),
# rdx = ((rcx >> 4) & 0x00f000f000f000f0) | rdi.
0x000000000000198b 4889FE movq %rdi, %rsi
0x000000000000198e 48C1E604 shlq $0x4, %rsi
0x0000000000001992 4C21DE andq %r11, %rsi
0x0000000000001995 48BB3030303030303030 movq $0x3030303030303030, %rbx
0x000000000000199f 4C21D7 andq %r10, %rdi
0x00000000000019a2 48C1E904 shrq $0x4, %rcx
0x00000000000019a6 48BAF000F000F000F000 movq $0xf000f000f000f0, %rdx
0x00000000000019b0 4821CA andq %rcx, %rdx
0x00000000000019b3 4809FA orq %rdi, %rdx
# rsi = complete stage-4 result.
0x00000000000019b6 4809D6 orq %rdx, %rsi
# Stage 2: the left shift by 2 is done with lea (index scaled by 4), then masked
# with 0x3030303030303030 (= 0x0c0c0c0c0c0c0c0c << 2).
0x00000000000019b9 488D0CB500000000 leaq %ds:0x0(,%rsi,4), %rcx
0x00000000000019c1 4821D9 andq %rbx, %rcx
0x00000000000019c4 4C21CE andq %r9, %rsi
0x00000000000019c7 48C1EA02 shrq $0x2, %rdx
0x00000000000019cb 48BF2222222222222222 movq $0x2222222222222222, %rdi
0x00000000000019d5 4821C2 andq %rax, %rdx
0x00000000000019d8 48BB4444444444444444 movq $0x4444444444444444, %rbx
0x00000000000019e2 4809F2 orq %rsi, %rdx
# rcx = complete stage-2 result.
0x00000000000019e5 4809D1 orq %rdx, %rcx
# Stage 1: the left shift by 1 is done with lea (rcx + rcx), then masked with
# 0x4444444444444444 (= 0x2222222222222222 << 1).
0x00000000000019e8 488D0409 leaq %ds:(%rcx,%rcx), %rax
0x00000000000019ec 4821D8 andq %rbx, %rax
0x00000000000019ef 4C21C1 andq %r8, %rcx
0x00000000000019f2 48D1EA shrq $0x1, %rdx
0x00000000000019f5 4821FA andq %rdi, %rdx
0x00000000000019f8 4809CA orq %rcx, %rdx
# rdx = final shuffled word.
0x00000000000019fb 4809C2 orq %rax, %rdx
# Store the result as the second word of the new heap object.
0x00000000000019fe 49891424 movq %rdx, %ds:%r12
# Return sequence: load the word at 8(%rbp) as the jump target, advance %rbp by
# 8, and leave %r12 - 7 in %rbx, i.e. the address of the new object (%r12 - 8)
# plus 1 (presumably a tag-1 pointer returned in R1 -- confirm).
0x0000000000001a02 488B4508 movq %ss:0x8(%rbp), %rax
0x0000000000001a06 488D6D08 leaq %ss:0x8(%rbp), %rbp
0x0000000000001a0a 498D5C24F9 leaq %ds:0xfffffffffffffff9(%r12), %rbx
0x0000000000001a0f FFE0 jmpq *%rax
# Multi-byte nop after the final jump (padding).
0x0000000000001a11 0F1F8000000000 nopl %ds:0x0(%rax)
-- | Outer perfect shuffle of a 64-bit word: the bits of the low 32-bit half
-- end up in the even bit positions and the bits of the high half in the odd
-- ones (bit 0 stays at bit 0, bit 32 moves to bit 1, bit 1 moves to bit 2, ...).
--
-- The shuffle runs as five exchange stages with field widths 16, 8, 4, 2 and 1.
-- Each stage swaps the two middle quarters of every block that is four fields
-- wide and leaves the two outer quarters of that block untouched.
outerShuffle64A :: Word -> Word
outerShuffle64A !w = stage1
  where
    -- Original author's note: the 16 shift should be conditional
    -- (presumably on 'Word' really being 64 bits wide -- TODO confirm).
    stage16 = exchange 16 0x00000000FFFF0000 0xFFFF00000000FFFF w
    stage8  = exchange  8 0x0000FF000000FF00 0xFF0000FFFF0000FF stage16
    stage4  = exchange  4 0x00F000F000F000F0 0xF00FF00FF00FF00F stage8
    stage2  = exchange  2 0x0C0C0C0C0C0C0C0C 0xC3C3C3C3C3C3C3C3 stage4
    stage1  = exchange  1 0x2222222222222222 0x9999999999999999 stage2

    -- One exchange stage. @moving@ selects the fields that travel up by
    -- @width@ bits; the fields @width@ bits above them travel down into the
    -- same positions; @fixed@ selects the bits that do not move.
    exchange width moving fixed v =
      ((v .&. moving) << width)
        .|. ((v >> width) .&. moving)
        .|. (v .&. fixed)
{-# INLINE outerShuffle64A #-}
What I find pretty neat is that the LLVM code uses more of the available registers. I still need to run some benchmarks on it, though :)
outerShuffle64A repeated 1,000 times takes an average of 8.7 microseconds with the LLVM code. Pretty good!