Last active
December 17, 2015 00:07
-
-
Save pdumais/0470e5e24cbe800067bb to your computer and use it in GitHub Desktop.
sha256 implementation using AVX instructions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*----------------------------------------------------------------------
 * sha256s - SHA-256 of a byte buffer, emitted as 64 lowercase hex digits.
 *
 * C prototype:
 *   extern "C" void sha256s(unsigned char* output, unsigned char* input,
 *                           unsigned long size);
 *
 * Syntax: GNU as, AT&T operand order.
 * ABI:    System V AMD64 only (argument registers differ on Windows x64).
 * ISA:    uses 256-bit vpshufb / vpermps / vpbroadcastb / vperm2i128,
 *         i.e. the CPU must support AVX2.
 *
 * In:     rdi = output; exactly 64 ASCII bytes are stored, no NUL is added
 *         rsi = input
 *         rdx = size of input in bytes
 * Out:    none (result is written through rdi)
 * Clobb:  rax, rcx, rdx, rsi, rdi, r8-r11, ymm0-ymm3, ymm9-ymm11, ymm14,
 *         ymm15, flags.  rbx, r12, r13 and rbp are saved and restored.
 * Stack:  LOCAL_VAR_SIZE bytes of locals plus four pushes; makes no calls.
 *
 * NOT reentrant / thread-safe: the working state lives in the static
 * buffers SH, SH2 and SW.
 *
 * Register roles for the whole chunk loop:
 *   r8  = number of 64-byte chunks still to process
 *   r9  = read cursor into the input
 *   r10 = original size in bytes (turned into a bit count on the last chunk)
 *   r11 = loop index of the schedule / round loops
 *   r12 = &SW (message schedule, 64 dwords)
 *   r13 = input bytes not yet consumed
 *   rdi = &SK (round constants) until the output pointer is popped back
 *   ymm9  = lane indices used by vpermps to rotate the working variables
 *   ymm14 = vpshufb control that byte-swaps every dword
 *----------------------------------------------------------------------*/
        .globl  sha256s

        .equ    LOCAL_VAR_SIZE, 8       /* bytes of rbp-relative locals        */
        .equ    VAR_TRAIL, -8           /* padding byte still to emit: 0x80,   */
                                        /* then 0 once it has been written     */

        .equ    CHUNK_BYTES, 64         /* one SHA-256 block = 512 bits        */
        .equ    W_COUNT, 64             /* dwords in the message schedule      */
        .equ    W_LEN_HI, 56            /* byte offset in SW of length bits 63..32 */
        .equ    W_LEN_LO, 60            /* byte offset in SW of length bits 31..0  */

        /* byte offsets of the working variables a..h inside SH2 */
        .equ    ST_A, 0
        .equ    ST_B, 4
        .equ    ST_C, 8
        .equ    ST_E, 16
        .equ    ST_F, 20
        .equ    ST_G, 24
        .equ    ST_H, 28

        .text
sha256s:
        push    %rbp
        mov     %rsp,%rbp
        sub     $LOCAL_VAR_SIZE,%rsp
        push    %rbx                    /* callee-saved; ebx is a temp below   */
        push    %r12                    /* callee-saved; holds &SW             */
        push    %r13                    /* callee-saved; holds bytes remaining */
        push    %rdi                    /* output pointer, popped before hex   */

        /* The 0x80 terminator must be emitted exactly once. */
        movq    $0x80,VAR_TRAIL(%rbp)

        /* Table bases, loaded once (RIP-relative so the code links as PIE). */
        lea     SW(%rip),%r12
        lea     SK(%rip),%rdi

        /* Reset the running hash to the initial values. */
        vmovdqa CACHEDSH(%rip),%ymm0
        vmovdqa %ymm0,SH(%rip)

        mov     %rsi,%r9                /* r9  = input cursor                  */
        mov     %rdx,%r13               /* r13 = remaining size                */
        mov     %rdx,%r10               /* r10 = original size                 */

        /* chunks = ceil((size + 1 terminator byte + 8 length bytes) / 64) */
        lea     (1+8+63)(%rdx),%r8
        shr     $6,%r8                  /* r8 = number of chunks               */

        /* Unaligned loads: these two tables carry no alignment directive
         * of their own in the data section. */
        vmovdqu permmaskshl(%rip),%ymm9
        vmovdqu mask(%rip),%ymm14

/**********************************************
 * Chunk loop. Each iteration hashes one 512-bit
 * (64-byte) chunk. A chunk that the input does
 * not fill completely is padded: a single 0x80
 * byte follows the data, the rest is zero, and
 * the last chunk ends with the original size in
 * bits.
 **********************************************/
.Lchunk_loop:
        cmp     $CHUNK_BYTES,%r13
        jb      .Lpartial_chunk

        /* Full chunk: load, convert every dword to big-endian, store in SW. */
        vmovdqu (%r9),%ymm0
        vmovdqu 32(%r9),%ymm2
        vpshufb %ymm14,%ymm0,%ymm0
        vpshufb %ymm14,%ymm2,%ymm2
        vmovdqa %ymm0,(%r12)
        vmovdqa %ymm2,32(%r12)
        sub     $CHUNK_BYTES,%r13
        jmp     .Lchunk_ready

        /* Fewer than 64 bytes left (possibly none): copy them byte by
         * byte so nothing past the end of the input is read. */
.Lpartial_chunk:
        xor     %ecx,%ecx               /* rcx = byte index into SW            */
.Lcopy_tail:
        cmp     %rcx,%r13
        je      .Lzero_fill
        mov     (%r9,%rcx),%al
        mov     %al,(%r12,%rcx)
        inc     %rcx
        jmp     .Lcopy_tail

.Lzero_fill:
        xor     %eax,%eax
.Lzero_loop:                            /* rcx < 64 here, so >= 1 iteration    */
        mov     %al,(%r12,%rcx)
        inc     %rcx
        cmp     $CHUNK_BYTES,%rcx
        jne     .Lzero_loop

        /* Write the pending terminator right after the data, then clear
         * it so a following padding-only chunk writes 0 instead. */
        mov     VAR_TRAIL(%rbp),%rax
        mov     %al,(%r12,%r13)
        xor     %r13d,%r13d             /* input fully consumed                */
        mov     %r13,VAR_TRAIL(%rbp)

        /* Reload the assembled chunk and convert it to big-endian dwords. */
        vmovdqa (%r12),%ymm0
        vmovdqa 32(%r12),%ymm2
        vpshufb %ymm14,%ymm0,%ymm0
        vpshufb %ymm14,%ymm2,%ymm2
        vmovdqa %ymm0,(%r12)
        vmovdqa %ymm2,32(%r12)

/**********************************************
 * SW[0..15] now holds a full 512-bit chunk with
 * endianness corrected.
 **********************************************/
.Lchunk_ready:
        cmp     $1,%r8                  /* last chunk?                         */
        ja      .Lexpand_schedule

        /* Last chunk: store the message length in bits in the final two
         * dwords. SW is already in native dword order, so the high half
         * goes to w[14] and the low half to w[15]. */
        shl     $3,%r10                 /* bytes -> bits                       */
        mov     %r10,%rax
        mov     %eax,W_LEN_LO(%r12)
        shr     $32,%rax
        mov     %eax,W_LEN_HI(%r12)

        /* Step 1: extend w[0..15] to w[16..63]. */
.Lexpand_schedule:
        mov     $16,%r11d
.Lschedule_loop:
        /* eax = s0 = ror(w[i-15],7) ^ ror(w[i-15],18) ^ (w[i-15] >> 3) */
        mov     -(15*4)(%r12,%r11,4),%eax
        mov     %eax,%ebx
        mov     %eax,%edx
        ror     $7,%eax
        ror     $18,%ebx
        shr     $3,%edx
        xor     %ebx,%eax
        xor     %edx,%eax

        /* ebx = s1 = ror(w[i-2],17) ^ ror(w[i-2],19) ^ (w[i-2] >> 10) */
        mov     -(2*4)(%r12,%r11,4),%ebx
        mov     %ebx,%ecx
        mov     %ebx,%edx
        ror     $17,%ebx
        ror     $19,%ecx
        shr     $10,%edx
        xor     %ecx,%ebx
        xor     %edx,%ebx

        /* w[i] = w[i-16] + s0 + w[i-7] + s1 */
        add     -(16*4)(%r12,%r11,4),%eax
        add     -(7*4)(%r12,%r11,4),%eax
        add     %ebx,%eax
        mov     %eax,(%r12,%r11,4)

        inc     %r11
        cmp     $W_COUNT,%r11
        jne     .Lschedule_loop

        /* Step 2: 64 compression rounds on a working copy (SH2) of SH. */
        vmovdqa SH(%rip),%ymm0
        vmovdqa %ymm0,SH2(%rip)
        xor     %r11d,%r11d
.Lround_loop:
        /* eax = S1 = ror(e,6) ^ ror(e,11) ^ ror(e,25) */
        mov     SH2+ST_E(%rip),%eax
        mov     %eax,%ebx
        mov     %eax,%ecx
        ror     $6,%eax
        ror     $11,%ebx
        ror     $25,%ecx
        xor     %ebx,%eax
        xor     %ecx,%eax

        /* edx = ch = (e & f) ^ (~e & g) */
        mov     SH2+ST_E(%rip),%ebx
        mov     SH2+ST_F(%rip),%ecx
        mov     SH2+ST_G(%rip),%edx
        and     %ebx,%ecx
        not     %ebx
        and     %ebx,%edx
        xor     %ecx,%edx

        /* eax = t1 = S1 + ch + h + K[i] + w[i] */
        add     %edx,%eax
        add     SH2+ST_H(%rip),%eax
        add     (%rdi,%r11,4),%eax      /* K[i]                                */
        add     (%r12,%r11,4),%eax      /* w[i]                                */

        /* esi = maj = (a & b) ^ (a & c) ^ (b & c) */
        mov     SH2+ST_A(%rip),%ecx     /* ecx = a                             */
        mov     SH2+ST_B(%rip),%edx     /* edx = b                             */
        mov     SH2+ST_C(%rip),%ebx     /* ebx = c                             */
        mov     %ebx,%esi
        and     %edx,%ebx               /* ebx = b & c                         */
        and     %ecx,%edx               /* edx = a & b                         */
        and     %ecx,%esi               /* esi = a & c                         */
        xor     %edx,%ebx
        xor     %ebx,%esi

        /* ebx = t2 = (ror(a,2) ^ ror(a,13) ^ ror(a,22)) + maj
         * ecx still holds a from the maj computation above. */
        mov     %ecx,%ebx
        mov     %ecx,%edx
        ror     $2,%ebx
        ror     $13,%ecx
        ror     $22,%edx
        xor     %ecx,%ebx
        xor     %edx,%ebx
        add     %esi,%ebx

        /* Move every working variable one slot up (h=g, g=f, ... b=a)
         * with a single lane permute, then patch in the two new values. */
        vpermps SH2(%rip),%ymm9,%ymm1
        vmovdqa %ymm1,SH2(%rip)
        add     %eax,%ebx
        mov     %ebx,SH2+ST_A(%rip)     /* a = t1 + t2                         */
        add     %eax,SH2+ST_E(%rip)     /* e = d + t1 (slot holds old d)       */

        inc     %r11
        cmp     $W_COUNT,%r11
        jne     .Lround_loop

        /* H[] += working variables. */
        vmovdqa SH2(%rip),%ymm2
        vpaddd  SH(%rip),%ymm2,%ymm2
        vmovdqa %ymm2,SH(%rip)

        add     $CHUNK_BYTES,%r9
        dec     %r8
        jnz     .Lchunk_loop

/**********************************************
 * Convert the 32-byte digest to 64 hex digits.
 **********************************************/
        pop     %rdi                    /* rdi = output buffer again           */

        vmovdqa SH(%rip),%ymm3
        vpshufb %ymm14,%ymm3,%ymm3      /* dwords -> big-endian byte stream    */

        vpbroadcastb hexdigits(%rip),%ymm9      /* 0x27 = 'a' - '9' - 1        */
        vpbroadcastb gtmask(%rip),%ymm10        /* 0x39 = '9'                  */
        vpbroadcastb digits(%rip),%ymm11        /* 0x30 = '0'                  */
        vpbroadcastb nibblemask(%rip),%ymm14    /* 0x0F in every byte          */
        vpslld  $4,%ymm14,%ymm15                /* 0xF0 in every byte          */

        vpand   %ymm15,%ymm3,%ymm1
        vpand   %ymm14,%ymm3,%ymm0
        vpsrld  $4,%ymm1,%ymm1
        /* ymm0 = low nibble of each byte
         * ymm1 = high nibble of each byte, shifted down by 4 */

        /* Interleave high/low nibbles. The unpacks work per 128-bit lane,
         * so the halves are put back in byte order with vperm2i128. */
        vpunpcklbw %ymm0,%ymm1,%ymm2    /* digest bytes 0-7   | 16-23          */
        vpunpckhbw %ymm0,%ymm1,%ymm3    /* digest bytes 8-15  | 24-31          */
        vperm2i128 $0x02,%ymm2,%ymm3,%ymm0      /* digits of bytes 0-15        */
        vperm2i128 $0x13,%ymm2,%ymm3,%ymm1      /* digits of bytes 16-31       */

        /* Nibble -> ASCII: add '0', then add 0x27 more where the result
         * is above '9' so that 10..15 become 'a'..'f'. */
        vpaddb  %ymm11,%ymm0,%ymm0
        vpaddb  %ymm11,%ymm1,%ymm1
        vpcmpgtb %ymm10,%ymm0,%ymm2
        vpcmpgtb %ymm10,%ymm1,%ymm3
        vpand   %ymm9,%ymm2,%ymm2
        vpand   %ymm9,%ymm3,%ymm3
        vpaddb  %ymm2,%ymm0,%ymm0
        vpaddb  %ymm3,%ymm1,%ymm1

        vmovdqu %ymm0,(%rdi)
        vmovdqu %ymm1,32(%rdi)

        /* Clear the upper ymm halves before returning to code that may
         * use legacy SSE encodings. */
        vzeroupper

        pop     %r13
        pop     %r12
        pop     %rbx
        leave                           /* also releases LOCAL_VAR_SIZE        */
        ret
.data | |
.align 32 | |
//This should reside on stack otherwise the function is not threadsafe | |
//but doing so, we would need to use unaligned move or make sure the stack space | |
//is aligned on a 256bit boundary | |
SH: .int 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19 | |
mask: .byte 0x03,0x02,0x01,0x00,0x07,0x06,0x05,0x04,0x0B,0x0A,0x09,0x08,0x0F,0x0E,0x0D,0x0C | |
.byte 0x13,0x12,0x11,0x10,0x17,0x16,0x15,0x14,0x1B,0x1A,0x19,0x18,0x1F,0x1E,0x1D,0x1C | |
permmaskshl:.int 0,0,1,2,3,4,5,6 | |
SK: .int 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | |
.int 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | |
.int 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | |
.int 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | |
.int 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | |
.int 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | |
.int 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | |
.int 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | |
.align 32 | |
CACHEDSH: .int 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19 | |
.align 32 | |
SH2: .int 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19 | |
.align 32 | |
SW: | |
.rept 64 | |
.int 0 | |
.endr | |
nibblemask: .byte 0x0F | |
digits: .byte 0x30 | |
gtmask: .byte 0x39 | |
hexdigits: .byte 0x27 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment