Skip to content

Instantly share code, notes, and snippets.

@pdumais
Last active December 17, 2015 00:07
Show Gist options
  • Save pdumais/0470e5e24cbe800067bb to your computer and use it in GitHub Desktop.
Save pdumais/0470e5e24cbe800067bb to your computer and use it in GitHub Desktop.
SHA-256 implementation in x86-64 assembly (GAS/AT&T syntax) using AVX2 instructions
.globl sha256s
.equ LOCAL_VAR_SIZE, 8
.equ VAR_TRAIL, -8
.text
//-----------------------------------------------------------------------
// extern "C" void sha256s(unsigned char* output, unsigned char* input, unsigned long size);
// ABI:    System V AMD64 — will NOT work as-is with the Microsoft x64 convention.
// In:     rdi = output buffer (receives 64 ASCII hex chars, not nul-terminated)
//         rsi = input buffer
//         rdx = input size in bytes
// Clobb:  rax, rcx, rdx, rsi, rdi, r8-r11, ymm0-ymm15, flags
// NOTE:   state (SH/SH2/SW in .data) is used as scratch, so this function
//         is NOT reentrant or thread-safe (see comment in .data section).
//-----------------------------------------------------------------------
sha256s:
push %rbp
mov %rsp,%rbp
sub $LOCAL_VAR_SIZE,%rsp
push %rbx /* FIX: rbx is callee-saved in SysV and is used as scratch below */
push %r13
push %r12
push %r11 /* r11 is caller-saved; preserving it is not required, but harmless */
push %rdi /*rdi=destination buffer in param*/
// Set the trailing bit value (0x80 = the mandatory '1' padding bit)
mov $0x80,%r13
mov %r13,VAR_TRAIL(%rbp)
// restore SH array. Should be in stack anyway
vmovapd CACHEDSH,%ymm0
vmovapd %ymm0,SH
mov %rsi,%r9
// Calculate number of 512bit chunks,
// including the trailing bit and original size
mov %rdx,%r8
mov %rdx,%r13 /* r13 = remaining size*/
mov %rdx,%r10 /* r10 = original size*/
add $(1+8+63),%r8 /* +1 trail byte, +8 length bytes, +63 to round up */
shr $6,%r8 /* r8 = number of chunks */
// load it here once right now. Will be used later.
vmovapd permmaskshl,%ymm9
/**********************************************
* Chunk loop. This is where we go over
* each 512-bit (64-byte) chunk of the input
* and generate the hash. If the buffer is not
* a multiple of 64 bytes, we will pad it.
* But, according to specs, we need to append
* a trailing '1' bit at the end of the data
* and put the original size, in bits, at the
* end of the last chunk
**********************************************/
vmovapd mask,%ymm14 /* each ymm register contains 8 int32 */
chunkloop:
cmp $64,%r13
jb smallBuffer
// convert to BE
vmovupd (%r9),%ymm0
vmovupd 32(%r9),%ymm2
vpshufb %ymm14,%ymm0,%ymm0
vpshufb %ymm14,%ymm2,%ymm2
vmovapd %ymm0,SW
vmovapd %ymm2,SW+32
sub $64,%r13
jmp bufferFilled
// If remainder of buffer is smaller than
// 64, then copy bytes per bytes but then reload to
// reorder int32 in big-endian
smallBuffer: /* copy bytes-per-bytes remainder of buffer */
xor %rcx,%rcx
1: cmp %rcx,%r13
je 2f
mov (%r9,%rcx),%al
mov %al,SW(%rcx)
inc %rcx
jmp 1b
2: xor %rax,%rax
1: mov %al,SW(%rcx) /* then zero out remainder of buffer*/
inc %rcx
cmp $64,%rcx
jne 1b
mov VAR_TRAIL(%rbp),%rax
mov %al,SW(%r13) /* append 0x80 once; VAR_TRAIL is cleared right after */
xor %r13,%r13
mov %r13,VAR_TRAIL(%rbp)
vmovapd SW,%ymm0
vmovapd SW+32,%ymm2
vpshufb %ymm14,%ymm0,%ymm0
vpshufb %ymm14,%ymm2,%ymm2
vmovapd %ymm0,SW
vmovapd %ymm2,SW+32
/**********************************************
* We have a full 512bit chunk with endianess
* corrected. Now proceed with the algorithm
**********************************************/
bufferFilled:
cmp $1,%r8 /* last chunk? */
ja 1f
//add size, in bits, at the end of the last frame (64-bit big-endian)
shl $3,%r10 /* bytes -> bits */
mov %r10,%rax
mov %eax,SW+(64-4) /* w[15] = low dword (SW words already byte-swapped) */
shr $32,%rax
mov %eax,SW+(64-8) /* w[14] = high dword */
1: mov $16,%r11
mov $SW,%rsi
1: // eax = ror(w[i-15],7)^ror(w[i-15],18)^(w[i-15] >> 3);
mov -(15*4)(%rsi,%r11,4),%eax
mov %eax,%ebx
mov %eax,%edx
ror $7,%eax
ror $18,%ebx
shr $3,%edx
xor %ebx,%eax
xor %edx,%eax
// ebx = ror(w[i-2],17)^ror(w[i-2],19)^(w[i-2] >> 10);
mov -(2*4)(%rsi,%r11,4),%ebx
mov %ebx,%ecx
mov %ebx,%edx
ror $17,%ebx
ror $19,%ecx
shr $10,%edx
xor %ecx,%ebx
xor %edx,%ebx
// w[i] = w[i-16] + s0 + w[i-7] + s1;
add -(16*4)(%rsi,%r11,4),%eax
add -(7*4)(%rsi,%r11,4),%eax
add %ebx,%eax
mov %eax,(%rsi,%r11,4)
inc %r11
cmp $64,%r11
jne 1b /* inner loop. 1st step of algo */
// copy h to h2
vmovapd SH,%ymm0
vmovapd %ymm0,SH2
xor %r11,%r11
1: // eax = ror(e,6)^ror(e,11)^ror(e,25);
mov SH2+(4*4),%eax
mov %eax,%ebx
mov %eax,%ecx
ror $6,%eax
ror $11,%ebx
ror $25,%ecx
xor %ebx,%eax
xor %ecx,%eax
// edx = (e&f)^((~e)&g);
mov (4*4)+SH2,%ebx
mov (5*4)+SH2,%ecx
mov (6*4)+SH2,%edx
and %ebx,%ecx
not %ebx
and %ebx,%edx
xor %ecx,%edx
// t1 = eax = s1+ch+h+K[i]+w[i];
add %edx,%eax
add (7*4)+SH2,%eax
mov $SK,%rdi
add (%rdi,%r11,4),%eax
mov $SW,%rdi
add (%rdi,%r11,4),%eax
// t2 = ebx = ror(a,2)^ror(a,13)^ror(a,22) + (a&b)^(a&c)^(b&c)
mov (0*4)+SH2,%ecx
mov (1*4)+SH2,%edx
mov (2*4)+SH2,%ebx
mov %ebx,%esi
and %edx,%ebx /* ebx = b&c */
and %ecx,%edx /* edx = a&b */
and %ecx,%esi /* esi = a&c */
xor %edx,%ebx
xor %ebx,%esi
mov (0*4)+SH2,%ebx
mov %ebx,%ecx
mov %ebx,%edx
ror $2,%ebx
ror $13,%ecx
ror $22,%edx
xor %ecx,%ebx
xor %edx,%ebx
add %esi,%ebx
// shift all int32 to the left so a=b, b=c etc...
vpermps SH2,%ymm9,%ymm1 /* Seems to be the only way to do a shl on 256bits */
vmovapd %ymm1,SH2
add %eax,%ebx
mov %ebx,SH2 /* a = t1 + t2 */
add %eax,(4*4)+SH2 /* e = d + t1 */
inc %r11
cmp $64,%r11
jne 1b /* inner loop. 2nd step of algo */
// add back a,b,c,d,e,f,g,h to H[].
vmovapd SH2,%ymm2
vpaddd SH,%ymm2,%ymm2
vmovapd %ymm2,SH
add $64,%r9
dec %r8
jnz chunkloop /* chunk loop */
/**********************************************
* convert to hex
**********************************************/
pop %rdi
vmovapd SH,%ymm3
vpshufb %ymm14,%ymm3,%ymm3
vpbroadcastb hexdigits,%ymm9
vpbroadcastb gtmask,%ymm10
vpbroadcastb digits,%ymm11
vpbroadcastb nibblemask,%ymm14
vpslld $4,%ymm14,%ymm15
vpand %ymm15,%ymm3,%ymm1
vpand %ymm14,%ymm3,%ymm0
vpsrld $4,%ymm1,%ymm1
// at this point: ymm0: low nibbles of each bytes
// ymm1: high nibbles of each bytes (shifted right by 4)
// interleave
vpunpcklbw %ymm0,%ymm1,%ymm2
vpunpckhbw %ymm0,%ymm1,%ymm3
vperm2i128 $0b000010,%ymm2,%ymm3,%ymm0
vperm2i128 $0b010011,%ymm2,%ymm3,%ymm1
// now ymm0, ymm1 contains consecutive bytes
// just need to convert to ascii
vpaddb %ymm11,%ymm0,%ymm0 /* every nibble += '0' */
vpaddb %ymm11,%ymm1,%ymm1
vpcmpgtb %ymm10,%ymm0,%ymm2 /* bytes > '9' need the a-f adjustment */
vpcmpgtb %ymm10,%ymm1,%ymm3
vpand %ymm9,%ymm2,%ymm2
vpand %ymm9,%ymm3,%ymm3
vpaddb %ymm2,%ymm0,%ymm0
vpaddb %ymm3,%ymm1,%ymm1
// and finally store it
vmovupd %ymm0,(%rdi)
vmovupd %ymm1,32(%rdi)
// Cleanup and return
pop %r11
pop %r12
pop %r13
pop %rbx /* FIX: restore callee-saved rbx */
add $LOCAL_VAR_SIZE,%rsp
vzeroupper /* FIX: SysV expectation after ymm use; avoids AVX->SSE penalty in caller */
leave
ret
.data
.align 32
//This should reside on stack otherwise the function is not threadsafe
//but doing so, we would need to use unaligned move or make sure the stack space
//is aligned on a 256bit boundary
// SH: running hash state H0..H7; reinitialized from CACHEDSH at function entry
SH: .int 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
// mask: vpshufb control that reverses the 4 bytes of every dword in each
// 128-bit lane (little-endian <-> big-endian conversion of int32 words)
mask: .byte 0x03,0x02,0x01,0x00,0x07,0x06,0x05,0x04,0x0B,0x0A,0x09,0x08,0x0F,0x0E,0x0D,0x0C
.byte 0x13,0x12,0x11,0x10,0x17,0x16,0x15,0x14,0x1B,0x1A,0x19,0x18,0x1F,0x1E,0x1D,0x1C
// permmaskshl: vpermps indices [0,0,1,2,3,4,5,6] -> result lane i = src lane idx[i],
// i.e. rotates the working variables so b<-a, c<-b ... h<-g (lane 0 is
// overwritten with the new 'a' afterwards)
permmaskshl:.int 0,0,1,2,3,4,5,6
// SK: the 64 SHA-256 round constants K[0..63]
SK: .int 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.int 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.int 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.int 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.int 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.int 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.int 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.int 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.align 32
// CACHEDSH: pristine copy of the initial hash values, copied into SH on entry
CACHEDSH: .int 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
.align 32
// SH2: working variables a..h for the compression loop (scratch; initial
// contents irrelevant, overwritten from SH each chunk)
SH2: .int 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
.align 32
// SW: 64-entry message schedule w[0..63] (256 bytes of scratch)
SW:
.rept 64
.int 0
.endr
// Single-byte constants broadcast into ymm registers for hex conversion:
nibblemask: .byte 0x0F /* low-nibble mask */
digits: .byte 0x30 /* ASCII '0' */
gtmask: .byte 0x39 /* ASCII '9': compare threshold for a-f digits */
hexdigits: .byte 0x27 /* gap from '9'+1 to 'a' (0x61 - 0x3A) */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment