Skip to content

Instantly share code, notes, and snippets.

@pdumais
Last active December 17, 2015 00:07
Show Gist options
  • Save pdumais/0470e5e24cbe800067bb to your computer and use it in GitHub Desktop.
Save pdumais/0470e5e24cbe800067bb to your computer and use it in GitHub Desktop.
SHA-256 implementation in x86-64 assembly (GAS/AT&T syntax) using AVX2 instructions
.globl sha256s
.equ LOCAL_VAR_SIZE, 8
.equ VAR_TRAIL, -8
.text
//-----------------------------------------------------------------------
// extern "C" void sha256s(unsigned char* output, unsigned char* input, unsigned long size);
// ABI:    System V AMD64 — will NOT work as-is with the Microsoft x64 convention.
// In:     rdi = output buffer (receives 64 ASCII hex chars, not nul-terminated)
//         rsi = input buffer
//         rdx = input size in bytes
// Clobb:  rax, rcx, rdx, rsi, rdi, r8-r11, ymm0-ymm15, flags
// NOTE:   state (SH/SH2/SW in .data) is used as scratch, so this function
//         is NOT reentrant or thread-safe (see comment in .data section).
//-----------------------------------------------------------------------
sha256s:
push %rbp
mov %rsp,%rbp
sub $LOCAL_VAR_SIZE,%rsp
push %rbx /* FIX: rbx is callee-saved in SysV and is used as scratch below */
push %r13
push %r12
push %r11 /* r11 is caller-saved; preserving it is not required, but harmless */
push %rdi /*rdi=destination buffer in param*/
// Set the trailing bit value (0x80 = the mandatory '1' padding bit)
mov $0x80,%r13
mov %r13,VAR_TRAIL(%rbp)
// restore SH array. Should be in stack anyway
vmovapd CACHEDSH,%ymm0
vmovapd %ymm0,SH
mov %rsi,%r9
// Calculate number of 512bit chunks,
// including the trailing bit and original size
mov %rdx,%r8
mov %rdx,%r13 /* r13 = remaining size*/
mov %rdx,%r10 /* r10 = original size*/
add $(1+8+63),%r8 /* +1 trail byte, +8 length bytes, +63 to round up */
shr $6,%r8 /* r8 = number of chunks */
// load it here once right now. Will be used later.
vmovapd permmaskshl,%ymm9
/**********************************************
* Chunk loop. This is where we go over
* each 512-bit (64-byte) chunk of the input
* and generate the hash. If the buffer is not
* a multiple of 64 bytes, we will pad it.
* But, according to specs, we need to append
* a trailing '1' bit at the end of the data
* and put the original size, in bits, at the
* end of the last chunk
**********************************************/
vmovapd mask,%ymm14 /* each ymm register contains 8 int32 */
chunkloop:
cmp $64,%r13
jb smallBuffer
// convert to BE
vmovupd (%r9),%ymm0
vmovupd 32(%r9),%ymm2
vpshufb %ymm14,%ymm0,%ymm0
vpshufb %ymm14,%ymm2,%ymm2
vmovapd %ymm0,SW
vmovapd %ymm2,SW+32
sub $64,%r13
jmp bufferFilled
// If remainder of buffer is smaller than
// 64, then copy bytes per bytes but then reload to
// reorder int32 in big-endian
smallBuffer: /* copy bytes-per-bytes remainder of buffer */
xor %rcx,%rcx
1: cmp %rcx,%r13
je 2f
mov (%r9,%rcx),%al
mov %al,SW(%rcx)
inc %rcx
jmp 1b
2: xor %rax,%rax
1: mov %al,SW(%rcx) /* then zero out remainder of buffer*/
inc %rcx
cmp $64,%rcx
jne 1b
mov VAR_TRAIL(%rbp),%rax
mov %al,SW(%r13) /* append 0x80 once; VAR_TRAIL is cleared right after */
xor %r13,%r13
mov %r13,VAR_TRAIL(%rbp)
vmovapd SW,%ymm0
vmovapd SW+32,%ymm2
vpshufb %ymm14,%ymm0,%ymm0
vpshufb %ymm14,%ymm2,%ymm2
vmovapd %ymm0,SW
vmovapd %ymm2,SW+32
/**********************************************
* We have a full 512bit chunk with endianess
* corrected. Now proceed with the algorithm
**********************************************/
bufferFilled:
cmp $1,%r8 /* last chunk? */
ja 1f
//add size, in bits, at the end of the last frame (64-bit big-endian)
shl $3,%r10 /* bytes -> bits */
mov %r10,%rax
mov %eax,SW+(64-4) /* w[15] = low dword (SW words already byte-swapped) */
shr $32,%rax
mov %eax,SW+(64-8) /* w[14] = high dword */
1: mov $16,%r11
mov $SW,%rsi
1: // eax = ror(w[i-15],7)^ror(w[i-15],18)^(w[i-15] >> 3);
mov -(15*4)(%rsi,%r11,4),%eax
mov %eax,%ebx
mov %eax,%edx
ror $7,%eax
ror $18,%ebx
shr $3,%edx
xor %ebx,%eax
xor %edx,%eax
// ebx = ror(w[i-2],17)^ror(w[i-2],19)^(w[i-2] >> 10);
mov -(2*4)(%rsi,%r11,4),%ebx
mov %ebx,%ecx
mov %ebx,%edx
ror $17,%ebx
ror $19,%ecx
shr $10,%edx
xor %ecx,%ebx
xor %edx,%ebx
// w[i] = w[i-16] + s0 + w[i-7] + s1;
add -(16*4)(%rsi,%r11,4),%eax
add -(7*4)(%rsi,%r11,4),%eax
add %ebx,%eax
mov %eax,(%rsi,%r11,4)
inc %r11
cmp $64,%r11
jne 1b /* inner loop. 1st step of algo */
// copy h to h2
vmovapd SH,%ymm0
vmovapd %ymm0,SH2
xor %r11,%r11
1: // eax = ror(e,6)^ror(e,11)^ror(e,25);
mov SH2+(4*4),%eax
mov %eax,%ebx
mov %eax,%ecx
ror $6,%eax
ror $11,%ebx
ror $25,%ecx
xor %ebx,%eax
xor %ecx,%eax
// edx = (e&f)^((~e)&g);
mov (4*4)+SH2,%ebx
mov (5*4)+SH2,%ecx
mov (6*4)+SH2,%edx
and %ebx,%ecx
not %ebx
and %ebx,%edx
xor %ecx,%edx
// t1 = eax = s1+ch+h+K[i]+w[i];
add %edx,%eax
add (7*4)+SH2,%eax
mov $SK,%rdi
add (%rdi,%r11,4),%eax
mov $SW,%rdi
add (%rdi,%r11,4),%eax
// t2 = ebx = ror(a,2)^ror(a,13)^ror(a,22) + (a&b)^(a&c)^(b&c)
mov (0*4)+SH2,%ecx
mov (1*4)+SH2,%edx
mov (2*4)+SH2,%ebx
mov %ebx,%esi
and %edx,%ebx /* ebx = b&c */
and %ecx,%edx /* edx = a&b */
and %ecx,%esi /* esi = a&c */
xor %edx,%ebx
xor %ebx,%esi
mov (0*4)+SH2,%ebx
mov %ebx,%ecx
mov %ebx,%edx
ror $2,%ebx
ror $13,%ecx
ror $22,%edx
xor %ecx,%ebx
xor %edx,%ebx
add %esi,%ebx
// shift all int32 to the left so a=b, b=c etc...
vpermps SH2,%ymm9,%ymm1 /* Seems to be the only way to do a shl on 256bits */
vmovapd %ymm1,SH2
add %eax,%ebx
mov %ebx,SH2 /* a = t1 + t2 */
add %eax,(4*4)+SH2 /* e = d + t1 */
inc %r11
cmp $64,%r11
jne 1b /* inner loop. 2nd step of algo */
// add back a,b,c,d,e,f,g,h to H[].
vmovapd SH2,%ymm2
vpaddd SH,%ymm2,%ymm2
vmovapd %ymm2,SH
add $64,%r9
dec %r8
jnz chunkloop /* chunk loop */
/**********************************************
* convert to hex
**********************************************/
pop %rdi
vmovapd SH,%ymm3
vpshufb %ymm14,%ymm3,%ymm3
vpbroadcastb hexdigits,%ymm9
vpbroadcastb gtmask,%ymm10
vpbroadcastb digits,%ymm11
vpbroadcastb nibblemask,%ymm14
vpslld $4,%ymm14,%ymm15
vpand %ymm15,%ymm3,%ymm1
vpand %ymm14,%ymm3,%ymm0
vpsrld $4,%ymm1,%ymm1
// at this point: ymm0: low nibbles of each bytes
// ymm1: high nibbles of each bytes (shifted right by 4)
// interleave
vpunpcklbw %ymm0,%ymm1,%ymm2
vpunpckhbw %ymm0,%ymm1,%ymm3
vperm2i128 $0b000010,%ymm2,%ymm3,%ymm0
vperm2i128 $0b010011,%ymm2,%ymm3,%ymm1
// now ymm0, ymm1 contains consecutive bytes
// just need to convert to ascii
vpaddb %ymm11,%ymm0,%ymm0 /* every nibble += '0' */
vpaddb %ymm11,%ymm1,%ymm1
vpcmpgtb %ymm10,%ymm0,%ymm2 /* bytes > '9' need the a-f adjustment */
vpcmpgtb %ymm10,%ymm1,%ymm3
vpand %ymm9,%ymm2,%ymm2
vpand %ymm9,%ymm3,%ymm3
vpaddb %ymm2,%ymm0,%ymm0
vpaddb %ymm3,%ymm1,%ymm1
// and finally store it
vmovupd %ymm0,(%rdi)
vmovupd %ymm1,32(%rdi)
// Cleanup and return
pop %r11
pop %r12
pop %r13
pop %rbx /* FIX: restore callee-saved rbx */
add $LOCAL_VAR_SIZE,%rsp
vzeroupper /* FIX: SysV expectation after ymm use; avoids AVX->SSE penalty in caller */
leave
ret
.data
.align 32
//This should reside on stack otherwise the function is not threadsafe
//but doing so, we would need to use unaligned move or make sure the stack space
//is aligned on a 256bit boundary
// SH: running hash state H0..H7; reinitialized from CACHEDSH at function entry
SH: .int 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
// mask: vpshufb control that reverses the 4 bytes of every dword in each
// 128-bit lane (little-endian <-> big-endian conversion of int32 words)
mask: .byte 0x03,0x02,0x01,0x00,0x07,0x06,0x05,0x04,0x0B,0x0A,0x09,0x08,0x0F,0x0E,0x0D,0x0C
.byte 0x13,0x12,0x11,0x10,0x17,0x16,0x15,0x14,0x1B,0x1A,0x19,0x18,0x1F,0x1E,0x1D,0x1C
// permmaskshl: vpermps indices [0,0,1,2,3,4,5,6] -> result lane i = src lane idx[i],
// i.e. rotates the working variables so b<-a, c<-b ... h<-g (lane 0 is
// overwritten with the new 'a' afterwards)
permmaskshl:.int 0,0,1,2,3,4,5,6
// SK: the 64 SHA-256 round constants K[0..63]
SK: .int 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.int 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.int 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.int 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.int 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.int 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.int 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.int 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.align 32
// CACHEDSH: pristine copy of the initial hash values, copied into SH on entry
CACHEDSH: .int 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
.align 32
// SH2: working variables a..h for the compression loop (scratch; initial
// contents irrelevant, overwritten from SH each chunk)
SH2: .int 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
.align 32
// SW: 64-entry message schedule w[0..63] (256 bytes of scratch)
SW:
.rept 64
.int 0
.endr
// Single-byte constants broadcast into ymm registers for hex conversion:
nibblemask: .byte 0x0F /* low-nibble mask */
digits: .byte 0x30 /* ASCII '0' */
gtmask: .byte 0x39 /* ASCII '9': compare threshold for a-f digits */
hexdigits: .byte 0x27 /* gap from '9'+1 to 'a' (0x61 - 0x3A) */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment