moon-chilled · June 21, 2023 00:04 · moon-chilled · Jun 17, 2023 · moon-chilled · Jun 18, 2023
diff --git a/macaroni-jam.asm b/macaroni-jam.asm
 ;; hash based on fp multiply
 ;; first we need to make our integers into normal floats (since denormals cost on skx)
 ;; we can vcvtuqq2pd to get a normal float incorporating at least the top 53 bits of the input (todo could maybe get 54 bits with signed vcvtqq2pd; worth looking into?); it remains to ensure we take account of the low 11 bits
 ;; we handle three vectors at a time, as follows
 ;; a: oxxx oxxx oxxx oxxx oxxx oxxx oxxx oxxx
 ;; b: oxxx oxxx oxxx oxxx oxxx oxxx oxxx oxxx
 ;; c: oxxx oxxx oxxx oxxx oxxx oxxx oxxx oxxx
 ;;
 ;; d: ooo- ooo- ooo- ooo- ooo- ooo- ooo- ooo-
 ;; a, b, and c are the inputs; the xes are the 16-bit quantities that vcvtuqq2pd incorporates; the os are the 16-bit quantities which it does not; we want to pack the os into d
 ;; (the - is fixed, and contains a reasonable exponent)
 ;; the low o in each lane of d comes from the c; the remainder are blended from a and b

 ;; actually ... instead of using vcvtuqq2pd, shift abc right 12 places, and then add a constant.  That constant suffices both to install the exponent of a normal float _and_ to do the initial mixing.  It can be an int add or a float add; in the case of a float add, it should have the smallest normal exponent.  Then the result will have a normal exponent too.  (Might lose the bottom bit to rounding, but multiple of the bottom bits get duplicated into d anyway, so no matter.)
 ;; (multiplying denormals is slow, but adding them is fast)
 ;; We can have overflow into the exponent field (could happen with both int and float addition)--does that matter?  I think it would be helpful to be able to control the exponent exactly.  Xor would not have this problem, but the umac paper sez (https://web.cs.ucdavis.edu/~rogaway/papers/umac-full.pdf):
 ;; > Several variants of NH fail to preserve collision probability ε = 2^−w.  In particular, replacing the inner addition or the outer addition with bitwise-XOR increases ε substantially.
 ;; what's next?  We want to do a 53x53->106 multiply, but actually get an unevaluated sum, i.e., 53x53

 ;; we fix the exponent of m_j and k_j both
 ;; we also ensure that, after the implicit bit, at least the first two explicit bits are both zero
 ;; since 1.25*1.25<2, that ensures that the exponent of (m_j*k_j)_hi is fixed too

 default rel

 ; struct { uint48_t lo, hi; } macaroni_jam(void *data, size_t length, uint48_t seed);
 ; hash uses 384 bytes of key internally.  Take just one 48-bit seed, to be xored with every 48-bit unit of its fixed key (hence, 0 seed is ok, and means use the default key)
 macaroni_jam:
 push		rbp
 mov		rbp, rsp
 sub		rsp, 256
 and		rsp, ~63

 vmovdqa64	zmm31, [c_shuffle]

 mov		rcx, 48
 bzhi		rdx, rdx, rcx		; ensure the high 16 bits not set
 vpbroadcastq	zmm4, rdx

 vpxorq		zmm0, zmm4, [c_params0]
 vpxorq		zmm1, zmm4, [c_params1]
 vpxorq		zmm2, zmm4, [c_params2]
 vpxorq		zmm3, zmm4, [c_params3]
 vmovdqa64	[rsp +   0], zmm0
 vmovdqa64	[rsp +  64], zmm1
 vmovdqa64	[rsp + 128], zmm2
 vmovdqa64	[rsp + 192], zmm3


 mov		eax, 0b1110_1110_1110_1110
 kmovd		k7, eax
 mov		eax, 0b0110_0110_0110_0110
 kmovd		k6, eax


 ; zmm30 zmm29 zmm28 zmm27 key for dcba in the low 48 bits of the first half of the block, integer exponent in the exponent bits (so do an int add and get normalised
 ; zmm26 zmm25 zmm24 zmm23 ditto second half of the block?
 ;vpxorq		zmm30, zmm0, [...]
 ; ...


 .loop:
 ;; snarf zmm[012] into zmm[3456] (zmm[01] now junk)
 vpblendmw	zmm6{k7}, zmm2, zmm31
 vpsrlq		zmm5, zmm2, 16
 vpermi2w	zmm6{k6}, zmm0, zmm1
 vpsrlq		zmm4, zmm1, 16
 vpsrlq		zmm3, zmm0, 16

 ;; compute k_j + m_j
 vpaddq		zmm3, zmm3, [rsp]{1to8}
 vpaddq		zmm4, zmm4, [rsp + 8]{1to8}
 vpaddq		zmm5, zmm5, [rsp + 16]{1to8}
 vpaddq		zmm6, zmm6, [rsp + 24]{1to8}
 ;vpaddq		zmm3, zmm3, zmm27
 ;vpaddq		zmm4, zmm4, zmm28
 ;vpaddq		zmm5, zmm5, zmm29
 ;vpaddq		zmm6, zmm6, zmm30

 vmulpd		zmm0, zmm3, zmm4, {rd-sae}
 vmulpd		zmm1, zmm5, zmm6, {rd-sae}
 vfmsub213pd	zmm3, zmm4, zmm0		; these should always be exact, so leave off the exception control so
 vfmsub213pd	zmm5, zmm6, zmm1		; we can check the inexact flag after; if it gets set, there is a bug

 ;; now we have (hi, lo) as (zmm0, zmm3) and another as (zmm1, zmm5)
 ;; the highs are guaranteed to have the same exponent, and we accumulate them modularly anyway, so we can just sum and chop the mantissas later
 ;; however, the lows may have different exponents (we'd like to align them before adding), and we do need to account for carry-out from them into high (possibly?  How much does this affect the quality of the hash?)
 vpaddq		zmm0, zmm0, zmm1
 vcvtpd2qq	zmm3, zmm3
 vcvtpd2qq	zmm5, zmm5
 vpaddq		zmm3, zmm3, zmm5



 leave
 ret


 section .rodata
 align 64
 ;; high 16 bits of a double float with a reasonable exponent
 %define EXP 0b0100001100110000
 ;; first column is irrelevant, as it gets masked out entirely (replaced with the low 2bytes from c)
 ;; next two columns are shuffle indices to select the low 2bytes from a and b
 ;; last column is our desired exponent
 c_shuffle:
 dw 0,  0,  4, EXP
 dw 0,  8, 12, EXP
 dw 0, 16, 20, EXP
 dw 0, 24, 28, EXP
 dw 0, 32, 36, EXP
 dw 0, 40, 44, EXP
 dw 0, 48, 52, EXP
 dw 0, 56, 60, EXP

 ;; random data in low 48, then exp; todo
 c_params0:
 dq 0, 0, 0, 0, 0, 0, 0, 0
 c_params1:
 dq 0, 0, 0, 0, 0, 0, 0, 0
 c_params2:
 dq 0, 0, 0, 0, 0, 0, 0, 0
 c_params3:
 dq 0, 0, 0, 0, 0, 0, 0, 0
	;; hash based on fp multiply
	;; first we need to make our integers into normal floats (since denormals cost on skx)
	;; we can vcvtuqq2pd to get a normal float incorporating at least the top 53 bits of the input (todo could maybe get 54 bits with signed vcvtqq2pd; worth looking into?); it remains to ensure we take account of the low 11 bits
	;; we handle three vectors at a time, as follows
	;; a: oxxx oxxx oxxx oxxx oxxx oxxx oxxx oxxx
	;; b: oxxx oxxx oxxx oxxx oxxx oxxx oxxx oxxx
	;; c: oxxx oxxx oxxx oxxx oxxx oxxx oxxx oxxx
	;;
	;; d: ooo- ooo- ooo- ooo- ooo- ooo- ooo- ooo-
	;; a, b, and c are the inputs; the xes are the 16-bit quantities that vcvtuqq2pd incorporates; the os are the 16-bit quantities which it does not; we want to pack the os into d
	;; (the - is fixed, and contains a reasonable exponent)
	;; the low o in each lane of d comes from the c; the remainder are blended from a and b

	;; actually ... instead of using vcvtuqq2pd, shift abc right 12 places, and then add a constant. That constant suffices both to install the exponent of a normal float _and_ to do the initial mixing. It can be an int add or a float add; in the case of a float add, it should have the smallest normal exponent. Then the result will have a normal exponent too. (Might lose the bottom bit to rounding, but multiple of the bottom bits get duplicated into d anyway, so no matter.)
	;; (multiplying denormals is slow, but adding them is fast)
	;; We can have overflow into the exponent field (could happen with both int and float addition)--does that matter? I think it would be helpful to be able to control the exponent exactly. Xor would not have this problem, but the umac paper sez (https://web.cs.ucdavis.edu/~rogaway/papers/umac-full.pdf):
	;; > Several variants of NH fail to preserve collision probability ε = 2^−w. In particular, replacing the inner addition or the outer addition with bitwise-XOR increases ε substantially.
	;; what's next? We want to do a 53x53->106 multiply, but actually get an unevaluated sum, i.e., 53x53

	;; we fix the exponent of m_j and k_j both
	;; we also ensure that, after the implicit bit, at least the first two explicit bits are both zero
	;; since 1.251.25<2, that ensures that the exponent of (m_jk_j)_hi is fixed too

	default rel

	; struct { uint48_t lo, hi; } macaroni_jam(void *data, size_t length, uint48_t seed);
	; hash uses 384 bytes of key internally. Take just one 48-bit seed, to be xored with every 48-bit unit of its fixed key (hence, 0 seed is ok, and means use the default key)
	macaroni_jam:
	push rbp
	mov rbp, rsp
	sub rsp, 256
	and rsp, ~63

	vmovdqa64 zmm31, [c_shuffle]

	mov rcx, 48
	bzhi rdx, rdx, rcx ; ensure the high 16 bits not set
	vpbroadcastq zmm4, rdx

	vpxorq zmm0, zmm4, [c_params0]
	vpxorq zmm1, zmm4, [c_params1]
	vpxorq zmm2, zmm4, [c_params2]
	vpxorq zmm3, zmm4, [c_params3]
	vmovdqa64 [rsp + 0], zmm0
	vmovdqa64 [rsp + 64], zmm1
	vmovdqa64 [rsp + 128], zmm2
	vmovdqa64 [rsp + 192], zmm3


	mov eax, 0b1110_1110_1110_1110
	kmovd k7, eax
	mov eax, 0b0110_0110_0110_0110
	kmovd k6, eax


	; zmm30 zmm29 zmm28 zmm27 key for dcba in the low 48 bits of the first half of the block, integer exponent in the exponent bits (so do an int add and get normalised
	; zmm26 zmm25 zmm24 zmm23 ditto second half of the block?
	;vpxorq zmm30, zmm0, [...]
	; ...


	.loop:
	;; snarf zmm[012] into zmm[3456] (zmm[01] now junk)
	vpblendmw zmm6{k7}, zmm2, zmm31
	vpsrlq zmm5, zmm2, 16
	vpermi2w zmm6{k6}, zmm0, zmm1
	vpsrlq zmm4, zmm1, 16
	vpsrlq zmm3, zmm0, 16

	;; compute k_j + m_j
	vpaddq zmm3, zmm3, [rsp]{1to8}
	vpaddq zmm4, zmm4, [rsp + 8]{1to8}
	vpaddq zmm5, zmm5, [rsp + 16]{1to8}
	vpaddq zmm6, zmm6, [rsp + 24]{1to8}
	;vpaddq zmm3, zmm3, zmm27
	;vpaddq zmm4, zmm4, zmm28
	;vpaddq zmm5, zmm5, zmm29
	;vpaddq zmm6, zmm6, zmm30

	vmulpd zmm0, zmm3, zmm4, {rd-sae}
	vmulpd zmm1, zmm5, zmm6, {rd-sae}
	vfmsub213pd zmm3, zmm4, zmm0 ; these should always be exact, so leave off the exception control so
	vfmsub213pd zmm5, zmm6, zmm1 ; we can check the inexact flag after; if it gets set, there is a bug

	;; now we have (hi, lo) as (zmm0, zmm3) and another as (zmm1, zmm5)
	;; the highs are guaranteed to have the same exponent, and we accumulate them modularly anyway, so we can just sum and chop the mantissas later
	;; however, the lows may have different exponents (we'd like to align them before adding), and we do need to account for carry-out from them into high (possibly? How much does this affect the quality of the hash?)
	vpaddq zmm0, zmm0, zmm1
	vcvtpd2qq zmm3, zmm3
	vcvtpd2qq zmm5, zmm5
	vpaddq zmm3, zmm3, zmm5



	leave
	ret


	section .rodata
	align 64
	;; high 16 bits of a double float with a reasonable exponent
	%define EXP 0b0100001100110000
	;; first column is irrelevant, as it gets masked out entirely (replaced with the low 2bytes from c)
	;; next two columns are shuffle indices to select the low 2bytes from a and b
	;; last column is our desired exponent
	c_shuffle:
	dw 0, 0, 4, EXP
	dw 0, 8, 12, EXP
	dw 0, 16, 20, EXP
	dw 0, 24, 28, EXP
	dw 0, 32, 36, EXP
	dw 0, 40, 44, EXP
	dw 0, 48, 52, EXP
	dw 0, 56, 60, EXP

	;; random data in low 48, then exp; todo
	c_params0:
	dq 0, 0, 0, 0, 0, 0, 0, 0
	c_params1:
	dq 0, 0, 0, 0, 0, 0, 0, 0
	c_params2:
	dq 0, 0, 0, 0, 0, 0, 0, 0
	c_params3:
	dq 0, 0, 0, 0, 0, 0, 0, 0