Last active
June 21, 2023 00:04
-
-
Save moon-chilled/60bd2ba687dc197d93a9d2251229d715 to your computer and use it in GitHub Desktop.
funny prototype 48-bit umac with floating point I got bored with before finishing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; hash based on fp multiply | |
;; first we need to make our integers into normal floats (since denormals cost on skx) | |
;; we can vcvtuqq2pd to get a normal float incorporating at least the top 53 bits of the input (todo could maybe get 54 bits with signed vcvtqq2pd; worth looking into?); it remains to ensure we take account of the low 11 bits | |
;; we handle three vectors at a time, as follows | |
;; a: oxxx oxxx oxxx oxxx oxxx oxxx oxxx oxxx | |
;; b: oxxx oxxx oxxx oxxx oxxx oxxx oxxx oxxx | |
;; c: oxxx oxxx oxxx oxxx oxxx oxxx oxxx oxxx | |
;; | |
;; d: ooo- ooo- ooo- ooo- ooo- ooo- ooo- ooo- | |
;; a, b, and c are the inputs; the xes are the 16-bit quantities that vcvtuqq2pd incorporates; the os are the 16-bit quantities which it does not; we want to pack the os into d | |
;; (the - is fixed, and contains a reasonable exponent) | |
;; the low o in each lane of d comes from the c; the remainder are blended from a and b | |
;; actually ... instead of using vcvtuqq2pd, shift abc right 12 places, and then add a constant. That constant suffices both to install the exponent of a normal float _and_ to do the initial mixing. It can be an int add or a float add; in the case of a float add, it should have the smallest normal exponent. Then the result will have a normal exponent too. (Might lose the bottom bit to rounding, but multiple of the bottom bits get duplicated into d anyway, so no matter.) | |
;; (multiplying denormals is slow, but adding them is fast) | |
;; We can have overflow into the exponent field (could happen with both int and float addition)--does that matter? I think it would be helpful to be able to control the exponent exactly. Xor would not have this problem, but the umac paper sez (https://web.cs.ucdavis.edu/~rogaway/papers/umac-full.pdf): | |
;; > Several variants of NH fail to preserve collision probability ε = 2^−w. In particular, replacing the inner addition or the outer addition with bitwise-XOR increases ε substantially. | |
;; what's next? We want to do a 53x53->106 multiply, but actually get an unevaluated sum, i.e., 53x53 | |
;; we fix the exponent of m_j and k_j both | |
;; we also ensure that, after the implicit bit, at least the first two explicit bits are both zero | |
;; since 1.25*1.25<2, that ensures that the exponent of (m_j*k_j)_hi is fixed too | |
default rel | |
; struct { uint48_t lo, hi; } macaroni_jam(void *data, size_t length, uint48_t seed); | |
; hash uses 384 bytes of key internally. Take just one 48-bit seed, to be xored with every 48-bit unit of its fixed key (hence, 0 seed is ok, and means use the default key) | |
macaroni_jam: | |
push rbp | |
mov rbp, rsp | |
sub rsp, 256 | |
and rsp, ~63 | |
vmovdqa64 zmm31, [c_shuffle] | |
mov rcx, 48 | |
bzhi rdx, rdx, rcx ; ensure the high 16 bits not set | |
vpbroadcastq zmm4, rdx | |
vpxorq zmm0, zmm4, [c_params0] | |
vpxorq zmm1, zmm4, [c_params1] | |
vpxorq zmm2, zmm4, [c_params2] | |
vpxorq zmm3, zmm4, [c_params3] | |
vmovdqa64 [rsp + 0], zmm0 | |
vmovdqa64 [rsp + 64], zmm1 | |
vmovdqa64 [rsp + 128], zmm2 | |
vmovdqa64 [rsp + 192], zmm3 | |
mov eax, 0b1110_1110_1110_1110 | |
kmovd k7, eax | |
mov eax, 0b0110_0110_0110_0110 | |
kmovd k6, eax | |
; zmm30 zmm29 zmm28 zmm27 key for dcba in the low 48 bits of the first half of the block, integer exponent in the exponent bits (so do an int add and get normalised | |
; zmm26 zmm25 zmm24 zmm23 ditto second half of the block? | |
;vpxorq zmm30, zmm0, [...] | |
; ... | |
.loop: | |
;; snarf zmm[012] into zmm[3456] (zmm[01] now junk) | |
vpblendmw zmm6{k7}, zmm2, zmm31 | |
vpsrlq zmm5, zmm2, 16 | |
vpermi2w zmm6{k6}, zmm0, zmm1 | |
vpsrlq zmm4, zmm1, 16 | |
vpsrlq zmm3, zmm0, 16 | |
;; compute k_j + m_j | |
vpaddq zmm3, zmm3, [rsp]{1to8} | |
vpaddq zmm4, zmm4, [rsp + 8]{1to8} | |
vpaddq zmm5, zmm5, [rsp + 16]{1to8} | |
vpaddq zmm6, zmm6, [rsp + 24]{1to8} | |
;vpaddq zmm3, zmm3, zmm27 | |
;vpaddq zmm4, zmm4, zmm28 | |
;vpaddq zmm5, zmm5, zmm29 | |
;vpaddq zmm6, zmm6, zmm30 | |
vmulpd zmm0, zmm3, zmm4, {rd-sae} | |
vmulpd zmm1, zmm5, zmm6, {rd-sae} | |
vfmsub213pd zmm3, zmm4, zmm0 ; these should always be exact, so leave off the exception control so | |
vfmsub213pd zmm5, zmm6, zmm1 ; we can check the inexact flag after; if it gets set, there is a bug | |
;; now we have (hi, lo) as (zmm0, zmm3) and another as (zmm1, zmm5) | |
;; the highs are guaranteed to have the same exponent, and we accumulate them modularly anyway, so we can just sum and chop the mantissas later | |
;; however, the lows may have different exponents (we'd like to align them before adding), and we do need to account for carry-out from them into high (possibly? How much does this affect the quality of the hash?) | |
vpaddq zmm0, zmm0, zmm1 | |
vcvtpd2qq zmm3, zmm3 | |
vcvtpd2qq zmm5, zmm5 | |
vpaddq zmm3, zmm3, zmm5 | |
leave | |
ret | |
section .rodata | |
align 64 | |
;; high 16 bits of a double float with a reasonable exponent | |
%define EXP 0b0100001100110000 | |
;; first column is irrelevant, as it gets masked out entirely (replaced with the low 2bytes from c) | |
;; next two columns are shuffle indices to select the low 2bytes from a and b | |
;; last column is our desired exponent | |
c_shuffle: | |
dw 0, 0, 4, EXP | |
dw 0, 8, 12, EXP | |
dw 0, 16, 20, EXP | |
dw 0, 24, 28, EXP | |
dw 0, 32, 36, EXP | |
dw 0, 40, 44, EXP | |
dw 0, 48, 52, EXP | |
dw 0, 56, 60, EXP | |
;; random data in low 48, then exp; todo | |
c_params0: | |
dq 0, 0, 0, 0, 0, 0, 0, 0 | |
c_params1: | |
dq 0, 0, 0, 0, 0, 0, 0, 0 | |
c_params2: | |
dq 0, 0, 0, 0, 0, 0, 0, 0 | |
c_params3: | |
dq 0, 0, 0, 0, 0, 0, 0, 0 |
(Can do a second umac—so 96 bits total—for <2x throughput reduction, so this could actually be a ~reasonable performance+quality hash. Not that you should ever use it for anything, because it's horrifying and bad, but still.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
20 bytes/cycle on skx/icx ('allegedly')