rygorous/a86a5cf348922cdea357c928e32fc7e0 (created August 9, 2019)
Histogram code with all the tricks :) Needs NASM + VC++

---- build script (batch) ----
@echo off
setlocal
cd %~dp0
call vcvars amd64
..\..\bin\win32\nasm -f win64 -g -o histo_asm.obj histo_asm.nas || exit /b 1
cl /Zi /O2 /nologo histotest.cpp histo_asm.obj || exit /b 1

---- histo_asm.nas ----
; NOTE: all just a quick sketch
; also I'm using xmm/ymm/zmm0-5 and then 16-31
; because Win64 ABI expects me to preserve ymm6-ymm15
; and I couldn't be bothered to set up a stack frame and save them :)
;
; Meant to be assembled with NASM

%macro IACA_START 0
    mov ebx,111
    db 0x64,0x67,0x90
%endmacro

%macro IACA_END 0
    mov ebx,222
    db 0x64,0x67,0x90
%endmacro
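
; (The two macros above emit the marker sequences Intel's Architecture
; Code Analyzer scans for: mov ebx,111 / mov ebx,222 followed by the
; fs/addr32-prefixed NOP bytes 0x64,0x67,0x90. Left in for convenience,
; but unused below.)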

section .data

vone   dd 1
vtwo   dd 2
v31    dd 31
vneg1  dd -1
vmaskb dd 255
vbase  dd 0*256,1*256,2*256,3*256,4*256,5*256,6*256,7*256
       dd 8*256,9*256,10*256,11*256,12*256,13*256,14*256,15*256

section .text

; ---- Win64 ABI: arguments in rcx, rdx, r8

global histo_asm_scalar4_core
histo_asm_scalar4_core:
    mov r9, r8 ; original count
    shr r8, 2  ; trip count
    jz .tail
    push rbx
    ; Scalar (4x)
    ; This is super-simple: do a single 32b (4B) scalar load
    ; and keep extracting bytes, shifting and incrementing the
    ; respective histogram bin value.
    ;
    ; Keep 4 separate histograms: each 256-element histogram
    ; is 1kB, so this is 4kB total; memory disambiguation for
    ; store->load forwarding only checks the bottom 12 bits
    ; of addresses on older Intel uArchs, so this is the largest
    ; number of separate histograms we can keep that will not
    ; result in false dependencies on these architectures.
    ; Either way, 4 histograms get essentially all of the
    ; benefit of having more histograms.
    ;
    ; I tried using "movzx eax, bh" and only shifting once
    ; for every pair of bytes, but that was slightly more
    ; expensive on my SKX workstation. Similar with using
    ; a "movzx eax, bl" / "movzx eax, bh" / "bswap ebx" /
    ; "movzx eax, bh" / "movzx eax, bl" combo.
    ;
    ; Haven't investigated this further.
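    ;
    ; In C terms, the loop below is roughly (a sketch; histo_cpp_4x in
    ; the test harness is the compiled equivalent):
    ;   uint32_t x = load32(p); p += 4;
    ;   h[0][x & 255]++; h[1][(x >> 8) & 255]++;
    ;   h[2][(x >> 16) & 255]++; h[3][x >> 24]++;
    ; with the four slices summed into one histogram afterwards.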
.inner:
    mov ebx, [rdx]
    add rdx, 4
    movzx eax, bl
    shr ebx, 8
    add dword [rcx + rax*4 + 0*1024], 1
    movzx eax, bl
    shr ebx, 8
    add dword [rcx + rax*4 + 1*1024], 1
    movzx eax, bl
    add dword [rcx + rax*4 + 2*1024], 1
    shr ebx, 8
    add dword [rcx + rbx*4 + 3*1024], 1
    dec r8
    jnz .inner
    pop rbx
.tail:
    and r9, 3 ; masked count
    jz .done
.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp
.done:
    ret
; ----
global histo_asm_scalar8_core
histo_asm_scalar8_core:
    mov r9, r8 ; original count
    shr r8, 3  ; trip count
    jz .tail
    push rbx
    ; Scalar (8x)
    ; Using a 64b (8B) load this time, but still 4 histogram
    ; slices.
.inner:
    mov rbx, [rdx]
    add rdx, 8
    movzx eax, bl
    shr rbx, 8
    add dword [rcx + rax*4 + 0*1024], 1
    movzx eax, bl
    shr rbx, 8
    add dword [rcx + rax*4 + 1*1024], 1
    movzx eax, bl
    shr rbx, 8
    add dword [rcx + rax*4 + 2*1024], 1
    movzx eax, bl
    shr rbx, 8
    add dword [rcx + rax*4 + 3*1024], 1
    movzx eax, bl
    shr ebx, 8
    add dword [rcx + rax*4 + 0*1024], 1
    movzx eax, bl
    shr ebx, 8
    add dword [rcx + rax*4 + 1*1024], 1
    movzx eax, bl
    add dword [rcx + rax*4 + 2*1024], 1
    shr ebx, 8
    add dword [rcx + rbx*4 + 3*1024], 1
    dec r8
    jnz .inner
    pop rbx
.tail:
    and r9, 7 ; masked count
    jz .done
.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp
.done:
    ret
; ----
global histo_asm_scalar8_var_core
histo_asm_scalar8_var_core:
    push rsi
    mov rsi, rdx
    sub r8, 16 ; for readahead
    jb .tail
    push rbx
    ; Scalar (8x)
    ; Using two 32b (4B) loads, but offset by an iteration.
    ; Still 4 histogram slices.
    mov ebx, [rsi]
    mov edx, [rsi + 4]
.inner:
    add rsi, 8
    movzx eax, bl
    shr ebx, 8
    add dword [rcx + rax*4 + 0*1024], 1
    movzx eax, bl
    shr ebx, 8
    add dword [rcx + rax*4 + 1*1024], 1
    movzx eax, bl
    add dword [rcx + rax*4 + 2*1024], 1
    shr ebx, 8
    add dword [rcx + rbx*4 + 3*1024], 1
    mov ebx, [rsi]
    movzx eax, dl
    shr edx, 8
    add dword [rcx + rax*4 + 0*1024], 1
    movzx eax, dl
    shr edx, 8
    add dword [rcx + rax*4 + 1*1024], 1
    movzx eax, dl
    add dword [rcx + rax*4 + 2*1024], 1
    shr edx, 8
    add dword [rcx + rdx*4 + 3*1024], 1
    mov edx, [rsi + 4]
    sub r8, 8
    ja .inner
    pop rbx
.tail:
    add r8, 16 ; restore count
    jz .done
.taillp:
    movzx eax, byte [rsi]
    inc rsi
    inc dword [rcx + rax*4]
    dec r8
    jnz .taillp
.done:
    pop rsi
    ret
; Increments value in histogram bucket by 1 | |
%macro INC1 2 ; slice, bucket | |
inc dword [rcx + (%1 % NUM_SLICES)*BYTES_PER_SLICE + %2*4] | |
%endmacro | |
; Increments value in histogram bucket by 2 | |
%macro INC2 2 ; slice, bucket | |
add dword [rcx + (%1 % NUM_SLICES)*BYTES_PER_SLICE + %2*4], 2 | |
%endmacro | |
; Updates sum for 8 bytes | |
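; Extracts the 8 bytes of a 64-bit register pairwise through its low/high
; byte aliases (%3/%4): after each bl/bh-style pair, the 64-bit register
; (%1) is shifted right by 16 to expose the next pair; the last shift uses
; the 32-bit alias (%2) since only bytes 6 and 7 remain at that point.
; Each byte updates its own slice via the INC1/INC2 macro passed in as %5.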
%macro SUM8 5 ; reg64 reg32 reg8L reg8H INC
    movzx eax, %3
    %5 0, rax
    movzx eax, %4
    shr %1, 16
    %5 1, rax
    movzx eax, %3
    %5 2, rax
    movzx eax, %4
    shr %1, 16
    %5 3, rax
    movzx eax, %3
    %5 4, rax
    movzx eax, %4
    shr %2, 16
    %5 5, rax
    movzx eax, %3
    %5 6, rax
    movzx eax, %4
    %5 7, rax
%endmacro

%define NUM_SLICES 4
%define BYTES_PER_SLICE 1024

global histo_asm_scalar8_var2_core
histo_asm_scalar8_var2_core:
    push rsi
    push r10
    push rbx
    lea rsi, [rdx + r8 - 24]
    neg r8
    add r8, 24 ; for readahead
    jge .tail
    ; Scalar (8x)
    ; Using a single 64b (8B) load, but reading ahead by one iteration
    ; unrolled 2x to avoid moves
    ; still 4 histogram slices
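    ;
    ; Loop setup: rsi points 24 bytes before the end of the input and
    ; r8 is a negative offset that counts up toward zero, so [rsi + r8]
    ; walks the buffer while "add r8, 16 / jl" advances the counter and
    ; tests for completion at the same time. We stop 24 bytes early
    ; because each iteration reads one 8-byte group ahead; the tail
    ; loop handles the remainder.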
    mov rbx, [rsi + r8]
align 16
.inner:
    mov rdx, [rsi + r8 + 8]
    SUM8 rbx, ebx, bl, bh, INC1
    mov rbx, [rsi + r8 + 16]
    SUM8 rdx, edx, dl, dh, INC1
    add r8, 16
    jl .inner
.tail:
    add rsi, 24
    sub r8, 24 ; restore count (as negative offset from the end)
    jz .done   ; guards against count == 0
.taillp:
    movzx eax, byte [rsi + r8]
    inc dword [rcx + rax*4]
    inc r8
    jnz .taillp
.done:
    pop rbx
    pop r10
    pop rsi
    ret
%undef NUM_SLICES
%undef BYTES_PER_SLICE

; ----
%define NUM_SLICES 8
%define BYTES_PER_SLICE 1024

global histo_asm_scalar8_var3_core
histo_asm_scalar8_var3_core:
    push rsi
    push r10
    push rbx
    lea rsi, [rdx + r8 - 24]
    neg r8
    add r8, 24 ; for readahead
    jge .tail
    ; Scalar (8x)
    ; Using a single 64b (8B) load, but reading ahead by one iteration
    ; unrolled 2x to avoid moves
    ; this one uses 8 histogram slices
    mov rbx, [rsi + r8]
align 16
.inner:
    mov rdx, [rsi + r8 + 8]
    SUM8 rbx, ebx, bl, bh, INC1
    mov rbx, [rsi + r8 + 16]
    SUM8 rdx, edx, dl, dh, INC1
    add r8, 16
    jl .inner
.tail:
    add rsi, 24
    sub r8, 24 ; restore count (as negative offset from the end)
    jz .done   ; guards against count == 0
.taillp:
    movzx eax, byte [rsi + r8]
    inc dword [rcx + rax*4]
    inc r8
    jnz .taillp
.done:
    pop rbx
    pop r10
    pop rsi
    ret
%undef NUM_SLICES
%undef BYTES_PER_SLICE

; ----
%define NUM_SLICES 8
%define BYTES_PER_SLICE (260*4)

global histo_asm_scalar8_var4_core
histo_asm_scalar8_var4_core:
    push rsi
    push r10
    push rbx
    lea rsi, [rdx + r8 - 24]
    neg r8
    add r8, 24 ; for readahead
    jge .tail
    ; Scalar (8x)
    ; Using a single 64b (8B) load, but reading ahead by one iteration
    ; unrolled 2x to avoid moves
    ; this one uses 8 histogram slices, and histograms have 260 slots
    ; not 256 to avoid 4k aliasing on runs of the same symbol
    ; (aliasing on descending runs seems less likely to occur in practice)
    mov rbx, [rsi + r8]
align 16
.inner:
    mov rdx, [rsi + r8 + 8]
    SUM8 rbx, ebx, bl, bh, INC1
    mov rbx, [rsi + r8 + 16]
    SUM8 rdx, edx, dl, dh, INC1
    add r8, 16
    jl .inner
.tail:
    add rsi, 24
    sub r8, 24 ; restore count (as negative offset from the end)
    jz .done   ; guards against count == 0
.taillp:
    movzx eax, byte [rsi + r8]
    inc dword [rcx + rax*4]
    inc r8
    jnz .taillp
.done:
    pop rbx
    pop r10
    pop rsi
    ret
%undef NUM_SLICES
%undef BYTES_PER_SLICE

; ----
%define NUM_SLICES 4
%define BYTES_PER_SLICE (256*4)

global histo_asm_scalar8_var5_core
histo_asm_scalar8_var5_core:
    push rsi
    push r10
    push rbx
    lea rsi, [rdx + r8 - 24]
    neg r8
    add r8, 24 ; for readahead
    jge .tail
    ; Scalar (8x)
    ; Using a single 64b (8B) load, but reading ahead by one iteration
    ; unrolled 2x to avoid moves
    ; back to 4 slices, but with a path to detect and handle long runs,
    ; since they are frequent for the original use case of this loop
    mov rbx, [rsi + r8]
align 16
.inner:
    mov rdx, [rsi + r8 + 8]
    cmp rbx, rdx ; if the next 8 bytes match the current 8, we can save work! (Also checks for runs.)
    je .add_double
    SUM8 rbx, ebx, bl, bh, INC1
    mov rbx, [rsi + r8 + 16]
    SUM8 rdx, edx, dl, dh, INC1
    add r8, 16
    jl .inner
.tail:
    add rsi, 24
    sub r8, 24 ; restore count (as negative offset from the end)
    jz .done   ; guards against count == 0
.taillp:
    movzx eax, byte [rsi + r8]
    inc dword [rcx + rax*4]
    inc r8
    jnz .taillp
.done:
    pop rbx
    pop r10
    pop rsi
    ret
align 16
.add_double:
    ; if we get here, we have a pair where the first and second 8 bytes
    ; in a group are identical; this is most likely a run of identical bytes,
    ; but even if not, we can still save a good deal of work
    ; check for sequence of all the same byte:
    ; x ^ (x << 8) XORs every byte (but byte 0) with its predecessor
    ; if that is <256, all 16 bytes are the same value
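    ; (in the all-same case, rdx then holds exactly the repeated byte
    ; value, since byte 0 gets XORed with the zero shifted in; that's
    ; why rdx doubles as the bucket index in the add below)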
    shl rdx, 8
    xor rdx, rbx
    cmp rdx, 256
    jnb .not_all_same
    ; add 16 to single bucket (just always use first slice)
    add dword [rcx + rdx*4], 16
    mov rbx, [rsi + r8 + 16]
    add r8, 16
    jl .inner
    jmp .tail
.not_all_same:
    ; add 2 to each bucket since we confirmed we have the same 8 bytes twice
    SUM8 rbx, ebx, bl, bh, INC2
    mov rbx, [rsi + r8 + 16]
    add r8, 16
    jl .inner
    jmp .tail

%undef NUM_SLICES
%undef BYTES_PER_SLICE
; ----
global histo_asm_sse4_core
histo_asm_sse4_core:
    mov r9, r8 ; original count
    shr r8, 4  ; trip count
    jz .tail
    ; SSE4
    ; Same idea as before, still 4 histogram slices,
    ; but use a 16-byte SSE load and PEXTRB.
    ;
    ; This shouldn't make much of a difference compared to
    ; the above (PEXTRB is 2 uops on the Intel uArchs I looked at,
    ; same as the movzx/shift pairs above), and indeed, in my
    ; tests anyway, it doesn't.
.inner:
    movdqu xmm0, [rdx]
    add rdx, 16
%assign i 0
%rep 16
    pextrb eax, xmm0, i
    add dword [rcx + rax*4 + (i&3)*256*4], 1
%assign i i+1
%endrep
    dec r8
    jnz .inner
.tail:
    and r9, 15 ; masked count
    jz .done
.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp
.done:
    ret
; ----
global histo_asm_avx256_8x_core1
histo_asm_avx256_8x_core1:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 5  ; trip count
    jz .tail
    vpbroadcastd ymm3, [rel vone]
    vmovdqu ymm4, [rel vbase]
    vpbroadcastd ymm5, [rel vmaskb]
    ; "AVX-256", variant 1.
    ;
    ; This is AVX-512, but "only" using 256-bit wide vectors;
    ; we're not using "heavy" instructions, so 256-bit stays
    ; in the highest frequency license level on SKX.
    ; (The reason to use AVX-512 is that we need both
    ; gathers and scatters.)
    ;
    ; This avoids collisions between the 8 lanes by using
    ; 8 separate histograms. This would be bad on older
    ; uArchs with 4k store->load aliasing, but those
    ; certainly don't support AVX-512, so it's not a
    ; concern.
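    ;
    ; Two AVX-512 details to keep in mind below: vpgatherdd merges
    ; into its destination and clears k1 bits as lanes complete, so
    ; the all-ones mask has to be re-materialized (kxnorb) before
    ; every gather/scatter, and zeroing the destination first breaks
    ; the dependency on the previous iteration's gather result.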
.inner:
    ; We treat these 32 bytes as 8 DWords
    vmovdqu ymm0, [rdx]
    add rdx, 32
    ; Extract bottom byte of every lane
    vpandd ymm1, ymm0, ymm5
    ; Add base so the lanes all go into different histograms
    ; (so they never conflict)
    vpaddd ymm1, ymm1, ymm4
    ; Dependency breaker
    vpxord ymm2, ymm2, ymm2
    ; Gather the histo bin values
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    ; Increment
    vpaddd ymm2, ymm2, ymm3
    ; Scatter updated values
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2
    ; Bits [15:8] of every lane
    vpsrld ymm1, ymm0, 8
    vpandd ymm1, ymm1, ymm5
    vpaddd ymm1, ymm1, ymm4
    vpxord ymm2, ymm2, ymm2
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    vpaddd ymm2, ymm2, ymm3
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2
    ; Bits [23:16] of every lane
    vpsrld ymm1, ymm0, 16
    vpandd ymm1, ymm1, ymm5
    vpaddd ymm1, ymm1, ymm4
    vpxord ymm2, ymm2, ymm2
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    vpaddd ymm2, ymm2, ymm3
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2
    ; Bits [31:24] of every lane
    vpsrld ymm1, ymm0, 24
    vpaddd ymm1, ymm1, ymm4
    vpxord ymm2, ymm2, ymm2
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    vpaddd ymm2, ymm2, ymm3
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2
    dec r8
    jnz .inner
.tail:
    and r9, 31 ; masked count
    jz .done
.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp
.done:
    vzeroupper
    ret
; ----
global histo_asm_avx256_8x_core2
histo_asm_avx256_8x_core2:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 3  ; trip count
    jz .tail
    vpbroadcastd ymm3, [rel vone]
    vmovdqu ymm4, [rel vbase]
    ; "AVX-256", variant 2
    ;
    ; Only do a single gather/scatter per iteration, and
    ; use vpmovzxbd to upconvert the 8 bytes.
    ;
    ; This is definitely worse than variant 1.
.inner:
    vpmovzxbd ymm0, [rdx]
    add rdx, 8
    kxnorb k1, k1, k1
    vpaddd ymm0, ymm0, ymm4
    vpxord ymm2, ymm2, ymm2
    vpgatherdd ymm2{k1}, [rcx + ymm0*4]
    vpaddd ymm2, ymm2, ymm3
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm0*4]{k1}, ymm2
    dec r8
    jnz .inner
.tail:
    and r9, 7 ; masked count
    jz .done
.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp
.done:
    vzeroupper
    ret
; ----
global histo_asm_avx256_8x_core3
histo_asm_avx256_8x_core3:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 5  ; trip count
    jz .tail
    vpbroadcastd ymm3, [rel vone]
    vpbroadcastd ymm17, [rel vtwo]
    vmovdqu ymm4, [rel vbase]
    vpbroadcastd ymm5, [rel vmaskb]
    ; "AVX-256", variant 3
    ;
    ; This is similar to variant 1, but manually
    ; checks whether adjacent batches of 8 collide;
    ; if so, the first "hit" in the batch updates the
    ; histogram count by 2, and the second is disabled.
    ; The idea here is to avoid store->load forwarding
    ; cases by hand.
    ;
    ; This is sometimes marginally better than
    ; variant 1.
.inner:
    vmovdqu ymm0, [rdx]
    add rdx, 32
    vpandd ymm1, ymm0, ymm5
    vpsrld ymm16, ymm0, 8
    vpandd ymm16, ymm16, ymm5
    vpcmpd k2, ymm1, ymm16, 0        ; second iter matches first iter?
    vpblendmd ymm18{k2}, ymm3, ymm17 ; second_matches ? 2 : 1
    vpaddd ymm1, ymm1, ymm4
    vpxord ymm2, ymm2, ymm2
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    vpaddd ymm2, ymm2, ymm18
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2
    vpaddd ymm16, ymm16, ymm4
    vpxord ymm18, ymm18, ymm18
    knotb k1, k2
    vpgatherdd ymm18{k1}, [rcx + ymm16*4]
    vpaddd ymm18, ymm18, ymm3
    knotb k1, k2
    vpscatterdd [rcx + ymm16*4]{k1}, ymm18
    vpsrld ymm1, ymm0, 16
    vpandd ymm1, ymm1, ymm5
    vpsrld ymm16, ymm0, 24
    vpcmpd k2, ymm1, ymm16, 0        ; second iter matches first iter?
    vpblendmd ymm18{k2}, ymm3, ymm17 ; second_matches ? 2 : 1
    vpaddd ymm1, ymm1, ymm4
    vpxord ymm2, ymm2, ymm2
    kxnorb k1, k1, k1
    vpgatherdd ymm2{k1}, [rcx + ymm1*4]
    vpaddd ymm2, ymm2, ymm18
    kxnorb k1, k1, k1
    vpscatterdd [rcx + ymm1*4]{k1}, ymm2
    vpaddd ymm16, ymm16, ymm4
    vpxord ymm2, ymm2, ymm2
    knotb k1, k2
    vpgatherdd ymm2{k1}, [rcx + ymm16*4]
    vpaddd ymm2, ymm2, ymm3
    knotb k1, k2
    vpscatterdd [rcx + ymm16*4]{k1}, ymm2
    dec r8
    jnz .inner
.tail:
    and r9, 31 ; masked count
    jz .done
.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp
.done:
    vzeroupper
    ret
; ----
global histo_asm_avx512_core_conflict
histo_asm_avx512_core_conflict:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 4  ; trip count
    jz .tail
    vpbroadcastd zmm4, [rel vone]
    vpbroadcastd zmm5, [rel v31]
    vpbroadcastd zmm17, [rel vneg1]
    ; AVX-512, conflict detect
    ;
    ; This is the algorithm from the Intel Optimization Manual (example 15-18),
    ; with typos fixed.
    ;
    ; This one only writes to a single histogram slice, which, given how much
    ; extra work that implies, is probably not a good idea.
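    ;
    ; Rough shape of the conflict handling below: vpconflictd gives each
    ; lane a bitmask of earlier lanes with the same bucket index, and
    ; 31 - lzcnt turns that into the index of the closest such lane
    ; (-1 if none). The .conflicts loop then follows these chains with
    ; vpermd, accumulating increments along each chain, so after the
    ; scatter (where the last write to a bucket wins) every bucket has
    ; received the total for all of its duplicate lanes.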
.inner:
    vpmovzxbd zmm0, [rdx]
    add rdx, 16
    vpconflictd zmm1, zmm0 ; zmm1=conflicts
    vmovaps zmm3, zmm4     ; vOne
    vpxord zmm2, zmm2, zmm2
    kxnorw k1, k1, k1
    vpgatherdd zmm2{k1}, [rcx + zmm0*4]
    vptestmd k1, zmm1, zmm1
    kortestw k1, k1
    jz .update
    vplzcntd zmm1, zmm1
    vpsubd zmm1, zmm5, zmm1
.conflicts:
    vpermd zmm16{k1}{z}, zmm1, zmm3
    vpermd zmm1{k1}, zmm1, zmm1
    vpaddd zmm3{k1}, zmm3, zmm16
    vpcmpd k1, zmm1, zmm17, 4
    kortestw k1, k1
    jnz .conflicts
.update:
    vpaddd zmm2, zmm2, zmm3
    kxnorw k1, k1, k1
    vpscatterdd [rcx + zmm0*4]{k1}, zmm2
    dec r8
    jnz .inner
.tail:
    and r9, 15 ; masked count
    jz .done
.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp
.done:
    vzeroupper
    ret
; ----
global histo_asm_avx512_core_8x
histo_asm_avx512_core_8x:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 4  ; trip count
    jz .tail
    mov eax, 0xff
    kmovw k3, eax ; "low half of lanes" mask
    vbroadcasti32x8 zmm4, [rel vbase]
    vpbroadcastd zmm5, [rel vone]
    vpaddd zmm16, zmm5, zmm5 ; two
    ; AVX-512, 8 histogram slices
    ;
    ; This means we can do the conflict detection manually like in var3 above.
    ; That seems preferable overall, and is indeed significantly faster in
    ; my tests.
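    ;
    ; Since the slice index is just (lane mod 8) here, the only possible
    ; conflicts within one 16-lane gather/scatter are between lane i and
    ; lane i+8 seeing the same byte value; that's exactly what the
    ; rotate-by-8 compare below detects.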
.inner:
    vpmovzxbd zmm0, [rdx]
    add rdx, 16
    vpaddd zmm1, zmm0, zmm4      ; add base lanes
    valignd zmm2, zmm0, zmm0, 8  ; input bytes rotated by 8 lanes
    vpcmpd k2{k3}, zmm0, zmm2, 0 ; check whether high half matches low half
    ; grab source bin values
    vpxord zmm2, zmm2, zmm2
    kxnorw k1, k1, k1
    vpgatherdd zmm2{k1}, [rcx + zmm1*4]
    ; depending on whether we have a conflict between matching high and low lanes,
    ; add either 1 or 2
    vpblendmd zmm0{k2}, zmm5, zmm16
    vpaddd zmm2, zmm2, zmm0
    ; determine output scatter mask
    kshiftlw k1, k2, 8
    knotw k1, k1
    vpscatterdd [rcx + zmm1*4]{k1}, zmm2
    dec r8
    jnz .inner
.tail:
    and r9, 15 ; masked count
    jz .done
.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp
.done:
    vzeroupper
    ret
; ----
global histo_asm_avx512_core_16x
histo_asm_avx512_core_16x:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 4  ; trip count
    jz .tail
    vmovdqu32 zmm4, [rel vbase]
    vpbroadcastd zmm5, [rel vone]
    ; AVX-512, 16 histogram slices
    ;
    ; No conflict detection necessary.
.inner:
    vpmovzxbd zmm0, [rdx]
    add rdx, 16
    vpaddd zmm1, zmm0, zmm4 ; add base lanes
    ; grab source bin values
    vpxord zmm2, zmm2, zmm2
    kxnorw k1, k1, k1
    vpgatherdd zmm2{k1}, [rcx + zmm1*4]
    ; increment
    vpaddd zmm2, zmm2, zmm5
    kxnorw k1, k1, k1
    vpscatterdd [rcx + zmm1*4]{k1}, zmm2
    dec r8
    jnz .inner
.tail:
    and r9, 15 ; masked count
    jz .sumit
.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp
.sumit:
    ; summing loop
    mov r9, (256-16)*4
.sum:
    vmovdqa32 zmm0, [rcx + r9 + 0*1024]
%assign i 1
%rep 15
    vpaddd zmm0, zmm0, [rcx + r9 + i*1024]
%assign i i+1
%endrep
    vmovdqa32 [rcx + r9 + 0*1024], zmm0
    sub r9, 16*4
    jns .sum
.done:
    vzeroupper
    ret
; ----
global histo_asm_avx512_core_16x_var2
histo_asm_avx512_core_16x_var2:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 5  ; trip count
    jz .tail
    vmovdqu32 zmm4, [rel vbase]
    vpbroadcastd zmm5, [rel vone]
    vpaddd zmm16, zmm5, zmm5 ; two
    ; AVX-512, 16 histogram slices
    ;
    ; Grab two runs of 16, detect conflicts between them.
.inner:
    vpmovzxbd zmm0, [rdx]
    vpmovzxbd zmm1, [rdx + 16]
    add rdx, 32
    vpcmpd k2, zmm0, zmm1, 0 ; check whether upper 16 match lower 16
    vpaddd zmm0, zmm0, zmm4  ; add base lanes
    vpaddd zmm1, zmm1, zmm4
    ; grab source bin values
    vpxord zmm2, zmm2, zmm2
    kxnorw k1, k1, k1
    vpgatherdd zmm2{k1}, [rcx + zmm0*4]
    ; depending on whether we have a conflict between matching high and low lanes,
    ; add either 1 or 2
    vpblendmd zmm3{k2}, zmm5, zmm16
    vpaddd zmm2, zmm2, zmm3
    ; store updated
    kxnorw k1, k1, k1
    vpscatterdd [rcx + zmm0*4]{k1}, zmm2
    ; grab source bin values for second half
    vpxord zmm2, zmm2, zmm2
    knotw k1, k2
    vpgatherdd zmm2{k1}, [rcx + zmm1*4]
    ; increment
    vpaddd zmm2, zmm2, zmm5
    ; store updated
    knotw k1, k2
    vpscatterdd [rcx + zmm1*4]{k1}, zmm2
    dec r8
    jnz .inner
.tail:
    and r9, 31 ; masked count
    jz .sumit
.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp
.sumit:
    ; summing loop
    mov r9, (256-16)*4
.sum:
    vmovdqa32 zmm0, [rcx + r9 + 0*1024]
%assign i 1
%rep 15
    vpaddd zmm0, zmm0, [rcx + r9 + i*1024]
%assign i i+1
%endrep
    vmovdqa32 [rcx + r9 + 0*1024], zmm0
    sub r9, 16*4
    jns .sum
.done:
    vzeroupper
    ret
; ----
global histo_asm_avx512_core_16x_var3
histo_asm_avx512_core_16x_var3:
    vzeroupper
    mov r9, r8 ; original count
    shr r8, 5  ; trip count
    jz .tail
    vmovdqu32 zmm4, [rel vbase]
    vpbroadcastd zmm5, [rel vone]
    vpaddd zmm16, zmm5, zmm5 ; two
    vpmovzxbd zmm17, [rdx]
    vpmovzxbd zmm18, [rdx + 16]
    ; AVX-512, 16 histogram slices
    ;
    ; Grab two runs of 16, detect conflicts between them.
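    ;
    ; NOTE: the software-pipelined "loads for next round" below read one
    ; 32-byte block past the last one processed, i.e. up to 32 bytes past
    ; the end of the input on the final trip. Usually harmless, but
    ; strictly out of bounds; a robust version would need slack after
    ; the buffer or a shortened trip count.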
.inner:
    vpcmpd k2, zmm17, zmm18, 0 ; check whether upper 16 match lower 16
    vpaddd zmm0, zmm17, zmm4   ; add base lanes
    vpaddd zmm1, zmm18, zmm4
    ; loads for next round
    vpmovzxbd zmm17, [rdx + 32]
    vpmovzxbd zmm18, [rdx + 48]
    add rdx, 32
    ; grab source bin values
    vpxord zmm2, zmm2, zmm2
    kxnorw k1, k1, k1
    vpgatherdd zmm2{k1}, [rcx + zmm0*4]
    ; depending on whether we have a conflict between matching high and low lanes,
    ; add either 1 or 2
    vpblendmd zmm3{k2}, zmm5, zmm16
    vpaddd zmm2, zmm2, zmm3
    ; store updated
    kxnorw k1, k1, k1
    vpscatterdd [rcx + zmm0*4]{k1}, zmm2
    ; grab source bin values for second half
    vpxord zmm2, zmm2, zmm2
    knotw k1, k2
    vpgatherdd zmm2{k1}, [rcx + zmm1*4]
    ; increment
    vpaddd zmm2, zmm2, zmm5
    ; store updated
    knotw k1, k2
    vpscatterdd [rcx + zmm1*4]{k1}, zmm2
    dec r8
    jnz .inner
.tail:
    and r9, 31 ; masked count
    jz .sumit
.taillp:
    movzx eax, byte [rdx]
    inc rdx
    inc dword [rcx + rax*4]
    dec r9
    jnz .taillp
.sumit:
    ; summing loop
    mov r9, (256-16)*4
.sum:
    vmovdqa32 zmm0, [rcx + r9 + 0*1024]
%assign i 1
%rep 15
    vpaddd zmm0, zmm0, [rcx + r9 + i*1024]
%assign i i+1
%endrep
    vmovdqa32 [rcx + r9 + 0*1024], zmm0
    sub r9, 16*4
    jns .sum
.done:
    vzeroupper
    ret

---- histotest.cpp ----
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <new>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <emmintrin.h>

typedef uint8_t U8;
typedef uint32_t U32;
typedef uint64_t U64;

//#define NOVALIDATE

static U8 *read_file(const char *filename, size_t *out_size)
{
    U8 *ret = NULL;
    FILE *f = fopen(filename, "rb");
    if (f)
    {
        fseek(f, 0, SEEK_END);
        long size = ftell(f);
        fseek(f, 0, SEEK_SET);
        // nothrow so the failure check below actually means something
        U8 *mem = new (std::nothrow) U8[size];
        if (mem)
        {
            if (fread(mem, size, 1, f) == 1)
            {
                if (out_size)
                    *out_size = size;
                ret = mem;
            }
            else
                delete[] mem;
        }
        fclose(f);
    }
    return ret;
}

static inline U64 cycle_timer()
{
#ifdef _MSC_VER
    return __rdtsc();
#else
    U32 lo, hi;
    __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
    return lo | ((U64)hi << 32);
#endif
}
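
// Note: on modern x86, rdtsc counts at the fixed TSC reference frequency
// rather than the actual core clock, so the "cycles" reported below are
// reference cycles.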

static inline uint32_t read32(const void *p)
{
    uint32_t x;
    memcpy(&x, p, sizeof(x));
    return x;
}

static void histo_ref(U32 * counts, const U8 * rawArray, size_t rawLen)
{
    memset(counts,0,256*sizeof(U32));
    for (size_t i = 0; i < rawLen; i++)
        counts[rawArray[i]]++;
}

static void histo_cpp_1x(U32 * counts, const U8 * rawArray, size_t rawLen)
{
    memset(counts,0,256*sizeof(U32));
    const U8 * rawPtr = rawArray;
    const U8 * rawEnd = rawArray+rawLen;
    const U8 * rawEndMul4 = rawArray+(rawLen&~3);
    while(rawPtr < rawEndMul4)
    {
        U32 x = read32(rawPtr);
        counts[x & 0xff]++; x >>= 8;
        counts[x & 0xff]++; x >>= 8;
        counts[x & 0xff]++; x >>= 8;
        counts[x]++; // last one doesn't need to mask
        rawPtr += 4;
    }
    // finish the last few bytes
    while(rawPtr < rawEnd)
        counts[ *rawPtr++ ]++;
}

static void histo_cpp_2x(U32 * counts, const U8 * rawArray, size_t rawLen)
{
    U32 __declspec(align(16)) countsArray[2][256];
    memset(countsArray,0,sizeof(countsArray));
    const U8 * rawPtr = rawArray;
    const U8 * rawEnd = rawArray+rawLen;
    const U8 * rawEndMul4 = rawArray+(rawLen&~3);
    while(rawPtr < rawEndMul4)
    {
        U32 x = read32(rawPtr);
        countsArray[0][x & 0xff]++; x >>= 8;
        countsArray[1][x & 0xff]++; x >>= 8;
        countsArray[0][x & 0xff]++; x >>= 8;
        countsArray[1][x]++; // last one doesn't need to mask
        rawPtr += 4;
    }
    // finish the last few bytes (just throw them into array 0, doesn't matter)
    while(rawPtr < rawEnd)
        countsArray[0][ *rawPtr++ ]++;
    // sum the counts arrays together
    for (int k=0; k<256; k+=4)
    {
        __m128i sum = _mm_load_si128((const __m128i *) &countsArray[0][k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &countsArray[1][k]);
        _mm_storeu_si128((__m128i *)&counts[k], sum);
    }
}

static void finish_histo_4x(U32 * counts_out, const U32 * counts4x)
{
    for (size_t k=0; k<256; k+=4)
    {
        __m128i sum = _mm_load_si128((const __m128i *) &counts4x[0*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts4x[1*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts4x[2*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts4x[3*256 + k]);
        _mm_storeu_si128((__m128i *)&counts_out[k], sum);
    }
}

static void finish_histo_8x(U32 * counts_out, const U32 * counts8x)
{
    for (size_t k=0; k<256; k+=4)
    {
        __m128i sum = _mm_load_si128((const __m128i *) &counts8x[0*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[1*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[2*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[3*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[4*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[5*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[6*256 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[7*256 + k]);
        _mm_storeu_si128((__m128i *)&counts_out[k], sum);
    }
}

static void finish_histo_8x_260(U32 * counts_out, const U32 * counts8x)
{
    for (size_t k=0; k<256; k+=4)
    {
        __m128i sum = _mm_load_si128((const __m128i *) &counts8x[0*260 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[1*260 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[2*260 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[3*260 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[4*260 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[5*260 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[6*260 + k]);
        sum = _mm_add_epi32(sum, *(const __m128i *) &counts8x[7*260 + k]);
        _mm_storeu_si128((__m128i *)&counts_out[k], sum);
    }
}

static void histo_cpp_4x(U32 * counts, const U8 * rawArray, size_t rawLen)
{
    U32 __declspec(align(16)) countsArray[4*256];
    memset(countsArray,0,sizeof(countsArray));
    const U8 * rawPtr = rawArray;
    const U8 * rawEnd = rawArray+rawLen;
    const U8 * rawEndMul4 = rawArray+(rawLen&~3);
    while(rawPtr < rawEndMul4)
    {
        U32 x = read32(rawPtr);
        countsArray[0*256 + (x & 0xff)]++; x >>= 8;
        countsArray[1*256 + (x & 0xff)]++; x >>= 8;
        countsArray[2*256 + (x & 0xff)]++; x >>= 8;
        countsArray[3*256 + x]++; // last one doesn't need to mask
        rawPtr += 4;
    }
    // finish the last few bytes (just throw them into slice 0, doesn't matter)
    while(rawPtr < rawEnd)
        countsArray[ *rawPtr++ ]++;
    finish_histo_4x(counts, countsArray);
}
extern "C" void histo_asm_scalar4_core(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_scalar8_core(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_scalar8_var_core(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_scalar8_var2_core(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_scalar8_var3_core(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_scalar8_var4_core(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_scalar8_var5_core(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_sse4_core(U32 * histo, const U8 * bytes, size_t nbytes); | |
static void histo_asm_scalar4(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[4*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_scalar4_core(countsArray, rawArray, rawLen); | |
finish_histo_4x(counts, countsArray); | |
} | |
static void histo_asm_scalar8(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[4*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_scalar8_core(countsArray, rawArray, rawLen); | |
finish_histo_4x(counts, countsArray); | |
} | |
static void histo_asm_scalar8_var(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[4*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_scalar8_var_core(countsArray, rawArray, rawLen); | |
finish_histo_4x(counts, countsArray); | |
} | |
static void histo_asm_scalar8_var2(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[4*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_scalar8_var2_core(countsArray, rawArray, rawLen); | |
finish_histo_4x(counts, countsArray); | |
} | |
static void histo_asm_scalar8_var3(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[8*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_scalar8_var3_core(countsArray, rawArray, rawLen); | |
finish_histo_8x(counts, countsArray); | |
} | |
static void histo_asm_scalar8_var4(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[8*260]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_scalar8_var4_core(countsArray, rawArray, rawLen); | |
finish_histo_8x_260(counts, countsArray); | |
} | |
static void histo_asm_scalar8_var5(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[4*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_scalar8_var5_core(countsArray, rawArray, rawLen); | |
finish_histo_4x(counts, countsArray); | |
} | |
static void histo_asm_sse4(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[4*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_sse4_core(countsArray, rawArray, rawLen); | |
finish_histo_4x(counts, countsArray); | |
} | |
extern "C" void histo_asm_avx256_8x_core1(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_avx256_8x_core2(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_avx256_8x_core3(U32 * histo, const U8 * bytes, size_t nbytes); | |
static void histo_asm_avx256_8x_1(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[8*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_avx256_8x_core1(countsArray, rawArray, rawLen); | |
finish_histo_8x(counts, countsArray); | |
} | |
static void histo_asm_avx256_8x_2(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[8*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_avx256_8x_core2(countsArray, rawArray, rawLen); | |
finish_histo_8x(counts, countsArray); | |
} | |
static void histo_asm_avx256_8x_3(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[8*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_avx256_8x_core3(countsArray, rawArray, rawLen); | |
finish_histo_8x(counts, countsArray); | |
} | |
extern "C" void histo_asm_avx512_core_conflict(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_avx512_core_8x(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_avx512_core_16x(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_avx512_core_16x_var2(U32 * histo, const U8 * bytes, size_t nbytes); | |
extern "C" void histo_asm_avx512_core_16x_var3(U32 * histo, const U8 * bytes, size_t nbytes); | |
static void histo_asm_avx512_conflict(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
memset(counts,0,256*sizeof(U32)); | |
histo_asm_avx512_core_conflict(counts, rawArray, rawLen); | |
} | |
static void histo_asm_avx512_8x(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(16)) countsArray[8*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_avx512_core_8x(countsArray, rawArray, rawLen); | |
finish_histo_8x(counts, countsArray); | |
} | |
static void histo_asm_avx512_16x(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(64)) countsArray[16*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_avx512_core_16x(countsArray, rawArray, rawLen); | |
memcpy(counts, countsArray, 256*sizeof(U32)); | |
} | |
static void histo_asm_avx512_16x_var2(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(64)) countsArray[16*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_avx512_core_16x_var2(countsArray, rawArray, rawLen); | |
memcpy(counts, countsArray, 256*sizeof(U32)); | |
} | |
static void histo_asm_avx512_16x_var3(U32 * counts, const U8 * rawArray, size_t rawLen) | |
{ | |
U32 __declspec(align(64)) countsArray[16*256]; | |
memset(countsArray,0,sizeof(countsArray)); | |
histo_asm_avx512_core_16x_var3(countsArray, rawArray, rawLen); | |
memcpy(counts, countsArray, 256*sizeof(U32)); | |
} | |
typedef void histo_func(U32 * histo, const U8 * bytes, size_t nbytes); | |
U32 g_histo[256]; // global sink | |
static void testit(const char *name, histo_func *fn, const U8 * bytes, size_t nbytes)
{
#ifndef NOVALIDATE
    // check correctness
    U32 ref_histo[256];
    U32 our_histo[256];
    histo_ref(ref_histo, bytes, nbytes);
    fn(our_histo, bytes, nbytes);
    if (memcmp(ref_histo, our_histo, sizeof(our_histo)) != 0)
    {
        printf("%30s: incorrect result!\n", name);
        return;
    }
#endif
    // check timing
    U64 min_dur = ~(U64)0;
    for (int run = 0; run < 7500; run++)
    //for (int run = 0; run < 15000; run++)
    //for (int run = 0; run < 50000; run++)
    {
        U64 start = cycle_timer();
        fn(g_histo, bytes, nbytes);
        U64 duration = cycle_timer() - start;
        if (duration < min_dur)
            min_dur = duration;
    }
    printf("%30s: %7llu (%.2f/byte)\n", name, (unsigned long long)min_dur, 1.0*min_dur/nbytes);
}
static void test_file(const char *label, const U8 *bytes, size_t size)
{
    printf("%s: %d bytes\n", label, (int)size);
#define TESTIT(name) testit(#name,name,bytes,size)
#if 1
    TESTIT(histo_ref);
    TESTIT(histo_cpp_1x);
    TESTIT(histo_cpp_2x);
    TESTIT(histo_cpp_4x);
    TESTIT(histo_asm_scalar4);
    TESTIT(histo_asm_scalar8);
    TESTIT(histo_asm_scalar8_var);
    TESTIT(histo_asm_scalar8_var2);
    TESTIT(histo_asm_scalar8_var3);
    TESTIT(histo_asm_scalar8_var4);
    TESTIT(histo_asm_scalar8_var5);
    TESTIT(histo_asm_sse4);
    TESTIT(histo_asm_avx256_8x_1);
    TESTIT(histo_asm_avx256_8x_2);
    TESTIT(histo_asm_avx256_8x_3);
    TESTIT(histo_asm_avx512_conflict);
    TESTIT(histo_asm_avx512_8x);
    TESTIT(histo_asm_avx512_16x);
    TESTIT(histo_asm_avx512_16x_var2);
    TESTIT(histo_asm_avx512_16x_var3);
#endif
    //TESTIT(histo_ref);
    //TESTIT(histo_cpp_4x);
    //TESTIT(histo_asm_scalar8_var2);
    //TESTIT(histo_asm_scalar8_var3);
    //TESTIT(histo_asm_scalar8_var4);
#undef TESTIT
}
int main(int argc, char **argv)
{
    if (argc < 2)
    {
        fprintf(stderr, "Usage: histotest <filename>\n");
        return 1;
    }
    const char *filename = argv[1];
    size_t size;
    U8 *bytes = read_file(filename, &size);
    if (!bytes)
    {
        fprintf(stderr, "Error loading '%s'\n", filename);
        return 1;
    }
    test_file(filename, bytes, size);
    static U8 zeroes[128*1024];
    test_file("zeroes", zeroes, 128*1024);
    delete[] bytes;
    return 0;
}