Last active
August 29, 2015 14:05
-
-
Save ChrisLundquist/7f98cd7d120abb6edaa3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ gcc -Wall -Wextra -pedantic -std=c11 -O3 -S -march=corei7 main.c | |
# Gives you the popcnt assembly, GCC knows we have the instruction | |
$ gcc -Wall -Wextra -pedantic -std=c11 -O3 -S main.c | |
# Gives you the mask assembly |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <string.h> | |
#define ELEMENT_COUNT 1000000000 | |
void generate_data(short* data, int seed) { | |
srand(seed); | |
for(uint64_t i = 0; i < ELEMENT_COUNT; ++i) { | |
data[i] = (short) rand(); | |
} | |
} | |
void generate_table(char* table) { | |
for(unsigned i = 0; i < 256; ++i) | |
table[i] = __builtin_popcount(i); | |
} | |
int main(int argc, char* argv[]) { | |
short* data = malloc( ELEMENT_COUNT * sizeof(short)); | |
uint64_t sum = 0; | |
int seed = 0; /* so we can reproduce the output */ | |
srand(seed); | |
char* table = malloc( 256 * sizeof(char)); | |
generate_table(table); | |
generate_data(data, seed); | |
for(uint64_t i = 0; i < ELEMENT_COUNT * (sizeof(short) / sizeof(char)); ++i) { | |
sum += table[((char*) data)[i]]; /* walk byte at a time */ | |
} | |
printf("Set bit count was: %llu\n", sum); | |
free(data); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.section __TEXT,__text,regular,pure_instructions | |
.globl _generate_data | |
.align 4, 0x90 | |
_generate_data: ## @generate_data | |
.cfi_startproc | |
## BB#0: | |
pushq %rbp | |
Ltmp3: | |
.cfi_def_cfa_offset 16 | |
Ltmp4: | |
.cfi_offset %rbp, -16 | |
movq %rsp, %rbp | |
Ltmp5: | |
.cfi_def_cfa_register %rbp | |
pushq %r14 | |
pushq %rbx | |
Ltmp6: | |
.cfi_offset %rbx, -32 | |
Ltmp7: | |
.cfi_offset %r14, -24 | |
movq %rdi, %r14 | |
movl %esi, %edi | |
callq _srand | |
xorl %ebx, %ebx | |
.align 4, 0x90 | |
LBB0_1: ## =>This Inner Loop Header: Depth=1 | |
callq _rand | |
movw %ax, (%r14,%rbx,2) | |
incq %rbx | |
cmpq $1000000000, %rbx ## imm = 0x3B9ACA00 | |
jne LBB0_1 | |
## BB#2: | |
popq %rbx | |
popq %r14 | |
popq %rbp | |
ret | |
.cfi_endproc | |
.globl _generate_table | |
.align 4, 0x90 | |
_generate_table: ## @generate_table | |
.cfi_startproc | |
## BB#0: | |
pushq %rbp | |
Ltmp10: | |
.cfi_def_cfa_offset 16 | |
Ltmp11: | |
.cfi_offset %rbp, -16 | |
movq %rsp, %rbp | |
Ltmp12: | |
.cfi_def_cfa_register %rbp | |
xorl %eax, %eax | |
.align 4, 0x90 | |
LBB1_1: ## =>This Inner Loop Header: Depth=1 | |
movl %eax, %ecx | |
shrl %ecx | |
andl $1431655765, %ecx ## imm = 0x55555555 | |
movl %eax, %edx | |
subl %ecx, %edx | |
movl %edx, %ecx | |
andl $858993459, %ecx ## imm = 0x33333333 | |
shrl $2, %edx | |
andl $858993459, %edx ## imm = 0x33333333 | |
addl %ecx, %edx | |
movl %edx, %ecx | |
shrl $4, %ecx | |
addl %edx, %ecx | |
andl $252645135, %ecx ## imm = 0xF0F0F0F | |
imull $16843009, %ecx, %ecx ## imm = 0x1010101 | |
shrl $24, %ecx | |
movb %cl, (%rdi,%rax) | |
incq %rax | |
cmpq $255, %rax | |
jne LBB1_1 | |
## BB#2: | |
popq %rbp | |
ret | |
.cfi_endproc | |
.globl _main | |
.align 4, 0x90 | |
_main: ## @main | |
.cfi_startproc | |
## BB#0: | |
pushq %rbp | |
Ltmp16: | |
.cfi_def_cfa_offset 16 | |
Ltmp17: | |
.cfi_offset %rbp, -16 | |
movq %rsp, %rbp | |
Ltmp18: | |
.cfi_def_cfa_register %rbp | |
pushq %r15 | |
pushq %r14 | |
pushq %r13 | |
pushq %r12 | |
pushq %rbx | |
pushq %rax | |
Ltmp19: | |
.cfi_offset %rbx, -56 | |
Ltmp20: | |
.cfi_offset %r12, -48 | |
Ltmp21: | |
.cfi_offset %r13, -40 | |
Ltmp22: | |
.cfi_offset %r14, -32 | |
Ltmp23: | |
.cfi_offset %r15, -24 | |
movl $2000000000, %edi ## imm = 0x77359400 | |
callq _malloc | |
movq %rax, %r14 | |
xorl %ebx, %ebx | |
xorl %edi, %edi | |
callq _srand | |
movl $255, %edi | |
callq _malloc | |
movq %rax, %r15 | |
.align 4, 0x90 | |
LBB2_1: ## =>This Inner Loop Header: Depth=1 | |
movl %ebx, %eax | |
shrl %eax | |
andl $1431655765, %eax ## imm = 0x55555555 | |
movl %ebx, %ecx | |
subl %eax, %ecx | |
movl %ecx, %eax | |
andl $858993459, %eax ## imm = 0x33333333 | |
shrl $2, %ecx | |
andl $858993459, %ecx ## imm = 0x33333333 | |
addl %eax, %ecx | |
movl %ecx, %eax | |
shrl $4, %eax | |
addl %ecx, %eax | |
andl $252645135, %eax ## imm = 0xF0F0F0F | |
imull $16843009, %eax, %eax ## imm = 0x1010101 | |
shrl $24, %eax | |
movb %al, (%r15,%rbx) | |
incq %rbx | |
cmpq $255, %rbx | |
jne LBB2_1 | |
## BB#2: ## %generate_table.exit | |
xorl %r13d, %r13d | |
xorl %edi, %edi | |
callq _srand | |
movl $1, %r12d | |
xorl %ebx, %ebx | |
.align 4, 0x90 | |
LBB2_3: ## =>This Inner Loop Header: Depth=1 | |
callq _rand | |
movw %ax, (%r14,%rbx,2) | |
incq %rbx | |
cmpq $1000000000, %rbx ## imm = 0x3B9ACA00 | |
jne LBB2_3 | |
## BB#4: | |
xorl %esi, %esi | |
.align 4, 0x90 | |
LBB2_5: ## %vector.body | |
## =>This Inner Loop Header: Depth=1 | |
movq %rsi, %rax | |
movq %r13, %rcx | |
movsbq -1(%r14,%r12), %rdx | |
movsbq (%r14,%r12), %rsi | |
movsbq (%r15,%rdx), %r13 | |
movsbq (%r15,%rsi), %rsi | |
addq %rcx, %r13 | |
addq %rax, %rsi | |
addq $2, %r12 | |
cmpq $2000000001, %r12 ## imm = 0x77359401 | |
jne LBB2_5 | |
## BB#6: ## %middle.block | |
addq %r13, %rsi | |
leaq L_.str(%rip), %rdi | |
xorl %eax, %eax | |
callq _printf | |
movq %r14, %rdi | |
callq _free | |
xorl %eax, %eax | |
addq $8, %rsp | |
popq %rbx | |
popq %r12 | |
popq %r13 | |
popq %r14 | |
popq %r15 | |
popq %rbp | |
ret | |
.cfi_endproc | |
.section __TEXT,__cstring,cstring_literals | |
L_.str: ## @.str | |
.asciz "Set bit count was: %llu\n" | |
.subsections_via_symbols |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <string.h> | |
#define ELEMENT_COUNT 10000000 | |
int main(int argc, char* argv[]) { | |
short* data = malloc( ELEMENT_COUNT * sizeof(short)); | |
uint64_t sum = 0; | |
int seed = 0; /* so we can reproduce the output */ | |
srand(seed); | |
for(uint64_t i = 0; i < ELEMENT_COUNT; ++i) { | |
data[i] = (short) rand(); | |
} | |
for(uint64_t i = 0; i < ELEMENT_COUNT; ++i) { | |
sum += __builtin_popcount(data[i]); | |
} | |
printf("Set bit count was: %llu\n", sum); | |
free(data); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.section __TEXT,__text,regular,pure_instructions | |
.globl _generate_data | |
.align 4, 0x90 | |
_generate_data: ## @generate_data | |
.cfi_startproc | |
## BB#0: | |
pushq %rbp | |
Ltmp2: | |
.cfi_def_cfa_offset 16 | |
Ltmp3: | |
.cfi_offset %rbp, -16 | |
movq %rsp, %rbp | |
Ltmp4: | |
.cfi_def_cfa_register %rbp | |
popq %rbp | |
ret | |
.cfi_endproc | |
.globl _main | |
.align 4, 0x90 | |
_main: ## @main | |
.cfi_startproc | |
## BB#0: | |
pushq %rbp | |
Ltmp8: | |
.cfi_def_cfa_offset 16 | |
Ltmp9: | |
.cfi_offset %rbp, -16 | |
movq %rsp, %rbp | |
Ltmp10: | |
.cfi_def_cfa_register %rbp | |
pushq %r15 | |
pushq %r14 | |
pushq %rbx | |
pushq %rax | |
Ltmp11: | |
.cfi_offset %rbx, -40 | |
Ltmp12: | |
.cfi_offset %r14, -32 | |
Ltmp13: | |
.cfi_offset %r15, -24 | |
movl $20000000, %edi ## imm = 0x1312D00 | |
callq _malloc | |
movq %rax, %r14 | |
xorl %r15d, %r15d | |
xorl %edi, %edi | |
callq _srand | |
xorl %ebx, %ebx | |
.align 4, 0x90 | |
LBB1_1: ## =>This Inner Loop Header: Depth=1 | |
callq _rand | |
movw %ax, (%r14,%rbx,2) | |
incq %rbx | |
cmpq $10000000, %rbx ## imm = 0x989680 | |
jne LBB1_1 | |
## BB#2: | |
xorl %esi, %esi | |
.align 4, 0x90 | |
LBB1_3: ## %.preheader | |
## =>This Inner Loop Header: Depth=1 | |
movq %rsi, %rax | |
movswl (%r14,%r15,2), %ecx | |
movl %ecx, %edx | |
shrl %edx | |
andl $1431655765, %edx ## imm = 0x55555555 | |
subl %edx, %ecx | |
movl %ecx, %edx | |
andl $858993459, %edx ## imm = 0x33333333 | |
shrl $2, %ecx | |
andl $858993459, %ecx ## imm = 0x33333333 | |
addl %edx, %ecx | |
movl %ecx, %edx | |
shrl $4, %edx | |
addl %ecx, %edx | |
andl $252645135, %edx ## imm = 0xF0F0F0F | |
imull $16843009, %edx, %esi ## imm = 0x1010101 | |
shrl $24, %esi | |
addq %rax, %rsi | |
incq %r15 | |
cmpq $10000000, %r15 ## imm = 0x989680 | |
jne LBB1_3 | |
## BB#4: | |
leaq L_.str(%rip), %rdi | |
xorl %eax, %eax | |
callq _printf | |
movq %r14, %rdi | |
callq _free | |
xorl %eax, %eax | |
addq $8, %rsp | |
popq %rbx | |
popq %r14 | |
popq %r15 | |
popq %rbp | |
ret | |
.cfi_endproc | |
.section __TEXT,__cstring,cstring_literals | |
L_.str: ## @.str | |
.asciz "Set bit count was: %llu\n" | |
.subsections_via_symbols |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.section __TEXT,__text,regular,pure_instructions | |
.globl _generate_data | |
.align 4, 0x90 | |
_generate_data: ## @generate_data | |
.cfi_startproc | |
## BB#0: | |
pushq %rbp | |
Ltmp2: | |
.cfi_def_cfa_offset 16 | |
Ltmp3: | |
.cfi_offset %rbp, -16 | |
movq %rsp, %rbp | |
Ltmp4: | |
.cfi_def_cfa_register %rbp | |
popq %rbp | |
ret | |
.cfi_endproc | |
.globl _main | |
.align 4, 0x90 | |
_main: ## @main | |
.cfi_startproc | |
## BB#0: | |
pushq %rbp | |
Ltmp8: | |
.cfi_def_cfa_offset 16 | |
Ltmp9: | |
.cfi_offset %rbp, -16 | |
movq %rsp, %rbp | |
Ltmp10: | |
.cfi_def_cfa_register %rbp | |
pushq %r15 | |
pushq %r14 | |
pushq %rbx | |
pushq %rax | |
Ltmp11: | |
.cfi_offset %rbx, -40 | |
Ltmp12: | |
.cfi_offset %r14, -32 | |
Ltmp13: | |
.cfi_offset %r15, -24 | |
movl $20000000, %edi ## imm = 0x1312D00 | |
callq _malloc | |
movq %rax, %r14 | |
xorl %r15d, %r15d | |
xorl %edi, %edi | |
callq _srand | |
xorl %ebx, %ebx | |
.align 4, 0x90 | |
LBB1_1: ## =>This Inner Loop Header: Depth=1 | |
callq _rand | |
movw %ax, (%r14,%rbx,2) | |
incq %rbx | |
cmpq $10000000, %rbx ## imm = 0x989680 | |
jne LBB1_1 | |
## BB#2: | |
xorl %esi, %esi | |
.align 4, 0x90 | |
LBB1_3: ## %.preheader | |
## =>This Inner Loop Header: Depth=1 | |
movq %rsi, %rax | |
movswl (%r14,%r15,2), %ecx | |
popcntl %ecx, %esi | |
addq %rax, %rsi | |
incq %r15 | |
cmpq $10000000, %r15 ## imm = 0x989680 | |
jne LBB1_3 | |
## BB#4: | |
leaq L_.str(%rip), %rdi | |
xorl %eax, %eax | |
callq _printf | |
movq %r14, %rdi | |
callq _free | |
xorl %eax, %eax | |
addq $8, %rsp | |
popq %rbx | |
popq %r14 | |
popq %r15 | |
popq %rbp | |
ret | |
.cfi_endproc | |
.section __TEXT,__cstring,cstring_literals | |
L_.str: ## @.str | |
.asciz "Set bit count was: %llu\n" | |
.subsections_via_symbols |
Added lookup.c which uses a lookup table. It ran significantly faster, and I was curious as to why. Turns out it "cheated" and the auto vectorizer was doing movq
. I did the same thing with popcnt
casting our short*
to an uint64_t*
and popcnt
won again.
At some point though, the latency of the popcount instruction for large register sizes might exceed the latency of the lookup table (since it will probably be in L1 or L2). The autovectorizer complicates our analysis though because you can do this task in parallel many different ways. Partitioning the data set into jobs, using OpenCL or something to launch a bunch of threads, etc.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Some casual benchmarking (increasing the array size by 100x for both tests) shows it is about 10% faster. I was expecting more.