ChrisLundquist · August 29, 2015 14:05 · ChrisLundquist · Aug 7, 2014 · ChrisLundquist · Aug 8, 2014
diff --git a/Build b/Build
 $ gcc -Wall -Wextra -pedantic -std=c11 -O3 -S -march=corei7 main.c
 # Gives you the popcnt assembly, GCC knows we have the instruction

 $ gcc -Wall -Wextra -pedantic -std=c11 -O3 -S  main.c 
 # Gives you the mask assembly
diff --git a/lookup.c b/lookup.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>

 #define ELEMENT_COUNT 1000000000

 void generate_data(short* data, int seed) {
    srand(seed);

    for(uint64_t i = 0; i < ELEMENT_COUNT; ++i) {
        data[i] = (short) rand();
    }
 }

 void generate_table(char* table) {
    for(unsigned i = 0; i < 256; ++i)
        table[i] = __builtin_popcount(i);
 }

 int main(int argc, char* argv[]) {
    short* data = malloc( ELEMENT_COUNT * sizeof(short));
    uint64_t sum = 0;
    int seed = 0; /* so we can reproduce the output */
    srand(seed);

    char* table = malloc( 256 * sizeof(char));

    generate_table(table);
    generate_data(data, seed);

    for(uint64_t i = 0; i < ELEMENT_COUNT * (sizeof(short) / sizeof(char)); ++i) {
        sum += table[((char*) data)[i]]; /* walk byte at a time */
    }

    printf("Set bit count was: %llu\n", sum);

    free(data);
    return 0;
 }
diff --git a/lookup.s b/lookup.s
 	.section	__TEXT,__text,regular,pure_instructions
 	.globl	_generate_data
 	.align	4, 0x90
 _generate_data:                         ## @generate_data
 	.cfi_startproc
 ## BB#0:
 	pushq	%rbp
 Ltmp3:
 	.cfi_def_cfa_offset 16
 Ltmp4:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
 Ltmp5:
 	.cfi_def_cfa_register %rbp
 	pushq	%r14
 	pushq	%rbx
 Ltmp6:
 	.cfi_offset %rbx, -32
 Ltmp7:
 	.cfi_offset %r14, -24
 	movq	%rdi, %r14
 	movl	%esi, %edi
 	callq	_srand
 	xorl	%ebx, %ebx
 	.align	4, 0x90
 LBB0_1:                                 ## =>This Inner Loop Header: Depth=1
 	callq	_rand
 	movw	%ax, (%r14,%rbx,2)
 	incq	%rbx
 	cmpq	$1000000000, %rbx       ## imm = 0x3B9ACA00
 	jne	LBB0_1
 ## BB#2:
 	popq	%rbx
 	popq	%r14
 	popq	%rbp
 	ret
 	.cfi_endproc

 	.globl	_generate_table
 	.align	4, 0x90
 _generate_table:                        ## @generate_table
 	.cfi_startproc
 ## BB#0:
 	pushq	%rbp
 Ltmp10:
 	.cfi_def_cfa_offset 16
 Ltmp11:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
 Ltmp12:
 	.cfi_def_cfa_register %rbp
 	xorl	%eax, %eax
 	.align	4, 0x90
 LBB1_1:                                 ## =>This Inner Loop Header: Depth=1
 	movl	%eax, %ecx
 	shrl	%ecx
 	andl	$1431655765, %ecx       ## imm = 0x55555555
 	movl	%eax, %edx
 	subl	%ecx, %edx
 	movl	%edx, %ecx
 	andl	$858993459, %ecx        ## imm = 0x33333333
 	shrl	$2, %edx
 	andl	$858993459, %edx        ## imm = 0x33333333
 	addl	%ecx, %edx
 	movl	%edx, %ecx
 	shrl	$4, %ecx
 	addl	%edx, %ecx
 	andl	$252645135, %ecx        ## imm = 0xF0F0F0F
 	imull	$16843009, %ecx, %ecx   ## imm = 0x1010101
 	shrl	$24, %ecx
 	movb	%cl, (%rdi,%rax)
 	incq	%rax
 	cmpq	$255, %rax
 	jne	LBB1_1
 ## BB#2:
 	popq	%rbp
 	ret
 	.cfi_endproc

 	.globl	_main
 	.align	4, 0x90
 _main:                                  ## @main
 	.cfi_startproc
 ## BB#0:
 	pushq	%rbp
 Ltmp16:
 	.cfi_def_cfa_offset 16
 Ltmp17:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
 Ltmp18:
 	.cfi_def_cfa_register %rbp
 	pushq	%r15
 	pushq	%r14
 	pushq	%r13
 	pushq	%r12
 	pushq	%rbx
 	pushq	%rax
 Ltmp19:
 	.cfi_offset %rbx, -56
 Ltmp20:
 	.cfi_offset %r12, -48
 Ltmp21:
 	.cfi_offset %r13, -40
 Ltmp22:
 	.cfi_offset %r14, -32
 Ltmp23:
 	.cfi_offset %r15, -24
 	movl	$2000000000, %edi       ## imm = 0x77359400
 	callq	_malloc
 	movq	%rax, %r14
 	xorl	%ebx, %ebx
 	xorl	%edi, %edi
 	callq	_srand
 	movl	$255, %edi
 	callq	_malloc
 	movq	%rax, %r15
 	.align	4, 0x90
 LBB2_1:                                 ## =>This Inner Loop Header: Depth=1
 	movl	%ebx, %eax
 	shrl	%eax
 	andl	$1431655765, %eax       ## imm = 0x55555555
 	movl	%ebx, %ecx
 	subl	%eax, %ecx
 	movl	%ecx, %eax
 	andl	$858993459, %eax        ## imm = 0x33333333
 	shrl	$2, %ecx
 	andl	$858993459, %ecx        ## imm = 0x33333333
 	addl	%eax, %ecx
 	movl	%ecx, %eax
 	shrl	$4, %eax
 	addl	%ecx, %eax
 	andl	$252645135, %eax        ## imm = 0xF0F0F0F
 	imull	$16843009, %eax, %eax   ## imm = 0x1010101
 	shrl	$24, %eax
 	movb	%al, (%r15,%rbx)
 	incq	%rbx
 	cmpq	$255, %rbx
 	jne	LBB2_1
 ## BB#2:                                ## %generate_table.exit
 	xorl	%r13d, %r13d
 	xorl	%edi, %edi
 	callq	_srand
 	movl	$1, %r12d
 	xorl	%ebx, %ebx
 	.align	4, 0x90
 LBB2_3:                                 ## =>This Inner Loop Header: Depth=1
 	callq	_rand
 	movw	%ax, (%r14,%rbx,2)
 	incq	%rbx
 	cmpq	$1000000000, %rbx       ## imm = 0x3B9ACA00
 	jne	LBB2_3
 ## BB#4:
 	xorl	%esi, %esi
 	.align	4, 0x90
 LBB2_5:                                 ## %vector.body
                                        ## =>This Inner Loop Header: Depth=1
 	movq	%rsi, %rax
 	movq	%r13, %rcx
 	movsbq	-1(%r14,%r12), %rdx
 	movsbq	(%r14,%r12), %rsi
 	movsbq	(%r15,%rdx), %r13
 	movsbq	(%r15,%rsi), %rsi
 	addq	%rcx, %r13
 	addq	%rax, %rsi
 	addq	$2, %r12
 	cmpq	$2000000001, %r12       ## imm = 0x77359401
 	jne	LBB2_5
 ## BB#6:                                ## %middle.block
 	addq	%r13, %rsi
 	leaq	L_.str(%rip), %rdi
 	xorl	%eax, %eax
 	callq	_printf
 	movq	%r14, %rdi
 	callq	_free
 	xorl	%eax, %eax
 	addq	$8, %rsp
 	popq	%rbx
 	popq	%r12
 	popq	%r13
 	popq	%r14
 	popq	%r15
 	popq	%rbp
 	ret
 	.cfi_endproc

 	.section	__TEXT,__cstring,cstring_literals
 L_.str:                                 ## @.str
 	.asciz	"Set bit count was: %llu\n"


 .subsections_via_symbols
diff --git a/main.c b/main.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>

 #define ELEMENT_COUNT 10000000

 int main(int argc, char* argv[]) {
    short* data = malloc( ELEMENT_COUNT * sizeof(short));
    uint64_t sum = 0;
    int seed = 0; /* so we can reproduce the output */
    srand(seed);

    for(uint64_t i = 0; i < ELEMENT_COUNT; ++i) {
        data[i] = (short) rand();
    }

    for(uint64_t i = 0; i < ELEMENT_COUNT; ++i) {
        sum += __builtin_popcount(data[i]);
    }

    printf("Set bit count was: %llu\n", sum);

    free(data);
    return 0;
 }
diff --git a/main.s.masks b/main.s.masks
 	.section	__TEXT,__text,regular,pure_instructions
 	.globl	_generate_data
 	.align	4, 0x90
 _generate_data:                         ## @generate_data
 	.cfi_startproc
 ## BB#0:
 	pushq	%rbp
 Ltmp2:
 	.cfi_def_cfa_offset 16
 Ltmp3:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
 Ltmp4:
 	.cfi_def_cfa_register %rbp
 	popq	%rbp
 	ret
 	.cfi_endproc

 	.globl	_main
 	.align	4, 0x90
 _main:                                  ## @main
 	.cfi_startproc
 ## BB#0:
 	pushq	%rbp
 Ltmp8:
 	.cfi_def_cfa_offset 16
 Ltmp9:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
 Ltmp10:
 	.cfi_def_cfa_register %rbp
 	pushq	%r15
 	pushq	%r14
 	pushq	%rbx
 	pushq	%rax
 Ltmp11:
 	.cfi_offset %rbx, -40
 Ltmp12:
 	.cfi_offset %r14, -32
 Ltmp13:
 	.cfi_offset %r15, -24
 	movl	$20000000, %edi         ## imm = 0x1312D00
 	callq	_malloc
 	movq	%rax, %r14
 	xorl	%r15d, %r15d
 	xorl	%edi, %edi
 	callq	_srand
 	xorl	%ebx, %ebx
 	.align	4, 0x90
 LBB1_1:                                 ## =>This Inner Loop Header: Depth=1
 	callq	_rand
 	movw	%ax, (%r14,%rbx,2)
 	incq	%rbx
 	cmpq	$10000000, %rbx         ## imm = 0x989680
 	jne	LBB1_1
 ## BB#2:
 	xorl	%esi, %esi
 	.align	4, 0x90
 LBB1_3:                                 ## %.preheader
                                        ## =>This Inner Loop Header: Depth=1
 	movq	%rsi, %rax
 	movswl	(%r14,%r15,2), %ecx
 	movl	%ecx, %edx
 	shrl	%edx
 	andl	$1431655765, %edx       ## imm = 0x55555555
 	subl	%edx, %ecx
 	movl	%ecx, %edx
 	andl	$858993459, %edx        ## imm = 0x33333333
 	shrl	$2, %ecx
 	andl	$858993459, %ecx        ## imm = 0x33333333
 	addl	%edx, %ecx
 	movl	%ecx, %edx
 	shrl	$4, %edx
 	addl	%ecx, %edx
 	andl	$252645135, %edx        ## imm = 0xF0F0F0F
 	imull	$16843009, %edx, %esi   ## imm = 0x1010101
 	shrl	$24, %esi
 	addq	%rax, %rsi
 	incq	%r15
 	cmpq	$10000000, %r15         ## imm = 0x989680
 	jne	LBB1_3
 ## BB#4:
 	leaq	L_.str(%rip), %rdi
 	xorl	%eax, %eax
 	callq	_printf
 	movq	%r14, %rdi
 	callq	_free
 	xorl	%eax, %eax
 	addq	$8, %rsp
 	popq	%rbx
 	popq	%r14
 	popq	%r15
 	popq	%rbp
 	ret
 	.cfi_endproc

 	.section	__TEXT,__cstring,cstring_literals
 L_.str:                                 ## @.str
 	.asciz	"Set bit count was: %llu\n"


 .subsections_via_symbols
diff --git a/main.s.popcnt b/main.s.popcnt
 	.section	__TEXT,__text,regular,pure_instructions
 	.globl	_generate_data
 	.align	4, 0x90
 _generate_data:                         ## @generate_data
 	.cfi_startproc
 ## BB#0:
 	pushq	%rbp
 Ltmp2:
 	.cfi_def_cfa_offset 16
 Ltmp3:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
 Ltmp4:
 	.cfi_def_cfa_register %rbp
 	popq	%rbp
 	ret
 	.cfi_endproc

 	.globl	_main
 	.align	4, 0x90
 _main:                                  ## @main
 	.cfi_startproc
 ## BB#0:
 	pushq	%rbp
 Ltmp8:
 	.cfi_def_cfa_offset 16
 Ltmp9:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
 Ltmp10:
 	.cfi_def_cfa_register %rbp
 	pushq	%r15
 	pushq	%r14
 	pushq	%rbx
 	pushq	%rax
 Ltmp11:
 	.cfi_offset %rbx, -40
 Ltmp12:
 	.cfi_offset %r14, -32
 Ltmp13:
 	.cfi_offset %r15, -24
 	movl	$20000000, %edi         ## imm = 0x1312D00
 	callq	_malloc
 	movq	%rax, %r14
 	xorl	%r15d, %r15d
 	xorl	%edi, %edi
 	callq	_srand
 	xorl	%ebx, %ebx
 	.align	4, 0x90
 LBB1_1:                                 ## =>This Inner Loop Header: Depth=1
 	callq	_rand
 	movw	%ax, (%r14,%rbx,2)
 	incq	%rbx
 	cmpq	$10000000, %rbx         ## imm = 0x989680
 	jne	LBB1_1
 ## BB#2:
 	xorl	%esi, %esi
 	.align	4, 0x90
 LBB1_3:                                 ## %.preheader
                                        ## =>This Inner Loop Header: Depth=1
 	movq	%rsi, %rax
 	movswl	(%r14,%r15,2), %ecx
 	popcntl	%ecx, %esi
 	addq	%rax, %rsi
 	incq	%r15
 	cmpq	$10000000, %r15         ## imm = 0x989680
 	jne	LBB1_3
 ## BB#4:
 	leaq	L_.str(%rip), %rdi
 	xorl	%eax, %eax
 	callq	_printf
 	movq	%r14, %rdi
 	callq	_free
 	xorl	%eax, %eax
 	addq	$8, %rsp
 	popq	%rbx
 	popq	%r14
 	popq	%r15
 	popq	%rbp
 	ret
 	.cfi_endproc

 	.section	__TEXT,__cstring,cstring_literals
 L_.str:                                 ## @.str
 	.asciz	"Set bit count was: %llu\n"


 .subsections_via_symbols
	$ gcc -Wall -Wextra -pedantic -std=c11 -O3 -S -march=corei7 main.c
	# Gives you the popcnt assembly, GCC knows we have the instruction

	$ gcc -Wall -Wextra -pedantic -std=c11 -O3 -S main.c
	# Gives you the mask assembly
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>

	#define ELEMENT_COUNT 1000000000

	void generate_data(short* data, int seed) {
	srand(seed);

	for(uint64_t i = 0; i < ELEMENT_COUNT; ++i) {
	data[i] = (short) rand();
	}
	}

	void generate_table(char* table) {
	for(unsigned i = 0; i < 256; ++i)
	table[i] = __builtin_popcount(i);
	}

	int main(int argc, char* argv[]) {
	short* data = malloc( ELEMENT_COUNT * sizeof(short));
	uint64_t sum = 0;
	int seed = 0; /* so we can reproduce the output */
	srand(seed);

	char* table = malloc( 256 * sizeof(char));

	generate_table(table);
	generate_data(data, seed);

	for(uint64_t i = 0; i < ELEMENT_COUNT * (sizeof(short) / sizeof(char)); ++i) {
	sum += table[((char) data)[i]]; / walk byte at a time */
	}

	printf("Set bit count was: %llu\n", sum);

	free(data);
	return 0;
	}
	.section __TEXT,__text,regular,pure_instructions
	.globl _generate_data
	.align 4, 0x90
	_generate_data: ## @generate_data
	.cfi_startproc
	## BB#0:
	pushq %rbp
	Ltmp3:
	.cfi_def_cfa_offset 16
	Ltmp4:
	.cfi_offset %rbp, -16
	movq %rsp, %rbp
	Ltmp5:
	.cfi_def_cfa_register %rbp
	pushq %r14
	pushq %rbx
	Ltmp6:
	.cfi_offset %rbx, -32
	Ltmp7:
	.cfi_offset %r14, -24
	movq %rdi, %r14
	movl %esi, %edi
	callq _srand
	xorl %ebx, %ebx
	.align 4, 0x90
	LBB0_1: ## =>This Inner Loop Header: Depth=1
	callq _rand
	movw %ax, (%r14,%rbx,2)
	incq %rbx
	cmpq $1000000000, %rbx ## imm = 0x3B9ACA00
	jne LBB0_1
	## BB#2:
	popq %rbx
	popq %r14
	popq %rbp
	ret
	.cfi_endproc

	.globl _generate_table
	.align 4, 0x90
	_generate_table: ## @generate_table
	.cfi_startproc
	## BB#0:
	pushq %rbp
	Ltmp10:
	.cfi_def_cfa_offset 16
	Ltmp11:
	.cfi_offset %rbp, -16
	movq %rsp, %rbp
	Ltmp12:
	.cfi_def_cfa_register %rbp
	xorl %eax, %eax
	.align 4, 0x90
	LBB1_1: ## =>This Inner Loop Header: Depth=1
	movl %eax, %ecx
	shrl %ecx
	andl $1431655765, %ecx ## imm = 0x55555555
	movl %eax, %edx
	subl %ecx, %edx
	movl %edx, %ecx
	andl $858993459, %ecx ## imm = 0x33333333
	shrl $2, %edx
	andl $858993459, %edx ## imm = 0x33333333
	addl %ecx, %edx
	movl %edx, %ecx
	shrl $4, %ecx
	addl %edx, %ecx
	andl $252645135, %ecx ## imm = 0xF0F0F0F
	imull $16843009, %ecx, %ecx ## imm = 0x1010101
	shrl $24, %ecx
	movb %cl, (%rdi,%rax)
	incq %rax
	cmpq $255, %rax
	jne LBB1_1
	## BB#2:
	popq %rbp
	ret
	.cfi_endproc

	.globl _main
	.align 4, 0x90
	_main: ## @main
	.cfi_startproc
	## BB#0:
	pushq %rbp
	Ltmp16:
	.cfi_def_cfa_offset 16
	Ltmp17:
	.cfi_offset %rbp, -16
	movq %rsp, %rbp
	Ltmp18:
	.cfi_def_cfa_register %rbp
	pushq %r15
	pushq %r14
	pushq %r13
	pushq %r12
	pushq %rbx
	pushq %rax
	Ltmp19:
	.cfi_offset %rbx, -56
	Ltmp20:
	.cfi_offset %r12, -48
	Ltmp21:
	.cfi_offset %r13, -40
	Ltmp22:
	.cfi_offset %r14, -32
	Ltmp23:
	.cfi_offset %r15, -24
	movl $2000000000, %edi ## imm = 0x77359400
	callq _malloc
	movq %rax, %r14
	xorl %ebx, %ebx
	xorl %edi, %edi
	callq _srand
	movl $255, %edi
	callq _malloc
	movq %rax, %r15
	.align 4, 0x90
	LBB2_1: ## =>This Inner Loop Header: Depth=1
	movl %ebx, %eax
	shrl %eax
	andl $1431655765, %eax ## imm = 0x55555555
	movl %ebx, %ecx
	subl %eax, %ecx
	movl %ecx, %eax
	andl $858993459, %eax ## imm = 0x33333333
	shrl $2, %ecx
	andl $858993459, %ecx ## imm = 0x33333333
	addl %eax, %ecx
	movl %ecx, %eax
	shrl $4, %eax
	addl %ecx, %eax
	andl $252645135, %eax ## imm = 0xF0F0F0F
	imull $16843009, %eax, %eax ## imm = 0x1010101
	shrl $24, %eax
	movb %al, (%r15,%rbx)
	incq %rbx
	cmpq $255, %rbx
	jne LBB2_1
	## BB#2: ## %generate_table.exit
	xorl %r13d, %r13d
	xorl %edi, %edi
	callq _srand
	movl $1, %r12d
	xorl %ebx, %ebx
	.align 4, 0x90
	LBB2_3: ## =>This Inner Loop Header: Depth=1
	callq _rand
	movw %ax, (%r14,%rbx,2)
	incq %rbx
	cmpq $1000000000, %rbx ## imm = 0x3B9ACA00
	jne LBB2_3
	## BB#4:
	xorl %esi, %esi
	.align 4, 0x90
	LBB2_5: ## %vector.body
	## =>This Inner Loop Header: Depth=1
	movq %rsi, %rax
	movq %r13, %rcx
	movsbq -1(%r14,%r12), %rdx
	movsbq (%r14,%r12), %rsi
	movsbq (%r15,%rdx), %r13
	movsbq (%r15,%rsi), %rsi
	addq %rcx, %r13
	addq %rax, %rsi
	addq $2, %r12
	cmpq $2000000001, %r12 ## imm = 0x77359401
	jne LBB2_5
	## BB#6: ## %middle.block
	addq %r13, %rsi
	leaq L_.str(%rip), %rdi
	xorl %eax, %eax
	callq _printf
	movq %r14, %rdi
	callq _free
	xorl %eax, %eax
	addq $8, %rsp
	popq %rbx
	popq %r12
	popq %r13
	popq %r14
	popq %r15
	popq %rbp
	ret
	.cfi_endproc

	.section __TEXT,__cstring,cstring_literals
	L_.str: ## @.str
	.asciz "Set bit count was: %llu\n"


	.subsections_via_symbols