Mischa-Alff · July 7, 2024 01:03
diff --git a/AABB.cpp b/AABB.cpp
 #include <chrono>
 #include <ctime>
 #include <cstdlib>
 #include <iomanip>
 #include <iostream>
 #include <random>

 struct AABB{int32_t left, top, right, bottom;};
 extern "C" bool check_aabb(AABB *a, AABB *b);
 extern "C" bool check_aabb_conventional(AABB *a, AABB *b);
 bool check_aabb_c(AABB *a, AABB *b);


 #include <vector>
 #include <algorithm>


 const int aabbAmount  = 10000;
 const int aabbMinPos  = 0;
 const int aabbMaxPos  = 1000000;
 const int aabbMinSize = 1;
 const int aabbMaxSize = 1000000;

 void benchmark();
 void test();

 int main()
 {
    benchmark();
 }


 void benchmark()
 {
    AABB* aabbs = new AABB[aabbAmount];

    std::default_random_engine generator;
    std::uniform_int_distribution<int32_t> positionDistribution(aabbMinPos, aabbMaxPos);
    std::uniform_int_distribution<int32_t> sizeDistribution(aabbMinSize, aabbMaxSize);

    for(int32_t i = 0; i < aabbAmount; ++i)
    {   
        aabbs[i].left = positionDistribution(generator);
        aabbs[i].top = positionDistribution(generator);
        aabbs[i].right = aabbs[i].left + sizeDistribution(generator);
        aabbs[i].bottom = aabbs[i].top + sizeDistribution(generator);
    }; 

    std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();

    for(int32_t i = 0; i < aabbAmount; ++i)
    {   
        for(int32_t j = i + 1; j < aabbAmount; ++j)
        {   
            bool collision = check_aabb_c(&aabbs[i], &aabbs[j]);
            collision = check_aabb_c(&aabbs[j], &aabbs[i]);
        }   
    }

    auto t = std::chrono::high_resolution_clock::now()-start;

    std::cout<<std::setw(25)<<"Conventional (C): "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl;

    start = std::chrono::high_resolution_clock::now();

    for(int32_t i = 0; i < aabbAmount; ++i)
    {   
        for(int32_t j = i + 1; j < aabbAmount; ++j)
        {   
            bool collision = check_aabb_conventional(&aabbs[i], &aabbs[j]);
            collision = check_aabb_conventional(&aabbs[j], &aabbs[i]);
        }   
    }

    t = std::chrono::high_resolution_clock::now()-start;

    std::cout<<std::setw(25)<<"Conventional (ASM): "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl;

    start = std::chrono::high_resolution_clock::now();

    for(int32_t i = 0; i < aabbAmount; ++i)
    {   
        for(int32_t j = i + 1; j < aabbAmount; ++j)
        {   
            bool collision = check_aabb(&aabbs[i], &aabbs[j]);
            collision = check_aabb(&aabbs[j], &aabbs[i]);
        }
    }

    t = std::chrono::high_resolution_clock::now()-start;

    std::cout<<std::setw(25)<<"SIMD: "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl;
    std::cout<<"These results are not trustworthy due to loop and other overhead. Just use them to see \"which is faster\" instead of as a real performance metric. Use callgrind for that."<<std::endl;
 }

 bool check_aabb_c(AABB *a, AABB *b)
 {
    return !((b->left)>(a->right) || (b->right)<(a->left) || (b->top)>(a->bottom) || (b->bottom)<(a->top));
 }
diff --git a/AABB.nasm b/AABB.nasm
 ; AABB.nasm (c) 2015 Mischa "Aster" Alff
 ; x86-64 Assembly implementations of AABB intersection checks.
 ; The algorithm in use here is:
 ;   !(B.left>A.right || B.right<A.left || B.top>A.bottom || B.bottom<A.top)


 section .data
 section .bss
 	aabb:
 		.left: resd 1
 		.top: resd 1
 		.right: resd 1
 		.bottom: resd 1

 section .text
 	global check_aabb
 	global check_aabb_conventional

 	; check_aabb: calculates intersection between two AABBs using SSE
 	; Notes: This routine runs 2.66 times faster than `check_aabb_conventional`
 	;   using the yasm assembler on x86-64 (Intel Haswell i7-4770)
 	; Takes two arguments: *AABB and *AABB, in rdi and rsi respectively.
 	check_aabb:
 		; load the first AABB from memory into xmm0 as four 32-bit signed ints
 		movdqa xmm0, [rdi]
 		; load the second AABB from memory into xmm1 as four 32-bit signed ints
 		movdqa xmm1, [rsi]
 		; shuffle the second AABB so that top/bottom and left/right are inverted
 		shufps xmm1, xmm1, 01001110b
 		; compare each signed 32-bit integer in xmm1 to the ones in xmm0 and
 		;   store the result in xmm1 as a bit mask
 		;   (0xFFFFFFFF for true, 0x0 for false)
 		pcmpgtd xmm1, xmm0
 		; obtain that bitmask in a two-byte register (we use rax here because it
 		;   zeroes out all the other bits)
 		pmovmskb rax, xmm1
 		; invert the greater-than for two members to make them
 		;   less-than-or-equal-to (yes, this is not as intended, but since this 
 		;   is a PoC and will probably be adapted for floating point, nobody
 		;   cares whether it's > or >= for now.)
 		xor eax, 0x00FF
 		; Convert our bitmask to a boolean, and not it:
 		;   If the result of the xor is zero, the zero flag is 1, and thus al
 		;     is 1 (and thus rax is 1)
 		;   If not, the ZF is 0, and thus al is 0, and rax is 0.
 		setz al
 		; Still not sure if this is required, but better be safe than sorry,
 		;   I guess. (Yes, C++ treats >0 as true, but I want a true logical 0/1)
 		and eax, 0x1
 		ret

 	; check_aabb_conventional : calculates intersection between two AABBs using
 	;   conventional methods
 	; Notes: This is a naïve port of the C version mentioned above
 	; Takes two arguments: *AABB and *AABB, in rdi and rsi respectively.
 	check_aabb_conventional:
 		; set ecx to 0, as we'll be using it for our OR accumulator
 		xor ecx, ecx
 		; load B.left into eax, A.right into ebx, and compare for greater-than
 		mov eax, dword [rdi + (aabb.left - aabb)]
 		mov ebx, dword [rsi + (aabb.right - aabb)]
 		cmp eax, ebx
 		; place the result of the comparison in cl
 		setg cl
 		; ... repeat
 		mov eax, dword [rdi + (aabb.right - aabb)]
 		mov ebx, dword [rsi + (aabb.left - aabb)]
 		cmp eax, ebx
 		; place the result of the comparison in ch
 		setle ch
 		; or cl and ch, repeat
 		or cl, ch
 		mov eax, dword [rdi + (aabb.top - aabb)]
 		mov ebx, dword [rsi + (aabb.bottom - aabb)]
 		cmp eax, ebx
 		setg ch
 		or cl, ch
 		mov eax, dword [rdi + (aabb.bottom - aabb)]
 		mov ebx, dword [rsi + (aabb.top - aabb)]
 		cmp eax, ebx
 		setle ch
 		or cl, ch
 		; convert to logical bool and NOT.
 		not ecx
 		and ecx, 1
 		mov eax, ecx
 		ret
diff --git a/build.sh b/build.sh
 yasm -f elf64 AABB.nasm -o AABB.o -l AABB.lst
 g++ AABB.cpp -c -o AABB.cpp.o -std=c++11
 g++ AABB.cpp.o AABB.o -o AABB
	#include <chrono>
	#include <ctime>
	#include <cstdlib>
	#include <iomanip>
	#include <iostream>
	#include <random>

	struct AABB{int32_t left, top, right, bottom;};
	extern "C" bool check_aabb(AABB a, AABB b);
	extern "C" bool check_aabb_conventional(AABB a, AABB b);
	bool check_aabb_c(AABB a, AABB b);


	#include <vector>
	#include <algorithm>


	const int aabbAmount = 10000;
	const int aabbMinPos = 0;
	const int aabbMaxPos = 1000000;
	const int aabbMinSize = 1;
	const int aabbMaxSize = 1000000;

	void benchmark();
	void test();

	int main()
	{
	benchmark();
	}


	void benchmark()
	{
	AABB* aabbs = new AABB[aabbAmount];

	std::default_random_engine generator;
	std::uniform_int_distribution<int32_t> positionDistribution(aabbMinPos, aabbMaxPos);
	std::uniform_int_distribution<int32_t> sizeDistribution(aabbMinSize, aabbMaxSize);

	for(int32_t i = 0; i < aabbAmount; ++i)
	{
	aabbs[i].left = positionDistribution(generator);
	aabbs[i].top = positionDistribution(generator);
	aabbs[i].right = aabbs[i].left + sizeDistribution(generator);
	aabbs[i].bottom = aabbs[i].top + sizeDistribution(generator);
	};

	std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();

	for(int32_t i = 0; i < aabbAmount; ++i)
	{
	for(int32_t j = i + 1; j < aabbAmount; ++j)
	{
	bool collision = check_aabb_c(&aabbs[i], &aabbs[j]);
	collision = check_aabb_c(&aabbs[j], &aabbs[i]);
	}
	}

	auto t = std::chrono::high_resolution_clock::now()-start;

	std::cout<<std::setw(25)<<"Conventional (C): "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl;

	start = std::chrono::high_resolution_clock::now();

	for(int32_t i = 0; i < aabbAmount; ++i)
	{
	for(int32_t j = i + 1; j < aabbAmount; ++j)
	{
	bool collision = check_aabb_conventional(&aabbs[i], &aabbs[j]);
	collision = check_aabb_conventional(&aabbs[j], &aabbs[i]);
	}
	}

	t = std::chrono::high_resolution_clock::now()-start;

	std::cout<<std::setw(25)<<"Conventional (ASM): "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl;

	start = std::chrono::high_resolution_clock::now();

	for(int32_t i = 0; i < aabbAmount; ++i)
	{
	for(int32_t j = i + 1; j < aabbAmount; ++j)
	{
	bool collision = check_aabb(&aabbs[i], &aabbs[j]);
	collision = check_aabb(&aabbs[j], &aabbs[i]);
	}
	}

	t = std::chrono::high_resolution_clock::now()-start;

	std::cout<<std::setw(25)<<"SIMD: "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl;
	std::cout<<"These results are not trustworthy due to loop and other overhead. Just use them to see \"which is faster\" instead of as a real performance metric. Use callgrind for that."<<std::endl;
	}

	bool check_aabb_c(AABB a, AABB b)
	{
	return !((b->left)>(a->right) \|\| (b->right)<(a->left) \|\| (b->top)>(a->bottom) \|\| (b->bottom)<(a->top));
	}
	; AABB.nasm (c) 2015 Mischa "Aster" Alff
	; x86-64 Assembly implementations of AABB intersection checks.
	; The algorithm in use here is:
	; !(B.left>A.right \|\| B.right<A.left \|\| B.top>A.bottom \|\| B.bottom<A.top)


	section .data
	section .bss
	aabb:
	.left: resd 1
	.top: resd 1
	.right: resd 1
	.bottom: resd 1

	section .text
	global check_aabb
	global check_aabb_conventional

	; check_aabb: calculates intersection between two AABBs using SSE
	; Notes: This routine runs 2.66 times faster than `check_aabb_conventional`
	; using the yasm assembler on x86-64 (Intel Haswell i7-4770)
	; Takes two arguments: AABB and AABB, in rdi and rsi respectively.
	check_aabb:
	; load the first AABB from memory into xmm0 as four 32-bit signed ints
	movdqa xmm0, [rdi]
	; load the second AABB from memory into xmm1 as four 32-bit signed ints
	movdqa xmm1, [rsi]
	; shuffle the second AABB so that top/bottom and left/right are inverted
	shufps xmm1, xmm1, 01001110b
	; compare each signed 32-bit integer in xmm1 to the ones in xmm0 and
	; store the result in xmm1 as a bit mask
	; (0xFFFFFFFF for true, 0x0 for false)
	pcmpgtd xmm1, xmm0
	; obtain that bitmask in a two-byte register (we use rax here because it
	; zeroes out all the other bits)
	pmovmskb rax, xmm1
	; invert the greater-than for two members to make them
	; less-than-or-equal-to (yes, this is not as intended, but since this
	; is a PoC and will probably be adapted for floating point, nobody
	; cares whether it's > or >= for now.)
	xor eax, 0x00FF
	; Convert our bitmask to a boolean, and not it:
	; If the result of the xor is zero, the zero flag is 1, and thus al
	; is 1 (and thus rax is 1)
	; If not, the ZF is 0, and thus al is 0, and rax is 0.
	setz al
	; Still not sure if this is required, but better be safe than sorry,
	; I guess. (Yes, C++ treats >0 as true, but I want a true logical 0/1)
	and eax, 0x1
	ret

	; check_aabb_conventional : calculates intersection between two AABBs using
	; conventional methods
	; Notes: This is a naïve port of the C version mentioned above
	; Takes two arguments: AABB and AABB, in rdi and rsi respectively.
	check_aabb_conventional:
	; set ecx to 0, as we'll be using it for our OR accumulator
	xor ecx, ecx
	; load B.left into eax, A.right into ebx, and compare for greater-than
	mov eax, dword [rdi + (aabb.left - aabb)]
	mov ebx, dword [rsi + (aabb.right - aabb)]
	cmp eax, ebx
	; place the result of the comparison in cl
	setg cl
	; ... repeat
	mov eax, dword [rdi + (aabb.right - aabb)]
	mov ebx, dword [rsi + (aabb.left - aabb)]
	cmp eax, ebx
	; place the result of the comparison in ch
	setle ch
	; or cl and ch, repeat
	or cl, ch
	mov eax, dword [rdi + (aabb.top - aabb)]
	mov ebx, dword [rsi + (aabb.bottom - aabb)]
	cmp eax, ebx
	setg ch
	or cl, ch
	mov eax, dword [rdi + (aabb.bottom - aabb)]
	mov ebx, dword [rsi + (aabb.top - aabb)]
	cmp eax, ebx
	setle ch
	or cl, ch
	; convert to logical bool and NOT.
	not ecx
	and ecx, 1
	mov eax, ecx
	ret
	yasm -f elf64 AABB.nasm -o AABB.o -l AABB.lst
	g++ AABB.cpp -c -o AABB.cpp.o -std=c++11
	g++ AABB.cpp.o AABB.o -o AABB