Last active
July 7, 2024 01:03
-
-
Save Mischa-Alff/4fdb4e2e545212cfcbda to your computer and use it in GitHub Desktop.
Simple AABB implementation using SIMD instructions for x86-64
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <chrono> | |
#include <ctime> | |
#include <cstdlib> | |
#include <iomanip> | |
#include <iostream> | |
#include <random> | |
struct AABB{int32_t left, top, right, bottom;}; | |
extern "C" bool check_aabb(AABB *a, AABB *b); | |
extern "C" bool check_aabb_conventional(AABB *a, AABB *b); | |
bool check_aabb_c(AABB *a, AABB *b); | |
#include <vector> | |
#include <algorithm> | |
const int aabbAmount = 10000; | |
const int aabbMinPos = 0; | |
const int aabbMaxPos = 1000000; | |
const int aabbMinSize = 1; | |
const int aabbMaxSize = 1000000; | |
void benchmark(); | |
void test(); | |
int main() | |
{ | |
benchmark(); | |
} | |
void benchmark() | |
{ | |
AABB* aabbs = new AABB[aabbAmount]; | |
std::default_random_engine generator; | |
std::uniform_int_distribution<int32_t> positionDistribution(aabbMinPos, aabbMaxPos); | |
std::uniform_int_distribution<int32_t> sizeDistribution(aabbMinSize, aabbMaxSize); | |
for(int32_t i = 0; i < aabbAmount; ++i) | |
{ | |
aabbs[i].left = positionDistribution(generator); | |
aabbs[i].top = positionDistribution(generator); | |
aabbs[i].right = aabbs[i].left + sizeDistribution(generator); | |
aabbs[i].bottom = aabbs[i].top + sizeDistribution(generator); | |
}; | |
std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); | |
for(int32_t i = 0; i < aabbAmount; ++i) | |
{ | |
for(int32_t j = i + 1; j < aabbAmount; ++j) | |
{ | |
bool collision = check_aabb_c(&aabbs[i], &aabbs[j]); | |
collision = check_aabb_c(&aabbs[j], &aabbs[i]); | |
} | |
} | |
auto t = std::chrono::high_resolution_clock::now()-start; | |
std::cout<<std::setw(25)<<"Conventional (C): "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl; | |
start = std::chrono::high_resolution_clock::now(); | |
for(int32_t i = 0; i < aabbAmount; ++i) | |
{ | |
for(int32_t j = i + 1; j < aabbAmount; ++j) | |
{ | |
bool collision = check_aabb_conventional(&aabbs[i], &aabbs[j]); | |
collision = check_aabb_conventional(&aabbs[j], &aabbs[i]); | |
} | |
} | |
t = std::chrono::high_resolution_clock::now()-start; | |
std::cout<<std::setw(25)<<"Conventional (ASM): "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl; | |
start = std::chrono::high_resolution_clock::now(); | |
for(int32_t i = 0; i < aabbAmount; ++i) | |
{ | |
for(int32_t j = i + 1; j < aabbAmount; ++j) | |
{ | |
bool collision = check_aabb(&aabbs[i], &aabbs[j]); | |
collision = check_aabb(&aabbs[j], &aabbs[i]); | |
} | |
} | |
t = std::chrono::high_resolution_clock::now()-start; | |
std::cout<<std::setw(25)<<"SIMD: "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl; | |
std::cout<<"These results are not trustworthy due to loop and other overhead. Just use them to see \"which is faster\" instead of as a real performance metric. Use callgrind for that."<<std::endl; | |
} | |
bool check_aabb_c(AABB *a, AABB *b) | |
{ | |
return !((b->left)>(a->right) || (b->right)<(a->left) || (b->top)>(a->bottom) || (b->bottom)<(a->top)); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; AABB.nasm (c) 2015 Mischa "Aster" Alff | |
; x86-64 Assembly implementations of AABB intersection checks. | |
; The algorithm in use here is: | |
; !(B.left>A.right || B.right<A.left || B.top>A.bottom || B.bottom<A.top) | |
section .data | |
section .bss | |
aabb: | |
.left: resd 1 | |
.top: resd 1 | |
.right: resd 1 | |
.bottom: resd 1 | |
section .text | |
global check_aabb | |
global check_aabb_conventional | |
; check_aabb: calculates intersection between two AABBs using SSE | |
; Notes: This routine runs 2.66 times faster than `check_aabb_conventional` | |
; using the yasm assembler on x86-64 (Intel Haswell i7-4770) | |
; Takes two arguments: *AABB and *AABB, in rdi and rsi respectively. | |
check_aabb: | |
; load the first AABB from memory into xmm0 as four 32-bit signed ints | |
movdqa xmm0, [rdi] | |
; load the second AABB from memory into xmm1 as four 32-bit signed ints | |
movdqa xmm1, [rsi] | |
; shuffle the second AABB so that top/bottom and left/right are inverted | |
shufps xmm1, xmm1, 01001110b | |
; compare each signed 32-bit integer in xmm1 to the ones in xmm0 and | |
; store the result in xmm1 as a bit mask | |
; (0xFFFFFFFF for true, 0x0 for false) | |
pcmpgtd xmm1, xmm0 | |
; obtain that bitmask in a two-byte register (we use rax here because it | |
; zeroes out all the other bits) | |
pmovmskb rax, xmm1 | |
; invert the greater-than for two members to make them | |
; less-than-or-equal-to (yes, this is not as intended, but since this | |
; is a PoC and will probably be adapted for floating point, nobody | |
; cares whether it's > or >= for now.) | |
xor eax, 0x00FF | |
; Convert our bitmask to a boolean, and not it: | |
; If the result of the xor is zero, the zero flag is 1, and thus al | |
; is 1 (and thus rax is 1) | |
; If not, the ZF is 0, and thus al is 0, and rax is 0. | |
setz al | |
; Still not sure if this is required, but better be safe than sorry, | |
; I guess. (Yes, C++ treats >0 as true, but I want a true logical 0/1) | |
and eax, 0x1 | |
ret | |
; check_aabb_conventional : calculates intersection between two AABBs using | |
; conventional methods | |
; Notes: This is a naïve port of the C version mentioned above | |
; Takes two arguments: *AABB and *AABB, in rdi and rsi respectively. | |
check_aabb_conventional: | |
; set ecx to 0, as we'll be using it for our OR accumulator | |
xor ecx, ecx | |
; load B.left into eax, A.right into ebx, and compare for greater-than | |
mov eax, dword [rdi + (aabb.left - aabb)] | |
mov ebx, dword [rsi + (aabb.right - aabb)] | |
cmp eax, ebx | |
; place the result of the comparison in cl | |
setg cl | |
; ... repeat | |
mov eax, dword [rdi + (aabb.right - aabb)] | |
mov ebx, dword [rsi + (aabb.left - aabb)] | |
cmp eax, ebx | |
; place the result of the comparison in ch | |
setle ch | |
; or cl and ch, repeat | |
or cl, ch | |
mov eax, dword [rdi + (aabb.top - aabb)] | |
mov ebx, dword [rsi + (aabb.bottom - aabb)] | |
cmp eax, ebx | |
setg ch | |
or cl, ch | |
mov eax, dword [rdi + (aabb.bottom - aabb)] | |
mov ebx, dword [rsi + (aabb.top - aabb)] | |
cmp eax, ebx | |
setle ch | |
or cl, ch | |
; convert to logical bool and NOT. | |
not ecx | |
and ecx, 1 | |
mov eax, ecx | |
ret |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
yasm -f elf64 AABB.nasm -o AABB.o -l AABB.lst | |
g++ AABB.cpp -c -o AABB.cpp.o -std=c++11 | |
g++ AABB.cpp.o AABB.o -o AABB |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment