Skip to content

Instantly share code, notes, and snippets.

@daramkun
Created May 16, 2018 06:45
Show Gist options
  • Save daramkun/7e87e9b789a160df7705d9b6b5efe4e8 to your computer and use it in GitHub Desktop.
Save daramkun/7e87e9b789a160df7705d9b6b5efe4e8 to your computer and use it in GitHub Desktop.
#include <Windows.h>
#include <immintrin.h>
#include <mmintrin.h>
#include <xmmintrin.h>

#include <cstdio>
#include <cstring>
#include <functional>
#include <memory>
#include <utility>
// Owns a heap-allocated byte buffer of fixed size.
//
// The buffer contents are left uninitialized after construction. The class is
// move-only: copying is disabled because two objects holding the same pointer
// would both release it. A moved-from object reports a null buffer and size 0.
// Bytes are exposed as unsigned char, which is the type Windows' BYTE aliases.
class HeapMemory
{
public:
    // Allocates dataSize uninitialized bytes. Plain new[] is used on purpose:
    // std::make_unique would zero-fill the buffer.
    HeapMemory ( size_t dataSize )
        : _data ( new unsigned char [ dataSize ] )
        , _dataSize ( dataSize )
    { }
    // Allocates stride * height uninitialized bytes.
    // NOTE(review): the product is not checked for overflow.
    HeapMemory ( size_t stride, size_t height )
        : HeapMemory ( stride * height )
    { }
    ~HeapMemory () = default;

    HeapMemory ( const HeapMemory & ) = delete;
    HeapMemory & operator= ( const HeapMemory & ) = delete;

    // Takes over the buffer of other and leaves other empty.
    HeapMemory ( HeapMemory && other ) noexcept
        : _data ( std::move ( other._data ) )
        , _dataSize ( other._dataSize )
    {
        other._dataSize = 0;
    }
    // Releases the current buffer, then takes over the buffer of other.
    HeapMemory & operator= ( HeapMemory && other ) noexcept
    {
        if ( this != &other )
        {
            _data = std::move ( other._data );
            _dataSize = other._dataSize;
            other._dataSize = 0;
        }
        return *this;
    }

public:
    // Pointer to the first byte of the buffer (null after being moved from).
    inline unsigned char * getData () const noexcept { return _data.get (); }
    // Size of the buffer in bytes.
    inline size_t getDataSize () const noexcept { return _dataSize; }

private:
    std::unique_ptr<unsigned char []> _data;
    size_t _dataSize;
};
inline double getTime () noexcept
{
LARGE_INTEGER performanceFrequency, getTime;
QueryPerformanceFrequency ( &performanceFrequency );
QueryPerformanceCounter ( &getTime );
return ( getTime.QuadPart / ( double ) performanceFrequency.QuadPart );
}
// Type-erased callable with the shape ( dst, src, size ) -> void*, used to pass a copy routine to Test.
// NOTE(review): the second parameter is a non-const void* here, while the SSE/AVX2 routines in this file take const void*.
using MemoryCopy = std::function<void* ( void *, void *, size_t )>;
// Alias so the library memcpy is named like the other copy routines passed to Test.
#define SystemMemoryCopy memcpy
// Copies size bytes from src to dst using 128-bit SSE2 loads and stores,
// 64 bytes per loop iteration; any remainder below 64 bytes is handed to memcpy.
//
// dst and src may have any alignment: the unaligned load/store intrinsics are
// used, because the aligned forms fault on addresses that are not multiples of
// 16 and a memcpy-style interface cannot promise that.
// The regions must not overlap. Returns dst.
inline void * SSEMemoryCopy ( void * dst, const void * src, size_t size )
{
    const size_t step = 64;
    unsigned char * bdst = static_cast< unsigned char * >( dst );
    const unsigned char * bsrc = static_cast< const unsigned char * >( src );

    // >= so that a block of exactly 64 bytes still takes the vector path.
    while ( size >= step )
    {
        const __m128i A = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 0 ) );
        const __m128i B = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 16 ) );
        const __m128i C = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 32 ) );
        const __m128i D = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 48 ) );

        _mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 0 ), A );
        _mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 16 ), B );
        _mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 32 ), C );
        _mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 48 ), D );

        bsrc += step;
        bdst += step;
        size -= step;
    }

    // Tail of fewer than 64 bytes; skipped entirely when nothing is left.
    if ( size != 0 )
        memcpy ( bdst, bsrc, size );

    // Full load/store fence before returning.
    _mm_mfence ();
    return dst;
}
// Copies size bytes from src to dst using 256-bit loads and stores, 64 bytes
// per loop iteration; any remainder below 64 bytes is handed to memcpy.
//
// dst and src may have any alignment: the unaligned load/store intrinsics are
// used, because the aligned forms require 32-byte aligned addresses, which
// operator new[] does not guarantee.
// The regions must not overlap. Returns dst.
// NOTE(review): no runtime CPU feature check is done; the caller must only use
// this on a CPU that supports 256-bit AVX loads/stores.
#if ( defined ( __GNUC__ ) || defined ( __clang__ ) ) && !defined ( __AVX__ )
// Lets GCC/Clang compile this one function with AVX enabled even when the
// translation unit is built without -mavx.
__attribute__ (( target ( "avx" ) ))
#endif
inline void * AVX2MemoryCopy ( void * dst, const void * src, size_t size )
{
    const size_t step = 64;
    unsigned char * bdst = static_cast< unsigned char * >( dst );
    const unsigned char * bsrc = static_cast< const unsigned char * >( src );

    // >= so that a block of exactly 64 bytes still takes the vector path.
    while ( size >= step )
    {
        const __m256i A = _mm256_loadu_si256 ( reinterpret_cast< const __m256i * >( bsrc + 0 ) );
        const __m256i B = _mm256_loadu_si256 ( reinterpret_cast< const __m256i * >( bsrc + 32 ) );

        _mm256_storeu_si256 ( reinterpret_cast< __m256i * >( bdst + 0 ), A );
        _mm256_storeu_si256 ( reinterpret_cast< __m256i * >( bdst + 32 ), B );

        bsrc += step;
        bdst += step;
        size -= step;
    }

    // Tail of fewer than 64 bytes; skipped entirely when nothing is left.
    if ( size != 0 )
        memcpy ( bdst, bsrc, size );

    // Full load/store fence before returning.
    _mm_mfence ();
    return dst;
}
// Benchmarks one copy routine and prints the total and per-copy elapsed time.
//
//   memcpy    copy routine under test (the parameter shadows ::memcpy inside this function)
//   funcName  label printed in the header line
//   width     bytes per row of the simulated frame
//   height    number of rows of the simulated frame
//   fps       copies performed per outer iteration
//   count     number of outer iterations
//
// Source and destination frames are allocated and written once before the
// clock starts, so the timed region contains only the fps * count copies and
// not heap allocation or first-touch page faults.
inline void Test ( MemoryCopy memcpy, const char * funcName, int width, int height, int fps, int count ) noexcept
{
    printf ( "==== %s Test (%dx%d@%dfpsx%d) ====\n", funcName, width, height, fps, count );

    // Local alias so the call below does not read like a call to ::memcpy.
    const MemoryCopy & copy = memcpy;

    HeapMemory src ( width, height );
    HeapMemory dst ( width, height );
    // Give the source defined contents and touch every page of both buffers up front.
    memset ( src.getData (), 0, src.getDataSize () );
    memset ( dst.getData (), 0, dst.getDataSize () );

    const double startTime = getTime ();
    for ( int c = 0; c < count; ++c )
    {
        for ( int i = 0; i < fps; ++i )
        {
            copy ( dst.getData (), src.getData (), dst.getDataSize () );
        }
    }
    const double endTime = getTime ();

    const double elapsed = endTime - startTime;
    // Multiply in double so that large fps * count cannot overflow int.
    const double copies = static_cast< double > ( fps ) * static_cast< double > ( count );
    printf ( "Elapsed time> %lf\n", elapsed );
    printf ( "Average time> %lf\n", copies > 0.0 ? elapsed / copies : 0.0 );
}
int main ( void )
{
SetPriorityClass ( GetCurrentProcess (), HIGH_PRIORITY_CLASS );
SetProcessAffinityMask ( GetCurrentProcess (), 0 );
SetThreadPriority ( GetCurrentThread (), THREAD_PRIORITY_HIGHEST );
Test ( SystemMemoryCopy, "memcpy ()", 1920, 1080, 60, 10 );
Test ( SSEMemoryCopy, "SSEmemcpy ()", 1920, 1080, 60, 10 );
Test ( AVX2MemoryCopy, "AVX2memcpy ()", 1920, 1080, 60, 10 );
return 0;
}
@daramkun
Copy link
Author

image
Result: the standard memcpy is the fastest of the three.
Tested on an AMD Ryzen 5 2600X with 2×16 GB ESSENCORE KLEVV DDR4 (PC4-19200) memory.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment