daramkun · May 16, 2018 06:45 · daramkun · May 16, 2018
diff --git a/memcpy_performance_test.cpp b/memcpy_performance_test.cpp
 #include <Windows.h>
 #include <memory>
 #include <mmintrin.h>
 #include <immintrin.h>
 #include <xmmintrin.h>
 #include <functional>
 #include <cstdio>

 class HeapMemory
 {
 public:
 	HeapMemory ( size_t dataSize )
 		: _dataSize ( dataSize )
 	{
 		_data = new BYTE [ dataSize ];
 	}
 	HeapMemory ( size_t stride, size_t height )
 		: HeapMemory ( stride * height )
 	{ }
 	~HeapMemory ()
 	{
 		if ( _data )
 			delete [] _data;
 	}

 public:
 	inline BYTE * getData () const noexcept { return _data; }
 	inline size_t getDataSize () const noexcept { return _dataSize; }

 private:
 	BYTE * _data;
 	size_t _dataSize;
 };

 inline double getTime () noexcept
 {
 	LARGE_INTEGER performanceFrequency, getTime;
 	QueryPerformanceFrequency ( &performanceFrequency );
 	QueryPerformanceCounter ( &getTime );
 	return ( getTime.QuadPart / ( double ) performanceFrequency.QuadPart );
 }

 // Type-erased signature shared by every copy routine under test:
 // (destination, source, byte count) -> destination pointer.
 // The source is `const void *` so read-only buffers can be passed, matching
 // the parameter lists of memcpy, SSEMemoryCopy and AVX2MemoryCopy.
 using MemoryCopy = std::function<void* ( void *, const void *, size_t )>;

 // Name under which the C runtime's memcpy takes part in the benchmark.
 #define SystemMemoryCopy									memcpy
 // Copies `size` bytes from `src` to `dst` using 128-bit SSE2 loads and stores.
 //
 // The buffer is moved in 64-byte chunks (four 16-byte vectors per iteration);
 // whatever is left over is handed to memcpy. Unaligned load/store intrinsics
 // are used, so neither pointer has to be 16-byte aligned.
 // Because the tail goes through memcpy, overlapping regions are not supported.
 //
 // Returns `dst`.
 inline void * SSEMemoryCopy ( void * dst, const void * src, size_t size )
 {
 	const size_t step = 64;

 	unsigned char * bdst = static_cast< unsigned char * >( dst );
 	const unsigned char * bsrc = static_cast< const unsigned char * >( src );

 	while ( size >= step )
 	{
 		// Load the whole chunk first, then store it.
 		const __m128i a = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 0 ) );
 		const __m128i b = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 16 ) );
 		const __m128i c = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 32 ) );
 		const __m128i d = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 48 ) );

 		_mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 0 ), a );
 		_mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 16 ), b );
 		_mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 32 ), c );
 		_mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 48 ), d );

 		bsrc += step;
 		bdst += step;
 		size -= step;
 	}

 	// Tail of fewer than 64 bytes.
 	if ( size != 0 )
 		memcpy ( bdst, bsrc, size );

 	// Kept from the original implementation.
 	// NOTE(review): the stores above are ordinary (not non-temporal), so this
 	// full fence is probably unnecessary - confirm before removing.
 	_mm_mfence ();

 	return dst;
 }
 // Copies `size` bytes from `src` to `dst` using 256-bit AVX loads and stores.
 //
 // The buffer is moved in 64-byte chunks (two 32-byte vectors per iteration);
 // whatever is left over is handed to memcpy. Unaligned load/store intrinsics
 // are used, so neither pointer has to be 32-byte aligned - buffers obtained
 // from plain new [] carry no such guarantee.
 // Because the tail goes through memcpy, overlapping regions are not supported.
 //
 // Returns `dst`.
 inline void * AVX2MemoryCopy ( void * dst, const void * src, size_t size )
 {
 	const size_t step = 64;

 	unsigned char * bdst = static_cast< unsigned char * >( dst );
 	const unsigned char * bsrc = static_cast< const unsigned char * >( src );

 	while ( size >= step )
 	{
 		// Load the whole chunk first, then store it.
 		const __m256i lo = _mm256_loadu_si256 ( reinterpret_cast< const __m256i * >( bsrc + 0 ) );
 		const __m256i hi = _mm256_loadu_si256 ( reinterpret_cast< const __m256i * >( bsrc + 32 ) );

 		_mm256_storeu_si256 ( reinterpret_cast< __m256i * >( bdst + 0 ), lo );
 		_mm256_storeu_si256 ( reinterpret_cast< __m256i * >( bdst + 32 ), hi );

 		bsrc += step;
 		bdst += step;
 		size -= step;
 	}

 	// Tail of fewer than 64 bytes.
 	if ( size != 0 )
 		memcpy ( bdst, bsrc, size );

 	// Kept from the original implementation.
 	// NOTE(review): the stores above are ordinary (not non-temporal), so this
 	// full fence is probably unnecessary - confirm before removing.
 	_mm_mfence ();

 	return dst;
 }

 // Benchmarks one copy routine and prints the total and per-copy time.
 //
 // memcpy   - routine under test. Note that inside this function the
 //            parameter name hides the C runtime's memcpy.
 // funcName - label printed in the report header.
 // width, height - buffer dimensions; each buffer holds width * height bytes.
 // fps, count    - the routine is invoked fps * count times.
 //
 // Both buffers are allocated and the source is filled BEFORE the clock
 // starts, and one untimed warm-up copy is made, so the measured interval
 // contains only the copies themselves rather than heap allocation and the
 // first touch of freshly allocated memory.
 inline void Test ( MemoryCopy memcpy, const char * funcName, int width, int height, int fps, int count ) noexcept
 {
 	printf ( "==== %s Test (%dx%d@%dfpsx%d) ====\n", funcName, width, height, fps, count );

 	HeapMemory src ( width, height );
 	HeapMemory dst ( width, height );

 	// Give the source defined contents instead of copying uninitialized bytes.
 	BYTE * const srcBytes = src.getData ();
 	const size_t srcSize = src.getDataSize ();
 	for ( size_t i = 0; i < srcSize; ++i )
 		srcBytes [ i ] = static_cast< BYTE >( i );

 	// Untimed warm-up copy.
 	memcpy ( dst.getData (), src.getData (), dst.getDataSize () );

 	const double startTime = getTime ();

 	for ( int c = 0; c < count; ++c )
 	{
 		for ( int i = 0; i < fps; ++i )
 		{
 			memcpy ( dst.getData (), src.getData (), dst.getDataSize () );
 		}
 	}

 	const double endTime = getTime ();

 	const double elapsed = endTime - startTime;
 	// Computed in double so that fps * count cannot overflow int.
 	const double copies = static_cast< double >( fps ) * static_cast< double >( count );

 	printf ( "Elapsed time> %lf\n", elapsed );
 	printf ( "Average time> %lf\n", copies > 0.0 ? elapsed / copies : 0.0 );
 }

 int main ( void )
 {
 	SetPriorityClass ( GetCurrentProcess (), HIGH_PRIORITY_CLASS );
 	SetProcessAffinityMask ( GetCurrentProcess (), 0 );
 	SetThreadPriority ( GetCurrentThread (), THREAD_PRIORITY_HIGHEST );

 	Test ( SystemMemoryCopy, "memcpy ()", 1920, 1080, 60, 10 );
 	Test ( SSEMemoryCopy, "SSEmemcpy ()", 1920, 1080, 60, 10 );
 	Test ( AVX2MemoryCopy, "AVX2memcpy ()", 1920, 1080, 60, 10 );

 	return 0;
 }
	#include <Windows.h>
	#include <memory>
	#include <mmintrin.h>
	#include <immintrin.h>
	#include <xmmintrin.h>
	#include <functional>
	#include <cstdio>

	class HeapMemory
	{
	public:
	HeapMemory ( size_t dataSize )
	: _dataSize ( dataSize )
	{
	_data = new BYTE [ dataSize ];
	}
	HeapMemory ( size_t stride, size_t height )
	: HeapMemory ( stride * height )
	{ }
	~HeapMemory ()
	{
	if ( _data )
	delete [] _data;
	}

	public:
	inline BYTE * getData () const noexcept { return _data; }
	inline size_t getDataSize () const noexcept { return _dataSize; }

	private:
	BYTE * _data;
	size_t _dataSize;
	};

	inline double getTime () noexcept
	{
	LARGE_INTEGER performanceFrequency, getTime;
	QueryPerformanceFrequency ( &performanceFrequency );
	QueryPerformanceCounter ( &getTime );
	return ( getTime.QuadPart / ( double ) performanceFrequency.QuadPart );
	}

	// Type-erased signature shared by every copy routine under test:
	// (destination, source, byte count) -> destination pointer.
	// The first two parameters must be pointers; a plain `void` parameter in
	// a multi-parameter list is ill-formed. The source is `const void *` so
	// read-only buffers can be passed, matching the parameter lists of
	// memcpy, SSEMemoryCopy and AVX2MemoryCopy.
	using MemoryCopy = std::function<void* ( void *, const void *, size_t )>;

	// Name under which the C runtime's memcpy takes part in the benchmark.
	#define SystemMemoryCopy memcpy
	// Copies `size` bytes from `src` to `dst` using 128-bit SSE2 loads and stores.
	//
	// The buffer is moved in 64-byte chunks (four 16-byte vectors per iteration);
	// whatever is left over is handed to memcpy. Unaligned load/store intrinsics
	// are used, so neither pointer has to be 16-byte aligned.
	// Because the tail goes through memcpy, overlapping regions are not supported.
	//
	// Returns `dst`.
	inline void * SSEMemoryCopy ( void * dst, const void * src, size_t size )
	{
		const size_t step = 64;

		unsigned char * bdst = static_cast< unsigned char * >( dst );
		const unsigned char * bsrc = static_cast< const unsigned char * >( src );

		while ( size >= step )
		{
			// Load the whole chunk first, then store it.
			const __m128i a = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 0 ) );
			const __m128i b = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 16 ) );
			const __m128i c = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 32 ) );
			const __m128i d = _mm_loadu_si128 ( reinterpret_cast< const __m128i * >( bsrc + 48 ) );

			_mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 0 ), a );
			_mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 16 ), b );
			_mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 32 ), c );
			_mm_storeu_si128 ( reinterpret_cast< __m128i * >( bdst + 48 ), d );

			bsrc += step;
			bdst += step;
			size -= step;
		}

		// Tail of fewer than 64 bytes.
		if ( size != 0 )
			memcpy ( bdst, bsrc, size );

		// Kept from the original implementation.
		// NOTE(review): the stores above are ordinary (not non-temporal), so this
		// full fence is probably unnecessary - confirm before removing.
		_mm_mfence ();

		return dst;
	}
	// Copies `size` bytes from `src` to `dst` using 256-bit AVX loads and stores.
	//
	// The buffer is moved in 64-byte chunks (two 32-byte vectors per iteration);
	// whatever is left over is handed to memcpy. Unaligned load/store intrinsics
	// are used, so neither pointer has to be 32-byte aligned - buffers obtained
	// from plain new [] carry no such guarantee.
	// Because the tail goes through memcpy, overlapping regions are not supported.
	//
	// Returns `dst`.
	inline void * AVX2MemoryCopy ( void * dst, const void * src, size_t size )
	{
		const size_t step = 64;

		unsigned char * bdst = static_cast< unsigned char * >( dst );
		const unsigned char * bsrc = static_cast< const unsigned char * >( src );

		while ( size >= step )
		{
			// Load the whole chunk first, then store it.
			const __m256i lo = _mm256_loadu_si256 ( reinterpret_cast< const __m256i * >( bsrc + 0 ) );
			const __m256i hi = _mm256_loadu_si256 ( reinterpret_cast< const __m256i * >( bsrc + 32 ) );

			_mm256_storeu_si256 ( reinterpret_cast< __m256i * >( bdst + 0 ), lo );
			_mm256_storeu_si256 ( reinterpret_cast< __m256i * >( bdst + 32 ), hi );

			bsrc += step;
			bdst += step;
			size -= step;
		}

		// Tail of fewer than 64 bytes.
		if ( size != 0 )
			memcpy ( bdst, bsrc, size );

		// Kept from the original implementation.
		// NOTE(review): the stores above are ordinary (not non-temporal), so this
		// full fence is probably unnecessary - confirm before removing.
		_mm_mfence ();

		return dst;
	}

	// Benchmarks one copy routine and prints the total and per-copy time.
	//
	// memcpy   - routine under test. Note that inside this function the
	//            parameter name hides the C runtime's memcpy.
	// funcName - label printed in the report header.
	// width, height - buffer dimensions; each buffer holds width * height bytes.
	// fps, count    - the routine is invoked fps * count times.
	//
	// Both buffers are allocated and the source is filled BEFORE the clock
	// starts, and one untimed warm-up copy is made, so the measured interval
	// contains only the copies themselves rather than heap allocation and the
	// first touch of freshly allocated memory.
	inline void Test ( MemoryCopy memcpy, const char * funcName, int width, int height, int fps, int count ) noexcept
	{
		printf ( "==== %s Test (%dx%d@%dfpsx%d) ====\n", funcName, width, height, fps, count );

		HeapMemory src ( width, height );
		HeapMemory dst ( width, height );

		// Give the source defined contents instead of copying uninitialized bytes.
		BYTE * const srcBytes = src.getData ();
		const size_t srcSize = src.getDataSize ();
		for ( size_t i = 0; i < srcSize; ++i )
			srcBytes [ i ] = static_cast< BYTE >( i );

		// Untimed warm-up copy.
		memcpy ( dst.getData (), src.getData (), dst.getDataSize () );

		const double startTime = getTime ();

		for ( int c = 0; c < count; ++c )
		{
			for ( int i = 0; i < fps; ++i )
			{
				memcpy ( dst.getData (), src.getData (), dst.getDataSize () );
			}
		}

		const double endTime = getTime ();

		const double elapsed = endTime - startTime;
		// Computed in double so that fps * count cannot overflow int.
		const double copies = static_cast< double >( fps ) * static_cast< double >( count );

		printf ( "Elapsed time> %lf\n", elapsed );
		printf ( "Average time> %lf\n", copies > 0.0 ? elapsed / copies : 0.0 );
	}

	int main ( void )
	{
	SetPriorityClass ( GetCurrentProcess (), HIGH_PRIORITY_CLASS );
	SetProcessAffinityMask ( GetCurrentProcess (), 0 );
	SetThreadPriority ( GetCurrentThread (), THREAD_PRIORITY_HIGHEST );

	Test ( SystemMemoryCopy, "memcpy ()", 1920, 1080, 60, 10 );
	Test ( SSEMemoryCopy, "SSEmemcpy ()", 1920, 1080, 60, 10 );
	Test ( AVX2MemoryCopy, "AVX2memcpy ()", 1920, 1080, 60, 10 );

	return 0;
	}