-
-
Save daramkun/456d81800ef076a23caed52f3dd7f808 to your computer and use it in GitHub Desktop.
#include <Windows.h>
#include <d3d11.h>
#include <atlbase.h>
#pragma comment (lib, "d3d11.lib")

#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <exception>
#include <execution>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <thread>
#include <vector>
// Wall-clock time, in seconds, after which a running benchmark is stopped.
constexpr double MEASURE_SECONDS = 10;
// Multiplier that converts a byte count into binary gigabytes (1 / 2^30),
// which is the unit printed as "GB" in the reports.
constexpr double GIGABYTE_MAKER = 1 / 1073741824.0;
// Base class for a copy-throughput benchmark.
//
// run() starts a worker thread that calls do_measure() in a loop and adds the
// returned byte counts to a running total; stop() ends the loop and waits for
// the worker to finish. copy_bytes(), proceed_time() and is_running() may be
// polled from the controlling thread while the worker is active.
class performance
{
public:
    performance () = default;

    // Stops and joins the worker if it is still active. By the time this runs
    // the derived part of the object has already been destroyed, so owners
    // should call stop() themselves before destruction; this is a safety net
    // that also makes destroying a never-started object well defined.
    virtual ~performance () noexcept
    {
        stop ();
    }

public:
    // Total number of bytes reported by do_measure() since the last run().
    uint64_t copy_bytes () const noexcept { return _copy_bytes.load (); }

    // Seconds elapsed since the last run(). Keeps increasing after stop().
    std::chrono::duration<double> proceed_time () const noexcept
    {
        return std::chrono::steady_clock::now () - _started;
    }

    // True from run() until stop() has been requested.
    bool is_running () const noexcept { return _running.load (); }

protected:
    // Performs one unit of copy work and returns the number of bytes copied.
    // Called repeatedly on the worker thread.
    virtual size_t do_measure () noexcept = 0;

public:
    // Resets the counters and starts the worker thread. Does nothing when a
    // worker is already active (re-assigning a joinable std::thread would
    // terminate the process).
    void run () noexcept
    {
        if (_run.joinable ())
            return;

        _copy_bytes = 0;
        _started = std::chrono::steady_clock::now ();
        _running = true;
        _run = std::thread ([this]()
            {
                // do/while: at least one measurement is always taken.
                do
                {
                    _copy_bytes += this->do_measure ();
                    std::this_thread::yield ();
                }
                while (_running);
            }
        );
    }

    // Requests the worker to finish and blocks until it has returned, so the
    // object can be destroyed safely afterwards and copy_bytes() is final.
    // Safe to call repeatedly or without a prior run(). Must not be called
    // from inside do_measure() (the worker cannot join itself).
    void stop ()
    {
        _running = false;
        if (_run.joinable ())
            _run.join ();
    }

private:
    // Written by the worker, read by the controlling thread.
    std::atomic<uint64_t> _copy_bytes { 0 };
    // Same clock as proceed_time() so the subtraction is always well formed.
    std::chrono::steady_clock::time_point _started {};
    std::thread _run;
    // Written by the controlling thread, read by the worker.
    std::atomic<bool> _running { false };
};
class memcpy_performance : public performance | |
{ | |
private: | |
const size_t BUFFER_SIZE = 1024 * 1024 * 16; //< 16MB | |
public: | |
memcpy_performance () | |
{ | |
_dest.resize (BUFFER_SIZE); | |
_src.resize (BUFFER_SIZE); | |
for (unsigned int i = 0; i < std::thread::hardware_concurrency (); ++i) | |
_temp.push_back (i); | |
} | |
protected: | |
virtual size_t do_measure () noexcept override | |
{ | |
std::for_each (std::execution::par_unseq, _temp.begin (), _temp.end (), [this](unsigned int i) | |
{ | |
memcpy (_dest.data (), _src.data (), BUFFER_SIZE); | |
}); | |
return BUFFER_SIZE * _temp.size (); | |
} | |
private: | |
std::vector<uint8_t> _dest, _src; | |
std::vector<unsigned int> _temp; | |
}; | |
class D3D11CopyResourceRAM2VRAM_performance : public performance | |
{ | |
public: | |
D3D11CopyResourceRAM2VRAM_performance (size_t size = 4096, DXGI_FORMAT format = DXGI_FORMAT_R8G8B8A8_UNORM) | |
{ | |
HRESULT hr = D3D11CreateDevice (nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &_d3dDevice, nullptr, &_immediateContext); | |
assert (SUCCEEDED (hr)); | |
memset (&_texDesc, 0, sizeof (D3D11_TEXTURE2D_DESC)); | |
_texDesc.Width = _texDesc.Height = size; | |
_texDesc.ArraySize = 1; | |
_texDesc.MipLevels = 1; | |
_texDesc.Format = format; | |
_texDesc.SampleDesc.Count = 1; | |
_texDesc.Usage = D3D11_USAGE_DEFAULT; | |
_texDesc.CPUAccessFlags = 0; | |
_texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE; | |
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_dest); | |
assert (SUCCEEDED (hr)); | |
_texDesc.Usage = D3D11_USAGE_STAGING; | |
_texDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; | |
_texDesc.BindFlags = 0; | |
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_src); | |
assert (SUCCEEDED (hr)); | |
_totalSize = size * size * (format == DXGI_FORMAT_R8G8B8A8_UNORM ? 4 : 16); | |
} | |
protected: | |
virtual size_t do_measure () noexcept override | |
{ | |
_immediateContext->CopyResource (_dest, _src); | |
_immediateContext->Flush (); | |
return _totalSize; | |
} | |
private: | |
D3D11_TEXTURE2D_DESC _texDesc; | |
CComPtr<ID3D11Device> _d3dDevice; | |
CComPtr<ID3D11DeviceContext> _immediateContext; | |
CComPtr<ID3D11Texture2D> _dest, _src; | |
size_t _totalSize; | |
}; | |
class D3D11CopyResourceVRAM2VRAM_performance : public performance | |
{ | |
public: | |
D3D11CopyResourceVRAM2VRAM_performance (size_t size = 4096, DXGI_FORMAT format = DXGI_FORMAT_R8G8B8A8_UNORM) | |
{ | |
HRESULT hr = D3D11CreateDevice (nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &_d3dDevice, nullptr, &_immediateContext); | |
assert (SUCCEEDED (hr)); | |
memset (&_texDesc, 0, sizeof (D3D11_TEXTURE2D_DESC)); | |
_texDesc.Width = _texDesc.Height = size; | |
_texDesc.ArraySize = 1; | |
_texDesc.MipLevels = 1; | |
_texDesc.Format = format; | |
_texDesc.SampleDesc.Count = 1; | |
_texDesc.Usage = D3D11_USAGE_DEFAULT; | |
_texDesc.CPUAccessFlags = 0; | |
_texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE; | |
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_dest); | |
assert (SUCCEEDED (hr)); | |
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_src); | |
assert (SUCCEEDED (hr)); | |
_totalSize = size * size * (format == DXGI_FORMAT_R8G8B8A8_UNORM ? 4 : 16); | |
} | |
protected: | |
virtual size_t do_measure () noexcept override | |
{ | |
_immediateContext->CopyResource (_dest, _src); | |
_immediateContext->Flush (); | |
return _totalSize; | |
} | |
private: | |
D3D11_TEXTURE2D_DESC _texDesc; | |
CComPtr<ID3D11Device> _d3dDevice; | |
CComPtr<ID3D11DeviceContext> _immediateContext; | |
CComPtr<ID3D11Texture2D> _dest, _src; | |
size_t _totalSize; | |
}; | |
void measure (const char * testname, performance* perf) | |
{ | |
printf ("==== %s Performance Measure ====\n", testname); | |
std::shared_ptr<performance> _measure (perf); | |
_measure->run (); | |
while (_measure->is_running ()) | |
{ | |
if (_measure->proceed_time ().count () >= MEASURE_SECONDS) | |
_measure->stop (); | |
printf ("\r%3.3lfs... %lfGB/s... Total Copied: %lfGB", | |
_measure->proceed_time ().count (), | |
(_measure->copy_bytes () / _measure->proceed_time ().count ()) * GIGABYTE_MAKER, | |
_measure->copy_bytes () * GIGABYTE_MAKER); | |
std::this_thread::yield (); | |
} | |
printf ("\r%3.3lfs... %lfGB/s... Total Copied: %lfGB", | |
_measure->proceed_time ().count (), | |
(_measure->copy_bytes () / _measure->proceed_time ().count ()) * GIGABYTE_MAKER, | |
_measure->copy_bytes ()* GIGABYTE_MAKER); | |
putchar ('\n'); | |
} | |
int main (int argc, char* argv[]) | |
{ | |
measure (u8"CPU memcpy", new memcpy_performance ()); | |
measure (u8"Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8)", new D3D11CopyResourceRAM2VRAM_performance (4096, DXGI_FORMAT_R8G8B8A8_UNORM)); | |
measure (u8"Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF)", new D3D11CopyResourceRAM2VRAM_performance (4096, DXGI_FORMAT_R32G32B32A32_FLOAT)); | |
measure (u8"Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8)", new D3D11CopyResourceVRAM2VRAM_performance (4096, DXGI_FORMAT_R8G8B8A8_UNORM)); | |
measure (u8"Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF)", new D3D11CopyResourceVRAM2VRAM_performance (4096, DXGI_FORMAT_R32G32B32A32_FLOAT)); | |
return 0; | |
} |
CPU: AMD Ryzen 5 2600X (Not Overclocked)
RAM: ESSENCORE DDR4 2400MHz 16GBx2 (Up to 2666MHz Overclocked)
M/B: MSI B350M Mortar
GPU: AMD Radeon RX 5700XT Reference Model by Sapphire
Result:
==== CPU memcpy Performance Measure ====
10.000s... 15.356055GB/s... Total Copied: 153.562500GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 12.356203GB/s... Total Copied: 123.562500GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 13.624967GB/s... Total Copied: 136.250000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 182.030647GB/s... Total Copied: 1820.312500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 186.499263GB/s... Total Copied: 1865.000000GB
Background Information
GDDR6 1750MHz + 256-bit Memory bus Maximum Bandwidth: 448 GB/s
CPU: AMD Ryzen 5 1600X (Not Overclocked)
RAM: DDR4 2133MHz 8GBx2 (Not Overclocked)
M/B: ASRock B350M Pro4
GPU: Zotac NVIDIA GeForce GTX 970
Result:
==== CPU memcpy Performance Measure ====
10.000s... 13.245923GB/s... Total Copied: 132.562500GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 12.181203GB/s... Total Copied: 121.812500GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 13.474920GB/s... Total Copied: 134.750000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 127.330644GB/s... Total Copied: 1273.312500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 74.749695GB/s... Total Copied: 747.500000GB
CPU: AMD Ryzen 7 3700X (Not Overclocked)
RAM: ESSENCORE DDR4 2400MHz 16GBx2 (Not Overclocked, RAM Timing XMP)
M/B: MSI B550M Mortar WiFi
GPU: AMD Radeon RX 5700XT Reference Model by Sapphire
Result:
==== CPU memcpy Performance Measure ====
10.005s... 14.043436GB/s... Total Copied: 140.500000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 22.462423GB/s... Total Copied: 224.625000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 23.749940GB/s... Total Copied: 237.500000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 183.493069GB/s... Total Copied: 1834.937500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 187.524475GB/s... Total Copied: 1875.250000GB
Background Information
PCI-Express 4.0 x16 Maximum Bandwidth: 31.5 GB/s.
CPU: AMD Ryzen 7 3700X (Not Overclocked)
RAM: ESSENCORE DDR4 2400MHz 16GBx2 (Not Overclocked, RAM Timing XMP)
M/B: MSI B550M Mortar WiFi
GPU: GALAX NVIDIA GeForce RTX 3070 EX OC
Result:
==== CPU memcpy Performance Measure ====
10.006s... 14.916766GB/s... Total Copied: 149.250000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 22.962413GB/s... Total Copied: 229.625000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 25.024848GB/s... Total Copied: 250.250000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 328.060400GB/s... Total Copied: 3280.625000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 174.724219GB/s... Total Copied: 1747.250000GB
CPU: AMD Ryzen 3 2200U
RAM: Crucial DDR4 4GBx2
M/B: Lenovo ideapad 330S-15ARR (81FB)
GPU: AMD Radeon Vega 3 Mobile Graphics
Result:
==== CPU memcpy Performance Measure ====
10.007s... 5.233843GB/s... Total Copied: 52.375000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 6.624958GB/s... Total Copied: 66.250000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 7.474972GB/s... Total Copied: 74.750000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 10.406215GB/s... Total Copied: 104.062500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 8.499961GB/s... Total Copied: 85.000000GB
CPU: AMD Ryzen 5 2600X (Not Overclocked)
RAM: ESSENCORE DDR4 2400MHz 16GBx2 (Up to 2666MHz Overclocked)
M/B: MSI B350M Mortar
GPU: Sapphire AMD Radeon RX 480 8GB Nitro+
Result:
Background Information
2666MHz DDR4 Maximum Bandwidth : 21.33333GB/s.
PCI-Express 3.0 x16 Maximum Bandwidth: 15.75 GB/s.
GDDR5 2000MHz + 256-bit Memory bus Maximum Bandwidth: 256 GB/s.