Last active
September 5, 2017 20:51
-
-
Save ned14/8e579df8e89fbec38b71a14a2604c07d to your computer and use it in GitHub Desktop.
Demos the poor cached i/o performance on Windows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Demonstrates the poor cached I/O performance on Windows.

Windows x64 with NTFS:
  memcpy: 91.7057ns per 1Kb
  Write: 1258.34ns per 1Kb
  Read: 1318ns per 1Kb

Linux x64 with ext4:
  memcpy: 125.164ns per 1Kb
  Write: 298.724ns per 1Kb
  Read: 115.725ns per 1Kb
*/
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>
#ifdef _WIN32 | |
#include <Windows.h> | |
int main() { | |
std::cout << "Preallocating the test file ..." << std::endl; | |
// Get some aligned memory | |
void *mem = | |
VirtualAlloc(NULL, 1024, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); | |
memset(mem, '1', 1024); | |
// Work around the scheduler bug which badly affects blocking i/o | |
SetThreadAffinityMask(GetCurrentThread(), 1); | |
// Standard fully cached file | |
HANDLE h = CreateFile(L"testfile", GENERIC_READ | GENERIC_WRITE, 0, NULL, | |
CREATE_ALWAYS, FILE_FLAG_DELETE_ON_CLOSE, NULL); | |
// Preallocate to ensure later i/o is exclusively to/from page cache only | |
FILE_END_OF_FILE_INFO feofi; | |
feofi.EndOfFile.QuadPart = 1024 * 1024 * 1024; | |
SetFileInformationByHandle(h, FileEndOfFileInfo, &feofi, sizeof(feofi)); | |
for (size_t n = 0; n < 1024; n++) { | |
static char buffer[1024 * 1024]; | |
DWORD bytes = 0; | |
WriteFile(h, buffer, 1024 * 1024, &bytes, NULL); | |
if (bytes != 1024 * 1024) | |
abort(); | |
} | |
SetFilePointer(h, 0, NULL, FILE_BEGIN); | |
// Precalc randomised offsets | |
std::vector<uint64_t> offsets(1024 * 1024); | |
for (size_t n = 0; n < 1024 * 1024; n++) { | |
offsets[n] = (rand() * 64) % (1024 * 1024); | |
} | |
static char buffer[1024 * 1024 * 1024]; | |
memset(buffer, 1, sizeof(buffer)); | |
// Ensure background writing doesn't interfere with results | |
FlushFileBuffers(h); | |
// Warm up SpeedStep | |
auto begin = std::chrono::high_resolution_clock::now(); | |
while (std::chrono::duration_cast<std::chrono::seconds>( | |
std::chrono::high_resolution_clock::now() - begin) | |
.count() < 3) | |
; | |
begin = std::chrono::high_resolution_clock::now(); | |
for (size_t n = 0; n < 1024 * 1024; n++) { | |
memcpy(buffer + n * 1024, mem, 1024); | |
} | |
auto end = std::chrono::high_resolution_clock::now(); | |
auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin) | |
.count() / | |
1024.0 / 1024.0; | |
std::cout << "memcpy: " << diff << "ns per 1Kb" << std::endl; | |
// Warm up SpeedStep | |
begin = std::chrono::high_resolution_clock::now(); | |
while (std::chrono::duration_cast<std::chrono::seconds>( | |
std::chrono::high_resolution_clock::now() - begin) | |
.count() < 3) | |
; | |
begin = std::chrono::high_resolution_clock::now(); | |
for (size_t n = 0; n < 1024 * 1024; n++) { | |
DWORD bytes = 0; | |
OVERLAPPED ol; | |
memset(&ol, 0, sizeof(ol)); | |
ol.Internal = offsets[n]; | |
WriteFile(h, mem, 1024, &bytes, &ol); | |
if (bytes != 1024) | |
abort(); | |
} | |
end = std::chrono::high_resolution_clock::now(); | |
diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin) | |
.count() / | |
1024.0 / 1024.0; | |
std::cout << "Write: " << diff << "ns per 1Kb" << std::endl; | |
// Warm up SpeedStep | |
while (std::chrono::duration_cast<std::chrono::seconds>( | |
std::chrono::high_resolution_clock::now() - begin) | |
.count() < 3) | |
; | |
begin = std::chrono::high_resolution_clock::now(); | |
for (size_t n = 0; n < 1024 * 1024; n++) { | |
DWORD bytes = 0; | |
OVERLAPPED ol; | |
memset(&ol, 0, sizeof(ol)); | |
ol.Internal = offsets[n]; | |
ReadFile(h, mem, 1024, &bytes, &ol); | |
if (bytes != 1024) | |
abort(); | |
} | |
end = std::chrono::high_resolution_clock::now(); | |
diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin) | |
.count() / | |
1024.0 / 1024.0; | |
std::cout << "Read: " << diff << "ns per 1Kb" << std::endl; | |
return 0; | |
} | |
#else | |
#include <fcntl.h> | |
#include <string.h> | |
#include <sys/mman.h> | |
#include <unistd.h> | |
/**
 * POSIX variant of the cached-i/o micro-benchmark.
 *
 * Creates a 1 GiB file named "testfile" (unlinked immediately, so it is
 * removed once the descriptor is closed), fills it so later i/o is served
 * from the page cache, then times three phases of 1024 * 1024 operations of
 * 1 KiB each:
 *   1. memcpy into a 1 GiB in-memory buffer (baseline),
 *   2. pwrite at precomputed pseudo-random offsets,
 *   3. pread at the same offsets.
 * The average nanoseconds per operation is printed for each phase.
 *
 * Any failed mapping, open, fsync, or short read/write calls abort().
 *
 * @return 0 on completion.
 */
int main() {
  using bench_clock = std::chrono::high_resolution_clock;

  const size_t kBlockSize = 1024;          // bytes moved per timed operation
  const size_t kIterations = 1024 * 1024;  // timed operations per phase
  const size_t kChunkSize = 1024 * 1024;   // bytes per preallocation write
  const size_t kChunkCount = 1024;         // chunks written => 1 GiB file

  // Busy-spin for three seconds so the CPU clocks up before measuring.
  const auto warm_up = [] {
    const auto start = bench_clock::now();
    while (std::chrono::duration_cast<std::chrono::seconds>(
               bench_clock::now() - start)
               .count() < 3)
      ;
  };
  // Print the mean nanoseconds per operation for one timed phase.
  const auto report = [](const char *label, bench_clock::time_point from,
                         bench_clock::time_point to) {
    const auto per_op =
        std::chrono::duration_cast<std::chrono::nanoseconds>(to - from)
            .count() /
        1024.0 / 1024.0;
    std::cout << label << per_op << "ns per 1Kb" << std::endl;
  };

  std::cout << "Preallocating the test file ..." << std::endl;
  // Get some aligned memory
  void *mem = mmap(NULL, kBlockSize, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (mem == MAP_FAILED)
    abort();
  memset(mem, '1', kBlockSize);
  // Standard fully cached file
  int h = open("testfile", O_CREAT | O_RDWR | O_TRUNC, 0700);
  if (h < 0)
    abort();
  // Drop the directory entry right away so the 1 GiB file is reclaimed when
  // the descriptor is closed, mirroring FILE_FLAG_DELETE_ON_CLOSE on Windows.
  unlink("testfile");
  // Preallocate to ensure later i/o is exclusively to/from page cache only
  for (size_t n = 0; n < kChunkCount; n++) {
    static char zeros[1024 * 1024];
    if (write(h, zeros, kChunkSize) != static_cast<ssize_t>(kChunkSize))
      abort();
  }
  lseek(h, 0, SEEK_SET);
  // Precalc randomised offsets: 64 byte aligned, within the first 1 MiB.
  // Widen before multiplying: rand() * 64 overflows int when RAND_MAX is
  // 2^31 - 1, which is undefined behaviour and can produce invalid offsets.
  std::vector<uint64_t> offsets(kIterations);
  for (size_t n = 0; n < kIterations; n++) {
    offsets[n] = (static_cast<uint64_t>(rand()) * 64) % (1024 * 1024);
  }
  static char copy_target[1024 * 1024 * 1024];
  memset(copy_target, 1, sizeof(copy_target));
  // Ensure background writing doesn't interfere with results
  if (fsync(h) != 0)
    abort();

  // Phase 1: memcpy baseline
  warm_up();
  auto begin = bench_clock::now();
  for (size_t n = 0; n < kIterations; n++) {
    memcpy(copy_target + n * kBlockSize, mem, kBlockSize);
  }
  auto end = bench_clock::now();
  report("memcpy: ", begin, end);

  // Phase 2: cached writes at random offsets. Results are checked so a
  // failing call is never silently timed as if it had done the work.
  warm_up();
  begin = bench_clock::now();
  for (size_t n = 0; n < kIterations; n++) {
    if (pwrite(h, mem, kBlockSize, static_cast<off_t>(offsets[n])) !=
        static_cast<ssize_t>(kBlockSize))
      abort();
  }
  end = bench_clock::now();
  report("Write: ", begin, end);

  // Phase 3: cached reads at the same offsets. The warm-up uses its own
  // start time so it always spins for the full three seconds.
  warm_up();
  begin = bench_clock::now();
  for (size_t n = 0; n < kIterations; n++) {
    if (pread(h, mem, kBlockSize, static_cast<off_t>(offsets[n])) !=
        static_cast<ssize_t>(kBlockSize))
      abort();
  }
  end = bench_clock::now();
  report("Read: ", begin, end);

  close(h);
  munmap(mem, kBlockSize);
  return 0;
}
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment