ned14 · September 5, 2017 20:51
diff --git a/PoorCachedIoPerformance.cpp b/PoorCachedIoPerformance.cpp
 /* Demos the poor cached i/o performance on Windows

 Windows x64 with NTFS:

 memcpy: 91.7057ns per 1Kb
 Write: 1258.34ns per 1Kb
 Read: 1318ns per 1Kb

 Linux x64 with ext4:

 memcpy: 125.164ns per 1Kb
 Write: 298.724ns per 1Kb
 Read: 115.725ns per 1Kb

 */

 #include <chrono>
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include <vector>

 #ifdef _WIN32
 #include <Windows.h>

 // Windows benchmark: times 1Mi memcpy calls of 1 KiB, then 1Mi cached
 // WriteFile and ReadFile calls of 1 KiB at pre-computed pseudo-random offsets,
 // and prints the mean nanoseconds per call for each. Aborts on any failed or
 // short transfer. Returns 0 on completion.
 int main() {
  using bench_clock = std::chrono::high_resolution_clock;
  constexpr DWORD kBlockBytes = 1024;        // size of one timed transfer
  constexpr DWORD kChunkBytes = 1024 * 1024; // size of one preallocation write
  constexpr size_t kIterations = 1024 * 1024;

  // Warm up SpeedStep: spin for three seconds, measured from a fresh
  // timestamp every time so the warm-up can never be skipped.
  const auto warm_up = [] {
    const auto start = bench_clock::now();
    while (std::chrono::duration_cast<std::chrono::seconds>(
               bench_clock::now() - start)
               .count() < 3)
      ;
  };
  // Prints the elapsed time divided by the 1024 * 1024 iterations.
  const auto report = [](const char *label, bench_clock::time_point from,
                         bench_clock::time_point to) {
    const double ns_per_block =
        std::chrono::duration_cast<std::chrono::nanoseconds>(to - from)
            .count() /
        1024.0 / 1024.0;
    std::cout << label << ns_per_block << "ns per 1Kb" << std::endl;
  };

  std::cout << "Preallocating the test file ..." << std::endl;
  // Get some aligned memory
  void *mem = VirtualAlloc(NULL, kBlockBytes, MEM_COMMIT | MEM_RESERVE,
                           PAGE_READWRITE);
  if (mem == NULL)
    abort();
  memset(mem, '1', kBlockBytes);
  // Work around the scheduler bug which badly affects blocking i/o
  SetThreadAffinityMask(GetCurrentThread(), 1);
  // Standard fully cached file. The W variant is called explicitly because
  // the file name is a wide literal, which the CreateFile macro only accepts
  // when UNICODE is defined.
  HANDLE h = CreateFileW(L"testfile", GENERIC_READ | GENERIC_WRITE, 0, NULL,
                         CREATE_ALWAYS, FILE_FLAG_DELETE_ON_CLOSE, NULL);
  if (h == INVALID_HANDLE_VALUE)
    abort();
  // Preallocate to ensure later i/o is exclusively to/from page cache only.
  // Setting the end of file is best-effort; the writes below extend the file
  // regardless.
  FILE_END_OF_FILE_INFO feofi;
  feofi.EndOfFile.QuadPart = 1024 * 1024 * 1024;
  SetFileInformationByHandle(h, FileEndOfFileInfo, &feofi, sizeof(feofi));
  for (size_t n = 0; n < 1024; n++) {
    static char zeros[1024 * 1024];
    DWORD bytes = 0;
    if (!WriteFile(h, zeros, kChunkBytes, &bytes, NULL) ||
        bytes != kChunkBytes)
      abort();
  }
  SetFilePointer(h, 0, NULL, FILE_BEGIN);
  // Precalc randomised offsets: multiples of 64 below 1 MiB. The value is
  // widened before multiplying so rand() * 64 cannot overflow int.
  // NOTE(review): only the first 1 MiB of the 1 GiB file is targeted --
  // confirm that this is the intended working set.
  std::vector<uint64_t> offsets(kIterations);
  for (size_t n = 0; n < kIterations; n++) {
    offsets[n] = (static_cast<uint64_t>(rand()) * 64) % (1024 * 1024);
  }
  static char buffer[1024 * 1024 * 1024];
  memset(buffer, 1, sizeof(buffer));
  // Ensure background writing doesn't interfere with results
  FlushFileBuffers(h);

  // --- memcpy baseline ---
  warm_up();
  auto begin = bench_clock::now();
  for (size_t n = 0; n < kIterations; n++) {
    memcpy(buffer + n * 1024, mem, kBlockBytes);
  }
  auto end = bench_clock::now();
  report("memcpy: ", begin, end);

  // --- cached writes ---
  warm_up();
  begin = bench_clock::now();
  for (size_t n = 0; n < kIterations; n++) {
    DWORD bytes = 0;
    OVERLAPPED ol;
    memset(&ol, 0, sizeof(ol));
    // The file position of the request goes in Offset/OffsetHigh; the
    // Internal member is reserved for the system's status code.
    ol.Offset = static_cast<DWORD>(offsets[n] & 0xffffffffu);
    ol.OffsetHigh = static_cast<DWORD>(offsets[n] >> 32);
    WriteFile(h, mem, kBlockBytes, &bytes, &ol);
    if (bytes != kBlockBytes)
      abort();
  }
  end = bench_clock::now();
  report("Write: ", begin, end);

  // --- cached reads ---
  warm_up();
  begin = bench_clock::now();
  for (size_t n = 0; n < kIterations; n++) {
    DWORD bytes = 0;
    OVERLAPPED ol;
    memset(&ol, 0, sizeof(ol));
    ol.Offset = static_cast<DWORD>(offsets[n] & 0xffffffffu);
    ol.OffsetHigh = static_cast<DWORD>(offsets[n] >> 32);
    ReadFile(h, mem, kBlockBytes, &bytes, &ol);
    if (bytes != kBlockBytes)
      abort();
  }
  end = bench_clock::now();
  report("Read: ", begin, end);
  return 0;
 }

 #else

 #include <fcntl.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <unistd.h>

 // POSIX benchmark: times 1Mi memcpy calls of 1 KiB, then 1Mi cached pwrite
 // and pread calls of 1 KiB at pre-computed pseudo-random offsets, and prints
 // the mean nanoseconds per call for each. Aborts on any failed or short
 // transfer. Returns 0 on completion.
 int main() {
  using bench_clock = std::chrono::high_resolution_clock;
  constexpr size_t kBlockBytes = 1024;        // size of one timed transfer
  constexpr size_t kChunkBytes = 1024 * 1024; // size of one preallocation write
  constexpr size_t kIterations = 1024 * 1024;

  // Warm up SpeedStep: spin for three seconds, measured from a fresh
  // timestamp every time so the warm-up can never be skipped.
  const auto warm_up = [] {
    const auto start = bench_clock::now();
    while (std::chrono::duration_cast<std::chrono::seconds>(
               bench_clock::now() - start)
               .count() < 3)
      ;
  };
  // Prints the elapsed time divided by the 1024 * 1024 iterations.
  const auto report = [](const char *label, bench_clock::time_point from,
                         bench_clock::time_point to) {
    const double ns_per_block =
        std::chrono::duration_cast<std::chrono::nanoseconds>(to - from)
            .count() /
        1024.0 / 1024.0;
    std::cout << label << ns_per_block << "ns per 1Kb" << std::endl;
  };

  std::cout << "Preallocating the test file ..." << std::endl;
  // Get some aligned memory
  void *mem = mmap(NULL, kBlockBytes, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (mem == MAP_FAILED)
    abort();
  memset(mem, '1', kBlockBytes);
  // Standard fully cached file
  int h = open("testfile", O_CREAT | O_RDWR | O_TRUNC, 0700);
  if (h == -1)
    abort();
  // Drop the directory entry straight away so the 1 GiB file disappears when
  // the descriptor is closed, mirroring FILE_FLAG_DELETE_ON_CLOSE in the
  // Windows branch.
  unlink("testfile");
  // Preallocate to ensure later i/o is exclusively to/from page cache only
  for (size_t n = 0; n < 1024; n++) {
    static char zeros[1024 * 1024];
    if (write(h, zeros, kChunkBytes) != static_cast<ssize_t>(kChunkBytes))
      abort();
  }
  lseek(h, 0, SEEK_SET);
  // Precalc randomised offsets: multiples of 64 below 1 MiB. The value is
  // widened before multiplying; rand() * 64 in int overflows when RAND_MAX is
  // 2^31-1 and can yield negative values, i.e. invalid file offsets.
  // NOTE(review): only the first 1 MiB of the 1 GiB file is targeted --
  // confirm that this is the intended working set.
  std::vector<uint64_t> offsets(kIterations);
  for (size_t n = 0; n < kIterations; n++) {
    offsets[n] = (static_cast<uint64_t>(rand()) * 64) % (1024 * 1024);
  }
  static char buffer[1024 * 1024 * 1024];
  memset(buffer, 1, sizeof(buffer));
  // Ensure background writing doesn't interfere with results
  fsync(h);

  // --- memcpy baseline ---
  warm_up();
  auto begin = bench_clock::now();
  for (size_t n = 0; n < kIterations; n++) {
    memcpy(buffer + n * 1024, mem, kBlockBytes);
  }
  auto end = bench_clock::now();
  report("memcpy: ", begin, end);

  // --- cached writes ---
  warm_up();
  begin = bench_clock::now();
  for (size_t n = 0; n < kIterations; n++) {
    // A failed or short write would silently shorten the measured time.
    if (pwrite(h, mem, kBlockBytes, static_cast<off_t>(offsets[n])) !=
        static_cast<ssize_t>(kBlockBytes))
      abort();
  }
  end = bench_clock::now();
  report("Write: ", begin, end);

  // --- cached reads ---
  warm_up();
  begin = bench_clock::now();
  for (size_t n = 0; n < kIterations; n++) {
    if (pread(h, mem, kBlockBytes, static_cast<off_t>(offsets[n])) !=
        static_cast<ssize_t>(kBlockBytes))
      abort();
  }
  end = bench_clock::now();
  report("Read: ", begin, end);
  return 0;
 }

 #endif
	/* Demos the poor cached i/o performance on Windows

	Windows x64 with NTFS:

	memcpy: 91.7057ns per 1Kb
	Write: 1258.34ns per 1Kb
	Read: 1318ns per 1Kb

	Linux x64 with ext4:

	memcpy: 125.164ns per 1Kb
	Write: 298.724ns per 1Kb
	Read: 115.725ns per 1Kb

	*/

	#include <chrono>
	#include <iostream>
	#include <vector>

	#ifdef _WIN32
	#include <Windows.h>

	// Windows benchmark: times 1Mi memcpy calls of 1 KiB, then 1Mi cached
	// WriteFile and ReadFile calls of 1 KiB at pre-computed pseudo-random
	// offsets, and prints the mean nanoseconds per call for each. Aborts on any
	// failed or short transfer. Returns 0 on completion.
	int main() {
	  using bench_clock = std::chrono::high_resolution_clock;
	  constexpr DWORD kBlockBytes = 1024;        // size of one timed transfer
	  constexpr DWORD kChunkBytes = 1024 * 1024; // size of one preallocation write
	  constexpr size_t kIterations = 1024 * 1024;

	  // Warm up SpeedStep: spin for three seconds, measured from a fresh
	  // timestamp every time so the warm-up can never be skipped.
	  const auto warm_up = [] {
	    const auto start = bench_clock::now();
	    while (std::chrono::duration_cast<std::chrono::seconds>(
	               bench_clock::now() - start)
	               .count() < 3)
	      ;
	  };
	  // Prints the elapsed time divided by the 1024 * 1024 iterations.
	  const auto report = [](const char *label, bench_clock::time_point from,
	                         bench_clock::time_point to) {
	    const double ns_per_block =
	        std::chrono::duration_cast<std::chrono::nanoseconds>(to - from)
	            .count() /
	        1024.0 / 1024.0;
	    std::cout << label << ns_per_block << "ns per 1Kb" << std::endl;
	  };

	  std::cout << "Preallocating the test file ..." << std::endl;
	  // Get some aligned memory
	  void *mem = VirtualAlloc(NULL, kBlockBytes, MEM_COMMIT | MEM_RESERVE,
	                           PAGE_READWRITE);
	  if (mem == NULL)
	    abort();
	  memset(mem, '1', kBlockBytes);
	  // Work around the scheduler bug which badly affects blocking i/o
	  SetThreadAffinityMask(GetCurrentThread(), 1);
	  // Standard fully cached file. The W variant is called explicitly because
	  // the file name is a wide literal, which the CreateFile macro only accepts
	  // when UNICODE is defined.
	  HANDLE h = CreateFileW(L"testfile", GENERIC_READ | GENERIC_WRITE, 0, NULL,
	                         CREATE_ALWAYS, FILE_FLAG_DELETE_ON_CLOSE, NULL);
	  if (h == INVALID_HANDLE_VALUE)
	    abort();
	  // Preallocate to ensure later i/o is exclusively to/from page cache only.
	  // Setting the end of file is best-effort; the writes below extend the file
	  // regardless.
	  FILE_END_OF_FILE_INFO feofi;
	  feofi.EndOfFile.QuadPart = 1024 * 1024 * 1024;
	  SetFileInformationByHandle(h, FileEndOfFileInfo, &feofi, sizeof(feofi));
	  for (size_t n = 0; n < 1024; n++) {
	    static char zeros[1024 * 1024];
	    DWORD bytes = 0;
	    if (!WriteFile(h, zeros, kChunkBytes, &bytes, NULL) ||
	        bytes != kChunkBytes)
	      abort();
	  }
	  SetFilePointer(h, 0, NULL, FILE_BEGIN);
	  // Precalc randomised offsets: multiples of 64 below 1 MiB. The value is
	  // widened before multiplying so rand() * 64 cannot overflow int.
	  // NOTE(review): only the first 1 MiB of the 1 GiB file is targeted --
	  // confirm that this is the intended working set.
	  std::vector<uint64_t> offsets(kIterations);
	  for (size_t n = 0; n < kIterations; n++) {
	    offsets[n] = (static_cast<uint64_t>(rand()) * 64) % (1024 * 1024);
	  }
	  static char buffer[1024 * 1024 * 1024];
	  memset(buffer, 1, sizeof(buffer));
	  // Ensure background writing doesn't interfere with results
	  FlushFileBuffers(h);

	  // --- memcpy baseline ---
	  warm_up();
	  auto begin = bench_clock::now();
	  for (size_t n = 0; n < kIterations; n++) {
	    memcpy(buffer + n * 1024, mem, kBlockBytes);
	  }
	  auto end = bench_clock::now();
	  report("memcpy: ", begin, end);

	  // --- cached writes ---
	  warm_up();
	  begin = bench_clock::now();
	  for (size_t n = 0; n < kIterations; n++) {
	    DWORD bytes = 0;
	    OVERLAPPED ol;
	    memset(&ol, 0, sizeof(ol));
	    // The file position of the request goes in Offset/OffsetHigh; the
	    // Internal member is reserved for the system's status code.
	    ol.Offset = static_cast<DWORD>(offsets[n] & 0xffffffffu);
	    ol.OffsetHigh = static_cast<DWORD>(offsets[n] >> 32);
	    WriteFile(h, mem, kBlockBytes, &bytes, &ol);
	    if (bytes != kBlockBytes)
	      abort();
	  }
	  end = bench_clock::now();
	  report("Write: ", begin, end);

	  // --- cached reads ---
	  warm_up();
	  begin = bench_clock::now();
	  for (size_t n = 0; n < kIterations; n++) {
	    DWORD bytes = 0;
	    OVERLAPPED ol;
	    memset(&ol, 0, sizeof(ol));
	    ol.Offset = static_cast<DWORD>(offsets[n] & 0xffffffffu);
	    ol.OffsetHigh = static_cast<DWORD>(offsets[n] >> 32);
	    ReadFile(h, mem, kBlockBytes, &bytes, &ol);
	    if (bytes != kBlockBytes)
	      abort();
	  }
	  end = bench_clock::now();
	  report("Read: ", begin, end);
	  return 0;
	}

	#else

	#include <fcntl.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	// POSIX benchmark: times 1Mi memcpy calls of 1 KiB, then 1Mi cached pwrite
	// and pread calls of 1 KiB at pre-computed pseudo-random offsets, and prints
	// the mean nanoseconds per call for each. Aborts on any failed or short
	// transfer. Returns 0 on completion.
	int main() {
	  using bench_clock = std::chrono::high_resolution_clock;
	  constexpr size_t kBlockBytes = 1024;        // size of one timed transfer
	  constexpr size_t kChunkBytes = 1024 * 1024; // size of one preallocation write
	  constexpr size_t kIterations = 1024 * 1024;

	  // Warm up SpeedStep: spin for three seconds, measured from a fresh
	  // timestamp every time so the warm-up can never be skipped.
	  const auto warm_up = [] {
	    const auto start = bench_clock::now();
	    while (std::chrono::duration_cast<std::chrono::seconds>(
	               bench_clock::now() - start)
	               .count() < 3)
	      ;
	  };
	  // Prints the elapsed time divided by the 1024 * 1024 iterations.
	  const auto report = [](const char *label, bench_clock::time_point from,
	                         bench_clock::time_point to) {
	    const double ns_per_block =
	        std::chrono::duration_cast<std::chrono::nanoseconds>(to - from)
	            .count() /
	        1024.0 / 1024.0;
	    std::cout << label << ns_per_block << "ns per 1Kb" << std::endl;
	  };

	  std::cout << "Preallocating the test file ..." << std::endl;
	  // Get some aligned memory
	  void *mem = mmap(NULL, kBlockBytes, PROT_READ | PROT_WRITE,
	                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	  if (mem == MAP_FAILED)
	    abort();
	  memset(mem, '1', kBlockBytes);
	  // Standard fully cached file
	  int h = open("testfile", O_CREAT | O_RDWR | O_TRUNC, 0700);
	  if (h == -1)
	    abort();
	  // Drop the directory entry straight away so the 1 GiB file disappears when
	  // the descriptor is closed, mirroring FILE_FLAG_DELETE_ON_CLOSE in the
	  // Windows branch.
	  unlink("testfile");
	  // Preallocate to ensure later i/o is exclusively to/from page cache only
	  for (size_t n = 0; n < 1024; n++) {
	    static char zeros[1024 * 1024];
	    if (write(h, zeros, kChunkBytes) != static_cast<ssize_t>(kChunkBytes))
	      abort();
	  }
	  lseek(h, 0, SEEK_SET);
	  // Precalc randomised offsets: multiples of 64 below 1 MiB. The value is
	  // widened before multiplying; rand() * 64 in int overflows when RAND_MAX is
	  // 2^31-1 and can yield negative values, i.e. invalid file offsets.
	  // NOTE(review): only the first 1 MiB of the 1 GiB file is targeted --
	  // confirm that this is the intended working set.
	  std::vector<uint64_t> offsets(kIterations);
	  for (size_t n = 0; n < kIterations; n++) {
	    offsets[n] = (static_cast<uint64_t>(rand()) * 64) % (1024 * 1024);
	  }
	  static char buffer[1024 * 1024 * 1024];
	  memset(buffer, 1, sizeof(buffer));
	  // Ensure background writing doesn't interfere with results
	  fsync(h);

	  // --- memcpy baseline ---
	  warm_up();
	  auto begin = bench_clock::now();
	  for (size_t n = 0; n < kIterations; n++) {
	    memcpy(buffer + n * 1024, mem, kBlockBytes);
	  }
	  auto end = bench_clock::now();
	  report("memcpy: ", begin, end);

	  // --- cached writes ---
	  warm_up();
	  begin = bench_clock::now();
	  for (size_t n = 0; n < kIterations; n++) {
	    // A failed or short write would silently shorten the measured time.
	    if (pwrite(h, mem, kBlockBytes, static_cast<off_t>(offsets[n])) !=
	        static_cast<ssize_t>(kBlockBytes))
	      abort();
	  }
	  end = bench_clock::now();
	  report("Write: ", begin, end);

	  // --- cached reads ---
	  warm_up();
	  begin = bench_clock::now();
	  for (size_t n = 0; n < kIterations; n++) {
	    if (pread(h, mem, kBlockBytes, static_cast<off_t>(offsets[n])) !=
	        static_cast<ssize_t>(kBlockBytes))
	      abort();
	  }
	  end = bench_clock::now();
	  report("Read: ", begin, end);
	  return 0;
	}

	#endif