supplient · November 6, 2021 08:55 · supplient · Nov 6, 2021
diff --git a/offset2index.cpp b/offset2index.cpp
 #include <Windows.h>

 // cuda_helpers.h is my own repo
 // refer to https://github.com/supplient/CudaHelper
 #define CUDA_HELPER_NAMESPACE cuh
 #include <cuda_helpers.h>

 #include <thrust/scan.h>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>

 #include <string>
 #include <unordered_map>
 #include <random>
 #include <iostream>
 #include <functional>
 #include <vector>
 #include <numeric>
 using namespace std;


 namespace kernel {
 	__global__ void UsingSeqFill(uint32_t* B, const uint32_t* A, size_t n) {
 		auto ti = LINEAR_THREAD_ID;
 		if (ti >= n)
 			return;
 		uint32_t end = A[ti];
 		uint32_t begin = ti == 0 ? 0 : A[ti - 1];

 		thrust::fill_n(thrust::seq, B + begin, end - begin, ti);
 	}

 	__global__ void UsingParFill(uint32_t* B, const uint32_t* A, size_t n) {
 		auto ti = LINEAR_THREAD_ID;
 		if (ti >= n)
 			return;
 		uint32_t end = A[ti];
 		uint32_t begin = ti == 0 ? 0 : A[ti - 1];

 		thrust::fill_n(thrust::device, B + begin, end - begin, ti);
 	}
 }

 size_t UsingSeqFill(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) {
 	thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp);

 	kernel::UsingSeqFill KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n);

 	return cuh::GetEntry(d_tmp, n - 1);
 }

 size_t UsingParFill(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) {
 	thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp);

 	kernel::UsingParFill KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n);

 	return cuh::GetEntry(d_tmp, n - 1);
 }

 namespace kernel {
 	__global__ void SetMark(uint32_t* B, const uint32_t* A, size_t n) {
 		auto ti = LINEAR_THREAD_ID;
 		if (ti >= n-1)
 			return;
 		B[A[ti]] = 1;
 	}
 }

 size_t LinearAlgo(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) {
 	thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp);

 	auto m = cuh::GetEntry(d_tmp, n - 1);
 	CheckCuda(cudaMemsetAsync(d_B, 0, sizeof(uint32_t) * m));

 	kernel::SetMark KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n);

 	thrust::inclusive_scan(thrust::device, d_B, d_B + m, d_B);

 	return m;
 }




 typedef size_t(*WorkFunc)(uint32_t*, const uint32_t*, size_t, uint32_t*);

 /// <summary>
 /// Test function, including correctness testing.
 /// </summary>
 /// <param name="f">: Function to test.</param>
 /// <param name="seed">: A seed for random.</param>
 /// <returns>Time in ms.</returns>
 double Benchmark(WorkFunc f, uint32_t seed) {
 	constexpr uint32_t minSegSize = 1;
 	constexpr uint32_t maxSegSize = 500;
 	constexpr size_t n = 1000;
 	constexpr size_t ITER_N = 10000;

 	default_random_engine generator(seed);
 	uniform_int_distribution<uint32_t> dist(minSegSize, maxSegSize);
 	auto rint = bind(dist, generator);

 	vector<uint32_t> A(n); { 
 		for (size_t i = 0; i < n; i++) 
 			A[i] = rint(); 
 	}
 	uint32_t m = accumulate(A.begin(), A.end(), 0);
 	vector<uint32_t> B(m); {
 		size_t bi = 0;
 		for (uint32_t ai = 0; ai < n; ai++) {
 			uint32_t count = A[ai];
 			do {
 				B[bi] = ai;
 				bi++;
 				count--;
 			} while (count > 0);
 		}
 		assert(bi == m);
 	}
 	
 	vector<uint32_t> resB(m);
 	size_t resSize;

 	cuh::TempStorage<uint32_t> d_A;
 	cuh::TempStorage<uint32_t> d_B;
 	cuh::TempStorage<uint32_t> d_tmp;
 	d_A.CheckAndAlloc(n);
 	d_B.CheckAndAlloc(m);
 	d_tmp.CheckAndAlloc(n);
 	cuh::Copy<uint32_t>(d_A, A.data(), A.size());

 	LARGE_INTEGER start, end;
 	QueryPerformanceCounter(&start);
 	for(size_t i=0; i<ITER_N; i++)
 		resSize = f(d_B, d_A, n, d_tmp);
 	QueryPerformanceCounter(&end);
 	
 	cuh::Copy<uint32_t>(resB.data(), d_B, m);
 	assert(resSize == m);
 	for (size_t i = 0; i < resSize; i++)
 		assert(resB[i] == B[i]);

 	LARGE_INTEGER freq;
 	QueryPerformanceFrequency(&freq);
 	double delta = double(end.QuadPart - start.QuadPart) / double(freq.QuadPart * ITER_N) * 1000.0;
 	return delta;
 }


 int main() {
 	uint32_t seed = 0x46af87ecL;
 	
 	// Warmup
 	cout << "Warm up...\n";
 	Benchmark(UsingSeqFill,	seed);
 	Benchmark(UsingParFill,	seed);
 	Benchmark(LinearAlgo,	seed);
 	
 	cout << "Calculating...\n";
 	unordered_map<string, double> algo2time;
 	algo2time["SeqFill"	] =	 Benchmark(UsingSeqFill,	seed);
 	algo2time["ParFill"	] =  Benchmark(UsingParFill,	seed);
 	algo2time["Linear"	] =  Benchmark(LinearAlgo,		seed);

 	for (auto& pair : algo2time) {
 		printf("%s: %fms\n", pair.first.c_str(), pair.second);
 	}

 	return 0;
 }
	#include <Windows.h>

	// cuda_helpers.h is my own repo
	// refer to https://github.com/supplient/CudaHelper
	#define CUDA_HELPER_NAMESPACE cuh
	#include <cuda_helpers.h>

	#include <thrust/scan.h>
	#include <thrust/execution_policy.h>
	#include <thrust/fill.h>

	#include <string>
	#include <unordered_map>
	#include <random>
	#include <iostream>
	#include <functional>
	#include <vector>
	#include <numeric>
	using namespace std;


	namespace kernel {
	__global__ void UsingSeqFill(uint32_t* B, const uint32_t* A, size_t n) {
	auto ti = LINEAR_THREAD_ID;
	if (ti >= n)
	return;
	uint32_t end = A[ti];
	uint32_t begin = ti == 0 ? 0 : A[ti - 1];

	thrust::fill_n(thrust::seq, B + begin, end - begin, ti);
	}

	__global__ void UsingParFill(uint32_t* B, const uint32_t* A, size_t n) {
	auto ti = LINEAR_THREAD_ID;
	if (ti >= n)
	return;
	uint32_t end = A[ti];
	uint32_t begin = ti == 0 ? 0 : A[ti - 1];

	thrust::fill_n(thrust::device, B + begin, end - begin, ti);
	}
	}

	size_t UsingSeqFill(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) {
	thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp);

	kernel::UsingSeqFill KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n);

	return cuh::GetEntry(d_tmp, n - 1);
	}

	size_t UsingParFill(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) {
	thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp);

	kernel::UsingParFill KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n);

	return cuh::GetEntry(d_tmp, n - 1);
	}

	namespace kernel {
	__global__ void SetMark(uint32_t* B, const uint32_t* A, size_t n) {
	auto ti = LINEAR_THREAD_ID;
	if (ti >= n-1)
	return;
	B[A[ti]] = 1;
	}
	}

	size_t LinearAlgo(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) {
	thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp);

	auto m = cuh::GetEntry(d_tmp, n - 1);
	CheckCuda(cudaMemsetAsync(d_B, 0, sizeof(uint32_t) * m));

	kernel::SetMark KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n);

	thrust::inclusive_scan(thrust::device, d_B, d_B + m, d_B);

	return m;
	}




	typedef size_t(WorkFunc)(uint32_t, const uint32_t, size_t, uint32_t);

	/// <summary>
	/// Test function, including correctness testing.
	/// </summary>
	/// <param name="f">: Function to test.</param>
	/// <param name="seed">: A seed for random.</param>
	/// <returns>Time in ms.</returns>
	double Benchmark(WorkFunc f, uint32_t seed) {
	constexpr uint32_t minSegSize = 1;
	constexpr uint32_t maxSegSize = 500;
	constexpr size_t n = 1000;
	constexpr size_t ITER_N = 10000;

	default_random_engine generator(seed);
	uniform_int_distribution<uint32_t> dist(minSegSize, maxSegSize);
	auto rint = bind(dist, generator);

	vector<uint32_t> A(n); {
	for (size_t i = 0; i < n; i++)
	A[i] = rint();
	}
	uint32_t m = accumulate(A.begin(), A.end(), 0);
	vector<uint32_t> B(m); {
	size_t bi = 0;
	for (uint32_t ai = 0; ai < n; ai++) {
	uint32_t count = A[ai];
	do {
	B[bi] = ai;
	bi++;
	count--;
	} while (count > 0);
	}
	assert(bi == m);
	}

	vector<uint32_t> resB(m);
	size_t resSize;

	cuh::TempStorage<uint32_t> d_A;
	cuh::TempStorage<uint32_t> d_B;
	cuh::TempStorage<uint32_t> d_tmp;
	d_A.CheckAndAlloc(n);
	d_B.CheckAndAlloc(m);
	d_tmp.CheckAndAlloc(n);
	cuh::Copy<uint32_t>(d_A, A.data(), A.size());

	LARGE_INTEGER start, end;
	QueryPerformanceCounter(&start);
	for(size_t i=0; i<ITER_N; i++)
	resSize = f(d_B, d_A, n, d_tmp);
	QueryPerformanceCounter(&end);

	cuh::Copy<uint32_t>(resB.data(), d_B, m);
	assert(resSize == m);
	for (size_t i = 0; i < resSize; i++)
	assert(resB[i] == B[i]);

	LARGE_INTEGER freq;
	QueryPerformanceFrequency(&freq);
	double delta = double(end.QuadPart - start.QuadPart) / double(freq.QuadPart * ITER_N) * 1000.0;
	return delta;
	}


	int main() {
	uint32_t seed = 0x46af87ecL;

	// Warmup
	cout << "Warm up...\n";
	Benchmark(UsingSeqFill, seed);
	Benchmark(UsingParFill, seed);
	Benchmark(LinearAlgo, seed);

	cout << "Calculating...\n";
	unordered_map<string, double> algo2time;
	algo2time["SeqFill" ] = Benchmark(UsingSeqFill, seed);
	algo2time["ParFill" ] = Benchmark(UsingParFill, seed);
	algo2time["Linear" ] = Benchmark(LinearAlgo, seed);

	for (auto& pair : algo2time) {
	printf("%s: %fms\n", pair.first.c_str(), pair.second);
	}

	return 0;
	}