Skip to content

Instantly share code, notes, and snippets.

@supplient
Created November 6, 2021 08:55
Show Gist options
  • Save supplient/cf657df60332a198992602f6452fd532 to your computer and use it in GitHub Desktop.
Save supplient/cf657df60332a198992602f6452fd532 to your computer and use it in GitHub Desktop.
并行算法:反向scan,将片段偏移量转化为片段索引
#include <Windows.h>
// cuda_helpers.h is my own repo
// refer to https://github.com/supplient/CudaHelper
#define CUDA_HELPER_NAMESPACE cuh
#include <cuda_helpers.h>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <string>
#include <unordered_map>
#include <random>
#include <iostream>
#include <functional>
#include <vector>
#include <numeric>
using namespace std;
namespace kernel {
__global__ void UsingSeqFill(uint32_t* B, const uint32_t* A, size_t n) {
auto ti = LINEAR_THREAD_ID;
if (ti >= n)
return;
uint32_t end = A[ti];
uint32_t begin = ti == 0 ? 0 : A[ti - 1];
thrust::fill_n(thrust::seq, B + begin, end - begin, ti);
}
__global__ void UsingParFill(uint32_t* B, const uint32_t* A, size_t n) {
auto ti = LINEAR_THREAD_ID;
if (ti >= n)
return;
uint32_t end = A[ti];
uint32_t begin = ti == 0 ? 0 : A[ti - 1];
thrust::fill_n(thrust::device, B + begin, end - begin, ti);
}
}
size_t UsingSeqFill(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) {
thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp);
kernel::UsingSeqFill KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n);
return cuh::GetEntry(d_tmp, n - 1);
}
size_t UsingParFill(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) {
thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp);
kernel::UsingParFill KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n);
return cuh::GetEntry(d_tmp, n - 1);
}
namespace kernel {
__global__ void SetMark(uint32_t* B, const uint32_t* A, size_t n) {
auto ti = LINEAR_THREAD_ID;
if (ti >= n-1)
return;
B[A[ti]] = 1;
}
}
size_t LinearAlgo(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) {
thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp);
auto m = cuh::GetEntry(d_tmp, n - 1);
CheckCuda(cudaMemsetAsync(d_B, 0, sizeof(uint32_t) * m));
kernel::SetMark KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n);
thrust::inclusive_scan(thrust::device, d_B, d_B + m, d_B);
return m;
}
typedef size_t(*WorkFunc)(uint32_t*, const uint32_t*, size_t, uint32_t*);
/// <summary>
/// Test function, including correctness testing.
/// </summary>
/// <param name="f">: Function to test.</param>
/// <param name="seed">: A seed for random.</param>
/// <returns>Time in ms.</returns>
double Benchmark(WorkFunc f, uint32_t seed) {
constexpr uint32_t minSegSize = 1;
constexpr uint32_t maxSegSize = 500;
constexpr size_t n = 1000;
constexpr size_t ITER_N = 10000;
default_random_engine generator(seed);
uniform_int_distribution<uint32_t> dist(minSegSize, maxSegSize);
auto rint = bind(dist, generator);
vector<uint32_t> A(n); {
for (size_t i = 0; i < n; i++)
A[i] = rint();
}
uint32_t m = accumulate(A.begin(), A.end(), 0);
vector<uint32_t> B(m); {
size_t bi = 0;
for (uint32_t ai = 0; ai < n; ai++) {
uint32_t count = A[ai];
do {
B[bi] = ai;
bi++;
count--;
} while (count > 0);
}
assert(bi == m);
}
vector<uint32_t> resB(m);
size_t resSize;
cuh::TempStorage<uint32_t> d_A;
cuh::TempStorage<uint32_t> d_B;
cuh::TempStorage<uint32_t> d_tmp;
d_A.CheckAndAlloc(n);
d_B.CheckAndAlloc(m);
d_tmp.CheckAndAlloc(n);
cuh::Copy<uint32_t>(d_A, A.data(), A.size());
LARGE_INTEGER start, end;
QueryPerformanceCounter(&start);
for(size_t i=0; i<ITER_N; i++)
resSize = f(d_B, d_A, n, d_tmp);
QueryPerformanceCounter(&end);
cuh::Copy<uint32_t>(resB.data(), d_B, m);
assert(resSize == m);
for (size_t i = 0; i < resSize; i++)
assert(resB[i] == B[i]);
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
double delta = double(end.QuadPart - start.QuadPart) / double(freq.QuadPart * ITER_N) * 1000.0;
return delta;
}
int main() {
uint32_t seed = 0x46af87ecL;
// Warmup
cout << "Warm up...\n";
Benchmark(UsingSeqFill, seed);
Benchmark(UsingParFill, seed);
Benchmark(LinearAlgo, seed);
cout << "Calculating...\n";
unordered_map<string, double> algo2time;
algo2time["SeqFill" ] = Benchmark(UsingSeqFill, seed);
algo2time["ParFill" ] = Benchmark(UsingParFill, seed);
algo2time["Linear" ] = Benchmark(LinearAlgo, seed);
for (auto& pair : algo2time) {
printf("%s: %fms\n", pair.first.c_str(), pair.second);
}
return 0;
}
@supplient
Copy link
Author

Output on RTX 2060, i7 10700

Warm up...
Calculating...
SeqFill: 0.097737ms
ParFill: 0.096750ms
Linear: 0.077204ms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment