Skip to content

Instantly share code, notes, and snippets.

@wheremyfoodat
Last active August 16, 2024 10:11
Show Gist options
  • Save wheremyfoodat/24c1aaa1c2c1338356b33cc9a47c9681 to your computer and use it in GitHub Desktop.
Save wheremyfoodat/24c1aaa1c2c1338356b33cc9a47c9681 to your computer and use it in GitHub Desktop.
Nokotan Markov Chain CUDA kernel
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <ctime>
static constexpr size_t blockCount = 32;
static constexpr size_t threadsPerBlock = 128;
static constexpr size_t totalThreadCount = threadsPerBlock * blockCount;
using RngState = curandState_t;
enum State : uint32_t {
Shi_1 = 0,
Ka,
No_1,
Ko_1,
No_2,
Ko_2,
No_3,
Ko_3,
Ko_4,
Shi_2,
Tan_1,
Tan_2,
End,
TotalStates,
};
// Globally state for synchronizing between kernel invocations
struct NokoMetrics {
int count;
int finished;
int initialSeed;
void reset() {
count = 0;
finished = 0;
std::srand(std::time(nullptr));
initialSeed = rand();
}
};
__device__ State transition(State state, RngState* rng) {
// The probability of each state to transition into the correct next state
static constexpr float transitionProbabilities[State::TotalStates] = {
0.5f, // Shi_1
1.0f, // Ka
1.0f, // No_1
0.5f, // Ko_1
1.0f, // No_2
0.5f, // Ko_2
1.0f, // No_3
0.25f, // Ko_3
0.25f, // Ko_4
0.5f, // Shi_2
1.0f, // Tan_1
0.5f, // Tan_2
1.0f, // (End)
};
float rand = curand_uniform(rng);
float probability = transitionProbabilities[state];
// Properly transitioned to next state
if (rand < probability) {
return State(state + 1);
} else {
// Back to the start otherwise
return State::Shi_1;
}
}
__global__ void nokotanKernel(NokoMetrics* kernelState, RngState* rngStates) {
int threadID = blockIdx.x * blockDim.x + threadIdx.x;
// Initialize RNG for this thread
RngState* rng = &rngStates[threadID];
curand_init(kernelState->initialSeed, threadID, 0, rng);
State state = State::Shi_1;
while (true) {
// Some other invocation already completed the task so stop
if (kernelState->finished) {
break;
}
// This invocation completed the task, signal that it's finished to the others.
if (state == State::End) {
kernelState->finished = 1;
break;
}
else if (state == State::Shi_1) {
atomicAdd(&kernelState->count, 1);
}
state = transition(state, rng);
}
}
int main() {
// Initialize our counter and RNG state for each thread
NokoMetrics* kernelState = nullptr;
RngState* rngStates = nullptr;
cudaMallocHost(&kernelState, sizeof(NokoMetrics));
cudaMalloc(&rngStates, sizeof(RngState) * totalThreadCount);
kernelState->reset();
// Initialize a CUDA stream so we can properly sync with our kernel
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
// Run kernel, wait for it to finish and print result
nokotanKernel<<<blockCount, threadsPerBlock, 0, stream>>>(kernelState, rngStates);
cudaStreamSynchronize(stream);
printf("It took %d runs to reproduce the Nokotan theme song\n", kernelState->count);
return 0;
}
BasedOnStyle: Google
IndentWidth: 4
ColumnLimit: 150
AccessModifierOffset: -2
TabWidth: 4
NamespaceIndentation: All
UseTab: ForContinuationAndIndentation
AllowShortEnumsOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: true
AllowShortIfStatementsOnASingleLine: true
Cpp11BracedListStyle: true
PackConstructorInitializers: BinPack
AlignAfterOpenBracket: BlockIndent
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment