Last active
August 29, 2015 14:18
-
-
Save ybubnov/2c854122cc6b8a749ad1 to your computer and use it in GitHub Desktop.
Dodgson condensation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pycuda.autoinit | |
import pycuda.driver as drv | |
import numpy as np | |
import math | |
import time | |
from __future__ import division | |
from pycuda.compiler import SourceModule | |
""" Dodgson condensation """ | |
if drv.Device.count() < 0: | |
print("There is no cuda devices") | |
exit() | |
cuda_mod = SourceModule("""
/* Cross-determinants of two adjacent rows:
   a_dest[i] = a_row1[i]*a_row2[i+1] - a_row1[i+1]*a_row2[i].
   a_dest holds size-1 elements, hence the guard on j = i+1. */
__global__ void cuda_det_1(float *a_dest, const float *a_row1, const float *a_row2, const int size)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int j = i + 1;
    if (j < size) {
        a_dest[i] = a_row1[i]*a_row2[j] - a_row1[j]*a_row2[i];
    }
}
/* Elementwise division by the interior of the previous matrix:
   a_dest[i] = a_row1[i] / a_row2[i+1]. */
__global__ void cuda_det_2(float *a_dest, const float *a_row1, const float *a_row2, const int size)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int j = i + 1;
    /* BUGFIX: guard on j, not i. a_dest and a_row1 hold size-1
       elements, so the original `i < size` let the last thread read
       and write one element past the end of both buffers. */
    if (j < size) {
        a_dest[i] = a_row1[i] / a_row2[j];
    }
}
""")
# Use the public pycuda.driver alias rather than the private
# pycuda._driver module, which was only reachable via import side
# effects of pycuda.autoinit.
cuda_max_threads = drv.Device(0).get_attributes().get(
    drv.device_attribute.MAX_THREADS_PER_BLOCK)
cuda_det_1 = cuda_mod.get_function("cuda_det_1")
cuda_det_2 = cuda_mod.get_function("cuda_det_2")
def cuda_reduce(a, blocks, threads):
    """First condensation step: from an (x+1)x(y+1) matrix build the
    x-by-y matrix of 2x2 connected minors, row by row on the GPU."""
    rows, cols = a.shape[0] - 1, a.shape[1] - 1
    minors = np.zeros(shape=(rows, cols)).astype(np.float32)
    width = np.int32(cols + 1)
    for row in range(rows):
        cuda_det_1(drv.Out(minors[row]), drv.In(a[row]), drv.In(a[row + 1]),
                   width, block=(threads, 1, 1), grid=(blocks, 1, 1))
    return minors
def cuda_product(a, d, blocks, threads):
    """Condensation step with Dodgson's division: 2x2 minors of adjacent
    rows of `a`, divided elementwise by the interior of the previous
    matrix `d`. Returns the condensed matrix and `a` (which becomes the
    next `d`)."""
    rows, cols = a.shape[0] - 1, a.shape[1] - 1
    condensed = np.zeros(shape=(rows, cols)).astype(np.float32)
    width = np.int32(cols + 1)
    for row in range(rows):
        cuda_det_1(drv.Out(condensed[row]), drv.In(a[row]), drv.In(a[row + 1]),
                   width, block=(threads, 1, 1), grid=(blocks, 1, 1))
        cuda_det_2(drv.Out(condensed[row]), drv.In(condensed[row]), drv.In(d[row + 1]),
                   width, block=(threads, 1, 1), grid=(blocks, 1, 1))
    return condensed, a
def determinant(d):
    """Compute det(d) by repeated Dodgson condensation down to a 1x1
    matrix, whose single entry is returned."""
    def _launch_config(width):
        # One thread per output column, capped by the device limit.
        per_block = min(width, cuda_max_threads)
        return int(math.ceil(width / per_block)), per_block

    blocks, threads = _launch_config(d.shape[1])
    a = cuda_reduce(d, blocks, threads)
    while a.shape[0] > 1:
        blocks, threads = _launch_config(a.shape[1])
        a, d = cuda_product(a, d, blocks, threads)
    return a[0][0]
def note_time(function, params):
    """Run ``function(params)`` once; return (result, wall-clock seconds)."""
    started = time.time()
    outcome = function(params)
    return outcome, time.time() - started
if __name__ == "__main__":
    # 32x32 random matrix, float32 to match the CUDA kernels.
    a = np.random.normal(loc=1.0, size=(32, 32)).astype(np.float32)
    cuda_result, cuda_time = note_time(determinant, a)
    cpu_result, cpu_time = note_time(np.linalg.det, a)
    # BUGFIX: the original printed `comm.rank`, but no `comm` (an MPI
    # communicator leftover) is defined anywhere in this file, so the
    # script always died with NameError. Report the matrix size instead.
    print("Size=%s: cuda(%s)=%s, cpu(%s)=%s" %
          (a.shape[0], cuda_time, cuda_result, cpu_time, cpu_result))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment