Last active
August 29, 2015 14:18
-
-
Save ybubnov/2c854122cc6b8a749ad1 to your computer and use it in GitHub Desktop.
Dodgson condensation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pycuda.autoinit | |
import pycuda.driver as drv | |
import numpy as np | |
import math | |
import time | |
from __future__ import division | |
from pycuda.compiler import SourceModule | |
""" Dodgson condensation """ | |
if drv.Device.count() < 0: | |
print("There is no cuda devices") | |
exit() | |
cuda_mod = SourceModule("""
/* Cross-determinants of two adjacent rows:
   a_dest[i] = a_row1[i]*a_row2[i+1] - a_row1[i+1]*a_row2[i].
   a_dest holds size-1 elements, hence the guard on j = i+1. */
__global__ void cuda_det_1(float *a_dest, const float *a_row1, const float *a_row2, const int size)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int j = i + 1;
    if (j < size) {
        a_dest[i] = a_row1[i]*a_row2[j] - a_row1[j]*a_row2[i];
    }
}
/* Elementwise division by the interior of the previous matrix:
   a_dest[i] = a_row1[i] / a_row2[i+1]. */
__global__ void cuda_det_2(float *a_dest, const float *a_row1, const float *a_row2, const int size)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int j = i + 1;
    /* BUGFIX: guard on j, not i. a_dest and a_row1 hold size-1
       elements, so the original `i < size` let the last thread read
       and write one element past the end of both buffers. */
    if (j < size) {
        a_dest[i] = a_row1[i] / a_row2[j];
    }
}
""")
# Use the public pycuda.driver alias rather than the private
# pycuda._driver module, which was only reachable via import side
# effects of pycuda.autoinit.
cuda_max_threads = drv.Device(0).get_attributes().get(
    drv.device_attribute.MAX_THREADS_PER_BLOCK)
cuda_det_1 = cuda_mod.get_function("cuda_det_1")
cuda_det_2 = cuda_mod.get_function("cuda_det_2")
def cuda_reduce(a, blocks, threads):
    """First condensation step: from an (x+1)x(y+1) matrix build the
    x-by-y matrix of 2x2 connected minors, row by row on the GPU."""
    rows, cols = a.shape[0] - 1, a.shape[1] - 1
    minors = np.zeros(shape=(rows, cols)).astype(np.float32)
    width = np.int32(cols + 1)
    for row in range(rows):
        cuda_det_1(drv.Out(minors[row]), drv.In(a[row]), drv.In(a[row + 1]),
                   width, block=(threads, 1, 1), grid=(blocks, 1, 1))
    return minors
def cuda_product(a, d, blocks, threads):
    """Condensation step with Dodgson's division: 2x2 minors of adjacent
    rows of `a`, divided elementwise by the interior of the previous
    matrix `d`. Returns the condensed matrix and `a` (which becomes the
    next `d`)."""
    rows, cols = a.shape[0] - 1, a.shape[1] - 1
    condensed = np.zeros(shape=(rows, cols)).astype(np.float32)
    width = np.int32(cols + 1)
    for row in range(rows):
        cuda_det_1(drv.Out(condensed[row]), drv.In(a[row]), drv.In(a[row + 1]),
                   width, block=(threads, 1, 1), grid=(blocks, 1, 1))
        cuda_det_2(drv.Out(condensed[row]), drv.In(condensed[row]), drv.In(d[row + 1]),
                   width, block=(threads, 1, 1), grid=(blocks, 1, 1))
    return condensed, a
def determinant(d):
    """Compute det(d) by repeated Dodgson condensation down to a 1x1
    matrix, whose single entry is returned."""
    def _launch_config(width):
        # One thread per output column, capped by the device limit.
        per_block = min(width, cuda_max_threads)
        return int(math.ceil(width / per_block)), per_block

    blocks, threads = _launch_config(d.shape[1])
    a = cuda_reduce(d, blocks, threads)
    while a.shape[0] > 1:
        blocks, threads = _launch_config(a.shape[1])
        a, d = cuda_product(a, d, blocks, threads)
    return a[0][0]
def note_time(function, params):
    """Run ``function(params)`` once; return (result, wall-clock seconds)."""
    started = time.time()
    outcome = function(params)
    return outcome, time.time() - started
if __name__ == "__main__":
    # 32x32 random matrix, float32 to match the CUDA kernels.
    a = np.random.normal(loc=1.0, size=(32, 32)).astype(np.float32)
    cuda_result, cuda_time = note_time(determinant, a)
    cpu_result, cpu_time = note_time(np.linalg.det, a)
    # BUGFIX: the original printed `comm.rank`, but no `comm` (an MPI
    # communicator leftover) is defined anywhere in this file, so the
    # script always died with NameError. Report the matrix size instead.
    print("Size=%s: cuda(%s)=%s, cpu(%s)=%s" %
          (a.shape[0], cuda_time, cuda_result, cpu_time, cpu_result))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment