-
-
Save sandeepkumar-skb/b634fa4a53e2e19ea08c3721fa045fb9 to your computer and use it in GitHub Desktop.
Accelerating inference of an ONNX model with TensorRT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time

import numpy as np
import pycuda.autoinit  # noqa: F401 -- imported for its side effect: creates a CUDA context
import pycuda.driver as cuda
import tensorrt as trt

# Path to the serialized ONNX model that will be converted to a TensorRT engine.
model_path = "model.onnx"
# Square spatial size (H = W) of the network input; the model is fed (1, 3, 32, 32).
input_size = 32
# Shared logger for builder and parser; WARNING level suppresses verbose build chatter.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def build_engine(model_path):
    """Build a TensorRT engine from an ONNX model file.

    NOTE(review): `max_workspace_size` / `max_batch_size` / `build_cuda_engine`
    and implicit-batch networks target TensorRT 6/7 (matching the 19.09 docker
    image noted at the bottom of this file); TensorRT 8+ removed these APIs.

    Args:
        model_path: Filesystem path to the serialized ONNX model.

    Returns:
        A trt.ICudaEngine on success, or None if parsing/building fails.
    """
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network() as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 20  # 1 MiB scratch space for tactic selection
        builder.max_batch_size = 1
        with open(model_path, "rb") as f:
            # parse() returns False on failure; without this check a bad model
            # would silently yield engine == None from build_cuda_engine().
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                return None
        engine = builder.build_cuda_engine(network)
        return engine
def alloc_buf(engine):
    """Allocate host/device buffers and a stream for the engine's two bindings.

    Binding 0 is treated as the input and binding 1 as the output.

    Returns:
        (in_cpu, out_cpu, in_gpu, out_gpu, stream) -- pinned host arrays,
        device allocations of matching byte size, and a fresh CUDA stream.
    """
    # Element counts and numpy dtypes derived from the engine's binding metadata.
    in_count = trt.volume(engine.get_binding_shape(0))
    out_count = trt.volume(engine.get_binding_shape(1))
    in_dtype = trt.nptype(engine.get_binding_dtype(0))
    out_dtype = trt.nptype(engine.get_binding_dtype(1))
    # Page-locked host memory so host<->device copies can use DMA.
    in_cpu = cuda.pagelocked_empty(in_count, in_dtype)
    out_cpu = cuda.pagelocked_empty(out_count, out_dtype)
    # Device allocations sized to match the host buffers.
    in_gpu = cuda.mem_alloc(in_cpu.nbytes)
    out_gpu = cuda.mem_alloc(out_cpu.nbytes)
    stream = cuda.Stream()
    return in_cpu, out_cpu, in_gpu, out_gpu, stream
def inference(engine, context, inputs, out_cpu, in_gpu, out_gpu, stream):
    """Run one synchronous inference pass and return the filled host output buffer.

    An asynchronous variant would instead use cuda.memcpy_htod_async /
    context.execute_async / cuda.memcpy_dtoh_async on `stream`, followed by
    stream.synchronize(); the sync path below is simpler and sufficient here.
    """
    cuda.memcpy_htod(in_gpu, inputs)                  # host -> device copy of the input
    context.execute(1, [int(in_gpu), int(out_gpu)])   # batch size 1, raw binding addresses
    cuda.memcpy_dtoh(out_cpu, out_gpu)                # device -> host copy of the result
    return out_cpu
if __name__ == "__main__":
    # Random NCHW input; the engine was built with max_batch_size = 1.
    inputs = np.random.random((1, 3, input_size, input_size)).astype(np.float32)
    engine = build_engine(model_path)
    context = engine.create_execution_context()
    # Allocate buffers ONCE, outside the timing loop. The original called
    # alloc_buf() on every iteration, leaking GPU memory (cuda.mem_alloc is
    # never freed) and counting allocation cost in the reported latency.
    in_cpu, out_cpu, in_gpu, out_gpu, stream = alloc_buf(engine)
    for _ in range(10):
        t1 = time.time()
        res = inference(engine, context, inputs.reshape(-1), out_cpu, in_gpu, out_gpu, stream)
        print(res)
        print("cost time: ", time.time() - t1)
    # tensorrt docker image: docker pull nvcr.io/nvidia/tensorrt:19.09-py3 (See: https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt/tags)
    # NOTE: cuda driver >= 418
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment