I was comparing inference times for the same input using PyTorch and ONNX Runtime, and I find that ONNX Runtime is actually slower on GPU while being significantly faster on CPU.
I was trying this on Windows 10.
- ONNX Runtime installed from - source
- ONNX Runtime version - 1.11.0 (onnx version 1.10.1)
- Python version - 3.8.12
- CUDA/cuDNN version - CUDA 11.5, cuDNN 8.2
- GPU model and memory - Quadro M2000M, 4 GB
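As a sanity check that the GPU build is actually installed, ONNX Runtime's own introspection calls can be used; this is just a minimal sketch and not part of the benchmark:
import onnxruntime
print(onnxruntime.get_device())                # 'GPU' for a CUDA-enabled build
print(onnxruntime.get_available_providers())   # should include 'CUDAExecutionProvider'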
Relevant code -
import torch
from torchvision import models
import onnxruntime  # to run inference on ONNX models, we use ONNX Runtime
import onnx
import os
import time

batch_size = 1
total_samples = 1000
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


def convert_to_onnx(resnet):
    resnet.eval()
    dummy_input = (torch.randn(batch_size, 3, 224, 224, device=device)).to(device=device)
    input_names = ['input']
    output_names = ['output']
    torch.onnx.export(resnet,
                      dummy_input,
                      "resnet18.onnx",
                      verbose=True,
                      opset_version=13,
                      input_names=input_names,
                      output_names=output_names,
                      export_params=True,
                      do_constant_folding=True,
                      dynamic_axes={'input': {0: 'batch_size'},    # variable length axes
                                    'output': {0: 'batch_size'}})


def infer_pytorch(resnet):
    print('Pytorch Inference')
    print('==========================')
    print()

    x = torch.randn((batch_size, 3, 224, 224))
    x = x.to(device=device)

    latency = []
    for i in range(total_samples):
        t0 = time.time()
        resnet.eval()
        with torch.no_grad():
            out = resnet(x)
        latency.append(time.time() - t0)

    print('Number of runs:', len(latency))
    print("Average PyTorch {} Inference time = {} ms".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))


def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()


def infer_onnxruntime():
    print('Onnxruntime Inference')
    print('==========================')
    print()

    onnx_model = onnx.load("resnet18.onnx")
    onnx.checker.check_model(onnx_model)

    # Input
    x = torch.randn((batch_size, 3, 224, 224))
    x = x.to(device=device)
    x = to_numpy(x)

    so = onnxruntime.SessionOptions()
    so.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
    so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

    exproviders = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    model_onnx_path = os.path.join(".", "resnet18.onnx")
    ort_session = onnxruntime.InferenceSession(model_onnx_path, so, providers=exproviders)

    options = ort_session.get_provider_options()
    cuda_options = options['CUDAExecutionProvider']
    cuda_options['cudnn_conv_use_max_workspace'] = '1'
    ort_session.set_providers(['CUDAExecutionProvider'], [cuda_options])

    # IOBinding
    input_names = ort_session.get_inputs()[0].name
    output_names = ort_session.get_outputs()[0].name
    io_binding = ort_session.io_binding()
    io_binding.bind_cpu_input(input_names, x)
    io_binding.bind_output(output_names, device)

    # warm-up run
    ort_session.run_with_iobinding(io_binding)
    ort_outs = io_binding.copy_outputs_to_cpu()

    latency = []
    for i in range(total_samples):
        t0 = time.time()
        ort_session.run_with_iobinding(io_binding)
        latency.append(time.time() - t0)
    ort_outs = io_binding.copy_outputs_to_cpu()

    print('Number of runs:', len(latency))
    print("Average onnxruntime {} Inference time = {} ms".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))


if __name__ == '__main__':
    torch.cuda.empty_cache()
    resnet = (models.resnet18(pretrained=True)).to(device=device)
    convert_to_onnx(resnet)
    infer_onnxruntime()
    infer_pytorch(resnet)
Output
If run on CPU,
Average onnxruntime cpu Inference time = 18.48 ms
Average PyTorch cpu Inference time = 51.74 ms
but, if run on GPU, I see
Average onnxruntime cuda Inference time = 47.89 ms
Average PyTorch cuda Inference time = 8.94 ms
If I change the graph optimization level to onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL, I see some improvement in inference time on GPU, but it's still slower than PyTorch.
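For reference, that change only swaps the optimization level in the session options; the rest of the setup above stays the same (just a sketch of the relevant lines):
so = onnxruntime.SessionOptions()
so.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL  # instead of ORT_ENABLE_ALL
ort_session = onnxruntime.InferenceSession(model_onnx_path, so, providers=exproviders)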
I use IO binding for the input tensor (a NumPy array on the host), and the nodes of the model are placed on the GPU.
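For completeness, binding a device-resident input instead (so the host-to-device copy happens once, outside the timed loop) would look roughly like this; this is a sketch of what the IO-binding API allows, not something I have measured above:
x_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(x, 'cuda', 0)  # copy the input to the GPU once
io_binding = ort_session.io_binding()
io_binding.bind_ortvalue_input(input_names, x_ortvalue)
io_binding.bind_output(output_names, 'cuda')  # let ONNX Runtime allocate the output on the GPU
ort_session.run_with_iobinding(io_binding)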
Further, during the onnxruntime run I print device usage stats and I see this -
Using device: cuda:0
GPU Device name: Quadro M2000M
Memory Usage:
Allocated: 0.1 GB
Cached: 0.1 GB
So, the GPU device is being used.
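Those stats come from PyTorch's memory introspection; a snippet along these lines produces output in that format (this is a reconstruction from the printout rather than the exact code, and the 0 below assumes GPU index 0):
print('Using device:', device)
if device.type == 'cuda':
    print('GPU Device name:', torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 1024 ** 3, 1), 'GB')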
Further, I have used the resnet18.onnx model from the ONNX Model Zoo to rule out a conversion issue, but I get the same results.
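That check just points the session at the downloaded file instead of my exported one; the filename below is a placeholder for whatever the downloaded Model Zoo file is called:
zoo_model_path = os.path.join(".", "resnet18_zoo.onnx")  # placeholder name for the Model Zoo download
onnx.checker.check_model(onnx.load(zoo_model_path))
ort_session = onnxruntime.InferenceSession(zoo_model_path, so, providers=exproviders)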
What am I doing wrong or missing here?
You may want to drop the torch.cuda.empty_cache() call, as it will slow down your code for no gain: discuss.pytorch.org/t/… – Protestant