一、参考资料
如何测试模型的推理速度
Pytorch 测试模型的推理速度
二、计算PyTorch模型推理时间
1. 计算CPU推理时间
import torch
import torchvision
import time
import tqdm
from torchsummary import summary
def calcCPUTime(model=None, num_iterations=1000, warmup=100, input_shape=(1, 3, 224, 224)):
    """Benchmark CPU inference latency and throughput of a PyTorch model.

    Args:
        model: a ``torch.nn.Module`` to benchmark. Defaults to a fresh
            torchvision ``resnet18`` (torchvision is imported lazily so it
            is only required when no model is supplied).
        num_iterations: number of timed forward passes.
        warmup: number of untimed warm-up passes before measuring.
        input_shape: shape of the random dummy input tensor; index 0 is
            treated as the batch size.

    Returns:
        dict with keys ``"fps"``, ``"elapsed_time_ms"`` (average wall time
        per image over the whole timed loop, in ms) and
        ``"avg_forward_time_ms"`` (mean of the individually timed forward
        passes, in ms). The same values are also printed.
    """
    if model is None:
        import torchvision  # lazy: only needed for the default model
        model = torchvision.models.resnet18()
    model.eval()

    dummy_input = torch.randn(*input_shape)
    batch_size = dummy_input.shape[0]

    # Warm-up passes let allocators/caches settle before timing.
    print('warm up ...\n')
    with torch.no_grad():
        for _ in range(warmup):
            _ = model(dummy_input)

    print('testing ...\n')
    total_forward_time = 0.0  # accumulated per-pass time, in milliseconds
    try:
        import tqdm  # progress bar is cosmetic; fall back to plain range
        iterator = tqdm.tqdm(range(num_iterations))
    except ImportError:
        iterator = range(num_iterations)

    start_event = time.time()
    with torch.no_grad():
        for _ in iterator:
            start_forward_time = time.time()
            _ = model(dummy_input)
            end_forward_time = time.time()
            total_forward_time += (end_forward_time - start_forward_time) * 1000

    end_event = time.time()
    elapsed_time = end_event - start_event  # time.time() already yields seconds
    fps = num_iterations / elapsed_time
    elapsed_time_ms = elapsed_time / (num_iterations * batch_size)
    avg_forward_time = total_forward_time / (num_iterations * batch_size)

    print(f"FPS: {fps}")
    print("elapsed_time_ms:", elapsed_time_ms * 1000)
    print(f"Avg Forward Time per Image: {avg_forward_time} ms")
    return {
        "fps": fps,
        "elapsed_time_ms": elapsed_time_ms * 1000,
        "avg_forward_time_ms": avg_forward_time,
    }


if __name__ == "__main__":
    calcCPUTime()
输出结果
warm up ...
testing ...
100%|██████████| 1000/1000 [00:09<00:00, 102.13it/s]
FPS: 102.11109490533485
elapsed_time_ms: 9.793255090713501
Avg Forward Time per Image: 9.777164697647095 ms
CPU资源占用情况
2. 计算GPU推理时间
方法一
import torch
import torchvision
import time
import tqdm
from torchsummary import summary
def calcGPUTime():
    """Benchmark GPU inference latency of resnet18 with wall-clock timing.

    Requires a CUDA device. CUDA kernel launches are asynchronous: without
    ``torch.cuda.synchronize()`` a ``time.time()`` pair only measures the
    launch overhead, not the kernel execution, so every timestamp below is
    taken after an explicit synchronization.
    """
    model = torchvision.models.resnet18()
    model.cuda()
    model.eval()
    # summary(model, input_size=(3, 224, 224), device="cuda")
    dummy_input = torch.randn(1, 3, 224, 224).cuda()
    num_iterations = 1000  # number of timed iterations

    # Warm up: the GPU may sit in a low-power state, and the first passes
    # pay one-off costs (cuDNN algorithm selection, memory allocation).
    print('warm up ...\n')
    with torch.no_grad():
        for _ in range(100):
            _ = model(dummy_input)
    torch.cuda.synchronize()  # drain warm-up work before timing starts

    print('testing ...\n')
    total_forward_time = 0.0  # accumulated per-pass time, in milliseconds
    start_event = time.time()
    with torch.no_grad():
        for _ in tqdm.tqdm(range(num_iterations)):
            start_forward_time = time.time()
            _ = model(dummy_input)
            torch.cuda.synchronize()  # wait for the kernel to finish
            end_forward_time = time.time()
            total_forward_time += (end_forward_time - start_forward_time) * 1000

    end_event = time.time()
    elapsed_time = end_event - start_event  # seconds
    fps = num_iterations / elapsed_time
    elapsed_time_ms = elapsed_time / (num_iterations * dummy_input.shape[0])
    avg_forward_time = total_forward_time / (num_iterations * dummy_input.shape[0])
    print(f"FPS: {fps}")
    print("elapsed_time_ms:", elapsed_time_ms * 1000)
    print(f"Avg Forward Time per Image: {avg_forward_time} ms")


if __name__ == "__main__":
    calcGPUTime()
输出结果
warm up ...
testing ...
100%|██████████| 1000/1000 [00:01<00:00, 727.79it/s]
FPS: 727.1527832145586
elapsed_time_ms: 1.375226806640625
Avg Forward Time per Image: 1.3709843158721924 ms
GPU资源占用情况
方法二
import torch
import torchvision
import numpy as np
import tqdm
def calcGPUTime():
    """Measure resnet18 GPU inference latency with CUDA events.

    Uses ``torch.cuda.Event(enable_timing=True)`` pairs — the officially
    recommended way to time GPU work, since events are recorded on the CUDA
    stream itself and are unaffected by host-side launch asynchrony.
    """
    device = 'cuda:0'
    model = torchvision.models.resnet18().to(device)
    model.eval()

    repetitions = 1000
    dummy_input = torch.rand(1, 3, 224, 224).to(device)

    # Warm up: the GPU may be in a power-saving state, so run some
    # untimed passes first.
    print('warm up ...\n')
    with torch.no_grad():
        for _ in range(100):
            _ = model(dummy_input)

    # Block the host until all queued GPU work has drained.
    torch.cuda.synchronize()

    # Stream-side timing events, recorded around each forward pass.
    t_start = torch.cuda.Event(enable_timing=True)
    t_end = torch.cuda.Event(enable_timing=True)

    # Per-iteration latency buffer, one slot per repetition.
    latencies_ms = np.zeros((repetitions, 1))

    print('testing ...\n')
    with torch.no_grad():
        for i in tqdm.tqdm(range(repetitions)):
            t_start.record()
            _ = model(dummy_input)
            t_end.record()
            torch.cuda.synchronize()  # wait so elapsed_time() is valid
            # elapsed_time() returns milliseconds between the two events.
            latencies_ms[i] = t_start.elapsed_time(t_end)

    avg = latencies_ms.sum() / repetitions
    print('\navg={}\n'.format(avg))


if __name__ == '__main__':
    calcGPUTime()
输出结果
warm up ...
testing ...
100%|██████████| 1000/1000 [00:01<00:00, 627.50it/s]
avg=1.4300348817110062
GPU资源占用情况