When a beginner like me runs open-source deep-learning code, prediction is usually done with the repo's predict.py and the exported pth model file. The pth route has a drawback, though: the model has to be reloaded on every run, and it is not easy to port. So I wanted to deploy with ONNX instead, and this post records the process.
pth2onnx
import torch
from ptsemseg.models import get_model
from ptsemseg.utils import convert_state_dict

# Select device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Names passed to torch.onnx.export for the graph's input and output tensors;
# the same names are used later as keys when calling onnxruntime
input_names = ['input']
output_names = ['output']

# Build the model and load the trained weights (without this step the
# exported ONNX graph would contain randomly initialized parameters)
model = get_model('pspnet', 150, version='ade20k')
state = torch.load('../../pspnet_50_ade20k.pth', map_location={'cuda:0': 'cpu'})
model.load_state_dict(convert_state_dict(state['model_state']))
model = model.eval().to(device)

# Dummy input tensor; its shape fixes the input size of the exported graph
x = torch.randn(1, 3, 256, 256).to(device)

# Export to ONNX
torch.onnx.export(model, x, "pspnet256.onnx", verbose=True,
                  input_names=input_names, output_names=output_names,
                  opset_version=11)
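A quick sanity check is worth doing right after export. This is a minimal sketch, assuming the onnx and onnxruntime packages are installed; it continues from the script above (reusing model and x), validates the exported graph, and compares the ONNX Runtime output against the PyTorch output on the same dummy input:

import onnx
import onnxruntime as ort
import numpy as np

# Structural check of the exported graph
onnx_model = onnx.load("pspnet256.onnx")
onnx.checker.check_model(onnx_model)

# Run the same dummy input through ONNX Runtime and compare with PyTorch
ort_session = ort.InferenceSession("pspnet256.onnx")
onnx_out = ort_session.run(None, {'input': x.cpu().numpy()})[0]
with torch.no_grad():
    torch_out = model(x).cpu().numpy()
print("max abs diff:", np.abs(onnx_out - torch_out).max())

A small maximum difference (on the order of 1e-5) suggests the export preserved the model's behavior.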
preonnx
Following the repo's pth-based predict.py, I created a new preonnx.py that runs prediction from the ONNX file instead.
The core difference between the two files is the conversion between torch tensors and numpy arrays, plus the accompanying dimension reordering.
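As a minimal sketch of those conversions (the array names here are illustrative, not taken from the scripts below):

import numpy as np
import torch

# HWC uint8 image (what cv2.imread returns) -> NCHW float32 tensor
img_hwc = np.zeros((473, 473, 3), dtype=np.float32)
tensor_nchw = torch.from_numpy(img_hwc.transpose(2, 0, 1)).unsqueeze(0)

# NCHW tensor -> numpy HWC, dropping the batch dimension again
back_to_numpy = tensor_nchw.squeeze(0).numpy().transpose(1, 2, 0)
assert back_to_numpy.shape == img_hwc.shape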
Predicting with the pth file
import numpy as np
from skimage.transform import resize
import cv2
import torch
import time
from ptsemseg.models import get_model
from ptsemseg.utils import convert_state_dict

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
weight_path = "../../pspnet_50_ade20k.pth"

def color_map(N=256, normalized=False):
    """
    Return Color Map in PASCAL VOC format (rgb)
    \param N (int) number of classes
    \param normalized (bool) whether colors are normalized (float 0-1)
    \return (Nx3 numpy array) a color map
    """
    def bitget(byteval, idx):
        return ((byteval & (1 << idx)) != 0)
    dtype = 'float32' if normalized else 'uint8'
    cmap = np.zeros((N, 3), dtype=dtype)
    for i in range(N):
        r = g = b = 0
        c = i
        for j in range(8):
            r = r | (bitget(c, 0) << 7 - j)
            g = g | (bitget(c, 1) << 7 - j)
            b = b | (bitget(c, 2) << 7 - j)
            c = c >> 3
        cmap[i] = np.array([r, g, b])
    cmap = cmap / 255.0 if normalized else cmap
    return cmap

def decode_segmap(temp, n_classes, cmap):
    """
    Given an image of class predictions, produce a bgr8 image with class colors
    \param temp (2d numpy int array) input image with semantic classes (as integer)
    \param n_classes (int) number of classes
    \param cmap (Nx3 numpy array) input color map
    \return (numpy array bgr8) the decoded image with class colors
    """
    r = temp.copy()
    g = temp.copy()
    b = temp.copy()
    for l in range(0, n_classes):
        r[temp == l] = cmap[l, 0]
        g[temp == l] = cmap[l, 1]
        b[temp == l] = cmap[l, 2]
    bgr = np.zeros((temp.shape[0], temp.shape[1], 3))
    bgr[:, :, 0] = b
    bgr[:, :, 1] = g
    bgr[:, :, 2] = r
    return bgr.astype(np.uint8)

def predict_max(img):
    """
    Do semantic prediction for max fusion
    \param img (numpy array rgb8)
    """
    class_probs = predict(img)
    # Take best prediction and confidence
    pred_confidence, pred_label = class_probs.max(1)
    pred_confidence = pred_confidence.squeeze(0).cpu().numpy()
    pred_label = pred_label.squeeze(0).cpu().numpy()
    # order = 0 is nearest-neighbour, which preserves integer class labels
    pred_label = resize(pred_label, (375, 500), order=0, mode='reflect',
                        anti_aliasing=False, preserve_range=True)
    pred_label = pred_label.astype(np.int64)
    # Add semantic color
    semantic_color = decode_segmap(pred_label, 150, cmap)
    pred_confidence = resize(pred_confidence, (375, 500), mode='reflect',
                             anti_aliasing=True, preserve_range=True)
    return (semantic_color, pred_confidence)

def predict(img):
    """
    Do semantic segmentation
    \param img: (numpy array bgr8) The input cv image
    """
    img = img.copy()  # Make a copy because the method modifies the image
    # Prepare image: resize to the CNN input size, then subtract the dataset
    # mean. No further normalization is applied.
    img = resize(img, cnn_input_size, mode='reflect', anti_aliasing=True,
                 preserve_range=True)  # Gives float64
    img = img.astype(np.float32)
    img -= mean
    # Convert HWC -> CHW
    img = img.transpose(2, 0, 1)
    # Convert to tensor and add the batch dimension required by the CNN
    img = torch.tensor(img, dtype=torch.float32)
    img = img.unsqueeze(0)
    with torch.no_grad():
        img = img.to(device)
        # Do inference
        since = time.time()
        outputs = model(img)  # N, C, H, W
        print("inference time: %.3f s" % (time.time() - since))
        # Apply softmax to obtain normalized probabilities
        outputs = torch.nn.functional.softmax(outputs, 1)
    return outputs

if __name__ == '__main__':
    cmap = color_map(N=150, normalized=False)
    color_img = cv2.imread("./include/2007_000661.jpg")
    cnn_input_size = (473, 473)
    mean = np.array([104.00699, 116.66877, 122.67892])  # Mean value of the dataset
    model = get_model("pspnet", 150, version='ade20k')
    state = torch.load(weight_path, map_location={'cuda:0': 'cpu'})
    model.load_state_dict(convert_state_dict(state['model_state']))  # Remove 'module' from dictionary keys
    model = model.eval().to(device)
    semantic_color, pred_confidence = predict_max(color_img)
    while True:
        cv2.imshow("semantic_color", semantic_color)
        if cv2.waitKey(0) & 0xFF == ord('q'):
            break
Predicting with the ONNX file
import numpy as np
from skimage.transform import resize
import cv2
import onnxruntime as ort

# Load the ONNX model
onnx_model_path = "./pspnet473.onnx"
ort_session = ort.InferenceSession(onnx_model_path)

def color_map(N=256, normalized=False):
    """
    Return Color Map in PASCAL VOC format (rgb)
    \param N (int) number of classes
    \param normalized (bool) whether colors are normalized (float 0-1)
    \return (Nx3 numpy array) a color map
    """
    def bitget(byteval, idx):
        return ((byteval & (1 << idx)) != 0)
    dtype = 'float32' if normalized else 'uint8'
    cmap = np.zeros((N, 3), dtype=dtype)
    for i in range(N):
        r = g = b = 0
        c = i
        for j in range(8):
            r = r | (bitget(c, 0) << 7 - j)
            g = g | (bitget(c, 1) << 7 - j)
            b = b | (bitget(c, 2) << 7 - j)
            c = c >> 3
        cmap[i] = np.array([r, g, b])
    cmap = cmap / 255.0 if normalized else cmap
    return cmap

def decode_segmap(temp, n_classes, cmap):
    """
    Given an image of class predictions, produce a bgr8 image with class colors
    \param temp (2d numpy int array) input image with semantic classes (as integer)
    \param n_classes (int) number of classes
    \param cmap (Nx3 numpy array) input color map
    \return (numpy array bgr8) the decoded image with class colors
    """
    r = temp.copy()
    g = temp.copy()
    b = temp.copy()
    for l in range(0, n_classes):
        r[temp == l] = cmap[l, 0]
        g[temp == l] = cmap[l, 1]
        b[temp == l] = cmap[l, 2]
    bgr = np.zeros((temp.shape[0], temp.shape[1], 3))
    bgr[:, :, 0] = b
    bgr[:, :, 1] = g
    bgr[:, :, 2] = r
    return bgr.astype(np.uint8)

if __name__ == '__main__':
    cmap = color_map(N=150, normalized=False)
    color_img = cv2.imread("./2007_000661.jpg")
    # Resize image to match the model's input size (473 for this export;
    # 256 for the pspnet256.onnx export)
    input_size = (473, 473)
    color_img_resized = cv2.resize(color_img, input_size)
    # Preprocess the image: subtract the dataset mean, HWC -> CHW, add batch dim
    img = color_img_resized.astype(np.float32)
    img -= np.array([104.00699, 116.66877, 122.67892])  # Mean value of the dataset
    img = img.transpose(2, 0, 1)
    img = np.expand_dims(img, axis=0)
    # Perform inference; the key 'input' must match input_names used at export time
    outputs = ort_session.run(None, {'input': img})
    class_probs = outputs[0]
    # Process the output to get the per-pixel label and confidence
    pred_confidence, pred_label = np.max(class_probs, axis=1), np.argmax(class_probs, axis=1)
    pred_label = pred_label.squeeze(0)
    pred_confidence = pred_confidence.squeeze(0)
    pred_label = resize(pred_label, (375, 500), order=0, mode='reflect',
                        anti_aliasing=False, preserve_range=True)
    pred_label = pred_label.astype(np.int64)
    semantic_color = decode_segmap(pred_label, 150, cmap)
    pred_confidence = resize(pred_confidence, (375, 500), mode='reflect',
                             anti_aliasing=True, preserve_range=True)
    # Display the result
    while True:
        cv2.imshow("semantic_color", semantic_color)
        if cv2.waitKey(0) & 0xFF == ord('q'):
            break
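If the run() call fails with an invalid-input-name error, the names actually baked into the ONNX file can be listed from the session; a small check that works with any InferenceSession:

# The keys passed to ort_session.run must match the names recorded at export time
for inp in ort_session.get_inputs():
    print("input:", inp.name, inp.shape)
for out in ort_session.get_outputs():
    print("output:", out.name, out.shape)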
A successful case
Predicting with pth (not the focus here)
The relevant code (pay attention to the tensor dimension changes and how they are handled):
def detect_image(self, image, count=False, name_classes=None):
    # Convert the image to RGB here to avoid errors when predicting on
    # grayscale images. The code only supports RGB prediction; every other
    # image type is converted to RGB first.
    image = cvtColor(image)
    # Back up the input image; it is used later for blending
    old_img = copy.deepcopy(image)
    orininal_h = np.array(image).shape[0]
    orininal_w = np.array(image).shape[1]
    # Pad the image with gray bars for a distortion-free resize
    # (a plain resize would also work)
    image_data, nw, nh = resize_image(image, (self.input_shape[1], self.input_shape[0]))
    # Add the batch_size dimension
    image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, np.float32)), (2, 0, 1)), 0)
    with torch.no_grad():
        images = torch.from_numpy(image_data)
        if self.cuda:
            images = images.cuda()
        # Feed the image through the network
        pr = self.net(images)[0]  # tensor: [21, 512, 512]
        # Move the class dimension last with pr.permute(1, 2, 0), apply
        # softmax, and convert to numpy
        pr = F.softmax(pr.permute(1, 2, 0), dim=-1).cpu().numpy()  # numpy: [512, 512, 21]
        # Crop away the gray bars
        pr = pr[int((self.input_shape[0] - nh) // 2): int((self.input_shape[0] - nh) // 2 + nh),
                int((self.input_shape[1] - nw) // 2): int((self.input_shape[1] - nw) // 2 + nw)]
        # Resize back to the original image size
        pr = cv2.resize(pr, (orininal_w, orininal_h), interpolation=cv2.INTER_LINEAR)
        # Take the class of each pixel
        pr = pr.argmax(axis=-1)  # numpy: [512, 512]
    seg_img = np.reshape(np.array(self.colors, np.uint8)[np.reshape(pr, [-1])], [orininal_h, orininal_w, -1])
    # Convert the result to a PIL Image
    image = Image.fromarray(np.uint8(seg_img))
    # Blend the new image with the original image
    image = Image.blend(old_img, image, 0.7)
    return image
Inference time:
Predicting with ONNX (complete)
The relevant code (pay attention to the data types involved and how they are operated on):
import time
import copy
import cv2
import numpy as np
import onnxruntime as ort
from PIL import Image
from utils.utils import cvtColor, preprocess_input, resize_image

name_classes = ["background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow",
                "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train",
                "tvmonitor"]

# Load the ONNX model
onnx_model_path = "./model_data/models.onnx"
ort_session = ort.InferenceSession(onnx_model_path)
mix_type = 0
colors = [(0, 0, 0), (128, 0, 0), (0, 128, 0), (128, 128, 0), (0, 0, 128), (128, 0, 128), (0, 128, 128),
          (128, 128, 128), (64, 0, 0), (192, 0, 0), (64, 128, 0), (192, 128, 0), (64, 0, 128), (192, 0, 128),
          (64, 128, 128), (192, 128, 128), (0, 64, 0), (128, 64, 0), (0, 192, 0), (128, 192, 0), (0, 64, 128),
          (128, 64, 12)]

def onnx_detect(image):
    image = cvtColor(image)
    old_img = copy.deepcopy(image)
    orininal_h = np.array(image).shape[0]
    orininal_w = np.array(image).shape[1]
    image_data, nw, nh = resize_image(image, (512, 512))
    image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, np.float32)), (2, 0, 1)), 0)
    outputs = ort_session.run(None, {'images': image_data})
    # Pull out the prediction
    pr = outputs[0]  # numpy: [1, 21, 512, 512]
    # Drop the redundant batch dimension with np.squeeze
    pr = np.squeeze(pr, axis=0)  # numpy: [21, 512, 512]
    # Reorder the dimensions so the class dimension comes last
    pr = np.transpose(pr, (1, 2, 0))  # numpy: [512, 512, 21]
    # Softmax over the class dimension
    pr = np.exp(pr) / np.sum(np.exp(pr), axis=-1, keepdims=True)
    # Crop away the gray bars
    pr = pr[int((512 - nh) // 2): int((512 - nh) // 2 + nh),
            int((512 - nw) // 2): int((512 - nw) // 2 + nw)]
    # Resize back to the original image size
    pr = cv2.resize(pr, (orininal_w, orininal_h), interpolation=cv2.INTER_LINEAR)
    # Take the class of each pixel
    pr = pr.argmax(axis=-1)  # numpy: [512, 512]
    seg_img = np.reshape(np.array(colors, np.uint8)[np.reshape(pr, [-1])], [orininal_h, orininal_w, -1])
    # Convert the result to a PIL Image
    image = Image.fromarray(np.uint8(seg_img))
    # Blend the new image with the original image
    image = Image.blend(old_img, image, 0.7)
    return image

while True:
    frame = cv2.imread("/home/pc/文档/deeplabv3-plus-pytorch-main/img/street.jpg")
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # Convert to a PIL Image
    frame = Image.fromarray(np.uint8(frame))
    # Run detection
    frame = np.array(onnx_detect(frame))
    # RGB -> BGR to match OpenCV's display format
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    cv2.imshow("video", frame)
    if cv2.waitKey(1) & 0xff == ord('q'):
        break
Explanation of the approach
1. First, resize the input image to the size the model requires, (512, 512). resize_image pads with gray bars instead of distorting the aspect ratio, and returns the actual content size (nw, nh); a sketch of what it does follows the code line below.
image_data, nw, nh = resize_image(image, (512, 512))
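For reference, a minimal sketch of a letterbox-style resize_image. This is my reconstruction under the assumption that it scales the image to fit, pastes it centered on a gray canvas, and returns the new content size (nw, nh), which the crop in step 7 relies on; the real implementation lives in utils.utils:

from PIL import Image

def resize_image_sketch(image, size):
    # Scale to fit inside `size` while keeping the aspect ratio
    iw, ih = image.size
    w, h = size
    scale = min(w / iw, h / ih)
    nw, nh = int(iw * scale), int(ih * scale)
    image = image.resize((nw, nh), Image.BICUBIC)
    # Paste centered onto a gray canvas (these are the "gray bars")
    new_image = Image.new('RGB', size, (128, 128, 128))
    new_image.paste(image, ((w - nw) // 2, (h - nh) // 2))
    return new_image, nw, nh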
2. Preprocess the image data and convert it into the format the model expects. np.transpose(..., (2, 0, 1)) transposes the preprocessed image: the raw image is laid out as (height, width, channels), while the model wants (channels, height, width). np.expand_dims(..., 0) then adds a new leading dimension, turning the single image into a batch of one, since the model consumes batched input.
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, np.float32)), (2, 0, 1)), 0)
3. Run ONNX inference. The returned outputs is a list; pr = outputs[0] extracts the actual prediction, which is already a numpy array. The key 'images' must be the input_name used when the model was exported to ONNX.
outputs = ort_session.run(None, {'images': image_data})
pr = outputs[0]
4. Adjust the dimensions of pr by removing the redundant first (batch) dimension. pr is now [21, 512, 512], i.e. [classes, height, width].
pr = np.squeeze(pr, axis=0)
5. Transpose from [classes, height, width] to [height, width, classes].
pr = np.transpose(pr, (1, 2, 0))
6. Normalize with a softmax along the class dimension (a note on numerical stability follows below).
pr = np.exp(pr) / np.sum(np.exp(pr), axis=-1, keepdims=True)
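One caveat I would add: np.exp can overflow for large logits. A numerically stable variant, equivalent up to floating-point error, subtracts the per-pixel maximum first:

# Stable softmax: subtracting the max leaves the result unchanged because
# exp(x - m) / sum(exp(x - m)) == exp(x) / sum(exp(x))
pr = pr - pr.max(axis=-1, keepdims=True)
pr = np.exp(pr) / np.sum(np.exp(pr), axis=-1, keepdims=True)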
7. Crop the prediction so that only the real image content, not the gray bars, remains. int((512 - nh) // 2) is the starting row: the content of height nh was pasted centered into the 512-pixel input, so the gray bar above it is (512 - nh) // 2 pixels tall (integer division // keeps the result an integer). int((512 - nh) // 2 + nh) is the ending row, the start plus the content height nh. The same logic gives the starting and ending columns from the content width nw.
pr = pr[int((512 - nh) // 2): int((512 - nh) // 2 + nh), int((512 - nw) // 2): int((512 - nw) // 2 + nw)]
8. Resize the prediction back to the original image size.
pr = cv2.resize(pr, (orininal_w, orininal_h), interpolation=cv2.INTER_LINEAR)
9. Take the class of each pixel: axis=-1 is the last dimension, i.e. the class dimension, and argmax picks the class with the highest confidence as the label for each pixel.
pr = pr.argmax(axis=-1)
10. Convert the prediction (pr) into a color image (seg_img); a toy example of this indexing trick follows the code line below. In detail: np.reshape(pr, [-1]) flattens the prediction into a 1-D array, with [-1] letting numpy infer the length. np.array(colors, np.uint8) turns the color list into a numpy array of dtype uint8. Indexing that array with the flattened prediction, np.array(colors, np.uint8)[np.reshape(pr, [-1])], looks up the color for every pixel, yielding a color array the same length as the prediction. Finally, np.reshape(..., [orininal_h, orininal_w, -1]) restores the original height and width, with -1 inferring the channel count. Altogether, this renders the class predictions as a color image for visualizing the segmentation result.
seg_img = np.reshape(np.array(colors, np.uint8)[np.reshape(pr, [-1])], [orininal_h, orininal_w, -1])
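A tiny self-contained illustration of that fancy-indexing lookup (toy values, not from the script):

import numpy as np

palette = np.array([(0, 0, 0), (128, 0, 0), (0, 128, 0)], np.uint8)  # 3 classes
labels = np.array([[0, 1], [2, 1]])                                  # 2x2 label map
colored = palette[labels.reshape(-1)].reshape(2, 2, 3)
print(colored[0, 1])  # -> [128   0   0], the color of class 1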
11. Convert the color image (seg_img) into a PIL.Image object (image). np.uint8(seg_img) casts the array to uint8 so pixel values lie in the 0-255 range, and Image.fromarray(...) builds a PIL.Image from the numpy array, which can then be displayed, saved, or processed further.
image = Image.fromarray(np.uint8(seg_img))
12. Blend the original image with the new one.
image = Image.blend(old_img, image, 0.7)
Without this line, the result looks like this:
Inference time:
Accelerating inference with OpenVINO (something seems off here; I saw little to no speedup)
Code
import time
import copy
import cv2
import numpy as np
from PIL import Image
from openvino.inference_engine import IECore
from utils.utils import cvtColor, preprocess_input, resize_image

name_classes = ["background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow",
                "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train",
                "tvmonitor"]

# Load the OpenVINO IR model
ir_model_path = "./model_data/models.xml"
ie = IECore()
net = ie.read_network(model=ir_model_path)
exec_net = ie.load_network(network=net, device_name="CPU")
mix_type = 0
colors = [(0, 0, 0), (128, 0, 0), (0, 128, 0), (128, 128, 0), (0, 0, 128), (128, 0, 128), (0, 128, 128),
          (128, 128, 128), (64, 0, 0), (192, 0, 0), (64, 128, 0), (192, 128, 0), (64, 0, 128), (192, 0, 128),
          (64, 128, 128), (192, 128, 128), (0, 64, 0), (128, 64, 0), (0, 192, 0), (128, 192, 0), (0, 64, 128),
          (128, 64, 12)]

def openvino_detect(image):
    image = cvtColor(image)
    old_img = copy.deepcopy(image)
    orininal_h = np.array(image).shape[0]
    orininal_w = np.array(image).shape[1]
    image_data, nw, nh = resize_image(image, (512, 512))
    image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, np.float32)), (2, 0, 1)), 0)
    # Look up the network's input and output blob names
    input_blob = next(iter(net.input_info))
    output_blob = next(iter(net.outputs))
    res = exec_net.infer(inputs={input_blob: image_data})
    pr = res[output_blob]
    pr = np.squeeze(pr, axis=0)  # numpy: [21, 512, 512]
    # Reorder the dimensions so the class dimension comes last
    pr = np.transpose(pr, (1, 2, 0))  # numpy: [512, 512, 21]
    pr = np.exp(pr) / np.sum(np.exp(pr), axis=-1, keepdims=True)
    pr = pr[int((512 - nh) // 2): int((512 - nh) // 2 + nh), int((512 - nw) // 2): int((512 - nw) // 2 + nw)]
    pr = cv2.resize(pr, (orininal_w, orininal_h), interpolation=cv2.INTER_LINEAR)
    pr = pr.argmax(axis=-1)
    seg_img = np.reshape(np.array(colors, np.uint8)[np.reshape(pr, [-1])], [orininal_h, orininal_w, -1])
    image = Image.fromarray(np.uint8(seg_img))
    # image = Image.blend(old_img, image, 0.7)
    return image

if __name__ == "__main__":
    while True:
        frame = cv2.imread("/home/pc/文档/deeplabv3-plus-pytorch-main/img/street.jpg")
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = Image.fromarray(np.uint8(frame))
        start_time = time.time()
        frame = np.array(openvino_detect(frame))
        end_time = time.time()
        print(f"Inference time: {end_time - start_time} s")
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        cv2.imshow("video", frame)
        if cv2.waitKey(1) & 0xff == ord('q'):
            break
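One thing possibly worth trying for the missing speedup: the IECore API used above is OpenVINO's legacy interface. A minimal sketch of the same load-and-infer step on the newer openvino.runtime Core API (this assumes OpenVINO 2022 or later, which I have not tested here), which also exposes performance hints; image_data stands for the preprocessed batch from the script above:

from openvino.runtime import Core

core = Core()
model = core.read_model("./model_data/models.xml")
# The LATENCY/THROUGHPUT performance hints let the runtime pick its own
# threading and batching configuration for the target
compiled = core.compile_model(model, "CPU", {"PERFORMANCE_HINT": "LATENCY"})
infer = compiled.create_infer_request()
result = infer.infer({0: image_data})  # first input, addressed by index
pr = result[compiled.output(0)]        # numpy: [1, 21, 512, 512]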
Output
Inference time:
Summary
When predicting with pth, the flow is: run the usual torch forward pass to get a tensor output, permute its dimensions, apply softmax, convert to numpy, and derive the final result. When predicting with ONNX, the flow instead goes through onnxruntime: outputs = ort_session.run(None, {'images': image_data}), where 'images' is the input_name used when converting to ONNX and image_data is the input image; pr = outputs[0] then extracts the prediction. At that point pr is already a numpy array, so the rest simply mirrors the pth post-processing using the corresponding numpy functions.
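To make the contrast concrete, a minimal side-by-side sketch; net, ort_session, and image_data stand for the objects defined in the scripts above:

# pth route: torch ops end-to-end, convert to numpy at the very end
with torch.no_grad():
    pr = net(torch.from_numpy(image_data))[0]    # tensor: [21, 512, 512]
    pr = F.softmax(pr.permute(1, 2, 0), dim=-1)  # tensor: [512, 512, 21]
    pr = pr.cpu().numpy()

# ONNX route: numpy in, numpy out; mirror the same steps with numpy ops
pr = ort_session.run(None, {'images': image_data})[0]  # numpy: [1, 21, 512, 512]
pr = np.transpose(np.squeeze(pr, axis=0), (1, 2, 0))   # numpy: [512, 512, 21]
pr = np.exp(pr) / np.sum(np.exp(pr), axis=-1, keepdims=True)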