paddle模型使用TensorRT推理
- 1 模型末端添加softmax和argmax算子
- 2 paddle模型转onnx模型
- 3 onnx模型转TensorRT模型
- 3.1 安装TensorRT-8.5.3.1
- 3.2 使用 trtexec 将onnx模型编译优化导出为engine模型
- 4 TensorRT模型推理测试
- 5 完整代码
- 6 测试结果
1 模型末端添加softmax和argmax算子
前文《PaddleSeg c++部署OCRNet+HRNet模型》中的语义分割模型输出为float32类型,模型不含softmax和argmax处理,导致在项目应用过程中后处理耗时较高。
通过PaddleSeg/tools/export.py在网络末端增加softmax和argmax算子,解决应用中的后处理耗时问题。
参考文档PaddleSeg/docs/model_export_cn.md导出预测模型,并将导出的预测模型文件保存在output/inference_model文件夹中,如下。模型输出类型为int32。
./output/inference_model
├── deploy.yaml # 部署相关的配置文件,主要说明数据预处理的方式
├── model.pdmodel # 预测模型的拓扑结构文件
├── model.pdiparams # 预测模型的权重文件
└── model.pdiparams.info # 参数额外信息,一般无需关注
网络输出类型为int32。
python tools/export.py \
--config configs\ocrnet\ocrnet_hrnetw18_cityscapes_1024x512_160k_lovasz_softmax.yml \
--model_path output\iter_12000\model.pdparams \
--save_dir output\inference_model \
--output_op argmax
PaddleSeg v2.0以前的export.py中不含argmax和softmax参数选项,可通过以下代码在模型末端增加softmax和argmax算子。
import argparse
import os
import paddle
import yaml
from paddleseg.cvlibs import Config
from paddleseg.utils import logger
def parse_args():
    """Parse the command-line options of the export script.

    Returns:
        argparse.Namespace: with the attributes
            ``cfg`` (value of the required ``--config`` option),
            ``save_dir`` (value of ``--save_dir``, ``'./output'`` by default) and
            ``model_path`` (value of ``--model_path``, ``None`` by default).
    """
    parser = argparse.ArgumentParser(description='Model export.')
    # Options of the export step.
    parser.add_argument(
        "--config",
        dest="cfg",
        help="The config file.",
        default=None,
        type=str,
        required=True)
    parser.add_argument(
        '--save_dir',
        dest='save_dir',
        help='The directory for saving the model snapshot',
        type=str,
        default='./output')
    parser.add_argument(
        '--model_path',
        dest='model_path',
        help='The path of model for evaluation',
        type=str,
        default=None)
    return parser.parse_args()
class SavedSegmentationNet(paddle.nn.Layer):
    """Wrap a network together with a post-processing layer.

    The wrapper runs ``net`` and then passes its outputs through a
    ``PostPorcesser`` built from the two flags, so both steps are part of
    one layer.

    Args:
        net: The layer whose outputs are post-processed.
        without_argmax (bool): Forwarded to ``PostPorcesser``; when True the
            argmax step is skipped.
        with_softmax (bool): Forwarded to ``PostPorcesser``; when True a
            softmax step is applied.
    """

    def __init__(self, net, without_argmax=False, with_softmax=False):
        super().__init__()
        self.net = net
        self.post_processer = PostPorcesser(without_argmax, with_softmax)

    def forward(self, x):
        """Run the wrapped network on ``x`` and post-process its outputs."""
        outs = self.net(x)
        outs = self.post_processer(outs)
        return outs
class PostPorcesser(paddle.nn.Layer):
    """Optional softmax / argmax post-processing applied to each output.

    Args:
        without_argmax (bool): When True, the argmax step is skipped.
        with_softmax (bool): When True, softmax over axis 1 is applied
            (before the argmax step, if that is enabled).
    """

    def __init__(self, without_argmax, with_softmax):
        super().__init__()
        self.without_argmax = without_argmax
        self.with_softmax = with_softmax

    def forward(self, outs):
        """Post-process every element of ``outs`` and return them as a list.

        ``outs`` is iterated, so it is expected to be a sequence of tensors;
        axis 1 is presumably the class axis (N, C, H, W) - TODO confirm.
        """
        new_outs = []
        for out in outs:
            if self.with_softmax:
                out = paddle.nn.functional.softmax(out, axis=1)
            if not self.without_argmax:
                # NOTE(review): no dtype is passed here, so the index dtype is
                # paddle's default; confirm it matches what the consumer of
                # the exported model expects.
                out = paddle.argmax(out, axis=1)
            new_outs.append(out)
        return new_outs
def main(args):
    """Export the configured model with softmax and argmax appended.

    Steps: build the model from ``args.cfg``, optionally load weights from
    ``args.model_path``, wrap it with ``SavedSegmentationNet``, convert it with
    ``paddle.jit.to_static`` and save it under ``args.save_dir`` together with
    a ``deploy.yaml`` describing the transforms and model file names.

    Args:
        args: Namespace providing ``cfg``, ``model_path`` and ``save_dir``.
    """
    # Presumably read by PaddleSeg to detect that an export is running - TODO confirm.
    os.environ['PADDLESEG_EXPORT_STAGE'] = 'True'
    cfg = Config(args.cfg)
    net = cfg.model

    if args.model_path:
        para_state_dict = paddle.load(args.model_path)
        net.set_dict(para_state_dict)
        logger.info('Loaded trained params of model successfully.')

    # Append both softmax and argmax. The second positional parameter is
    # ``without_argmax``: it must be False, otherwise the argmax step is
    # skipped and the exported model still returns per-class scores.
    new_net = SavedSegmentationNet(net, without_argmax=False, with_softmax=True)

    new_net.eval()
    new_net = paddle.jit.to_static(
        new_net,
        input_spec=[
            # Batch size, height and width are left dynamic (None).
            paddle.static.InputSpec(
                shape=[None, 3, None, None], dtype='float32')
        ])
    save_path = os.path.join(args.save_dir, 'model')
    paddle.jit.save(new_net, save_path)

    yml_file = os.path.join(args.save_dir, 'deploy.yaml')
    with open(yml_file, 'w') as file:
        # Fall back to a single Normalize transform when the config has none.
        transforms = cfg.export_config.get('transforms', [{
            'type': 'Normalize'
        }])
        data = {
            'Deploy': {
                'transforms': transforms,
                'model': 'model.pdmodel',
                'params': 'model.pdiparams'
            }
        }
        yaml.dump(data, file)

    logger.info(f'Model is saved in {args.save_dir}.')
if __name__ == '__main__':
    # Script entry point: parse the command line, then run the export.
    args = parse_args()
    main(args)
2 paddle模型转onnx模型
参考文档 PaddleSeg/docs/model_export_onnx_cn.md
参考文档Paddle2ONNX
(1)安装Paddle2ONNX
pip install paddle2onnx
(2)模型转换
执行如下命令,使用Paddle2ONNX将output/inference_model文件夹中的预测模型导出为ONNX格式模型,并将导出的模型文件保存为model.onnx。
paddle2onnx --model_dir output/inference_model \
--model_filename model.pdmodel \
--params_filename model.pdiparams \
--opset_version 12 \
--save_file model.onnx \
--enable_dev_version True
3 onnx模型转TensorRT模型
3.1 安装TensorRT-8.5.3.1
参考TensorRT安装
3.2 使用 trtexec 将onnx模型编译优化导出为engine模型
由于是动态输入,因此指定了输入尺寸范围和最优尺寸。将导出的engine模型文件保存为model.trt。
trtexec.exe
--onnx=model.onnx
--explicitBatch --fp16
--minShapes=x:1x3x540x960
--optShapes=x:1x3x720x1280
--maxShapes=x:1x3x1080x1920
--saveEngine=model.trt
4 TensorRT模型推理测试
参考TensorRT动态尺寸输入的分割模型测试
5 完整代码
namespace TRTSegmentation {
/// ILogger implementation that filters messages by severity.
///
/// A message passes the filter when its severity value is <= the threshold
/// given at construction. Printing is currently disabled, so log() has no
/// observable effect.
class Logger : public nvinfer1::ILogger
{
public:
    /// @param severity Filter threshold; defaults to Severity::kWARNING.
    Logger(Severity severity = Severity::kWARNING)
        : severity_(severity)
    {
    }

    /// Callback for log messages; nothing is printed.
    void log(Severity severity, const char* msg) noexcept override
    {
        if (severity > severity_) {
            return;  // filtered out by the threshold
        }
        // Printing is intentionally disabled:
        //std::cout << msg << std::endl;
    }

    /// @return This object as an nvinfer1::ILogger reference.
    nvinfer1::ILogger& getTRTLogger() noexcept
    {
        return *this;
    }

private:
    Severity severity_;  // filter threshold
};
/// Deleter functor that releases an object with `delete`.
struct InferDeleter
{
    /// Deletes `ptr`; a null pointer is accepted (delete on null is a no-op).
    template <class Object>
    void operator()(Object* ptr) const
    {
        delete ptr;
    }
};

/// std::unique_ptr that releases its object through InferDeleter.
template <class Object>
using SampleUniquePtr = std::unique_ptr<Object, InferDeleter>;
/// Lane segmentation predictor backed by a serialized TensorRT engine.
///
/// The constructor loads the engine and allocates one device input buffer and
/// one device output buffer; the destructor frees them. The object is not
/// copyable (it owns raw device pointers and a unique execution context).
class LaneSegInferTRT
{
public:
    /// @param seg_model_dir Path of the serialized engine file.
    /// InitPredictor() is called immediately and throws a string literal
    /// (const char*) when the path is empty or the file cannot be opened.
    LaneSegInferTRT(const std::string& seg_model_dir = "")
        : seg_model_dir_(seg_model_dir)
    {
        InitPredictor();
    }

    /// Frees every device buffer stored in bindings_ (null entries are skipped).
    ~LaneSegInferTRT()
    {
        for (void* buffer : bindings_) {
            if (buffer != nullptr) {
                cudaFree(buffer);
            }
        }
    }

    // Copying would free the same device buffers twice.
    LaneSegInferTRT(const LaneSegInferTRT&) = delete;
    LaneSegInferTRT& operator=(const LaneSegInferTRT&) = delete;

    /// Runs preprocessing, inference and lane post-processing on one image.
    /// @param image_mat Input image (converted with COLOR_BGR2RGB in Preprocess()).
    /// @param solLanes  Output: solid lanes.
    /// @param dasLanes  Output: dashed lanes.
    /// @param times     Optional timing output.
    void PredictSeg(
        const cv::Mat &image_mat,
        std::vector<PaddleSegmentation::DataLane> &solLanes /* solid lines */,
        std::vector<PaddleSegmentation::DataLane> &dasLanes /* dashed lines */,
        std::vector<double>* times = nullptr);

private:
    /// Loads the engine from seg_model_dir_ and allocates the device buffers.
    void InitPredictor();

    /// Preprocess image and return the data to copy to the input buffer.
    cv::Mat Preprocess(const cv::Mat& image_mat);

    /// Converts the per-pixel output into solid / dashed lanes.
    void Postprocess(int rows,
        int cols,
        std::vector<int> &out_data,
        std::vector<PaddleSegmentation::DataLane> &solLanes,
        std::vector<PaddleSegmentation::DataLane> &dasLanes);

private:
    //static const int num_classes_ = 15;
    // Declared before the context so that the context is destroyed first.
    std::shared_ptr<nvinfer1::ICudaEngine> mEngine_;
    SampleUniquePtr<nvinfer1::IExecutionContext> context_seg_lane_;
    std::vector<void*> bindings_;  // device buffers: [0] input, [1] output
    std::string seg_model_dir_;
    // Pixel capacity (H*W) of the device buffers. It has to cover the largest
    // input the engine accepts; the engine in this guide is built with
    // --maxShapes=x:1x3x1080x1920, so 1280*720 was too small for such inputs.
    int gpuMaxBufSize = 1920 * 1080;
};
}//namespace TRTSegmentation
#include "LaneSegInferTRT.hpp"
namespace {
/// File-local ILogger implementation that filters messages by severity.
///
/// A message passes the filter when its severity value is <= the threshold
/// given at construction. Printing is currently disabled, so log() has no
/// observable effect.
class Logger : public nvinfer1::ILogger
{
public:
    /// @param severity Filter threshold; defaults to Severity::kWARNING.
    Logger(Severity severity = Severity::kWARNING)
        : severity_(severity)
    {
    }

    /// Callback for log messages; nothing is printed.
    void log(Severity severity, const char* msg) noexcept override
    {
        const bool passes_filter = (severity <= severity_);
        if (passes_filter) {
            // Printing is intentionally disabled:
            //std::cout << msg << std::endl;
        }
    }

    /// @return This object as an nvinfer1::ILogger reference.
    nvinfer1::ILogger& getTRTLogger() noexcept
    {
        return *this;
    }

private:
    Severity severity_;  // filter threshold
};
}
namespace TRTSegmentation {
// Evaluates `status` exactly once and, when the result is non-zero, prints
// "Cuda failure: <value>" to std::cerr. Execution continues either way.
// The temporary has a long name so that it cannot capture a caller variable
// used inside `status`.
#define CHECK(status)                                                          \
    do                                                                         \
    {                                                                          \
        const auto cuda_check_result_ = (status);                              \
        if (cuda_check_result_ != 0)                                           \
        {                                                                      \
            std::cerr << "Cuda failure: " << cuda_check_result_ << std::endl;  \
        }                                                                      \
    } while (0)
void LaneSegInferTRT::InitPredictor()
{
if (seg_model_dir_.empty()) {
throw "Predictor must receive seg_model!";
}
std::ifstream ifs(seg_model_dir_, std::ifstream::binary);
if (!ifs) {
throw "seg_model_dir error!";
}
ifs.seekg(0, std::ios_base::end);
int size = ifs.tellg();
ifs.seekg(0, std::ios_base::beg);
std::unique_ptr<char> pData(new char[size]);
ifs.read(pData.get(), size);
ifs.close();
// engine模型
Logger logger(nvinfer1::ILogger::Severity::kVERBOSE);
SampleUniquePtr<nvinfer1::IRuntime> runtime{nvinfer1::createInferRuntime(logger.getTRTLogger()) };
mEngine_ = std::shared_ptr<nvinfer1::ICudaEngine>(
runtime->deserializeCudaEngine(pData.get(), size), InferDeleter());
this->context_seg_lane_ = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine_->createExecutionContext());
bindings_.resize(mEngine_->getNbBindings());
CHECK(cudaMalloc(&bindings_[0], sizeof(float) * 3 * gpuMaxBufSize)); // n*3*h*w
CHECK(cudaMalloc(&bindings_[1], sizeof(int) * 1 * gpuMaxBufSize)); // n*1*h*w
}
/// Converts the image from BGR to RGB and normalizes every channel to
/// float32 with (x / 255 - 0.5) / 0.5.
///
/// @param image_mat Input image in BGR channel order.
/// @return CV_32F image in RGB order, same size as the input.
cv::Mat LaneSegInferTRT::Preprocess(const cv::Mat& image_mat)
{
    cv::Mat img;
    cv::cvtColor(image_mat, img, cv::COLOR_BGR2RGB);
    // (x / 255 - 0.5) / 0.5 == x * (2 / 255) - 1, done in a single pass.
    // convertTo applies scale and offset to all channels, whereas the
    // expression `img - 0.5` converts 0.5 to cv::Scalar(0.5, 0, 0, 0) and
    // therefore shifted only the first channel.
    img.convertTo(img, CV_32F, 2.0 / 255.0, -1.0);
    return img;
}
/// Runs preprocessing, TensorRT inference and lane post-processing on one image.
///
/// @param image_mat Input image; it is passed through Preprocess().
/// @param solLanes  Output: solid lanes, filled by Postprocess().
/// @param dasLanes  Output: dashed lanes, filled by Postprocess().
/// @param times     Accepted for interface compatibility; no timings are written.
///
/// When the image does not fit the device buffers, or TensorRT rejects the
/// shape or fails to execute, an error is printed to std::cerr and the
/// function returns without touching the output vectors.
void LaneSegInferTRT::PredictSeg(
    const cv::Mat &image_mat,
    std::vector<PaddleSegmentation::DataLane> &solLanes,
    std::vector<PaddleSegmentation::DataLane> &dasLanes,
    std::vector<double>* times)
{
    (void)times;

    // Preprocess image
    cv::Mat img = Preprocess(image_mat);
    const int rows = img.rows;
    const int cols = img.cols;
    const int chs = img.channels();

    // The device buffers hold 3 * gpuMaxBufSize floats (input) and
    // gpuMaxBufSize ints (output); a larger image would overrun them.
    const std::size_t pixels = static_cast<std::size_t>(rows) * static_cast<std::size_t>(cols);
    if (chs != 3 || pixels == 0 || pixels > static_cast<std::size_t>(gpuMaxBufSize)) {
        std::cerr << "PredictSeg: unsupported input size " << cols << "x" << rows
                  << "x" << chs << std::endl;
        return;
    }

    if (!this->context_seg_lane_->setBindingDimensions(0, nvinfer1::Dims4{ 1, 3, rows, cols })) {
        std::cerr << "PredictSeg: setBindingDimensions failed" << std::endl;
        return;
    }

    // Repack the image as planar CHW floats and upload it.
    std::vector<float> input_data(pixels * static_cast<std::size_t>(chs), 0.0f);
    hwc_img_2_chw_data(img, input_data.data());
    CHECK(cudaMemcpy(bindings_[0], static_cast<const void*>(input_data.data()),
        input_data.size() * sizeof(float), cudaMemcpyHostToDevice));

    // Run predictor
    if (!context_seg_lane_->executeV2(bindings_.data())) {
        std::cerr << "PredictSeg: executeV2 failed" << std::endl;
        return;
    }

    // Get output tensor: one int per pixel
    std::vector<int> out_data(pixels);
    CHECK(cudaMemcpy(static_cast<void*>(out_data.data()), bindings_[1],
        out_data.size() * sizeof(int), cudaMemcpyDeviceToHost));

    // Postprocessing
    Postprocess(rows, cols, out_data, solLanes, dasLanes);
}
void LaneSegInferTRT::Postprocess(int rows, int cols, vector<int>& out_data,std::vector<PaddleSegmentation::DataLane> &solLanes,
std::vector<PaddleSegmentation::DataLane> &dasLanes)
{
PaddleSegmentation::LanePostProcess laneNet(rows, cols);
laneNet.lanePostprocessForTRT(out_data,solLanes,dasLanes);
}
}//namespace TRTSegmentation
6 测试结果