个人环境配置--安装记录

根据显卡下载对应的cuda和cudnn
我使用的是docker,首先拉取镜像,我用的是ubuntu20.04
加速：在镜像名前加上镜像站前缀 hub.1panel.dev/ 再 pull
devel是开发版本

sudo docker pull hub.1panel.dev/nvidia/cuda:11.6.1-devel-ubuntu20.04

先测试一下cuda有没有安装好

nvcc -V

更新，安装 vim、 wget

apt update

apt install vim wget

安装cudnn
cudnn下载网址：https://developer.nvidia.com/rdp/cudnn-archive

# Unpack the cuDNN archive
tar -xf cudnn-linux-x86_64-8.9.7.29_cuda11-archive.tar.xz
# Enter the extracted directory
cd cudnn-linux-x86_64-8.9.7.29_cuda11-archive
# Copy every cuDNN header (cudnn.h includes the other cudnn_*.h headers) into the CUDA include directory
cp include/cudnn*.h /usr/local/cuda-11.6/include
# Copy the libraries into the CUDA lib64 directory; -P keeps the .so symlinks as symlinks
cp -P lib/libcudnn* /usr/local/cuda-11.6/lib64
# Make the copied files readable by all users (same directories as the two cp commands above)
chmod a+r /usr/local/cuda-11.6/include/cudnn*.h /usr/local/cuda-11.6/lib64/libcudnn*

安装cmake
直接参考之前的博客：https://blog.csdn.net/qq_42102546/article/details/135014765

安装 Miniconda3

wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh

chmod +x Miniconda3-latest-Linux-x86_64.sh

./Miniconda3-latest-Linux-x86_64.sh

激活环境

source /root/miniconda3/bin/activate

创建虚拟环境

conda create -n py_17 python=3.9

进入虚拟环境

conda activate py_17

安装yolo环境

pip install ultralytics -i https://pypi.tuna.tsinghua.edu.cn/simple

安装onnxruntime-gpu

pip install onnxruntime-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple

安装cv2的依赖

apt install libglib2.0-0 libgl1-mesa-glx

测试：

import torch
import os
import cv2

# Report the installed PyTorch version and whether it can use CUDA.
print(torch.__version__)
cuda_available = torch.cuda.is_available()
if cuda_available:
    print("安装的是 GPU 版本的 PyTorch，当前可用的 GPU 数量为:", torch.cuda.device_count())
    print("当前使用的 GPU 名称为:", torch.cuda.get_device_name(0))

    # Run a tiny computation on the GPU. This is only attempted when CUDA is
    # available, so a CPU-only setup still reaches the OpenCV / ONNX Runtime
    # checks below instead of crashing on .cuda().
    os.environ['CUDA_LAUNCH_BLOCKING'] = "0"
    print(torch.rand(1).cuda())
    a = torch.Tensor([1, 2])
    a = a.cuda()
    print(a)
else:
    print("安装的是 CPU 版本的 PyTorch")

# Print the torch version and the CUDA availability once more as a summary.
print(torch.__version__)
print(cuda_available)
print("结束")


# OpenCV version and the CUDA-enabled device count reported by OpenCV.
print(cv2.__version__)
print(cv2.cuda.getCudaEnabledDeviceCount())

# ONNX Runtime device and the execution providers it reports as available.
import onnxruntime as ort
print(ort.get_device())
print(ort.get_available_providers())

# TensorRT is imported last: while it is not installed yet this import raises,
# and placing it here keeps that error from hiding the checks above.
import tensorrt
print(tensorrt.__version__)

tensorRT还没有安装，报错没有关系，等都安装好了还用这个做测试。
安装c++的opencv 可以直接看之前的博客：https://blog.csdn.net/qq_42102546/article/details/145717954

tensorRT 下载网址：https://developer.nvidia.cn/tensorrt
根据你的cuda版本去下载,我的是cuda11.6
在这里插入图片描述
下载后解压

tar -zxvf TensorRT-8.6.0.12.Linux.x86_64-gnu.cuda-11.8.tar.gz

然后把解压出的目录复制到 /usr/local（这只是个人喜好，也可以保留在原位置，直接把环境变量指向解压目录）

# Copy the whole extracted tree (lib/, include/, samples/, bin/, ...) so that it
# ends up at /usr/local/TensorRT-8.6.0.12, the path used by every later step
cp -r TensorRT-8.6.0.12 /usr/local/

配置环境变量

vim ~/.bashrc

在 vim 中按大写 “G” 跳到文件末尾，然后在最后追加以下内容：

# Prepend the TensorRT library directory to the runtime (LD_LIBRARY_PATH) and
# link-time (LIBRARY_PATH) search paths. ${VAR:+:$VAR} appends the previous
# value only when it is non-empty, so no empty path entry is produced.
export LD_LIBRARY_PATH=/usr/local/TensorRT-8.6.0.12/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
export LIBRARY_PATH=/usr/local/TensorRT-8.6.0.12/lib${LIBRARY_PATH:+:$LIBRARY_PATH}

cuda也可以加上

export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

使其生效：

source ~/.bashrc

测试

cd /usr/local/TensorRT-8.6.0.12/samples/sampleOnnxMNIST; make; ../../bin/sample_onnx_mnist

输出结果：
在这里插入图片描述
c++测试
CMakeLists.txt 文件内容

cmake_minimum_required(VERSION 3.16)
project(first_cmake)

# Locate OpenCV and print what was found
find_package(OpenCV REQUIRED)
message(STATUS "OpenCV version: ${OpenCV_VERSION}")
message(STATUS "OpenCV libraries: ${OpenCV_LIBS}")
message(STATUS "OpenCV include path: ${OpenCV_INCLUDE_DIRS}")

# Locate CUDA and print what was found
find_package(CUDA REQUIRED)
message(STATUS "CUDA version: ${CUDA_VERSION}")
message(STATUS "CUDA libraries: ${CUDA_LIBRARIES}")
message(STATUS "CUDA include path: ${CUDA_INCLUDE_DIRS}")

# Locate the threading library (provides the Threads::Threads target)
find_package(Threads REQUIRED)

# TensorRT location. The default matches the install path used in this guide;
# override with -DTENSORRT_ROOT=/path/to/TensorRT when it lives elsewhere.
set(TENSORRT_ROOT "/usr/local/TensorRT-8.6.0.12" CACHE PATH "TensorRT installation root")
set(TENSORRT_INCLUDE_DIR "${TENSORRT_ROOT}/include")
set(TENSORRT_LIBRARY_DIR "${TENSORRT_ROOT}/lib")

# The test executable
add_executable(first_cmake open_ce.cpp)

# Header search paths, scoped to this target only
target_include_directories(first_cmake PRIVATE ${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR})

# Directory in which the linker looks for nvinfer / nvinfer_plugin, scoped to this target only
target_link_directories(first_cmake PRIVATE ${TENSORRT_LIBRARY_DIR})

# Libraries to link
target_link_libraries(first_cmake ${OpenCV_LIBS} Threads::Threads nvinfer nvinfer_plugin ${CUDA_LIBRARIES})

open_ce.cpp 文件内容

#include <cstdint>
#include <iostream>
#include <memory>

#include <NvInfer.h>
#include <cuda_runtime_api.h>

// Logger handed to TensorRT: writes every message to std::cerr except those
// whose severity is exactly Severity::kINFO.
// NOTE(review): severities other than kINFO are all printed, which presumably
// includes levels more verbose than INFO — confirm this is intended.
class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        // Drop INFO messages; everything else falls through to stderr.
        if (severity == Severity::kINFO) {
            return;
        }
        std::cerr << msg << std::endl;
    }
};

int main() {
    // 创建日志记录器
    Logger logger;

    // 创建构建器
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
    if (!builder) {
        std::cerr << "Failed to create TensorRT builder." << std::endl;
        return -1;
    }

    // 创建网络定义
    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    if (!network) {
        std::cerr << "Failed to create TensorRT network." << std::endl;
        builder->destroy();
        return -1;
    }

    // 创建输入张量
    nvinfer1::ITensor* input = network->addInput("input", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{1, 1, 1, 1});
    if (!input) {
        std::cerr << "Failed to create input tensor." << std::endl;
        network->destroy();
        builder->destroy();
        return -1;
    }

    // 添加一个恒等层（Identity Layer）
    nvinfer1::IIdentityLayer* identityLayer = network->addIdentity(*input);
    if (!identityLayer) {
        std::cerr << "Failed to add identity layer." << std::endl;
        network->destroy();
        builder->destroy();
        return -1;
    }

    // 获取恒等层的输出张量
    nvinfer1::ITensor* output = identityLayer->getOutput(0);
    output->setName("output");

    // 标记输出张量
    network->markOutput(*output);

    // 创建构建配置
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    if (!config) {
        std::cerr << "Failed to create TensorRT builder config." << std::endl;
        network->destroy();
        builder->destroy();
        return -1;
    }

    // 构建引擎
    nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    if (!engine) {
        std::cerr << "Failed to build TensorRT engine." << std::endl;
        config->destroy();
        network->destroy();
        builder->destroy();
        return -1;
    }

    // 创建推理上下文
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    if (!context) {
        std::cerr << "Failed to create TensorRT execution context." << std::endl;
        engine->destroy();
        return -1;
    }

    // 准备输入和输出数据
    float inputData[1] = {1.0f};  // 输入数据
    float outputData[1];          // 输出数据

    // 分配 CUDA 内存
    void* d_input;
    void* d_output;
    cudaMalloc(&d_input, sizeof(float));
    cudaMalloc(&d_output, sizeof(float));

    // 将输入数据从主机内存复制到设备内存
    cudaMemcpy(d_input, inputData, sizeof(float), cudaMemcpyHostToDevice);

    // 定义输入和输出缓冲区指针
    void* buffers[2];
    buffers[0] = d_input;   // 输入缓冲区
    buffers[1] = d_output;  // 输出缓冲区

    // 执行推理
    context->enqueueV2(buffers, 0, nullptr);

    // 将输出数据从设备内存复制到主机内存
    cudaMemcpy(outputData, d_output, sizeof(float), cudaMemcpyDeviceToHost);

    // 输出结果
    std::cout << "Output: " << outputData[0] << std::endl;

    // 释放 CUDA 内存
    cudaFree(d_input);
    cudaFree(d_output);

    // 释放资源
    context->destroy();
    engine->destroy();
    config->destroy();
    network->destroy();
    builder->destroy();

    return 0;
}

在这里插入图片描述

Trying to load shared library libnvinfer_builder_resource.so.8.6.0
Loaded shared library libnvinfer_builder_resource.so.8.6.0
CUDA lazy loading is enabled.
Original: 1 layers
After dead-layer removal: 1 layers
Graph construction completed in 0.00158627 seconds.
Running: IdentityToCastTransform on (Unnamed Layer* 0) [Identity]
Swap the layer type of (Unnamed Layer* 0) [Identity] from IDENTITY to CAST
After Myelin optimization: 1 layers
Applying ScaleNodes fusions.
After scale fusion: 1 layers
Running: CastToCopyTransform on (Unnamed Layer* 0) [Identity]
Swap the layer type of (Unnamed Layer* 0) [Identity] from CAST to CAST
After dupe layer removal: 1 layers
After final dead-layer removal: 1 layers
After tensor merging: 1 layers
After vertical fusions: 1 layers
After dupe layer removal: 1 layers
After final dead-layer removal: 1 layers
After tensor merging: 1 layers
After slice removal: 1 layers
After concat removal: 1 layers
Trying to split Reshape and strided tensor
Building graph using backend strategy 2
Constructing optimization profile number 0 [1/1].
Applying generic optimizations to the graph for inference.
Reserving memory for host IO tensors. Host: 0 bytes
=============== Computing reformatting costs
=============== Computing reformatting costs:
*************** Autotuning Reformat: Float(1,1,1,1) -> Float(1,1,1,1) ***************
--------------- Timing Runner: (Unnamed Layer* 0) [Identity] (Reformat[0x80000006])
Tactic: 0x00000000000003e8 Time: 0.0122651
Tactic: 0x00000000000003ea Time: 0.0224988
Tactic: 0x0000000000000000 Time: 0.00841467
(Unnamed Layer* 0) [Identity] (Reformat[0x80000006]) profiling completed in 0.0247727 seconds. Fastest Tactic: 0x0000000000000000 Time: 0.00841467
--------------- Timing Runner: (Unnamed Layer* 0) [Identity] (MyelinReformat[0x80000035])
(foreignNode) Set user’s cuda kernel library
(foreignNode) Pass fuse_conv_padding is currently skipped for dynamic shapes
(foreignNode) Pass pad_conv_channel is currently skipped for dynamic shapes
(foreignNode) Pass fuse_conv_padding is currently skipped for dynamic shapes
(foreignNode) Pass pad_conv_channel is currently skipped for dynamic shapes
Tactic: 0x0000000000000000 Time: 0.00626647
(Unnamed Layer* 0) [Identity] (MyelinReformat[0x80000035]) profiling completed in 0.3183 seconds. Fastest Tactic: 0x0000000000000000 Time: 0.00626647

Chose Runner Type: MyelinReformat Tactic: 0x0000000000000000
Formats and tactics selection completed in 0.343932 seconds.
After reformat layers: 1 layers
Total number of blocks in pre-optimized block assignment: 1
(foreignNode) Set user’s cuda kernel library
(foreignNode) Pass fuse_conv_padding is currently skipped for dynamic shapes
(foreignNode) Pass pad_conv_channel is currently skipped for dynamic shapes
(foreignNode) Pass fuse_conv_padding is currently skipped for dynamic shapes
(foreignNode) Pass pad_conv_channel is currently skipped for dynamic shapes
Layer: (Unnamed Layer* 0) [Identity] Host Persistent: 32 Device Persistent: 0 Scratch Memory: 0
Skipped printing memory information for 0 layers with 0 memory size i.e. Host Persistent + Device Persistent + Scratch Memory == 0.
Total number of blocks in optimized block assignment: 0
Total number of generated kernels selected for the engine: 0
Disabling unused tactic source: EDGE_MASK_CONVOLUTIONS
Disabling unused tactic source: JIT_CONVOLUTIONS
Engine generation completed in 0.522099 seconds.
Deleting timing cache: 1 entries, served 0 hits since creation.
Engine Layer Information:
Layer(MyelinReformat): (Unnamed Layer* 0) [Identity], Tactic: 0x0000000000000000, input (Float[1,1,1,1]) -> output (Float[1,1,1,1])
Total per-runner device persistent memory is 0
Total per-runner host persistent memory is 32
Allocated activation device memory of size 0
CUDA lazy loading is enabled.
Output: 1