英伟达SSD目标检测代码解析

一、官方原代码

#!/usr/bin/env python3
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#

import sys
import argparse

from jetson_inference import detectNet
from jetson_utils import videoSource, videoOutput, Log

# parse the command line
# The epilog concatenates the usage text of detectNet, videoSource, videoOutput
# and Log, so the help output also lists the options of those components.
parser = argparse.ArgumentParser(description="Locate objects in a live camera stream using an object detection DNN.", 
                                 formatter_class=argparse.RawTextHelpFormatter, 
                                 epilog=detectNet.Usage() + videoSource.Usage() + videoOutput.Usage() + Log.Usage())

# Both positional URIs are optional (nargs='?') and default to an empty string.
parser.add_argument("input", type=str, default="", nargs='?', help="URI of the input stream")
parser.add_argument("output", type=str, default="", nargs='?', help="URI of the output stream")
parser.add_argument("--network", type=str, default="ssd-mobilenet-v2", help="pre-trained model to load (see below for options)")
parser.add_argument("--overlay", type=str, default="box,labels,conf", help="detection overlay flags (e.g. --overlay=box,labels,conf)\nvalid combinations are:  'box', 'labels', 'conf', 'none'")
parser.add_argument("--threshold", type=float, default=0.5, help="minimum detection threshold to use") 

# parse_known_args() returns (namespace, leftover_argv); only the namespace is
# kept, so options this parser does not define are not rejected here.
# The bare `except` also traps the SystemExit that argparse raises for -h or
# for an invalid value; in that case the full help is printed and the process
# exits with status 0.
try:
	args = parser.parse_known_args()[0]
except:
	print("")
	parser.print_help()
	sys.exit(0)

# create video sources and outputs
# NOTE: `input` shadows the Python builtin of the same name for the rest of
# the script. The complete sys.argv is forwarded to both objects -- presumably
# so they can pick up their own options (see their Usage() text).
input = videoSource(args.input, argv=sys.argv)
output = videoOutput(args.output, argv=sys.argv)
	
# load the object detection network
# positional arguments: network name, command line, detection threshold
net = detectNet(args.network, sys.argv, args.threshold)

# note: to hard-code the paths to load a model, the following API can be used:
#
# net = detectNet(model="model/ssd-mobilenet.onnx", labels="model/labels.txt", 
#                 input_blob="input_0", output_cvg="scores", output_bbox="boxes", 
#                 threshold=args.threshold)

# process frames until EOS or the user exits
while True:
    # capture the next image
    img = input.Capture()

    # a None frame is treated as a capture timeout: retry without running the
    # rest of the loop body (including the streaming check at the bottom)
    if img is None: # timeout
        continue  
        
    # detect objects in the image (with overlay)
    # the overlay flags come straight from --overlay
    detections = net.Detect(img, overlay=args.overlay)

    # print the detections
    print("detected {:d} objects in image".format(len(detections)))

    for detection in detections:
        print(detection)

    # render the image
    output.Render(img)

    # update the title bar
    output.SetStatus("{:s} | Network {:.0f} FPS".format(args.network, net.GetNetworkFPS()))

    # print out performance info
    net.PrintProfilerTimes()

    # exit on input/output EOS
    # the loop ends as soon as either side reports that it is no longer streaming
    if not input.IsStreaming() or not output.IsStreaming():
        break

二、代码解析

这段代码是一个用于实时对象检测的Python脚本，利用NVIDIA的Jetson平台上的Jetson Inference库来实现。以下是对代码的逐步分析：

1. 文件头

#!/usr/bin/env python3

# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
# ...

2. 导入模块

import sys
import argparse
from jetson_inference import detectNet
from jetson_utils import videoSource, videoOutput, Log

导入了必要的Python标准库和Jetson Inference相关模块。

3. 命令行参数解析

# The epilog concatenates the usage text of detectNet, videoSource, videoOutput
# and Log, so the help output also lists the options of those components.
parser = argparse.ArgumentParser(description="Locate objects in a live camera stream using an object detection DNN.", 
                                 formatter_class=argparse.RawTextHelpFormatter, 
                                 epilog=detectNet.Usage() + videoSource.Usage() + videoOutput.Usage() + Log.Usage())

# Both positional URIs are optional (nargs='?') and default to an empty string.
parser.add_argument("input", type=str, default="", nargs='?', help="URI of the input stream")
parser.add_argument("output", type=str, default="", nargs='?', help="URI of the output stream")
parser.add_argument("--network", type=str, default="ssd-mobilenet-v2", help="pre-trained model to load (see below for options)")
parser.add_argument("--overlay", type=str, default="box,labels,conf", help="detection overlay flags (e.g. --overlay=box,labels,conf)\nvalid combinations are:  'box', 'labels', 'conf', 'none'")
parser.add_argument("--threshold", type=float, default=0.5, help="minimum detection threshold to use") 

# parse_known_args() returns (namespace, leftover_argv); only the namespace is
# kept. The bare `except` also traps the SystemExit that argparse raises for
# -h or for an invalid value; the full help is then printed and the process
# exits with status 0.
try:
    args = parser.parse_known_args()[0]
except:
    print("")
    parser.print_help()
    sys.exit(0)

这部分代码解析命令行参数，支持输入和输出流的URI、使用的预训练模型、检测的叠加标志和最低检测阈值。

4. 视频源和输出的创建

# Open the capture source and the render target from the parsed URIs.
# NOTE: `input` shadows the Python builtin of the same name.
# The complete sys.argv is forwarded to both objects as `argv`.
input = videoSource(args.input, argv=sys.argv)
output = videoOutput(args.output, argv=sys.argv)

根据解析得到的命令行参数，创建视频输入源和输出对象。

5. 加载对象检测网络

# positional arguments: network name, command line, detection threshold
net = detectNet(args.network, sys.argv, args.threshold)

加载指定的预训练对象检测网络，并设置检测阈值。

6. 主处理循环

# Runs until either the input or the output stops streaming.
while True:
    # capture the next image
    img = input.Capture()

    # a None frame is treated as a capture timeout: retry without running the
    # rest of the loop body (including the streaming check at the bottom)
    if img is None: # timeout
        continue  
    
    # detect objects in the image (with overlay)
    # the overlay flags come straight from --overlay
    detections = net.Detect(img, overlay=args.overlay)

    # print the detections
    print("detected {:d} objects in image".format(len(detections)))

    for detection in detections:
        print(detection)

    # render the image
    output.Render(img)

    # update the title bar
    output.SetStatus("{:s} | Network {:.0f} FPS".format(args.network, net.GetNetworkFPS()))

    # print out performance info
    net.PrintProfilerTimes()

    # exit on input/output EOS
    # the loop ends as soon as either side reports that it is no longer streaming
    if not input.IsStreaming() or not output.IsStreaming():
        break

从视频源捕获下一帧图像。
使用检测网络检测图像中的对象。
打印检测到的对象信息。
渲染带有检测叠加的图像到输出流。
更新输出窗口的状态栏以显示网络的FPS。
打印性能信息。
检查输入输出流是否仍在继续，如果任一停止，退出循环。

总结

这段代码实现了一个实时对象检测系统，通过解析命令行参数设定视频输入和输出、加载对象检测模型，并在一个循环中不断捕获图像、检测对象、渲染结果并显示性能信息。该代码适用于NVIDIA Jetson设备，利用Jetson Inference库进行高效的对象检测。

三、代码优化

打印目标边界的位置，并在图像上显示
目标区域填充改为边界标识

#!/usr/bin/env python3
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#

import sys
import argparse
import cv2
import numpy as np
from jetson_inference import detectNet
from jetson_utils import videoSource, videoOutput, Log, cudaToNumpy, cudaFromNumpy, cudaDeviceSynchronize

# Parse the command line.
# The help epilog is the usage text of every jetson component, in the same
# order the individual Usage() calls were concatenated before.
component_usage = "".join(
    component.Usage() for component in (detectNet, videoSource, videoOutput, Log)
)

parser = argparse.ArgumentParser(
    description="使用目标检测DNN在实时摄像头流中定位目标。",
    formatter_class=argparse.RawTextHelpFormatter,
    epilog=component_usage,
)

# Optional positional stream URIs; an empty string is used when omitted.
parser.add_argument("input", type=str, default="", nargs="?", help="输入流的URI")
parser.add_argument("output", type=str, default="", nargs="?", help="输出流的URI")

# Detection options.
parser.add_argument(
    "--network",
    type=str,
    default="ssd-mobilenet-v2",
    help="要加载的预训练模型（见下文选项）",
)
parser.add_argument(
    "--overlay",
    type=str,
    default="box,labels,conf",
    help=(
        "检测叠加标志（例如 --overlay=box,labels,conf）\n"
        "有效组合有： 'box', 'labels', 'conf', 'none'"
    ),
)
parser.add_argument(
    "--threshold",
    type=float,
    default=0.5,
    help="使用的最小检测阈值",
)

# Only the parsed namespace is needed; arguments this parser does not know
# are returned separately and ignored here.
# BaseException (not Exception) is caught on purpose: argparse signals -h and
# invalid values with SystemExit, and this handler must see it to print the
# full help and exit with status 0.
try:
    args, _unparsed = parser.parse_known_args()
except BaseException:
    print("")
    parser.print_help()
    sys.exit(0)

# Create the video source and the video output.
# NOTE: `input` shadows the Python builtin; the name is kept because the
# capture loop that follows refers to it.
input = videoSource(args.input, argv=sys.argv)
output = videoOutput(args.output, argv=sys.argv)

# Load the object detection network (network name, command line, threshold).
net = detectNet(args.network, sys.argv, args.threshold)

# Note: to hard-code the paths of a model, the following API can be used:
#
# net = detectNet(model="model/ssd-mobilenet.onnx", labels="model/labels.txt",
#                 input_blob="input_0", output_cvg="scores", output_bbox="boxes",
#                 threshold=args.threshold)

# Drawing parameters, defined once instead of on every detection.
# The frame is assumed to be RGB (jetson-utils' documented default for
# videoSource -- confirm for your input), not OpenCV's usual BGR, so
# (255, 0, 0) is drawn as red.
BOX_COLOR = (255, 0, 0)
BOX_THICKNESS = 2
TEXT_COLOR = (255, 255, 255)
TEXT_THICKNESS = 1
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.5
LABEL_MARGIN = 10  # gap in pixels between the box edge and the label baseline

# Pixel height of one label line; used to keep the label inside the frame.
_label_size, _label_baseline = cv2.getTextSize("0", FONT, FONT_SCALE, TEXT_THICKNESS)
LABEL_HEIGHT = _label_size[1]

# Process frames until EOS or until the user exits.
while True:
    # Capture the next image.
    img = input.Capture()

    if img is None:  # timeout
        continue

    # Detect objects without the built-in overlay; the boxes are drawn below
    # with OpenCV as outlines instead of filled regions.
    detections = net.Detect(img, overlay='none')

    # Print the number of detections.
    print("在图像中检测到 {:d} 个目标".format(len(detections)))

    # Make sure the GPU has finished with the frame before the CPU touches it,
    # then expose the CUDA image to OpenCV as a NumPy array.
    cudaDeviceSynchronize()
    np_img = cudaToNumpy(img)

    for idx, detection in enumerate(detections):
        print(detection)

        # Print the (unrounded) box coordinates.
        print(f"目标 {idx}: 左={detection.Left}, 上={detection.Top}, 右={detection.Right}, 下={detection.Bottom}")

        # OpenCV needs integer pixel coordinates; convert once per detection.
        left, top = int(detection.Left), int(detection.Top)
        right, bottom = int(detection.Right), int(detection.Bottom)

        # Outline the detection.
        cv2.rectangle(np_img, (left, top), (right, bottom), BOX_COLOR, BOX_THICKNESS)

        # Write the coordinates just above the box. If the box is too close to
        # the top of the frame for the text to fit, put it just inside the box
        # instead of letting it fall outside the image.
        coordinates_text = f"({left}, {top}), ({right}, {bottom})"
        label_y = top - LABEL_MARGIN
        if label_y - LABEL_HEIGHT < 0:
            label_y = top + LABEL_MARGIN + LABEL_HEIGHT
        cv2.putText(np_img, coordinates_text, (left, label_y), FONT, FONT_SCALE, TEXT_COLOR, TEXT_THICKNESS, cv2.LINE_AA)

    # Convert the annotated NumPy array back into a CUDA image.
    # NOTE(review): the jetson-utils docs describe cudaToNumpy as a zero-copy
    # mapping, so rendering `img` directly may be enough -- confirm before
    # dropping this per-frame copy.
    cuda_img = cudaFromNumpy(np_img)

    # Render the image.
    output.Render(cuda_img)

    # Update the title bar.
    output.SetStatus("{:s} | 网络 {:.0f} FPS".format(args.network, net.GetNetworkFPS()))

    # Print performance information.
    net.PrintProfilerTimes()

    # Exit when the input or the output has stopped streaming.
    if not input.IsStreaming() or not output.IsStreaming():
        break

原代码的效果
（此处原为效果截图，图片未能保留）
打印边界并增加红框的效果

去掉填充的效果

优化说明：

img 和 np_img 是代表不同图像格式的数据结构，主要区别在于存储方式和使用的库：

img:
- 类型：这是一个CUDA图像，通常是通过Jetson平台上的jetson_utils库进行处理和存储的图像数据。
- 存储：图像数据存储在GPU（CUDA内存）中，适合高效的并行计算和图像处理。
- 用途：img通常在Jetson平台上用于高性能实时处理，如目标检测、视频流处理等，因为它可以直接在GPU上操作数据。
- 库：由jetson_utils库管理，提供了一些CUDA专用的函数进行操作。
np_img:
- 类型：这是一个NumPy数组，代表了图像数据在CPU内存中的形式。
- 存储：图像数据存储在CPU内存中，便于使用Python中的NumPy库和OpenCV库进行各种图像处理和分析。
- 用途：np_img通常用于需要灵活和广泛支持的图像处理任务，如使用OpenCV进行图像绘制、特征提取等。
- 库：主要由NumPy和OpenCV库管理，用于各种图像处理和计算任务。

具体区别和相互转换

在您的代码中，img 是通过Jetson平台上的视频源（如摄像头或视频文件）捕获的CUDA图像。为了在图像上绘制检测结果，需要先将CUDA图像转换为NumPy数组形式，以便使用OpenCV绘制边框和坐标文本。

转换过程：
- cudaToNumpy(img)：将CUDA图像img转换为NumPy数组np_img。这样可以使用OpenCV在图像上绘制边框和文本。
- cudaFromNumpy(np_img)：将修改后的NumPy数组np_img转换回CUDA图像，便于Jetson平台上的后续处理和显示。

代码示例

以下是对两个图像格式的具体操作：

# Illustrative excerpt: `detection` is not defined in this snippet; it stands
# for one element of the list returned by net.Detect(img, ...).

# Capture the next image from the video source (CUDA image)
img = input.Capture()

# Convert the CUDA image to a NumPy array for OpenCV processing
np_img = cudaToNumpy(img)

# Use OpenCV to draw text on the NumPy image
# The label shows the box corners truncated to integers and is anchored
# 10 pixels above the top-left corner of the box.
coordinates_text = f"({int(detection.Left)}, {int(detection.Top)}), ({int(detection.Right)}, {int(detection.Bottom)})"
cv2.putText(np_img, coordinates_text, (int(detection.Left), int(detection.Top) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)

# Convert the modified NumPy array back to a CUDA image
cuda_img = cudaFromNumpy(np_img)

# Render the CUDA image
output.Render(cuda_img)

总结

img 是在GPU内存中的CUDA图像，适用于高性能并行计算。
np_img 是在CPU内存中的NumPy数组，便于灵活的图像处理和分析。
通过cudaToNumpy和cudaFromNumpy进行相互转换，可以在GPU和CPU之间高效处理图像数据。

这种转换使得您可以充分利用GPU的计算能力和CPU的灵活处理能力，进行复杂的图像处理任务。

四、detectNet库解析

detectNet的属性和方法

detectNet 类是 NVIDIA Jetson 上的一个库，用于进行目标检测。这个类提供了加载模型、处理图像、检测目标以及获取检测结果的功能。以下是 detectNet 类的主要属性和方法：

主要属性

net:
- 说明：检测网络的核心对象。

主要方法

__init__：
- 说明：初始化 detectNet 对象。
- 参数：
  - network：要加载的预训练模型的名称或路径。
  - threshold：用于检测的最小置信度阈值。
  - overlay：检测叠加标志（例如：‘box,labels,conf’）。注意：在本文的脚本中，该标志是在调用 Detect 时传入的（net.Detect(img, overlay=...)），而不是传给构造函数。
  - argv：命令行参数。
Detect：
- 说明：在给定图像上进行目标检测。
- 参数：
  - image：输入的图像。
  - overlay：检测叠加标志。
- 返回：一个 Detection 对象列表，包含检测到的目标信息。
GetNetworkFPS：
- 说明：获取网络的帧率。
- 返回：网络的帧率（FPS）。
GetClassDesc：
- 说明：获取类别ID对应的描述（名称）。
- 参数：
  - class_id：类别ID。
- 返回：类别的描述（名称）。
GetClassPath：
- 说明：获取类别ID对应的路径（标签文件中的路径）。
- 参数：
  - class_id：类别ID。
- 返回：类别的路径。
PrintProfilerTimes：
- 说明：打印网络的性能数据。
Usage：
- 说明：返回一个描述 detectNet 使用方法的字符串。
- 返回：一个字符串，描述如何使用 detectNet。

`Detection` 对象

Detection 对象表示检测结果，包含以下属性：

Confidence：
- 说明：检测到目标的置信度。
- 类型：浮点数。
ClassID：
- 说明：检测到目标的类别ID。
- 类型：整数。
Left：
- 说明：目标边界框的左边界坐标。
- 类型：浮点数。
Top：
- 说明：目标边界框的上边界坐标。
- 类型：浮点数。
Right：
- 说明：目标边界框的右边界坐标。
- 类型：浮点数。
Bottom：
- 说明：目标边界框的下边界坐标。
- 类型：浮点数。
Width：
- 说明：目标边界框的宽度。
- 类型：浮点数。
Height：
- 说明：目标边界框的高度。
- 类型：浮点数。
Area：
- 说明：目标边界框的面积。
- 类型：浮点数。
Center：
- 说明：目标边界框的中心点坐标，格式为 (x, y)。
- 类型：浮点数元组。

示例代码

以下是如何使用 detectNet 类的示例：

from jetson_inference import detectNet
from jetson_utils import videoSource, videoOutput, cudaToNumpy

# Create the detector with the built-in SSD-Mobilenet-v2 model.
net = detectNet(network="ssd-mobilenet-v2", threshold=0.5)

# Open the video source and the video output.
input = videoSource("csi://0")  # read from the camera
output = videoOutput("display://0")  # show on the display

while True:
    # Grab the next frame; None means the capture timed out, so try again.
    frame = input.Capture()
    if frame is None:
        continue

    # Run object detection on the frame.
    detections = net.Detect(frame)

    # Report how many objects were found, then every field of each one.
    print(f"检测到 {len(detections):d} 个目标")

    for det in detections:
        print(f"置信度: {det.Confidence}")
        print(f"类别ID: {det.ClassID}")
        print(f"左: {det.Left}, 上: {det.Top}, 右: {det.Right}, 下: {det.Bottom}")
        print(f"宽度: {det.Width}, 高度: {det.Height}")
        print(f"面积: {det.Area}")
        print(f"中心: {det.Center}")

    # Show the frame.
    output.Render(frame)

    # Put the network frame rate in the title bar.
    output.SetStatus(f"Network {net.GetNetworkFPS():.0f} FPS")

    # Print performance information.
    net.PrintProfilerTimes()

    # Stop once the input or the output is no longer streaming.
    if not (input.IsStreaming() and output.IsStreaming()):
        break

通过这些方法和属性，detectNet 类提供了在 Jetson 平台上进行实时目标检测的强大功能。

总结

detectNet 是一个强大的目标检测工具，提供了便捷的方法来加载预训练模型、处理图像并获取检测结果。通过 Detection 对象，用户可以方便地访问每个检测到的目标的详细信息。这些功能使得 detectNet 非常适合在 Jetson 平台上进行实时目标检测应用。

detection的属性

在 detectNet 的 Detection 对象中，除了 Left 和 Top 之外，还有其他几个属性。这些属性包括目标的边界框坐标、尺寸、置信度、类别ID等。以下是所有属性的列表和说明：

Confidence：
- 说明：检测到目标的置信度值，表示模型对检测结果的信心程度。
- 类型：浮点数，范围为0到1。
ClassID：
- 说明：检测到目标的类别ID，对应于模型标签文件中的类别。
- 类型：整数。
Left：
- 说明：目标边界框的左边界坐标。
- 类型：浮点数。
Top：
- 说明：目标边界框的上边界坐标。
- 类型：浮点数。
Right：
- 说明：目标边界框的右边界坐标。
- 类型：浮点数。
Bottom：
- 说明：目标边界框的下边界坐标。
- 类型：浮点数。
Width：
- 说明：目标边界框的宽度。
- 类型：浮点数。
Height：
- 说明：目标边界框的高度。
- 类型：浮点数。
Area：
- 说明：目标边界框的面积。
- 类型：浮点数。
Center：
- 说明：目标边界框的中心点坐标，格式为 (x, y)。
- 类型：浮点数元组。

下面是一个示例输出，展示了这些属性：

detected 1 objects in image
<detectNet.Detection object>
   -- Confidence: 0.992188
   -- ClassID: 1
   -- Left:    186.562
   -- Top:     416.953
   -- Right:   390.625
   -- Bottom:  550.547
   -- Width:   204.062
   -- Height:  133.594
   -- Area:    27261.5
   -- Center:  (288.594, 483.75)
Object 0: Left=186.5625, Top=416.953125, Right=390.625, Bottom=550.546875