LeNet:最早用于数字识别的CNN
AlexNet:2012年ILSVRC比赛冠军,远超第二名的CNN,比LeNet更深,用多层小卷积叠加来替换单个的大卷积
ZF Net:2013ILSVRC冠军
GoogleNet:2014ILSVRC冠军
VGGNet:2014年ILSVRC比赛中的算法模型,分类效果略低于GoogLeNet
ResNet:2015ILSVRC冠军,结构修正以适应更深层次的CNN训练
DenseNet:CVPR 2017最佳论文
卷积神经网络典型CNN-LeNet
import torch
import torch.nn as nn
class LeNet(nn.Module):
    """LeNet-style CNN mapping single-channel images to 10 class scores.

    The convolutional stage ends in an adaptive average pool with a fixed
    4x4 output, so the flattened feature vector always has 50 * 4 * 4 = 800
    elements as long as the input is large enough for the two 5x5
    convolutions and the 2x2 max pool.
    """

    def __init__(self):
        super().__init__()
        conv_stage = [
            nn.Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1), padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),
            nn.Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1), padding=0),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(output_size=(4, 4)),
        ]
        head = [
            nn.Linear(800, 500),
            nn.ReLU(),
            nn.Linear(500, 10),
        ]
        self.features = nn.Sequential(*conv_stage)
        self.classify = nn.Sequential(*head)

    def forward(self, x):
        """Return raw class scores of shape [N, 10] for a batch ``x``."""
        feature_maps = self.features(x)
        # 50 channels * 4 * 4 spatial positions per sample.
        flat = feature_maps.view(-1, 800)
        return self.classify(flat)
if __name__ == '__main__':
    # Smoke test: push a random batch of two 28x28 grayscale images through
    # the network and show both the raw scores and their softmax.
    net = LeNet()
    img = torch.randn(2, 1, 28, 28)

    score = net(img)
    print(score)

    # Normalise across the class dimension so that each row sums to 1.
    probs = torch.softmax(score, dim=1)
    print(probs)
LeNet-5
C1层是一个卷积层
6个特征图,每个特征图中的每个神经元与输入中5×5的邻域相连,特征图大小为28×28
每个卷积神经元的参数数目:5*5=25个weight参数和一个bias参数
连接数目:(5×5+1)×6×(28×28)=122304个连接
参数共享:每个特征图内共享参数,因此参数总数:共(5*5+1)*6=156个参数
S2层是一个下采样层
6个14×14的特征图,每个图中的每个单元与C1特征图中的一个2×2邻域相连接,不重叠。
和max pooling和average pooling不一样,在S2层中每个单元的4个输入相
加,乘以一个可训练参数w,再加上一个可训练偏置b,结果通过sigmoid函数计算得到最终池化之后的值。
连接数:(2×2+1)×14×14×6=5880个。
参数共享:每个特征图内共享参数,因此有2*6=12个可训练参数
C3层是一个卷积层
输入的feature map数量为6个,每个大小为14×14;16个卷积核,得到16张特征图,特征图大小为10×10。
每个特征图中的每个神经元与S2中某几层的多个5*5的邻域相连;
例如:对于C3层第0张特征图,其每一个节点与S2层的第0~2张特征图,总共3个5*5个节点相连接。
S4层是一个下采样层(和S2一样)
由16个5×5大小的特征图构成,特征图中的每个单元与C3中相应特征图的2×2邻域相连接。
连接数:(2×2+1)×5×5×16=2000个。
参数共享:特征图内共享参数,每个特征图中的每个神经元需要1个因子和一个偏置,因此有2*16个可训练参数。
C5层是一个卷积层
120个神经元,可以看作120个特征图,每张特征图的大小为1*1
每个单元与S4层的全部16个单元的5*5邻域相连(S4和C5之间的全连接)
连接数=可训练参数:(5×5×16+1)×120=48120个
F6层是一个全连接层
有84个单元,与C5层全连接。
F6层计算输入向量和权重向量之间的点积,再加上一个偏置(wx+b),最后将加权值做一个sigmoid转换。
连接数=可训练参数:(120+1)*84=10164。
这里选择84作为神经元的数目,从论文中可以认为是:ASCII字符标准的打印字符是用7×12大小的位图表示的,这里希望每一维特征分别体现标准7×12大小位图上每一个像素点的特性。
F7层是一个输出层
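输出层是由欧式径向基函数(RBF)组成。每一个输出对应一个RBF函数,每一个RBF函数都有84维的输入向量,RBF的函数公式为:$y_i = \sum_{j=1}^{84} (x_j - w_{ij})^2$,其中 $x_j$ 是F6层的第 $j$ 个输出,$w_{ij}$ 是第 $i$ 个RBF单元参数向量的第 $j$ 个分量。每一个RBF函数都会有一个输出,最后输出层会输出一个10维的向量。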
卷积神经网络典型CNN-AlexNet
在AlexNet引入了一种特殊的网络层次,即:Local Response Normalization(LRN,局部响应归一化),主要是对ReLU激活函数的输出进行局部归一化操作(和LN差不多)。
AlexNet结构优化
非线性激活函数:ReLU
使用Max Pooling,并且提出让步长小于池化核的尺寸,使相邻池化窗口之间存在重叠,提升了特征的丰富性。
防止过拟合的方法:Dropout,Data augmentation(数据增强)
大数据训练:百万级ImageNet图像数据
GPU实现:在每个GPU中放置一半核(或神经元),还有一个额外的技巧:GPU间的通讯只在某些层进行。
LRN归一化:对局部神经元的活动创建了竞争机制,使得其中响应比较大的值变得相对更大,
并抑制其它反馈较小的神经元,增强了模型的泛化能力。本质上,LRN是仿造生物学上活跃的神经元对于相邻神经元的抑制现象(侧抑制)。
import torch
import torch.nn as nn
class AlerxNet(nn.Module):
    """Two-tower AlexNet-style classifier whose towers live on two devices.

    Each tower runs the same convolutional stack on its own device. After
    the first stage the towers exchange their feature maps once, so that the
    second stage of *both* towers sees the 256 channels produced by the two
    first stages. The classifier runs on ``device1``, which is also the
    device of the returned scores.

    Args:
        device1: device of tower 1 and of the classifier.
        device2: device of tower 2.
    """

    def __init__(self, device1, device2):
        super(AlerxNet, self).__init__()
        self.device1 = device1
        self.device2 = device2
        self.feature11 = self._first_stage().to(self.device1)
        self.feature21 = self._first_stage().to(self.device2)
        # Both second stages consume the concatenation of the two first-stage
        # outputs (128 + 128 channels).
        self.feature12 = self._second_stage().to(self.device1)
        self.feature22 = self._second_stage().to(self.device2)
        # The classifier must sit on the device where the towers are merged,
        # otherwise a non-CPU ``device1`` fails with a device mismatch.
        self.classsify = nn.Sequential(
            nn.Linear(6 * 6 * 128 * 2, 4096),
            nn.ReLU(),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Linear(4096, 1000)
        ).to(self.device1)

    @staticmethod
    def _first_stage():
        """Build the conv/LRN/pool stack applied to the raw image by one tower."""
        return nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=48, kernel_size=(11, 11), stride=(4, 4), padding=2),
            nn.ReLU(),
            nn.LocalResponseNorm(size=5),
            nn.MaxPool2d(3, 2),
            nn.Conv2d(in_channels=48, out_channels=128, kernel_size=(5, 5), stride=(1, 1), padding=2),
            nn.ReLU(),
            nn.MaxPool2d(3, 2)
        )

    @staticmethod
    def _second_stage():
        """Build the three 3x3 conv layers + pool applied after the exchange."""
        return nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=192, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=192, out_channels=192, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=192, out_channels=128, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.MaxPool2d(3, 2)
        )

    def forward(self, x):
        """Return class scores of shape [N, 1000] located on ``device1``.

        The classifier expects 6 * 6 * 128 * 2 features per sample, which is
        what a 224x224 input produces.
        """
        x1 = x.to(self.device1)
        x2 = x.to(self.device2)
        z1 = self.feature11(x1)
        z2 = self.feature21(x2)
        # Exchange features between the towers. Both concatenations are built
        # from the *first-stage* outputs; reusing an already merged tensor
        # here would feed tower 2 its own features twice.
        merged1 = torch.cat([z1, z2.to(self.device1)], dim=1)
        merged2 = torch.cat([z2, z1.to(self.device2)], dim=1)
        z1 = self.feature12(merged1)
        z2 = self.feature22(merged2)
        z = torch.cat([z1, z2.to(self.device1)], dim=1)
        z = z.view(-1, 6 * 6 * 128 * 2)
        z = self.classsify(z)
        return z
if __name__ == '__main__':
    # Demo: run one random batch through the two-tower network.
    print(torch.cuda.is_available())
    device1 = torch.device('cpu')
    # Fall back to the CPU for the second tower when no GPU is present, so
    # the demo also runs on CPU-only machines.
    device2 = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    net = AlerxNet(device1, device2)
    img = torch.randn(2, 3, 224, 224)
    score = net(img)
    print(score)
卷积神经网络典型CNN-ZF Net
ZF Net
基于AlexNet进行微调
修改窗口大小和步长
使用稠密单GPU的网络结构替换AlexNet的稀疏双GPU结构
Top5错误率11.2%
使用ReLU激活函数和交叉熵损失函数
import torch
import torch.nn as nn
class ZFNet(nn.Module):
    """ZFNet-style single-device classifier producing 1000 class scores.

    The classifier expects 6 * 6 * 256 features per sample, which is what a
    224x224 RGB input yields after the convolutional stack below.
    """

    def __init__(self):
        super().__init__()
        backbone = [
            nn.Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2), padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=1),
            nn.LocalResponseNorm(size=30),
            nn.Conv2d(96, 256, kernel_size=(5, 5), stride=(2, 2)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=1),
            nn.Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2)),
        ]
        head = [
            nn.Linear(6 * 6 * 256, 4096),
            nn.ReLU(),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Linear(4096, 1000),
        ]
        self.feature = nn.Sequential(*backbone)
        self.classify = nn.Sequential(*head)

    def forward(self, x):
        """Return raw class scores of shape [N, 1000]."""
        maps = self.feature(x)
        flat = maps.view(-1, 6 * 6 * 256)
        return self.classify(flat)
if __name__ == '__main__':
    # Smoke test: two random 224x224 RGB images -> scores -> probabilities.
    net = ZFNet()
    img = torch.randn(2, 3, 224, 224)

    score = net(img)
    print(score)

    # Softmax over the class dimension.
    probs = torch.softmax(score, dim=1)
    print(probs)
卷积神经网络典型CNN-VGGNet
import torch
import torch.nn as nn
import torch.nn.functional as F
class VggBlock(nn.Module):
    """``n`` conv+ReLU layers followed by a 2x2 max pool.

    Args:
        in_channel: channels of the block input.
        out_channel: channels produced by every convolution of the block.
        n: number of conv+ReLU layers.
        use_11: when true, the last convolution uses a 1x1 kernel instead
            of 3x3.
    """

    def __init__(self, in_channel, out_channel, n, use_11=False):
        super().__init__()
        stages = []
        channels = in_channel
        for idx in range(n):
            is_last = idx == n - 1
            kernel = (1, 1) if (use_11 and is_last) else (3, 3)
            stages.append(
                nn.Sequential(
                    # 'same' padding keeps the spatial size unchanged.
                    nn.Conv2d(channels, out_channel, kernel_size=kernel, stride=(1, 1), padding='same'),
                    nn.ReLU(),
                )
            )
            # Every convolution after the first one maps out -> out channels.
            channels = out_channel
        stages.append(nn.MaxPool2d(2, 2))
        self.block = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the block; the pooling halves the spatial size."""
        return self.block(x)
class VggNet(nn.Module):
    """Generic VGG wrapper: feature extractor -> 7x7 adaptive pool -> MLP head.

    Args:
        features: module turning the image batch into feature maps.
        num_classes: number of output scores per sample.
        classify_input_channel: channel count of the maps produced by
            ``features``.
    """

    def __init__(self, features, num_classes, classify_input_channel):
        super().__init__()
        self.num_classes = num_classes
        self.features = features
        self.pooling = nn.AdaptiveAvgPool2d(output_size=(7, 7))
        flat_size = 7 * 7 * classify_input_channel
        self.classify = nn.Sequential(
            nn.Linear(in_features=flat_size, out_features=4096),
            nn.ReLU(),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Linear(4096, self.num_classes),
        )

    def forward(self, images):
        """Score a batch of images.

        images: [N, 3, H, W] raw image batch.
        return: [N, num_classes] per-class confidence scores.
        """
        # [N, 3, H, W] -> [N, classify_input_channel, ?, ?] -> [N, classify_input_channel, 7, 7]
        pooled = self.pooling(self.features(images))
        # Flatten everything except the batch dimension for the linear head.
        return self.classify(torch.flatten(pooled, 1))
class Vgg16cNet(nn.Module):
    """VGG-16 variant whose last conv of blocks 3-5 uses a 1x1 kernel."""

    def __init__(self, num_classes):
        super().__init__()
        # (in_channel, out_channel, number of convs, use 1x1 for the last conv)
        specs = [
            (3, 64, 2, False),
            (64, 128, 2, False),
            (128, 256, 3, True),
            (256, 512, 3, True),
            (512, 512, 3, True),
        ]
        backbone = nn.Sequential(
            *[VggBlock(c_in, c_out, depth, use_11=one_by_one) for c_in, c_out, depth, one_by_one in specs]
        )
        self.vgg = VggNet(features=backbone, num_classes=num_classes, classify_input_channel=512)

    def forward(self, images):
        """Delegate to the wrapped ``VggNet``."""
        return self.vgg(images)
class Vgg16Net(nn.Module):
    """VGG-16 layout: blocks of 2, 2, 3, 3 and 3 convolutions with 3x3 kernels."""

    def __init__(self, num_classes):
        super().__init__()
        # (in_channel, out_channel, number of convs) per block
        specs = [
            (3, 64, 2),
            (64, 128, 2),
            (128, 256, 3),
            (256, 512, 3),
            (512, 512, 3),
        ]
        backbone = nn.Sequential(*[VggBlock(c_in, c_out, depth) for c_in, c_out, depth in specs])
        self.vgg = VggNet(features=backbone, num_classes=num_classes, classify_input_channel=512)

    def forward(self, images):
        """Delegate to the wrapped ``VggNet``."""
        return self.vgg(images)
class Vgg19Net(nn.Module):
    """VGG-19 layout: blocks of 2, 2, 4, 4 and 4 convolutions with 3x3 kernels."""

    def __init__(self, num_classes):
        super().__init__()
        # (in_channel, out_channel, number of convs) per block
        specs = [
            (3, 64, 2),
            (64, 128, 2),
            (128, 256, 4),
            (256, 512, 4),
            (512, 512, 4),
        ]
        backbone = nn.Sequential(*[VggBlock(c_in, c_out, depth) for c_in, c_out, depth in specs])
        self.vgg = VggNet(features=backbone, num_classes=num_classes, classify_input_channel=512)

    def forward(self, images):
        """Delegate to the wrapped ``VggNet``."""
        return self.vgg(images)
class VggLabelNet(nn.Module):
    """Wrap a scoring network and translate its predictions into label names.

    Args:
        vgg: module mapping an image batch to [N, num_classes] scores. The
            class ids it predicts must be keys of ``id2name``.
    """

    def __init__(self, vgg):
        super(VggLabelNet, self).__init__()
        self.vgg = vgg
        self.id2name = {
            0: 'dog',
            1: 'cat',
            2: 'cow',
            3: 'sheep'
        }

    def forward(self, images):
        """Return the predicted label name of every sample as a list of str.

        Raises:
            KeyError: if a predicted class id is missing from ``id2name``.
        """
        scores = self.vgg(images)  # [N, C, H, W] -> [N, num_classes]
        # [N, num_classes] -> [N]; move to the CPU first so that this also
        # works when the wrapped model runs on an accelerator.
        pred_index = torch.argmax(scores, dim=1).detach().cpu().tolist()
        # Return the names themselves; returning the raw indices would
        # discard the id -> name translation this wrapper exists for.
        return [self.id2name[idx] for idx in pred_index]
if __name__ == '__main__':
    # Demo: wrap the 1x1-kernel VGG-16 variant with the label translator,
    # show the module tree, then run four random 224x224 RGB images.
    vgg16 = Vgg16cNet(num_classes=4)
    vgg_label = VggLabelNet(vgg16)
    print(vgg_label)

    r = vgg_label(torch.rand(4, 3, 224, 224))
    print(r)
from pathlib import Path
from typing import Union, List
import torch
from torchvision import models, transforms
from PIL import Image
import torch.nn as nn
import torchvision
class VggHook(object):
    """Capture the outputs of selected ``vgg.features`` layers via forward hooks.

    Args:
        vgg: model exposing an indexable ``features`` container.
        indexes: either the list of layer indexes to hook, or an int ``k``
            meaning "the first ``k`` layers".

    After a forward pass ``images`` maps each hooked index to the CPU copy
    of that layer's latest output.
    """

    def __init__(self, vgg, indexes: Union[int, List[int]] = 44):
        layer_ids = list(range(indexes)) if isinstance(indexes, int) else indexes
        self.images = {}
        # Keep the handles so that the hooks can be detached again later.
        self.hooks = [
            vgg.features[layer_id].register_forward_hook(self._bulid_hook(layer_id))
            for layer_id in layer_ids
        ]

    def _bulid_hook(self, idx):
        """Create a forward hook storing the module output under key ``idx``."""

        def _store_output(module, module_input, module_output):
            # Save the current module's output (moved to the CPU).
            self.images[idx] = module_output.cpu()

        return _store_output

    def remove(self):
        """Detach every registered hook from the model."""
        for handle in self.hooks:
            handle.remove()
if __name__ == '__main__':
    # Visualise the intermediate feature maps of a pretrained VGG16-BN for a
    # handful of sample images.
    vgg = models.vgg16_bn(pretrained=True)  # downloads the pretrained VGG16-BN weights
    vgg_hooks = VggHook(vgg)
    vgg.eval().cpu()
    print(vgg)

    tfs = transforms.ToTensor()
    resize = transforms.Resize(size=(50, 60))
    image_path = {
        '小狗': r'../datas/小狗.png',
        '小狗2': r'../datas/小狗2.png',
        '小猫': r'../datas/小猫.jpg',
        '飞机': r'../datas/飞机.jpg',
        '飞机2': r'../datas/飞机2.jpg'
    }

    output_dir = Path('./output/vgg/features/')
    for name, path in image_path.items():
        # Close the image file as soon as the pixel data has been converted.
        with Image.open(path) as image_file:
            img = image_file.convert("RGB")
        img = tfs(img)  # [3, H, W]
        img = img[None]  # [3, H, W] -> [1, 3, H, W]

        # Pure inference: no autograd graph is needed, and without it the
        # tensors captured by the hooks do not keep a graph alive.
        with torch.no_grad():
            score = vgg(img)  # [1, 1000]
            prob = torch.softmax(score, dim=1)
            top5 = torch.topk(prob, 5, dim=1)
        print(name)
        print(top5)

        # Dump the output of every hooked layer as an image grid.
        _output_dir = output_dir / name
        _output_dir.mkdir(parents=True, exist_ok=True)
        for layer_idx, fertures in vgg_hooks.images.items():
            # fertures: [1, C, H, W]
            n = fertures.shape[0]
            for i in range(n):
                imgs = fertures[i: i + 1]
                # [1, C, H, W] -> [C, 1, H, W]: every channel becomes one
                # single-channel image of the grid.
                imgs = torch.permute(imgs, dims=(1, 0, 2, 3))
                imgs = resize(imgs)
                torchvision.utils.save_image(
                    imgs,
                    _output_dir / f'{i}_{layer_idx}.png',
                    nrow=8,
                    padding=5,
                    pad_value=128
                )
    vgg_hooks.remove()
GoogLeNet
可视化:
from pathlib import Path
from typing import Union, List, Optional
import torch
from torchvision import models, transforms
from PIL import Image
import torch.nn as nn
import torchvision
class GoogLeNetHook(object):
    """Capture the outputs of named GoogLeNet sub-modules via forward hooks.

    Args:
        net: model whose attributes are looked up by name.
        names: attribute names to hook. For names starting with
            ``'inception'`` the four branches ``branch1`` .. ``branch4`` of
            that module are hooked individually and stored under
            ``'<name>.branchK'``; any other module is stored under its own
            name. Defaults to the stem layers and all inception modules.

    After a forward pass ``images`` maps each key to the CPU copy of the
    corresponding module's latest output.
    """

    _BRANCHES = ('branch1', 'branch2', 'branch3', 'branch4')

    def __init__(self, net, names: Optional[List[str]] = None):
        if names is None:
            names = ['conv1', 'maxpool1', 'conv2', 'conv3', 'maxpool2', 'inception3a',
                     'inception3b', 'maxpool3', 'inception4a', 'inception4b', 'inception4c',
                     'inception4d', 'inception4e', 'maxpool4', 'inception5a', 'inception5b']
        self.images = {}
        self.hooks = []
        for name in names:
            if name.startswith('inception'):
                inception = getattr(net, name)
                # Hook each branch on its *own* module; attaching all four
                # hooks to branch1 would store branch1's output four times.
                for branch in self._BRANCHES:
                    handle = getattr(inception, branch).register_forward_hook(
                        self._bulid_hook(f"{name}.{branch}")
                    )
                    self.hooks.append(handle)
            else:
                hook = getattr(net, name).register_forward_hook(self._bulid_hook(name))
                self.hooks.append(hook)

    def _bulid_hook(self, idx):
        """Create a forward hook storing the module output under key ``idx``."""

        def hook(module, module_input, module_output):
            # Save the current module's output (moved to the CPU).
            self.images[idx] = module_output.cpu()

        return hook

    def remove(self):
        """Detach every registered hook from the model."""
        for hook in self.hooks:
            hook.remove()
if __name__ == '__main__':
    # Visualise the intermediate feature maps of a pretrained GoogLeNet for a
    # handful of sample images.
    model = models.googlenet(pretrained=True)  # downloads the pretrained GoogLeNet weights
    model.eval().cpu()
    hooks = GoogLeNetHook(model)
    print(model)

    tfs = transforms.ToTensor()
    resize = transforms.Resize(size=(50, 60))
    image_path = {
        '小狗': r'../datas/小狗.png',
        '小狗2': r'../datas/小狗2.png',
        '小猫': r'../datas/小猫.jpg',
        '飞机': r'../datas/飞机.jpg',
        '飞机2': r'../datas/飞机2.jpg'
    }
    output_dir = Path('./output/googlenet/features/')
    for name, path in image_path.items():
        # Close the image file as soon as the pixel data has been converted.
        with Image.open(path) as image_file:
            img = image_file.convert("RGB")
        img = tfs(img)  # [3, H, W]
        img = img[None]  # [3, H, W] -> [1, 3, H, W]

        # Pure inference: no autograd graph is needed, and without it the
        # tensors captured by the hooks do not keep a graph alive.
        with torch.no_grad():
            score = model(img)  # [1, 1000]
            prob = torch.softmax(score, dim=1)
            top5 = torch.topk(prob, 5, dim=1)
        print("=" * 100)
        print(name)
        print(top5)

        # Dump the output of every hooked module as an image grid.
        _output_dir = output_dir / name
        _output_dir.mkdir(parents=True, exist_ok=True)
        for layer_name, fertures in hooks.images.items():
            # fertures: [1, C, H, W]
            n = fertures.shape[0]
            for i in range(n):
                imgs = fertures[i: i + 1]
                # [1, C, H, W] -> [C, 1, H, W]: every channel becomes one
                # single-channel image of the grid.
                imgs = torch.permute(imgs, dims=(1, 0, 2, 3))
                imgs = resize(imgs)
                torchvision.utils.save_image(
                    imgs,
                    _output_dir / f'{i}_{layer_name}.png',
                    nrow=8,
                    padding=5,
                    pad_value=128
                )
    hooks.remove()
自己实现:
Inception架构的主要思想是找出如何让已有的稠密组件接近与覆盖卷积视觉网络中的最佳局部稀疏结构。
为了避免patch校准问题,现在的滤波器大小限制在1x1,3x3和5x5,主要是为了方便,不是必要的。
另外,在pooling层添加一个额外的并行pooling路径用于提高效率。
架构的第二个主要思想:在计算要求增加很多的地方应用维度缩减和预测。即,在3x3和5x5的卷积前用一个1x1的卷积用于减少计算,还用于修正线性激活。
Network-in-Network主要思想是,用全连接的多层感知机去代替传统的卷积过程,以获取特征更加全面的表达,同时,因为前面已经做了提升特征表达的过程,传统CNN最后的全连接层也被替换为一个全局平均池化层,因为作者认为此时的map已经具备分类足够的可信度了,它可以直接通过softmax来计算loss了。
GoogLeNet借鉴了NIN的特性,在原先的卷积过程中附加了1*1的卷积核加上ReLU激活。
这不仅仅提升了网络的深度,提高了representation power,而且文中还通过1*1的卷积来进行降维,减少了更新参数量。
import torch
import torch.nn as nn
class GolbalAvgPool2d(nn.Module):
    """Global average pooling over the two trailing (spatial) dimensions."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Average over H and W while keeping them as size-1 axes.

        [N, C, H, W] -> [N, C, 1, 1]
        """
        spatial_axes = (2, 3)
        return x.mean(dim=spatial_axes, keepdim=True)
class BasicConv2d(nn.Module):
    """A 2-D convolution immediately followed by a ReLU activation.

    The constructor arguments are forwarded unchanged to ``nn.Conv2d``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(conv(x))``."""
        conv_out = self.conv(x)
        return self.relu(conv_out)
class Inception(nn.Module):
    """Inception module: four parallel branches concatenated along channels.

    Branches: 1x1 conv; 1x1 -> 3x3 conv; 1x1 -> 5x5 conv; 3x3 max pool ->
    1x1 conv. Every branch preserves the spatial size (stride 1 with
    matching padding), so the outputs can be concatenated.
    """

    def __init__(self, in_channels, out_channels, ):
        """
        in_channels: number of input channels, e.g. 192
        out_channels: output channels of each branch, e.g.
            [[64], [96, 128], [16, 32], [32]]
        """
        super().__init__()
        b1, b2, b3, b4 = out_channels[0], out_channels[1], out_channels[2], out_channels[3]
        self.branch1 = nn.Sequential(
            BasicConv2d(in_channels, b1[0], kernel_size=1, stride=1, padding=0),
        )
        self.branch2 = nn.Sequential(
            BasicConv2d(in_channels, b2[0], kernel_size=1, stride=1, padding=0),
            BasicConv2d(b2[0], b2[1], kernel_size=3, stride=1, padding=1),
        )
        self.branch3 = nn.Sequential(
            BasicConv2d(in_channels, b3[0], kernel_size=1, stride=1, padding=0),
            BasicConv2d(b3[0], b3[1], kernel_size=5, stride=1, padding=2),
        )
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(3, 1, padding=1),
            BasicConv2d(in_channels, b4[0], kernel_size=1, stride=1, padding=0),
        )

    def forward(self, x):
        """[N, C, H, W] -> [N, C1 + C2 + C3 + C4, H, W]."""
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        # Each branch maps [N, C, H, W] -> [N, Ck, H, W].
        outputs = [branch(x) for branch in branches]
        return torch.concat(outputs, dim=1)
class GoogLeNet(nn.Module):
    """GoogLeNet-style classifier with optional auxiliary heads.

    Args:
        num_class: number of output classes.
        add_aux_stage: when true, two auxiliary classifiers are attached to
            the outputs of ``stage1`` and ``stage2`` and ``forward`` returns
            three score tensors instead of one.
    """

    def __init__(self, num_class, add_aux_stage=False):
        super(GoogLeNet, self).__init__()
        self.stage1 = nn.Sequential(
            BasicConv2d(3, 64, 7, 2, 3),
            nn.MaxPool2d(3, 2, padding=1),
            # nn.LocalResponseNorm(size=10),
            BasicConv2d(64, 64, 1, 1, 0),
            BasicConv2d(64, 192, 3, 1, 1),
            nn.MaxPool2d(3, 2, padding=1),
            Inception(192, [[64], [96, 128], [16, 32], [32]]),  # inception3a
            Inception(256, [[128], [128, 192], [32, 96], [64]]),  # inception3b
            nn.MaxPool2d(3, 2, padding=1),
            Inception(480, [[192], [96, 208], [16, 48], [64]])  # inception4a
        )
        self.stage2 = nn.Sequential(
            Inception(512, [[160], [112, 224], [24, 64], [64]]),  # inception4b
            Inception(512, [[128], [128, 256], [24, 64], [64]]),  # inception4c
            Inception(512, [[112], [144, 288], [32, 64], [64]]),  # inception4d
        )
        self.stage3 = nn.Sequential(
            Inception(528, [[256], [160, 320], [32, 128], [128]]),  # inception4e
            nn.MaxPool2d(3, 2, padding=1),
            Inception(832, [[256], [160, 320], [32, 128], [128]]),  # inception5a
            Inception(832, [[384], [192, 384], [48, 128], [128]]),  # inception5b
            GolbalAvgPool2d()
        )
        # 1x1 convolution acting as the final linear classifier.
        self.classify = nn.Conv2d(1024, num_class, kernel_size=(1, 1), stride=(1, 1), padding=0)
        if add_aux_stage:
            self.aux_stage1 = self._make_aux_head(512, num_class)
            self.aux_stage2 = self._make_aux_head(528, num_class)
        else:
            self.aux_stage1 = None
            self.aux_stage2 = None

    @staticmethod
    def _make_aux_head(in_channels, num_class):
        """Build one auxiliary classifier operating on ``in_channels`` maps."""
        return nn.Sequential(
            nn.MaxPool2d(5, 3, padding=0),
            nn.Conv2d(in_channels, 1024, kernel_size=(1, 1), stride=(1, 1), padding=0),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(output_size=(2, 2)),
            nn.Flatten(1),
            nn.Linear(4096, 2048),  # 1024 channels * 2 * 2
            nn.Dropout(p=0.4),
            nn.ReLU(),
            nn.Linear(2048, num_class)
        )

    def forward(self, x):
        """Score a batch of images.

        x: [N, C, H, W]
        return: ``[N, num_class]`` scores, or the tuple
            ``(aux_scores1, aux_scores2, scores)`` when the auxiliary heads
            are enabled.
        """
        z1 = self.stage1(x)  # [N, C, H, W] -> [N, 512, H1, W1]
        z2 = self.stage2(z1)  # [N, 512, H1, W1] -> [N, 528, H2, W2]
        z3 = self.stage3(z2)  # [N, 528, H2, W2] -> [N, 1024, 1, 1]

        # Outputs of the (up to) three decision branches.
        # [N, 1024, 1, 1] -> [N, num_class, 1, 1] -> [N, num_class]
        # Flatten only the trailing axes: a bare squeeze() would also drop
        # the batch axis when N == 1.
        scores3 = torch.flatten(self.classify(z3), 1)
        if self.aux_stage1 is not None:
            score1 = self.aux_stage1(z1)
            score2 = self.aux_stage2(z2)
            return score1, score2, scores3
        else:
            return scores3
def t1():
    """Run the three-headed GoogLeNet once, then trace and pickle it.

    Writes ``./output/modules/googlenet.pt`` (traced module) and
    ``./output/modules/googlenet.pkl`` (pickled model object).
    """
    import os

    net = GoogLeNet(num_class=4, add_aux_stage=True)
    loss_fn = nn.CrossEntropyLoss()
    _x = torch.rand(2, 3, 224, 224)
    _y = torch.tensor([0, 3], dtype=torch.long)  # simulated ground-truth class ids
    # Predictions of the three branches; combined with the labels they form
    # the training loss.
    _r1, _r2, _r3 = net(_x)
    _loss1 = loss_fn(_r1, _y)
    _loss2 = loss_fn(_r2, _y)
    _loss3 = loss_fn(_r3, _y)
    # Sum the three branch losses (a stray comma would build a tuple).
    _loss = _loss1 + _loss2 + _loss3
    print(_r1)
    print(_r2)
    print(_r3)
    print(_r3.shape)
    print(_loss)

    # Both save calls below fail if the target directory does not exist.
    os.makedirs('./output/modules', exist_ok=True)
    traceed_script_module = torch.jit.trace(net.eval(), _x)
    traceed_script_module.save('./output/modules/googlenet.pt')

    # Persist the whole model object.
    torch.save(net, './output/modules/googlenet.pkl')
def t2():
    """Reload the pickled model, copy its weights into a head-less GoogLeNet and export it.

    Writes ``./output/modules/googlenet.pt`` (traced module) and
    ``./output/modules/googlenet_dynamic.onnx``.

    Raises:
        ValueError: if some parameters of the new network were not restored.
    """
    # NOTE(review): this unpickles a full model object. Only load files this
    # project produced itself; newer PyTorch releases may also require
    # ``weights_only=False`` here - confirm against the installed version.
    net1 = torch.load('./output/modules/googlenet.pkl')
    net2 = GoogLeNet(num_class=4, add_aux_stage=False)
    # missing keys: parameters of net2 that were not restored.
    # unexpected keys: parameters present in the given state dict that net2
    # does not have (here: the auxiliary heads).
    missing_keeys, unexpected_keys = net2.load_state_dict(net1.state_dict(), strict=False)
    if len(missing_keeys) > 0:
        raise ValueError(f"网络有部分参数没有恢复:{missing_keeys}")

    _x = torch.rand(2, 3, 224, 224)
    traceed_script_module = torch.jit.trace(net2.eval(), _x)
    traceed_script_module.save('./output/modules/googlenet.pt')

    # Convert to ONNX.
    torch.onnx.export(
        model=net2.eval().cpu(),  # the model object
        args=_x,  # example input passed to the model's forward
        f='./output/modules/googlenet_dynamic.onnx',  # output file name
        # training=TrainingMode.EVAL,
        do_constant_folding=True,
        input_names=['images'],  # names of the input tensors
        output_names=['scores'],  # names of the output tensors
        opset_version=12,
        # dynamic_axes=None  # None would export fixed shapes
        dynamic_axes={
            'images': {
                0: 'n',
                2: 'h',
                3: 'w'
            },
            # The key must match an entry of output_names, otherwise the
            # dynamic batch axis of the output is not applied.
            'scores': {
                0: 'n'
            }
        }
    )
if __name__ == '__main__':
    # Manual check of a single Inception block, kept for reference:
    #   inception = Inception(192, [[64], [96, 128], [16, 32], [32]])
    #   print(inception)
    #   _x = torch.rand(4, 192, 100, 100)
    #   _r = inception(_x)
    #   print(_r.shape)

    # t1 writes the pickled model that t2 reloads, so the order matters.
    t1()
    t2()
ResNet
使用了一种连接方式叫做“shortcut connection” ,顾名思义,shortcut就是“抄近道”的意思。
实线的Connection部分(“第一个粉色矩形和第三个粉色矩形”)都是3x3x64的特征图,它们的channel个数一致,所以采用计算方式:y=F(x)+x
虚线的Connection部分(“第一个绿色矩形和第三个绿色矩形”)分别是3x3x64和3x3x128的特征图,它们的channel个数不同(64和128),所以采用计算方式:y=F(x)+Wx
其中W是卷积操作,用来调整x的channel维度的。
可视化
from pathlib import Path
from typing import Optional, List
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import transforms
import torchvision
from PIL import Image
class RestNet(object):
    """Capture the outputs of named top-level modules via forward hooks.

    Args:
        net: model whose attributes are looked up by name.
        names: attribute names to hook; defaults to the stem layers, the
            four residual stages and the average pool.

    After a forward pass ``images`` maps each name to the CPU copy of the
    corresponding module's latest output.
    """

    def __init__(self, net, names: Optional[List[str]] = None):
        if names is None:
            names = ['conv1', 'bn1', 'relu', 'maxpool', 'layer1',
                     'layer2', 'layer3', 'layer4', 'avgpool']
        self.images = {}
        # Keep the handles so that the hooks can be detached again later.
        self.hooks = [
            getattr(net, module_name).register_forward_hook(self._bulid_hook(module_name))
            for module_name in names
        ]

    def _bulid_hook(self, idx):
        """Create a forward hook storing the module output under key ``idx``."""

        def _store_output(module, module_input, module_output):
            # Save the current module's output (moved to the CPU).
            self.images[idx] = module_output.cpu()

        return _store_output

    def reset_images(self):
        """Forget all captured outputs."""
        self.images = {}

    def remove(self):
        """Detach every registered hook from the model."""
        for handle in self.hooks:
            handle.remove()
if __name__ == '__main__':
    # Visualise the intermediate feature maps of a pretrained ResNet-18 for a
    # handful of sample images.
    model = models.resnet18(pretrained=True)
    model.eval().cpu()
    hooks = RestNet(model)
    print(model)

    tfs = transforms.ToTensor()
    resize = transforms.Resize(size=(50, 60))
    image_path = {
        '小狗': r'../datas/小狗.png',
        '小狗2': r'../datas/小狗2.png',
        '小猫': r'../datas/小猫.jpg',
        '飞机': r'../datas/飞机.jpg',
        '飞机2': r'../datas/飞机2.jpg'
    }
    output_dir = Path('./output/resnet18/features/')
    for name, path in image_path.items():
        # Close the image file as soon as the pixel data has been converted.
        with Image.open(path) as image_file:
            img = image_file.convert("RGB")
        img = tfs(img)  # [3, H, W]
        img = img[None]  # [3, H, W] -> [1, 3, H, W]

        # Pure inference: no autograd graph is needed, and without it the
        # tensors captured by the hooks do not keep a graph alive.
        with torch.no_grad():
            score = model(img)  # [1, 1000]
            prob = torch.softmax(score, dim=1)
            top5 = torch.topk(prob, 5, dim=1)
        print("=" * 100)
        print(name)
        print(top5)

        # Dump the output of every hooked module as an image grid.
        _output_dir = output_dir / name
        _output_dir.mkdir(parents=True, exist_ok=True)
        for layer_name, fertures in hooks.images.items():
            # fertures: [1, C, H, W]
            n = fertures.shape[0]
            for i in range(n):
                imgs = fertures[i: i + 1]
                # [1, C, H, W] -> [C, 1, H, W]: every channel becomes one
                # single-channel image of the grid.
                imgs = imgs.permute(1, 0, 2, 3)
                imgs = resize(imgs)
                torchvision.utils.save_image(
                    imgs,
                    _output_dir / f'{i}_{layer_name}.png',
                    nrow=8,
                    padding=5,
                    pad_value=128
                )
        # Start the next image with an empty capture dict.
        hooks.reset_images()
    hooks.remove()
DenseNet
DenseNet(Dense Convolutional Network)是一种具有密集连接的卷积神经网络,在这个网络结构中任意两层之间均存在直接连接,也就是说每一层的输入都是前面所有层输出的并集,而该层所学习的特征图也会被直接传给其后面所有层作为输入
NOTE:DenseNet中的dense connectivity仅存在一个dense block中,不同dense block块之间是没有dense connectivity的。
密集连接的优点,缓解梯度消失问题,加强特征传播,增加特征复用,极大的减少参数量。
DenseNet中的dense block类似ResNet中的block结构,也即是:BN-ReLU-Conv(1x1)->BN-ReLU- Conv(3x3),并且DenseNet中的dense block具有多个这样的block结构。
每个dense block之间层称为transition layer,由BN-Conv(1x1)- AveragePooling(2x2)组成。
import re
from collections import OrderedDict
from functools import partial
from typing import Any, List, Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from torch import Tensor
from torchvision.models._api import register_model, Weights, WeightsEnum
from torchvision.models._utils import _ovewrite_named_param, handle_legacy_interface
from torchvision.transforms._presets import ImageClassification
from torchvision.utils import _log_api_usage_once
from torchvision.models._meta import _IMAGENET_CATEGORIES
class BasicConv2d(nn.Module):
    """A 2-D convolution immediately followed by a ReLU activation.

    The constructor arguments are forwarded unchanged to ``nn.Conv2d``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(conv(x))``."""
        conv_out = self.conv(x)
        return self.relu(conv_out)
class Inception(nn.Module):
    """Inception module: four parallel branches concatenated along channels.

    Branches: 1x1 conv; 1x1 -> 3x3 conv; 1x1 -> 5x5 conv; 3x3 max pool ->
    1x1 conv. Every branch preserves the spatial size (stride 1 with
    matching padding), so the outputs can be concatenated.
    """

    def __init__(self, in_channels, out_channels, ):
        """
        in_channels: number of input channels, e.g. 192
        out_channels: output channels of each branch, e.g.
            [[64], [96, 128], [16, 32], [32]]
        """
        super().__init__()
        b1, b2, b3, b4 = out_channels[0], out_channels[1], out_channels[2], out_channels[3]
        self.branch1 = nn.Sequential(
            BasicConv2d(in_channels, b1[0], kernel_size=1, stride=1, padding=0),
        )
        self.branch2 = nn.Sequential(
            BasicConv2d(in_channels, b2[0], kernel_size=1, stride=1, padding=0),
            BasicConv2d(b2[0], b2[1], kernel_size=3, stride=1, padding=1),
        )
        self.branch3 = nn.Sequential(
            BasicConv2d(in_channels, b3[0], kernel_size=1, stride=1, padding=0),
            BasicConv2d(b3[0], b3[1], kernel_size=5, stride=1, padding=2),
        )
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(3, 1, padding=1),
            BasicConv2d(in_channels, b4[0], kernel_size=1, stride=1, padding=0),
        )

    def forward(self, x):
        """[N, C, H, W] -> [N, C1 + C2 + C3 + C4, H, W]."""
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        # Each branch maps [N, C, H, W] -> [N, Ck, H, W].
        outputs = [branch(x) for branch in branches]
        return torch.concat(outputs, dim=1)
class _DenseLayer(nn.Module):
    """Dense layer whose body is an Inception module emitting ``growth_rate`` channels.

    A quarter of ``growth_rate`` (rounded down) goes to each of the first
    three Inception branches; the fourth branch receives the remainder so
    that the four outputs add up to exactly ``growth_rate``. ``bn_size``
    scales the bottleneck width of the 3x3 and 5x5 branches. ``drop_rate``
    and ``memory_efficient`` are accepted but not used by this
    implementation.
    """

    def __init__(
        self, num_input_features: int, growth_rate: int, bn_size: int, drop_rate: float, memory_efficient: bool = False
    ) -> None:
        super().__init__()
        quarter = int(0.25 * growth_rate)
        bottleneck = bn_size * quarter
        branch_channels = [
            [quarter],
            [bottleneck, quarter],
            [bottleneck, quarter],
            [growth_rate - 3 * quarter],
        ]
        self.model = Inception(in_channels=num_input_features, out_channels=branch_channels)

    # torchscript does not yet support *args, so we overload method
    # allowing it to take either a List[Tensor] or single Tensor
    def forward(self, input: Tensor) -> Tensor:  # noqa: F811
        """Return the new features computed from all previous features.

        ``input`` is either a single tensor or a sequence of tensors, which
        is concatenated along the channel axis first.
        """
        prev_features = input if isinstance(input, Tensor) else torch.concat(input, dim=1)
        return self.model(prev_features)
class _DenseBlock(nn.ModuleDict):
    """Stack of dense layers; every layer sees all features produced before it.

    Layer ``k`` (1-based) is registered as ``denselayer<k>`` and receives
    ``num_input_features + (k - 1) * growth_rate`` input channels.
    """

    _version = 2

    def __init__(
        self,
        num_layers: int,
        num_input_features: int,
        bn_size: int,
        growth_rate: int,
        drop_rate: float,
        memory_efficient: bool = False,
    ) -> None:
        super().__init__()
        for position in range(1, num_layers + 1):
            # Every preceding layer contributed growth_rate extra channels.
            in_features = num_input_features + (position - 1) * growth_rate
            self.add_module(
                f"denselayer{position}",
                _DenseLayer(
                    in_features,
                    growth_rate=growth_rate,
                    bn_size=bn_size,
                    drop_rate=drop_rate,
                    memory_efficient=memory_efficient,
                ),
            )

    def forward(self, init_features: Tensor) -> Tensor:
        """Return the block input concatenated with the output of every layer."""
        collected = [init_features]
        for layer in self.values():
            collected.append(layer(collected))
        return torch.cat(collected, 1)
class _Transition(nn.Sequential):
def __init__(self, num_input_features: int, num_output_features: int) -> None:
super().__init__()
self.norm = nn.BatchNorm2d(num_input_features)
self.relu = nn.ReLU(inplace=True)
self.conv = nn.Conv2d(num_input_features, num_output_features, kernel_size=1, stride=1, bias=False)
self.pool = nn.AvgPool2d(kernel_size=2, stride=2)
class DenseNet(nn.Module):
    """DenseNet-shaped classifier built from the dense blocks defined in this file.

    Args:
        growth_rate: channels added by every dense layer.
        block_config: number of dense layers in each dense block.
        num_init_features: channels produced by the stem convolution.
        bn_size: bottleneck multiplier forwarded to the dense layers.
        drop_rate: forwarded to the dense layers.
        num_classes: number of output scores.
        memory_efficient: forwarded to the dense layers.
    """

    def __init__(
        self,
        growth_rate: int = 32,
        block_config: Tuple[int, int, int, int] = (6, 12, 24, 16),
        num_init_features: int = 64,
        bn_size: int = 4,
        drop_rate: float = 0,
        num_classes: int = 1000,
        memory_efficient: bool = False,
    ) -> None:
        super().__init__()
        _log_api_usage_once(self)

        # Stem: 7x7 stride-2 convolution, batch norm, ReLU, 3x3 stride-2 max pool.
        stem = OrderedDict()
        stem["conv0"] = nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)
        stem["norm0"] = nn.BatchNorm2d(num_init_features)
        stem["relu0"] = nn.ReLU(inplace=True)
        stem["pool0"] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.features = nn.Sequential(stem)

        # Dense blocks, separated by transitions that halve the channel count.
        channels = num_init_features
        last_index = len(block_config) - 1
        for block_index, depth in enumerate(block_config):
            dense_block = _DenseBlock(
                num_layers=depth,
                num_input_features=channels,
                bn_size=bn_size,
                growth_rate=growth_rate,
                drop_rate=drop_rate,
                memory_efficient=memory_efficient,
            )
            self.features.add_module(f"denseblock{block_index + 1}", dense_block)
            channels += depth * growth_rate
            if block_index == last_index:
                # No transition after the final dense block.
                continue
            transition = _Transition(num_input_features=channels, num_output_features=channels // 2)
            self.features.add_module(f"transition{block_index + 1}", transition)
            channels //= 2

        # Final batch norm and the linear classifier.
        self.features.add_module("norm5", nn.BatchNorm2d(channels))
        self.classifier = nn.Linear(channels, num_classes)

        # Official init from torch repo.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
                continue
            if isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
                continue
            if isinstance(m, nn.Linear):
                nn.init.constant_(m.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        """Return class scores of shape [N, num_classes]."""
        activated = F.relu(self.features(x), inplace=True)
        # Global average pool to 1x1, then drop the spatial axes.
        pooled = torch.flatten(F.adaptive_avg_pool2d(activated, (1, 1)), 1)
        return self.classifier(pooled)
def _densenet(
    growth_rate: int,
    block_config: Tuple[int, int, int, int],
    num_init_features: int,
    weights: Optional[WeightsEnum],
    progress: bool,
    **kwargs: Any,
) -> DenseNet:
    """Build a ``DenseNet`` with the given layout.

    Args:
        growth_rate: channels added by every dense layer.
        block_config: number of dense layers per dense block.
        num_init_features: channels produced by the stem convolution.
        weights: optional weight entry. It is only used to derive
            ``num_classes`` from its category list; no checkpoint is
            downloaded or loaded.
        progress: accepted for interface compatibility; unused because
            nothing is downloaded.
        **kwargs: forwarded to the ``DenseNet`` constructor.

    Returns:
        A freshly initialised ``DenseNet``.
    """
    import warnings

    if weights is not None:
        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
        # Make the otherwise silent behaviour visible to the caller.
        warnings.warn(
            "_densenet does not load pretrained parameters: `weights` only sets "
            "num_classes and the returned model is randomly initialised.",
            stacklevel=2,
        )

    model = DenseNet(growth_rate, block_config, num_init_features, **kwargs)
    return model
# Metadata shared by every weight entry below; each entry merges this
# mapping into its own ``meta`` dict.
_COMMON_META = dict(
    min_size=(29, 29),
    categories=_IMAGENET_CATEGORIES,
    recipe="https://github.com/pytorch/vision/pull/116",
    _docs="""These weights are ported from LuaTorch.""",
)
class DenseNet121_Weights(WeightsEnum):
    """Weight entries for the DenseNet-121 layout.

    Each entry bundles a checkpoint URL, a preprocessing preset factory and
    descriptive metadata (shared ``_COMMON_META`` plus per-entry values).
    """

    # NOTE(review): `_densenet` in this file never downloads or loads this
    # checkpoint - confirm whether it matches the Inception-based dense
    # layers defined here before relying on it.
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/densenet121-a639ec97.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 7978856,
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 74.434,
                    "acc@5": 91.972,
                }
            },
            "_ops": 2.834,
            "_file_size": 30.845,
        },
    )
    # Alias of the entry above.
    DEFAULT = IMAGENET1K_V1
class DenseNet161_Weights(WeightsEnum):
    """Weight entries for the DenseNet-161 layout.

    Each entry bundles a checkpoint URL, a preprocessing preset factory and
    descriptive metadata (shared ``_COMMON_META`` plus per-entry values).
    """

    # NOTE(review): no builder in the visible code references this enum -
    # confirm whether it is still needed.
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/densenet161-8d451a50.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 28681000,
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 77.138,
                    "acc@5": 93.560,
                }
            },
            "_ops": 7.728,
            "_file_size": 110.369,
        },
    )
    # Alias of the entry above.
    DEFAULT = IMAGENET1K_V1
class DenseNet169_Weights(WeightsEnum):
    """Weight entries for the DenseNet-169 layout.

    Each entry bundles a checkpoint URL, a preprocessing preset factory and
    descriptive metadata (shared ``_COMMON_META`` plus per-entry values).
    """

    # NOTE(review): no builder in the visible code references this enum -
    # confirm whether it is still needed.
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/densenet169-b2777c0a.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 14149480,
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 75.600,
                    "acc@5": 92.806,
                }
            },
            "_ops": 3.36,
            "_file_size": 54.708,
        },
    )
    # Alias of the entry above.
    DEFAULT = IMAGENET1K_V1
class DenseNet201_Weights(WeightsEnum):
    """Weight entries for the DenseNet-201 layout.

    Each entry bundles a checkpoint URL, a preprocessing preset factory and
    descriptive metadata (shared ``_COMMON_META`` plus per-entry values).
    """

    # NOTE(review): no builder in the visible code references this enum -
    # confirm whether it is still needed.
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/densenet201-c1103571.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 20013928,
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 76.896,
                    "acc@5": 93.370,
                }
            },
            "_ops": 4.291,
            "_file_size": 77.373,
        },
    )
    # Alias of the entry above.
    DEFAULT = IMAGENET1K_V1
@register_model()
@handle_legacy_interface(weights=("pretrained", DenseNet121_Weights.IMAGENET1K_V1))
def my_densenet121(*, weights: Optional[DenseNet121_Weights] = None, progress: bool = True, **kwargs: Any) -> DenseNet:
    """Build the DenseNet-121 layout (growth rate 32, blocks of 6/12/24/16 layers).

    ``weights`` is validated against ``DenseNet121_Weights`` and handed to
    ``_densenet`` together with ``progress`` and any extra keyword
    arguments.
    """
    verified = DenseNet121_Weights.verify(weights)
    return _densenet(
        growth_rate=32,
        block_config=(6, 12, 24, 16),
        num_init_features=64,
        weights=verified,
        progress=progress,
        **kwargs,
    )
if __name__ == '__main__':
    # Build the default (no weights requested) model and show its module tree.
    net = my_densenet121()
    print(net)