参考
PaddleVideo/docs/zh-CN/contribute/add_new_algorithm.md at develop · PaddlePaddle/PaddleVideo · GitHub: https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/contribute/add_new_algorithm.md
(Awesome video understanding toolkits based on PaddlePaddle. It supports video data annotation tools, lightweight RGB and skeleton based action recognition model, practical applications for video tagging and sport action detection.)
1:添加backbone:(网络我自己砍了几刀,目的是想和ppTSM-v2做对比)
paddlevideo/modeling/backbones/squeezetime.py
from __future__ import absolute_import, division, print_function
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear, BatchNorm2D
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingNormal,Constant
import paddle.nn.functional as F
from ..registry import BACKBONES
def get_inplanes():
    """Return the base channel width used for each of the four stages."""
    stage_widths = [64, 128, 256, 512]
    return stage_widths
class SpatialConv(nn.Layer):
    """
    Inter-temporal Object Interaction Module (IOI).

    Two branches are computed and multiplied together:

    * a plain 3x3 convolution of the input (``short_conv``);
    * a gated branch (``glo_conv``) that ends in a sigmoid and has a
      learnable positional embedding added to its 16-channel intermediate
      features right before the 7x7 convolution.
    """

    # Position inside ``glo_conv`` of the 7x7 convolution; the positional
    # embedding is added to the features just before this layer runs.
    _POS_EMBED_STAGE = 3

    def __init__(self, dim_in, dim_out, pos_dim=7):
        """
        Args:
            dim_in: number of input channels.
            dim_out: number of output channels of both branches.
            pos_dim: side length of the square positional embedding.
        """
        super(SpatialConv, self).__init__()
        self.short_conv = nn.Conv2D(dim_in, dim_out, kernel_size=3, stride=1, padding=1, groups=1)
        self.glo_conv = nn.Sequential(
            nn.Conv2D(dim_in, 16, kernel_size=3, stride=1, padding=1, groups=1),
            nn.BatchNorm2D(16),
            nn.ReLU(),
            nn.Conv2D(16, 16, kernel_size=7, stride=1, padding=3),
            nn.BatchNorm2D(16),
            nn.ReLU(),
            nn.Conv2D(16, dim_out, kernel_size=3, stride=1, padding=1, groups=1),
            nn.Sigmoid(),
        )
        self.pos_embed = self.create_parameter(
            shape=[1, 16, pos_dim, pos_dim],
            default_initializer=nn.initializer.KaimingNormal(),
        )

    def _pos_embed_for(self, feat):
        """Return the positional embedding, resized to ``feat``'s H x W if needed."""
        _, _, height, width = feat.shape
        same_height = self.pos_embed.shape[2] == height
        same_width = self.pos_embed.shape[3] == width
        if same_height and same_width:
            return self.pos_embed
        return F.interpolate(self.pos_embed, size=(height, width), mode='bilinear', align_corners=True)

    def forward(self, x, param):
        """
        Args:
            x: input feature map.
            param: tensor multiplied element-wise with ``x`` before the gated
                branch (the caller in this file passes the ``param_conv``
                output).

        Returns:
            Product of the short-branch output and the gated-branch output.
        """
        local_out = self.short_conv(x)
        gate = x * param
        for stage_idx in range(len(self.glo_conv)):
            if stage_idx == self._POS_EMBED_STAGE:
                gate = gate + self._pos_embed_for(gate)
            gate = self.glo_conv[stage_idx](gate)
        return local_out * gate
class Conv2d(nn.Layer):
    """
    Channel-Time Learning Module (CTL).

    A globally pooled, sigmoid-activated per-channel weight (``param_conv``)
    rescales the input. The rescaled input goes through ``temporal_conv``,
    and the result is summed with the output of a ``SpatialConv`` branch.
    """

    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            kernel_size: int,
            stride: int = 1,
            padding: int = 0,
            dilation: int = 1,
            groups: int = 1,
            bias: bool = True,
            padding_mode: str = 'zeros',
            pos_dim = 7):
        """
        Args:
            in_channels: number of input channels.
            out_channels: number of output channels.
            kernel_size, padding, dilation, groups, bias, padding_mode:
                forwarded to ``temporal_conv``.
            stride: only stored on the instance; every convolution built
                here uses stride 1.
            pos_dim: positional-embedding size handed to ``SpatialConv``.
        """
        super(Conv2d, self).__init__()
        self.stride = stride

        # Per-channel weights: global average pool -> 1x1 conv -> BN -> ReLU
        # -> 1x1 conv -> sigmoid.
        channel_gate = [
            nn.AdaptiveAvgPool2D((1, 1)),
            nn.Conv2D(in_channels, in_channels, 1, stride=1, padding=0, bias_attr=False),
            nn.BatchNorm2D(in_channels),
            nn.ReLU(),
            nn.Conv2D(in_channels, in_channels, 1, bias_attr=False),
            nn.Sigmoid(),
        ]
        self.param_conv = nn.Sequential(*channel_gate)

        self.temporal_conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias_attr=bias,
            padding_mode=padding_mode,
        )
        self.spatial_conv = SpatialConv(dim_in=in_channels, dim_out=out_channels, pos_dim=pos_dim)

    def forward(self, x):
        """Return ``temporal_conv(param * x) + spatial_conv(x, param)``."""
        channel_weight = self.param_conv(x)
        temporal_out = self.temporal_conv(channel_weight * x)
        spatial_out = self.spatial_conv(x, channel_weight)
        return temporal_out + spatial_out
def conv3x3x3(in_planes, out_planes, stride=1, pos_dim=7):
    """
    Build a bias-free CTL ``Conv2d`` module.

    Despite the name, the module is created with ``kernel_size=1`` and
    ``padding=0``. ``stride`` and ``pos_dim`` are passed through unchanged.
    """
    ctl_module = Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        padding=0,
        bias=False,
        pos_dim=pos_dim,
    )
    return ctl_module
def conv1x1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1 ``nn.Conv2D`` with the given stride."""
    pointwise = nn.Conv2D(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        bias_attr=False,
    )
    return pointwise
class BasicBlock(nn.Layer):
    """
    Channel-Time Learning (CTL) Block built from two CTL modules.

    When ``stride != 1`` the input is first down-sampled by a depthwise
    2x2/stride-2 convolution followed by batch norm; the residual is taken
    from that down-sampled tensor.
    """
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, shortcut_conv=None, pos_dim=7):
        """
        Args:
            in_planes: number of input channels.
            planes: number of output channels.
            stride: any value other than 1 enables the down-sampling stem.
            shortcut_conv: optional layer applied to the residual path.
            pos_dim: positional-embedding size used by the second CTL module
                (the first one is built with ``conv3x3x3``'s default).
        """
        super().__init__()
        self.conv1 = conv3x3x3(in_planes, planes, stride)
        self.bn1 = nn.BatchNorm2D(planes)
        self.relu = nn.ReLU()
        self.conv2 = conv3x3x3(planes, planes, pos_dim=pos_dim)
        self.bn2 = nn.BatchNorm2D(planes)
        self.shortcut_conv = shortcut_conv
        self.stride = stride
        if stride != 1:
            reducer = [
                nn.Conv2D(in_planes, in_planes, kernel_size=2, stride=2, groups=in_planes),
                nn.BatchNorm2D(in_planes),
            ]
            self.downsample = nn.Sequential(*reducer)

    def forward(self, x):
        """Run the block and return the activated residual sum."""
        if self.stride != 1:
            x = self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Identity shortcut unless a projection layer was supplied.
        residual = x if self.shortcut_conv is None else self.shortcut_conv(x)
        out += residual
        return self.relu(out)
class Bottleneck(nn.Layer):
    """
    Channel-Time Learning (CTL) Block in bottleneck form.

    Layout: 1x1 conv -> CTL module -> 1x1 conv that expands the channel
    count by ``expansion``. When ``stride != 1`` the input is first
    down-sampled by a depthwise 2x2/stride-2 convolution plus batch norm.
    """
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, shortcut_conv=None, pos_dim=7):
        """
        Args:
            in_planes: number of input channels.
            planes: bottleneck width; the block outputs
                ``planes * expansion`` channels.
            stride: any value other than 1 enables the down-sampling stem.
            shortcut_conv: optional layer applied to the residual path.
            pos_dim: positional-embedding size used by the CTL module.
        """
        super().__init__()
        out_planes = planes * self.expansion
        self.conv1 = conv1x1x1(in_planes, planes)
        self.bn1 = nn.BatchNorm2D(planes)
        self.conv2 = conv3x3x3(planes, planes, pos_dim=pos_dim)
        self.bn2 = nn.BatchNorm2D(planes)
        self.conv3 = conv1x1x1(planes, out_planes)
        self.bn3 = nn.BatchNorm2D(out_planes)
        self.relu = nn.ReLU()
        self.shortcut_conv = shortcut_conv
        self.stride = stride
        if stride != 1:
            reducer = [
                nn.Conv2D(in_planes, in_planes, kernel_size=2, stride=2, groups=in_planes),
                nn.BatchNorm2D(in_planes),
            ]
            self.downsample = nn.Sequential(*reducer)

    def forward(self, x):
        """Run the block and return the activated residual sum."""
        if self.stride != 1:
            x = self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Identity shortcut unless a projection layer was supplied.
        residual = x if self.shortcut_conv is None else self.shortcut_conv(x)
        out += residual
        return self.relu(out)
class ResNet(nn.Layer):
    """
    2D ResNet-style backbone that consumes the frames of a clip stacked
    along the channel axis.

    ``forward`` folds every ``num_seg`` consecutive entries of the batch axis
    into the channel axis before the stem convolution, so
    ``n_input_channels`` should equal ``num_seg`` times the per-frame
    channel count.

    Args:
        block: residual block class (``BasicBlock`` or ``Bottleneck``).
        layers: number of blocks in each of the four stages.
        block_inplanes: base channel width of each stage.
        n_input_channels: input channels of the stem convolution.
        no_max_pool: if True, skip the max-pool after the stem.
        shortcut_type: accepted and handed to ``_make_layer``, which does not
            read it.
        widen_factor: multiplier applied to every entry of ``block_inplanes``.
        dropout: stored on the instance; not applied inside this class.
        freeze_bn: if True, ``train()`` puts every ``BatchNorm2D`` in eval
            mode and stops gradients on its weight and bias.
        spatial_stride: per-stage stride handed to the first block of a stage.
        pos_dim: per-stage positional-embedding size.
        num_seg: number of consecutive batch entries folded into the channel
            axis in ``forward``. The default of 16 matches the value that was
            previously hard-coded.
    """

    def __init__(self,
                 block,
                 layers,
                 block_inplanes,
                 n_input_channels=3,
                 no_max_pool=False,
                 shortcut_type='B',
                 widen_factor=1.0,
                 dropout=0.2,
                 freeze_bn=False,
                 spatial_stride=(1, 2, 2, 2),
                 pos_dim=(64, 32, 16, 8),
                 num_seg=16):
        super().__init__()
        self.freeze_bn = freeze_bn
        block_inplanes = [int(x * widen_factor) for x in block_inplanes]
        self.in_planes = block_inplanes[0]
        self.no_max_pool = no_max_pool
        self.dropout = dropout
        self.num_seg = num_seg

        # Stem: 5x5 stride-2 convolution, batch norm, ReLU, optional max-pool.
        self.conv1 = nn.Conv2D(n_input_channels,
                               self.in_planes,
                               kernel_size=5,
                               stride=2,
                               padding=2,
                               groups=1,
                               bias_attr=False)
        self.bn1 = nn.BatchNorm2D(self.in_planes)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, block_inplanes[0], layers[0],
                                       shortcut_type,
                                       stride=spatial_stride[0],
                                       pos_dim=pos_dim[0])
        self.layer2 = self._make_layer(block, block_inplanes[1], layers[1],
                                       shortcut_type,
                                       stride=spatial_stride[1],
                                       pos_dim=pos_dim[1])
        self.layer3 = self._make_layer(block, block_inplanes[2], layers[2],
                                       shortcut_type,
                                       stride=spatial_stride[2],
                                       pos_dim=pos_dim[2])
        self.layer4 = self._make_layer(block, block_inplanes[3], layers[3],
                                       shortcut_type,
                                       stride=spatial_stride[3],
                                       pos_dim=pos_dim[3])

    def _downsample_basic_block(self, x, planes, stride):
        """
        Parameter-free shortcut: sub-sample ``x`` with a 1x1 average pool of
        the given stride, then zero-pad the channel axis up to ``planes``.

        Not called anywhere in this class.
        """
        out = F.avg_pool2d(x, kernel_size=1, stride=stride)
        # Match the dtype of ``out`` so the concat also works for
        # non-float32 inputs. The former ``isinstance(out, CUDAPlace)`` test
        # compared a tensor against a place type and could never be true.
        zero_pads = paddle.zeros(
            [out.shape[0], planes - out.shape[1], out.shape[2], out.shape[3]],
            dtype=out.dtype)
        return paddle.concat([out, zero_pads], axis=1)

    def _make_layer(self, block, planes, blocks, shortcut_type, stride=1, pos_dim=7):
        """
        Build one stage made of ``blocks`` residual blocks.

        Only the first block receives ``stride`` and, when the channel count
        changes, a 1x1-conv + batch-norm projection for the residual path.
        The projection itself uses stride 1 because the block down-samples
        its input before the residual is taken.
        """
        out_planes = planes * block.expansion
        shortcut = None
        if self.in_planes != out_planes:
            shortcut = nn.Sequential(
                conv1x1x1(self.in_planes, out_planes, stride=1),
                nn.BatchNorm2D(out_planes)
            )

        stage = [
            block(in_planes=self.in_planes,
                  planes=planes,
                  stride=stride,
                  shortcut_conv=shortcut,
                  pos_dim=pos_dim)
        ]
        self.in_planes = out_planes
        for _ in range(1, blocks):
            stage.append(block(self.in_planes, planes, pos_dim=pos_dim))
        return nn.Sequential(*stage)

    def forward(self, x):
        """
        Args:
            x: 4-D tensor whose first axis is (clips * num_seg), or a 3-D
                tensor which gets a leading axis of size 1 added.

        Returns:
            Feature map produced by the last stage.
        """
        if len(x.shape) == 3:
            x = paddle.unsqueeze(x, axis=0)
        N, C, H, W = x.shape
        # Fold each group of ``num_seg`` frames into the channel axis.
        # NOTE(review): when N < num_seg the first target dim is 0, which
        # paddle.reshape documents as "copy this dim from the input", i.e.
        # the tensor passes through unchanged - confirm this is intended.
        x = x.reshape([N // self.num_seg, -1, H, W])

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        if not self.no_max_pool:
            x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x

    def train(self, mode=True):
        """
        Switch to training mode (``mode=True``) or eval mode (``mode=False``)
        and, when ``freeze_bn`` is set, keep every ``BatchNorm2D`` frozen.

        ``paddle.nn.Layer.train()`` takes no ``mode`` argument, so the flag
        is dispatched to ``train()`` / ``eval()`` of the base class here.
        """
        if mode:
            super(ResNet, self).train()
        else:
            super(ResNet, self).eval()

        if not self.freeze_bn:
            return

        batch_norms = [m for m in self.sublayers() if isinstance(m, nn.BatchNorm2D)]

        print("Freezing Mean/Var of BatchNorm2D.")
        for m in batch_norms:
            m.eval()

        print("Freezing Weight/Bias of BatchNorm2D.")
        for m in batch_norms:
            m.weight.stop_gradient = True
            m.bias.stop_gradient = True
def SqueezeTime_model(**kwargs):
    """
    Build a ``ResNet`` from ``Bottleneck`` blocks with two blocks per stage.

    All keyword arguments are forwarded to the ``ResNet`` constructor.
    """
    blocks_per_stage = [2, 2, 2, 2]
    return ResNet(Bottleneck, blocks_per_stage, get_inplanes(), **kwargs)
@BACKBONES.register()
def SqueezeTime(pretrained=None, use_ssld=False, **kwargs):
    """
    Build SqueezeTime Model

    The configuration is fixed: half-width stages, 48 input channels and
    per-stage positional-embedding sizes 64/32/16/8.
    ``pretrained``, ``use_ssld`` and ``kwargs`` are accepted but not used.
    """
    backbone_cfg = dict(
        widen_factor=0.5,
        dropout=0.5,
        n_input_channels=48,
        freeze_bn=False,
        spatial_stride=[1, 2, 2, 2],
        pos_dim=[64, 32, 16, 8],
    )
    return SqueezeTime_model(**backbone_cfg)
2:导入backbone:
paddlevideo/modeling/backbones/__init__.py
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .actbert import BertForMultiModalPreTraining
from .adds import ADDS_DepthNet
from .agcn import AGCN
from .asrf import ASRF
from .bmn import BMN
from .cfbi import CFBI
from .movinet import MoViNet
from .ms_tcn import MSTCN
from .resnet import ResNet
from .resnet_slowfast import ResNetSlowFast
from .resnet_slowfast_MRI import ResNetSlowFast_MRI
from .resnet_tsm import ResNetTSM
from .resnet_tsm_MRI import ResNetTSM_MRI
from .resnet_tsn_MRI import ResNetTSN_MRI
from .resnet_tweaks_tsm import ResNetTweaksTSM
from .resnet_tweaks_tsn import ResNetTweaksTSN
from .stgcn import STGCN
from .swin_transformer import SwinTransformer3D
from .transnetv2 import TransNetV2
from .vit import VisionTransformer
from .vit_tweaks import VisionTransformer_tweaks
from .ms_tcn import MSTCN
from .asrf import ASRF
from .resnet_tsn_MRI import ResNetTSN_MRI
from .resnet_tsm_MRI import ResNetTSM_MRI
from .resnet_slowfast_MRI import ResNetSlowFast_MRI
from .cfbi import CFBI
from .ctrgcn import CTRGCN
from .agcn2s import AGCN2s
from .movinet import MoViNet
from .resnet3d_slowonly import ResNet3dSlowOnly
from .toshift_vit import TokenShiftVisionTransformer
from .pptsm_mv2 import PPTSM_MobileNetV2
from .pptsm_mv3 import PPTSM_MobileNetV3
from .pptsm_v2 import PPTSM_v2
from .yowo import YOWO
from .squeezetime import SqueezeTime
# Public backbone names exported by this package; 'SqueezeTime' is the
# entry added for the new backbone.
__all__ = [
'ResNet', 'ResNetTSM', 'ResNetTweaksTSM', 'ResNetSlowFast', 'BMN',
'ResNetTweaksTSN', 'VisionTransformer', 'STGCN', 'AGCN', 'TransNetV2',
'ADDS_DepthNet', 'VisionTransformer_tweaks', 'BertForMultiModalPreTraining',
'ResNetTSN_MRI', 'ResNetTSM_MRI', 'ResNetSlowFast_MRI', 'CFBI', 'MSTCN',
'ASRF', 'MoViNet', 'SwinTransformer3D', 'CTRGCN',
'TokenShiftVisionTransformer', 'AGCN2s', 'PPTSM_MobileNetV2',
'PPTSM_MobileNetV3', 'PPTSM_v2', 'ResNet3dSlowOnly', 'YOWO', 'SqueezeTime'
]
3:添加head:
paddlevideo/modeling/heads/i2d_head.py
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from ..registry import HEADS
from ..weight_init import weight_init_
from .base import BaseHead
@HEADS.register()
class I2DHead(BaseHead):
    """Classification head for I2D.

    Pools the two spatial dimensions down to 1x1, optionally applies
    dropout, flattens, and classifies with one linear layer.

    Args:
        num_classes (int): Number of classes to be classified.
        in_channels (int): Number of channels in input feature.
        loss_cfg (dict): Config for building loss.
            Default: dict(name='CrossEntropyLoss')
        spatial_type (str): 'avg' selects adaptive average pooling; any other
            value selects adaptive max pooling. Default: 'avg'.
        drop_ratio (float): Probability of dropout layer; 0 disables
            dropout. Default: 0.5.
        std (float): Stored as ``init_std``; not otherwise used in this
            class. Default: 0.01.
        kwargs (dict, optional): Any keyword argument to be used to initialize
            the head.
    """

    def __init__(self,
                 num_classes,
                 in_channels,
                 loss_cfg=dict(name='CrossEntropyLoss'),
                 spatial_type='avg',
                 drop_ratio=0.5,
                 std=0.01,
                 **kwargs):
        super().__init__(num_classes, in_channels, loss_cfg, **kwargs)
        self.spatial_type = spatial_type
        self.dropout_ratio = drop_ratio
        self.init_std = std

        use_dropout = self.dropout_ratio != 0
        self.dropout = nn.Dropout(p=self.dropout_ratio) if use_dropout else None

        self.fc_cls = nn.Linear(self.in_channels, self.num_classes)

        # The attribute keeps the name ``avg_pool`` even when max pooling is
        # selected.
        pool_cls = nn.AdaptiveAvgPool2D if self.spatial_type == 'avg' else nn.AdaptiveMaxPool2D
        self.avg_pool = pool_cls((1, 1))

    def forward(self, x, num_segs = None):
        """Defines the computation performed at every call.

        Args:
            x (Tensor): The input data.
                Assumed to be [N, in_channels, H, W] - TODO confirm with the
                backbone in use.
            num_segs: Accepted but not used.

        Returns:
            Tensor: The classification scores for input samples.
        """
        feat = x
        if self.avg_pool is not None:
            # Spatial dimensions are reduced to 1x1.
            feat = self.avg_pool(feat)
        if self.dropout is not None:
            feat = self.dropout(feat)
        # Flatten everything after the batch axis.
        flat = paddle.reshape(feat, [feat.shape[0], -1])
        # [N, num_classes]
        return self.fc_cls(flat)
# def forward_new(self, x, num_segs = None):
# """Defines the computation performed at every call.
# Args:
# x (Tensor): The input data.
# Returns:
# Tensor: The classification scores for input samples.
# """
# # [N, in_channels, 4, 7, 7]
# if self.avg_pool is not None:
# x = self.avg_pool(x)
# # [N, in_channels, 1, 1, 1]
# if self.dropout is not None:
# x = self.dropout(x)
# # [N, in_channels, 1, 1, 1]
# x = paddle.reshape(x, [x.shape[0], -1])
# # [N, in_channels]
# cls_score = self.fc_cls(x)
# # [N, num_classes]
# return cls_score
4:导入head:
paddlevideo/modeling/heads/__init__.py
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .adds_head import AddsHead
from .asrf_head import ASRFHead
from .attention_lstm_head import AttentionLstmHead, ActionAttentionLstmHead
from .base import BaseHead
from .bbox_head import BBoxHeadAVA
from .cfbi_head import CollaborativeEnsemblerMS
from .i3d_head import I3DHead
from .movinet_head import MoViNetHead
from .ms_tcn_head import MSTCNHead
from .pptimesformer_head import ppTimeSformerHead
from .pptsm_head import ppTSMHead
from .pptsn_head import ppTSNHead
from .roi_head import AVARoIHead
from .single_straight3d import SingleRoIExtractor3D
from .slowfast_head import SlowFastHead
from .stgcn_head import STGCNHead
from .timesformer_head import TimeSformerHead
from .transnetv2_head import TransNetV2Head
from .tsm_head import TSMHead
from .tsn_head import TSNHead
from .ms_tcn_head import MSTCNHead
from .asrf_head import ASRFHead
from .ctrgcn_head import CTRGCNHead
from .movinet_head import MoViNetHead
from .agcn2s_head import AGCN2sHead
from .token_shift_head import TokenShiftHead
from .i2d_head import I2DHead
# Public head names exported by this package; 'I2DHead' is the entry added
# for the new head.
__all__ = [
'BaseHead', 'TSNHead', 'TSMHead', 'ppTSMHead', 'ppTSNHead', 'SlowFastHead',
'AttentionLstmHead', 'TimeSformerHead', 'STGCNHead', 'TransNetV2Head',
'I3DHead', 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'AddsHead',
'ppTimeSformerHead', 'CollaborativeEnsemblerMS', 'MSTCNHead', 'ASRFHead',
'MoViNetHead', 'CTRGCNHead', 'TokenShiftHead', 'ActionAttentionLstmHead',
'AGCN2sHead', 'I2DHead'
]
5:训练配置文件:
configs/recognition/pptsm/v2/md_ppsqt_16frames_uniform.yaml
MODEL: #MODEL field
    framework: "Recognizer2D" #Mandatory, indicate the type of network, associate to the 'paddlevideo/modeling/framework/' .
    backbone: #Mandatory, indicate the type of backbone, associate to the 'paddlevideo/modeling/backbones/' .
        name: "SqueezeTime" #Mandatory, The name of backbone.
    head:
        name: "I2DHead" #Mandatory, indicate the type of head, associate to the 'paddlevideo/modeling/heads'
        #pretrained: "" #Optional, pretrained model path.
        num_classes: 2
        in_channels: 1024

DATASET: #DATASET field
    batch_size: 16 #Mandatory, batch size
    num_workers: 4 #Mandatory, the number of subprocess on each GPU.
    train:
        format: "FrameDataset" #Mandatory, indicate the type of dataset, associate to the 'paddlevideo/loader/dataset'
        data_prefix: "/home/mnt/sdd/Data/data_fights/rawframes" #Mandatory, train data root path
        file_path: "/home/mnt/sdd/Data/data_fights/train_list.txt" #Mandatory, train data index file path
        suffix: 'img_{:06}.jpg'
    valid:
        format: "FrameDataset" #Mandatory, indicate the type of dataset, associate to the 'paddlevideo/loader/dataset'
        data_prefix: "/home/mnt/sdd/Data/data_fights/rawframes" #Mandatory, valid data root path
        file_path: "/home/mnt/sdd/Data/data_fights/test_list.txt" #Mandatory, valid data index file path
        suffix: 'img_{:06}.jpg'
    test:
        format: "FrameDataset" #Mandatory, indicate the type of dataset, associate to the 'paddlevideo/loader/dataset'
        data_prefix: "/home/mnt/sdd/Data/data_fights/rawframes" #Mandatory, test data root path
        file_path: "/home/mnt/sdd/Data/data_fights/test_list.txt" #Mandatory, test data index file path
        suffix: 'img_{:06}.jpg'

PIPELINE: #PIPELINE field
    train: #Mandatory, indicate the pipeline to deal with the training data, associate to the 'paddlevideo/loader/pipelines/'
        decode:
            name: "FrameDecoder"
        sample:
            name: "Sampler"
            num_seg: 16
            seg_len: 1
            valid_mode: False
        transform: #Mandatory, image transform operator
            - Scale:
                short_size: 256
            - MultiScaleCrop:
                target_size: 256
            - RandomCrop:
                target_size: 224
            - RandomFlip:
            - Image2Array:
            - Normalization:
                mean: [0.485, 0.456, 0.406]
                std: [0.229, 0.224, 0.225]
    valid: #Mandatory, indicate the pipeline to deal with the validation data, associate to the 'paddlevideo/loader/pipelines/'
        decode:
            name: "FrameDecoder"
        sample:
            name: "Sampler"
            num_seg: 16
            seg_len: 1
            valid_mode: True
        transform:
            - Scale:
                short_size: 256
            - CenterCrop:
                target_size: 224
            - Image2Array:
            - Normalization:
                mean: [0.485, 0.456, 0.406]
                std: [0.229, 0.224, 0.225]
    test: #Mandatory, indicate the pipeline to deal with the test data, associate to the 'paddlevideo/loader/pipelines/'
        decode:
            name: "FrameDecoder"
        sample:
            name: "Sampler"
            num_seg: 16
            seg_len: 1
            valid_mode: True
        transform:
            - Scale:
                short_size: 256
            - CenterCrop:
                target_size: 224
            - Image2Array:
            - Normalization:
                mean: [0.485, 0.456, 0.406]
                std: [0.229, 0.224, 0.225]

OPTIMIZER: #OPTIMIZER field
    name: 'Momentum'
    momentum: 0.9
    learning_rate:
        iter_step: True
        name: 'CustomWarmupCosineDecay'
        max_epoch: 120
        warmup_epochs: 10
        warmup_start_lr: 0.005
        cosine_base_lr: 0.01
    weight_decay:
        name: 'L2'
        value: 1e-4
    use_nesterov: True

MIX:
    name: "Mixup"
    alpha: 0.2

METRIC:
    name: 'CenterCropMetric'

INFERENCE:
    name: 'ppSQT_Inference_helper'
    num_seg: 16
    target_size: 224

model_name: "ppSQT"
log_interval: 10 #Optional, the interval of logger, default:10
epochs: 120 #Mandatory, total epoch
log_level: "INFO" #Optional, the logger level. default: "INFO"
6:训练:
# multi-gpu-st
export CUDA_VISIBLE_DEVICES=0,1
python -B -m paddle.distributed.launch --gpus="0,1" --log_dir=./log/log_sqt_frame_16 main.py --validate -c configs/recognition/pptsm/v2/md_ppsqt_16frames_uniform.yaml
7:结果:精度比ppTSM-v2低8个点左右。有可能是没有预训练权重的问题。