EfficientNet Model Principles: Compound Scaling of Model Depth, Width, and Resolution
I. Activation Function:
EfficientNet uses the Swish activation function in place of the more common ReLU.
1. Definition
- Swish(x) = x * sigmoid(x)
- A smooth nonlinear activation function
2. Advantages
- Compared with ReLU:
  - Smooth and differentiable everywhere
  - Lets small negative values pass through instead of hard-zeroing them
  - Avoids the "dying ReLU" problem, where negative inputs receive exactly zero gradient
- Compared with Sigmoid:
  - No upper bound, so large positive activations are not squashed
  - Does not saturate for large inputs, which helps avoid vanishing gradients
  - Gradients are more stable in deep networks
3. Implementation
import torch
import torch.nn as nn

class Swish(nn.Module):
    def forward(self, x):
        return x * torch.sigmoid(x)  # self-gating: the sigmoid term gates its own input
4. Practical behaviour
- Tends to perform better in deep networks
- Training is more stable
- Helps the model converge
- Well suited to a large network such as EfficientNet
5. Self-gating behaviour (see the numeric sketch below)
- For strongly negative x, the output approaches 0
- For positive x, the output is approximately linear (≈ x)
- The gate therefore dynamically modulates how much information flows through
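A minimal numeric sketch of these properties, assuming the Swish module defined in subsection 3 above (the sample inputs are arbitrary and only for illustration):

swish = Swish()
x = torch.tensor([-6.0, -1.0, 0.0, 1.0, 6.0])
print(swish(x))
# roughly: [-0.0148, -0.2689, 0.0000, 0.7311, 5.9851]
# strongly negative inputs are squashed towards 0, large positive inputs pass through almost unchanged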
II. Adaptive Feature Extraction (Squeeze-and-Excitation) Module:
1. Purpose
- Adaptively learn the relative importance of the feature channels
- Strengthen useful features and suppress less useful ones
- Improve how efficiently the model exploits its features
2. Core steps
- Squeeze: compress the spatial information into one descriptor per channel
- Excitation: learn the inter-dependencies between channels
- Scale: recalibrate the original feature map with the learned channel weights
3. Implementation
class SEModule(nn.Module):
    def __init__(self, channels, se_ratio):
        super(SEModule, self).__init__()
        # Squeeze: global average pooling, [B, C, H, W] -> [B, C, 1, 1]
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # Excitation: two fully connected layers (implemented as 1x1 convolutions)
        # that first reduce and then restore the channel dimension
        self.fc1 = nn.Conv2d(channels, int(channels * se_ratio), kernel_size=1, bias=False)
        # ReLU zeroes negative values, sparsifying the intermediate representation
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Conv2d(int(channels * se_ratio), channels, kernel_size=1, bias=False)
        # Sigmoid squashes each channel's importance score into (0, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # out holds one importance coefficient per channel
        out = self.avg_pool(x)
        out = self.fc1(out)
        out = self.relu(out)      # max(0, x)
        out = self.fc2(out)
        out = self.sigmoid(out)   # 1 / (1 + e^(-x))
        # Scale: reweight the input channels
        return x * out
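A small usage sketch, assuming the SEModule above; the sizes are arbitrary and only show that the output keeps the input shape while the channels are reweighted:

se = SEModule(channels=32, se_ratio=0.25)
x = torch.randn(2, 32, 16, 16)   # [B, C, H, W]
y = se(x)
print(y.shape)                   # torch.Size([2, 32, 16, 16]) -- same shape, channels rescaled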
III. MBConvBlock:
1. Implementation
class MBConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, expand_ratio, se_ratio, drop_rate):
        super(MBConvBlock, self).__init__()
        self.expand_ratio = expand_ratio
        self.stride = stride
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.swish = Swish()
        # Expansion phase: a 1x1 convolution widens the channel dimension
        self.expanded_channels = int(in_channels * expand_ratio)
        if expand_ratio > 1:
            self.expand_conv = nn.Conv2d(in_channels, self.expanded_channels, kernel_size=1, bias=False)
            self.expand_bn = nn.BatchNorm2d(self.expanded_channels)
        # Depthwise convolution phase:
        # each channel is convolved independently over the spatial dimensions
        self.depthwise_conv = nn.Conv2d(self.expanded_channels, self.expanded_channels, kernel_size=kernel_size,
                                        stride=stride, groups=self.expanded_channels, bias=False, padding=kernel_size//2)
        self.depthwise_bn = nn.BatchNorm2d(self.expanded_channels)
        # SE attention module
        self.se = SEModule(self.expanded_channels, se_ratio)
        # Projection phase: a 1x1 convolution maps back down to out_channels
        self.project_conv = nn.Conv2d(self.expanded_channels, out_channels, kernel_size=1, bias=False)
        self.project_bn = nn.BatchNorm2d(out_channels)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, x):
        out = x
        if self.expand_ratio > 1:
            out = self.expand_conv(out)
            out = self.expand_bn(out)
            out = self.swish(out)
        out = self.depthwise_conv(out)
        out = self.depthwise_bn(out)
        out = self.swish(out)
        out = self.se(out)
        out = self.project_conv(out)
        out = self.project_bn(out)
        out = self.dropout(out)
        # Residual connection only when the shapes match (same channels, no downsampling)
        if self.in_channels == self.out_channels and self.stride == 1:
            out = out + x
        return out
2. Convolution layers in the block
2.1 Expansion phase (only when expand_ratio > 1)
- 1x1 expansion convolution: self.expand_conv
2.2 Depthwise convolution phase
- KxK depthwise convolution: self.depthwise_conv
2.3 Convolutions inside the SE module
- 1x1 reduction convolution: self.fc1
- 1x1 restoration convolution: self.fc2
2.4 Projection phase
- 1x1 projection convolution: self.project_conv
2.5 Summary (see the counting sketch below)
- With expand_ratio > 1: 5 convolution layers in total
  - 1 expansion convolution
  - 1 depthwise convolution
  - 2 SE-module convolutions
  - 1 projection convolution
- With expand_ratio = 1: 4 convolution layers in total
  - no expansion convolution
  - all other layers unchanged
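The count can be verified by instantiating the block and enumerating its nn.Conv2d modules; a minimal sketch with arbitrary hyperparameters:

def count_convs(block):
    return sum(1 for m in block.modules() if isinstance(m, nn.Conv2d))

b1 = MBConvBlock(32, 16, kernel_size=3, stride=1, expand_ratio=1, se_ratio=0.25, drop_rate=0.0)
b6 = MBConvBlock(16, 24, kernel_size=3, stride=2, expand_ratio=6, se_ratio=0.25, drop_rate=0.0)
print(count_convs(b1), count_convs(b6))   # expected: 4 5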
IV. Width and Depth Scaling:
1. Depth scaling
Rounding up ensures that each stage keeps at least its original number of blocks (worked examples follow below).
def _round_repeats(self, repeats, depth_multiplier):
    """Compute how many times a block is repeated in a stage"""
    return int(math.ceil(depth_multiplier * repeats))
2. Width scaling
Channel counts are rounded to a multiple of 8, which is friendlier to hardware.
def _round_filters(self, filters, width_multiplier):
    """Compute the output channel count of a convolution stage"""
    filters *= width_multiplier
    # round to the nearest multiple of 8 (never below 8)
    new_filters = max(8, int(filters + 4) // 8 * 8)
    # make sure rounding down never removes more than 10% of the channels
    if new_filters < 0.9 * filters:
        new_filters += 8
    return int(new_filters)
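These helpers can be exercised on their own; a minimal sketch using module-level copies of the two methods, with a few illustrative multipliers:

import math

def round_repeats(repeats, depth_multiplier):
    return int(math.ceil(depth_multiplier * repeats))

def round_filters(filters, width_multiplier):
    filters *= width_multiplier
    new_filters = max(8, int(filters + 4) // 8 * 8)
    if new_filters < 0.9 * filters:
        new_filters += 8
    return int(new_filters)

print(round_repeats(2, 1.2))    # ceil(2.4) = 3
print(round_filters(40, 1.1))   # 44 -> rounded to 48
print(round_filters(16, 1.2))   # 19.2 -> 16 would lose more than 10%, so bumped to 24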
V. Complete Model:
1. Implementation
import math

class EfficientNet(nn.Module):
    def __init__(self, width_multiplier=1.0, depth_multiplier=1.0, dropout=0.2, num_classes=10):
        super(EfficientNet, self).__init__()
        self.num_classes = num_classes
        # Baseline (B0) stage configuration
        settings = [
            # expand_ratio, channels, repeats, stride, kernel_size
            [1, 16, 1, 1, 3],
            [6, 24, 2, 2, 3],
            [6, 40, 2, 2, 5],
            [6, 80, 3, 2, 3],
            [6, 112, 3, 1, 5],
            [6, 192, 4, 2, 5],
            [6, 320, 1, 1, 3]
        ]
        stem_channels = self._round_filters(32, width_multiplier)
        # Stem: ordinary 3x3 convolution that downsamples by 2
        self.stem = nn.Sequential(
            nn.Conv2d(3, stem_channels, 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(stem_channels),
            Swish()
        )
        # Backbone: stacked MBConv (inverted residual) blocks
        self.blocks = nn.Sequential()
        input_channels = stem_channels
        # count blocks after depth scaling so that drop_rate never exceeds dropout
        total_blocks = sum(self._round_repeats(setting[2], depth_multiplier) for setting in settings)
        block_idx = 0
        for idx, (expand_ratio, channels, repeats, stride, kernel_size) in enumerate(settings):
            output_channels = self._round_filters(channels, width_multiplier)
            repeats = self._round_repeats(repeats, depth_multiplier)
            for i in range(repeats):
                # dropout rate grows linearly with block depth
                drop_rate = dropout * block_idx / total_blocks
                self.blocks.add_module(f'block_{block_idx}',
                                       MBConvBlock(
                                           in_channels=input_channels,
                                           out_channels=output_channels,
                                           kernel_size=kernel_size,
                                           stride=stride if i == 0 else 1,  # only the first block of a stage downsamples
                                           expand_ratio=expand_ratio,
                                           se_ratio=0.25,
                                           drop_rate=drop_rate
                                       ))
                input_channels = output_channels
                block_idx += 1
        # Head: 1x1 convolution up to the final feature width
        head_channels = self._round_filters(1280, width_multiplier)
        self.head = nn.Sequential(
            nn.Conv2d(input_channels, head_channels, 1, bias=False),
            nn.BatchNorm2d(head_channels),
            Swish()
        )
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(head_channels, num_classes)
        )

    def _round_filters(self, filters, width_multiplier):
        """Compute the output channel count of a convolution stage"""
        filters *= width_multiplier
        new_filters = max(8, int(filters + 4) // 8 * 8)
        # make sure rounding down never removes more than 10% of the channels
        if new_filters < 0.9 * filters:
            new_filters += 8
        return int(new_filters)

    def _round_repeats(self, repeats, depth_multiplier):
        """Compute how many times a block is repeated in a stage"""
        return int(math.ceil(depth_multiplier * repeats))

    def forward(self, x):
        x = self.stem(x)
        x = self.blocks(x)
        x = self.head(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        x = self.classifier(x)
        return x
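A quick sanity check, assuming the classes above are defined; it builds the default model (B0 multipliers) and pushes a dummy batch through it:

model = EfficientNet()                 # width=1.0, depth=1.0, i.e. the B0 configuration
x = torch.randn(2, 3, 224, 224)        # B0 uses 224x224 inputs
print(model(x).shape)                  # torch.Size([2, 10])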
2. Counting the convolution layers
# Baseline (B0) stage configuration
settings = [
    # expand_ratio, channels, repeats, stride, kernel_size
    [1, 16, 1, 1, 3],
    [6, 24, 2, 2, 3],
    [6, 40, 2, 2, 5],
    [6, 80, 3, 2, 3],
    [6, 112, 3, 1, 5],
    [6, 192, 4, 2, 5],
    [6, 320, 1, 1, 3]
]
This table is the MBConvBlock configuration of the B0 model. Under it, the first stage has an expansion ratio of 1, so its block has no expansion convolution and contains only 4 convolution layers; every block from the second stage onward contains 5 convolution layers. The second and third stages repeat their block twice each, the fourth and fifth stages three times each, the sixth stage four times, and the seventh stage once. Adding the stem convolution at the start and the head convolution at the end, B0 contains 1 + (1×4 + 15×5) + 1 = 81 convolution layers in total (a quick check follows below).
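A minimal way to verify this figure is to enumerate the nn.Conv2d modules of the model (sketch assumes the EfficientNet class above):

model = EfficientNet()   # default multipliers correspond to B0
num_convs = sum(1 for m in model.modules() if isinstance(m, nn.Conv2d))
print(num_convs)         # 81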
VI. Configurations of the Different Variants:
def efficient_config(model_name):
    """EfficientNet scaling configurations"""
    params_dict = {
        # (width, depth, resolution, dropout)
        'efficientnet-b0': (1.0, 1.0, 224, 0.2),
        'efficientnet-b1': (1.0, 1.1, 240, 0.2),
        'efficientnet-b2': (1.1, 1.2, 260, 0.3),
        'efficientnet-b3': (1.2, 1.4, 300, 0.3),
        'efficientnet-b4': (1.4, 1.8, 380, 0.4),
        'efficientnet-b5': (1.6, 2.2, 456, 0.4),
        'efficientnet-b6': (1.8, 2.6, 528, 0.5),
        'efficientnet-b7': (2.0, 3.1, 600, 0.5)
    }
    return params_dict[model_name]

def build_efficientnet(model_name, num_classes=10):
    config = efficient_config(model_name)
    return EfficientNet(
        width_multiplier=config[0],
        depth_multiplier=config[1],
        dropout=config[3],
        num_classes=num_classes
    )
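Note that the resolution entry config[2] is not passed to the model: in this implementation, resolution scaling enters through the data pipeline (Section VII sets image_size = config[2]). A small usage sketch, assuming the functions above:

name = 'efficientnet-b3'
width, depth, resolution, drop = efficient_config(name)
model = build_efficientnet(name)
x = torch.randn(1, 3, resolution, resolution)   # feed the variant at its native resolution
print(model(x).shape)                           # torch.Size([1, 10])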
Applying the rounding rules above to this configuration, the per-stage out_channels and repeats of the B1 to B7 models work out to:
B1 (1.0, 1.1):
- channels unchanged: [16, 24, 40, 80, 112, 192, 320]
- repeats: [2, 3, 3, 4, 4, 5, 2]
B2 (1.1, 1.2):
- channels: [16, 24, 48, 88, 120, 208, 352]
- repeats: [2, 3, 3, 4, 4, 5, 2]
B3 (1.2, 1.4):
- channels: [24, 32, 48, 96, 136, 232, 384]
- repeats: [2, 3, 3, 5, 5, 6, 2]
B4 (1.4, 1.8):
- channels: [24, 32, 56, 112, 160, 272, 448]
- repeats: [2, 4, 4, 6, 6, 8, 2]
B5 (1.6, 2.2):
- channels: [24, 40, 64, 128, 176, 304, 512]
- repeats: [3, 5, 5, 7, 7, 9, 3]
B6 (1.8, 2.6):
- channels: [32, 40, 72, 144, 200, 344, 576]
- repeats: [3, 6, 6, 8, 8, 11, 3]
B7 (2.0, 3.1):
- channels: [32, 48, 80, 160, 224, 384, 640]
- repeats: [4, 7, 7, 10, 10, 13, 4]
Counting convolution layers as for B0 (4 per first-stage block, 5 per block in every later stage, plus the stem and head convolutions; see also the counting sketch below):
B1 passes through 2 + 2×4 + 21×5 = 115 convolution layers;
B2 passes through 115 convolution layers;
B3 passes through 2 + 2×4 + 24×5 = 130 convolution layers;
B4 passes through 2 + 2×4 + 30×5 = 160 convolution layers;
B5 passes through 2 + 3×4 + 36×5 = 194 convolution layers;
B6 passes through 2 + 3×4 + 42×5 = 224 convolution layers;
B7 passes through 2 + 4×4 + 51×5 = 273 convolution layers.
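These figures can be reproduced directly from the code; a minimal sketch that rebuilds each variant and prints its rounded stage channels, repeats, and total number of nn.Conv2d layers:

for name in ['efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'efficientnet-b3',
             'efficientnet-b4', 'efficientnet-b5', 'efficientnet-b6', 'efficientnet-b7']:
    width, depth, resolution, drop = efficient_config(name)
    model = EfficientNet(width_multiplier=width, depth_multiplier=depth, dropout=drop)
    channels = [model._round_filters(c, width) for c in [16, 24, 40, 80, 112, 192, 320]]
    repeats = [model._round_repeats(r, depth) for r in [1, 2, 2, 3, 3, 4, 1]]
    n_convs = sum(1 for m in model.modules() if isinstance(m, nn.Conv2d))
    print(name, channels, repeats, n_convs)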
VII. Data Processing:
The CIFAR-10 dataset is used:
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Data augmentation
def create_transforms(image_size):
    """Create the data transforms for the chosen model resolution"""
    transform_train = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(15),
        transforms.RandomCrop(image_size, padding=4),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010))
    ])
    transform_test = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010))
    ])
    return transform_train, transform_test

# Load the dataset
def load_data(data_path, model_name, num_gpus=4):
    config = efficient_config(model_name)
    image_size = config[2]   # resolution scaling is applied here
    transform_train, transform_test = create_transforms(image_size)
    train_dataset = datasets.CIFAR10(root=data_path, train=True, download=False, transform=transform_train)
    test_dataset = datasets.CIFAR10(root=data_path, train=False, download=False, transform=transform_test)
    batch_size = 32 * num_gpus
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4 * num_gpus,   # 4 worker processes per GPU
        pin_memory=True             # speeds up host-to-GPU transfers
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=512,
        shuffle=False,
        num_workers=4 * num_gpus,
        pin_memory=True
    )
    return train_loader, test_loader
VIII. Training and Testing:
# Training and evaluation loops
import torch
from torch.amp import autocast
from tqdm import tqdm

def train(model, device, train_loader, optimizer, epoch, criterion, scaler):
    model.train()
    total_loss = 0
    total_samples = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        # mixed-precision forward pass
        with autocast('cuda'):
            output = model(data)
            loss = criterion(output, target)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # accumulate the loss weighted by the actual batch size
        total_loss += loss.item() * data.size(0)
        total_samples += data.size(0)
        if batch_idx % 100 == 0:
            current_avg_loss = total_loss / total_samples
            print(
                f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '
                f'({100. * batch_idx / len(train_loader):.0f}%)]\tAVG_Loss: {current_avg_loss:.6f}'
            )

def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    total = len(test_loader.dataset)
    with torch.no_grad():
        pbar = tqdm(test_loader, desc='Testing')
        for data, target in pbar:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # per-batch loss, weighted by the actual batch size
            loss = criterion(output, target)
            test_loss += loss.item() * data.size(0)
            # accuracy
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            # update the progress bar
            acc = 100. * correct / total
            pbar.set_postfix({'Loss': f'{test_loss / total:.4f}', 'Acc': f'{acc:.2f}%'})
    avg_loss = test_loss / total
    accuracy = 100. * correct / total
    print(f'\nTest set: Average loss: {avg_loss:.4f}, '
          f'Accuracy: {correct}/{total} ({accuracy:.2f}%)\n')
    return accuracy
IX. Model Training Configuration:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import GradScaler

if __name__ == '__main__':
    data_path = r'your dataset path'
    model_save_path = 'your checkpoint save path'
    model_name = 'efficientnet-b3'   # which variant to train
    # multi-GPU setup
    devices = [3, 4, 6]
    device = torch.device(f"cuda:{devices[0]}")   # primary GPU
    epochs = 40
    num_classes = 10
    print('Building model...')
    model = build_efficientnet(model_name=model_name, num_classes=num_classes)
    # wrap the model with DataParallel
    model = nn.DataParallel(model, device_ids=devices)
    model.to(device)
    print('==> Preparing data...')
    train_loader, test_loader = load_data(data_path, model_name, num_gpus=len(devices))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
    scaler = GradScaler()   # scales gradients for mixed-precision training
    print('Start training...')
    best_acc = 0
    best_epoch = 0
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer, epoch, criterion, scaler)
        acc = test(model, device, test_loader, criterion)
        if acc > best_acc:
            best_acc = acc
            best_epoch = epoch
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.module.state_dict(),   # .module unwraps DataParallel
                'accuracy': acc,
            }, model_save_path)
    print('Finished Training')
    print('Start Evaluating...')
    print('train_data:')
    test(model, device, train_loader, criterion)
    print(f'Best Epoch: {best_epoch} Best Accuracy: {best_acc:.2f}%')
    print('Finished!')
X. Training Results:
1. Experimental environment:
PyTorch | 2.4.0
Python  | 3.12.7
CUDA    | 12.0
CPU     | AMD EPYC 7402 24-Core Processor
GPU     | A30 (24 GB)
OS      | Linux
2. Training configuration:
optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=0.001)
criterion = nn.CrossEntropyLoss()
epochs = 40
num_classes = 10
3. Training results:
3.1 B0 model:
3.2 B1 model:
3.3 B2 model:
3.4 B3 model:
3.5 B4 model:
Because the CIFAR-10 dataset is small and low-resolution, the performance differences between these variants are minor; if anything, the more complex variants performed slightly worse.