动手学深度学习（Pytorch版）代码实践 -计算机视觉-39实战Kaggle比赛：狗的品种识别（ImageNet Dogs）

39实战`Kaggle`比赛：狗的品种识别（`ImageNet Dogs`）

比赛链接：Dog Breed Identification | Kaggle

1.导入包

import torch
from torch import nn
import collections
import math
import os
import shutil
import torchvision
from d2l import torch as d2l
import matplotlib.pyplot as plt
import liliPytorch as lp

2.数据集处理

# 精简数据集
# file_path = '../data/kaggle_dog_tiny/'
# 原数据集
file_path = '../data/dog-breed-identification/'

# 整理数据集
# 从原始训练集中拆分验证集，然后将图像移动到按标签分组的子文件夹中。
#@save
def read_csv_labels(fname):
    """读取CSV文件中的标签，它返回一个字典，该字典将文件名中不带扩展名的部分映射到其标签"""
    with open(fname, 'r') as f:
        # 跳过文件头行(列名)
        lines = f.readlines()[1:]
    tokens = [l.rstrip().split(',') for l in lines]
    return dict(((name, label) for name, label in tokens))

# labels = read_csv_labels(os.path.join(file_path, 'labels.csv'))
# print(labels) # {'0097c6242c6f3071762d9f85c3ef1b2f': 'bedlington_terrier', '00a338a92e4e7bf543340dc849230e75': 'dingo'}
# print('训练样本 :', len(labels)) # 训练样本 : 1000
# print('类别 :', len(set(labels.values()))) # 类别 : 120

# 定义reorg_train_valid函数来将验证集从原始的训练集中拆分出来
#@save
def copyfile(filename, target_dir):
    """将文件复制到目标目录"""
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy(filename, target_dir)

#@save
def reorg_train_valid(data_dir, labels, valid_ratio):
    """将验证集从原始的训练集中拆分出来"""
    # 训练数据集中样本最少的类别中的样本数
    n = collections.Counter(labels.values()).most_common()[-1][1]
    # 验证集中每个类别的样本数
    n_valid_per_label = max(1, math.floor(n * valid_ratio))
    label_count = {}
    for train_file in os.listdir(os.path.join(data_dir, 'train')): # 遍历训练集文件夹中的所有文件。
        label = labels[train_file.split('.')[0]] # 获取文件名（去掉扩展名）
        fname = os.path.join(data_dir, 'train', train_file) # 构建完整的文件路径

        copyfile(fname, os.path.join(data_dir, 'train_valid_test',
                                     'train_valid', label))
        
        if label not in label_count or label_count[label] < n_valid_per_label:
            copyfile(fname, os.path.join(data_dir, 'train_valid_test',
                                         'valid', label))
            label_count[label] = label_count.get(label, 0) + 1
        else:
            copyfile(fname, os.path.join(data_dir, 'train_valid_test',
                                         'train', label))
    return n_valid_per_label


# reorg_test函数用来在预测期间整理测试集
#@save
def reorg_test(data_dir):
    """在预测期间整理测试集，以方便读取"""
    for test_file in os.listdir(os.path.join(data_dir, 'test')):
        copyfile(os.path.join(data_dir, 'test', test_file),
                 os.path.join(data_dir, 'train_valid_test', 'test',
                              'unknown'))

def reorg_dog_data(data_dir, valid_ratio):
    labels = read_csv_labels(os.path.join(data_dir, 'labels.csv'))
    reorg_train_valid(data_dir, labels, valid_ratio)
    reorg_test(data_dir)


reorg_dog_data(file_path, valid_ratio = 0.1)

3.数据集加载

# 数据图像增广
# 训练
transform_train = torchvision.transforms.Compose([
    # 随机裁剪图像，所得图像为原始面积的0.08～1之间，高宽比在3/4和4/3之间。
    # 然后，缩放图像以创建224x224的新图像
    torchvision.transforms.RandomResizedCrop(224, scale=(0.08, 1.0),
                                             ratio=(3.0/4.0, 4.0/3.0)),
    torchvision.transforms.RandomHorizontalFlip(),
    # 随机更改亮度，对比度和饱和度
    torchvision.transforms.ColorJitter(brightness=0.4,
                                       contrast=0.4,
                                       saturation=0.4),
    # 添加随机噪声
    torchvision.transforms.ToTensor(),
    # 标准化图像的每个通道
    torchvision.transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])])
# 测试
transform_test = torchvision.transforms.Compose([
    torchvision.transforms.Resize(256),
    # 从图像中心裁切224x224大小的图片
    torchvision.transforms.CenterCrop(224),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])])

# 读取数据集
# 创建数据集对象
# 通常用于定义数据源及其预处理方法。
train_dataset, train_valid_dataset = [
    # ImageFolder 创建数据集时，它会遍历指定目录下的所有子文件夹，
    # 并将每个子文件夹的名称作为一个类别标签。然后，它会按字母顺序给每个类别分配一个索引
    torchvision.datasets.ImageFolder(
        os.path.join(file_path, 'train_valid_test', folder),
        transform=transform_train
    ) for folder in ['train', 'train_valid']]

valid_dataset, test_dataset = [
    torchvision.datasets.ImageFolder(
        os.path.join(file_path, 'train_valid_test', folder),
        transform=transform_test
    ) for folder in ['valid', 'test']]

# 显示每个类别名称和对应的索引
# print(train_dataset.class_to_idx) 4
# {'affenpinscher': 0, 'afghan_hound': 1, 'african_hunting_dog': 2}

batch_size = 128
# 创建数据加载器
# 通常用于训练过程中按批次提供数据，具有更高效的数据加载和处理能力。
train_iter, train_valid_iter = [
    torch.utils.data.DataLoader(
        dataset, batch_size, shuffle=True, drop_last=True
    ) for dataset in (train_dataset, train_valid_dataset)]

valid_iter = torch.utils.data.DataLoader(
    valid_dataset, batch_size, shuffle=False,drop_last=True)

test_iter = torch.utils.data.DataLoader(
    test_dataset, batch_size, shuffle=False,drop_last=False)

4.预训练模型resnet34

# 用于创建和配置训练模型
def get_net(devices):
    # 创建一个空的 nn.Sequential 容器
    finetune_net = nn.Sequential()
    # 加载预训练的 ResNet-34 模型，并将其特征层（features）部分添加到 finetune_net 中
    finetune_net.features = torchvision.models.resnet34(pretrained=True)
    # 定义一个新的输出网络
    finetune_net.output_new = nn.Sequential(
        nn.Linear(1000, 256),
        nn.ReLU(),
        nn.Linear(256, 120)
    )
    # 将模型参数分配到指定的设备（如 GPU 或 CPU）
    finetune_net = finetune_net.to(devices[0])
    # 冻结预训练的特征层参数，以避免在训练过程中更新这些参数
    for param in finetune_net.features.parameters():
        param.requires_grad = False
    # 返回配置好的模型
    return finetune_net

5.模型训练

def train_batch(net, X, y, loss, trainer, devices):
    """使用多GPU训练一个小批量数据。
    参数：
    net: 神经网络模型。
    X: 输入数据，张量或张量列表。
    y: 标签数据。
    loss: 损失函数。
    trainer: 优化器。
    devices: GPU设备列表。
    返回：
    train_loss_sum: 当前批次的训练损失和。
    train_acc_sum: 当前批次的训练准确度和。
    """
    # 如果输入数据X是列表类型
    if isinstance(X, list):
        # 将列表中的每个张量移动到第一个GPU设备
        X = [x.to(devices[0]) for x in X]
    else:
        X = X.to(devices[0])# 如果X不是列表，直接将X移动到第一个GPU设备
    y = y.to(devices[0])# 将标签数据y移动到第一个GPU设备
    net.train() # 设置网络为训练模式
    trainer.zero_grad()# 梯度清零
    pred = net(X) # 前向传播，计算预测值
    l = loss(pred, y) # 计算损失
    l.sum().backward()# 反向传播，计算梯度
    trainer.step() # 更新模型参数
    train_loss_sum = l.sum()# 计算当前批次的总损失
    train_acc_sum = d2l.accuracy(pred, y)# 计算当前批次的总准确度
    return train_loss_sum, train_acc_sum# 返回训练损失和与准确度和


def train(net, train_iter, valid_iter, num_epochs, lr, wd, devices, lr_period, lr_decay):

    trainer = torch.optim.SGD(
        # net.parameters()：返回模型 net 中所有参数。
        # if param.requires_grad：仅选择那些 requires_grad 为 True 的参数。
        # 这些参数是需要进行梯度更新的（即未冻结的参数）
        (param for param in net.parameters()if param.requires_grad), 
        # momentum用于加速 SGD 的收敛速度，通过在更新参数时考虑之前的更新方向，减少震荡
        # weight_decay权重衰减用于防止过拟合
        lr=lr,momentum=0.9, weight_decay=wd)
    # trainer = torch.optim.Adam(net.parameters(), lr=lr,weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.StepLR(trainer, lr_period, lr_decay)
    loss = nn.CrossEntropyLoss(reduction="none")
    num_batches, timer = len(train_iter), d2l.Timer()
    legend = ['train loss', 'train acc']
    if valid_iter is not None:
        legend.append('valid acc')
    animator = lp.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=legend)
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    for epoch in range(num_epochs):
        net.train()
        metric = lp.Accumulator(3)
        for i, (features, labels) in enumerate(train_iter):
            timer.start()
            l, acc = train_batch(net, features, labels,loss, trainer, devices)
            metric.add(l, acc, labels.shape[0])
            timer.stop()
            # train_l = metric[0] / metric[2] # 计算训练损失
            # train_acc = metric[1] / metric[2] # 计算训练准确率
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,(metric[0] / metric[2], metric[1] / metric[2],None))
        if valid_iter is not None:
            valid_acc = d2l.evaluate_accuracy_gpu(net, valid_iter)
            animator.add(epoch + 1, (None, None, valid_acc))
        scheduler.step()
        # print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
        #       f'valid_acc {valid_acc:.3f}')
        
    measures = (f'train loss {metric[0] / metric[2]:.3f}, '
                f'train acc {metric[1] / metric[2]:.3f}')
    if valid_iter is not None:
        measures += f', valid acc {valid_acc:.3f}'
    print(measures + f'\n{metric[2] * num_epochs / timer.sum():.1f}'
          f' examples/sec on {str(devices)}')

6.模型预测

def predict(file_path_module):
    # 预测
    net = get_net(d2l.try_all_gpus())
    net.load_state_dict(torch.load(file_path_module + 'imageNet_Dogs.params'))

    # 初始化一个空列表preds用于存储预测结果
    preds = []

    # 遍历测试集中的每一个数据和标签
    for data, label in test_iter:
        # 使用神经网络(net)对数据进行预测，并使用softmax函数将输出转化为概率分布
        output = torch.nn.functional.softmax(net(data.to(devices[0])), dim=1)
        # 将预测结果从GPU中取出，转换为NumPy数组后，添加到preds列表中
        preds.extend(output.cpu().detach().numpy())

    # 获取测试数据文件夹中所有文件的id，并按字典顺序排序
    ids = sorted(os.listdir(
        os.path.join(file_path, 'train_valid_test', 'test', 'unknown')))

    # 打开一个新的CSV文件submission.csv用于写入预测结果
    with open(file_path + 'submission.csv', 'w') as f:
        # 写入CSV文件的表头，包含'id'和所有类别标签
        f.write('id,' + ','.join(train_valid_dataset.classes) + '\n')
        # 遍历文件id和对应的预测结果
        for i, output in zip(ids, preds):
            # 写入每个文件的id和对应的预测概率
            f.write(i.split('.')[0] + ',' + ','.join(
                [str(num) for num in output]) + '\n')

7.定义超参数并保存训练参数

# 定义模型
devices, num_epochs, lr, wd = d2l.try_all_gpus(), 20, 1e-4, 1e-4
lr_period, lr_decay, net = 10, 0.1, get_net(devices)
train(net, train_iter, valid_iter, num_epochs, lr, wd, devices, lr_period, lr_decay)
# num_epochs, lr, wd, lr_period, lr_decay = 20, 1e-4, 1e-4, 4, 0.9 (简略数据集)
# train loss 0.750, train acc 0.814, valid acc 0.646
# 647.4 examples/sec on [device(type='cuda', index=0)]

# num_epochs, lr, wd, lr_period, lr_decay = 20, 1e-4, 1e-4, 10, 0.1 (原数据集)
# train loss 0.863, train acc 0.759, valid acc 0.844
# 830.8 examples/sec on [device(type='cuda', index=0)]
plt.show()

net = get_net(devices)
train(net, train_valid_iter, None, num_epochs, lr, wd, devices, lr_period,lr_decay)
# num_epochs, lr, wd, lr_period, lr_decay = 20, 1e-4, 1e-4, 4, 0.9 (简略数据集)
# train loss 0.721, train acc 0.815
# 704.9 examples/sec on [device(type='cuda', index=0)]

# num_epochs, lr, wd, lr_period, lr_decay = 20, 1e-4, 1e-4, 10, 0.1 (原数据集)
# train loss 0.865, train acc 0.758
# 845.4 examples/sec on [device(type='cuda', index=0)]

plt.show()
# 保存模型参数
file_path_module = '../limuPytorch/module/'
torch.save(net.state_dict(), file_path_module + 'imageNet_Dogs.params')