13 Kaggle Competition: 2020 California House Price Prediction
# Import the required libraries
import numpy as np
import pandas as pd
import torch
import hashlib
import os
import tarfile
import zipfile
import requests
from torch import nn
from d2l import torch as d2l
# Read the training and test data
train_data = pd.read_csv('../data/california-house-prices/train.csv')
test_data = pd.read_csv('../data/california-house-prices/test.csv')
# Print the data shapes
# print(train_data.shape)
# print(test_data.shape)
# (47439, 41)
# (31626, 40)
# Print selected columns of the first 4 rows
# print(train_data.iloc[0:4, [0, 1, 2, 3, 4, 5, 6, -3, -2, -1]])
# Concatenate the training and test data for feature engineering
all_features = pd.concat((train_data.iloc[:, train_data.columns != 'Sold Price'], test_data.iloc[:, 1:]))
# all_features.info()
# print(all_features.shape)
# (79065, 40)
# Drop the Id column
all_features = all_features.iloc[:, 1:]
# Convert the string date columns to datetime
all_features['Listed On'] = pd.to_datetime(all_features['Listed On'], format="%Y-%m-%d")
all_features['Last Sold On'] = pd.to_datetime(all_features['Last Sold On'], format="%Y-%m-%d")
# Standardize the numeric features
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
all_features[numeric_features] = all_features[numeric_features].fillna(0)
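# Optional sanity check (not part of the original pipeline): after standardization each numeric
# column should have a mean close to 0 and a standard deviation close to 1; filling the missing
# entries with 0 (the post-standardization mean) keeps the mean near 0 but can shrink the std a bit.
# print(all_features[numeric_features].mean())
# print(all_features[numeric_features].std())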
# Print the number of unique values for each string (object) feature
# for in_object in all_features.dtypes[all_features.dtypes == 'object'].index:
#     print(in_object.ljust(20), len(all_features[in_object].unique()))
"""
in_object.ljust(20): left-justifies the column name and pads it with spaces to a width of at least
20 characters, so the printout lines up neatly.
len(all_features[in_object].unique()): counts the number of unique values in that column.
This check informs the one-hot encoding below and helps avoid blowing up memory.
"""
# Select the features to keep: all numeric columns plus the low-cardinality 'Type' column
features = list(numeric_features)
features.extend(['Type'])
all_features = all_features[features]
# One-hot encode the categorical features
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
# print(all_features.shape)
# (79065, 195)
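# Note (optional check, not in the original script): with dummy_na=True, get_dummies also creates a
# missing-value indicator column for each encoded categorical feature. To see the columns generated
# from 'Type':
# print([c for c in all_features.columns if c.startswith('Type_')])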
# Inspect the dtypes of all features
# print(all_features.dtypes.unique())
# Extract the NumPy data from the pandas DataFrame and convert it to tensors for training
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(
    train_data['Sold Price'].values.reshape(-1, 1),
    dtype=torch.float32
)
# Check whether a GPU is available for training
if not torch.cuda.is_available():
    print('CUDA is not available. Training on CPU ...')
else:
    print('CUDA is available. Training on GPU ...')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Move the features and labels to the device
train_features = train_features.to(device)
test_features = test_features.to(device)
train_labels = train_labels.to(device)
# Define the mean squared error loss
loss = nn.MSELoss()
# Number of input features
in_features = train_features.shape[1]
# print(in_features)
# 195
# Define the neural network model
dropout1, dropout2, dropout3 = 0.2, 0.3, 0.5
def get_net():
    net = nn.Sequential(
        nn.Flatten(),
        nn.Linear(in_features, 128), nn.ReLU(),
        # nn.Dropout(dropout1),
        nn.Linear(128, 64), nn.ReLU(),
        # nn.Dropout(dropout2),
        nn.Linear(64, 32), nn.ReLU(),
        # nn.Dropout(dropout3),
        nn.Linear(32, 1))
    return net.to(device)  # move the model to the selected device (GPU if available)
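# Optional check (not in the original): the total number of trainable parameters of this MLP with
# 195 input features is (195*128 + 128) + (128*64 + 64) + (64*32 + 32) + (32*1 + 1) = 35,457.
# print(sum(p.numel() for p in get_net().parameters()))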
# Compute the root-mean-squared error between the logs of the predictions and the labels
def log_rmse(net, features, labels):
    """
    torch.clamp limits the predictions from below at 1, so every prediction is at least 1.
    This avoids taking the logarithm of zero or negative values, where the log is undefined
    or numerically problematic.
    """
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    # Convert the PyTorch tensor to a Python scalar
    return rmse.item()
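# In formula form, the value returned above is sqrt(mean((log(y_hat) - log(y))^2)): the RMSE in
# log-space, so relative errors matter rather than absolute dollar errors. A tiny standalone check
# of the same computation on hypothetical values:
# y_hat, y = torch.tensor([[100.0], [200.0]]), torch.tensor([[110.0], [180.0]])
# print(torch.sqrt(loss(torch.log(y_hat), torch.log(y))).item())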
# Model training function
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []  # training and test losses recorded after each epoch
    train_iter = d2l.load_array((train_features, train_labels), batch_size)  # training data iterator
    # weight_decay applies L2 regularization through the optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)  # Adam optimizer
    for epoch in range(num_epochs):
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)  # make sure the mini-batch is on the device
            optimizer.zero_grad()  # clear the gradients
            l = loss(net(X), y)  # compute the loss
            l.backward()  # backpropagation
            optimizer.step()  # update the model parameters
        # Compute and record the log rmse on the training set
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            # Compute and record the log rmse on the test set
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
# K-fold cross-validation
# The i-th slice is used as validation data and the remaining slices as training data
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train.to(device), y_train.to(device), X_valid.to(device), y_valid.to(device)  # keep everything on the device
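# Note on the split (an observation about the code above, not a change to it): fold_size uses
# integer division, so with 47,439 training rows and k=5 each fold holds 9,487 rows and the last
# 47,439 - 5 * 9,487 = 4 rows are not used in any fold.
# print(train_features.shape[0] // 5, train_features.shape[0] % 5)  # 9487 4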
# Train K times in K-fold cross-validation and return the average training and validation errors
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        # Accumulate the final-epoch training and validation losses of this fold
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'Fold {i + 1}, train log rmse {float(train_ls[-1]):f}, '
              f'valid log rmse {float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k
# Training hyperparameters
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 0.01, 0, 256
# Run K-fold cross-validation
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
print(f'{k}-fold validation: avg train log rmse: {float(train_l):f}, '
      f'avg valid log rmse: {float(valid_l):f}')
d2l.plt.show()
# Train on the full training set and generate predictions for the Kaggle submission
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'Train log rmse: {float(train_ls[-1]):f}')
    # Apply the network to the test set; move the result from the GPU to the CPU and convert it to a NumPy array
    preds = net(test_features).detach().cpu().numpy()
    # Reformat the predictions for export to Kaggle
    test_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
    submission.to_csv('../data/california-house-prices/submission.csv', index=False)
# Train the model and make predictions
train_and_pred(train_features, test_features, train_labels, test_data,
               num_epochs, lr, weight_decay, batch_size)
d2l.plt.show()
Run results:
Fold 1, train log rmse 0.356748, valid log rmse 0.331666
Fold 2, train log rmse 0.337252, valid log rmse 0.341875
Fold 3, train log rmse 0.317294, valid log rmse 0.324516
Fold 4, train log rmse 0.337175, valid log rmse 0.360625
Fold 5, train log rmse 0.356537, valid log rmse 0.379667
5-fold validation: avg train log rmse: 0.341001, avg valid log rmse: 0.347670
Train log rmse: 0.307162
Competition score: