This article is an internal article of the 🔗365天深度学习训练营 (365-Day Deep Learning Training Camp).
Original author: K同学啊
In this installment we tackle a classification task on two-dimensional structured (tabular) data. For structured data, the usual go-to classifiers include logistic regression (for binary classification), KNN, SVM, decision trees, naive Bayes, random forests, and XGBoost. Here we instead use an LSTM (Long Short-Term Memory) network for the classification.
1. Data Import
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei font so the Chinese column names render in plots
import warnings
warnings.filterwarnings('ignore')
# Select the hardware device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
df = pd.read_excel('dia.xls')   # load the diabetes screening data
print(df.head())
print(df.shape)
print('Missing values -------------------')
print(df.isnull().sum())
print('Duplicate rows -------------------')
print(df.duplicated().sum())
         卡号  性别  年龄  高密度脂蛋白胆固醇  低密度脂蛋白胆固醇  极低密度脂蛋白胆固醇  甘油三酯  总胆固醇  脉搏  舒张压  \
0  18054421   0  38       1.25       2.99        1.07  0.64  5.31  83   83
1  18054422   0  31       1.15       1.99        0.84  0.50  3.98  85   63
2  18054423   0  27       1.29       2.21        0.69  0.60  4.19  73   61
3  18054424   0  33       0.93       2.01        0.66  0.84  3.60  83   60
4  18054425   0  36       1.17       2.83        0.83  0.73  4.83  85   67

   高血压史  尿素氮    尿酸  肌酐  体重检查结果  是否糖尿病
0     0  4.99  243.3  50      1      0
1     0  4.72  391.0  47      1      0
2     0  5.87  325.7  51      1      0
3     0  2.40  203.2  40      2      0
4     0  4.09  236.8  43      0      0
(1006, 16)
Missing values -------------------
卡号            0
性别            0
年龄            0
高密度脂蛋白胆固醇     0
低密度脂蛋白胆固醇     0
极低密度脂蛋白胆固醇    0
甘油三酯          0
总胆固醇          0
脉搏            0
舒张压           0
高血压史          0
尿素氮           0
尿酸            0
肌酐            0
体重检查结果        0
是否糖尿病         0
dtype: int64
Duplicate rows -------------------
0
2. Exploratory Data Analysis
# Boxplots of each feature, grouped by the diabetes label
columns = df.drop(['是否糖尿病', '卡号', '性别'], axis=1).columns
plt.figure(figsize=(15, 10))
for i, col in enumerate(columns, 1):
    plt.subplot(3, 5, i)
    sns.boxplot(x=df['是否糖尿病'], y=df[col])
    plt.title(f'Boxplot of {col}')
    plt.ylabel('Value')
    plt.grid(True)
plt.tight_layout()
plt.show()
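It is also worth checking how balanced the two classes are before modeling, since a heavily skewed label would make plain accuracy misleading. A minimal sketch, using the same df as above:

# Class balance of the target label
print(df['是否糖尿病'].value_counts())             # absolute counts per class
print(df['是否糖尿病'].value_counts(normalize=True))  # class proportions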
3. Correlation Analysis
# Correlation matrix over all columns except the ID column
df_corr = df.drop(['卡号'], axis=1).corr()
plt.figure(figsize=(8, 6))
plt.title('Correlation heatmap')
sns.heatmap(df_corr, annot=True)
plt.show()
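To connect the heatmap to the feature screening in the next section, here is a small sketch (reusing the df_corr computed above) that ranks each feature by its correlation with the label; in this dataset 高密度脂蛋白胆固醇 stands out as the clearly negatively correlated feature:

# Rank features by their correlation with the diabetes label
corr_with_target = df_corr['是否糖尿病'].drop('是否糖尿病').sort_values()
print(corr_with_target)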
4. Splitting the Dataset
Drop the negatively correlated feature (高密度脂蛋白胆固醇, i.e. HDL cholesterol) and keep only the features that correlate positively with the label.
# Split the dataset
X = df.drop(['是否糖尿病', '高密度脂蛋白胆固醇', '卡号'], axis=1)
y = df['是否糖尿病']
X = torch.tensor(np.array(X), dtype=torch.float32)
y = torch.tensor(np.array(y), dtype=torch.int64)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=1)
# Build the DataLoaders; shuffle=False is kept from the original run
# (shuffling the training set each epoch is usually recommended)
train_dl = DataLoader(TensorDataset(train_X, train_y),
                      batch_size=64,
                      shuffle=False)
test_dl = DataLoader(TensorDataset(test_X, test_y),
                     batch_size=64,
                     shuffle=False)
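One caveat worth flagging: the features sit on very different scales (尿酸 runs in the hundreds while 甘油三酯 stays below 1), and gradient-based models such as LSTMs usually train more stably on standardized inputs. Below is a minimal sketch using sklearn's StandardScaler, fit on the training split only to avoid leakage; this is an optional improvement and was not applied in the run whose logs appear later:

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
train_X_std = torch.tensor(scaler.fit_transform(train_X.numpy()), dtype=torch.float32)
test_X_std = torch.tensor(scaler.transform(test_X.numpy()), dtype=torch.float32)
# To adopt this, rebuild train_dl / test_dl from train_X_std / test_X_std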
5. Building the LSTM Model
# Define the model
class model_lstm(nn.Module):
    def __init__(self):
        super(model_lstm, self).__init__()
        # 13 input features remain after dropping the label, ID, and HDL columns
        self.lstm0 = nn.LSTM(input_size=13, hidden_size=200, num_layers=1, batch_first=True)
        self.lstm1 = nn.LSTM(input_size=200, hidden_size=200, num_layers=1, batch_first=True)
        self.fc0 = nn.Linear(200, 2)

    def forward(self, x):
        out, hidden1 = self.lstm0(x)
        # the final (h, c) state of lstm0 seeds lstm1
        out, _ = self.lstm1(out, hidden1)
        out = self.fc0(out)  # one logit pair per input row
        return out

model = model_lstm().to(device)
print(model)
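One detail to be aware of before training: each batch from train_dl has shape (64, 13), and nn.LSTM interprets a 2-D input as a single unbatched sequence of shape (seq_len, input_size). Each mini-batch is therefore processed as one 64-step sequence rather than as 64 independent length-1 sequences. A quick sanity check of the shapes involved (a sketch with dummy tensors):

# 2-D input: nn.LSTM treats (64, 13) as one unbatched 64-step sequence
dummy_2d = torch.randn(64, 13).to(device)
print(model(dummy_2d).shape)    # torch.Size([64, 2]) -> one logit pair per row
# 3-D alternative: 64 independent sequences of length 1
dummy_3d = torch.randn(64, 1, 13).to(device)
print(model(dummy_3d).shape)    # torch.Size([64, 1, 2])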
6. Training and Evaluation
# Training loop
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)   # size of the training set
    num_batches = len(dataloader)    # number of batches (ceil(size / batch_size))
    train_acc, train_loss = 0, 0     # running accuracy and loss
    for x, y in dataloader:
        X, y = x.to(device), y.to(device)
        # Compute the prediction error
        pred = model(X)              # network output
        loss = loss_fn(pred, y)      # loss
        # Backpropagation
        optimizer.zero_grad()        # reset gradients
        loss.backward()              # backward pass
        optimizer.step()             # parameter update
        # Accumulate accuracy and loss
        train_acc += (pred.argmax(1) == y).type(torch.float).sum().item()
        train_loss += loss.item()
    train_acc /= size
    train_loss /= num_batches
    return train_acc, train_loss
# Validation loop
def valid(dataloader, model, loss_fn):
    size = len(dataloader.dataset)   # size of the test set
    num_batches = len(dataloader)    # number of batches (ceil(size / batch_size))
    test_loss, test_acc = 0, 0       # running loss and accuracy
    # No gradients are needed outside training; this saves memory and compute
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            # Compute the loss
            pred = model(x)
            loss = loss_fn(pred, y)
            test_loss += loss.item()
            test_acc += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_acc /= size
    test_loss /= num_batches
    return test_acc, test_loss
loss_fn = nn.CrossEntropyLoss()   # loss function
learn_rate = 1e-4                 # learning rate
opt = torch.optim.Adam(model.parameters(), lr=learn_rate)
epochs = 30
train_loss = []
train_acc = []
test_loss = []
test_acc = []
for epoch in range(epochs):
    model.train()
    epoch_train_acc, epoch_train_loss = train(train_dl, model, loss_fn, opt)
    model.eval()
    epoch_test_acc, epoch_test_loss = valid(test_dl, model, loss_fn)
    train_acc.append(epoch_train_acc)
    train_loss.append(epoch_train_loss)
    test_acc.append(epoch_test_acc)
    test_loss.append(epoch_test_loss)
    # Read back the current learning rate
    lr = opt.state_dict()['param_groups'][0]['lr']
    template = ('Epoch:{:2d},Train_acc:{:.1f}%,Train_loss:{:.3f},Test_acc:{:.1f}%,Test_loss:{:.3f},lr:{:.2E}')
    print(template.format(epoch+1, epoch_train_acc*100, epoch_train_loss, epoch_test_acc*100, epoch_test_loss, lr))
print("="*20, 'Done', "="*20)
Epoch: 1,Train_acc:56.5%,Train_loss:0.687,Test_acc:53.3%,Test_loss:0.686,lr:1.00E-04
Epoch: 2,Train_acc:56.7%,Train_loss:0.683,Test_acc:53.3%,Test_loss:0.687,lr:1.00E-04
Epoch: 3,Train_acc:56.5%,Train_loss:0.682,Test_acc:53.3%,Test_loss:0.686,lr:1.00E-04
Epoch: 4,Train_acc:56.5%,Train_loss:0.681,Test_acc:53.3%,Test_loss:0.685,lr:1.00E-04
Epoch: 5,Train_acc:56.7%,Train_loss:0.681,Test_acc:53.3%,Test_loss:0.685,lr:1.00E-04
Epoch: 6,Train_acc:56.8%,Train_loss:0.679,Test_acc:53.3%,Test_loss:0.684,lr:1.00E-04
Epoch: 7,Train_acc:57.0%,Train_loss:0.678,Test_acc:53.3%,Test_loss:0.683,lr:1.00E-04
Epoch: 8,Train_acc:56.8%,Train_loss:0.676,Test_acc:53.3%,Test_loss:0.681,lr:1.00E-04
Epoch: 9,Train_acc:56.5%,Train_loss:0.674,Test_acc:53.3%,Test_loss:0.679,lr:1.00E-04
Epoch:10,Train_acc:56.7%,Train_loss:0.671,Test_acc:53.3%,Test_loss:0.676,lr:1.00E-04
Epoch:11,Train_acc:57.0%,Train_loss:0.668,Test_acc:53.3%,Test_loss:0.673,lr:1.00E-04
Epoch:12,Train_acc:57.4%,Train_loss:0.665,Test_acc:53.3%,Test_loss:0.669,lr:1.00E-04
Epoch:13,Train_acc:57.7%,Train_loss:0.660,Test_acc:53.6%,Test_loss:0.664,lr:1.00E-04
Epoch:14,Train_acc:58.8%,Train_loss:0.655,Test_acc:53.6%,Test_loss:0.660,lr:1.00E-04
Epoch:15,Train_acc:59.5%,Train_loss:0.649,Test_acc:54.0%,Test_loss:0.655,lr:1.00E-04
Epoch:16,Train_acc:59.9%,Train_loss:0.643,Test_acc:56.3%,Test_loss:0.650,lr:1.00E-04
Epoch:17,Train_acc:61.8%,Train_loss:0.636,Test_acc:57.9%,Test_loss:0.644,lr:1.00E-04
Epoch:18,Train_acc:63.6%,Train_loss:0.628,Test_acc:60.3%,Test_loss:0.637,lr:1.00E-04
Epoch:19,Train_acc:65.3%,Train_loss:0.618,Test_acc:61.6%,Test_loss:0.630,lr:1.00E-04
Epoch:20,Train_acc:65.8%,Train_loss:0.607,Test_acc:63.2%,Test_loss:0.623,lr:1.00E-04
Epoch:21,Train_acc:66.6%,Train_loss:0.596,Test_acc:63.9%,Test_loss:0.616,lr:1.00E-04
Epoch:22,Train_acc:67.8%,Train_loss:0.584,Test_acc:64.6%,Test_loss:0.609,lr:1.00E-04
Epoch:23,Train_acc:70.3%,Train_loss:0.572,Test_acc:64.2%,Test_loss:0.602,lr:1.00E-04
Epoch:24,Train_acc:71.4%,Train_loss:0.560,Test_acc:66.6%,Test_loss:0.595,lr:1.00E-04
Epoch:25,Train_acc:72.4%,Train_loss:0.549,Test_acc:66.9%,Test_loss:0.590,lr:1.00E-04
Epoch:26,Train_acc:73.9%,Train_loss:0.538,Test_acc:66.6%,Test_loss:0.584,lr:1.00E-04
Epoch:27,Train_acc:74.3%,Train_loss:0.528,Test_acc:66.9%,Test_loss:0.579,lr:1.00E-04
Epoch:28,Train_acc:74.7%,Train_loss:0.518,Test_acc:67.5%,Test_loss:0.574,lr:1.00E-04
Epoch:29,Train_acc:76.0%,Train_loss:0.508,Test_acc:69.5%,Test_loss:0.570,lr:1.00E-04
Epoch:30,Train_acc:76.8%,Train_loss:0.499,Test_acc:70.5%,Test_loss:0.566,lr:1.00E-04
==================== Done ====================
7. Visualization
epochs_range = range(epochs)
plt.figure(figsize=(14, 4))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, train_acc, label='Training Accuracy')
plt.plot(epochs_range, test_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')
plt.subplot(1, 2, 2)
plt.plot(epochs_range, train_loss, label='Training Loss')
plt.plot(epochs_range, test_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()
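On medical data, accuracy alone can mask class-specific errors. Here is a short sketch that scores the trained model on the test split with sklearn's confusion_matrix and classification_report, reusing test_dl so the evaluation matches the batching used during training:

from sklearn.metrics import confusion_matrix, classification_report

# Collect batchwise predictions on the test split
model.eval()
preds, labels = [], []
with torch.no_grad():
    for x, y in test_dl:
        preds.append(model(x.to(device)).argmax(1).cpu())
        labels.append(y)
preds = torch.cat(preds).numpy()
labels = torch.cat(labels).numpy()
print(confusion_matrix(labels, preds))
print(classification_report(labels, preds))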
Summary:
1. Handling long sequences
Through its gating mechanism (input, forget, and output gates), LSTM selectively retains and discards information, which mitigates the vanishing and exploding gradients that plague vanilla RNNs on long sequences and lets it stay effective as sequences grow (the standard gate equations are given after this summary).
2. Capturing temporal dependencies
LSTM can capture both long- and short-range temporal dependencies, remembering earlier information and letting it shape the current output; this ability is essential in applications such as language modeling, speech recognition, and video analysis.
3. Flexibility across input types
LSTM handles not only one-dimensional time series but also multi-dimensional sequence data such as image sequences and text sequences, which gives it broad applicability across many domains.
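For reference, the standard gate equations behind the mechanism described in point 1 (textbook form; $W$, $U$, $b$ are learned parameters, $\sigma$ is the sigmoid, and $\odot$ is elementwise multiplication):

$$
\begin{aligned}
f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f) && \text{(forget gate)} \\
i_t &= \sigma(W_i x_t + U_i h_{t-1} + b_i) && \text{(input gate)} \\
o_t &= \sigma(W_o x_t + U_o h_{t-1} + b_o) && \text{(output gate)} \\
\tilde{c}_t &= \tanh(W_c x_t + U_c h_{t-1} + b_c) && \text{(candidate cell state)} \\
c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t && \text{(cell state update)} \\
h_t &= o_t \odot \tanh(c_t) && \text{(hidden state)}
\end{aligned}
$$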