2024五一数学建模竞赛(五一赛)C题保姆级分析完整思路+代码+数据教学
C题 煤矿深部开采冲击地压危险预测
第一问 导入数据
以下仅展示部分,完整版看文末的文章
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
def preprocess_dataframe(df):
    """Add FFT, calendar, rolling and difference features to ``df`` in place.

    Reads the columns ``'时间 (time)'`` and ``'声波强度 (AE)'`` and writes the
    columns ``FFT_mean``, ``FFT_std``, ``FFT_max``, ``hour``, ``minute``,
    ``weekday``, ``is_weekend``, ``rolling_mean``, ``rolling_std``,
    ``diff_1`` and ``diff_2``.  The signal column is cast to float.

    Args:
        df: DataFrame holding the two input columns.  It is modified in
            place.  The time column is parsed with ``pd.to_datetime`` for
            feature extraction only; the column itself is left untouched.

    Returns:
        The same ``df`` object, with the feature columns added.
    """
    time_col = '时间 (time)'
    signal_col = '声波强度 (AE)'

    def calculate_fft_features(signal):
        """Return (mean, std, max) of the FFT magnitude spectrum of ``signal``."""
        # Accept a Series, list/tuple, array or a single number and turn it
        # into a 1-D array before transforming.
        samples = np.asarray(signal)
        if samples.ndim > 1:
            samples = samples.squeeze()
        samples = np.atleast_1d(samples)
        magnitude = np.abs(np.fft.fft(samples))
        return np.mean(magnitude), np.std(magnitude), np.max(magnitude)

    df[signal_col] = df[signal_col].astype(float)

    # NOTE(review): the helper is applied to one sample at a time, so every
    # "spectrum" has a single bin and the features reduce to |x|, 0 and |x|.
    # Values are kept as they were; a windowed FFT is probably what was
    # intended - confirm before relying on these columns.
    fft_stats = np.array(
        [calculate_fft_features(value) for value in df[signal_col]],
        dtype=float,
    ).reshape(-1, 3)  # keeps a (0, 3) shape for an empty frame
    df['FFT_mean'] = fft_stats[:, 0]
    df['FFT_std'] = fft_stats[:, 1]
    df['FFT_max'] = fft_stats[:, 2]

    # Calendar features derived from the timestamp.
    timestamps = pd.to_datetime(df[time_col])
    df['hour'] = timestamps.dt.hour
    df['minute'] = timestamps.dt.minute
    df['weekday'] = timestamps.dt.weekday
    # weekday is 0 (Monday) .. 6 (Sunday); 5 and 6 are the weekend.
    df['is_weekend'] = (df['weekday'] >= 5).astype('int64')

    rolling_window = 10  # number of samples per rolling window
    windowed = df[signal_col].rolling(window=rolling_window)
    for column, values in (('rolling_mean', windowed.mean()),
                           ('rolling_std', windowed.std())):
        # The first rolling_window - 1 rows have no complete window.  Fill
        # them (and any other NaN) with the statistic found at row position
        # rolling_window, looked up by name instead of by column position.
        # The result is assigned back to the frame rather than filled in
        # place on a column selection, which pandas Copy-on-Write ignores.
        # Frames too short to have that row keep their NaN values.
        if len(values) > rolling_window:
            values = values.fillna(values.iloc[rolling_window])
        df[column] = values

    # First difference x[t] - x[t-1]; the undefined first row becomes 0.
    df['diff_1'] = df[signal_col].diff(1).fillna(0)
    # Lag-2 difference x[t] - x[t-2] (not a second-order difference); the
    # two undefined leading rows become 0.
    df['diff_2'] = df[signal_col].diff(2).fillna(0)
    return df
# Load the AE sheet of Attachment 1 and run the feature engineering on it.
train_df = pd.read_excel("./附件1 (Attachment 1).xlsx", sheet_name='AE')
train_df
processed_df = preprocess_dataframe(train_df)
# Integer-encode the class column; class_categories keeps the original values.
processed_df['类别 (class)_encoded'], class_categories = pd.factorize(processed_df['类别 (class)'])
# Label a sample 1 when its AE value is above the 80th percentile, else 0.
q = np.percentile(processed_df['声波强度 (AE)'], 80)
data_modified = (processed_df['声波强度 (AE)'] > q).astype(int).tolist()
processed_df['干扰信号'] = data_modified
processed_df
# Split features from the target.
# NOTE(review): the target is a threshold on '声波强度 (AE)', and that column
# stays in X, so the label can be read straight from a feature - looks like
# target leakage; confirm this is the intended label definition.
X = processed_df.drop(columns=['时间 (time)', '类别 (class)', '干扰信号', '类别 (class)_encoded'])
y = processed_df['干扰信号']
# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train.head()
集成随机森林代码:
# Train a 100-tree random forest on the training split and predict the test split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
# Report accuracy and the per-class metrics.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
# Build the confusion matrix with an explicit label order so its rows and
# columns always line up with the tick labels drawn below, even if a class
# is absent from the test split.
class_labels = np.unique(y)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
print("Confusion Matrix:\n", conf_matrix)
# Plot the confusion matrix.
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
tick_marks = np.arange(len(class_labels))
plt.xticks(tick_marks, class_labels, rotation=45)
plt.yticks(tick_marks, class_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
第一问结果表:(csv文件)
第二问xgboost混淆矩阵:
from xgboost import XGBClassifier

# Gradient-boosted trees: 100 estimators, learning rate 0.1, depth-3 trees.
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
# Draw the confusion matrix as an annotated heatmap with integer counts.
conf_matrix = confusion_matrix(y_test, y_pred_xgb)
heatmap_axes = sns.heatmap(conf_matrix, annot=True, fmt='d')
heatmap_axes.set_xlabel('Predicted Label')
heatmap_axes.set_ylabel('True Label')
heatmap_axes.set_title('Confusion Matrix')
plt.show()
第三问部分代码:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Convert the train/test features and labels to float32 PyTorch tensors.
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.tensor(split.values, dtype=torch.float32)
    for split in (X_train, y_train, X_test, y_test)
)
# Wrap the tensors as datasets; only the training set gets a loader
# (shuffled mini-batches of 64).
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
# Network definition.
class NeuralNetwork(nn.Module):
    """Fully connected binary classifier: input -> 64 -> 32 -> 1.

    The two hidden layers use ReLU; the output layer is passed through a
    sigmoid, so ``forward`` returns one value in [0, 1] per sample.

    Args:
        input_dim: Number of input features.  When omitted, the width of the
            module-level ``X_train`` is used, which matches the behaviour of
            the original zero-argument constructor.
    """

    def __init__(self, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: size the input layer from the
            # training features defined at module level.
            input_dim = X_train.shape[1]
        self.layer1 = nn.Linear(input_dim, 64)
        self.layer2 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return sigmoid outputs with a trailing axis of size 1."""
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = torch.sigmoid(self.output_layer(x))  # sigmoid yields a probability
        return x
# Instantiate the model, the loss function and the optimizer.
model = NeuralNetwork()
criterion = nn.BCELoss()  # binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Train for a fixed number of epochs over the shuffled mini-batches.
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        outputs = model(inputs)
        # Drop only the trailing feature axis.  A bare squeeze() would also
        # remove the batch axis of a single-sample batch, leaving a 0-d
        # output whose shape no longer matches the labels.
        loss = criterion(outputs.squeeze(-1), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Reports the loss of the last mini-batch of the epoch, not an average.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')
其中更详细的思路,各题目思路、代码、讲解视频、成品论文及其他相关内容,可以点击下方群名片哦!