B题题目:洪水灾害的数据分析与预测
完整论文也写完了
第二问代码(第1、3、4问的代码及完整论文见文末)
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
# Configure matplotlib so the Chinese column names and titles used in this
# script can be rendered. The original choice, 'PingFang HK', is kept as the
# preferred font; other widely used CJK fonts are offered as fallbacks for
# machines where it is not installed. The candidate list is filtered against
# the fonts matplotlib has discovered so only usable names are configured.
from matplotlib import font_manager

cjk_font_candidates = [
    'PingFang HK',
    'SimHei',
    'Microsoft YaHei',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
]
installed_font_names = {entry.name for entry in font_manager.fontManager.ttflist}
available_cjk_fonts = [name for name in cjk_font_candidates if name in installed_font_names]
# If none of the candidates is installed, keep the original setting unchanged.
rcParams['font.sans-serif'] = available_cjk_fonts or ['PingFang HK']
# Use the ASCII hyphen for negative tick labels; CJK fonts often lack the
# Unicode minus glyph.
rcParams['axes.unicode_minus'] = False

# Load the training set. The file is decoded as GBK, matching the Chinese
# column headers used throughout the script.
train_data = pd.read_csv('train.csv', encoding='GBK')
# Cluster the samples into three risk categories with K-means, using the
# flood probability column as the only clustering feature.
X = train_data[['洪水概率']]
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)

# K-means numbers its clusters arbitrarily. Renumber them by descending
# cluster centre so that category 0 is the highest-probability cluster, 1 the
# middle one and 2 the lowest, giving the category numbers a stable meaning.
# Note that kmeans.labels_ / kmeans.predict() still use the raw numbering;
# only the '风险类别' column is renumbered.
cluster_centers = pd.Series(kmeans.cluster_centers_.ravel())
labels_by_descending_center = cluster_centers.sort_values(ascending=False).index
risk_rank_of_label = {
    label: rank for rank, label in enumerate(labels_by_descending_center)
}
train_data['风险类别'] = pd.Series(kmeans.labels_, index=train_data.index).map(
    risk_rank_of_label
)

# Visualise the clustering: one point per sample, coloured by risk category.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=train_data.index, y='洪水概率', hue='风险类别', data=train_data, palette='viridis')
plt.title('洪水概率聚类结果')
plt.show()
# Summarise the indicator statistics of each risk group.
# Cluster category numbers carry no inherent order, so the high / medium / low
# groups are identified by their mean flood probability rather than by a
# hard-coded category number.
mean_probability_by_category = (
    train_data.groupby('风险类别')['洪水概率'].mean().sort_values(ascending=False)
)
# Exactly three categories are expected here (one per cluster); the unpacking
# raises ValueError otherwise.
high_label, medium_label, low_label = mean_probability_by_category.index

high_risk = train_data[train_data['风险类别'] == high_label]
medium_risk = train_data[train_data['风险类别'] == medium_label]
low_risk = train_data[train_data['风险类别'] == low_label]

print("High risk group:\n", high_risk.describe())
print("Medium risk group:\n", medium_risk.describe())
print("Low risk group:\n", low_risk.describe())
from sklearn.ensemble import RandomForestClassifier
# Rank the indicators by how much they help a random forest predict the risk
# category. The id, the flood probability itself and the category column are
# excluded from the predictors.
excluded_columns = ['id', '洪水概率', '风险类别']
X = train_data.drop(columns=excluded_columns)
y = train_data['风险类别']

# Fit the forest with a fixed seed so the ranking is reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)

# Pair each importance score with its column name, most important first.
importance_by_feature = pd.Series(clf.feature_importances_, index=X.columns)
feature_importances = importance_by_feature.sort_values(ascending=False)
print("Feature importances:\n", feature_importances)

# Keep the names of the five highest-ranked indicators.
top_5_features = feature_importances.index[:5].tolist()
print("Top 5 features:\n", top_5_features)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Build an early-warning classifier from the five most important indicators.
X_top5 = train_data[top_5_features]

# Hold out 20% of the samples for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_top5, y, test_size=0.2, random_state=0
)

# Logistic regression with a raised iteration cap (max_iter=1000).
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the held-out samples and print per-class precision / recall / F1.
y_pred = model.predict(X_test)
evaluation_report = classification_report(y_test, y_pred)
print(evaluation_report)
import numpy as np
# Sensitivity analysis: shift one top feature at a time on the held-out set
# and record how the model's accuracy responds.
# Result layout: {feature name: [(shift, accuracy), ...]}.
# NOTE(review): the shift is absolute (-0.1 .. +0.1 in the feature's own
# units), not relative to the feature's mean -- confirm this is the intent.
perturbations = np.linspace(-0.1, 0.1, 5)
sensitivity_analysis = {}
for feature in top_5_features:
    sensitivity_analysis[feature] = []
    for change in perturbations:
        # Perturb a fresh copy so shifts never accumulate across iterations.
        X_test_copy = X_test.copy()
        X_test_copy[feature] = X_test_copy[feature] + change
        # Use a separate name so the baseline predictions in y_pred survive.
        perturbed_pred = model.predict(X_test_copy)
        accuracy = (perturbed_pred == y_test).mean()
        sensitivity_analysis[feature].append((change, accuracy))
print("Sensitivity analysis:\n", sensitivity_analysis)
# Plot the sensitivity analysis: one accuracy-versus-shift line per feature.
plt.figure(figsize=(14, 8))
for feature, values in sensitivity_analysis.items():
    # Each entry is a (shift, accuracy) pair; split the pairs into two series.
    changes, accuracies = zip(*values)
    plt.plot(changes, accuracies, marker='o', label=feature)
plt.title('Sensitivity Analysis of Top 5 Features')
plt.xlabel('Change in Feature Value')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()
更详细的思路、各题目思路、代码、讲解视频、成品论文及其他相关内容,可以点击下方群名片哦!