机器学习复习代码
利用sklearn实现knn
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
def model_selection(x_train, y_train):
    """Grid-search a k-nearest-neighbours classifier and return the best one.

    Every combination of ``n_neighbors`` in (3, 5, 7) and Minkowski power
    ``p`` in (1, 2) is scored with 5-fold cross-validation.  In scikit-learn
    ``p=1`` selects the Manhattan distance and ``p=2`` the Euclidean distance.

    Args:
        x_train: Feature matrix handed to ``GridSearchCV.fit``.
        y_train: Class labels handed to ``GridSearchCV.fit``.

    Returns:
        The ``best_estimator_`` found by the search.
    """
    search = GridSearchCV(
        KNeighborsClassifier(),
        {'n_neighbors': [3, 5, 7], 'p': [1, 2]},
        verbose=2,
        cv=5,
    )
    search.fit(x_train, y_train)

    winner = search.best_estimator_
    print("Best Model:", search.best_params_, "Accuracy:", search.best_score_)
    print(winner)
    return winner
def read():
    """Load both feature/label column groups from the workbook and stack them.

    The sheet is read without a header row and row 0 is skipped.  Columns
    0-1 and 3-4 are used as features, columns 2 and 5 as the matching labels.
    The stacked arrays are printed before being returned.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the two feature groups stacked
        row-wise and ``y`` is the concatenated label vector cast to ``int``.
    """
    sheet = pd.read_excel(r"data/shuixianhua.xlsx", header=None)
    body = sheet.iloc[1:]  # iloc[rows, columns]; drop the first row

    feature_groups = [body.iloc[:, cols].values for cols in ([0, 1], [3, 4])]
    label_groups = [body.iloc[:, col].values for col in (2, 5)]

    x = np.vstack(feature_groups)  # stack the groups vertically
    print("x:")
    print(x)
    y = np.hstack(label_groups)  # join the 1-D label arrays end to end
    print("y:")
    print(y)

    # Author's note: the labels were read from Excel as strings here, hence
    # the explicit cast; it is unnecessary if they already load as numbers.
    return x, y.astype(int)
if __name__ == '__main__':
    # Load the data set, then grid-search the KNN classifier on all of it.
    x, y = read()
    best_model = model_selection(x, y)
利用sklearn实现线性回归
数据集展示
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
def MAE(y, y_pre):
    """Return the mean absolute error between targets and predictions.

    Both inputs are converted to float arrays first, so plain lists and
    object-dtype columns (as produced by ``DataFrame.values`` on a sheet that
    mixes text and numbers) are accepted.

    Args:
        y: True target values.
        y_pre: Predicted values, same shape as ``y``.

    Returns:
        The mean of ``|y - y_pre|``.
    """
    y = np.asarray(y, dtype=float)
    y_pre = np.asarray(y_pre, dtype=float)
    return np.mean(np.abs(y - y_pre))
def MSE(y, y_pred):
    """Return the mean squared error between targets and predictions.

    Both inputs are converted to float arrays first, so plain lists and
    object-dtype columns are accepted.

    Args:
        y: True target values.
        y_pred: Predicted values, same shape as ``y``.

    Returns:
        The mean of ``(y - y_pred) ** 2``.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y - y_pred) ** 2)
def RMSE(y, y_pred):
    """Return the root mean squared error, i.e. the square root of ``MSE``.

    Args:
        y: True target values.
        y_pred: Predicted values.
    """
    mean_squared = MSE(y, y_pred)
    return np.sqrt(mean_squared)
def MAPE(y, y_pred):
    """Return the mean absolute percentage error as a fraction (not x100).

    The absolute value is taken of the whole ratio ``(y - y_pred) / y`` so
    that negative targets contribute a positive error instead of cancelling
    out positive ones.

    Args:
        y: True target values.  A zero target yields ``inf``/``nan`` for that
            element, exactly as the plain formula does.
        y_pred: Predicted values, same shape as ``y``.

    Returns:
        The mean of ``|(y - y_pred) / y|``.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y - y_pred) / y))
def R2(y, y_pred):
    """Return the coefficient of determination R^2 = 1 - SS_res / SS_tot.

    ``SS_tot`` is measured around the mean of the *true* targets; using the
    mean of the predictions only coincides with it when the predictions
    happen to have the same mean as the targets.

    Args:
        y: True target values.
        y_pred: Predicted values, same shape as ``y``.

    Returns:
        The R^2 score.  When the targets have zero variance the ratio is
        undefined; 1.0 is returned for a perfect fit and 0.0 otherwise.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y - y_pred) ** 2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    if ss_tot == 0:
        # Constant targets: avoid 0/0 and x/0.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)
def judege(name, y, y_pre):
    """Print MAE, MSE, RMSE, MAPE and R2 of one model on a single line.

    Args:
        name: Label placed at the start of the printed line.
        y: True target values.
        y_pre: Predicted values.
    """
    metrics = (MAE, MSE, RMSE, MAPE, R2)
    mae, mse, rmse, mape, r2 = (metric(y, y_pre) for metric in metrics)
    print(f"{name}的MAE:{mae},MSE:{mse},RMSE:{rmse}.MAPE:{mape},R2:{r2}")
def read():
    """Load the four (x, y) column pairs from the ComposePlot workbook.

    The sheet is read without a header row and the first two rows are
    skipped.  Columns 0/1, 2/3, 4/5 and 6/7 form the four pairs; each ``x``
    is kept two-dimensional (one column) and each ``y`` one-dimensional.

    Returns:
        tuple: ``(x1, y1, x2, y2, x3, y3, x4, y4)``.
    """
    sheet = pd.read_excel(r"../data/ComposePlot.xlsx", header=None)
    body = sheet.iloc[2:]

    columns = []
    for pair in range(4):
        x_col, y_col = 2 * pair, 2 * pair + 1
        columns.append(body.iloc[:, [x_col]].values)  # list index keeps it 2-D
        columns.append(body.iloc[:, y_col].values)
    return tuple(columns)
def getModel(x, y):
    """Fit an ordinary least-squares ``LinearRegression`` on ``(x, y)``.

    Args:
        x: Feature matrix.
        y: Target values.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return LinearRegression().fit(x, y)
def main(x1, y1, x2, y2, x3, y3, x4, y4):
    """Fit one linear model per data set and print its training-set metrics.

    Args:
        x1, y1, x2, y2, x3, y3, x4, y4: Four feature/target pairs.
    """
    datasets = [(x1, y1), (x2, y2), (x3, y3), (x4, y4)]
    # Fit all models first, then report them in order.
    models = [getModel(x, y) for x, y in datasets]
    for index, (model, (x, y)) in enumerate(zip(models, datasets), start=1):
        judege(f"mode{index}", y, model.predict(x))
if __name__ == '__main__':
    # Read the four data sets and report the fit quality of each model.
    x1, y1, x2, y2, x3, y3, x4, y4 = read()
    main(x1, y1, x2, y2, x3, y3, x4, y4)
利用sklearn实现逻辑回归
数据集展示
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
def main(x, y):
    """Fit a logistic-regression classifier and print its training predictions.

    Args:
        x: Feature matrix.
        y: Class labels.
    """
    classifier = LogisticRegression().fit(x, y)
    print(classifier.predict(x))
def read():
    """Load features (columns 0-1) and labels (column 2) from the workbook.

    The sheet is read without a header row and row 0 is skipped.  Both arrays
    are printed before being returned.

    Returns:
        tuple: ``(x, y)`` with ``x`` two-dimensional and ``y`` one-dimensional.
    """
    filename = "data/student.xlsx"
    data = pd.read_excel(filename, header=None)
    # Row 0 (presumably a text header) is still part of the frame, which
    # typically leaves every column typed as ``object``.  scikit-learn
    # classifiers reject object-dtype numeric labels as an unknown label
    # type, so re-infer the dtypes once that row has been dropped.
    rows = data.iloc[1:].infer_objects()
    x = rows.iloc[:, [0, 1]].values
    y = rows.iloc[:, 2].values
    print(x)
    print(y)
    return x, y
if __name__ =='__main__':
    # Load the student data and fit/evaluate the classifier on it.
    x,y=read()
    main(x,y)
利用sklearn实现SVM(支持向量机)
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, \
f1_score
def load_data():
    """Load the iris data set and split it 70/30 into train and test parts.

    The split is shuffled with a fixed ``random_state`` of 20.

    Returns:
        tuple: ``(x, y, x_train, x_test, y_train, y_test)`` where ``x`` and
        ``y`` are the complete feature matrix and target vector.
    """
    iris = load_iris()
    features, targets = iris.data, iris.target
    split = train_test_split(
        features, targets, test_size=0.3, shuffle=True, random_state=20
    )
    return (features, targets, *split)
## 无脑写这个就行
def model_selection(x_train, y_train):
    """Grid-search SVC hyper-parameters with 3-fold cross-validation.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        dict: The best parameter combination (``best_params_``), with the
        keys ``C``, ``kernel``, ``degree``, ``gamma`` and ``coef0``.
    """
    param_grid = {
        'C': np.arange(1, 10, 5),
        # rbf: Gaussian kernel, linear: linear kernel, poly: polynomial kernel
        'kernel': ['rbf', 'linear', 'poly'],
        'degree': np.arange(1, 10, 2),
        'gamma': ['scale', 'auto'],
        'coef0': np.arange(-10, 10, 5),
    }
    search = GridSearchCV(SVC(), param_grid, cv=3, verbose=2, n_jobs=3)
    search.fit(x_train, y_train)

    print('best score:', search.best_score_)
    print('best parameters:', search.best_params_)
    return search.best_params_
def train(x_train, x_test, y_train, y_test, C, gamma, kernel, degree=3, coef0=0.0):
    """Fit an SVC with the given hyper-parameters and print test-set metrics.

    Args:
        x_train: Training feature matrix.
        x_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        C: Regularisation parameter of the SVC.
        gamma: Kernel coefficient (``'scale'``, ``'auto'`` or a number).
        kernel: Kernel name, e.g. ``'rbf'``, ``'linear'`` or ``'poly'``.
        degree: Degree of the polynomial kernel.  Defaults to 3, the
            scikit-learn default, so existing callers are unaffected.
        coef0: Independent term of the kernel.  Defaults to 0.0, the
            scikit-learn default.
    """
    model = SVC(C=C, kernel=kernel, gamma=gamma, degree=degree, coef0=coef0)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Confusion matrix
    confusion = confusion_matrix(y_test, y_pred)
    print("混淆矩阵:\n", confusion)

    # Accuracy plus class-weighted precision, recall and F1 score
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("准确率Accuracy:", accuracy)
    print("精度Precision:", precision)
    print("召回率Recall:", recall)
    print("FScore:", f1)


if __name__ == '__main__':
    X, Y, x_train, x_test, y_train, y_test = load_data()
    best_params = model_selection(x_train, y_train)
    # Forward every tuned parameter, including ``degree`` and ``coef0``, so
    # the model evaluated here is the one the grid search actually selected.
    train(
        x_train, x_test, y_train, y_test,
        best_params["C"], best_params["gamma"], best_params["kernel"],
        degree=best_params.get("degree", 3),
        coef0=best_params.get("coef0", 0.0),
    )
小知识点扩展
将数据分成训练集和测试集
# Hold out 30% of the samples as the test set; the data are shuffled first,
# and the fixed random_state makes the split repeatable between runs.
x_train, x_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.3,
                     shuffle=True, random_state=20)
来自ChatGPT的解释
是的,这段代码用于将数据集分成训练集和测试集。具体地说,它使用 `train_test_split` 函数将输入的特征数据 `x` 和标签数据 `y` 分成四个部分:`x_train`(训练集的特征数据)、`x_test`(测试集的特征数据)、`y_train`(训练集的标签数据)和 `y_test`(测试集的标签数据)。以下是这段代码的主要参数和作用:
- `x` 是特征数据。
- `y` 是标签数据。
- `test_size=0.3` 指定了测试集的比例,这里是总数据的 30% 将被用作测试集。
- `shuffle=True` 表示在分割数据之前对数据进行洗牌,以确保数据的随机性。
- `random_state=20` 是一个随机种子,用于确保每次运行代码时分割数据的结果都相同,这有助于复现实验结果。
综上所述,只要把这段代码背下来就好了,同时要记住各个参数的意思
归一化
def hypo(x, w, b):
    """Logistic-regression hypothesis returning hard 0/1 labels.

    Computes the linear score ``z = xw + b``, passes it through the sigmoid
    ``1 / (1 + exp(-z))`` and thresholds the result at 0.5.

    Args:
        x: Feature matrix.
        w: Weight vector or matrix.
        b: Bias term.

    Returns:
        Integer array with 1 where the sigmoid output is >= 0.5, else 0.
    """
    score = np.matmul(x, w) + b
    probability = 1 / (1 + np.exp(-score))
    is_positive = probability >= 0.5
    return is_positive * 1  # bool -> int
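书上p49。注意:上面的 hypo 函数做的并不是数据归一化,而是逻辑回归的假设函数:先计算 z=wx+b,再用 sigmoid 函数 $h(z)=\frac{1}{1+e^{-z}}$ 把 z 压缩到 (0,1) 区间,最后以 0.5 为阈值得到 0/1 的分类结果。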
从0实现线性回归
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 添加归一化函数
def normalize_data(data, axis=None):
    """Min-max scale ``data`` into the range [0, 1].

    Args:
        data: Array-like of numbers; it is converted to a float array, so
            object-dtype columns read from a spreadsheet are accepted.
        axis: Axis along which the minimum and maximum are taken.  The
            default ``None`` uses the global minimum and maximum of the whole
            array; pass ``0`` to scale each column of a 2-D matrix separately.

    Returns:
        numpy.ndarray: Float array of the same shape as ``data``.  Where the
        value range is zero (constant data) the result is 0 instead of NaN.
    """
    values = np.asarray(data, dtype=float)
    min_val = np.min(values, axis=axis, keepdims=True)
    max_val = np.max(values, axis=axis, keepdims=True)
    span = max_val - min_val
    # A constant input has span 0; divide by 1 there so the result is 0
    # rather than the NaN that 0/0 would give.
    safe_span = np.where(span == 0, 1.0, span)
    return (values - min_val) / safe_span
def prediction(X, W, bias):
    """Return the linear-model output ``XW + bias``.

    Args:
        X: Feature matrix.
        W: Weight vector or matrix.
        bias: Bias term added to every output.
    """
    linear_term = np.matmul(X, W)
    return linear_term + bias
def cost_function(X, y, W, bias):
    """Return the half mean-squared-error cost of the linear model.

    Args:
        X: Feature matrix with one sample per row.
        y: Target values; a 1-D vector or a column matching the predictions.
        W: Weights.
        bias: Bias term.

    Returns:
        ``1 / (2m) * sum((y - y_hat) ** 2)`` with ``m`` the number of rows.
    """
    X = np.asarray(X, dtype=float)
    m = X.shape[0]
    y_hat = prediction(X, W, bias)
    # Align y with the predictions: a 1-D y of shape (m,) subtracted from a
    # column y_hat of shape (m, 1) would broadcast to an (m, m) matrix and
    # sum m*m bogus residuals.
    residual = np.asarray(y, dtype=float).reshape(y_hat.shape) - y_hat
    return 0.5 * (1 / m) * np.sum(residual ** 2)
def gradient_descent(X, y, W, bias, alpha):
    """Perform one gradient-descent step on the half-MSE cost.

    Args:
        X: Feature matrix with one sample per row.
        y: Target values; a 1-D vector or a column matching the predictions.
        W: Current weights.
        bias: Current bias.
        alpha: Learning rate.

    Returns:
        tuple: The updated ``(W, bias)``; ``W`` keeps its original shape.
    """
    X = np.asarray(X, dtype=float)
    m = X.shape[0]
    y_hat = prediction(X, W, bias)
    # Align y with the predictions first: (m,) minus (m, 1) would broadcast
    # to (m, m), giving a wrong gradient and turning W into an (n, m) matrix.
    residual = np.asarray(y, dtype=float).reshape(y_hat.shape) - y_hat
    grad_w = -(1 / m) * np.matmul(X.T, residual)
    grad_b = -(1 / m) * np.sum(residual)
    return W - alpha * grad_w, bias - alpha * grad_b
def train(X, y, ite=200, alpha=0.2):
    """Run batch gradient descent and record the cost before every step.

    Args:
        X: Feature matrix with one sample per row.
        y: Target values, one per row of ``X``.
        ite: Number of gradient-descent iterations.
        alpha: Learning rate (defaults to the previously hard-coded 0.2).

    Returns:
        list: The cost measured at the start of each iteration.
    """
    X = np.asarray(X, dtype=float)
    # Use a column vector for y so that it lines up with the (m, 1)
    # predictions produced by the (n, 1) weights instead of broadcasting
    # to an (m, m) matrix.
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    n = X.shape[1]
    W = np.random.randn(n, 1)
    b = 0.1
    costs = []
    for _ in range(ite):
        costs.append(cost_function(X, y, W, b))
        W, b = gradient_descent(X, y, W, b, alpha)
    return costs
def read():
    """Load the single-feature data set and min-max normalise the feature.

    The sheet is read without a header row and the first two rows are
    skipped.  Column 0 is the feature (kept two-dimensional) and column 1
    the target.

    Returns:
        tuple: ``(x_normalized, y)``.
    """
    sheet = pd.read_excel(r"../../data/easy_test.xlsx", header=None)
    body = sheet.iloc[2:]
    features = body.iloc[:, [0]].values
    targets = body.iloc[:, 1].values
    # Only the feature column is normalised; the targets are left unscaled.
    return normalize_data(features), targets
if __name__ == '__main__':
    # Train on the normalised data and collect the cost of every iteration.
    x, y = read()
    costs = train(x, y)
    # print(costs)
    # Plot the training-loss curve (cost against iteration index).
    plt.figure()
    plt.plot(range(len(costs)), costs, marker='o', linestyle='-', color='b', label='Training Loss')
    plt.xlabel('Iteration')
    plt.ylabel('Cost')
    plt.title('Training Loss')
    plt.legend()
    plt.grid(True)
    plt.show()