Contents
Overall neural network framework
Core computation steps
Parameter initialization
Unrolling and rolling the weight matrices
Forward propagation
Loss function definition
Backpropagation
Full implementation: the complete training loop
Hands-on digit recognition
Overall neural network framework
Core computation steps
Parameter initialization
# Constructor. normalize_data: whether to normalize the input data
def __init__(self, data, labels, layers, normalize_data=False):
    # Preprocess the data
    data_processed = prepare_for_training(data, normalize_data=normalize_data)
    # Store the processed data
    self.data = data_processed
    self.labels = labels
    # Network layer sizes: 784 (28*28*1 pixels) | 25 (hidden units) | 10 (ten-class task)
    self.layers = layers
    self.normalize_data = normalize_data
    # Initialize the weight parameters
    self.thetas = MultilayerPerceptron.thetas_init(layers)

@staticmethod
def thetas_init(layers):
    # Number of layers
    num_layers = len(layers)
    # One weight matrix per layer transition
    thetas = {}
    # Runs twice, producing two parameter matrices: 25*785 and 10*26
    for layer_index in range(num_layers - 1):
        # Input size
        in_count = layers[layer_index]  # 784
        # Output size
        out_count = layers[layer_index + 1]
        # Small random initialization. in_count+1 accounts for the bias term;
        # there is one bias per output unit.
        thetas[layer_index] = np.random.rand(out_count, in_count + 1) * 0.05
    return thetas
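To make the shapes concrete, here is a quick sanity check of the initialization for the layers = [784, 25, 10] architecture used throughout this post (a standalone sketch that only assumes numpy):

import numpy as np

layers = [784, 25, 10]
thetas = {}
for layer_index in range(len(layers) - 1):
    in_count = layers[layer_index]
    out_count = layers[layer_index + 1]
    # +1 column for the bias; small values keep the sigmoid out of its flat regions
    thetas[layer_index] = np.random.rand(out_count, in_count + 1) * 0.05

print(thetas[0].shape)  # (25, 785)
print(thetas[1].shape)  # (10, 26)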
Unrolling and rolling the weight matrices
# Flatten the weight matrices (25*785 and 10*26) into one long vector
@staticmethod
def thetas_unroll(thetas):
    num_theta_layers = len(thetas)  # two matrices: 25*785 and 10*26
    unrolled_theta = np.array([])
    for theta_layer_index in range(num_theta_layers):
        # flatten() turns each matrix into a 1-D array; np.hstack concatenates
        # the flattened matrices into one vector of length 19625 + 260 = 19885
        unrolled_theta = np.hstack((unrolled_theta, thetas[theta_layer_index].flatten()))
    return unrolled_theta
# Restore the matrices
"""
Reorganizes the flattened `unrolled_thetas` back into a dictionary `thetas`,
where each key-value pair holds one layer's weight matrix.
The inputs are the flattened parameters `unrolled_thetas` and the network
layer structure `layers`.
The function first gets the number of layers `num_layers`, then creates an
empty dictionary `thetas` to store the weight matrices.
It loops over every layer except the last, computing each weight matrix's
size from the layer's input and output node counts.
Inside the loop it computes the matrix width `thetas_width` and height
`thetas_height`, then the total element count `thetas_volume`.
It slices the corresponding index range out of the flattened parameters and
uses `reshape` to turn that slice back into a 2-D matrix.
Finally it stores the matrix in `thetas`, keyed by the current layer index.
After the loop the function returns `thetas`, which contains every layer's
weight matrix.
"""
@staticmethod
def thetas_roll(unrolled_thetas, layers):
    # Inverse transform: turn the flat vector back into matrices
    num_layers = len(layers)
    # Will hold one weight matrix per layer transition
    thetas = {}
    # Tracks how far into the flat vector we have consumed
    unrolled_shift = 0
    # Rebuild each parameter matrix
    for layer_index in range(num_layers - 1):
        # Input size
        in_count = layers[layer_index]
        # Output size
        out_count = layers[layer_index + 1]
        # Matrix dimensions
        thetas_width = in_count + 1
        thetas_height = out_count
        # Total number of elements in this matrix
        thetas_volume = thetas_width * thetas_height
        # Where in the flat vector this matrix starts
        start_index = unrolled_shift
        # Where it ends
        end_index = unrolled_shift + thetas_volume
        layer_theta_unrolled = unrolled_thetas[start_index:end_index]
        # Recovers the 25*785 and 10*26 matrices
        thetas[layer_index] = layer_theta_unrolled.reshape((thetas_height, thetas_width))
        # Advance the offset
        unrolled_shift = unrolled_shift + thetas_volume
    return thetas
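thetas_unroll and thetas_roll are exact inverses, which is easy to verify with a round trip (a standalone sketch, assuming the methods above are collected into the MultilayerPerceptron class shown later and saved as multilayer_perceptron.py, as the demo script imports it):

import numpy as np
from multilayer_perceptron import MultilayerPerceptron

layers = [784, 25, 10]
thetas = MultilayerPerceptron.thetas_init(layers)

# Flatten both matrices into one vector
unrolled = MultilayerPerceptron.thetas_unroll(thetas)
print(unrolled.shape)  # (19885,) = 25*785 + 10*26

# Restore and compare against the originals
restored = MultilayerPerceptron.thetas_roll(unrolled, layers)
print(np.allclose(restored[0], thetas[0]))  # True
print(np.allclose(restored[1], thetas[1]))  # True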
Forward propagation
# Forward propagation
@staticmethod
def feedforward_propagation(data, thetas, layers):
    # Number of layers
    num_layers = len(layers)
    # Number of samples
    num_examples = data.shape[0]
    # The input layer activation is the data itself
    in_layer_activation = data
    # Compute layer by layer
    for layer_index in range(num_layers - 1):
        theta = thetas[layer_index]
        # Hidden-layer output
        out_layer_activation = sigmoid(np.dot(in_layer_activation, theta.T))
        # The raw result is num_examples*25; prepend the bias column to get num_examples*26
        out_layer_activation = np.hstack((np.ones((num_examples, 1)), out_layer_activation))
        in_layer_activation = out_layer_activation
    # Return the output layer's result, dropping the bias column
    return in_layer_activation[:, 1:]
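The shapes through one forward pass can be traced with random data. A standalone sketch (the inline sigmoid stands in for the one imported from utils.hypothesis):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

layers = [784, 25, 10]
thetas = {0: np.random.rand(25, 785) * 0.05,
          1: np.random.rand(10, 26) * 0.05}

# 5 samples; the bias column is assumed to be prepended already (hence 785 columns)
data = np.hstack((np.ones((5, 1)), np.random.rand(5, 784)))

activation = data
for layer_index in range(len(layers) - 1):
    z = np.dot(activation, thetas[layer_index].T)          # (5, 25), then (5, 10)
    activation = np.hstack((np.ones((5, 1)), sigmoid(z)))  # prepend the bias column
print(activation[:, 1:].shape)  # (5, 10): one score per class, bias dropped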
Loss function definition
# Loss function
@staticmethod
def cost_function(data, labels, thetas, layers):
    # Number of layers
    num_layers = len(layers)
    # Number of samples
    num_examples = data.shape[0]
    # Number of classes
    num_labels = layers[-1]
    # Run one forward pass
    predictions = MultilayerPerceptron.feedforward_propagation(data, thetas, layers)
    # Build the labels: every sample's label must be one-hot
    bitwise_labels = np.zeros((num_examples, num_labels))
    for example_index in range(num_examples):
        # Set the position of the true class to 1
        bitwise_labels[example_index][labels[example_index][0]] = 1
    # Compute the cross-entropy loss
    bit_set_cost = np.sum(np.log(predictions[bitwise_labels == 1]))
    bit_not_set_cost = np.sum(np.log(1 - predictions[bitwise_labels == 0]))
    cost = (-1 / num_examples) * (bit_set_cost + bit_not_set_cost)
    return cost
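The one-hot construction and the two log terms are easier to see on a toy prediction matrix (a standalone sketch with made-up numbers):

import numpy as np

predictions = np.array([[0.9, 0.05, 0.05],   # sample 0, true class 0
                        [0.2, 0.7, 0.1]])    # sample 1, true class 1
labels = np.array([[0], [1]])
num_examples, num_labels = predictions.shape

# One-hot encode the labels
bitwise_labels = np.zeros((num_examples, num_labels))
for example_index in range(num_examples):
    bitwise_labels[example_index][labels[example_index][0]] = 1

# Cross-entropy: reward confident correct entries, punish confident wrong ones
bit_set_cost = np.sum(np.log(predictions[bitwise_labels == 1]))
bit_not_set_cost = np.sum(np.log(1 - predictions[bitwise_labels == 0]))
cost = (-1 / num_examples) * (bit_set_cost + bit_not_set_cost)
print(cost)  # ~0.45 for these numbers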
Backpropagation
# Backpropagation
@staticmethod
def back_propagation(data, labels, thetas, layers):
    # Number of layers
    num_layers = len(layers)
    # Number of samples and number of features
    (num_examples, num_features) = data.shape
    # Number of output classes
    num_label_types = layers[-1]
    # Accumulated gradients, layer by layer
    deltas = {}
    # Initialization: one zero matrix per layer transition
    for layer_index in range(num_layers - 1):
        in_count = layers[layer_index]
        out_count = layers[layer_index + 1]
        # Two matrices: 25*785 and 10*26
        deltas[layer_index] = np.zeros((out_count, in_count + 1))
    # Iterate over every training sample
    for example_index in range(num_examples):
        # Per-layer pre-activations and activations for this sample
        layers_inputs = {}
        layers_activations = {}
        # Layer-0 input as a column vector (785*1, bias already included)
        layers_activation = data[example_index, :].reshape((num_features, 1))
        layers_activations[0] = layers_activation
        # Forward pass, layer by layer
        for layer_index in range(num_layers - 1):
            layer_theta = thetas[layer_index]  # current weights: 25*785, then 10*26
            layer_input = np.dot(layer_theta, layers_activation)  # 25*1 first, then 10*1; each layer's output feeds the next
            layers_activation = np.vstack((np.array([[1]]), sigmoid(layer_input)))
            layers_inputs[layer_index + 1] = layer_input  # next layer's pre-activation
            layers_activations[layer_index + 1] = layers_activation  # next layer's activation
        output_layer_activation = layers_activation[1:, :]  # drop the bias entry
        delta = {}
        # One-hot encode this sample's label
        bitwise_label = np.zeros((num_label_types, 1))
        bitwise_label[labels[example_index][0]] = 1
        # Error between the output layer and the true label
        delta[num_layers - 1] = output_layer_activation - bitwise_label
        # Walk backwards through the hidden layers: L-1, L-2, ..., 1
        for layer_index in range(num_layers - 2, 0, -1):
            layer_theta = thetas[layer_index]
            next_delta = delta[layer_index + 1]
            layer_input = layers_inputs[layer_index]
            layer_input = np.vstack((np.array([[1]]), layer_input))
            # Apply the backpropagation formula
            delta[layer_index] = np.dot(layer_theta.T, next_delta) * sigmoid_gradient(layer_input)
            # Drop the bias component
            delta[layer_index] = delta[layer_index][1:, :]
        # Accumulate this sample's gradient contribution
        for layer_index in range(num_layers - 1):
            layer_delta = np.dot(delta[layer_index + 1], layers_activations[layer_index].T)
            deltas[layer_index] = deltas[layer_index] + layer_delta  # 25*785, then 10*26
    # Average over all samples
    for layer_index in range(num_layers - 1):
        deltas[layer_index] = deltas[layer_index] * (1 / num_examples)
    return deltas
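A classic way to validate back_propagation is a numerical gradient check: perturb one unrolled parameter, re-evaluate the cost, and compare the central difference against the corresponding entry of the analytic gradient. A minimal sketch, assuming the class is importable as multilayer_perceptron:

import numpy as np
from multilayer_perceptron import MultilayerPerceptron

def numerical_gradient(data, labels, unrolled_theta, layers, index, eps=1e-4):
    # Central difference on a single unrolled parameter
    theta_plus = unrolled_theta.copy()
    theta_minus = unrolled_theta.copy()
    theta_plus[index] += eps
    theta_minus[index] -= eps
    cost_plus = MultilayerPerceptron.cost_function(
        data, labels, MultilayerPerceptron.thetas_roll(theta_plus, layers), layers)
    cost_minus = MultilayerPerceptron.cost_function(
        data, labels, MultilayerPerceptron.thetas_roll(theta_minus, layers), layers)
    return (cost_plus - cost_minus) / (2 * eps)

The analytic value to compare against is the matching entry of thetas_unroll applied to the output of back_propagation; the two should agree to several decimal places.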
Full implementation: the complete training loop
import numpy as np
from utils.features import prepare_for_training
from utils.hypothesis import sigmoid, sigmoid_gradient

class MultilayerPerceptron:
    # Constructor. normalize_data: whether to normalize the input data
    def __init__(self, data, labels, layers, normalize_data=False):
        # Preprocess the data
        data_processed = prepare_for_training(data, normalize_data=normalize_data)
        # Store the processed data
        self.data = data_processed
        self.labels = labels
        # Network layer sizes: 784 (28*28*1 pixels) | 25 (hidden units) | 10 (ten-class task)
        self.layers = layers
        self.normalize_data = normalize_data
        # Initialize the weight parameters
        self.thetas = MultilayerPerceptron.thetas_init(layers)

    @staticmethod
    def thetas_init(layers):
        # Number of layers
        num_layers = len(layers)
        # One weight matrix per layer transition
        thetas = {}
        # Runs twice, producing two parameter matrices: 25*785 and 10*26
        for layer_index in range(num_layers - 1):
            # Input size
            in_count = layers[layer_index]  # 784
            # Output size
            out_count = layers[layer_index + 1]
            # Small random initialization. in_count+1 accounts for the bias term;
            # there is one bias per output unit.
            thetas[layer_index] = np.random.rand(out_count, in_count + 1) * 0.05
        return thetas
    # Flatten the weight matrices (25*785 and 10*26) into one long vector
    @staticmethod
    def thetas_unroll(thetas):
        num_theta_layers = len(thetas)  # two matrices: 25*785 and 10*26
        unrolled_theta = np.array([])
        for theta_layer_index in range(num_theta_layers):
            # flatten() turns each matrix into a 1-D array; np.hstack concatenates
            # the flattened matrices into one vector of length 19625 + 260 = 19885
            unrolled_theta = np.hstack((unrolled_theta, thetas[theta_layer_index].flatten()))
        return unrolled_theta

    # Restore the matrices
    """
    Reorganizes the flattened `unrolled_thetas` back into a dictionary `thetas`,
    where each key-value pair holds one layer's weight matrix.
    The inputs are the flattened parameters `unrolled_thetas` and the network
    layer structure `layers`.
    The function first gets the number of layers `num_layers`, then creates an
    empty dictionary `thetas` to store the weight matrices.
    It loops over every layer except the last, computing each weight matrix's
    size from the layer's input and output node counts.
    Inside the loop it computes the matrix width `thetas_width` and height
    `thetas_height`, then the total element count `thetas_volume`.
    It slices the corresponding index range out of the flattened parameters and
    uses `reshape` to turn that slice back into a 2-D matrix.
    Finally it stores the matrix in `thetas`, keyed by the current layer index.
    After the loop the function returns `thetas`, which contains every layer's
    weight matrix.
    """
    @staticmethod
    def thetas_roll(unrolled_thetas, layers):
        # Inverse transform: turn the flat vector back into matrices
        num_layers = len(layers)
        # Will hold one weight matrix per layer transition
        thetas = {}
        # Tracks how far into the flat vector we have consumed
        unrolled_shift = 0
        # Rebuild each parameter matrix
        for layer_index in range(num_layers - 1):
            # Input size
            in_count = layers[layer_index]
            # Output size
            out_count = layers[layer_index + 1]
            # Matrix dimensions
            thetas_width = in_count + 1
            thetas_height = out_count
            # Total number of elements in this matrix
            thetas_volume = thetas_width * thetas_height
            # Where in the flat vector this matrix starts
            start_index = unrolled_shift
            # Where it ends
            end_index = unrolled_shift + thetas_volume
            layer_theta_unrolled = unrolled_thetas[start_index:end_index]
            # Recovers the 25*785 and 10*26 matrices
            thetas[layer_index] = layer_theta_unrolled.reshape((thetas_height, thetas_width))
            # Advance the offset
            unrolled_shift = unrolled_shift + thetas_volume
        return thetas
    # Loss function
    @staticmethod
    def cost_function(data, labels, thetas, layers):
        # Number of layers
        num_layers = len(layers)
        # Number of samples
        num_examples = data.shape[0]
        # Number of classes
        num_labels = layers[-1]
        # Run one forward pass
        predictions = MultilayerPerceptron.feedforward_propagation(data, thetas, layers)
        # Build the labels: every sample's label must be one-hot
        bitwise_labels = np.zeros((num_examples, num_labels))
        for example_index in range(num_examples):
            # Set the position of the true class to 1
            bitwise_labels[example_index][labels[example_index][0]] = 1
        # Compute the cross-entropy loss
        bit_set_cost = np.sum(np.log(predictions[bitwise_labels == 1]))
        bit_not_set_cost = np.sum(np.log(1 - predictions[bitwise_labels == 0]))
        cost = (-1 / num_examples) * (bit_set_cost + bit_not_set_cost)
        return cost

    # Gradient computation for one optimization step
    @staticmethod
    def gradient_step(data, labels, optimized_theta, layers):
        # Restore the flat theta vector to matrix form
        theta = MultilayerPerceptron.thetas_roll(optimized_theta, layers)
        # Backpropagation
        thetas_rolled_gradients = MultilayerPerceptron.back_propagation(data, labels, theta, layers)
        # Flatten the gradient matrices to make the parameter update easy
        thetas_unrolled_gradients = MultilayerPerceptron.thetas_unroll(thetas_rolled_gradients)
        # Return the gradient vector
        return thetas_unrolled_gradients

    # Gradient descent module
    @staticmethod
    def gradient_descent(data, labels, unrolled_theta, layers, max_iterations, alpha):
        # Start from the initial theta
        optimized_theta = unrolled_theta
        # Record the loss of every iteration in cost_history
        cost_history = []
        # Iterate
        for _ in range(max_iterations):
            # 1. Compute the current loss; thetas_roll() restores the flat vector to matrices
            cost = MultilayerPerceptron.cost_function(data, labels,
                                                      MultilayerPerceptron.thetas_roll(optimized_theta, layers), layers)
            # Record the current loss
            cost_history.append(cost)
            # 2. Compute the current gradient from the loss
            theta_gradient = MultilayerPerceptron.gradient_step(data, labels, optimized_theta, layers)
            # 3. Update the parameters; the final result is the optimized theta
            optimized_theta = optimized_theta - alpha * theta_gradient
        return optimized_theta, cost_history
    # Forward propagation
    @staticmethod
    def feedforward_propagation(data, thetas, layers):
        # Number of layers
        num_layers = len(layers)
        # Number of samples
        num_examples = data.shape[0]
        # The input layer activation is the data itself
        in_layer_activation = data
        # Compute layer by layer
        for layer_index in range(num_layers - 1):
            theta = thetas[layer_index]
            # Hidden-layer output
            out_layer_activation = sigmoid(np.dot(in_layer_activation, theta.T))
            # The raw result is num_examples*25; prepend the bias column to get num_examples*26
            out_layer_activation = np.hstack((np.ones((num_examples, 1)), out_layer_activation))
            in_layer_activation = out_layer_activation
        # Return the output layer's result, dropping the bias column
        return in_layer_activation[:, 1:]

    # Backpropagation
    @staticmethod
    def back_propagation(data, labels, thetas, layers):
        # Number of layers
        num_layers = len(layers)
        # Number of samples and number of features
        (num_examples, num_features) = data.shape
        # Number of output classes
        num_label_types = layers[-1]
        # Accumulated gradients, layer by layer
        deltas = {}
        # Initialization: one zero matrix per layer transition
        for layer_index in range(num_layers - 1):
            in_count = layers[layer_index]
            out_count = layers[layer_index + 1]
            # Two matrices: 25*785 and 10*26
            deltas[layer_index] = np.zeros((out_count, in_count + 1))
        # Iterate over every training sample
        for example_index in range(num_examples):
            # Per-layer pre-activations and activations for this sample
            layers_inputs = {}
            layers_activations = {}
            # Layer-0 input as a column vector (785*1, bias already included)
            layers_activation = data[example_index, :].reshape((num_features, 1))
            layers_activations[0] = layers_activation
            # Forward pass, layer by layer
            for layer_index in range(num_layers - 1):
                layer_theta = thetas[layer_index]  # current weights: 25*785, then 10*26
                layer_input = np.dot(layer_theta, layers_activation)  # 25*1 first, then 10*1; each layer's output feeds the next
                layers_activation = np.vstack((np.array([[1]]), sigmoid(layer_input)))
                layers_inputs[layer_index + 1] = layer_input  # next layer's pre-activation
                layers_activations[layer_index + 1] = layers_activation  # next layer's activation
            output_layer_activation = layers_activation[1:, :]  # drop the bias entry
            delta = {}
            # One-hot encode this sample's label
            bitwise_label = np.zeros((num_label_types, 1))
            bitwise_label[labels[example_index][0]] = 1
            # Error between the output layer and the true label
            delta[num_layers - 1] = output_layer_activation - bitwise_label
            # Walk backwards through the hidden layers: L-1, L-2, ..., 1
            for layer_index in range(num_layers - 2, 0, -1):
                layer_theta = thetas[layer_index]
                next_delta = delta[layer_index + 1]
                layer_input = layers_inputs[layer_index]
                layer_input = np.vstack((np.array([[1]]), layer_input))
                # Apply the backpropagation formula
                delta[layer_index] = np.dot(layer_theta.T, next_delta) * sigmoid_gradient(layer_input)
                # Drop the bias component
                delta[layer_index] = delta[layer_index][1:, :]
            # Accumulate this sample's gradient contribution
            for layer_index in range(num_layers - 1):
                layer_delta = np.dot(delta[layer_index + 1], layers_activations[layer_index].T)
                deltas[layer_index] = deltas[layer_index] + layer_delta  # 25*785, then 10*26
        # Average over all samples
        for layer_index in range(num_layers - 1):
            deltas[layer_index] = deltas[layer_index] * (1 / num_examples)
        return deltas
    # Training module: set the maximum number of iterations and the learning rate
    def train(self, max_iterations=1000, alpha=0.1):
        """Each iteration first computes the loss, then the gradient from that
        loss, then updates the weights: one forward pass plus one backward pass."""
        # Flatten the matrices into one long vector to simplify the update
        unrolled_theta = MultilayerPerceptron.thetas_unroll(self.thetas)
        # Run gradient descent
        (optimized_theta, cost_history) = MultilayerPerceptron.gradient_descent(self.data, self.labels, unrolled_theta,
                                                                                self.layers, max_iterations, alpha)
        # After the update, restore the flat vector to matrices for forward propagation
        self.thetas = MultilayerPerceptron.thetas_roll(optimized_theta, self.layers)
        return self.thetas, cost_history
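One thing to note: the demo script below calls a predict method that does not appear in the listing above. A minimal sketch of what it could look like, consistent with the rest of the class (it mirrors the preprocessing done in __init__ and takes the arg-max over the forward-pass output):

    # A sketch of the predict method the demo relies on (not in the original listing)
    def predict(self, data):
        # Apply the same preprocessing as in __init__
        data_processed = prepare_for_training(data, normalize_data=self.normalize_data)
        num_examples = data_processed.shape[0]
        # Forward pass with the trained weights
        predictions = MultilayerPerceptron.feedforward_propagation(data_processed, self.thetas, self.layers)
        # Index of the highest-scoring class, shaped (num_examples, 1)
        return np.argmax(predictions, axis=1).reshape((num_examples, 1))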
Hands-on digit recognition
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import math
from multilayer_perceptron import MultilayerPerceptron
data = pd.read_csv('../data/mnist-demo.csv')
numbers_to_display = 25
num_cells = math.ceil(math.sqrt(numbers_to_display))
plt.figure(figsize=(10,10))
for plot_index in range(numbers_to_display):
digit = data[plot_index:plot_index+1].values
digit_label = digit[0][0]
digit_pixels = digit[0][1:]
image_size = int(math.sqrt(digit_pixels.shape[0]))
frame = digit_pixels.reshape((image_size,image_size))
plt.subplot(num_cells,num_cells,plot_index+1)
plt.imshow(frame,cmap='Greys')
plt.title(digit_label)
plt.subplots_adjust(wspace=0.5,hspace=0.5)
plt.show()
train_data = data.sample(frac = 0.8)
test_data = data.drop(train_data.index)
train_data = train_data.values
test_data = test_data.values
num_training_examples = 1700
x_train = train_data[:num_training_examples,1:]
y_train = train_data[:num_training_examples,[0]]
x_test = test_data[:,1:]
y_test = test_data[:,[0]]
layers=[784,25,10]
normalize_data = True
max_iterations = 300
alpha = 0.1
multilayer_perceptron = MultilayerPerceptron(x_train,y_train,layers,normalize_data)
(thetas,costs) = multilayer_perceptron.train(max_iterations,alpha)
plt.plot(range(len(costs)), costs)
plt.xlabel('Gradient steps')
plt.ylabel('Cost')
plt.show()
y_train_predictions = multilayer_perceptron.predict(x_train)
y_test_predictions = multilayer_perceptron.predict(x_test)
train_p = np.sum(y_train_predictions == y_train)/y_train.shape[0] * 100
test_p = np.sum(y_test_predictions == y_test)/y_test.shape[0] * 100
print('Training set accuracy:', train_p)
print('Test set accuracy:', test_p)
numbers_to_display = 64
num_cells = math.ceil(math.sqrt(numbers_to_display))
plt.figure(figsize=(15, 15))
for plot_index in range(numbers_to_display):
digit_label = y_test[plot_index, 0]
digit_pixels = x_test[plot_index, :]
predicted_label = y_test_predictions[plot_index][0]
image_size = int(math.sqrt(digit_pixels.shape[0]))
frame = digit_pixels.reshape((image_size, image_size))
color_map = 'Greens' if predicted_label == digit_label else 'Reds'
plt.subplot(num_cells, num_cells, plot_index + 1)
plt.imshow(frame, cmap=color_map)
plt.title(predicted_label)
plt.tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()