前言
BN(Batch Normalization)首次提出于论文《Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift》,主要目的是为了解决训练深层神经网络慢的问题。我们可以把神经网络整体看成一个高阶的复杂函数,通过训练优化它的参数,可以用于拟合各种复杂的数据分布。一般而言,一个网络会有多层,其中的每一层都可以看成一个子函数,用于拟合其各自的子分布。由于下一层的输入来自上一层的输出,而随着训练过程中上一层网络参数的改动,它的输出分布也将发生变化,这会导致下一层学习得更慢,同时也使得模型的训练更加困难,这个现象在原文中被称为“internal covariate shift”现象。针对这一问题,作者提出将上一层的输出进行标准化以调整下一层输入的分布,从而减弱“internal covariate shift”的影响。BN一般用在卷积层之后,激活层之前。更多的细节可以参考该论文原文。
BN涉及到跨样本间的计算,因而难免会受到batch size大小的影响;另一方面,在序列样本中(如文本、语音等),一般会进行padding对齐每条样本的长度,以便批量输入到模型中,此时再跨样本计算统计值的话,会引入噪声;为了解决以上两个问题,LN(Layer Normalization)应运而生,它单独对每条样本的特征进行正则化(本文中的“正则化”均指 Normalization,更准确的说法是“归一化/标准化”),详情可参考原文Layer Normalization。
类似的还有Instance Normalization、Group Normalization、Switchable Normalization,这里就不多赘述了。
以下部分将会省略很多细节,直接进入代码实现部分。
批量正则化-BN
BN是同时对所有样本的对应通道进行正则化,有多少个通道,就会有多少组均值和方差,单独对每个通道的值进行缩放,示意图如下:
假设数据格式为 $V_{[B,C,H,W]}$,分别表示 batch_size、通道数、高度、宽度,那么针对通道维度的任一个坐标 $c\in[0,C)$,都会求出一组均值和方差:
$$ u_c = \frac{1}{B \cdot H \cdot W} \sum_{b\in[0,B)} \sum_{h\in[0,H)} \sum_{w\in[0,W)} V_{[b,c,h,w]} $$
$$ \theta^2_c = \frac{1}{B \cdot H \cdot W} \sum_{b\in[0,B)} \sum_{h\in[0,H)} \sum_{w\in[0,W)} \left(V_{[b,c,h,w]} - u_c\right)^2 $$
接下来对该通道坐标的每一个值,进行缩放操作
$$ \hat{V}_{[:,c,:,:]} \leftarrow \gamma_c \cdot \frac{V_{[:,c,:,:]} - u_c}{\sqrt{\theta^2_c + \epsilon}} + \beta_c $$
其中 $\epsilon$ 为一个非常小的常数,防止分母为0,$\gamma_c,\beta_c$ 为该通道对应的参数。
import torch
from torch.nn import BatchNorm1d, BatchNorm2d, BatchNorm3d
import numpy as np
# Seed NumPy's global RNG so the np.random.randn inputs drawn below are reproducible.
np.random.seed(0)
class myBatchNorm(torch.nn.Module):
    """Minimal batch normalization that always uses the current batch statistics.

    The input is normalized with the mean and the biased variance computed
    over ``dim``, then scaled by ``gamma`` and shifted by ``beta``::

        out = gamma * (x - mean) / sqrt(var + eps) + beta

    No running statistics are tracked, so there is no separate inference
    behavior.

    Args:
        size: Shape of the ``gamma``/``beta`` parameters. It must broadcast
            against the input, e.g. ``(1, C, 1, 1)`` for a ``(B, C, H, W)``
            input.
        dim: Dimension (or tuple of dimensions) to reduce over when computing
            the statistics, e.g. ``(0, 2, 3)`` for a ``(B, C, H, W)`` input.
        eps: Small constant added to the variance to avoid division by zero.
    """

    def __init__(self,
                 size,
                 dim,
                 eps: float = 1e-6) -> None:
        super().__init__()
        # Scale starts at one and shift at zero, i.e. an identity affine map.
        self.gamma = torch.nn.Parameter(torch.ones(size))
        self.beta = torch.nn.Parameter(torch.zeros(size))
        self.eps = eps
        self.dim = dim

    def forward(self, tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=arguments-differ
        """Normalize ``tensor`` over ``self.dim`` and apply the affine transform."""
        mean = tensor.mean(self.dim, keepdim=True)
        # Biased (population) variance, as in the formula above.
        var = tensor.var(self.dim, unbiased=False, keepdim=True)
        # eps belongs inside the square root: sqrt(var + eps), not std + eps.
        return self.gamma * (tensor - mean) / (var + self.eps).sqrt() + self.beta
# Compare torch's BatchNorm{1,2,3}d (affine disabled) with myBatchNorm on
# random inputs. For each case the torch result is printed first, then ours.

# 1-D case: input (B, C, L); statistics are reduced over dims (0, 2).
print("-----一维BN------")
val_tensor = torch.tensor(np.random.randn(1, 3, 4), dtype=torch.float32)
BN1 = BatchNorm1d(num_features=3, affine=False)(val_tensor)
myBN1 = myBatchNorm(size=(1, 3, 1), dim=(0, 2))(val_tensor)
print(BN1)
print(myBN1)

# 2-D case: input (B, C, H, W); statistics are reduced over dims (0, 2, 3).
print("-----二维BN------")
val_tensor = torch.tensor(np.random.randn(1, 3, 4, 5), dtype=torch.float32)
BN2 = BatchNorm2d(num_features=3, affine=False)(val_tensor)
myBN2 = myBatchNorm(size=(1, 3, 1, 1), dim=(0, 2, 3))(val_tensor)
print(BN2)
print(myBN2)

# 3-D case: input (B, C, D, H, W); statistics are reduced over dims (0, 2, 3, 4).
print("-----三维BN------")
val_tensor = torch.tensor(np.random.randn(1, 2, 3, 4, 5), dtype=torch.float32)
BN3 = BatchNorm3d(num_features=2, affine=False)(val_tensor)
myBN3 = myBatchNorm(size=(1, 2, 1, 1, 1), dim=(0, 2, 3, 4))(val_tensor)
print(BN3)
print(myBN3)
输出如下:
-----一维BN------
tensor([[[ 0.5905, -1.3359, -0.5187, 1.2640],
[ 1.3397, -1.2973, 0.4893, -0.5317],
[-0.9773, -0.1110, -0.5604, 1.6487]]])
tensor([[[ 0.5905, -1.3359, -0.5187, 1.2640],
[ 1.3397, -1.2973, 0.4893, -0.5317],
[-0.9773, -0.1110, -0.5604, 1.6488]]], grad_fn=<AddBackward0>)
-----二维BN------
tensor([[[[ 0.4834, -0.1121, 0.1880, 0.0854, 1.1662],
[-0.4165, 0.0662, -1.0209, -2.6032, 0.3834],
[ 0.5797, -0.9166, 1.8886, -1.5800, -0.1828],
[-0.3997, 1.2022, 1.1431, -0.0811, 0.1268]],
[[-0.5048, -1.5601, 0.0165, 0.5033, 1.5403],
[ 1.5133, -0.0216, 0.0605, -0.6600, -1.0187],
[-1.2951, 2.2359, -0.1397, -0.0706, -0.8572],
[ 1.1031, -1.2059, 0.1470, -0.5122, 0.7259]],
[[-0.2520, -1.2639, 0.4771, 1.1667, 0.6201],
[ 0.9766, -0.4386, -0.0283, -0.4962, -0.0235],
[-0.7087, -2.0882, 0.7877, -0.0873, -1.9430],
[ 1.2188, -0.8510, 0.5981, 1.6211, 0.7145]]]])
tensor([[[[ 0.4834, -0.1121, 0.1880, 0.0854, 1.1662],
[-0.4165, 0.0662, -1.0209, -2.6032, 0.3834],
[ 0.5797, -0.9166, 1.8886, -1.5800, -0.1828],
[-0.3997, 1.2022, 1.1431, -0.0811, 0.1268]],
[[-0.5048, -1.5601, 0.0165, 0.5033, 1.5403],
[ 1.5133, -0.0216, 0.0605, -0.6600, -1.0187],
[-1.2951, 2.2359, -0.1397, -0.0706, -0.8572],
[ 1.1031, -1.2059, 0.1470, -0.5122, 0.7260]],
[[-0.2520, -1.2639, 0.4771, 1.1667, 0.6201],
[ 0.9766, -0.4386, -0.0283, -0.4962, -0.0235],
[-0.7087, -2.0882, 0.7877, -0.0873, -1.9430],
[ 1.2188, -0.8510, 0.5981, 1.6211, 0.7145]]]],
grad_fn=<AddBackward0>)
-----三维BN------
tensor([[[[[ 0.8306, -1.5469, 0.0926, -0.9961, -1.1823],
[-0.8900, -0.6223, -0.2541, -1.4771, 0.5917],
[ 0.1560, -1.8487, 1.1800, 1.5882, 0.8701],
[-0.4905, -1.3826, 0.7456, -0.7141, 0.9138]],
[[-0.1018, 0.6676, 0.0465, 0.3972, -0.2998],
[ 1.4780, -0.1832, 0.0922, 1.5754, -1.6599],
[-1.5826, 0.6604, -1.4851, 1.6360, -0.7245],
[-1.0588, 1.6152, 1.1722, 1.5598, 0.5970]],
[[-1.1727, 1.6023, -0.5787, 0.4932, 0.6382],
[-0.4656, 0.3046, 0.6131, 0.0666, -1.4112],
[-0.0117, 1.0179, -1.0059, -0.4602, -0.7461],
[ 1.5415, 0.3629, 0.0977, -1.0813, 0.2297]]],
[[[-0.5496, 0.1743, -0.5101, 0.8350, 0.7327],
[-0.0719, 0.5476, -0.9788, -1.3869, 0.5920],
[ 0.3125, 0.7926, 2.5845, 1.1098, -0.7940],
[ 1.2866, -1.2072, -0.3315, 0.0717, 1.8979]],
[[-0.6218, -0.7055, 0.0407, -0.5384, 1.2965],
[-0.9653, -1.0345, -0.3071, -0.3689, 2.1195],
[ 1.1148, 0.2314, -1.1145, 1.0072, -0.8836],
[-1.4418, 1.3594, 0.4665, 1.0856, 0.4684]],
[[ 1.0199, -0.5257, -0.9185, 0.8403, -0.6819],
[-0.5652, -0.3253, 0.1596, -0.2212, -1.2677],
[-0.5181, -2.1374, 0.7825, -1.5005, -0.9904],
[ 0.1951, -0.6164, 1.7233, -1.1836, 0.4154]]]]])
tensor([[[[[ 0.8306, -1.5469, 0.0926, -0.9961, -1.1823],
[-0.8900, -0.6223, -0.2541, -1.4771, 0.5917],
[ 0.1560, -1.8487, 1.1800, 1.5882, 0.8701],
[-0.4905, -1.3826, 0.7456, -0.7141, 0.9138]],
[[-0.1018, 0.6676, 0.0465, 0.3972, -0.2998],
[ 1.4780, -0.1832, 0.0922, 1.5754, -1.6599],
[-1.5826, 0.6604, -1.4851, 1.6360, -0.7245],
[-1.0588, 1.6153, 1.1722, 1.5598, 0.5970]],
[[-1.1727, 1.6024, -0.5787, 0.4932, 0.6382],
[-0.4656, 0.3046, 0.6131, 0.0666, -1.4112],
[-0.0117, 1.0179, -1.0059, -0.4602, -0.7461],
[ 1.5415, 0.3629, 0.0977, -1.0813, 0.2297]]],
[[[-0.5496, 0.1743, -0.5101, 0.8350, 0.7327],
[-0.0719, 0.5476, -0.9788, -1.3870, 0.5920],
[ 0.3125, 0.7926, 2.5845, 1.1098, -0.7940],
[ 1.2866, -1.2072, -0.3315, 0.0717, 1.8979]],
[[-0.6218, -0.7055, 0.0407, -0.5384, 1.2965],
[-0.9653, -1.0346, -0.3071, -0.3689, 2.1195],
[ 1.1148, 0.2314, -1.1145, 1.0072, -0.8836],
[-1.4418, 1.3594, 0.4665, 1.0856, 0.4684]],
[[ 1.0199, -0.5257, -0.9185, 0.8403, -0.6819],
[-0.5652, -0.3253, 0.1596, -0.2212, -1.2677],
[-0.5181, -2.1374, 0.7825, -1.5005, -0.9904],
[ 0.1951, -0.6164, 1.7233, -1.1836, 0.4154]]]]],
grad_fn=<AddBackward0>)
层正则化-LN
LN是逐个样本进行正则化,假设数据格式为 $V_{[B,T,C]}$,分别表示 batch_size、序列长度、特征维度,那么一般会针对每个样本的每个序列点的特征进行标准化,当然也可以对每个样本整体进行标准化,示意图如下:
下面以对每个样本的每个序列点进行标准化为例进行公式的演示。
$$ u_{b,t} = \frac{1}{C} \sum_{c\in[0,C)} V_{[b,t,c]} $$
$$ \theta^2_{b,t} = \frac{1}{C} \sum_{c\in[0,C)} \left(V_{[b,t,c]} - u_{b,t}\right)^2 $$
接下来,进行对应的缩放操作
$$ \hat{V}_{[b,t,:]} \leftarrow \gamma_t \cdot \frac{V_{[b,t,:]} - u_{b,t}}{\sqrt{\theta^2_{b,t} + \epsilon}} + \beta_t $$
其中 $\epsilon$ 为一个非常小的常数,防止分母为0,注意 $\gamma_t,\beta_t$ 为所有样本共享。
import torch
from torch.nn import LayerNorm as LN
import numpy as np
# Re-seed NumPy's global RNG so the demo input is reproducible.
np.random.seed(0)
# Random input of shape (2, 3, 4); NumPy yields float64, the tensor copy is float32.
val = np.random.randn(2, 3, 4)
val_tensor = torch.tensor(val, dtype=torch.float32)
class myLayerNorm(torch.nn.Module):
    """Minimal layer normalization over the given dimension(s) of each sample.

    The input is normalized with the mean and the biased variance computed
    over ``dim``, then scaled by ``gamma`` and shifted by ``beta``::

        out = gamma * (x - mean) / sqrt(var + eps) + beta

    Args:
        size: Shape of the ``gamma``/``beta`` parameters. It must broadcast
            against the input, e.g. ``4`` for per-position normalization of a
            ``(B, T, 4)`` input, or ``(3, 4)`` to normalize each whole
            ``(3, 4)`` sample.
        dim: Dimension (or tuple of dimensions) to reduce over when computing
            the statistics, e.g. ``2`` or ``(1, 2)`` for a ``(B, T, C)`` input.
        eps: Small constant added to the variance to avoid division by zero.
    """

    def __init__(self,
                 size,
                 dim,
                 eps: float = 1e-6) -> None:
        super().__init__()
        # Scale starts at one and shift at zero, i.e. an identity affine map.
        self.gamma = torch.nn.Parameter(torch.ones(size))
        self.beta = torch.nn.Parameter(torch.zeros(size))
        self.eps = eps
        self.dim = dim

    def forward(self, tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=arguments-differ
        """Normalize ``tensor`` over ``self.dim`` and apply the affine transform."""
        mean = tensor.mean(self.dim, keepdim=True)
        # Biased (population) variance, as in the formula above.
        var = tensor.var(self.dim, unbiased=False, keepdim=True)
        # eps belongs inside the square root: sqrt(var + eps), not std + eps.
        return self.gamma * (tensor - mean) / (var + self.eps).sqrt() + self.beta
# Compare torch.nn.LayerNorm with myLayerNorm on the same (2, 3, 4) input.

# Case 1: normalize each whole (3, 4) sample, i.e. reduce over dims (1, 2).
print("对整个样本进行正则化")
LN2 = LN([3, 4])(val_tensor)
myLN2 = myLayerNorm((3, 4), (1, 2))(val_tensor)
print("torchNL")
# Fixed: the original printed the undefined name `LV2`, which raised NameError.
print(LN2)
print("myML")
print(myLN2)

# Case 2: normalize the feature vector at every sequence position (dim 2).
print("对每个样本的每个序列点进行正则化")
LN1 = LN(4)(val_tensor)
print(LN1)
# NOTE: `myLN2` is deliberately rebound here to keep the original variable name.
myLN2 = myLayerNorm(4, 2)(val_tensor)
print(myLN2)
输出如下:
对整个样本进行正则化
torchNL
tensor([[[ 1.1009, -0.3772, 0.2498, 1.6177],
[ 1.2131, -1.8700, 0.2188, -0.9749],
[-0.9227, -0.3659, -0.6548, 0.7652]],
[[ 0.7022, 0.0685, 0.3878, 0.2786],
[ 1.4288, -0.2555, 0.2582, -0.8987],
[-2.5826, 0.5957, 0.8047, -0.7878]]],
grad_fn=<NativeLayerNormBackward0>)
myML
tensor([[[ 1.1009, -0.3772, 0.2498, 1.6177],
[ 1.2131, -1.8700, 0.2188, -0.9749],
[-0.9227, -0.3659, -0.6548, 0.7652]],
[[ 0.7022, 0.0685, 0.3878, 0.2786],
[ 1.4288, -0.2555, 0.2582, -0.8987],
[-2.5826, 0.5957, 0.8047, -0.7878]]], grad_fn=<AddBackward0>)
对每个样本的每个序列点进行正则化
tensor([[[ 0.5905, -1.3359, -0.5187, 1.2640],
[ 1.3397, -1.2973, 0.4893, -0.5317],
[-0.9773, -0.1110, -0.5604, 1.6487]],
[[ 1.4983, -1.2706, 0.1247, -0.3525],
[ 1.5190, -0.4557, 0.1465, -1.2098],
[-1.5448, 0.8043, 0.9587, -0.2182]]],
grad_fn=<NativeLayerNormBackward0>)
tensor([[[ 0.5905, -1.3359, -0.5187, 1.2640],
[ 1.3397, -1.2973, 0.4893, -0.5317],
[-0.9773, -0.1110, -0.5604, 1.6488]],
[[ 1.4985, -1.2707, 0.1247, -0.3525],
[ 1.5190, -0.4557, 0.1465, -1.2098],
[-1.5448, 0.8043, 0.9587, -0.2182]]], grad_fn=<AddBackward0>)