经过上一篇《transformer学习笔记-自注意力机制(1)》的原理学习,这一篇对其中的几个关键知识点进行代码演示:
1、整体qkv注意力计算
先来个最简单未经变换的QKV处理:
import torch
# Two tokens ("apple" and "pear" in the article) as 3-dimensional row vectors.
Q = torch.tensor([[3.0, 3.0, 0.0],
                  [0.5, 4.0, 0.0]])
K = Q.T  # keys: the same tokens, transposed so Q @ K yields all pairwise dot products
V = Q    # values: the raw token vectors
scores = Q @ K  # scores[i, j] = dot product of token i (query) with token j (key)
# Normalise over the key axis (last dim) so that every query row is a
# probability distribution; dim=0 would normalise over the queries instead
# and the rows used in `weights @ V` would no longer sum to 1.
weights = torch.softmax(scores, dim=-1)
print(f"概率分布:{weights}")
# Each output row is the weighted average of the value rows.
newQ = weights @ V
print(f"输出:{newQ}")
再来个输入经过Wq/Wk/Wv变换的:
import torch
# Two 3-dimensional token embeddings; row 0 is the "apple" token.
Q = torch.tensor([[3.0, 3.0, 0.0], [0.5, 4.0, 0.0]])
torch.manual_seed(123)  # fixed seed so the random projections are reproducible

# Column counts of W_query, W_key and W_value.
d_q, d_k, d_v = 4, 4, 5
# Their row count equals the embedding size of one input token.
d = Q.size(1)

# Randomly initialised projection matrices (drawn in query, key, value order).
W_query = torch.nn.Parameter(torch.rand(d, d_q))
W_key = torch.nn.Parameter(torch.rand(d, d_k))
W_value = torch.nn.Parameter(torch.rand(d, d_v))
print("W_query:", W_query)
print("W_key:", W_key)
print("W_value:", W_value)

# Part 1: attention of the first token ("apple") over the whole sentence.
keys = torch.matmul(Q, W_key)
values = torch.matmul(Q, W_value)
apple = Q[0]
query_apple = torch.matmul(apple, W_query)
print(f"query_apple:{query_apple}")
print(f"keys:{keys}")
print(f"values:{values}")

scores = torch.matmul(query_apple, keys.t())
print(f"scores:{scores}")
# `scores` is 1-D here, so its only axis is also its last axis.
weights = scores.softmax(dim=-1)
print(f"weights:{weights}")
newQ = torch.matmul(weights, values)
print(f"newQ:{newQ}")

# Part 2: the same computation for every token at once.
querys = torch.matmul(Q, W_query)
all_scores = torch.matmul(querys, keys.t())
print(f"all_scores:{all_scores}")
all_weights = all_scores.softmax(dim=-1)  # one distribution per query row
print(f"all_weights:{all_weights}")
output = torch.matmul(all_weights, values)
print(f"output:{output}")
最终生成的 output 的形状为 (token 数, d_v),即最后一维与 W_value 的列数 d_v 一致。
2、调换顺序结果不变
import torch
def simple_attention(Q):
    """Run un-projected self-attention on ``Q`` and print the results.

    ``Q`` serves as queries, keys (transposed) and values at the same time;
    no learned projections are applied.

    Args:
        Q: 2-D tensor with one token vector per row.

    Returns:
        A ``(weights, output)`` tuple: ``weights`` is the row-wise softmax of
        ``Q @ Q.T`` and ``output`` is ``weights @ Q``. Both are also printed,
        so callers that only want the printout can ignore the return value.
    """
    K = Q.T
    V = Q
    scores = Q @ K  # pairwise dot products between all tokens
    # Normalise over the key axis so each query row sums to 1.
    weights = torch.softmax(scores, dim=-1)
    print(f"概率分布:{weights}")
    newQ = weights @ V
    print(f"输出:{newQ}")
    return weights, newQ
# The same two token vectors in both orders ("apple pear" / "pear apple").
Q = torch.tensor([[3.0, 3.0, 0.0], [0.5, 4.0, 0.0]])
Q1 = torch.tensor([[0.5, 4.0, 0.0], [3.0, 3.0, 0.0]])
# Print a banner, then the attention results, for each ordering in turn.
for banner, sentence in (("模拟‘苹果梨’:", Q), ("模拟‘梨苹果’:", Q1)):
    print(banner)
    simple_attention(sentence)
可以看到“苹果梨”、“梨苹果”即便换了词token的顺序,并不会影响新的梨和新的苹果的向量数值。这里我们用了softmax函数求概率分布,因此跟上一篇文章的示例数值不一样,不要在意这个细节。
3、softmax:
import numpy as np
def softmax(x):
    """Return the softmax of ``x`` computed along axis 0.

    Args:
        x: array-like of scores. For a 2-D input every column is
            normalised independently (axis 0).

    Returns:
        ``numpy.ndarray`` of the same shape whose entries along axis 0 are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise: keep the "empty in, empty out" behaviour
        # instead of letting np.max fail on an empty reduction.
        return np.exp(x)
    # Softmax is unchanged by adding a constant to every score, so shifting
    # by the maximum keeps all exponents <= 0 and avoids overflow to inf/nan.
    shifted = x - np.max(x, axis=0)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0)
def softmax_with_temperature(x, T):
    """Return the softmax of ``x / T`` computed along axis 0.

    A larger ``T`` flattens the distribution, a smaller ``T`` sharpens it;
    ``T == 1`` is the ordinary softmax.

    Args:
        x: array-like of scores.
        T: temperature the scores are divided by. Must be non-zero.

    Returns:
        ``numpy.ndarray`` of the same shape as ``x`` whose entries along
        axis 0 sum to 1.
    """
    scaled = np.asarray(x) / T
    if scaled.size == 0:
        # Keep "empty in, empty out" rather than failing in np.max.
        return np.exp(scaled)
    # Small temperatures blow the scores up, so shift by the maximum before
    # exponentiating; the result is mathematically identical but cannot
    # overflow.
    e_x = np.exp(scaled - np.max(scaled, axis=0))
    return e_x / e_x.sum(axis=0)
# Example usage: plain softmax, then the same vector at three temperatures.
if __name__ == "__main__":
    input_vector = np.array([2.0, 1.0, 0.1])
    output = softmax(input_vector)
    print("Softmax Output:", output)
    for label, temperature in (
        ("Softmax with Temperature 0.5 Output:", 0.5),
        ("Softmax with Temperature 1 Output:", 1),
        ("Softmax with Temperature 5 Output:", 5),
    ):
        print(label, softmax_with_temperature(input_vector, temperature))
可以看到随着T的不断加大,概率分布不断趋于均匀分布。
4、softmax除以 $\sqrt{d_k}$
还是用上面的softmax函数,演示下除以 $\sqrt{d_k}$ 的效果:
# High-dimensional input: 100 Gaussian samples multiplied by 10 to imitate
# the large dot products that the article attributes to high dimensions.
input_vector_high_dim = np.random.randn(100) * 10
output_high_dim = softmax(input_vector_high_dim)
print("High Dimension Softmax Output:", output_high_dim)
# Extremes of the resulting probability distribution.
print("Max Probability in High Dimension:", np.max(output_high_dim))
print("Min Probability in High Dimension:", np.min(output_high_dim))

# The same vector divided by 10 (the stand-in for dividing by sqrt(d_k)).
input_vector_high_dim_div10 = input_vector_high_dim / 10
output_high_dim_div10 = softmax(input_vector_high_dim_div10)
print("High Dimension Softmax Output (Divided by 10):", output_high_dim_div10)
# Extremes of the rescaled probability distribution.
print("Max Probability in High Dimension (Divided by 10):", np.max(output_high_dim_div10))
print("Min Probability in High Dimension (Divided by 10):", np.min(output_high_dim_div10))

# Plot both distributions on one figure for comparison.
# `plt` was used here without being imported anywhere in the snippets,
# which raises NameError; import it locally so this section runs.
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(output_high_dim, label='High Dim')
plt.plot(output_high_dim_div10, label='High Dim Divided by 10')
plt.legend()
plt.title('High Dimension Softmax Output Comparison')
plt.xlabel('Index')
plt.ylabel('Probability')
plt.show()
在除以 $\sqrt{d_k}$ 之前,由于内积变大,softmax 输出的概率分布变得尖锐(接近 one-hot)。softmax 输出 $p_i$ 对输入 $z_j$ 的导数为 $p_i(\delta_{ij}-p_j)$,当 $p_i$ 接近 0(或 1)时该导数接近零,梯度基本消失,在反向传播过程中无法有效地更新权重。有兴趣的话可以试试对 softmax 加损失函数求导。
继续上面的代码,来看下softmax的输出的损失函数求梯度:
def test_grad( dim_vertor):
import numpy as np
import torch
import torch.nn.functional as F
# 假设的输入
z = torch.tensor(dim_vertor, requires_grad=True)
print(z)
# 计算 softmax 输出
p = F.softmax(z, dim=0)
true_label = np.zeros(100)
true_label[3] = 1
# 模拟损失函数(例如交叉熵)
y = torch.tensor(true_label) # one-hot 编码的真实标签
loss = -torch.sum(y * torch.log(p))
# 反向传播并获取梯度
loss.backward()
# print(z.grad) # 输出梯度
return z.grad
# Gradient for the rescaled input first, then for the raw high-dimension
# input (the generator is consumed left to right, preserving call order).
grad_div10, grad = (
    test_grad(vector)
    for vector in (input_vector_high_dim_div10, input_vector_high_dim)
)
print(f"grad_div10:{grad_div10}")
print(f"grad:{grad}")
明显看出,没有除以 $\sqrt{d_k}$ 求出的梯度,基本为0;上面的代码是torch已经实现的。当然也可以根据损失函数自己求导,这里我们只为演示效果,点到即止:
5、多头注意力:
import torch
import torch.nn as nn
torch.manual_seed(123)  # fixed seed so the random weights are reproducible

# Input matrix: two tokens, one 3-dimensional embedding per row.
Q = torch.tensor([[3.0, 3.0, 0.0],
                  [0.5, 4.0, 0.0]])

# Per-head column counts of the query / key / value projections.
d_q, d_k, d_v = 4, 4, 5
d_model = Q.shape[1]  # embedding size of one input token
num_heads = 2

# One projection matrix per head; all query matrices are drawn first, then
# the key matrices, then the value matrices.
W_query = nn.ParameterList([nn.Parameter(torch.rand(d_model, d_q)) for _ in range(num_heads)])
W_key = nn.ParameterList([nn.Parameter(torch.rand(d_model, d_k)) for _ in range(num_heads)])
W_value = nn.ParameterList([nn.Parameter(torch.rand(d_model, d_v)) for _ in range(num_heads)])

# Output projection: maps the concatenated heads back to d_model columns.
W_output = nn.Parameter(torch.rand(num_heads * d_v, d_model))

# Show every head's weights, numbering heads from 1.
for head_no, (w_q, w_k, w_v) in enumerate(zip(W_query, W_key, W_value), start=1):
    print(f"W_query_{head_no}:\n{w_q}")
    print(f"W_key_{head_no}:\n{w_k}")
    print(f"W_value_{head_no}:\n{w_v}")

# Project the input once per head.
queries = [Q @ w_q for w_q in W_query]
keys = [Q @ w_k for w_k in W_key]
values = [Q @ w_v for w_v in W_value]

# Scaled dot-product attention, computed independently for each head.
outputs = []
for head_q, head_k, head_v in zip(queries, keys, values):
    scores = head_q @ head_k.T / (d_k ** 0.5)
    weights = torch.softmax(scores, dim=-1)  # one distribution per query row
    output = weights @ head_v
    outputs.append(output)

# Concatenate the heads along the feature axis.
concat_output = torch.cat(outputs, dim=-1)
print(f"concat_output:\n{concat_output}")

# Final linear transformation back to the model dimension.
final_output = concat_output @ W_output
print(f"Final Output:\n{final_output}")
6、掩码注意力:
import torch
# Input matrix: four tokens, one 3-dimensional embedding per row.
Q = torch.tensor([
    [3.0, 3.0, 0.0],
    [0.5, 4.0, 0.0],
    [1.0, 2.0, 0.0],
    [2.0, 1.0, 0.0],
])
torch.manual_seed(123)  # fixed seed so the random projections are reproducible

# Column counts of the query / key / value projections.
d_q, d_k, d_v = 4, 4, 5
# Their row count equals the embedding size of one input token.
d = Q.size(1)

# Randomly initialised projection matrices (query, key, value draw order).
W_query = torch.nn.Parameter(torch.rand(d, d_q))
W_key = torch.nn.Parameter(torch.rand(d, d_k))
W_value = torch.nn.Parameter(torch.rand(d, d_v))
print("W_query:", W_query)
print("W_key:", W_key)
print("W_value:", W_value)

# Project the tokens into query, key and value space.
querys = torch.matmul(Q, W_query)
keys = torch.matmul(Q, W_key)
values = torch.matmul(Q, W_value)
print(f"querys:\n{querys}")
print(f"keys:\n{keys}")
print(f"values:\n{values}")

# Scaled attention scores for every (query, key) pair.
scale = d_k ** 0.5
all_scores = torch.matmul(querys, keys.t()) / scale
print(f"all_scores:\n{all_scores}")

# Mask: entry (i, j) is True when key j comes after query i, i.e. the part
# strictly above the main diagonal.
seq_len = Q.size(0)
positions = torch.arange(seq_len)
mask = positions.unsqueeze(0) > positions.unsqueeze(1)
# Masked positions get -inf so that softmax assigns them zero weight.
masked_scores = all_scores.masked_fill(mask, float('-inf'))
print(f"Mask:\n{mask}")
print(f"Masked Scores:\n{masked_scores}")

# Attention weights: one distribution per query row.
all_weights = masked_scores.softmax(dim=-1)
print(f"all_weights:\n{all_weights}")

# Weighted sum of the value rows.
output = torch.matmul(all_weights, values)
print(f"output:\n{output}")
主要看下生成的掩码矩阵,和通过掩码矩阵处理的权重分布: