transformer学习笔记-自注意力机制（2）

在上一篇《transformer学习笔记-自注意力机制（1）》中学习了原理之后，这一篇用代码演示其中的几个关键知识点：

1、整体qkv注意力计算

先来个最简单未经变换的QKV处理：

import torch  
# Two toy token embeddings, one row per token (2 tokens, 3 features each).
Q = torch.tensor([[3.0, 3.0, 0.0],
                  [0.5, 4.0, 0.0]])
# Simplest case, no learned projections: the keys are the transposed inputs
# and the values are the inputs themselves.
K = Q.T
V = Q

scores = Q @ K  # pairwise dot products; row i holds the scores of query i
# Normalize over the key axis (last dim) so every query's weights sum to 1.
# Normalizing over dim=0 would make the columns sum to 1 instead, and the
# rows used by `weights @ V` would no longer be probability distributions.
weights = torch.softmax(scores, dim=-1)
print(f"概率分布：{weights}")
newQ = weights @ V  # each output row is a weighted average of the value rows
print(f"输出：{newQ}")

再来个输入经过Wq/Wk/Wv变换的：

import torch  
# Toy input: one row per token (row 0 is the "apple" token), 3 features each.
Q = torch.tensor([[3.0, 3.0, 0.0],
                  [0.5, 4.0, 0.0]])
torch.manual_seed(123)
d_q, d_k, d_v = 4, 4, 5  # column counts of W_query, W_key, W_value
d = Q.shape[1]  # the projections need one row per input feature
# Random projection matrices, drawn in query -> key -> value order so the
# seeded values are reproducible.
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.rand(d, width)) for width in (d_q, d_k, d_v)
)

for label, matrix in (
    ("W_query:", W_query),
    ("W_key:", W_key),
    ("W_value:", W_value),
):
    print(label, matrix)

# Keys and values for every token; both parts below share them.
keys = torch.matmul(Q, W_key)
values = torch.matmul(Q, W_value)

# Part 1: attention of the "apple" token alone over the whole sentence.
apple = Q[0]
query_apple = torch.matmul(apple, W_query)
print(f"query_apple:{query_apple}")
print(f"keys:{keys}")
print(f"values:{values}")
scores = torch.matmul(query_apple, keys.T)
print(f"scores:{scores}")
# `scores` is 1-D here (one score per key), so dim=0 is the key axis.
weights = scores.softmax(dim=0)
print(f"weights:{weights}")
newQ = torch.matmul(weights, values)
print(f"newQ:{newQ}")

# Part 2: the same computation for all tokens at once.
querys = torch.matmul(Q, W_query)
all_scores = torch.matmul(querys, keys.T)
print(f"all_scores:{all_scores}")
all_weights = all_scores.softmax(dim=-1)  # normalize each query row over keys
print(f"all_weights:{all_weights}")
output = torch.matmul(all_weights, values)
print(f"output:{output}")

最终生成的 output 形状为（token 数, d_v），即每个 token 的输出向量维度与 W_value 的列数 d_v 一致。

2、调换顺序结果不变

import torch

def simple_attention(Q):
    """Print untransformed self-attention weights and outputs for ``Q``.

    ``Q`` is used as both queries and values, and its transpose as the keys;
    no learned projections are applied. The softmax weights and the weighted
    sum are printed; nothing is returned.
    """
    raw_scores = torch.matmul(Q, Q.T)  # pairwise dot products
    attn = raw_scores.softmax(dim=-1)  # normalize each query row over keys
    print(f"概率分布：{attn}")
    mixed = torch.matmul(attn, Q)
    print(f"输出：{mixed}")
    
# The same two token vectors fed in both orders ("apple pear" / "pear apple").
apple_vec = [3.0, 3.0, 0.0]
pear_vec = [0.5, 4.0, 0.0]
Q = torch.tensor([apple_vec, pear_vec])
Q1 = torch.tensor([pear_vec, apple_vec])
for title, tokens in (("模拟‘苹果梨’：", Q), ("模拟‘梨苹果’：", Q1)):
    print(title)
    simple_attention(tokens)

可以看到“苹果梨”、“梨苹果”即便换了词token的顺序，并不会影响新的梨和新的苹果的向量数值。这里我们用了softmax函数求概率分布，因此跟上一篇文章的示例数值不一样，不要在意这个细节。

3、softmax：

import numpy as np

def softmax(x):
    """Return the softmax of ``x`` taken along axis 0.

    The maximum along axis 0 is subtracted before exponentiating. This does
    not change the mathematical result, but it keeps ``np.exp`` from
    overflowing to ``inf`` (which would turn the output into ``nan``) when
    the inputs are large.

    Args:
        x: Array-like of scores; a 1-D vector, or an N-D array that is
            normalized over its first axis.

    Returns:
        ``numpy.ndarray`` shaped like ``x`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalize; keep the empty shape instead of failing in max().
        return np.exp(x)
    e_x = np.exp(x - x.max(axis=0, keepdims=True))
    return e_x / e_x.sum(axis=0)

def softmax_with_temperature(x, T):
    """Return the softmax of ``x / T`` taken along axis 0.

    Larger ``T`` flattens the distribution, smaller ``T`` sharpens it, and
    ``T == 1`` is the plain softmax. The maximum of the scaled scores is
    subtracted before exponentiating so large values of ``x / T`` do not
    overflow ``np.exp`` and produce ``nan``.

    Args:
        x: Array-like of scores, normalized over its first axis.
        T: Temperature; must be non-zero.

    Returns:
        ``numpy.ndarray`` shaped like ``x`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    scaled = np.asarray(x) / T
    if scaled.size == 0:
        # Nothing to normalize; keep the empty shape instead of failing in max().
        return np.exp(scaled)
    e_x = np.exp(scaled - scaled.max(axis=0, keepdims=True))
    return e_x / e_x.sum(axis=0)


# 示例使用
if __name__ == "__main__":
    input_vector = np.array([2.0, 1.0, 0.1])
    print("Softmax Output:", softmax(input_vector))

    # Same vector at three temperatures, printed in increasing order of T.
    for temperature in (0.5, 1, 5):
        print(
            f"Softmax with Temperature {temperature} Output:",
            softmax_with_temperature(input_vector, temperature),
        )

可以看到随着T的不断加大，概率分布不断趋于均匀分布。

4、softmax除以 $\sqrt{d_k}$

还是用上面的softmax函数，演示下除以 $\sqrt{d_k}$ 的效果：

        # High-dimensional input: 100 Gaussian samples multiplied by 10 to
        # mimic the large dot products seen before dividing by sqrt(d_k).
        input_vector_high_dim = np.random.randn(100) * 10
        output_high_dim = softmax(input_vector_high_dim)
        print("High Dimension Softmax Output:", output_high_dim)
        # Spread of the resulting probability distribution.
        print("Max Probability in High Dimension:", np.max(output_high_dim))
        print("Min Probability in High Dimension:", np.min(output_high_dim))

        # Same input divided by 10, i.e. sqrt(100) for the 100-dim vector.
        input_vector_high_dim_div10 = input_vector_high_dim / 10
        output_high_dim_div10 = softmax(input_vector_high_dim_div10)
        print("High Dimension Softmax Output (Divided by 10):", output_high_dim_div10)
        # Spread of the resulting probability distribution.
        print("Max Probability in High Dimension (Divided by 10):", np.max(output_high_dim_div10))
        print("Min Probability in High Dimension (Divided by 10):", np.min(output_high_dim_div10))

        # Imported here because the plotting code below is the only user of
        # `plt`; without this import the plt.* calls raise NameError.
        import matplotlib.pyplot as plt

        # Plot both probability distributions on one figure.
        plt.figure(figsize=(10, 6))

        plt.plot(output_high_dim, label='High Dim')
        plt.plot(output_high_dim_div10, label='High Dim Divided by 10')
        plt.legend()
        plt.title('High Dimension Softmax Output Comparison')
        plt.xlabel('Index')
        plt.ylabel('Probability')
        plt.show()

在除以 $\sqrt{d_k}$ 之前，由于内积变大，softmax 输出的概率分布变得尖锐：绝大多数位置的概率趋近 0，这些位置上损失函数对输入的导数也接近零，梯度基本消失，在反向传播过程中无法有效地更新权重。有兴趣的话可以试试对 softmax 加交叉熵损失函数求导。

继续上面的代码，来看下softmax的输出的损失函数求梯度：

        def test_grad( dim_vertor):
            import numpy as np
            import torch
            import torch.nn.functional as F

            # 假设的输入
            z = torch.tensor(dim_vertor, requires_grad=True)
            print(z)
            # 计算 softmax 输出
            p = F.softmax(z, dim=0)
            true_label = np.zeros(100)
            true_label[3] = 1
            # 模拟损失函数（例如交叉熵）
            y = torch.tensor(true_label)  # one-hot 编码的真实标签
            loss = -torch.sum(y * torch.log(p))

            # 反向传播并获取梯度
            loss.backward()
            # print(z.grad)  # 输出梯度
            return z.grad
        # Gradient for the scaled input first, then for the raw input.
        grad_div10, grad = [
            test_grad(vec)
            for vec in (input_vector_high_dim_div10, input_vector_high_dim)
        ]
        print(f"grad_div10:{grad_div10}")
        print(f"grad:{grad}")

可以明显看出，没有除以 $\sqrt{d_k}$ 时求出的梯度基本为0。上面的梯度是借助 torch 已经实现的自动求导计算得到的；当然也可以根据损失函数自己求导，这里我们只为演示效果，点到即止。

5、多头注意力：

import torch
import torch.nn as nn

torch.manual_seed(123)

# Input matrix: one row per token.
Q = torch.tensor([[3.0, 3.0, 0.0],
                  [0.5, 4.0, 0.0]])

# Dimension settings
d_q, d_k, d_v = 4, 4, 5  # per-head widths of query, key, value
d_model = Q.shape[1]     # feature size of each input token
num_heads = 2            # number of attention heads


def _random_heads(width):
    """Draw one random (d_model, width) projection matrix per head."""
    return nn.ParameterList(
        [nn.Parameter(torch.rand(d_model, width)) for _ in range(num_heads)]
    )


# Per-head projections, drawn in query -> key -> value order so the seeded
# values are reproducible.
W_query = _random_heads(d_q)
W_key = _random_heads(d_k)
W_value = _random_heads(d_v)

# Output projection that maps the concatenated heads back to d_model.
W_output = nn.Parameter(torch.rand(num_heads * d_v, d_model))

# Show every head's projection matrices (heads are numbered from 1).
for head_no, (w_q, w_k, w_v) in enumerate(zip(W_query, W_key, W_value), start=1):
    print(f"W_query_{head_no}:\n{w_q}")
    print(f"W_key_{head_no}:\n{w_k}")
    print(f"W_value_{head_no}:\n{w_v}")

# Project the input once per head.
queries = [Q @ w_q for w_q in W_query]
keys = [Q @ w_k for w_k in W_key]
values = [Q @ w_v for w_v in W_value]

# Scaled dot-product attention, computed independently for each head.
scale = d_k ** 0.5
outputs = [
    torch.softmax(q_h @ k_h.T / scale, dim=-1) @ v_h
    for q_h, k_h, v_h in zip(queries, keys, values)
]

# Concatenate the heads along the feature axis.
concat_output = torch.cat(outputs, dim=-1)
print(f"concat_output:\n{concat_output}")
# Final linear transformation
final_output = concat_output @ W_output

# Print the result
print(f"Final Output:\n{final_output}")

6、掩码注意力：

import torch

# Original input matrix: one row per token.
Q = torch.tensor([[3.0, 3.0, 0.0],
                  [0.5, 4.0, 0.0],
                  [1.0, 2.0, 0.0],
                  [2.0, 1.0, 0.0]])

torch.manual_seed(123)
d_q, d_k, d_v = 4, 4, 5  # column counts of the query, key, value projections
d = Q.shape[1]           # the projections need one row per input feature

# Random projection matrices, drawn in query -> key -> value order so the
# seeded values are reproducible.
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.rand(d, width)) for width in (d_q, d_k, d_v)
)

for label, matrix in (
    ("W_query:", W_query),
    ("W_key:", W_key),
    ("W_value:", W_value),
):
    print(label, matrix)

# Project the input into queries, keys and values.
querys = torch.matmul(Q, W_query)
keys = torch.matmul(Q, W_key)
values = torch.matmul(Q, W_value)

print(f"querys:\n{querys}")
print(f"keys:\n{keys}")
print(f"values:\n{values}")

# Scaled dot-product attention scores.
scale = d_k ** 0.5
all_scores = torch.matmul(querys, keys.T) / scale
print(f"all_scores:\n{all_scores}")

# Mask: True wherever the key position is after the query position
# (strictly above the diagonal).
seq_len = Q.shape[0]
positions = torch.arange(seq_len)
mask = positions.unsqueeze(0) > positions.unsqueeze(1)
# -inf scores become exactly 0 weight after the softmax.
masked_scores = all_scores.masked_fill(mask, float('-inf'))

print(f"Mask:\n{mask}")
print(f"Masked Scores:\n{masked_scores}")

# Attention weights: normalize each query row over the unmasked keys.
all_weights = masked_scores.softmax(dim=-1)
print(f"all_weights:\n{all_weights}")

# Weighted sum of the value rows.
output = torch.matmul(all_weights, values)
print(f"output:\n{output}")