t-SNE (t-Distributed Stochastic Neighbor Embedding) is an algorithm for dimensionality reduction and visualization. It maps high-dimensional data into two or three dimensions while preserving, as far as possible, the local relationships between data points. t-SNE is particularly well suited to exploring the internal structure and patterns of a dataset, and is commonly used for cluster analysis and for discovering groupings in data.
In machine learning and data analysis, a t-SNE plot is typically used to show how data points are distributed after dimensionality reduction, making it easy to inspect similarities and differences between points. From a t-SNE plot you can see at a glance whether the points form distinct clusters or show a tendency to separate, which helps in understanding the structure and characteristics of the data.
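To make the idea concrete before the case studies, here is a minimal, self-contained sketch on the scikit-learn digits dataset (this dataset is only an illustration and is not used in the examples below):
# Minimal t-SNE example: embed the 64-dimensional digits data into 2D and plot it.
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.manifold import TSNE

digits = load_digits()  # 1797 samples, 64 features each
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
embedded = tsne.fit_transform(digits.data)  # shape: (1797, 2)
plt.scatter(embedded[:, 0], embedded[:, 1], c=digits.target, cmap='tab10', s=5)
plt.colorbar(label='digit class')
plt.title('t-SNE of the digits dataset')
plt.show()
Points with the same digit label should land close together, which is exactly the local-neighborhood behavior the case studies below rely on.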
Below are two concrete examples of drawing t-SNE plots in Python.
Contents
1. Drawing a t-SNE plot for the RTE dataset
1.1 Load the data
1.2 Load the model and extract features
1.3 Save the features
1.4 Draw the t-SNE plot
2. Drawing a t-SNE plot for the HWU dataset
2.1 Load the data
2.2 Load the model and save the features
2.3 Draw the t-SNE plot
1. Drawing a t-SNE plot for the RTE dataset
1.1 Load the data
from transformers import RobertaTokenizer, RobertaModel
import torch
import numpy as np
import pandas as pd
import random
# Read the TSV file
df = pd.read_csv("RTE/train.tsv", sep="\t", header=None)  # replace with your own file path
# Randomly select 500 sentences
# sentences = df.sample(5000)[1]  # the sentence is in the second column
random_samples = df.sample(500)
# Concatenate the sentences in columns 2 and 3, separated by a space
sentences = random_samples[1] + " " + random_samples[2]
Printing sentences gives:
129     The company, whose registered auditors are Del...
791     So far Sony BMG has not released a list of how...
1684    Analysts expected the company to earn $1.42 a ...
109     The University has also apologized for the inc...
1723    The nation got its long-awaited first look at ...
                              ...
2039    The International Olympic Committee has leapt ...
1381    Martha Stewart, 64, is back, after serving fiv...
570     President George W. Bush, who gave the keynote...
1519    A former philosophy teacher whose best-known n...
1318    Alan Mulally, Boeing's head of the unit, said ...
Length: 500, dtype: object
Note: when producing the t-SNE plots, run the data-loading code above only once, so that the same random sample is used for every checkpoint.
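If you would rather not depend on running the cell exactly once, a sketch of a more robust alternative is to fix the seed directly (sample() accepts a random_state argument):
# Fixing random_state makes the 500-sentence sample identical across runs,
# so every checkpoint sees the same sentences even if this cell is re-executed.
random_samples = df.sample(500, random_state=42)
sentences = random_samples[1] + " " + random_samples[2]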
1.2 Load the model and extract features
# Define the model
model = RobertaModel.from_pretrained("/roberta-large")  # load the pre-trained weights
# model.load_state_dict(torch.load('CoLA68.9_best_model_Original.pt'), strict=False)  # PLM + softmax fine-tuning
# model.load_state_dict(torch.load('CoLA72.9_best_model_calculate_weight_diff.pt'), strict=False)  # our impl
# model.load_state_dict(torch.load('RTE_best_model_original.pt'), strict=False)  # PLM + softmax fine-tuning
model.load_state_dict(torch.load('RTE89.3_calculate_weight_diff_model.pt'), strict=False)  # our impl
tokenizer = RobertaTokenizer.from_pretrained("/roberta-large")
# Get the input features
input_feature_list = list()
for sentence in sentences:
    inputs = tokenizer(sentence, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    input_feature = model(input_ids, attention_mask)[1]  # [1] is the pooler output
    input_feature_list.append(input_feature.detach().cpu().numpy())
# Stack the input features (saved in the next step)
orign = np.vstack(input_feature_list)
orign.shape
(500, 1024)
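The loop above runs one sentence at a time and tracks gradients it never uses. A hedged sketch of a faster variant (same model, tokenizer, and sentences as above; the batch size of 32 is an arbitrary choice) batches the input and disables gradient tracking:
# Batched feature extraction without gradient tracking (faster, lower memory).
model.eval()
batch_size = 32
sentence_list = sentences.tolist()
input_feature_list = []
with torch.no_grad():
    for i in range(0, len(sentence_list), batch_size):
        batch = sentence_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
        pooled = model(inputs['input_ids'], inputs['attention_mask'])[1]  # pooler output
        input_feature_list.append(pooled.cpu().numpy())
orign = np.vstack(input_feature_list)  # still (500, 1024)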
1.3 Save the features
# np.save('npy/rte_orign_roberta_1.npy',orign)
# np.save('npy/rte_orign_roberta_2.npy',orign)
np.save('npy/rte_roberta_AgDn_500.npy',orign)
# For plotting
import matplotlib.pyplot as plt
import seaborn as sns
# For standardising the data (optional; see the note below)
from sklearn.preprocessing import StandardScaler
# t-SNE
from sklearn.manifold import TSNE
# roberta_data = np.load('npy/cola_orign_roberta.npy')
roberta_data1 = np.load('npy/rte_roberta_FT_500.npy')
roberta_data2 = np.load('npy/rte_roberta_PT_500.npy')
roberta_data3 = np.load('npy/rte_roberta_AgDn_500.npy')
# Combine all the data so t-SNE can process it together
combined_data = np.vstack([roberta_data1, roberta_data2, roberta_data3])
labels1 = [f'n{i}' for i in range(1, len(roberta_data1) + 1)]
labels2 = [f'n{i}' for i in range(1, len(roberta_data2) + 1)]
labels3 = [f'n{i}' for i in range(1, len(roberta_data3) + 1)]
combined_labels = labels1 + labels2 + labels3
Note: depending on what you need, run sections 1.2 and 1.3 once per checkpoint to save the corresponding .npy files used for the t-SNE plot below.
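A sketch of producing all three files in one pass (assuming extract_features() is a hypothetical helper wrapping the feature-extraction loop from section 1.2, and that the checkpoint names match those shown there):
# Map each output .npy file to its checkpoint; None means pre-trained weights only.
checkpoints = {
    'npy/rte_roberta_PT_500.npy': None,
    'npy/rte_roberta_FT_500.npy': 'RTE_best_model_original.pt',
    'npy/rte_roberta_AgDn_500.npy': 'RTE89.3_calculate_weight_diff_model.pt',
}
for out_path, ckpt in checkpoints.items():
    model = RobertaModel.from_pretrained("/roberta-large")
    if ckpt is not None:
        model.load_state_dict(torch.load(ckpt), strict=False)
    np.save(out_path, extract_features(model, sentences))  # hypothetical helper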
Preview of combined_data:
array([[-0.47165307,  0.5584102 , -0.3025265 , ...,  0.21222334,  0.41210824, -0.48724136],
       [-0.433249  ,  0.45918941, -0.37311822, ...,  0.17242172,  0.30651495, -0.53964025],
       [-0.6322939 ,  0.36898744, -0.20495233, ..., -0.16366315,  0.45747212, -0.2154128 ],
       ...,
       [-0.05081458,  0.4187796 ,  0.19485527, ..., -0.8376309 ,  0.20942022,  0.57146496],
       [ 0.70825064, -0.51524955,  0.516124  , ..., -0.53912795,  0.7632115 , -0.71091646],
       [-0.10236336,  0.4357162 ,  0.01013089, ..., -0.78527933,  0.20695496,  0.5769015 ]],
      dtype=float32)
len(combined_data)  # 1500
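Note that StandardScaler is imported above but never applied. If you do want to standardise the 1024-dimensional features before t-SNE (optional, and it will change the resulting layout), a one-line sketch:
# Optional: zero-mean, unit-variance scaling per feature dimension before t-SNE.
combined_data = StandardScaler().fit_transform(combined_data)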
1.4 Draw the t-SNE plot
# Apply t-SNE
tsne = TSNE(n_components=2,
            random_state=42,
            perplexity=19.5,
            n_iter=550,
            early_exaggeration=350)
tsne_results = tsne.fit_transform(combined_data)
# Set the plotting style
# sns.set(style='white', context='notebook', rc={'figure.figsize': (5, 5), 'axes.edgecolor': 'gray', 'axes.linewidth': 0.5})
# Set the plotting style and remove the axes frame entirely
sns.set(style='white', context='notebook', rc={'axes.edgecolor': 'none', 'axes.linewidth': 0})
# Create a figure without a frame
plt.figure(figsize=(5, 5), edgecolor='none')  # edgecolor='none' removes the frame around the plotting area
# Pick a different colour for each group
colors = ['red', 'blue', 'green']
# Group 1: pre-trained PLM -- red
plt.scatter(tsne_results[:500, 0], tsne_results[:500, 1], c=colors[0], label='Pre-Training', marker='o', s=5)
# Group 2: PLM + softmax fine-tuning -- blue
plt.scatter(tsne_results[500:1000, 0], tsne_results[500:1000, 1], c=colors[1], label='Fine-Tuning', marker='^', s=5)
# Group 3: our implementation -- green
plt.scatter(tsne_results[1000:, 0], tsne_results[1000:, 1], c=colors[2], label='AgDn', marker='s', s=5)
# Customize the legend frame
legend = plt.legend(fontsize=8, loc='upper left', bbox_to_anchor=(0.0, 1.0), frameon=True, fancybox=True, edgecolor='black')
# legend = plt.legend(fontsize=10, loc='upper left', frameon=True, fancybox=True, edgecolor='black')
# Get the legend frame and adjust its line width
frame = legend.get_frame()
frame.set_edgecolor('black')  # make sure the frame is black
frame.set_linewidth(0.2)  # set the frame line width to 0.2
# Fix the axis aspect ratio and hide the tick labels
plt.gca().set_aspect('equal', 'datalim')
plt.tick_params(axis='both', which='both', labelbottom=False, labelleft=False)
# Add a title
# plt.title('RTE dataset', fontsize=12)
plt.suptitle('RTE dataset', fontsize=12, y=0.02)
plt.savefig('npy/tsne_RTE.png', dpi=300)
# Show the figure
plt.show()
2. Drawing a t-SNE plot for the HWU dataset
2.1 Load the data
from transformers import RobertaTokenizer, RobertaModel
import torch
import numpy as np
import pandas as pd
import random
# Read the TSV file
df = pd.read_csv("HWU/HWU_train_data.tsv", sep="\t", header=None).dropna()  # replace with your own file path
# Randomly select 500 sentences
# sentences = df.sample(500)[1]  # the sentence is in the second column
random_samples = df.sample(500)
# For HWU the sentences are in the first column, so no concatenation is needed
sentences = random_samples[0]
15261    Does Vocelli's do takeaway?
19685    play track one from my david bowie playlist
10380    yeap excellent response to command.
18334    that is all.
6228     that's all, cancel.
                  ...
495      Please remove this alarm
4901     can you read me news on Trump?
5124     that is not so clear to me.
10647    Set my alarm for Tuesday at 6pm.
5639     I am no longer going to Wyatts birthday party
Name: 0, Length: 500, dtype: object
2.2 Load the model and save the features
# Define the model
model = RobertaModel.from_pretrained("./Roberta-large/")  # load the pre-trained weights
# model.load_state_dict(torch.load('CoLA68.9_best_model_Original.pt'), strict=False)  # PLM + softmax fine-tuning
# model.load_state_dict(torch.load('CoLA72.9_best_model_calculate_weight_diff.pt'), strict=False)  # our impl
# model.load_state_dict(torch.load('RTE_best_model_original.pt'), strict=False)  # PLM + softmax fine-tuning
# model.load_state_dict(torch.load('RTE89.3_calculate_weight_diff_model.pt'), strict=False)  # our impl
model.load_state_dict(torch.load('HWU_Original_model.pt'), strict=False)  # PLM + softmax fine-tuning
# model.load_state_dict(torch.load('HWU_AgDn_model.pt'), strict=False)  # our impl
# model.load_state_dict(torch.load('HWU_wacl_model.pt'), strict=False)  # our impl
tokenizer = RobertaTokenizer.from_pretrained("./Roberta-large/")
# Get the input features
input_feature_list = list()
for sentence in sentences:
    inputs = tokenizer(sentence, return_tensors="pt", padding='max_length', truncation=True, max_length=64)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    input_feature = model(input_ids, attention_mask)[1]  # [1] is the pooler output
    input_feature_list.append(input_feature.detach().cpu().numpy())
# Stack the input features and save them
orign = np.vstack(input_feature_list)
# np.save('npy/HWU_roberta_FT_500.npy',orign)
np.save('npy/HWU_roberta_PT_500.npy',orign)
# np.save('npy/HWU_roberta_wacl_500.npy',orign)
# For plotting
import matplotlib.pyplot as plt
import seaborn as sns
# For standardising the data (optional)
from sklearn.preprocessing import StandardScaler
# t-SNE
from sklearn.manifold import TSNE
# roberta_data = np.load('npy/cola_orign_roberta.npy')
roberta_data1 = np.load('npy/HWU_roberta_FT_500.npy')
roberta_data2 = np.load('npy/HWU_roberta_PT_500.npy')
# roberta_data3 = np.load('npy/HWU_roberta_AgDn_500.npy')
roberta_data3 = np.load('npy/HWU_roberta_wacl_500.npy')
# Combine all the data so t-SNE can process it together
combined_data = np.vstack([roberta_data1, roberta_data2, roberta_data3])
labels1 = [f'n{i}' for i in range(1, len(roberta_data1) + 1)]
labels2 = [f'n{i}' for i in range(1, len(roberta_data2) + 1)]
labels3 = [f'n{i}' for i in range(1, len(roberta_data3) + 1)]
combined_labels = labels1 + labels2 + labels3
2.3 Draw the t-SNE plot
# Apply t-SNE
tsne = TSNE(n_components=2,           # reduce the data to 2 dimensions
            random_state=42,          # fix the random seed for reproducible results
            perplexity=15,            # perplexity affects local distances; lower values tend to split points into more clusters
            n_iter=380,               # number of iterations, trading optimization quality against runtime
            early_exaggeration=50)    # inflates inter-point distances in early iterations, helping preserve local structure
tsne_results = tsne.fit_transform(combined_data)
# Set the plotting style and remove the axes frame entirely
sns.set(style='white', context='notebook', rc={'axes.edgecolor': 'none', 'axes.linewidth': 0})
# Create a figure without a frame
plt.figure(figsize=(6, 6), edgecolor='none')  # edgecolor='none' removes the frame around the plotting area
# Pick a different colour for each group
colors = ['red', 'blue', 'green']
# Group 1: pre-trained PLM -- red
plt.scatter(tsne_results[:500, 0], tsne_results[:500, 1], c=colors[0], label='Pre-Training', marker='o', s=5)
# Group 2: PLM + softmax fine-tuning -- blue
plt.scatter(tsne_results[500:1000, 0], tsne_results[500:1000, 1], c=colors[1], label='Fine-Tuning', marker='^', s=5)
# Group 3: our implementation -- green
plt.scatter(tsne_results[1000:, 0], tsne_results[1000:, 1], c=colors[2], label='AgDn', marker='s', s=5)
# Customize the legend frame
# legend = plt.legend(fontsize=8, loc='upper left', bbox_to_anchor=(0.0, 1.0), frameon=True, fancybox=True, edgecolor='black')
legend = plt.legend(fontsize=8, loc='upper left', frameon=True, fancybox=True, edgecolor='black')
# Get the legend frame and adjust its line width
frame = legend.get_frame()
frame.set_edgecolor('black')  # make sure the frame is black
frame.set_linewidth(0.2)  # set the frame line width to 0.2
# Fix the axis aspect ratio and hide the tick labels
plt.gca().set_aspect('equal', 'datalim')
plt.tick_params(axis='both', which='both', labelbottom=False, labelleft=False)
# Add a title
plt.suptitle('HWU dataset', fontsize=15, y=0.02)
# Adjust the layout so the saved figure is not clipped
plt.tight_layout()
# Save the figure at a higher DPI for better quality
# plt.savefig('npy/tsne_HWU_2.png', dpi=300)
# Show the figure
plt.show()
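t-SNE layouts are quite sensitive to perplexity (and to n_iter / early_exaggeration), so it is worth sweeping a few values before settling on one. A sketch reusing combined_data from above (the perplexity values 5, 15, and 50 are arbitrary choices):
# Compare several perplexity values side by side, with other parameters fixed.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, perp in zip(axes, [5, 15, 50]):
    emb = TSNE(n_components=2, random_state=42, perplexity=perp).fit_transform(combined_data)
    ax.scatter(emb[:500, 0], emb[:500, 1], c='red', s=5)
    ax.scatter(emb[500:1000, 0], emb[500:1000, 1], c='blue', s=5)
    ax.scatter(emb[1000:, 0], emb[1000:, 1], c='green', s=5)
    ax.set_title(f'perplexity={perp}')
plt.tight_layout()
plt.show()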