- 使用文本向量化Embedding进行分类
- 使用文本向量进行零样本Zero Shot分类
# 导入必要的库
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.ensemble import RandomForestClassifier # 随机森林分类器
from sklearn.model_selection import train_test_split # 数据集划分函数
from sklearn.metrics import classification_report, accuracy_score # 分类报告和准确率评估
# Load the dataset
datafile_path = "data/fine_food_reviews_with_embeddings_1k.csv"  # path of the CSV data file
df = pd.read_csv(datafile_path)  # read the CSV into a DataFrame
# The embedding column is stored as text; parse each cell back into a numpy array
df["embedding"] = df.embedding.apply(literal_eval).apply(np.array)

# Split the data into a training set and a test set (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    list(df.embedding.values), df.Score, test_size=0.2, random_state=42
)

# Train a random forest classifier on the embeddings
clf = RandomForestClassifier(n_estimators=100)  # create the classifier object
clf.fit(X_train, y_train)  # fit on the training set
preds = clf.predict(X_test)  # predicted labels for the test set
probas = clf.predict_proba(X_test)  # predicted probability of every class

# Build the classification report and print it
report = classification_report(y_test, preds)
print(report)
precision recall f1-score support
1 0.89 0.40 0.55 20
2 1.00 0.38 0.55 8
3 1.00 0.18 0.31 11
4 1.00 0.26 0.41 27
5 0.75 1.00 0.86 134
accuracy 0.77 200
macro avg 0.93 0.44 0.53 200
weighted avg 0.82 0.77 0.72 200
# Bring in the plotting helper from the utils.embeddings_utils module
from utils.embeddings_utils import plot_multiclass_precision_recall

# Call the helper with four positional arguments, in this order:
#   probas            - predicted probabilities for the test set
#   y_test            - true labels of the test set
#   list(range(1, 6)) - the label values, i.e. [1, 2, 3, 4, 5]
#   clf               - the classifier
plot_multiclass_precision_recall(
    probas,
    y_test,
    list(range(1, 6)),
    clf,
)
RandomForestClassifier() - Average precision score over all classes: 0.87
使用文本向量进行零样本Zero Shot分类
# 导入所需的库
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.metrics import classification_report
# Configuration: name of the text embedding model to use
EMBEDDING_MODEL = "text-embedding-ada-002"

# Load the data file
datafile_path = "data/fine_food_reviews_with_embeddings_1k.csv"
df = pd.read_csv(datafile_path)
# Convert the strings in the embedding column into numpy arrays
df["embedding"] = df.embedding.apply(literal_eval).apply(np.array)

# Convert the 5-star ratings into binary sentiment labels.
# .copy() detaches the filtered rows from the original frame so that adding
# the "sentiment" column below does not trigger SettingWithCopyWarning.
df = df[df.Score != 3].copy()  # drop the reviews rated 3
# Scores 1 and 2 become "negative"; scores 4 and 5 become "positive"
df["sentiment"] = df.Score.replace({1: "negative", 2: "negative", 4: "positive", 5: "positive"})
# 导入所需的函数和类
from utils.embeddings_utils import cosine_similarity, get_embedding
from sklearn.metrics import PrecisionRecallDisplay
# Define the function that evaluates the embedding-based zero-shot approach
def evaluate_embeddings_approach(
    labels=('negative', 'positive'),  # label texts to embed, ordered (negative, positive)
    model=EMBEDDING_MODEL,  # embedding model passed to get_embedding
):
    """Score every review against two label embeddings and report the results.

    Each review in the module-level ``df`` is scored as its cosine similarity
    to the second label's embedding minus its cosine similarity to the first
    label's embedding. A score above zero is predicted as ``'positive'``,
    anything else as ``'negative'``.

    Args:
        labels: Two label texts; index 0 is treated as the negative label and
            index 1 as the positive label.
        model: Model name forwarded to ``get_embedding``.

    Returns:
        None. The classification report is printed and a precision-recall
        curve is drawn as side effects.
    """
    # Embed each label text once, up front
    label_embeddings = [get_embedding(label, model=model) for label in labels]

    def label_score(review_embedding, label_embeddings):
        # Positive when the review is more similar to labels[1] than to labels[0]
        return cosine_similarity(review_embedding, label_embeddings[1]) - cosine_similarity(review_embedding, label_embeddings[0])

    # Score every review, then threshold the score at zero to get a label
    probas = df["embedding"].apply(lambda x: label_score(x, label_embeddings))
    preds = probas.apply(lambda x: 'positive' if x > 0 else 'negative')

    # Build the classification report and print it so the caller sees the metrics
    report = classification_report(df.sentiment, preds)
    print(report)

    # Draw the precision-recall curve from the raw scores
    display = PrecisionRecallDisplay.from_predictions(df.sentiment, probas, pos_label='positive')
    _ = display.ax_.set_title("2-class Precision-Recall curve")
# Run the evaluation with the bare label words, passing both the labels and
# the embedding model explicitly
evaluate_embeddings_approach(
    labels=['negative', 'positive'],
    model=EMBEDDING_MODEL,
)
precision recall f1-score support
negative 0.61 0.88 0.72 136
positive 0.98 0.90 0.94 789
accuracy 0.90 925
macro avg 0.79 0.89 0.83 925
weighted avg 0.92 0.90 0.91 925
# Run the evaluation again with full-sentence label descriptions; the model
# argument is left at its default
evaluate_embeddings_approach(
    labels=[
        'An Amazon review with a negative sentiment.',
        'An Amazon review with a positive sentiment.',
    ],
)
precision recall f1-score support
negative 0.98 0.73 0.84 136
positive 0.96 1.00 0.98 789
accuracy 0.96 925
macro avg 0.97 0.86 0.91 925
weighted avg 0.96 0.96 0.96 925
