文本评估指标
这里要注意依赖包的版本:亲测这些包对版本很敏感,所以一定要先装好下面指定的版本
nltk == 3.6.3(如果这个包的版本不对,运行时会报错,而且是很难看懂的那种错误)
rouge==1.0.1
定义函数
import jieba
from rouge import Rouge
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from IPython import embed
# Example reference/candidate sentence pair, segmented with jieba when the
# module is loaded. Each name ends up bound to a plain list of tokens.
ref_tokens, cand_tokens = [
    list(jieba.cut(sentence))
    for sentence in ("这个是什么东西,狗吗", "这个不像是狗,更像是猫")
]
def process_text(ref_tokens, cand_tokens):
    """Segment a reference text and a candidate text with jieba.

    Despite the parameter names, both arguments are handed straight to
    ``jieba.cut``, i.e. they are the raw, unsegmented texts.

    Args:
        ref_tokens: Raw reference text.
        cand_tokens: Raw candidate text.

    Returns:
        A ``(reference_tokens, candidate_tokens)`` tuple of two lists.
    """
    # Materialize jieba's output so callers get real lists they can reuse.
    segmented_reference = [*jieba.cut(ref_tokens)]
    segmented_candidate = [*jieba.cut(cand_tokens)]
    return segmented_reference, segmented_candidate
def calculate_bleu(ref_tokens, cand_tokens):
    """Compute smoothed sentence-level BLEU-1, BLEU-2 and BLEU-3.

    Args:
        ref_tokens: Token sequence of the single reference sentence.
        cand_tokens: Token sequence of the candidate sentence.

    Returns:
        A ``(bleu_1, bleu_2, bleu_3)`` tuple of scores. All three are 0.0
        when either token sequence is empty.
    """
    # With an empty side there is no n-gram overlap to score, so every BLEU
    # order is zero. Return early instead of relying on how the library
    # treats zero-length sequences.
    if not ref_tokens or not cand_tokens:
        return 0.0, 0.0, 0.0

    # Uniform weights over the first k n-gram orders, padded with zeros to
    # four entries. BLEU-3 uses exact thirds: literal 0.33 weights sum to
    # 0.99 rather than 1 and therefore skew the geometric mean.
    weights_per_order = tuple(
        (1.0 / order,) * order + (0.0,) * (4 - order) for order in (1, 2, 3)
    )

    # One smoothing callable is enough for all three calls.
    smoothing = SmoothingFunction().method1
    references = [ref_tokens]

    bleu_1, bleu_2, bleu_3 = (
        sentence_bleu(
            references,
            cand_tokens,
            weights=weights,
            smoothing_function=smoothing,
        )
        for weights in weights_per_order
    )
    return bleu_1, bleu_2, bleu_3
def calculate_meteor(ref_tokens, cand_tokens):
    """Compute the METEOR score of a candidate against a single reference.

    Args:
        ref_tokens: Token sequence of the reference sentence.
        cand_tokens: Token sequence of the candidate sentence.

    Returns:
        The METEOR score, or 0.0 when either side contains no
        whitespace-separated word.
    """
    ref_text = " ".join(ref_tokens)
    cand_text = " ".join(cand_tokens)

    # Words as they appear after splitting the space-joined text on
    # whitespace; whitespace-only tokens (e.g. "\n") disappear here.
    ref_words = ref_text.split()
    cand_words = cand_text.split()

    # Nothing can be matched against an empty side; score it as zero without
    # calling into NLTK.
    if not ref_words or not cand_words:
        return 0.0

    try:
        # String-based call form: the one this file was written against
        # (the notes at the top of the file pin nltk==3.6.3).
        return meteor_score([ref_text], cand_text)
    except TypeError:
        # NOTE(review): newer NLTK releases are documented to reject plain
        # strings and to expect pre-tokenized input; retry with the word
        # lists so the function works with either API. Confirm against the
        # installed NLTK version.
        return meteor_score([ref_words], cand_words)
def calculate_rouge_l(ref_tokens, cand_tokens):
    """Compute the ROUGE-L F-score of a candidate against a reference.

    Args:
        ref_tokens: Token sequence of the reference sentence.
        cand_tokens: Token sequence of the candidate sentence.

    Returns:
        The ``'f'`` value of the ``'rouge-l'`` entry returned by the rouge
        package, or 0.0 when either space-joined text is empty.
    """
    ref_text = " ".join(ref_tokens)
    cand_text = " ".join(cand_tokens)

    # An empty side cannot overlap with anything, so score it as zero up
    # front rather than handing an empty string to the library.
    # NOTE(review): the rouge package reportedly raises ValueError for an
    # empty hypothesis or reference -- confirm against rouge==1.0.1.
    if not ref_text or not cand_text:
        return 0.0

    # get_scores takes the candidate (hypothesis) first, the reference second.
    scores = Rouge().get_scores(cand_text, ref_text)
    return scores[0]["rouge-l"]["f"]
使用说明
import re
import os
import cv2
import torch
import pydicom
import numpy as np
import pandas as pd
from tqdm import tqdm
import numpy.random as random
import torch.utils.data as data
from sklearn.utils import shuffle
from torchvision import transforms
class Dataset:
    """Report texts built from the rows of an Excel sheet."""

    def __init__(self, xlsx_file):
        """Load ``xlsx_file`` and build one combined text per row.

        The sheet must provide the columns ``description`` and ``diagnosis``.

        Args:
            xlsx_file: Excel file to load; passed unchanged to
                ``pandas.read_excel``.
        """
        # The sheet exactly as pandas loaded it.
        self.df = pd.read_excel(xlsx_file)

        # One entry per row: description, a newline, then diagnosis. Any
        # cell that is not a str (presumably blank cells, which pandas
        # loads as NaN) contributes an empty string instead.
        self.examples = [
            "\n".join(
                cell if isinstance(cell, str) else ""
                for cell in (row["description"], row["diagnosis"])
            )
            for _, row in self.df.iterrows()
        ]
# Random-pair baseline: score every sampled report against another report
# drawn at random from the same subset, then print the mean of each metric.
dataset_path = "path/to/file.xlsx"
dataset = Dataset(xlsx_file=dataset_path)

# Fixed shuffle seed so the (at most) 5000-report subset is reproducible.
samples = shuffle(dataset.examples, random_state=42)[:5000]

bleu_scores_1_list = []
bleu_scores_2_list = []
bleu_scores_3_list = []
meteor_scores = []
rouge_l_scores = []

# Seeded generator so the random pairing is as repeatable as the shuffle.
pair_rng = np.random.default_rng(42)

for sample in tqdm(samples, total=len(samples)):
    # Draw the comparison report by index; this avoids converting the whole
    # list of strings to an array on every iteration. The draw is uniform
    # over the subset, so a report can occasionally be paired with itself.
    compare_sample = samples[pair_rng.integers(len(samples))]

    # Segment both raw texts; `sample` acts as the reference and
    # `compare_sample` as the candidate in every metric below.
    sample_tokens, compare_tokens = process_text(sample, compare_sample)

    bleu_score_1, bleu_score_2, bleu_score_3 = calculate_bleu(
        sample_tokens, compare_tokens
    )
    meteor_score_tmp = calculate_meteor(sample_tokens, compare_tokens)
    rouge_l_score = calculate_rouge_l(sample_tokens, compare_tokens)

    bleu_scores_1_list.append(bleu_score_1)
    bleu_scores_2_list.append(bleu_score_2)
    bleu_scores_3_list.append(bleu_score_3)
    meteor_scores.append(meteor_score_tmp)
    rouge_l_scores.append(rouge_l_score)

# Average each metric over all sampled pairs.
bleu_scores_1_ = np.mean(bleu_scores_1_list)
bleu_scores_2_ = np.mean(bleu_scores_2_list)
bleu_scores_3_ = np.mean(bleu_scores_3_list)
meteor_scores_ = np.mean(meteor_scores)
rouge_l_scores_ = np.mean(rouge_l_scores)
print(f"bleu_1: {bleu_scores_1_} bleu_2:{bleu_scores_2_} bleu_3:{bleu_scores_3_} meteor:{meteor_scores_} rouge_l:{rouge_l_scores_}")