目录
问题描述:
问题解决:
问题描述:
原始标注的三元组格式如下:
需要转换的格式如下:
tips:有两个小的难点:
1. 一个句子可能对应多个三元组(每个三元组单独占一行),需要把它们合并到同一个样本的标注列表中
2. 最后一个样本后面没有新的句子行来触发写出,循环结束后记得单独处理
问题解决:
from pdb import set_trace as stop
import os
from tqdm import trange
# Input: raw annotation file (a "sentence<TAB>flag" line followed by one label
# line per tuple).  Output: one "sentence####[(...), ...]" line per sentence.
generated_path = "/public/home/hongy/qtxu/UniCOQE_20230812/data/tuple/ele/train_new_generated.txt"
Unicoqe_path = "/public/home/hongy/qtxu/UniCOQE_20230812/data/tuple/ele/train.txt"


def _parse_indices(field):
    """Return the integer indices listed in one bracketed label field.

    ``field`` looks like ``[3&word 4&word]``: the surrounding brackets are
    dropped, entries are separated by whitespace, and the index is whatever
    precedes the first ``&``.  Cutting at the first ``&`` also handles
    entries such as ``14&&`` (the word itself is ``&``) and a bare ``14``.

    Raises:
        ValueError: if the text before ``&`` is not an integer.
    """
    return [int(pair.partition('&')[0]) for pair in field.strip()[1:-1].split()]


def _parse_label(label):
    """Convert one raw label line into a tuple of three index lists.

    The label is ``[f0;f1;f2;...]``; only the first three ``;``-separated
    fields (called sub / obj / asp in the original script) are kept, any
    further fields are ignored.

    Raises:
        ValueError: if fewer than three fields are present or an index is
            not an integer.
    """
    fields = label.strip()[1:-1].split(';')
    if len(fields) < 3:
        raise ValueError(
            "expected at least 3 ';'-separated fields, got {}".format(len(fields)))
    return tuple(_parse_indices(field) for field in fields[:3])


def _format_sample(text_line, label_lines):
    """Build the output line ``sentence####[(sub, obj, asp), ...]`` for one sample.

    Raises:
        ValueError: if any label line is malformed; the message names the
            offending sentence line so it can be located in the input.
    """
    # Only the part before the tab is the sentence; the second column is dropped.
    sentence = text_line.strip().split("\t")[0]
    try:
        spans = [_parse_label(label) for label in label_lines]
    except ValueError as err:
        # Fail loudly instead of dropping into a debugger and then writing
        # stale indices left over from a previous label.
        raise ValueError(
            "malformed label for sample {!r}: {}".format(text_line, err)) from err
    return sentence + "####" + str(spans) + "\n"


def convert_lines(lines):
    """Yield one converted output line per sample found in ``lines``.

    A line with exactly two tab-separated fields starts a new sample; every
    other non-blank line is a label belonging to the current sample, so a
    sentence may carry several tuples.  Label lines that appear before the
    first sentence line are ignored.  The last sample is emitted after the
    input is exhausted, because no following sentence line triggers it.

    Args:
        lines: iterable of raw text lines (trailing newlines allowed).

    Yields:
        str: ``sentence####[...]`` terminated by a newline.  A sample with
        no label lines yields an empty list, and empty input yields nothing.

    Raises:
        ValueError: if a label line cannot be parsed.
    """
    text_line = None
    label_lines = []
    for cur_line in lines:
        if len(cur_line.split('\t')) == 2:
            if text_line is not None:
                yield _format_sample(text_line, label_lines)
            text_line = cur_line
            label_lines = []
        elif cur_line.strip():
            # Blank lines carry no label and are skipped.
            label_lines.append(cur_line)
    if text_line is not None:
        yield _format_sample(text_line, label_lines)


def main():
    """Read ``generated_path``, convert it, and write the result to ``Unicoqe_path``."""
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(generated_path, 'r', encoding='utf-8') as f:
        raw_data = f.readlines()

    with open(Unicoqe_path, 'w', encoding='utf-8') as fw:
        progress = trange(len(raw_data), desc="procesing data ……")
        fw.writelines(convert_lines(raw_data[line_id] for line_id in progress))


if __name__ == "__main__":
    main()