Data Preparation
Loading the dataset
import json
from tqdm.notebook import tqdm

dataset = []
with open(train_file, 'r') as file:  # train_file: path to the JSON-lines training file
    for line in tqdm(file.readlines()):
        data = json.loads(line.strip())
        dataset.append(data)
You can prepare the training data in the CLUENER format, for example:
{'text': '胡建新经营着位于深圳市福田区华富街道田面社区深南中路4028号田面城市大厦19B-19C的公司。',
'label': {'person': {'胡建新': [[0, 2]]},
'address': {'深圳市福田区华富街道田面社区深南中路4028号田面城市大厦19B-19C': [[8, 43]]}}}
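Note that the label offsets are character indices into text, and the end index is inclusive. A minimal sanity check, using the sample record above:

sample = {'text': '胡建新经营着位于深圳市福田区华富街道田面社区深南中路4028号田面城市大厦19B-19C的公司。',
          'label': {'person': {'胡建新': [[0, 2]]},
                    'address': {'深圳市福田区华富街道田面社区深南中路4028号田面城市大厦19B-19C': [[8, 43]]}}}
for label, entities in sample['label'].items():
    for entity, locs in entities.items():
        for start, end in locs:
            assert sample['text'][start:end + 1] == entity  # end index is inclusive, hence +1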
Splitting into train/test/validation sets
import random
import numpy as np

def split_train_test_valid(dataset, train_size=0.8, test_size=0.1):
    dataset = np.array(dataset)
    total_size = len(dataset)
    # compute the split lengths from the ratios
    train_len = int(total_size * train_size)
    test_len = int(total_size * test_size)
    # shuffle the indices, then slice the array
    idx = list(range(total_size))
    random.shuffle(idx)
    data_train = dataset[idx[:train_len]]
    data_test = dataset[idx[train_len:train_len + test_len]]
    data_valid = dataset[idx[train_len + test_len:]]  # the remainder becomes the validation set
    return data_train, data_test, data_valid
data_train, data_test, data_valid = split_train_test_valid(dataset)
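If you need the split to be reproducible across runs, you can seed the RNG before calling the function (an optional sketch; the seed value is arbitrary):

random.seed(42)  # fix the shuffle order so the split is deterministic
data_train, data_test, data_valid = split_train_test_valid(dataset)
print(len(data_train), len(data_test), len(data_valid))  # roughly an 8:1:1 split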
Converting to spaCy DocBin format
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans

def to_docbin(dataset):
    data_spacy = []
    for d in tqdm(dataset):
        text = d['text']
        tags = []
        labels = d['label']
        for label in labels:
            entities = labels[label]
            for entity in entities:
                for loc in entities[entity]:
                    # CLUENER end offsets are inclusive; spaCy expects exclusive, hence +1
                    tags.append((loc[0], loc[1] + 1, label))
        data_spacy.append({"text": text, "entities": tags})
    nlp = spacy.blank('zh')  # blank Chinese pipeline, used only for tokenization
    doc_bin = DocBin()
    for training_example in tqdm(data_spacy):
        text = training_example['text']
        labels = training_example['entities']
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in labels:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        filtered_ents = filter_spans(ents)  # drop overlapping spans
        doc.ents = filtered_ents
        doc_bin.add(doc)
    return doc_bin
doc_bin_train = to_docbin(data_train)
doc_bin_train.to_disk("train.spacy")
doc_bin_valid = to_docbin(data_valid)
doc_bin_valid.to_disk("valid.spacy")
The training and validation sets are saved to train.spacy and valid.spacy.
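To confirm the conversion worked, you can load a DocBin back from disk and inspect the entities of the first document (a minimal sketch):

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank('zh')
doc_bin = DocBin().from_disk("train.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))
print(len(docs), "training docs")
for ent in docs[0].ents:
    print(ent.text, ent.label_, ent.start_char, ent.end_char)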
Getting the spaCy training config
Open https://spacy.io/usage/training#quickstart, select Chinese / ner / GPU, and the quickstart widget generates a base_config.cfg for you.
Auto-fill the config:
python -m spacy init fill-config base_config.cfg config.cfg
Training the model
python -m spacy train config.cfg --output . --paths.train ./train.spacy --paths.dev ./valid.spacy --gpu-id 0
The training log:
python -m spacy train config.cfg --output . --paths.train ./train.spacy --paths.dev ./valid.spacy --gpu-id 0
ℹ Saving to output directory: .
ℹ Using GPU: 0
=========================== Initializing pipeline ===========================
Some weights of the model checkpoint at ../models/bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
✔ Initialized pipeline
============================= Training pipeline =============================
ℹ Pipeline: ['transformer', 'ner']
ℹ Initial learn rate: 0.0
E # LOSS TRANS... LOSS NER ENTS_F ENTS_P ENTS_R SCORE
--- ------ ------------- -------- ------ ------ ------ ------
0 0 2414.47 804.03 0.41 0.25 1.17 0.00
0 200 553440.62 100815.50 25.73 27.65 24.06 0.26
1 400 379529.80 55305.57 36.83 43.31 32.03 0.37
2 600 164609.24 36629.69 62.07 60.54 63.67 0.62
3 800 163662.29 38876.53 32.75 42.38 26.69 0.33
4 1000 81601.30 28677.56 62.02 63.22 60.87 0.62
5 1200 75558.20 26489.57 61.61 63.17 60.12 0.62
6 1400 87824.25 25230.27 69.77 69.59 69.95 0.70
6 1600 54173.95 21436.94 70.03 69.52 70.54 0.70
7 1800 30978.67 15641.39 71.80 72.03 71.58 0.72
8 2000 27723.05 13770.74 69.07 69.53 68.62 0.69
9 2200 25622.08 12936.05 72.89 71.89 73.93 0.73
10 2400 24126.19 13338.83 71.58 71.96 71.19 0.72
11 2600 21804.75 11238.43 74.20 74.82 73.60 0.74
12 2800 20628.26 10916.07 71.44 71.39 71.48 0.71
13 3000 20134.37 11081.41 72.51 72.17 72.85 0.73
14 3200 16227.69 8933.84 74.17 73.84 74.51 0.74
14 3400 19235.74 9438.10 72.00 73.18 70.87 0.72
15 3600 29307.03 12692.90 74.84 76.13 73.60 0.75
16 3800 18102.06 8969.09 73.38 71.82 75.00 0.73
17 4000 14903.23 8416.16 73.11 71.91 74.35 0.73
18 4200 19608.45 9377.10 72.91 72.67 73.14 0.73
19 4400 17153.18 8931.95 74.35 74.20 74.51 0.74
20 4600 17934.71 9112.66 66.37 67.00 65.76 0.66
20 4800 13376.17 7252.01 74.06 74.29 73.83 0.74
21 5000 13659.26 6804.46 72.38 71.47 73.31 0.72
22 5200 18188.32 8358.28 73.57 72.22 74.97 0.74
✔ Saved pipeline to output directory
model-last
The validation F1 score reaches 0.75, a clear improvement over the 0.65 of the non-transformer model, whose training log is shown below:
ℹ Saving to output directory: .
ℹ Using GPU: 0
=========================== Initializing pipeline ===========================
✔ Initialized pipeline
============================= Training pipeline =============================
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E # LOSS TOK2VEC LOSS NER ENTS_F ENTS_P ENTS_R SCORE
--- ------ ------------ -------- ------ ------ ------ ------
0 0 0.00 49.29 0.09 0.15 0.07 0.00
0 200 496.94 3348.46 5.82 4.36 8.76 0.06
0 400 1408.31 4107.52 9.38 20.41 6.09 0.09
0 600 2121.99 5357.34 17.45 23.00 14.06 0.17
0 800 1096.04 5009.92 19.90 27.89 15.46 0.20
0 1000 931.30 5447.63 27.72 33.77 23.50 0.28
0 1200 1375.05 6551.97 32.09 38.83 27.34 0.32
0 1400 1388.81 7116.59 37.61 43.81 32.94 0.38
0 1600 2521.46 9638.09 42.25 52.07 35.55 0.42
1 1800 2172.77 10659.31 40.53 48.04 35.06 0.41
1 2000 3563.99 12454.60 43.00 49.98 37.73 0.43
1 2200 4926.80 15747.33 46.38 50.38 42.97 0.46
2 2400 4712.95 18150.01 48.91 53.97 44.73 0.49
2 2600 4945.91 18023.03 50.25 53.30 47.53 0.50
3 2800 6100.79 18400.07 51.21 54.85 48.01 0.51
3 3000 5124.39 17074.50 51.38 54.62 48.50 0.51
4 3200 5595.23 17486.11 52.83 57.31 48.99 0.53
4 3400 5857.02 16183.54 52.39 55.95 49.25 0.52
5 3600 7097.00 16779.79 55.20 58.97 51.89 0.55
5 3800 7305.36 16330.97 53.70 56.30 51.33 0.54
6 4000 6912.16 15848.24 55.86 57.40 54.39 0.56
6 4200 7083.29 15591.03 54.72 57.02 52.60 0.55
7 4400 7072.32 14623.82 55.80 61.07 51.37 0.56
7 4600 9153.78 15341.62 57.24 58.95 55.63 0.57
8 4800 7584.10 14801.21 54.85 56.26 53.52 0.55
8 5000 7514.11 14013.45 58.38 61.83 55.31 0.58
9 5200 9505.86 14416.66 57.41 60.38 54.72 0.57
9 5400 8458.73 13544.08 58.90 62.29 55.86 0.59
10 5600 9179.71 12723.23 58.53 60.97 56.28 0.59
10 5800 9730.11 13078.69 58.85 62.58 55.53 0.59
11 6000 8485.15 13275.12 59.14 62.02 56.51 0.59
11 6200 10376.37 12896.16 58.77 60.26 57.36 0.59
12 6400 8562.07 12582.15 58.59 62.72 54.98 0.59
12 6600 8131.18 11650.52 59.21 62.55 56.22 0.59
13 6800 10618.73 11832.74 58.46 60.77 56.32 0.58
13 7000 10180.18 12106.64 59.16 61.23 57.23 0.59
14 7200 10455.71 11767.56 62.46 65.60 59.60 0.62
14 7400 10277.93 11417.25 61.00 61.90 60.12 0.61
15 7600 10416.83 11844.74 61.50 63.19 59.90 0.61
15 7800 9843.24 10815.69 60.73 63.61 58.11 0.61
16 8000 10849.20 11080.88 62.16 65.61 59.05 0.62
16 8200 12479.84 10464.58 60.54 63.07 58.20 0.61
16 8400 11960.47 10947.46 63.05 64.79 61.39 0.63
17 8600 12225.40 10741.32 63.00 64.06 61.98 0.63
17 8800 11885.81 10653.15 63.88 66.43 61.52 0.64
18 9000 9813.91 9519.76 62.38 65.15 59.83 0.62
18 9200 11317.17 10009.74 62.36 65.20 59.77 0.62
19 9400 11061.72 10646.52 62.66 63.56 61.78 0.63
19 9600 11708.71 9658.76 62.61 66.30 59.31 0.63
20 9800 11545.23 10812.54 64.21 65.83 62.66 0.64
20 10000 12078.46 9654.99 63.09 64.35 61.88 0.63
21 10200 11745.36 9246.17 61.87 64.31 59.60 0.62
21 10400 11913.01 9916.31 62.74 64.24 61.30 0.63
22 10600 11860.46 9340.68 64.30 66.44 62.30 0.64
22 10800 13450.33 9669.23 63.20 64.48 61.98 0.63
23 11000 13385.45 9062.81 63.31 65.10 61.62 0.63
23 11200 13600.88 9135.41 63.88 65.94 61.95 0.64
24 11400 14294.13 8782.87 63.87 65.69 62.14 0.64
24 11600 18930.36 9024.00 63.06 64.11 62.04 0.63
25 11800 14705.22 8806.56 63.40 66.38 60.68 0.63
25 12000 17361.70 8958.72 64.71 66.28 63.22 0.65
26 12200 14182.36 8224.55 64.20 66.21 62.30 0.64
26 12400 15606.35 8725.44 64.23 66.68 61.95 0.64
27 12600 11960.69 7855.59 64.27 64.61 63.93 0.64
27 12800 12869.61 8011.05 63.80 66.58 61.23 0.64
28 13000 13938.21 8064.88 64.14 65.55 62.79 0.64
28 13200 12936.39 8126.91 65.23 66.64 63.87 0.65
29 13400 11387.84 7295.93 64.38 64.87 63.90 0.64
29 13600 15525.57 8512.57 64.52 66.23 62.89 0.65
30 13800 13474.02 8028.01 65.55 67.37 63.83 0.66
30 14000 16685.29 7827.30 64.15 64.61 63.70 0.64
31 14200 15312.08 7759.34 65.53 66.29 64.78 0.66
31 14400 16065.35 7711.75 64.03 65.93 62.24 0.64
32 14600 16316.15 7407.74 65.02 66.08 64.00 0.65
32 14800 16318.76 7667.86 64.97 66.60 63.41 0.65
33 15000 14086.54 7523.11 64.96 68.17 62.04 0.65
33 15200 16476.11 7485.34 64.86 67.14 62.73 0.65
34 15400 16635.40 7954.74 64.90 66.50 63.38 0.65
✔ Saved pipeline to output directory
model-last
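Since we trained with --output ., the best and last checkpoints are written to model-best and model-last in the current directory. A minimal sketch of running inference with the trained pipeline, reusing the sample sentence from the data-preparation step:

import spacy

nlp = spacy.load("model-best")  # or "model-last"
doc = nlp("胡建新经营着位于深圳市福田区华富街道田面社区深南中路4028号田面城市大厦19B-19C的公司。")
for ent in doc.ents:
    print(ent.text, ent.label_)

You can also re-score a saved pipeline on the validation set with python -m spacy evaluate model-best valid.spacy --gpu-id 0.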