Hugging Face Model Fine-Tuning - Named Entity Recognition

Task Objectives

1. Continue training HIT's RoBERTa model, already pre-trained on the MLM task, on a text dataset for the target task.

2. Enable it to perform named entity recognition on downstream text (domain-specific entities such as area, price, house type, etc.).

Loading the Model

Model download address: only the model-related files need to be downloaded, i.e. config.json, pytorch_model.bin, and vocab.txt.
num_labels in the final layer is the total number of entity tag classes. Each entity type has two tags: one marks the start of the entity (the B- tag), and the other marks the rest of it, middle and end (the I- tag). In this article's design, the I- tag's id is the B- tag's id + 1. With five entity types plus the O tag (id 0), num_labels = 5 × 2 + 1 = 11.
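Concretely, with the five entity types used later in this article, the implied tag-to-id scheme can be sketched as follows (the id2label mapping here is illustrative, not part of the original code):

# 0 is reserved for the O (non-entity) tag; each entity type then takes two
# consecutive ids: B-<type> at an odd id, I-<type> at that id + 1
entity_types = ["VALUE.AREA", "VALUE.PRICE", "VALUE.SCALE", "TYPE.HOUSE", "COMM.NAME"]
id2label = {0: "O"}
for k, name in enumerate(entity_types):
    id2label[2 * k + 1] = "B-" + name
    id2label[2 * k + 2] = "I-" + name
print(len(id2label))  # 11, matching num_labels below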

from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification

model_path = 'D:/Models/chinese-roberta-wwm-ext'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path, num_labels=11)  # 11 = 5 entity types * 2 (B/I) + 1 for O
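As a quick sanity check (a sketch, assuming the model files are already in model_path), the classification head should emit one 11-way logit vector per token:

# hypothetical smoke test: one 11-dim logit vector per input token
enc = tokenizer("现在住欧洲花园", return_tensors="pt")
out = model(**enc)
print(out.logits.shape)  # torch.Size([1, seq_len, 11])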

Building the Training Data

The training data for this article follows the structure of the dataset at the download address; the internal data format is shown below, and further details can be found in the readme file at the download link.

现	O
在	O
住	O
欧	B-COMM.NAME
洲	I-COMM.NAME
花	I-COMM.NAME
园	I-COMM.NAME
1	B-VALUE.AREA
6	I-VALUE.AREA
0	I-VALUE.AREA
左	O
右	O
房	O
子	O
也	O
重	O
新	O
装	O
修	O
了	O

Here labeldic stores the start-tag ids of the entities. For example, the start tag of VALUE.AREA (B-VALUE.AREA) is 1, so its middle/end tag (I-VALUE.AREA) is 2.

import numpy as np
import re

# start-tag id for each entity type; the I- tag of a type is its B- id + 1
labeldic = {
    "VALUE.AREA": 1,
    "VALUE.PRICE": 3,
    "VALUE.SCALE": 5,
    "TYPE.HOUSE": 7,
    "COMM.NAME": 9
}

def get_train_data(file_path, col_num):
    max_lenth = 100
    data = []
    label = []
    num = 1
    with open(file_path, "r", encoding="utf-8") as f:
        content = ''
        mask = [0] * max_lenth
        for line in f.readlines():
            if num > col_num:
                break
            if line == '\n':  # a blank line marks the end of a sentence
                num += 1
                data.append(content.replace('\n', ''))
                content = ''
                label.append(mask)
                mask = [0] * max_lenth
                continue
            items = line.replace('\n', '').split('\t')
            if len(content) < max_lenth:
                content += items[0]
                if items[1] != 'O':
                    # entity_type = re.search("\-([A-Z]*\.)", items[1]).group()[1:-1]
                    entity_type = items[1].split('-')[1]
                    if entity_type in labeldic.keys():
                        if items[1][0] == 'B':
                            mask[len(content) - 1] = labeldic[entity_type]
                        else:
                            mask[len(content) - 1] = labeldic[entity_type] + 1
    return data, label

def padding_label(label, maxlen=100):
    label.insert(0, 0)  # prepend 0 for the [CLS] position
    if len(label) > maxlen:
        return label[:maxlen]
    else:
        label += [0] * (maxlen - len(label))
        return label


data = get_train_data("./file/beizhu_ner_train.txt", 1000)

After preprocessing, the training samples held in the data variable look like this:

(['现在住欧洲花园160左右房子也重新装修了', ...],
[[0, 0, 0, 9, 10, 10, 10, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],...])
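These label lists come straight from get_train_data; padding_label is applied in the dataloader step below, prepending a 0 so the labels line up with the [CLS] token that the tokenizer adds. A quick illustration with a short, made-up label list:

# illustrative values only: a 7-tag list gains a leading 0 for [CLS],
# then is right-padded with 0 up to length 100
print(padding_label([9, 10, 10, 10, 1, 2, 2])[:10])
# [0, 9, 10, 10, 10, 1, 2, 2, 0, 0]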

Defining the Optimizer and Learning Rate

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16
# tokenize to a fixed length of 100; the tokenizer prepends [CLS], which is why
# padding_label inserts a 0 at position 0 of each label list
train_data = tokenizer(data[0], padding="max_length", max_length=100, truncation=True, return_tensors="pt")
train_label = [padding_label(i) for i in data[1]]
train = TensorDataset(train_data["input_ids"], train_data["attention_mask"], torch.tensor(train_label))
train_sampler = RandomSampler(train)
train_dataloader = DataLoader(train, sampler=train_sampler, batch_size=batch_size)

from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-4)

from transformers import get_scheduler

num_epochs = 12
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
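For intuition, the "linear" schedule with zero warmup simply decays the learning rate from 1e-4 to 0 over num_training_steps. A minimal standalone sketch (dummy parameter, shortened step count):

# standalone sketch: watch the linear decay on a dummy optimizer
p = torch.nn.Parameter(torch.zeros(1))
opt = AdamW([p], lr=1e-4)
sched = get_scheduler(name="linear", optimizer=opt, num_warmup_steps=0, num_training_steps=100)
for step in range(100):
    opt.step()
    sched.step()
    if step % 25 == 0:
        print(step, sched.get_last_lr())  # lr shrinks linearly toward 0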

Model Training

from tqdm import tqdm

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
for epoch in range(num_epochs):
    total_loss = 0
    model.train()
    with tqdm(list(enumerate(train_dataloader)), ncols=100) as _tqdm:
        for step, batch in _tqdm:
            _tqdm.set_description('epoch {}/{}'.format(epoch + 1, num_epochs))
            if not step == 0:
                cur_loss = total_loss / (step * batch_size)
                avg_train_loss = total_loss / len(train_dataloader)
                _tqdm.set_postfix(loss=cur_loss, avg_loss=avg_train_loss)
                _tqdm.update(1)
            else:
                _tqdm.set_postfix(loss=0.00000)
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
            optimizer.step()
            lr_scheduler.step()
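After training, the fine-tuned weights can be persisted with the standard Hugging Face API (a sketch; the output directory is illustrative):

# save_dir is a hypothetical path; save_pretrained writes config + weights
save_dir = "./models/roberta-ner-finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)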

Model Prediction

import numpy as np

def get_entity(model, sen, flag='run'):
    res = {}
    sen_input = '[CLS]' + sen
    test = tokenizer(sen_input, return_tensors="pt", padding="max_length", max_length=100)
    test.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(test["input_ids"],
                        token_type_ids=None,
                        attention_mask=test["attention_mask"])
    logits = outputs["logits"].cpu()
    pred_flat = np.argmax(logits, axis=2).numpy().squeeze()
    if flag == 'debug':
        print(pred_flat, len(pred_flat))
    try:
        # res_index collects, per entity type, the token positions of predicted B- tags
        res_index = {}
        for label in labeldic.keys():
            hits = [index for index, value in enumerate(pred_flat[:len(sen) + 1]) if value == labeldic[label]]
            if hits:
                res_index[label] = hits
                res[label] = []
        if flag == 'debug':
            print(res_index)
        for label, indexs in res_index.items():
            for i in indexs:
                j = i
                while True:
                    if pred_flat[j + 1] != labeldic[label] + 1:  # entity ends where the I- tags stop
                        if i != j and sen[i-2:j-1] != '':  # exclude single characters
                            # token position k maps to character sen[k-2]: the tokenizer's
                            # [CLS] plus the literal '[CLS]' prefix shift indices by 2
                            res[label].append(sen[i-2:j-1])
                        break
                    j += 1
    except:
        pass

    import copy
    labels = copy.deepcopy(res)
    for item in res.items():  # drop entity types with no extracted span
        if item[1]:
            continue
        else:
            labels.pop(item[0])
    return labels

get_entity(model, "客户本人住在新建城有一套70平米房子", 'debug')

Output: the array is the raw per-token prediction (length 100), the dict on the next line is res_index (the positions where B- tags were predicted), and the final line is the returned result. The COMM.NAME hit at position 8 yields only a single character, so it is removed by the single-character filter and dropped from the final result.

[0 0 0 0 0 0 0 0 9 0 0 0 0 0 1 2 2 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 0 0 0 0 0
1 1 1 0 0 0 1 1 2 2 0 0 0 0 0 0 1 0 0 0 0 1 1 2 0 0 0 0 0 0 1 1 0 0 0 1 1
1 2 2 1 2 0 0 0 0 1 1 0 0 1 1 1 1 2 2 0 0 0 0 0 1 1] 100
{'VALUE.AREA': [14], 'COMM.NAME': [8]}
{'VALUE.AREA': ['70平']}
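For easier debugging, the raw id array can be mapped back to tag names using the illustrative id2label mapping sketched earlier; debug_tags below is a hypothetical helper, not part of the original code:

def debug_tags(sen, pred_flat):
    # skip the two leading [CLS] positions so tags align with characters
    return list(zip(sen, [id2label.get(int(t), "O") for t in pred_flat[2:len(sen) + 2]]))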