2.2 Model Building and Training
Building the BERT Classification Model
Learning Objectives
- Understand how to implement a BERT-based classification model in code.
- Understand how to implement training, testing, and evaluation for the transfer-learning model.
Building the BERT Model
- Building the BERT model in this project takes three steps:
- Step 1: Write the two model classes
- Step 2: Write the training, testing, and evaluation functions
- Step 3: Write the main run script
Step 1: Write the two model classes
- BERT serves as the pre-trained model for transfer learning; the classes live in /home/ec2-user/toutiao/src/models/bert.py
- First, implement the Config class.
- Second, implement the Model class.
# coding: UTF-8
import torch
import torch.nn as nn
import os
from transformers import BertModel, BertTokenizer, BertConfig


class Config(object):
    def __init__(self, dataset):
        self.model_name = "bert"
        self.data_path = "/home/ec2-user/toutiao/data/data/"
        self.train_path = self.data_path + "train.txt"  # training set
        self.dev_path = self.data_path + "dev.txt"  # validation set
        self.test_path = self.data_path + "test.txt"  # test set
        self.class_list = [
            x.strip() for x in open(self.data_path + "class.txt").readlines()
        ]  # list of class names
        self.save_path = "/home/ec2-user/toutiao/src/saved_dict"
        if not os.path.exists(self.save_path):
            os.mkdir(self.save_path)
        self.save_path += "/" + self.model_name + ".pt"  # path of the saved model weights
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # device
        self.require_improvement = 1000  # stop early if no improvement for more than 1000 batches
        self.num_classes = len(self.class_list)  # number of classes
        self.num_epochs = 3  # number of epochs
        self.batch_size = 128  # mini-batch size
        self.pad_size = 32  # every sentence is padded or truncated to this length
        self.learning_rate = 5e-5  # learning rate
        self.bert_path = "/home/ec2-user/toutiao/data/bert_pretrain"
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.bert_config = BertConfig.from_pretrained(self.bert_path + "/bert_config.json")
        self.hidden_size = 768


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path, config=config.bert_config)
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        # the input batch: x = (token_ids, seq_len, mask)
        context = x[0]
        # attention mask over the padding positions, same size as the sentence,
        # with 0 at padded positions, e.g. [1, 1, 1, 1, 0, 0]
        mask = x[2]
        # this unpacking relies on BertModel returning a (sequence_output, pooled_output)
        # tuple; on transformers >= 4.x pass return_dict=False to keep this behavior
        _, pooled = self.bert(context, attention_mask=mask)
        out = self.fc(pooled)
        return out
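- Before wiring these classes into the training pipeline, they can be sanity-checked with a quick standalone forward pass. The snippet below is only a sketch: it assumes the pre-trained weights under /home/ec2-user/toutiao/data/bert_pretrain are in place and that the installed transformers version returns (sequence_output, pooled_output) tuples, as the forward method above expects. The dummy batch follows the (token_ids, seq_len, mask) layout that Model.forward consumes.
# Minimal smoke test for Config and Model (a sketch, run from /home/ec2-user/toutiao/src/).
import torch
from models.bert import Config, Model

config = Config("toutiao")
model = Model(config).to(config.device)
model.eval()

# Build one dummy batch in the layout Model.forward expects: (token_ids, seq_len, mask)
texts = ["股市迎来大涨", "新学期开学第一课"]
token_ids, seq_lens, masks = [], [], []
for text in texts:
    tokens = ["[CLS]"] + config.tokenizer.tokenize(text)
    ids = config.tokenizer.convert_tokens_to_ids(tokens)
    seq_len = min(len(ids), config.pad_size)
    ids = ids[:config.pad_size] + [0] * (config.pad_size - seq_len)  # pad/truncate to pad_size
    token_ids.append(ids)
    seq_lens.append(seq_len)
    masks.append([1] * seq_len + [0] * (config.pad_size - seq_len))  # 1 = real token, 0 = padding

x = (
    torch.tensor(token_ids).to(config.device),
    torch.tensor(seq_lens).to(config.device),
    torch.tensor(masks).to(config.device),
)
with torch.no_grad():
    logits = model(x)
print(logits.shape)  # expected: torch.Size([2, num_classes]), i.e. [2, 10] here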
Step 2: Write the training, testing, and evaluation functions
- All three functions live in one file: /home/ec2-user/toutiao/src/train_eval.py
- First, import the required packages.
- Second, write the training function.
- Third, write the testing function.
- Fourth, write the evaluation function.
# coding: UTF-8
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from utils import get_time_dif
from transformers.optimization import AdamW  # in newer transformers versions, torch.optim.AdamW can be used instead
from tqdm import tqdm
import math
import logging


def loss_fn(outputs, labels):
    return nn.CrossEntropyLoss()(outputs, labels)


def train(config, model, train_iter, dev_iter):
    start_time = time.time()
    # apply weight decay to all parameters except the bias and LayerNorm weights
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float("inf")
    last_improve = 0  # batch index of the last improvement of the validation loss
    flag = False  # whether training has gone too long without improvement
    model.train()
    for epoch in range(config.num_epochs):
        total_batch = 0  # reset the batch counter at the start of each epoch
        print("Epoch [{}/{}]".format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(tqdm(train_iter)):
            outputs = model(trains)
            model.zero_grad()
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 200 == 0 and total_batch != 0:
                # every 200 batches, report metrics on the training and validation sets
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = "*"
                    last_improve = total_batch
                else:
                    improve = ""
                time_dif = get_time_dif(start_time)
                msg = "Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}"
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                # switch back to training mode after the evaluation so parameters keep updating
                model.train()
            # increment the counter after every batch
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # stop training if the validation loss has not improved for over 1000 batches
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break


def test(config, model, test_iter):
    # model.load_state_dict(torch.load(config.save_path))
    # the line above must stay disabled when running inference with a quantized model
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = "Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}"
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def evaluate(config, model, data_iter, test=False):
    # disable this when running inference with a quantized model
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
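- train_eval.py imports get_time_dif from utils, which is not listed in this section. A minimal version consistent with how it is used above (it takes the start timestamp and returns an elapsed time that prints like 0:02:26) could look like the sketch below; the project's actual utils.py may differ in detail.
# A possible implementation of the get_time_dif helper imported from utils
# (a sketch only, consistent with how it is used in train_eval.py above).
import time
from datetime import timedelta

def get_time_dif(start_time):
    """Return the time elapsed since start_time, rounded to whole seconds."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))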
Step 3: Write the main run script
- The main script calls the functions above in turn to train and evaluate the model.
- Code location: /home/ec2-user/toutiao/src/run.py
import time
import torch
import numpy as np
from train_eval import train, test
from importlib import import_module
import argparse
from utils import build_dataset, build_iterator, get_time_dif

parser = argparse.ArgumentParser(description="Chinese Text Classification")
parser.add_argument("--model", type=str, required=True, help="choose a model: bert")
args = parser.parse_args()

if __name__ == "__main__":
    dataset = "toutiao"  # dataset name
    if args.model == "bert":
        model_name = "bert"
        x = import_module("models." + model_name)
        config = x.Config(dataset)
        np.random.seed(1)
        torch.manual_seed(1)
        torch.cuda.manual_seed_all(1)
        torch.backends.cudnn.deterministic = True  # make results reproducible across runs
        print("Loading data for Bert Model...")
        train_data, dev_data, test_data = build_dataset(config)
        train_iter = build_iterator(train_data, config)
        dev_iter = build_iterator(dev_data, config)
        test_iter = build_iterator(test_data, config)
        model = x.Model(config).to(config.device)
        train(config, model, train_iter, dev_iter)
        test(config, model, test_iter)
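- run.py also depends on build_dataset and build_iterator from utils, which are not listed in this section either. Judging from how Config and Model.forward consume the data, each line of train.txt / dev.txt / test.txt is assumed to hold a sentence and a label index separated by a tab, and every batch is assumed to arrive as ((token_ids, seq_len, mask), labels). The sketch below only illustrates that assumed contract; it is not the project's actual utils.py.
# Sketch of the assumed data format and batch contract (hypothetical helpers;
# the real build_dataset / build_iterator in utils.py are not shown here).
import torch

def load_examples(path, config):
    """Read one data file and turn every '<sentence>\t<label>' line into
    a (token_ids, label, seq_len, mask) tuple padded to config.pad_size."""
    examples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            sentence, label = line.split("\t")
            tokens = ["[CLS]"] + config.tokenizer.tokenize(sentence)
            ids = config.tokenizer.convert_tokens_to_ids(tokens)
            seq_len = min(len(ids), config.pad_size)
            ids = ids[:config.pad_size] + [0] * (config.pad_size - seq_len)
            mask = [1] * seq_len + [0] * (config.pad_size - seq_len)
            examples.append((ids, int(label), seq_len, mask))
    return examples

def to_batch(examples, config):
    """Assemble a list of examples into the ((token_ids, seq_len, mask), labels)
    tuple that Model.forward and the train/evaluate loops expect."""
    device = config.device
    token_ids = torch.LongTensor([e[0] for e in examples]).to(device)
    labels = torch.LongTensor([e[1] for e in examples]).to(device)
    seq_len = torch.LongTensor([e[2] for e in examples]).to(device)
    mask = torch.LongTensor([e[3] for e in examples]).to(device)
    return (token_ids, seq_len, mask), labels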
- Run the training from the command line:
cd /home/ec2-user/toutiao/src/
python run.py --model bert
- Output:
Loading data for Bert Model...
180000it [00:37, 4820.80it/s]
10000it [00:02, 4954.00it/s]
10000it [00:02, 4952.50it/s]
Epoch [1/3]
14%|█████████▉ | 200/1407 [02:06<13:26, 1.50it/s]Iter: 200, Train Loss: 0.3, Train Acc: 91.41%, Val Loss: 0.29, Val Acc: 90.86%, Time: 0:02:26 *
28%|███████████████████▉ | 400/1407 [04:44<11:46, 1.43it/s]Iter: 400, Train Loss: 0.34, Train Acc: 90.62%, Val Loss: 0.26, Val Acc: 92.10%, Time: 0:05:07 *
43%|█████████████████████████████▊ | 600/1407 [07:26<09:25, 1.43it/s]Iter: 600, Train Loss: 0.29, Train Acc: 91.41%, Val Loss: 0.25, Val Acc: 92.10%, Time: 0:07:49 *
57%|███████████████████████████████████████▊ | 800/1407 [10:08<07:06, 1.42it/s]Iter: 800, Train Loss: 0.15, Train Acc: 94.53%, Val Loss: 0.22, Val Acc: 92.85%, Time: 0:10:31 *
71%|█████████████████████████████████████████████████ | 1000/1407 [12:50<04:43, 1.44it/s]Iter: 1000, Train Loss: 0.17, Train Acc: 94.53%, Val Loss: 0.22, Val Acc: 93.00%, Time: 0:13:10
No optimization for a long time, auto-stopping...
Test Loss: 0.2, Test Acc: 93.64%
Precision, Recall and F1-Score...
precision recall f1-score support
finance 0.9246 0.9320 0.9283 1000
realty 0.9484 0.9370 0.9427 1000
stocks 0.8787 0.8980 0.8882 1000
education 0.9511 0.9730 0.9619 1000
science 0.9236 0.8950 0.9091 1000
society 0.9430 0.9270 0.9349 1000
politics 0.9267 0.9100 0.9183 1000
sports 0.9780 0.9780 0.9780 1000
game 0.9514 0.9600 0.9557 1000
entertainment 0.9390 0.9540 0.9464 1000
accuracy 0.9364 10000
macro avg 0.9365 0.9364 0.9364 10000
weighted avg 0.9365 0.9364 0.9364 10000
Confusion Matrix...
[[932 10 37 2 5 5 7 1 1 0]
[ 13 937 11 2 4 10 5 5 5 8]
[ 49 12 898 1 19 1 15 0 2 3]
[ 1 1 0 973 0 8 7 0 1 9]
[ 4 4 28 7 895 10 12 2 27 11]
[ 2 8 4 16 5 927 18 1 5 14]
[ 3 8 34 12 9 19 910 0 0 5]
[ 2 3 2 1 1 1 4 978 1 7]
[ 0 2 4 0 24 1 3 1 960 5]
[ 2 3 4 9 7 1 1 12 7 954]]
Time usage: 0:00:19
71%|█████████████████████████████████████████████████ | 1000/1407 [13:29<05:29, 1.24it/s]
- Conclusion: The BERT model reaches Test Acc: 93.64% on the test set. Compared with the best fasttext result from Chapter 1, 91.93%, that is a gain of 1.71 percentage points, which can fairly be called a significant improvement!
Section Summary
- This section implemented the BERT-based classification model for the 投满分 project and completed its training, testing, and evaluation.