2.2 Model Building and Training
Building the BERT Classification Model
Learning Objectives
- Understand how to implement a BERT-based classification model in code.
- Understand how to implement training, testing, and evaluation for the transfer-learning model.
Building the BERT Model
- Building the BERT model in this project takes three steps:
- Step 1: Write the two model classes
- Step 2: Write the training, testing, and evaluation functions
- Step 3: Write the main run script
Step 1: Write the two model classes
- BERT serves as the pre-trained model for transfer learning; the classes live in /home/ec2-user/toutiao/src/models/bert.py
- First, implement the Config class.
- Second, implement the Model class.
# coding: UTF-8
import torch
import torch.nn as nn
import os
from transformers import BertModel, BertTokenizer, BertConfig


class Config(object):
    def __init__(self, dataset):
        self.model_name = "bert"
        self.data_path = "/home/ec2-user/toutiao/data/data/"
        self.train_path = self.data_path + "train.txt"  # training set
        self.dev_path = self.data_path + "dev.txt"  # validation set
        self.test_path = self.data_path + "test.txt"  # test set
        self.class_list = [
            x.strip() for x in open(self.data_path + "class.txt").readlines()
        ]  # list of class names
        self.save_path = "/home/ec2-user/toutiao/src/saved_dict"
        if not os.path.exists(self.save_path):
            os.mkdir(self.save_path)
        self.save_path += "/" + self.model_name + ".pt"  # path of the saved model weights
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # device
        self.require_improvement = 1000  # stop early if no improvement for more than 1000 batches
        self.num_classes = len(self.class_list)  # number of classes
        self.num_epochs = 3  # number of epochs
        self.batch_size = 128  # mini-batch size
        self.pad_size = 32  # every sentence is padded or truncated to this length
        self.learning_rate = 5e-5  # learning rate
        self.bert_path = "/home/ec2-user/toutiao/data/bert_pretrain"
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.bert_config = BertConfig.from_pretrained(self.bert_path + "/bert_config.json")
        self.hidden_size = 768


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path, config=config.bert_config)
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        # the input batch: x = (token_ids, seq_len, mask)
        context = x[0]
        # attention mask over the padding positions, same size as the sentence,
        # with 0 at padded positions, e.g. [1, 1, 1, 1, 0, 0]
        mask = x[2]
        # this unpacking relies on BertModel returning a (sequence_output, pooled_output)
        # tuple; on transformers >= 4.x pass return_dict=False to keep this behavior
        _, pooled = self.bert(context, attention_mask=mask)
        out = self.fc(pooled)
        return out
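- Before wiring these classes into the training pipeline, they can be sanity-checked with a quick standalone forward pass. The snippet below is only a sketch: it assumes the pre-trained weights under /home/ec2-user/toutiao/data/bert_pretrain are in place and that the installed transformers version returns (sequence_output, pooled_output) tuples, as the forward method above expects. The dummy batch follows the (token_ids, seq_len, mask) layout that Model.forward consumes.
# Minimal smoke test for Config and Model (a sketch, run from /home/ec2-user/toutiao/src/).
import torch
from models.bert import Config, Model

config = Config("toutiao")
model = Model(config).to(config.device)
model.eval()

# Build one dummy batch in the layout Model.forward expects: (token_ids, seq_len, mask)
texts = ["股市迎来大涨", "新学期开学第一课"]
token_ids, seq_lens, masks = [], [], []
for text in texts:
    tokens = ["[CLS]"] + config.tokenizer.tokenize(text)
    ids = config.tokenizer.convert_tokens_to_ids(tokens)
    seq_len = min(len(ids), config.pad_size)
    ids = ids[:config.pad_size] + [0] * (config.pad_size - seq_len)  # pad/truncate to pad_size
    token_ids.append(ids)
    seq_lens.append(seq_len)
    masks.append([1] * seq_len + [0] * (config.pad_size - seq_len))  # 1 = real token, 0 = padding

x = (
    torch.tensor(token_ids).to(config.device),
    torch.tensor(seq_lens).to(config.device),
    torch.tensor(masks).to(config.device),
)
with torch.no_grad():
    logits = model(x)
print(logits.shape)  # expected: torch.Size([2, num_classes]), i.e. [2, 10] here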
Step 2: Write the training, testing, and evaluation functions
- All three functions live in one file: /home/ec2-user/toutiao/src/train_eval.py
- First, import the required packages.
- Second, write the training function.
- Third, write the testing function.
- Fourth, write the evaluation function.
# coding: UTF-8
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from utils import get_time_dif
from transformers.optimization import AdamW  # in newer transformers versions, torch.optim.AdamW can be used instead
from tqdm import tqdm
import math
import logging


def loss_fn(outputs, labels):
    return nn.CrossEntropyLoss()(outputs, labels)


def train(config, model, train_iter, dev_iter):
    start_time = time.time()
    # apply weight decay to all parameters except the bias and LayerNorm weights
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float("inf")
    last_improve = 0  # batch index of the last improvement of the validation loss
    flag = False  # whether training has gone too long without improvement
    model.train()
    for epoch in range(config.num_epochs):
        total_batch = 0  # reset the batch counter at the start of each epoch
        print("Epoch [{}/{}]".format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(tqdm(train_iter)):
            outputs = model(trains)
            model.zero_grad()
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 200 == 0 and total_batch != 0:
                # every 200 batches, report metrics on the training and validation sets
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = "*"
                    last_improve = total_batch
                else:
                    improve = ""
                time_dif = get_time_dif(start_time)
                msg = "Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}"
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                # switch back to training mode after the evaluation so parameters keep updating
                model.train()
            # increment the counter after every batch
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # stop training if the validation loss has not improved for over 1000 batches
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break


def test(config, model, test_iter):
    # model.load_state_dict(torch.load(config.save_path))
    # the line above must stay disabled when running inference with a quantized model
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = "Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}"
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def evaluate(config, model, data_iter, test=False):
    # disable this when running inference with a quantized model
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
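- train_eval.py imports get_time_dif from utils, which is not listed in this section. A minimal version consistent with how it is used above (it takes the start timestamp and returns an elapsed time that prints like 0:02:26) could look like the sketch below; the project's actual utils.py may differ in detail.
# A possible implementation of the get_time_dif helper imported from utils
# (a sketch only, consistent with how it is used in train_eval.py above).
import time
from datetime import timedelta

def get_time_dif(start_time):
    """Return the time elapsed since start_time, rounded to whole seconds."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))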
Step 3: Write the main run script
- The main script calls the functions above in turn to train and evaluate the model.
- Code location: /home/ec2-user/toutiao/src/run.py
import time
import torch
import numpy as np
from train_eval import train, test
from importlib import import_module
import argparse
from utils import build_dataset, build_iterator, get_time_dif

parser = argparse.ArgumentParser(description="Chinese Text Classification")
parser.add_argument("--model", type=str, required=True, help="choose a model: bert")
args = parser.parse_args()

if __name__ == "__main__":
    dataset = "toutiao"  # dataset name
    if args.model == "bert":
        model_name = "bert"
        x = import_module("models." + model_name)
        config = x.Config(dataset)
        np.random.seed(1)
        torch.manual_seed(1)
        torch.cuda.manual_seed_all(1)
        torch.backends.cudnn.deterministic = True  # make results reproducible across runs
        print("Loading data for Bert Model...")
        train_data, dev_data, test_data = build_dataset(config)
        train_iter = build_iterator(train_data, config)
        dev_iter = build_iterator(dev_data, config)
        test_iter = build_iterator(test_data, config)
        model = x.Model(config).to(config.device)
        train(config, model, train_iter, dev_iter)
        test(config, model, test_iter)
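- run.py also depends on build_dataset and build_iterator from utils, which are not listed in this section either. Judging from how Config and Model.forward consume the data, each line of train.txt / dev.txt / test.txt is assumed to hold a sentence and a label index separated by a tab, and every batch is assumed to arrive as ((token_ids, seq_len, mask), labels). The sketch below only illustrates that assumed contract; it is not the project's actual utils.py.
# Sketch of the assumed data format and batch contract (hypothetical helpers;
# the real build_dataset / build_iterator in utils.py are not shown here).
import torch

def load_examples(path, config):
    """Read one data file and turn every '<sentence>\t<label>' line into
    a (token_ids, label, seq_len, mask) tuple padded to config.pad_size."""
    examples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            sentence, label = line.split("\t")
            tokens = ["[CLS]"] + config.tokenizer.tokenize(sentence)
            ids = config.tokenizer.convert_tokens_to_ids(tokens)
            seq_len = min(len(ids), config.pad_size)
            ids = ids[:config.pad_size] + [0] * (config.pad_size - seq_len)
            mask = [1] * seq_len + [0] * (config.pad_size - seq_len)
            examples.append((ids, int(label), seq_len, mask))
    return examples

def to_batch(examples, config):
    """Assemble a list of examples into the ((token_ids, seq_len, mask), labels)
    tuple that Model.forward and the train/evaluate loops expect."""
    device = config.device
    token_ids = torch.LongTensor([e[0] for e in examples]).to(device)
    labels = torch.LongTensor([e[1] for e in examples]).to(device)
    seq_len = torch.LongTensor([e[2] for e in examples]).to(device)
    mask = torch.LongTensor([e[3] for e in examples]).to(device)
    return (token_ids, seq_len, mask), labels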
- Run the training from the command line:
cd /home/ec2-user/toutiao/src/
python run.py --model bert
- Output:
Loading data for Bert Model...
180000it [00:37, 4820.80it/s]
10000it [00:02, 4954.00it/s]
10000it [00:02, 4952.50it/s]
Epoch [1/3]
14%|█████████▉ | 200/1407 [02:06<13:26, 1.50it/s]Iter: 200, Train Loss: 0.3, Train Acc: 91.41%, Val Loss: 0.29, Val Acc: 90.86%, Time: 0:02:26 *
28%|███████████████████▉ | 400/1407 [04:44<11:46, 1.43it/s]Iter: 400, Train Loss: 0.34, Train Acc: 90.62%, Val Loss: 0.26, Val Acc: 92.10%, Time: 0:05:07 *
43%|█████████████████████████████▊ | 600/1407 [07:26<09:25, 1.43it/s]Iter: 600, Train Loss: 0.29, Train Acc: 91.41%, Val Loss: 0.25, Val Acc: 92.10%, Time: 0:07:49 *
57%|███████████████████████████████████████▊ | 800/1407 [10:08<07:06, 1.42it/s]Iter: 800, Train Loss: 0.15, Train Acc: 94.53%, Val Loss: 0.22, Val Acc: 92.85%, Time: 0:10:31 *
71%|█████████████████████████████████████████████████ | 1000/1407 [12:50<04:43, 1.44it/s]Iter: 1000, Train Loss: 0.17, Train Acc: 94.53%, Val Loss: 0.22, Val Acc: 93.00%, Time: 0:13:10
No optimization for a long time, auto-stopping...
Test Loss: 0.2, Test Acc: 93.64%
Precision, Recall and F1-Score...
precision recall f1-score support
finance 0.9246 0.9320 0.9283 1000
realty 0.9484 0.9370 0.9427 1000
stocks 0.8787 0.8980 0.8882 1000
education 0.9511 0.9730 0.9619 1000
science 0.9236 0.8950 0.9091 1000
society 0.9430 0.9270 0.9349 1000
politics 0.9267 0.9100 0.9183 1000
sports 0.9780 0.9780 0.9780 1000
game 0.9514 0.9600 0.9557 1000
entertainment 0.9390 0.9540 0.9464 1000
accuracy 0.9364 10000
macro avg 0.9365 0.9364 0.9364 10000
weighted avg 0.9365 0.9364 0.9364 10000
Confusion Matrix...
[[932 10 37 2 5 5 7 1 1 0]
[ 13 937 11 2 4 10 5 5 5 8]
[ 49 12 898 1 19 1 15 0 2 3]
[ 1 1 0 973 0 8 7 0 1 9]
[ 4 4 28 7 895 10 12 2 27 11]
[ 2 8 4 16 5 927 18 1 5 14]
[ 3 8 34 12 9 19 910 0 0 5]
[ 2 3 2 1 1 1 4 978 1 7]
[ 0 2 4 0 24 1 3 1 960 5]
[ 2 3 4 9 7 1 1 12 7 954]]
Time usage: 0:00:19
71%|█████████████████████████████████████████████████ | 1000/1407 [13:29<05:29, 1.24it/s]
- Conclusion: The BERT model reaches Test Acc: 93.64% on the test set. Compared with the best fasttext result from Chapter 1, 91.93%, that is a gain of 1.71 percentage points, which can fairly be called a significant improvement!
Section Summary
- This section implemented the BERT-based classification model for the 投满分 project and completed its training, testing, and evaluation.