Using the BERT model

Models in Hugging Face

The basic structure of a model as defined in Hugging Face looks like this:

from typing import Any, Callable

def _forward_unimplemented(self, *input: Any) -> None:
    raise NotImplementedError(f"Module [{type(self).__name__}] is missing the required \"forward\" function")

class Module:
    def __init__(self) -> None:
        self.basename = "BaseModel"

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        print(f"now you call __call__,params={args}")
        self.forward(args[0])

    # forward is a placeholder attribute; subclasses are expected to override it
    forward: Callable[..., Any] = _forward_unimplemented


class MyModel(Module):
    def __init__(self, name) -> None:
        super().__init__()
        self.name = name

    def forward(self, x):
        print(f"now you call forward,param={x}")

model = MyModel("bert")
model([1,2,3,4])
# now you call __call__,params=([1, 2, 3, 4],)
# now you call forward,param=[1, 2, 3, 4]
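
The real torch.nn.Module follows the same pattern: calling the model object invokes __call__, which (after running any hooks) dispatches to forward. A minimal sketch for comparison:

import torch
import torch.nn as nn

class Tiny(nn.Module):
    def forward(self, x):
        return x * 2

m = Tiny()
print(m(torch.tensor([1.0, 2.0])))  # nn.Module.__call__ dispatches to Tiny.forward
# tensor([2., 4.])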

Loading the BERT model

from typing import Any, Callable
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
print('tokenizer type=', type(tokenizer))
model = AutoModel.from_pretrained("google-bert/bert-base-uncased")

# Run inference with the loaded BERT model
inputs = tokenizer("你是个好人", return_tensors='pt')  # return_tensors can be 'pt' (PyTorch) or 'tf' (TensorFlow)
output = model(**inputs)  # forward pass; returns tensors
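
The returned object bundles the encoder outputs. A minimal sketch of inspecting it (shapes assume bert-base-uncased with hidden size 768):

print(type(output))                    # BaseModelOutputWithPoolingAndCrossAttentions
print(output.last_hidden_state.shape)  # torch.Size([1, seq_len, 768]), one vector per input token
print(output.pooler_output.shape)      # torch.Size([1, 768]), pooled [CLS] representation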

Tokenization with the BERT tokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
# Default Hugging Face download location: ~/.cache/huggingface/hub; change it by setting the environment variable, e.g. export HF_HOME="<target dir>"
print("vocab size:", tokenizer.vocab_size)
# vocab size: 30522, matching the number of lines in vocab.txt
text = "the game has gone!unaffable I have a new GPU!"
tokens = tokenizer.tokenize(text)
print(f"English tokenization, text:{text}, tokens:{tokens}")
# English tokenization, text:the game has gone!unaffable I have a new GPU!, tokens:['the', 'game', 'has', 'gone', '!', 'una', '##ffa', '##ble', 'i', 'have', 'a', 'new', 'gp', '##u', '!']
text = "我爱北京天安门,吢吣"
tokens = tokenizer.tokenize(text)
print(f"Chinese tokenization, text:{text}, tokens:{tokens}")
# Chinese tokenization, text:我爱北京天安门,吢吣, tokens:['我', '[UNK]', '北', '京', '天', '安', '[UNK]', ',', '[UNK]', '[UNK]']
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("token-to-id conversion:", input_ids)
# token-to-id conversion: [1855, 100, 1781, 1755, 1811, 1820, 100, 1989, 100, 100]
sen_code = tokenizer.encode_plus("i like you much", "but not him")
print("sentence-pair encode:", sen_code)
# sentence-pair encode: {'input_ids': [101, 1045, 2066, 2017, 2172, 102, 2021, 2025, 2032, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
print("decode:", tokenizer.decode(sen_code['input_ids']))
# decode: [CLS] i like you much [SEP] but not him [SEP]
inputs = tokenizer("你好", return_tensors="pt")
print(f'tokenizer("你好")={inputs}')
# tokenizer("你好")={'input_ids': tensor([[101, 100, 100, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
inputs = tokenizer(["你好吗","。不好的。哈哈"], padding=True, truncation=True, max_length=128, return_tensors="pt")
# Multiple sentences are padded to the length of the longest one in the batch and truncated to at most 128 tokens
print(f'tokenizer("你好吗。不好的。哈哈")={inputs}')
# tokenizer("你好吗。不好的。哈哈")={'input_ids': tensor([[ 101, 100, 100, 100, 102, 0, 0, 0, 0],
#                                  [ 101, 1636, 1744, 100, 1916, 1636, 100, 100, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
#                                  [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0],
#                                  [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

The BERT tokenizer defines several special tokens; their meanings are as follows (a short inspection sketch follows the list):

  • [PAD]: used to align sequence lengths within a batch; shorter sequences are padded at the end with [PAD] so that all sequences have the same length.
  • [CLS]: added at the beginning of the input sequence; its final hidden state serves as a sequence-level representation, e.g. for classification.
  • [SEP]: separates two sentences, e.g. when two sentences are concatenated into a single input sequence for a sentence-pair task.
  • [UNK]: represents unknown or out-of-vocabulary words; when the tokenizer encounters a token it cannot recognize, it replaces it with this token.
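
A minimal sketch for inspecting these tokens on the tokenizer loaded above (the ids shown are those of bert-base-uncased):

print(tokenizer.special_tokens_map)
# includes [UNK], [SEP], [PAD], [CLS] and [MASK]
print(tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id, tokenizer.unk_token_id)
# 101 102 0 100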

From AutoTokenizer to the BERT tokenizer

For the BERT model, the downloaded model files are laid out as follows:

# .cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/
.
├── config.json -> ../../blobs/45a2321a7ecfdaaf60a6c1fd7f5463994cc8907d
├── model.safetensors -> ../../blobs/68d45e234eb4a928074dfd868cead0219ab85354cc53d20e772753c6bb9169d3
├── tokenizer_config.json -> ../../blobs/e5c73d8a50df1f56fb5b0b8002d7cf4010afdccb
├── tokenizer.json -> ../../blobs/949a6f013d67eb8a5b4b5b46026217b888021b88
└── vocab.txt -> ../../blobs/fb140275c155a9c7c5a3b3e0e77a9e839594a938

After tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") is called, from_pretrained first calls get_tokenizer_config to read the contents of tokenizer_config.json:

# tokenizer_config.json
{"do_lower_case": true, "model_max_length": 512}
# Many third-party models specify their tokenizer here, e.g. tokenizer_class="QWenTokenizer" or "auto_map": {"AutoTokenizer": "tokenization_qwen.QWenTokenizer"}
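
For illustration, the same helper can be called directly. Note that get_tokenizer_config lives in transformers.models.auto.tokenization_auto and is an internal API, so this sketch may need adjusting across versions:

from transformers.models.auto.tokenization_auto import get_tokenizer_config

cfg = get_tokenizer_config("google-bert/bert-base-uncased")
print(cfg.get("tokenizer_class"), cfg.get("do_lower_case"))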

from_pretrained then looks for tokenizer_class in tokenizer_config.json; if it is not present, it falls back to reading config.json:

# config.json
{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.6.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

As you can see, model_type here identifies the model type as bert. As long as either tokenizer_class or model_type is present, the matching tokenizer can be looked up in the built-in tokenizer table, and PreTrainedTokenizerBase(PreTrainedTokenizerFast).from_pretrained then constructs a tokenizer from the vocabulary vocab.txt together with tokenizer.json and tokenizer_config.json. (BertTokenizer and the other tokenizer classes all inherit from PreTrainedTokenizerBase.)

TOKENIZER_MAPPING_NAMES = OrderedDict(
    [
        (
            "albert",
            (
                "AlbertTokenizer" if is_sentencepiece_available() else None,
                "AlbertTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("bart", ("BartTokenizer", "BartTokenizerFast")),
        (
            "barthez",
            (
                "BarthezTokenizer" if is_sentencepiece_available() else None,
                "BarthezTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        ("bartpho", ("BartphoTokenizer", None)),
        ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
        ("bert-japanese", ("BertJapaneseTokenizer", None)),
        ("bertweet", ("BertweetTokenizer", None)),
        (
            "big_bird",
            (
                "BigBirdTokenizer" if is_sentencepiece_available() else None,
                "BigBirdTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        # ...
    ]
)

Note:
tokenizer.json is the file the Hugging Face tokenizers library uses to store a pretrained tokenizer's configuration and vocabulary: the vocabulary itself plus tokenizer settings such as the special tokens ([CLS], [SEP], [PAD]), the vocabulary size, and whether input is lower-cased.
It matters when fine-tuning a pretrained model (BERT, GPT-2, and so on), because inputs must be processed with the same tokenizer that was used during pretraining; it is loaded automatically when you call from_pretrained.
When building your own tokenizer, however, you usually do not touch tokenizer.json directly; instead you combine a base tokenizer class (BertTokenizer, GPT2Tokenizer, ...) with your own vocabulary.
In short, tokenizer.json stores the configuration of a pretrained tokenizer; it is used when loading pretrained models, but typically not when building a custom tokenizer.
Within the transformers library, tokenizer.json is used mainly by the fast tokenizers. Fast tokenizers are written in Rust, perform better, and offer more features; they can be used standalone through the tokenizers library or via transformers.

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
print('tokenizer type=',type(tokenizer))
# tokenizer type= <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>
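
The resolved class can also be chosen or loaded explicitly. A sketch assuming the same checkpoint (use_fast defaults to True):

from transformers import AutoTokenizer, BertTokenizerFast

slow = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", use_fast=False)
print(type(slow))  # <class 'transformers.models.bert.tokenization_bert.BertTokenizer'>
fast = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
print(type(fast))  # the same BertTokenizerFast that AutoTokenizer resolved above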

From AutoModel to BertModel

So how does AutoModel find BertModel?

  1. Inside the AutoModel class, the model mapping is initialized as MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES), which maps every built-in model type to its config class and model class:
CONFIG_MAPPING_NAMES = OrderedDict(
    [
        # Add configs here
        ("albert", "AlbertConfig"),
        ("align", "AlignConfig"),
        ("altclip", "AltCLIPConfig"),
        ("audio-spectrogram-transformer", "ASTConfig"),
        ("autoformer", "AutoformerConfig"),
        ("bark", "BarkConfig"),
        ("bart", "BartConfig"),
        ("beit", "BeitConfig"),
        ("bert", "BertConfig"),
        # ......
    ]
)
MODEL_MAPPING_NAMES = OrderedDict(
    [
        # Base model mapping
        ("albert", "AlbertModel"),
        ("align", "AlignModel"),
        ("altclip", "AltCLIPModel"),
        ("audio-spectrogram-transformer", "ASTModel"),
        ("autoformer", "AutoformerModel"),
        ("bark", "BarkModel"),
        ("bart", "BartModel"),
        ("beit", "BeitModel"),
        ("bert", "BertModel"),
        # ......
    ]
)

  2. AutoModel.from_pretrained first fetches config.json from google-bert/bert-base-uncased and uses AutoConfig.from_pretrained to build an object of type <class 'transformers.models.bert.configuration_bert.BertConfig'>. It does this by reading model_type ("bert") from config.json and looking up the corresponding config class in CONFIG_MAPPING_NAMES.
  3. Given the resulting BertConfig, the concrete model class is looked up in model_mapping, namely BertModel (see the sketch below).
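
A minimal sketch of this resolution, assuming the checkpoint is cached locally or can be downloaded:

from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
print(type(config))  # <class 'transformers.models.bert.configuration_bert.BertConfig'>
model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
print(type(model))   # <class 'transformers.models.bert.modeling_bert.BertModel'>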

PyTorch primer: a BERT classification example

Dataset

Data source: the Toutiao (今日头条) news client. Size: 382,688 samples spread over 15 categories. Each line is one sample, with fields separated by _!_: news ID, category code (see below), category name (see below), news title string (titles only), and news keywords. The raw data looks like this:

tail -n 10 toutiao_cat_data.txt
# 6554596645437178371_!_104_!_news_finance_!_百万亏损中悟出的交易之道_!_齐威王,田忌赛马,忌讳,同龄人,交易者,好朋友,做交易,股民,股票市场,田忌
# 6554627652047602190_!_107_!_news_car_!_这款合资车曾比朗逸还火 现在却成功“跳楼” 售价仅为8万!_!_科鲁兹,小轿车,雪佛兰,大众朗逸,SUV
# 6554357384574140676_!_107_!_news_car_!_汽车防撞梁对安全的意义大吗?_!_
# 6554661690015744516_!_107_!_news_car_!_精致实用,这辆房车专为行家准备_!_车内,C200,房车,水曲柳,依维柯,玻璃钢
# 6554468366101250311_!_109_!_news_tech_!_如果联想给华为的短码投票,中国的5G是否拥有专利权?是否还能挽回?_!_
# 6554578634403741966_!_109_!_news_tech_!_A10处理器iPhone SE二代值得期待吗?_!_
# 6554623450374209806_!_110_!_news_military_!_先进战机叛逃将带来重大损失,美军如何防止F22飞行员驾机叛逃?_!_
# 6554489948580348424_!_113_!_news_world_!_又一国领导人放话,只要普京下令,数万大军“碾压”美国白宫!_!_以色列,普京,俄罗斯,叙利亚,车臣
# 6554706019040100611_!_113_!_news_world_!_如何看待美国总统连续撕毁美国签署的国际协议的举动?_!_
# 6554360505438306824_!_115_!_news_agriculture_!_农博会上,公安100余种土特产成了抢手货……_!_特色农产品,农博会,荆州市,生物科技,公安县
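
A quick sketch of splitting one of the sample lines above into its five fields:

line = "6554357384574140676_!_107_!_news_car_!_汽车防撞梁对安全的意义大吗?_!_"
news_id, code, name, title, keywords = line.strip().split("_!_")
print(code, name, title)
# 107 news_car 汽车防撞梁对安全的意义大吗?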

Category codes and names:

100 民生 故事 news_story
101 文化 文化 news_culture
102 娱乐 娱乐 news_entertainment
103 体育 体育 news_sports
104 财经 财经 news_finance
106 房产 房产 news_house
107 汽车 汽车 news_car
108 教育 教育 news_edu
109 科技 科技 news_tech
110 军事 军事 news_military
112 旅游 旅游 news_travel
113 国际 国际 news_world
114 证券 股票 stock
115 农业 三农 news_agriculture
116 电竞 游戏 news_game

Collection time: May 2018
Dataset split: train/validation/test (the dataset.py below splits the data 0.8/0.1/0.1)

Dataset processing

The dataset handling is wrapped in a dataset.py file:

import torch.utils.data as data
import os
import pandas as pd
from typing import Tuple, Any

class Dataset(data.Dataset):
    classes = [
        [100, '民生 故事', 'news_story'],
        [101, '文化 文化', 'news_culture'],
        [102, '娱乐 娱乐', 'news_entertainment'],
        [103, '体育 体育', 'news_sports'],
        [104, '财经 财经', 'news_finance'],
        # [105, '时政 新时代', 'nineteenth'],
        [106, '房产 房产', 'news_house'],
        [107, '汽车 汽车', 'news_car'],
        [108, '教育 教育', 'news_edu'],
        [109, '科技 科技', 'news_tech'],
        [110, '军事 军事', 'news_military'],
        # [111, religion: not included; sources such as Phoenix Buddhism]
        [112, '旅游 旅游', 'news_travel'],
        [113, '国际 国际', 'news_world'],
        [114, '证券 股票', 'stock'],
        [115, '农业 三农', 'news_agriculture'],
        [116, '电竞 游戏', 'news_game']
    ]
    classes_map = {
        "100": 0,
        "101": 1,
        "102": 2,
        "103": 3,
        "104": 4,
        "106": 5,
        "107": 6,
        "108": 7,
        "109": 8,
        "110": 9,
        "112": 10,
        "113": 11,
        "114": 12,
        "115": 13,
        "116": 14,
    }

    def __init__(
        self,
        root: str,
        train: bool = True
    ) -> None:
        """Train split = first 80%, validation = next 10%, test = last 10%."""
        super().__init__()
        self.root = root
        self.train = train

        self.df: pd.DataFrame = pd.DataFrame(columns=['label', 'text'])

        self._load_data()
        self.df = self.df.head(5000)  # for time reasons, only the first 5000 rows are kept

        print(f"is_train_data={self.train}, total_len={len(self.df)}")
        # print(f"head data={self.df.head(10)}")

    def _load_data(self):
        """Load data into a pandas DataFrame from {root}/toutiao_cat_data.txt or {root}/toutiao_cat_data.csv."""
        if not os.path.exists(os.path.join(self.root, "toutiao_cat_data.csv")):
            # Load the raw data from the txt file
            with open(os.path.join(self.root, "toutiao_cat_data.txt"), 'r') as file:
                count = 0
                for line in file:
                    count = count + 1
                    # 6552431613437805063_!_102_!_news_entertainment_!_谢娜为李浩菲澄清网络谣言,之后她的两个行为给自己加分_!_佟丽娅,网络谣言,快乐大本营,李浩菲,谢娜,观众们
                    items = line.split('_!_')
                    if count % 100 == 0:
                        print(f'line:{count}, items={items}')
                    label, text = items[1], items[3] + "," + items[4]
                    self.df = self.df._append({'label': label, 'text': text}, ignore_index=True)
                self.df.to_csv(os.path.join(self.root, 'toutiao_cat_data.csv'), index=False, header=True)
                print(f"txt total count={count}")
            # Compute the size of each split
            total_len = len(self.df)
            train_len = int(total_len * 0.8)
            valid_len = int(total_len * 0.1)
            # Split the dataset
            df_train = self.df.iloc[:train_len]
            df_valid = self.df.iloc[train_len:train_len + valid_len]
            df_test = self.df.iloc[train_len + valid_len:]
            df_train.to_csv(os.path.join(self.root, 'toutiao_cat_data_train.csv'), index=False, header=True)
            df_valid.to_csv(os.path.join(self.root, 'toutiao_cat_data_valid.csv'), index=False, header=True)
            df_test.to_csv(os.path.join(self.root, 'toutiao_cat_data_test.csv'), index=False, header=True)

        # Load either the train or the validation split depending on self.train
        if self.train:
            self.df = pd.read_csv(os.path.join(self.root, 'toutiao_cat_data_train.csv'))
        else:
            self.df = pd.read_csv(os.path.join(self.root, 'toutiao_cat_data_valid.csv'))

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, index) -> Tuple[Any, Any]:
        """Dataset interface: fetch one element by index, returning (label, text)."""
        label = self.df.iloc[index]["label"]
        text = self.df.iloc[index]["text"]
        # print(f"__getitem__, type(label)={type(label)},label={label}")
        return self.classes_map[str(label)], text


if __name__ == "__main__":
    data = Dataset("toutiao-text-classfication-dataset")
    print(f'len={len(data)}')
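
One detail worth a quick sketch: with the default collate function, each batch yielded by a DataLoader over this Dataset is a (labels, texts) pair, where labels is a LongTensor of class indices and texts is a list of raw title strings, which is exactly what the tokenizer call in the training loops below expects. This assumes the dataset.py above and the cloned dataset directory from the References section:

from torch.utils.data import DataLoader
import dataset

train_data = dataset.Dataset("toutiao-text-classfication-dataset")
loader = DataLoader(train_data, batch_size=4, shuffle=True)
labels, texts = next(iter(loader))
print(labels)      # tensor of 4 class indices
print(len(texts))  # 4 raw title strings, tokenized later inside the training loop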

Custom BERT classification model: training and inference

import torch.nn as nn
import torch
from torch.utils.data import DataLoader
from transformers import BertModel, BertForSequenceClassification
from transformers import BertTokenizer
from typing import Any
import dataset

# Build the model
class BertClassification(nn.Module):
    def __init__(self,
                 dropout=0.5,
                 num_labels=15,
                 ) -> None:
        super().__init__()
        # Changing the number of labels does not affect loading the pretrained weights
        self.bert = BertModel.from_pretrained("bert-base-chinese", num_labels=num_labels)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(768, num_labels)
        self.relu = nn.ReLU()

    def forward(self, input_ids, mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        classifier_output = self.classifier(dropout_output)
        final_layer = self.relu(classifier_output)
        return final_layer


def train(
    dataloader: DataLoader,
    model: nn.Module,
    learning_rate: float = 0.5,
    epochs: int = 16
):
    size = len(dataloader.dataset)
    model.train()
    device = ("cuda" if torch.cuda.is_available() else "cpu")
    if device == "cuda":
        model = model.cuda()
    # Define the loss function and the optimizer
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    # Enter the training loop
    for epoch_num in range(epochs):
        print(f"Epoch {epoch_num+1}\n-------------------------------")
        # Accumulators for training accuracy and loss
        total_acc_train = 0
        total_loss_train = 0
        for batch, (labels, texts) in enumerate(dataloader):
            train_inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
            train_labels = labels.to(device)
            input_id = train_inputs['input_ids'].squeeze(1).to(device)
            mask = train_inputs['attention_mask'].to(device)
            # print(f"input_id:{input_id.size()}, mask:{mask.size()}")
            output = model(input_id, mask)
            batch_loss = loss_fn(output, train_labels.long())
            # item() extracts the scalar value from the tensor
            total_loss_train += batch_loss.item()
            # Compute accuracy
            acc = (output.argmax(dim=1) == train_labels).sum().item()
            total_acc_train += acc
            # print(f"loss={batch_loss},acc={acc}")
            # Update the model: backpropagation
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            if batch % 10 == 0:
                loss, current = batch_loss.item(), (batch + 1) * len(labels)
                print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
        total_loss_train /= len(dataloader)
        total_acc_train /= size
        print(f"Train epoch[{epoch_num+1}]: \n Accuracy: {(100*total_acc_train):>0.1f}%, Avg loss: {total_loss_train:>8f} \n")


def test(
    dataloader: DataLoader,
    model: nn.Module,
):
    size = len(dataloader.dataset)
    batch_nums = len(dataloader)
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    loss_fn = nn.CrossEntropyLoss()
    device = ("cuda" if torch.cuda.is_available() else "cpu")
    if device == "cuda":
        model = model.cuda()
    test_loss, test_acc = 0, 0
    with torch.no_grad():
        for labels, texts in dataloader:
            labels = labels.to(device)
            inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
            mask = inputs['attention_mask'].to(device)
            input_id = inputs['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            test_loss += loss_fn(output, labels.long()).item()
            test_acc += (output.argmax(dim=1) == labels).sum().item()
    test_loss /= batch_nums
    test_acc /= size
    print(f"Test result: \n Accuracy: {(100*test_acc):>0.1f}%, Avg loss: {test_loss:>8f} \n")


def test_model(
    text: str,
    model: BertClassification
):
    device = ("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model.eval()
    if device == "cuda":
        model = model.cuda()
    text_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    mask = text_input['attention_mask'].to(device)
    input_id = text_input['input_ids'].to(device)
    output = model(input_id, mask)
    print(f'output={output}')
    output = output.argmax(dim=1)
    output = output.item()
    print(f"output label id={output}")


if __name__ == "__main__":
    batch_size = 32
    lr = 1e-6
    epoch = 10
    model = BertClassification()
    # Train the model and save the weights
    train_data = dataset.Dataset("toutiao-text-classfication-dataset")
    train_loader = DataLoader(train_data, batch_size=batch_size)
    train(train_loader, model, lr, epochs=epoch)
    torch.save(model.state_dict(), "bertclassify_model.pth")
    # Load the saved weights and evaluate
    model.load_state_dict(torch.load("bertclassify_model.pth"))
    test_data = dataset.Dataset("toutiao-text-classfication-dataset", train=False)
    test_loader = DataLoader(test_data, batch_size=batch_size)
    test(test_loader, model)
    # Run inference on a single example
    test_model("京城最值得你来场文化之旅的博物馆保利集团,马未都,中国科学技术馆,博物馆,新中国", model)
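
To map the predicted label id back to a readable category, the Dataset.classes and Dataset.classes_map tables above can be inverted. A small sketch (label_id = 1 is a hypothetical prediction):

import dataset

# invert classes_map: label id -> category code string
id_to_code = {v: k for k, v in dataset.Dataset.classes_map.items()}
# category code -> category name from the classes table
code_to_name = {str(row[0]): row[2] for row in dataset.Dataset.classes}

label_id = 1  # e.g. the argmax returned by test_model
print(id_to_code[label_id], code_to_name[id_to_code[label_id]])
# 101 news_culture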

Training and inference with BertForSequenceClassification

import torch.nn as nn
import torch
from torch.utils.data import DataLoader
from transformers import BertModel, BertForSequenceClassification
from transformers import BertTokenizer, BertConfig
from typing import Any
import dataset
from torch.optim import AdamW

def train(
    dataloader: DataLoader,
    model: nn.Module,
    learning_rate: float = 0.5,
    epochs: int = 16
):
    device = ("cuda" if torch.cuda.is_available() else "cpu")
    if device == "cuda":
        model = model.cuda()
    size = len(dataloader.dataset)
    batch_nums = len(dataloader)
    # Define the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    for epoch_num in range(epochs):
        print(f"Epoch {epoch_num+1}\n-------------------------------")
        total_acc_train, total_loss_train = 0, 0
        for batch, (labels, texts) in enumerate(dataloader):
            train_inputs = tokenizer(texts, padding=True,
                                     truncation=True,
                                     return_tensors="pt")
            train_labels = labels.to(device)
            input_id = train_inputs['input_ids'].squeeze(1).to(device)
            mask = train_inputs['attention_mask'].to(device)
            outputs = model(input_id, attention_mask=mask, labels=train_labels)
            # print(f"outputs={outputs.logits}, labels={train_labels.size()}")
            # The loss is computed inside the model when labels are passed
            loss = outputs.loss
            # Compute accuracy
            acc = (outputs.logits.argmax(dim=1) == train_labels).sum().item()
            total_loss_train += loss.item()
            total_acc_train += acc
            # print(f"loss={loss}")
            # Backpropagation
            loss.backward()
            # Update the weights
            optimizer.step()
            # Clear the gradients
            optimizer.zero_grad()
            if batch % 10 == 0:
                loss, current = loss.item(), (batch + 1) * len(labels)
                print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

        total_loss_train /= batch_nums
        total_acc_train /= size
        print(f"Train epoch[{epoch_num+1}]: \n Accuracy: {(100*total_acc_train):>0.1f}%, Avg loss: {total_loss_train:>8f} \n")

def test(
    dataloader: DataLoader,
    model: nn.Module,
):
    size = len(dataloader.dataset)
    batch_nums = len(dataloader)
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    device = ("cuda" if torch.cuda.is_available() else "cpu")
    if device == "cuda":
        model = model.cuda()
    test_loss, test_acc = 0, 0
    with torch.no_grad():
        for labels, texts in dataloader:
            labels = labels.to(device)
            inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
            mask = inputs['attention_mask'].to(device)
            input_id = inputs['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask, labels=labels)
            test_loss += output.loss.item()
            test_acc += (output.logits.argmax(dim=1) == labels).sum().item()
    test_loss /= batch_nums
    test_acc /= size
    print(f"Test result: \n Accuracy: {(100*test_acc):>0.1f}%, Avg loss: {test_loss:>8f} \n")

def test_model(
    text: str,
    model: nn.Module
):
    device = ("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model.eval()
    if device == "cuda":
        model = model.cuda()
    text_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    mask = text_input['attention_mask'].to(device)
    input_id = text_input['input_ids'].to(device)
    output = model(input_id, mask)
    print(f'output={output}')
    output = output.logits.argmax(dim=1)
    output = output.item()
    print(f"output label id={output}")


if __name__ == "__main__":
    batch_size = 32
    lr = 1e-6
    epochs = 10
    config = BertConfig.from_pretrained("bert-base-chinese",
                                        num_labels=15,
                                        output_attentions=False,    # whether the model returns attention weights
                                        output_hidden_states=False, # whether the model returns all hidden states
                                        )
    model = BertForSequenceClassification.from_pretrained("bert-base-chinese", config=config)
    # Train the model and save the weights
    train_data = dataset.Dataset("toutiao-text-classfication-dataset")
    train_loader = DataLoader(train_data, batch_size=batch_size)
    train(train_loader, model, lr, epochs=epochs)
    torch.save(model.state_dict(), "bertsequenceclassify_model.pth")
    # Load the saved weights and evaluate
    model.load_state_dict(torch.load("bertsequenceclassify_model.pth"))
    test_data = dataset.Dataset("toutiao-text-classfication-dataset", train=False)
    test_loader = DataLoader(test_data, batch_size=batch_size)
    test(test_loader, model)
    # Run inference on a single example
    test_model("京城最值得你来场文化之旅的博物馆保利集团,马未都,中国科学技术馆,博物馆,新中国", model)
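
As a quick sanity check of the BertForSequenceClassification interface (a sketch independent of the training script above): when labels are passed in, the returned SequenceClassifierOutput already carries the cross-entropy loss, so no separate loss_fn is needed.

import torch
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=15)
inputs = tokenizer(["汽车防撞梁对安全的意义大吗?"], return_tensors="pt")
labels = torch.tensor([6])  # hypothetical label id for news_car in classes_map
outputs = model(**inputs, labels=labels)
print(outputs.loss, outputs.logits.shape)  # scalar loss and logits of shape [1, 15]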

References

Toutiao comment classification: https://blog.csdn.net/qq_41301570/article/details/134320018

git clone https://gitcode.com/skdjfla/toutiao-text-classfication-dataset.git

Stanford sentiment classification: https://blog.csdn.net/a553181867/article/details/105389757