PyTorch Tutorial: BERT Classification

Dataset

Source: the Jinri Toutiao (今日头条) news client. Size: 382,688 records across 15 categories. Each line is one record, with fields separated by `_!_`; in order they are: news ID, category code (see below), category name (see below), news text (title only), and news keywords. The raw data looks like this:

tail -n 10 toutiao_cat_data.txt
# 6554596645437178371_!_104_!_news_finance_!_百万亏损中悟出的交易之道_!_齐威王,田忌赛马,忌讳,同龄人,交易者,好朋友,做交易,股民,股票市场,田忌
# 6554627652047602190_!_107_!_news_car_!_这款合资车曾比朗逸还火 现在却成功“跳楼” 售价仅为8万!_!_科鲁兹,小轿车,雪佛兰,大众朗逸,SUV
# 6554357384574140676_!_107_!_news_car_!_汽车防撞梁对安全的意义大吗?_!_
# 6554661690015744516_!_107_!_news_car_!_精致实用,这辆房车专为行家准备_!_车内,C200,房车,水曲柳,依维柯,玻璃钢
# 6554468366101250311_!_109_!_news_tech_!_如果联想给华为的短码投票,中国的5G是否拥有专利权?是否还能挽回?_!_
# 6554578634403741966_!_109_!_news_tech_!_A10处理器iPhone SE二代值得期待吗?_!_
# 6554623450374209806_!_110_!_news_military_!_先进战机叛逃将带来重大损失,美军如何防止F22飞行员驾机叛逃?_!_
# 6554489948580348424_!_113_!_news_world_!_又一国领导人放话,只要普京下令,数万大军“碾压”美国白宫!_!_以色列,普京,俄罗斯,叙利亚,车臣
# 6554706019040100611_!_113_!_news_world_!_如何看待美国总统连续撕毁美国签署的国际协议的举动?_!_
# 6554360505438306824_!_115_!_news_agriculture_!_农博会上,公安100余种土特产成了抢手货……_!_特色农产品,农博会,荆州市,生物科技,公安县
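
To make the field layout concrete, here is a minimal parsing sketch; the sample line is copied verbatim from the dump above:

line = "6554596645437178371_!_104_!_news_finance_!_百万亏损中悟出的交易之道_!_齐威王,田忌赛马,忌讳,同龄人,交易者,好朋友,做交易,股民,股票市场,田忌"

# Split the five _!_-separated fields; strip() removes the trailing newline in real files.
news_id, code, name, title, keywords = line.strip().split('_!_')
print(code, name)               # 104 news_finance
print(title)                    # 百万亏损中悟出的交易之道
print(keywords.split(',')[:3])  # ['齐威王', '田忌赛马', '忌讳']

Note that the keywords field may be empty, as several of the sample lines above show.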

Category codes and names:

100 民生 故事 news_story
101 文化 文化 news_culture
102 娱乐 娱乐 news_entertainment
103 体育 体育 news_sports
104 财经 财经 news_finance
106 房产 房产 news_house
107 汽车 汽车 news_car
108 教育 教育 news_edu
109 科技 科技 news_tech
110 军事 军事 news_military
112 旅游 旅游 news_travel
113 国际 国际 news_world
114 证券 股票 stock
115 农业 三农 news_agriculture
116 电竞 游戏 news_game
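
Note that codes 105 (时政) and 111 (宗教) are absent from the released data, so the 15 remaining codes are not contiguous. The classes_map dictionary in dataset.py below remaps them onto indices 0-14, as CrossEntropyLoss expects; it is equivalent to this sketch:

# Codes 105 and 111 were removed from the public dataset, so the 15 remaining
# codes must be remapped to the contiguous label indices 0..14.
codes = [100, 101, 102, 103, 104, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116]
classes_map = {str(code): idx for idx, code in enumerate(codes)}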

Collected: May 2018
Dataset split: train / validation / test at 0.8 / 0.1 / 0.1, as implemented in the code below (for 382,688 rows that is roughly 306k / 38k / 38k).

Dataset Processing

The dataset handling is encapsulated in dataset.py:

import torch.utils.data as data
import os
import pandas as pd
from typing import Tuple, Any

class Dataset(data.Dataset):
    classes = [
        [100, '民生 故事', 'news_story'],
        [101, '文化 文化', 'news_culture'],
        [102, '娱乐 娱乐', 'news_entertainment'],
        [103, '体育 体育', 'news_sports'],
        [104, '财经 财经', 'news_finance'],
        # [105, '时政 新时代', 'nineteenth'],  # absent from the released data
        [106, '房产 房产', 'news_house'],
        [107, '汽车 汽车', 'news_car'],
        [108, '教育 教育', 'news_edu'],
        [109, '科技 科技', 'news_tech'],
        [110, '军事 军事', 'news_military'],
        # [111, '宗教', ...],  # absent from the released data (religion; sources such as Phoenix Buddhism)
        [112, '旅游 旅游', 'news_travel'],
        [113, '国际 国际', 'news_world'],
        [114, '证券 股票', 'stock'],
        [115, '农业 三农', 'news_agriculture'],
        [116, '电竞 游戏', 'news_game'],
    ]
    # Map the non-contiguous category codes onto the label indices 0..14.
    classes_map = {
        "100": 0, "101": 1, "102": 2, "103": 3, "104": 4,
        "106": 5, "107": 6, "108": 7, "109": 8, "110": 9,
        "112": 10, "113": 11, "114": 12, "115": 13, "116": 14,
    }

    def __init__(
        self,
        root: str,
        train: bool = True,
    ) -> None:
        """train=True loads the first 80% (training split); train=False loads the next 10% (validation split)."""
        super().__init__()
        self.root = root
        self.train = train

        self.df: pd.DataFrame = pd.DataFrame(columns=['label', 'text'])

        self._load_data()
        self.df = self.df.head(5000)  # For time reasons, only the first 5000 rows are used.

        print(f"is_train_data={self.train}, total_len={len(self.df)}")
        # print(f"head data={self.df.head(10)}")

    def _load_data(self):
        """Load data from {root}/toutiao_cat_data.txt into a pandas DataFrame, caching the splits as CSV files."""
        if not os.path.exists(os.path.join(self.root, "toutiao_cat_data.csv")):
            # Parse the raw txt. Example line:
            # 6552431613437805063_!_102_!_news_entertainment_!_谢娜为李浩菲澄清网络谣言,之后她的两个行为给自己加分_!_佟丽娅,网络谣言,快乐大本营,李浩菲,谢娜,观众们
            rows = []
            with open(os.path.join(self.root, "toutiao_cat_data.txt"), 'r') as file:
                count = 0
                for line in file:
                    count += 1
                    items = line.strip().split('_!_')
                    label, text = items[1], items[3] + "," + items[4]
                    rows.append({'label': label, 'text': text})
                    if count % 100 == 0:
                        print(f'line:{count}, items={items}')
            # Appending to a DataFrame row by row is O(n^2); build it once from the list instead.
            self.df = pd.DataFrame(rows, columns=['label', 'text'])
            self.df.to_csv(os.path.join(self.root, 'toutiao_cat_data.csv'), index=False, header=True)
            print(f"txt total count={count}")
            # Compute the split sizes.
            total_len = len(self.df)
            train_len = int(total_len * 0.8)
            valid_len = int(total_len * 0.1)
            # Split the dataset.
            df_train = self.df.iloc[:train_len]
            df_valid = self.df.iloc[train_len:train_len + valid_len]
            df_test = self.df.iloc[train_len + valid_len:]
            df_train.to_csv(os.path.join(self.root, 'toutiao_cat_data_train.csv'), index=False, header=True)
            df_valid.to_csv(os.path.join(self.root, 'toutiao_cat_data_valid.csv'), index=False, header=True)
            df_test.to_csv(os.path.join(self.root, 'toutiao_cat_data_test.csv'), index=False, header=True)

        # Load either the training or the validation split.
        if self.train:
            self.df = pd.read_csv(os.path.join(self.root, 'toutiao_cat_data_train.csv'))
        else:
            self.df = pd.read_csv(os.path.join(self.root, 'toutiao_cat_data_valid.csv'))

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, index) -> Tuple[Any, Any]:
        """Dataset interface: fetch one item by index, returning (label index, text)."""
        label = self.df.iloc[index]["label"]
        text = self.df.iloc[index]["text"]
        # print(f"__getitem__, type(label)={type(label)}, label={label}")
        return self.classes_map[str(label)], text


if __name__ == "__main__":
    data = Dataset("toutiao-text-classfication-dataset")
    print(f'len={len(data)}')
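
One detail worth noting before the training code: __getitem__ returns an (int, str) pair, so PyTorch's default collate function batches the labels into a LongTensor and leaves the texts as a list of raw strings, which is exactly what the tokenizer calls in the training loops below expect. A quick sanity check, assuming the dataset directory from above is present:

from torch.utils.data import DataLoader
import dataset

loader = DataLoader(dataset.Dataset("toutiao-text-classfication-dataset"), batch_size=4)
labels, texts = next(iter(loader))
print(labels.shape, labels.dtype)  # torch.Size([4]) torch.int64
print(len(texts), type(texts[0]))  # 4 <class 'str'>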

Custom BERT wrapper: training and inference

import torch.nn as nn
import torch
from torch.utils.data import DataLoader
from transformers import BertModel
from transformers import BertTokenizer
import dataset

# Build the model: the BERT encoder plus a custom classification head.
class BertClassification(nn.Module):
    def __init__(self,
                 dropout=0.5,
                 num_labels=15,
                 ) -> None:
        super().__init__()
        # BertModel is the bare encoder (no classification head), so changing the
        # number of classes does not affect loading the pretrained weights.
        self.bert = BertModel.from_pretrained("bert-base-chinese")
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(768, num_labels)
        # Note: a ReLU on the logits is unusual (it zeroes every negative score
        # before CrossEntropyLoss); it is kept here as in the original design.
        self.relu = nn.ReLU()

    def forward(self, input_ids, mask):
        # pooled_output is the [CLS] representation after BERT's pooler.
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        classifier_output = self.classifier(dropout_output)
        final_layer = self.relu(classifier_output)
        return final_layer


def train(
    dataloader: DataLoader,
    model: nn.Module,
    learning_rate: float = 1e-6,
    epochs: int = 16,
):
    size = len(dataloader.dataset)
    model.train()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        model = model.cuda()
    # Define the loss function and optimizer.
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    # Training loop.
    for epoch_num in range(epochs):
        print(f"Epoch {epoch_num+1}\n-------------------------------")
        # Running totals for training accuracy and loss.
        total_acc_train = 0
        total_loss_train = 0
        for batch, (labels, texts) in enumerate(dataloader):
            train_inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
            train_labels = labels.to(device)
            input_id = train_inputs['input_ids'].to(device)
            mask = train_inputs['attention_mask'].to(device)
            # print(f"input_id:{input_id.size()}, mask:{mask.size()}")
            output = model(input_id, mask)
            batch_loss = loss_fn(output, train_labels.long())
            # .item() extracts a Python scalar from the tensor.
            total_loss_train += batch_loss.item()
            # Count correct predictions.
            acc = (output.argmax(dim=1) == train_labels).sum().item()
            total_acc_train += acc
            # print(f"loss={batch_loss}, acc={acc}")
            # Backpropagate and update the weights.
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            if batch % 10 == 0:
                loss, current = batch_loss.item(), (batch + 1) * len(labels)
                print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
        total_loss_train /= len(dataloader)
        total_acc_train /= size
        print(f"Train epoch[{epoch_num+1}]: \n Accuracy: {(100*total_acc_train):>0.1f}%, Avg loss: {total_loss_train:>8f} \n")


def test(
    dataloader: DataLoader,
    model: nn.Module,
):
    size = len(dataloader.dataset)
    batch_nums = len(dataloader)
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    loss_fn = nn.CrossEntropyLoss()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        model = model.cuda()
    model.eval()  # disable dropout during evaluation
    test_loss, test_acc = 0, 0
    with torch.no_grad():
        for labels, texts in dataloader:
            labels = labels.to(device)
            inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
            mask = inputs['attention_mask'].to(device)
            input_id = inputs['input_ids'].to(device)
            output = model(input_id, mask)
            test_loss += loss_fn(output, labels.long()).item()
            test_acc += (output.argmax(dim=1) == labels).sum().item()
    test_loss /= batch_nums
    test_acc /= size
    print(f"Test result: \n Accuracy: {(100*test_acc):>0.1f}%, Avg loss: {test_loss:>8f} \n")


def test_model(
    text: str,
    model: BertClassification,
):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model.eval()
    if device == "cuda":
        model = model.cuda()
    text_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    mask = text_input['attention_mask'].to(device)
    input_id = text_input['input_ids'].to(device)
    with torch.no_grad():
        output = model(input_id, mask)
    print(f'output={output}')
    output = output.argmax(dim=1)
    output = output.item()
    print(f"output label id={output}")


if __name__ == "__main__":
    batch_size = 32
    lr = 1e-6
    epoch = 10
    model = BertClassification()
    # Train the model and save its weights.
    train_data = dataset.Dataset("toutiao-text-classfication-dataset")
    train_loader = DataLoader(train_data, batch_size=batch_size)
    train(train_loader, model, lr, epochs=epoch)
    torch.save(model.state_dict(), "bertclassify_model.pth")
    # Reload the weights and evaluate.
    model.load_state_dict(torch.load("bertclassify_model.pth"))
    test_data = dataset.Dataset("toutiao-text-classfication-dataset", train=False)
    test_loader = DataLoader(test_data, batch_size=batch_size)
    test(test_loader, model)
    # Single-example inference.
    test_model("京城最值得你来场文化之旅的博物馆保利集团,马未都,中国科学技术馆,博物馆,新中国", model)

Training and inference with BertForSequenceClassification

import torch.nn as nn
import torch
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification
from transformers import BertTokenizer, BertConfig
import dataset

def train(
    dataloader: DataLoader,
    model: nn.Module,
    learning_rate: float = 1e-6,
    epochs: int = 16,
):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        model = model.cuda()
    model.train()
    size = len(dataloader.dataset)
    batch_nums = len(dataloader)
    # Define the optimizer.
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    for epoch_num in range(epochs):
        print(f"Epoch {epoch_num+1}\n-------------------------------")
        total_acc_train, total_loss_train = 0, 0
        for batch, (labels, texts) in enumerate(dataloader):
            train_inputs = tokenizer(texts, padding=True,
                                     truncation=True,
                                     return_tensors="pt")
            train_labels = labels.to(device)
            input_id = train_inputs['input_ids'].to(device)
            mask = train_inputs['attention_mask'].to(device)
            # Passing labels makes the model compute the cross-entropy loss itself.
            outputs = model(input_id, attention_mask=mask, labels=train_labels)
            # print(f"outputs={outputs.logits}, labels={train_labels.size()}")
            loss = outputs.loss
            # Count correct predictions.
            acc = (outputs.logits.argmax(dim=1) == train_labels).sum().item()
            # Accumulate a Python float, not the tensor, to avoid holding the graph.
            total_loss_train += loss.item()
            total_acc_train += acc
            # print(f"loss={loss}")
            # Backpropagate.
            loss.backward()
            # Update the weights.
            optimizer.step()
            # Clear the gradients.
            optimizer.zero_grad()
            if batch % 10 == 0:
                current = (batch + 1) * len(labels)
                print(f"loss: {loss.item():>7f} [{current:>5d}/{size:>5d}]")

        total_loss_train /= batch_nums
        total_acc_train /= size
        print(f"Train epoch[{epoch_num+1}]: \n Accuracy: {(100*total_acc_train):>0.1f}%, Avg loss: {total_loss_train:>8f} \n")

def test(
    dataloader: DataLoader,
    model: nn.Module,
):
    size = len(dataloader.dataset)
    batch_nums = len(dataloader)
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        model = model.cuda()
    model.eval()  # disable dropout during evaluation
    test_loss, test_acc = 0, 0
    with torch.no_grad():
        for labels, texts in dataloader:
            labels = labels.to(device)
            inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
            mask = inputs['attention_mask'].to(device)
            input_id = inputs['input_ids'].to(device)
            output = model(input_id, attention_mask=mask, labels=labels)
            test_loss += output.loss.item()
            test_acc += (output.logits.argmax(dim=1) == labels).sum().item()
    test_loss /= batch_nums
    test_acc /= size
    print(f"Test result: \n Accuracy: {(100*test_acc):>0.1f}%, Avg loss: {test_loss:>8f} \n")

def test_model(
    text: str,
    model: nn.Module,
):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model.eval()
    if device == "cuda":
        model = model.cuda()
    text_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    mask = text_input['attention_mask'].to(device)
    input_id = text_input['input_ids'].to(device)
    with torch.no_grad():
        output = model(input_id, attention_mask=mask)
    print(f'output={output}')
    output = output.logits.argmax(dim=1)
    output = output.item()
    print(f"output label id={output}")


if __name__ == "__main__":
    batch_size = 32
    lr = 1e-6
    epochs = 10
    config = BertConfig.from_pretrained("bert-base-chinese",
                                        num_labels=15,
                                        output_attentions=False,    # do not return attention weights
                                        output_hidden_states=False, # do not return all hidden states
                                        )
    model = BertForSequenceClassification.from_pretrained("bert-base-chinese", config=config)
    # Train the model and save its weights.
    train_data = dataset.Dataset("toutiao-text-classfication-dataset")
    train_loader = DataLoader(train_data, batch_size=batch_size)
    train(train_loader, model, lr, epochs=epochs)
    torch.save(model.state_dict(), "bertsequenceclassify_model.pth")
    # Reload the weights and evaluate.
    model.load_state_dict(torch.load("bertsequenceclassify_model.pth"))
    test_data = dataset.Dataset("toutiao-text-classfication-dataset", train=False)
    test_loader = DataLoader(test_data, batch_size=batch_size)
    test(test_loader, model)
    # Single-example inference.
    test_model("京城最值得你来场文化之旅的博物馆保利集团,马未都,中国科学技术馆,博物馆,新中国", model)

References

Toutiao news classification: https://blog.csdn.net/qq_41301570/article/details/134320018

git clone https://gitcode.com/skdjfla/toutiao-text-classfication-dataset.git

Stanford sentiment classification: https://blog.csdn.net/a553181867/article/details/105389757