1. Set the device

  • Use the **torch.device()** method to specify the device

  • A Tensor or a model can be moved to the chosen device for computation with .to(device)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on device: {}".format(device))
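
To illustrate the second bullet, here is a minimal sketch (the tensor and layer are arbitrary examples, not part of the training script) of moving a tensor and a module to the chosen device:

x = torch.randn(2, 3)                      # created on the CPU by default
x = x.to(device)                           # returns a copy on `device` (a no-op if it is already there)
layer = torch.nn.Linear(3, 1).to(device)   # for a module, .to(device) moves its parameters in place
print(x.device, next(layer.parameters()).device)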

2. Prepare the data

  • This step uses two classes from torch.utils.data: Dataset and DataLoader

  • Create your own dataset class that inherits from Dataset

  • In the constructor, read the file and build the lists of data and labels

  • The __getitem__ method defines which data are returned when the dataset is indexed

  • The __len__ method defines the value returned when len() is called on the dataset

  • Then use a DataLoader to read the data from an instance of the custom dataset class

class MyData(Dataset):
    # Read the file at the given path when the dataset is constructed
    def __init__(self, fp):
        xy = pd.read_csv(fp)
        self.len = len(xy)
        # Tokenize the sentences (`tokenizer` is the BertTokenizer created in the complete code below)
        self.x_data = tokenizer(xy.text.values.tolist(), padding='max_length', max_length=102, truncation=True)
        # input_ids
        self.x_input_ids = torch.Tensor(self.x_data['input_ids'])  # [[x, x, x], [x, x, x], ...]
        # token_type_ids
        self.x_token_type_ids = torch.Tensor(self.x_data['token_type_ids'])
        # attention_mask
        self.x_attention_mask = torch.Tensor(self.x_data['attention_mask'])
        # Label tensor
        self.y_data = torch.Tensor(xy.label.values.tolist())  # [1, 1, 0, 0, ...]

    # Return one sample by index
    def __getitem__(self, index):
        # [x, x, x]
        return self.x_input_ids[index], self.x_token_type_ids[index], self.x_attention_mask[index], self.y_data[index]

    # Return the total number of samples
    def __len__(self):
        return self.len

# Load the data
train_data = MyData('imdbsTrain.csv')
test_data = MyData('imdbsTest.csv')
train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=16)
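
As a quick sanity check (assuming the two CSV files exist and have text and label columns), pull one batch from the loader and confirm its shapes:

input_ids, token_type_ids, attention_mask, labels = next(iter(train_loader))
print(input_ids.shape)   # torch.Size([16, 102]) -- batch_size x max_length
print(labels.shape)      # torch.Size([16])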

3. Design the model

  • This step uses torch.nn.Module

  • Build the network in the constructor

  • Define the forward propagation in the forward method

class MyModel(torch.nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.bert = BertModel.from_pretrained('../bert-base-uncased')
        self.linear1 = torch.nn.Linear(768, 192)
        self.linear2 = torch.nn.Linear(192, 96)
        self.linear3 = torch.nn.Linear(96, 1)
        self.dropout = torch.nn.Dropout(0.2)

    def forward(self, input_ids, token_type_ids, attention_mask):
        # BERT requires integer (long) token tensors
        input_ids = input_ids.long()
        token_type_ids = token_type_ids.long()
        attention_mask = attention_mask.long()
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        output = output['pooler_output']  # the [CLS] embedding, batch_size x 768
        output = F.relu(self.linear1(output))
        output = F.relu(self.linear2(output))
        output = self.dropout(output)
        output = torch.sigmoid(self.linear3(output))  # F.sigmoid is deprecated; torch.sigmoid is equivalent
        return output
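
To verify the wiring, a dummy forward pass (a sketch; it assumes the pretrained weights are available at the path above, and the all-ones token IDs are placeholders rather than meaningful vocabulary entries) should yield one probability per sample:

m = MyModel()
out = m(torch.ones(2, 102), torch.zeros(2, 102), torch.ones(2, 102))
print(out.shape)  # torch.Size([2, 1]), values in (0, 1)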

4. Define the optimizer and the loss function

  • This step uses **torch.nn** and torch.optim
criterion = torch.nn.BCELoss(reduction='mean')  # binary cross-entropy loss, for binary classification
# optimizer = torch.optim.Adam(model.parameters(), lr=lr)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
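
As a hedged aside (an assumption on my part, not part of this tutorial's recipe): BERT fine-tuning is more commonly done with AdamW and a much smaller learning rate than the SGD setup above, for example:

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)  # typical BERT fine-tuning choice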

5. Train the model

for i in range(epoch):
    model.train()
    losses = []
    accuracy = []
    start_time = time.time()

    for batch, data in enumerate(train_loader, 0):
        # for input_ids, token_type_ids, attention_mask, labels in tqdm(train_loader, total=len(train_loader)):
        input_ids, token_type_ids, attention_mask, labels = data  # a batch of 16 samples
        # Move the input tensors to the GPU (or CPU)
        input_ids, token_type_ids, attention_mask, labels = input_ids.to(device), token_type_ids.to(
            device), attention_mask.to(device), labels.to(device)

        # Forward pass: y_hat
        y_pred = model(input_ids, token_type_ids, attention_mask)  # shape must match the labels
        labels = labels.unsqueeze(1)  # reshape the labels into a column matrix as well

        # Compute the loss
        loss = criterion(y_pred, labels)
        # print('Loss for this batch:', loss.item())
        losses.append(loss.item())

        # Compute the accuracy
        pred_labels = []
        for p in y_pred:
            if p.item() > 0.5:  # classification threshold of 0.5
                pred_labels.append(1)
            else:
                pred_labels.append(0)
        pred_labels = torch.Tensor(pred_labels).unsqueeze(1)
        pred_labels = pred_labels.to(device)
        acc = torch.sum(pred_labels == labels).item() / len(pred_labels)  # batch accuracy
        accuracy.append(acc)

        # The standard steps after each batch
        optimizer.zero_grad()  # zero the gradients
        loss.backward()        # backpropagate
        optimizer.step()       # update the parameters

    # Report metrics
    elapsed_time = time.time() - start_time
    print("\nEpoch: {}/{}: ".format(i + 1, epoch),         # current epoch
          "Loss: {:.6f}; ".format(np.mean(losses)),        # mean loss over this epoch
          "Accuracy: {:.6f}; ".format(np.mean(accuracy)),  # mean accuracy over this epoch
          'Time: {:.2f}s'.format(elapsed_time))            # wall-clock time for this epoch
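
The element-by-element Python loop over y_pred works, but the same thresholding can be written as one vectorized expression (an equivalent alternative, not what the original code uses):

pred_labels = (y_pred > 0.5).float()                 # shape [batch_size, 1], already on `device`
acc = (pred_labels == labels).float().mean().item()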

6. Test the model

  • Equivalent to running one training epoch, except that no gradients are computed
with torch.no_grad():  # no gradients needed, since there is no backward pass
    model.eval()
    eval_losses = []
    eval_acc = []
    eval_start_time = time.time()
    for batch, data in enumerate(test_loader, 0):
        input_ids, token_type_ids, attention_mask, labels = data
        # Move the input tensors to the GPU (or CPU)
        input_ids, token_type_ids, attention_mask, labels = input_ids.to(device), token_type_ids.to(
            device), attention_mask.to(device), labels.to(device)

        # Forward pass: y_hat
        y_pred = model(input_ids, token_type_ids, attention_mask)  # shape must match the labels
        labels = labels.unsqueeze(1)  # reshape the labels into a column matrix as well

        # Compute the loss
        loss = criterion(y_pred, labels)
        eval_losses.append(loss.item())

        # Compute the accuracy
        pred_labels = []
        for p in y_pred:
            if p.item() > 0.5:  # classification threshold of 0.5
                pred_labels.append(1)
            else:
                pred_labels.append(0)
        pred_labels = torch.Tensor(pred_labels).unsqueeze(1)
        pred_labels = pred_labels.to(device)
        acc = torch.sum(pred_labels == labels).item() / len(pred_labels)  # batch accuracy
        eval_acc.append(acc)

    # Report metrics
    elapsed_time = time.time() - eval_start_time
    print("\nEval_loss: {:.6f}; ".format(np.mean(eval_losses)),  # mean loss over the test set
          "Accuracy: {:.6f}; ".format(np.mean(eval_acc)),        # mean accuracy over the test set
          'Time: {:.2f}s'.format(elapsed_time))                  # wall-clock time for evaluation
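
After evaluation you will usually want to persist the fine-tuned weights. A minimal sketch (the filename is an arbitrary choice, not from the original):

torch.save(model.state_dict(), 'bert_imdb_classifier.pt')

# Later: restore into a fresh instance before inference
model = MyModel()
model.load_state_dict(torch.load('bert_imdb_classifier.pt', map_location=device))
model.to(device)
model.eval()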

Miscellaneous

  • BERT's input must be a 2-D tensor of the form tensor([[ ]]), e.g. tensor([[101, 222, 102], [201, 333, 202]]), and its elements must be of long or int type

  • The tokenizer() method returns an object containing three fields: input_ids, attention_mask and token_type_ids

  • BERT produces two outputs: the embeddings of all tokens in the sentence, and the embedding of the [CLS] token (an aggregate representation of the whole sentence, which can be used as the sentence embedding); both are illustrated in the sketch after this list

  • Fixing the random seed makes the training results reproducible
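
A small sketch tying these points together (it assumes the tokenizer and the bert-base-uncased weights from the complete code below; the sample sentence is arbitrary):

torch.manual_seed(100)  # fix the seed for reproducibility

enc = tokenizer(["a great movie"], padding='max_length', max_length=102, truncation=True)
print(enc.keys())  # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

bert = BertModel.from_pretrained('../bert-base-uncased')
out = bert(input_ids=torch.tensor(enc['input_ids']),            # 2-D long tensor, even for one sentence
           attention_mask=torch.tensor(enc['attention_mask']),
           token_type_ids=torch.tensor(enc['token_type_ids']))
print(out['last_hidden_state'].shape)  # [1, 102, 768]: one embedding per token
print(out['pooler_output'].shape)      # [1, 768]: pooled [CLS] representation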

Complete code

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
import pandas as pd
import time
import numpy as np

# Set the device
# torch.device("str") specifies the device
# A tensor can be placed on the chosen device with .to(device)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on device: {}".format(device))

# Load the tokenizer; swapping BertTokenizer for AutoTokenizer would select the tokenizer class automatically based on the loaded model
tokenizer = BertTokenizer.from_pretrained('../bert-base-uncased')


# 1. Prepare the data
class MyData(Dataset):
    # Read the file at the given path when the dataset is constructed
    def __init__(self, fp):
        xy = pd.read_csv(fp)
        self.len = len(xy)
        # Tokenize the sentences
        self.x_data = tokenizer(xy.text.values.tolist(), padding='max_length', max_length=102, truncation=True)
        # input_ids
        self.x_input_ids = torch.Tensor(self.x_data['input_ids'])  # [[x, x, x], [x, x, x], ...]
        # token_type_ids
        self.x_token_type_ids = torch.Tensor(self.x_data['token_type_ids'])
        # attention_mask
        self.x_attention_mask = torch.Tensor(self.x_data['attention_mask'])
        # Label tensor
        self.y_data = torch.Tensor(xy.label.values.tolist())  # [1, 1, 0, 0, ...]

    # Return one sample by index
    def __getitem__(self, index):
        # [x, x, x]
        return self.x_input_ids[index], self.x_token_type_ids[index], self.x_attention_mask[index], self.y_data[index]

    # Return the total number of samples
    def __len__(self):
        return self.len


# Load the data
train_data = MyData('imdbsTrain.csv')
test_data = MyData('imdbsTest.csv')
# input_ids, token_type_ids, attention_mask, labels = train_data[0]
# print(input_ids)
# print(token_type_ids)
# print(attention_mask)
# print(labels)
train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=16)


# 2. Design the model
class MyModel(torch.nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.bert = BertModel.from_pretrained('../bert-base-uncased')
        self.linear1 = torch.nn.Linear(768, 192)
        self.linear2 = torch.nn.Linear(192, 96)
        self.linear3 = torch.nn.Linear(96, 1)
        self.dropout = torch.nn.Dropout(0.2)

    def forward(self, input_ids, token_type_ids, attention_mask):
        input_ids = input_ids.long()  # [[1, 1, 1], [16, 16, 16]]
        token_type_ids = token_type_ids.long()
        attention_mask = attention_mask.long()
        # That is, BERT's input must be 2-D ([[ ]]), even for a single sentence, and its elements must be long or int
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        output = output['pooler_output']  # the [CLS] embedding, batch_size x 768
        output = F.relu(self.linear1(output))
        output = F.relu(self.linear2(output))
        output = self.dropout(output)
        output = torch.sigmoid(self.linear3(output))  # F.sigmoid is deprecated; torch.sigmoid is equivalent
        return output


# Parameter setup
# Fix the random seed so results are reproducible
seed = 100
torch.manual_seed(seed)  # seeds the CPU generator
if device.type == 'cuda':  # compare device.type, not the device object, against the string
    torch.cuda.manual_seed(seed)

# Instantiate the model
model = MyModel()
model.to(device)

# Hyperparameters
lr = 0.01
epoch = 60

# 3. Define the optimizer and the loss function
criterion = torch.nn.BCELoss(reduction='mean')  # binary cross-entropy loss, for binary classification
# optimizer = torch.optim.Adam(model.parameters(), lr=lr)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# 4. Training
if __name__ == '__main__':
    for i in range(epoch):
        model.train()
        losses = []
        accuracy = []
        start_time = time.time()

        for batch, data in enumerate(train_loader, 0):
            # for input_ids, token_type_ids, attention_mask, labels in tqdm(train_loader, total=len(train_loader)):
            input_ids, token_type_ids, attention_mask, labels = data  # a batch of 16 samples
            # Move the input tensors to the GPU (or CPU)
            input_ids, token_type_ids, attention_mask, labels = input_ids.to(device), token_type_ids.to(
                device), attention_mask.to(device), labels.to(device)

            # Forward pass: y_hat
            y_pred = model(input_ids, token_type_ids, attention_mask)  # shape must match the labels
            labels = labels.unsqueeze(1)  # reshape the labels into a column matrix as well

            # Compute the loss
            loss = criterion(y_pred, labels)
            # print('Loss for this batch:', loss.item())
            losses.append(loss.item())

            # Compute the accuracy
            pred_labels = []
            for p in y_pred:
                if p.item() > 0.5:  # classification threshold of 0.5
                    pred_labels.append(1)
                else:
                    pred_labels.append(0)
            pred_labels = torch.Tensor(pred_labels).unsqueeze(1)
            pred_labels = pred_labels.to(device)
            acc = torch.sum(pred_labels == labels).item() / len(pred_labels)  # batch accuracy
            accuracy.append(acc)

            # The standard steps after each batch
            optimizer.zero_grad()  # zero the gradients
            loss.backward()        # backpropagate
            optimizer.step()       # update the parameters

        # Report metrics
        elapsed_time = time.time() - start_time
        print("\nEpoch: {}/{}: ".format(i + 1, epoch),         # current epoch
              "Loss: {:.6f}; ".format(np.mean(losses)),        # mean loss over this epoch
              "Accuracy: {:.6f}; ".format(np.mean(accuracy)),  # mean accuracy over this epoch
              'Time: {:.2f}s'.format(elapsed_time))            # wall-clock time for this epoch

    # 5. Testing
    with torch.no_grad():  # no gradients needed, since there is no backward pass
        model.eval()
        eval_losses = []
        eval_acc = []
        eval_start_time = time.time()
        for batch, data in enumerate(test_loader, 0):
            input_ids, token_type_ids, attention_mask, labels = data
            # Move the input tensors to the GPU (or CPU)
            input_ids, token_type_ids, attention_mask, labels = input_ids.to(device), token_type_ids.to(
                device), attention_mask.to(device), labels.to(device)

            # Forward pass: y_hat
            y_pred = model(input_ids, token_type_ids, attention_mask)  # shape must match the labels
            labels = labels.unsqueeze(1)  # reshape the labels into a column matrix as well

            # Compute the loss
            loss = criterion(y_pred, labels)
            eval_losses.append(loss.item())

            # Compute the accuracy
            pred_labels = []
            for p in y_pred:
                if p.item() > 0.5:  # classification threshold of 0.5
                    pred_labels.append(1)
                else:
                    pred_labels.append(0)
            pred_labels = torch.Tensor(pred_labels).unsqueeze(1)
            pred_labels = pred_labels.to(device)
            acc = torch.sum(pred_labels == labels).item() / len(pred_labels)  # batch accuracy
            eval_acc.append(acc)

        # Report metrics
        elapsed_time = time.time() - eval_start_time
        print("\nEval_loss: {:.6f}; ".format(np.mean(eval_losses)),  # mean loss over the test set
              "Accuracy: {:.6f}; ".format(np.mean(eval_acc)),        # mean accuracy over the test set
              'Time: {:.2f}s'.format(elapsed_time))                  # wall-clock time for evaluation