# model.py
"""
Embedding向量模型约等于BERT。
从Transformer模型的角度来看,BERT是一个Encoder-only的Transformer架构。
而我们所需要的Embedding向量就是Bert架构的最后一层Transformer Block的输出。
并且,它是一个Bi-directional Encoder的架构(即没有attention mask,但是有特殊标记mask)。
关于特殊标记:
主打 [CLS]、[MUSK] 和 [SEP] 来在切分的句子中添加特殊的标记,以使模型能够学习到词与词的关系,以及句子的边界。
"""
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
class EmbeddingLayer(nn.Module):
"""
BERT Embedding 层有三种属性:
1. Token Embedding : 类似GPT中的输入样本文字部分
2. Positional Embedding : 通过正余弦添加位置信息
3. Segment Embedding : 样本对顺序的标注,用于区分两个句子。如:“0”代表第一个句子,“1”代表“0”接下来的下一个句子。
将所有三个embedding做加法,得到最终给transformer encoder层的输入。
"""
def __init__(self, d_model, device, vocab_size, context_length):
super().__init__()
self.d_model = d_model
self.vocab_size = vocab_size
self.context_length = context_length
self.device = device
self.token_embedding = nn.Embedding(self.vocab_size, self.d_model).to(self.device)
self.segment_embedding = nn.Embedding(2, self.d_model).to(self.device)
self.position_embedding = nn.Parameter(self.create_position_encoding().to(self.device), requires_grad=False)
self.layer_norm = nn.LayerNorm(self.d_model)
def create_position_encoding(self):
position_encoding = torch.zeros(self.context_length, self.d_model)
position = torch.arange(0, self.context_length, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, self.d_model, 2).float() * (-math.log(10000.0) / self.d_model))
position_encoding[:, 0::2] = torch.sin(position * div_term)
position_encoding[:, 1::2] = torch.cos(position * div_term)
return position_encoding
def forward(self, idx):
idx = idx.to(self.device)
sentence_size = idx.size(-1)
segment_tensor = torch.zeros_like(idx).to(self.device)
segment_tensor[:, sentence_size // 2 + 1:] = 1
position_embedding = self.position_embedding[:idx.size(1), :]
x = self.token_embedding(idx) + self.segment_embedding(segment_tensor) + position_embedding
return self.layer_norm(x)
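
# A minimal shape-check sketch for EmbeddingLayer; the hyperparameter values are
# illustrative assumptions, not values taken from this project:
#
#   emb = EmbeddingLayer(d_model=128, device="cpu", vocab_size=30522, context_length=64)
#   idx = torch.randint(0, 30522, (2, 64))   # (batch_size, seq_len)
#   out = emb(idx)                           # -> (2, 64, 128)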
# Define the feed-forward network
class FeedForwardNetwork(nn.Module):
def __init__(self, d_model, dropout):
super().__init__()
self.d_model = d_model
self.dropout = dropout
self.ffn = nn.Sequential(
nn.Linear(self.d_model, self.d_model * 4),
nn.ReLU(),
nn.Linear(self.d_model * 4, self.d_model),
nn.Dropout(self.dropout)
)
def forward(self, x):
return self.ffn(x)
# Define scaled dot-product attention (a single attention head)
class Attention(nn.Module):
def __init__(self, d_model, head_size, context_length, dropout):
super().__init__()
self.d_model = d_model
self.head_size = head_size
self.context_length = context_length
self.dropout = dropout
self.Wq = nn.Linear(self.d_model, self.head_size, bias=False)
self.Wk = nn.Linear(self.d_model, self.head_size, bias=False)
self.Wv = nn.Linear(self.d_model, self.head_size, bias=False)
self.dropout = nn.Dropout(self.dropout)
def forward(self, x):
q = self.Wq(x)
k = self.Wk(x)
v = self.Wv(x)
weights = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_size)
weights = F.softmax(weights, dim=-1)
weights = self.dropout(weights)
output = weights @ v
return output
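
# For reference, each head computes scaled dot-product attention without a causal mask
# (every token attends to every other token, i.e. bidirectionally):
#   Attention(Q, K, V) = softmax(Q @ K^T / sqrt(head_size)) @ V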
# Define multi-head attention
class MultiHeadAttention(nn.Module):
def __init__(self, d_model, num_heads, head_size, context_length, dropout):
super().__init__()
self.d_model = d_model
self.num_heads = num_heads
self.head_size = head_size
self.context_length = context_length
self.dropout = dropout
self.heads = nn.ModuleList(
[Attention(self.d_model, self.head_size, self.context_length, self.dropout) for _ in range(self.num_heads)])
self.projection_layer = nn.Linear(self.d_model, self.d_model)
self.dropout = nn.Dropout(self.dropout)
def forward(self, x):
head_outputs = [head(x) for head in self.heads]
head_outputs = torch.cat(head_outputs, dim=-1)
out = self.dropout(self.projection_layer(head_outputs))
return out
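
# Note: projection_layer maps d_model -> d_model, so the concatenated head outputs are
# implicitly assumed to satisfy num_heads * head_size == d_model. A guard such as
#   assert num_heads * head_size == d_model
# inside __init__ would make that assumption explicit.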
# Define the Transformer block
class TransformerBlock(nn.Module):
def __init__(self, d_model, num_heads, head_size, context_length, dropout):
super().__init__()
self.ln1 = nn.LayerNorm(d_model)
self.ln2 = nn.LayerNorm(d_model)
self.mha = MultiHeadAttention(d_model, num_heads, head_size, context_length, dropout)
self.ffn = FeedForwardNetwork(d_model, dropout)
def forward(self, x):
x = x + self.mha(self.ln1(x))
x = x + self.ffn(self.ln2(x))
return x
class BERTModel(nn.Module):
def __init__(self, vocab_size, d_model, num_heads, head_size, context_length, num_blocks, dropout, device):
super().__init__()
self.d_model = d_model
self.num_heads = num_heads
self.head_size = head_size
self.context_length = context_length
self.num_blocks = num_blocks
self.device = device
self.embedding = EmbeddingLayer(self.d_model, self.device, vocab_size, self.context_length)
self.transformer_blocks = nn.Sequential(*(
[TransformerBlock(self.d_model, self.num_heads, self.head_size, self.context_length, dropout) for _ in
range(self.num_blocks)] +
[nn.LayerNorm(self.d_model)]
))
def forward(self, idx):
x = self.embedding(idx)
for block in self.transformer_blocks:
x = block(x)
return x
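
# A minimal usage sketch for BERTModel; the hyperparameters are illustrative assumptions
# (note num_heads * head_size == d_model):
#
#   bert = BERTModel(vocab_size=30522, d_model=128, num_heads=8, head_size=16,
#                    context_length=64, num_blocks=2, dropout=0.1, device="cpu")
#   idx = torch.randint(0, 30522, (2, 64))
#   hidden = bert(idx)                       # -> (2, 64, 128), one embedding vector per token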
class NextSentencePrediction(torch.nn.Module):
"""
BERT架构实现目的之一:NSP任务
即判断两个句子是否是连续的。
主要用于句子级别的分类任务。
"""
def __init__(self, hidden):
super().__init__()
self.linear = torch.nn.Linear(hidden, 2)
"""
如果损失函数使用的 nn.NLLLoss 的化,这里需要 LogSoftmax
如果损失函数使用的 nn.CrossEntropyLoss 的化,这里不需要 LogSoftmax,因为 nn.CrossEntropyLoss 函数内部做了
"""
# self.softmax = torch.nn.LogSoftmax(dim=-1)
def forward(self, x):
        x = self.linear(x[:, 0])  # the NSP head uses only the first token, i.e. the [CLS] token
# x = self.softmax(x)
return x
class MaskedLanguageModel(torch.nn.Module):
"""
BERT架构实现目的之二:MLM任务
即从被mask的输入序列中预测原始token。
主要用于单词级别的分类任务。
"""
def __init__(self, hidden, vocab_size, *args, **kwargs):
super().__init__(*args, **kwargs)
self.linear = torch.nn.Linear(hidden, vocab_size)
"""
如果损失函数使用的 nn.NLLLoss 的化,这里需要 LogSoftmax
如果损失函数使用的 nn.CrossEntropyLoss 的化,这里不需要 LogSoftmax,因为 nn.CrossEntropyLoss 函数内部做了
"""
# self.softmax = torch.nn.LogSoftmax(dim=-1)
def forward(self, x):
        x = self.linear(x)  # the MLM head returns logits for every token position
# x = self.softmax(x)
return x
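
# Sketch of how the two heads are typically trained with nn.CrossEntropyLoss; the label
# tensors and the ignore_index value are hypothetical (an ignore_index is a common way to
# skip positions that were not masked):
#
#   mlm_criterion = nn.CrossEntropyLoss(ignore_index=0)
#   nsp_criterion = nn.CrossEntropyLoss()
#   mlm_loss = mlm_criterion(mlm_logits.transpose(1, 2), mlm_labels)  # (B, V, T) vs (B, T)
#   nsp_loss = nsp_criterion(nsp_logits, is_next_labels)              # (B, 2) vs (B,)
#   loss = mlm_loss + nsp_loss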
class NovelModel(torch.nn.Module):
"""
NovelModel 是我们自定义的最终模型
[ BERT Base + Next Sentence Prediction + Masked Language Modeling ]
"""
def __init__(self, bert: BERTModel, vocab_size):
super().__init__()
self.bert = bert
self.next_sentence_p = NextSentencePrediction(self.bert.d_model)
self.mask_lm = MaskedLanguageModel(self.bert.d_model, vocab_size)
def forward(self, x):
x = self.bert(x)
"""模型输出的是两个预测值:一个是被mask掩码掉的token的预测,一个是下一句预测"""
return self.mask_lm(x), self.next_sentence_p(x)
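

# A minimal runnable smoke test; the hyperparameters below are illustrative assumptions,
# not the project's actual training configuration.
if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    vocab_size = 30522
    context_length = 64
    bert = BERTModel(vocab_size=vocab_size, d_model=128, num_heads=8, head_size=16,
                     context_length=context_length, num_blocks=2, dropout=0.1, device=device)
    model = NovelModel(bert, vocab_size).to(device)
    idx = torch.randint(0, vocab_size, (2, context_length), device=device)
    mlm_logits, nsp_logits = model(idx)
    print(mlm_logits.shape)  # torch.Size([2, 64, 30522]) -- per-token vocabulary logits (MLM)
    print(nsp_logits.shape)  # torch.Size([2, 2])         -- is-next / not-next logits (NSP)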