kd13 committed on
Commit
68b16ff
·
verified ·
1 Parent(s): 93b37a1

Upload 6 files

Browse files
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MyBertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_mybert.MyBertConfig",
8
+ "AutoModelForMaskedLM": "modeling_mybert.MyBertForMaskedLM"
9
+ },
10
+ "dtype": "float32",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 384,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 1536,
15
+ "layer_norm_eps": 1e-12,
16
+ "max_position_embeddings": 128,
17
+ "model_type": "mybert",
18
+ "num_attention_heads": 8,
19
+ "num_hidden_layers": 8,
20
+ "pad_token_id": 0,
21
+ "rope_theta": 10000.0,
22
+ "tie_word_embeddings": true,
23
+ "transformers_version": "5.0.0",
24
+ "vocab_size": 16839
25
+ }
configuration_mybert.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
+
4
class MyBertConfig(PretrainedConfig):
    """Configuration for MyBert, a compact BERT-style masked-LM encoder
    that uses rotary position embeddings (RoPE) instead of learned
    position embeddings.

    Args:
        vocab_size: Size of the token vocabulary.
        hidden_size: Width of the transformer hidden states.
        num_hidden_layers: Number of transformer blocks.
        num_attention_heads: Attention heads per block; must divide
            ``hidden_size`` evenly.
        intermediate_size: Width of the feed-forward expansion.
        max_position_embeddings: Maximum sequence length the RoPE cache
            is built for.
        hidden_dropout_prob: Dropout applied to embeddings and
            sub-layer outputs.
        attention_probs_dropout_prob: Dropout applied inside attention.
        layer_norm_eps: Epsilon for all LayerNorm modules.
        initializer_range: Std-dev of the normal weight initializer.
        rope_theta: Base frequency for the RoPE tables.
        pad_token_id: Padding token id (embedding row kept at zero).
        tie_word_embeddings: Whether the MLM decoder shares weights with
            the input embeddings.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by
            ``num_attention_heads``.
    """

    model_type = "mybert"

    def __init__(
        self,
        vocab_size=16839,
        hidden_size=384,
        num_hidden_layers=8,
        num_attention_heads=8,
        intermediate_size=1536,
        max_position_embeddings=128,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        layer_norm_eps=1e-12,
        initializer_range=0.02,
        rope_theta=10000.0,
        pad_token_id=0,
        tie_word_embeddings=True,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        # Validate with an explicit exception rather than `assert`, which
        # is silently stripped when Python runs with -O.
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by "
                f"num_attention_heads ({num_attention_heads})"
            )
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        self.rope_theta = rope_theta
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19c03d833407c6c1355deb49e17e506656d5d6005aa7e173753760fb000f991f
3
+ size 83331444
modeling_mybert.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from transformers import PreTrainedModel
5
+ from transformers.modeling_outputs import MaskedLMOutput, BaseModelOutput
6
+
7
+ from .configuration_mybert import MyBertConfig
8
+
9
+
10
+ def _build_rope_cache(head_dim, max_seq_len, base=10000.0):
11
+ inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
12
+ t = torch.arange(max_seq_len, dtype=torch.float32)
13
+ freqs = torch.outer(t, inv_freq)
14
+ emb = torch.cat((freqs, freqs), dim=-1)
15
+ return emb.cos(), emb.sin()
16
+
17
+
18
+ def _rotate_half(x):
19
+ x1, x2 = x.chunk(2, dim=-1)
20
+ return torch.cat((-x2, x1), dim=-1)
21
+
22
+
23
+ def _apply_rope(q, k, cos, sin):
24
+ cos = cos.to(q.dtype).unsqueeze(0).unsqueeze(0)
25
+ sin = sin.to(q.dtype).unsqueeze(0).unsqueeze(0)
26
+ q_rot = (q * cos) + (_rotate_half(q) * sin)
27
+ k_rot = (k * cos) + (_rotate_half(k) * sin)
28
+ return q_rot, k_rot
29
+
30
+
31
class MyBertEmbeddings(nn.Module):
    """Token embedding lookup followed by LayerNorm and dropout.

    Positions are encoded later via RoPE inside attention, so no
    position-embedding table is needed here.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids):
        embedded = self.word_embeddings(input_ids)
        return self.dropout(self.LayerNorm(embedded))
45
+
46
+
47
class MyBertSelfAttention(nn.Module):
    """Multi-head self-attention with optional RoPE, using the fused
    ``F.scaled_dot_product_attention`` kernel."""

    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = config.hidden_size
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout_prob = config.attention_probs_dropout_prob

    def _split_heads(self, x):
        # (..., T, H*D) -> (..., H, T, D)
        shaped = x.view(*x.shape[:-1], self.num_attention_heads, self.attention_head_size)
        return shaped.transpose(1, 2)

    def forward(self, hidden_states, attention_mask=None, cos=None, sin=None):
        q = self._split_heads(self.query(hidden_states))
        k = self._split_heads(self.key(hidden_states))
        v = self._split_heads(self.value(hidden_states))
        if cos is not None and sin is not None:
            q, k = _apply_rope(q, k, cos, sin)
        # Attention dropout is only active in training mode.
        context = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=attention_mask,
            dropout_p=self.dropout_prob if self.training else 0.0,
            is_causal=False,
        )
        # (..., H, T, D) -> (..., T, H*D)
        return context.transpose(1, 2).flatten(-2)
77
+
78
+
79
class MyBertSelfOutput(nn.Module):
    """Output projection applied after self-attention, followed by dropout."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states):
        projected = self.dense(hidden_states)
        return self.dropout(projected)
87
+
88
+
89
class MyBertAttention(nn.Module):
    """Attention sub-block: multi-head self-attention plus output projection."""

    def __init__(self, config):
        super().__init__()
        self.self = MyBertSelfAttention(config)
        self.output = MyBertSelfOutput(config)

    def forward(self, hidden_states, attention_mask=None, cos=None, sin=None):
        attended = self.self(hidden_states, attention_mask, cos, sin)
        return self.output(attended)
98
+
99
+
100
class MyBertIntermediate(nn.Module):
    """Feed-forward expansion: hidden -> intermediate width with GELU."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.intermediate_act_fn = nn.GELU()

    def forward(self, hidden_states):
        expanded = self.dense(hidden_states)
        return self.intermediate_act_fn(expanded)
108
+
109
+
110
class MyBertOutput(nn.Module):
    """Feed-forward contraction: intermediate -> hidden width, then dropout."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states):
        contracted = self.dense(hidden_states)
        return self.dropout(contracted)
118
+
119
+
120
class MyBertLayer(nn.Module):
    """Pre-LayerNorm transformer block: attention and feed-forward
    sub-blocks, each wrapped in a residual connection."""

    def __init__(self, config):
        super().__init__()
        self.attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = MyBertAttention(config)
        self.ffn_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.intermediate = MyBertIntermediate(config)
        self.output = MyBertOutput(config)

    def forward(self, hidden_states, attention_mask=None, cos=None, sin=None):
        # Attention sub-block (norm applied before, residual added after).
        residual = hidden_states
        hidden_states = residual + self.attention(
            self.attention_layernorm(residual), attention_mask, cos, sin
        )
        # Feed-forward sub-block with its own residual.
        residual = hidden_states
        return residual + self.output(self.intermediate(self.ffn_layernorm(residual)))
138
+
139
+
140
class MyBertEncoder(nn.Module):
    """Stack of MyBertLayer blocks with a final LayerNorm (pre-LN convention)."""

    def __init__(self, config):
        super().__init__()
        self.layer = nn.ModuleList(
            MyBertLayer(config) for _ in range(config.num_hidden_layers)
        )
        self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None, cos=None, sin=None):
        for block in self.layer:
            hidden_states = block(hidden_states, attention_mask, cos, sin)
        return self.final_layernorm(hidden_states)
150
+
151
+
152
class MyBertPredictionHeadTransform(nn.Module):
    """Dense + GELU + LayerNorm transform applied before the MLM decoder."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.transform_act_fn = nn.GELU()
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        return self.LayerNorm(self.transform_act_fn(self.dense(hidden_states)))
164
+
165
+
166
class MyBertLMPredictionHead(nn.Module):
    """MLM head: transform hidden states, then project to vocabulary logits."""

    def __init__(self, config):
        super().__init__()
        self.transform = MyBertPredictionHeadTransform(config)
        # The decoder weight may be tied to the input embeddings by the
        # surrounding model (see MyBertForMaskedLM._tied_weights_keys).
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=True)

    def forward(self, hidden_states):
        return self.decoder(self.transform(hidden_states))
176
+
177
+
178
class MyBertOnlyMLMHead(nn.Module):
    """Thin wrapper exposing the MLM head under the ``predictions`` attribute."""

    def __init__(self, config):
        super().__init__()
        self.predictions = MyBertLMPredictionHead(config)

    def forward(self, sequence_output):
        return self.predictions(sequence_output)
185
+
186
+
187
class MyBertPreTrainedModel(PreTrainedModel):
    """Base class wiring MyBert into the transformers save/load machinery."""

    config_class = MyBertConfig
    base_model_prefix = "mybert"
    supports_gradient_checkpointing = False
    _no_split_modules = ["MyBertLayer"]

    def _init_weights(self, module):
        """Initialize a single sub-module: normal(0, initializer_range) for
        linear/embedding weights, zeros for biases and the padding row,
        identity (weight=1, bias=0) for LayerNorm."""
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.padding_idx is not None:
                with torch.no_grad():
                    module.weight[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            nn.init.zeros_(module.bias)
            nn.init.ones_(module.weight)
206
+
207
+
208
class MyBertModel(MyBertPreTrainedModel):
    """Bare MyBert encoder: token embeddings plus the transformer stack,
    with positions injected via RoPE. Returns final hidden states only
    (no pooler).
    """

    def __init__(self, config):
        super().__init__(config)
        self.embeddings = MyBertEmbeddings(config)
        self.encoder = MyBertEncoder(config)

        # Precompute RoPE tables once for the maximum supported length.
        # Persistent so they travel with the checkpoint.
        head_dim = config.hidden_size // config.num_attention_heads
        cos, sin = _build_rope_cache(head_dim, config.max_position_embeddings, config.rope_theta)
        self.register_buffer("rope_cos", cos, persistent=True)
        self.register_buffer("rope_sin", sin, persistent=True)

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def forward(self, input_ids=None, attention_mask=None, return_dict=True, **kwargs):
        """Encode ``input_ids`` of shape (batch, seq_len) into hidden states.

        ``attention_mask`` uses 1 for real tokens and 0 for padding.
        Returns a ``BaseModelOutput`` (or a 1-tuple when
        ``return_dict=False``). ``input_ids`` is required despite the
        ``None`` default, which exists only for HF signature compatibility.
        """
        _, seq_len = input_ids.shape
        target_dtype = self.embeddings.word_embeddings.weight.dtype

        # Reuse the RoPE tables cached in __init__ instead of rebuilding
        # them on every call (the buffers were previously registered but
        # never read). Only rebuild for sequences longer than the cache.
        if seq_len <= self.rope_cos.size(0):
            cos = self.rope_cos[:seq_len]
            sin = self.rope_sin[:seq_len]
        else:
            head_dim = self.config.hidden_size // self.config.num_attention_heads
            cos, sin = _build_rope_cache(head_dim, seq_len, self.config.rope_theta)
        cos = cos.to(device=input_ids.device, dtype=target_dtype)
        sin = sin.to(device=input_ids.device, dtype=target_dtype)

        # Broadcastable boolean mask for SDPA: (batch, 1, 1, seq_len),
        # True where attention is allowed.
        attn_mask = None
        if attention_mask is not None:
            attn_mask = attention_mask.bool()[:, None, None, :]

        hidden = self.embeddings(input_ids)
        sequence_output = self.encoder(hidden, attn_mask, cos, sin)
        if not return_dict:
            return (sequence_output,)
        return BaseModelOutput(last_hidden_state=sequence_output)
243
+
244
+
245
class MyBertForMaskedLM(MyBertPreTrainedModel):
    """MyBert encoder with a masked-language-modeling head on top."""

    # The MLM decoder weight is shared with the input embedding matrix.
    _tied_weights_keys = {
        "cls.predictions.decoder.weight": "mybert.embeddings.word_embeddings.weight",
    }

    def __init__(self, config):
        super().__init__(config)
        self.mybert = MyBertModel(config)
        self.cls = MyBertOnlyMLMHead(config)
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings

    def forward(self, input_ids=None, attention_mask=None, labels=None, return_dict=True, **kwargs):
        """Compute vocabulary logits and, when ``labels`` is given, the
        cross-entropy loss (label positions equal to -100 are ignored)."""
        encoder_out = self.mybert(
            input_ids=input_ids, attention_mask=attention_mask, return_dict=True
        )
        logits = self.cls(encoder_out.last_hidden_state)

        loss = None
        if labels is not None:
            loss = F.cross_entropy(
                logits.view(-1, self.config.vocab_size),
                labels.view(-1),
                ignore_index=-100,
            )

        if return_dict:
            return MaskedLMOutput(loss=loss, logits=logits)
        tuple_out = (logits,)
        return tuple_out if loss is None else (loss,) + tuple_out
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 128,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": true,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }