kd13 committed · Commit 7ad7edf (verified) · Parent: 5d1b3ad

Upload 6 files

config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "MyBertForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "auto_map": {
+     "AutoConfig": "configuration_mybert.MyBertConfig",
+     "AutoModelForMaskedLM": "modeling_mybert.MyBertForMaskedLM"
+   },
+   "dtype": "float32",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 512,
+   "initializer_range": 0.02,
+   "intermediate_size": 2048,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 128,
+   "model_type": "mybert",
+   "num_attention_heads": 8,
+   "num_hidden_layers": 8,
+   "pad_token_id": 0,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": true,
+   "transformers_version": "5.0.0",
+   "vocab_size": 16839
+ }
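The `auto_map` block wires the `Auto*` loaders to the custom code shipped in this commit, so the repository can be used directly from the Hub. A minimal loading sketch, assuming the repository is public and you accept remote code execution; the repo id `user/mybert` is a placeholder, not the actual path:

from transformers import AutoConfig, AutoModelForMaskedLM

repo_id = "user/mybert"  # placeholder repo id; substitute the real Hub path
# trust_remote_code=True is required because configuration_mybert.py and
# modeling_mybert.py live in the repository rather than in transformers itself.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)
print(type(config).__name__, type(model).__name__)  # MyBertConfig MyBertForMaskedLM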
configuration_mybert.py ADDED
@@ -0,0 +1,40 @@
+ from transformers import PretrainedConfig
+
+
+ class MyBertConfig(PretrainedConfig):
+     model_type = "mybert"
+
+     def __init__(
+         self,
+         vocab_size=16839,
+         hidden_size=512,
+         num_hidden_layers=8,
+         num_attention_heads=8,
+         intermediate_size=2048,
+         max_position_embeddings=128,
+         hidden_dropout_prob=0.1,
+         attention_probs_dropout_prob=0.1,
+         layer_norm_eps=1e-12,
+         initializer_range=0.02,
+         rope_theta=10000.0,
+         pad_token_id=0,
+         tie_word_embeddings=True,
+         **kwargs,
+     ):
+         super().__init__(
+             pad_token_id=pad_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+         assert hidden_size % num_attention_heads == 0
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.intermediate_size = intermediate_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_dropout_prob = hidden_dropout_prob
+         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+         self.layer_norm_eps = layer_norm_eps
+         self.initializer_range = initializer_range
+         self.rope_theta = rope_theta
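`MyBertConfig` follows the standard `PretrainedConfig` pattern: the defaults mirror config.json, any keyword can be overridden, and the head-count divisibility constraint is checked in `__init__`. A quick round-trip sketch; the override value and the output directory are illustrative only, and the import assumes configuration_mybert.py is on the import path:

from configuration_mybert import MyBertConfig

cfg = MyBertConfig(num_hidden_layers=4)          # hypothetical override of the default 8
assert cfg.hidden_size % cfg.num_attention_heads == 0
cfg.save_pretrained("./mybert-small")            # placeholder directory; writes config.json
reloaded = MyBertConfig.from_pretrained("./mybert-small")
print(reloaded.num_hidden_layers)                # 4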
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6d32123898520b79f57ee7d1e84ba117683332683530fe619c80ee30660d61c
+ size 170063628
modeling_mybert.py ADDED
@@ -0,0 +1,280 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import PreTrainedModel
+ from transformers.modeling_outputs import MaskedLMOutput, BaseModelOutput
+
+ from .configuration_mybert import MyBertConfig
+
+
+ def _build_rope_cache(head_dim, max_seq_len, base=10000.0):
+     inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
+     t = torch.arange(max_seq_len, dtype=torch.float32)
+     freqs = torch.outer(t, inv_freq)
+     emb = torch.cat((freqs, freqs), dim=-1)
+     return emb.cos(), emb.sin()
+
+
+ def _rotate_half(x):
+     x1, x2 = x.chunk(2, dim=-1)
+     return torch.cat((-x2, x1), dim=-1)
+
+
+ def _apply_rope(q, k, cos, sin):
+     cos = cos.to(q.dtype).unsqueeze(0).unsqueeze(0)
+     sin = sin.to(q.dtype).unsqueeze(0).unsqueeze(0)
+     q_rot = (q * cos) + (_rotate_half(q) * sin)
+     k_rot = (k * cos) + (_rotate_half(k) * sin)
+     return q_rot, k_rot
+
+
+ class MyBertEmbeddings(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.word_embeddings = nn.Embedding(
+             config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
+         )
+         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+     def forward(self, input_ids):
+         x = self.word_embeddings(input_ids)
+         x = self.LayerNorm(x)
+         x = self.dropout(x)
+         return x
+
+
+ class MyBertSelfAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.num_attention_heads = config.num_attention_heads
+         self.attention_head_size = config.hidden_size // config.num_attention_heads
+         self.all_head_size = config.hidden_size
+         self.query = nn.Linear(config.hidden_size, self.all_head_size)
+         self.key = nn.Linear(config.hidden_size, self.all_head_size)
+         self.value = nn.Linear(config.hidden_size, self.all_head_size)
+         self.dropout_prob = config.attention_probs_dropout_prob
+
+     def forward(self, hidden_states, attention_mask=None, cos=None, sin=None):
+         q = self.query(hidden_states)
+         k = self.key(hidden_states)
+         v = self.value(hidden_states)
+         new_shape = q.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+         q = q.view(*new_shape).transpose(1, 2)
+         k = k.view(*new_shape).transpose(1, 2)
+         v = v.view(*new_shape).transpose(1, 2)
+         if cos is not None and sin is not None:
+             q, k = _apply_rope(q, k, cos, sin)
+         context = F.scaled_dot_product_attention(
+             q, k, v,
+             attn_mask=attention_mask,
+             dropout_p=self.dropout_prob if self.training else 0.0,
+             is_causal=False,
+         )
+         context = context.transpose(1, 2).contiguous()
+         new_context_shape = context.size()[:-2] + (self.all_head_size,)
+         return context.view(*new_context_shape)
+
+
+ class MyBertSelfOutput(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+     def forward(self, hidden_states):
+         return self.dropout(self.dense(hidden_states))
+
+
+ class MyBertAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.self = MyBertSelfAttention(config)
+         self.output = MyBertSelfOutput(config)
+
+     def forward(self, hidden_states, attention_mask=None, cos=None, sin=None):
+         self_outputs = self.self(hidden_states, attention_mask, cos, sin)
+         return self.output(self_outputs)
+
+
+ class MyBertIntermediate(nn.Module):
+     def __init__(self, config: MyBertConfig):
+         super().__init__()
+         self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+         self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         gate = F.silu(self.gate_proj(hidden_states))
+         up = self.up_proj(hidden_states)
+         return gate * up
+
+ class MyBertOutput(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+     def forward(self, hidden_states):
+         return self.dropout(self.dense(hidden_states))
+
+
+ class MyBertLayer(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.attention = MyBertAttention(config)
+         self.ffn_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.intermediate = MyBertIntermediate(config)
+         self.output = MyBertOutput(config)
+
+     def forward(self, hidden_states, attention_mask=None, cos=None, sin=None):
+         normed = self.attention_layernorm(hidden_states)
+         attention_output = self.attention(normed, attention_mask, cos, sin)
+         hidden_states = hidden_states + attention_output
+         normed = self.ffn_layernorm(hidden_states)
+         intermediate_out = self.intermediate(normed)
+         layer_output = self.output(intermediate_out)
+         hidden_states = hidden_states + layer_output
+         return hidden_states
+
+
+ class MyBertEncoder(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.layer = nn.ModuleList([MyBertLayer(config) for _ in range(config.num_hidden_layers)])
+         self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+     def forward(self, hidden_states, attention_mask=None, cos=None, sin=None):
+         for layer_module in self.layer:
+             hidden_states = layer_module(hidden_states, attention_mask, cos, sin)
+         return self.final_layernorm(hidden_states)
+
+
+ class MyBertPredictionHeadTransform(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.transform_act_fn = nn.GELU()
+         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+     def forward(self, hidden_states):
+         hidden_states = self.dense(hidden_states)
+         hidden_states = self.transform_act_fn(hidden_states)
+         hidden_states = self.LayerNorm(hidden_states)
+         return hidden_states
+
+
+ class MyBertLMPredictionHead(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.transform = MyBertPredictionHeadTransform(config)
+         self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
+
+     def forward(self, hidden_states):
+         hidden_states = self.transform(hidden_states)
+         hidden_states = self.decoder(hidden_states)
+         return hidden_states
+
+
+ class MyBertOnlyMLMHead(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.predictions = MyBertLMPredictionHead(config)
+
+     def forward(self, sequence_output):
+         return self.predictions(sequence_output)
+
+
+ class MyBertPreTrainedModel(PreTrainedModel):
+     config_class = MyBertConfig
+     base_model_prefix = "mybert"
+     supports_gradient_checkpointing = False
+     _no_split_modules = ["MyBertLayer"]
+
+     def _init_weights(self, module):
+         std = self.config.initializer_range
+         if isinstance(module, nn.Linear):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.padding_idx is not None:
+                 module.weight.data[module.padding_idx].zero_()
+         elif isinstance(module, nn.LayerNorm):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+
+
+ class MyBertModel(MyBertPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.embeddings = MyBertEmbeddings(config)
+         self.encoder = MyBertEncoder(config)
+
+         head_dim = config.hidden_size // config.num_attention_heads
+         cos, sin = _build_rope_cache(head_dim, config.max_position_embeddings, config.rope_theta)
+         self.register_buffer("rope_cos", cos, persistent=True)
+         self.register_buffer("rope_sin", sin, persistent=True)
+
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.embeddings.word_embeddings
+
+     def set_input_embeddings(self, value):
+         self.embeddings.word_embeddings = value
+
+     def forward(self, input_ids=None, attention_mask=None, return_dict=True, **kwargs):
+         _, T = input_ids.shape
+         head_dim = self.config.hidden_size // self.config.num_attention_heads
+         cos, sin = _build_rope_cache(head_dim, T, self.config.rope_theta)
+         cos = cos.to(device=input_ids.device, dtype=self.embeddings.word_embeddings.weight.dtype)
+         sin = sin.to(device=input_ids.device, dtype=self.embeddings.word_embeddings.weight.dtype)
+
+         attn_mask = None
+         if attention_mask is not None:
+             attn_mask = attention_mask.bool()[:, None, None, :]
+
+         hidden = self.embeddings(input_ids)
+         sequence_output = self.encoder(hidden, attn_mask, cos, sin)
+         if not return_dict:
+             return (sequence_output,)
+         return BaseModelOutput(last_hidden_state=sequence_output)
+
+
+ class MyBertForMaskedLM(MyBertPreTrainedModel):
+     _tied_weights_keys = {
+         "cls.predictions.decoder.weight": "mybert.embeddings.word_embeddings.weight",
+     }
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.mybert = MyBertModel(config)
+         self.cls = MyBertOnlyMLMHead(config)
+         self.post_init()
+
+     def get_output_embeddings(self):
+         return self.cls.predictions.decoder
+
+     def set_output_embeddings(self, new_embeddings):
+         self.cls.predictions.decoder = new_embeddings
+
+     def forward(self, input_ids=None, attention_mask=None, labels=None, return_dict=True, **kwargs):
+         outputs = self.mybert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
+         sequence_output = outputs.last_hidden_state
+         prediction_scores = self.cls(sequence_output)
+
+         loss = None
+         if labels is not None:
+             loss = F.cross_entropy(
+                 prediction_scores.view(-1, self.config.vocab_size),
+                 labels.view(-1),
+                 ignore_index=-100,
+             )
+
+         if not return_dict:
+             output = (prediction_scores,)
+             return ((loss,) + output) if loss is not None else output
+
+         return MaskedLMOutput(loss=loss, logits=prediction_scores)
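modeling_mybert.py implements a pre-norm BERT-style encoder with rotary position embeddings, a gated (SiLU) feed-forward block, and an MLM head whose decoder weight is tied to the word embeddings via `_tied_weights_keys`. A masked-token inference sketch, assuming the files above are pushed to a Hub repository; the repo id and the example sentence are placeholders:

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

repo_id = "user/mybert"  # placeholder; substitute the actual Hub path
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)
model.eval()

text = f"this is a {tokenizer.mask_token} sentence."  # arbitrary example text
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

with torch.no_grad():
    logits = model(**inputs).logits  # (batch, seq_len, vocab_size)

# Pick the highest-scoring token at each [MASK] position.
mask_positions = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_ids = logits[0, mask_positions].argmax(dim=-1)
print(tokenizer.decode(predicted_ids))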
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "backend": "tokenizers",
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "is_local": false,
+   "mask_token": "[MASK]",
+   "model_max_length": 128,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": true,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "TokenizersBackend",
+   "unk_token": "[UNK]"
+ }
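The tokenizer configuration describes a lowercasing, accent-stripping tokenizer with the usual BERT special tokens; `model_max_length` is 128, matching `max_position_embeddings` in config.json. A short sketch showing truncation to that limit (the repo id and input string are placeholders):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("user/mybert")  # placeholder repo id
enc = tokenizer("an arbitrary input sentence " * 50, truncation=True, max_length=128)
print(len(enc["input_ids"]))  # at most 128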