jacksuuuu committed
Commit 08dbe3c · verified · 1 Parent(s): 6ddf075

Update to checkpoint 35000 with fixed Pre-LN architecture (iter 35k, loss 3.46)
config.json CHANGED
@@ -1,30 +1,22 @@
 {
+  "activation_function": "gelu",
   "architectures": [
-    "GPT2LMHeadModel"
+    "NanoGPTLMHeadModel"
   ],
-  "model_type": "gpt2",
-  "vocab_size": 50257,
-  "n_positions": 512,
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "nanogpt",
   "n_embd": 384,
-  "n_layer": 8,
   "n_head": 8,
   "n_inner": 1536,
-  "activation_function": "gelu",
+  "n_layer": 8,
+  "n_positions": 512,
   "resid_pdrop": 0.1,
-  "embd_pdrop": 0.1,
-  "attn_pdrop": 0.1,
-  "layer_norm_epsilon": 1e-05,
-  "initializer_range": 0.02,
-  "bos_token_id": 50256,
-  "eos_token_id": 50256,
-  "tie_word_embeddings": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.35.0",
-  "mlx_training": {
-    "framework": "MLX",
-    "iterations": 20000,
-    "final_loss": 0.7582720518112183,
-    "dataset": "tinystories",
-    "max_tokens": 2000000
-  }
-}
+  "transformers_version": "4.51.3",
+  "vocab_size": 50257
+}
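
With model_type now set to "nanogpt" and the architecture renamed to NanoGPTLMHeadModel, the stock GPT-2 loaders no longer apply; the custom classes from modeling_nanogpt.py have to be registered with the Auto classes (or instantiated directly). A minimal sketch of one way to do this, assuming a local checkout of the repo; the "./nanogpt-checkpoint" path is a placeholder:

    from transformers import AutoConfig, AutoModelForCausalLM
    from modeling_nanogpt import NanoGPTConfig, NanoGPTLMHeadModel

    # Make the "nanogpt" model_type resolvable by the Auto classes
    AutoConfig.register("nanogpt", NanoGPTConfig)
    AutoModelForCausalLM.register(NanoGPTConfig, NanoGPTLMHeadModel)

    # Load config.json and pytorch_model.bin from a local copy of this repo
    model = AutoModelForCausalLM.from_pretrained("./nanogpt-checkpoint")
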
generation_config.json CHANGED
@@ -1,9 +1,6 @@
 {
+  "_from_model_config": true,
   "bos_token_id": 50256,
   "eos_token_id": 50256,
-  "max_length": 512,
-  "temperature": 1.0,
-  "top_k": 50,
-  "top_p": 0.95,
-  "do_sample": true
-}
+  "transformers_version": "4.51.3"
+}
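
Since max_length, temperature, top_k, top_p and do_sample were dropped from generation_config.json, sampling settings now have to be passed explicitly at generation time. An illustrative call reusing the old defaults (model as registered above; the local path is again a placeholder):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("./nanogpt-checkpoint")  # placeholder path
    inputs = tokenizer("Once upon a time", return_tensors="pt")

    output_ids = model.generate(
        inputs["input_ids"],
        max_length=512,   # former generation_config.json default
        do_sample=True,
        temperature=1.0,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
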
modeling_nanogpt.py ADDED
@@ -0,0 +1,267 @@
+"""
+Custom HuggingFace-compatible GPT model with Pre-LN architecture
+Matches the MLX nanoGPT implementation exactly
+"""
+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+import math
+
+
+class NanoGPTConfig(PretrainedConfig):
+    """Configuration for NanoGPT model"""
+    model_type = "nanogpt"
+
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=512,
+        n_embd=384,
+        n_layer=8,
+        n_head=8,
+        n_inner=1536,
+        activation_function="gelu",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        **kwargs
+    ):
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+
+
+class NanoGPTAttention(nn.Module):
+    """Multi-head self-attention with Pre-LN"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.head_dim = self.n_embd // self.n_head
+        self.scale = math.sqrt(self.head_dim)
+
+        # Combined QKV projection (standard Linear, not Conv1D)
+        self.qkv_proj = nn.Linear(config.n_embd, 3 * config.n_embd)
+        self.out_proj = nn.Linear(config.n_embd, config.n_embd)
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+
+        # Causal mask
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones(config.n_positions, config.n_positions)).view(
+                1, 1, config.n_positions, config.n_positions
+            ),
+        )
+
+    def forward(self, x):
+        B, T, C = x.size()  # batch, sequence length, embedding dim
+
+        # Project and split into Q, K, V
+        qkv = self.qkv_proj(x)  # (B, T, 3*C)
+        qkv = qkv.view(B, T, 3, self.n_head, self.head_dim)  # (B, T, 3, n_head, head_dim)
+        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, B, n_head, T, head_dim)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        # Scaled dot-product attention
+        scores = (q @ k.transpose(-2, -1)) / self.scale  # (B, n_head, T, T)
+
+        # Apply causal mask
+        scores = scores.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+
+        attn_weights = torch.softmax(scores, dim=-1)
+        attn_weights = self.attn_dropout(attn_weights)
+
+        # Combine heads
+        out = attn_weights @ v  # (B, n_head, T, head_dim)
+        out = out.transpose(1, 2).contiguous().view(B, T, C)  # (B, T, C)
+
+        return self.resid_dropout(self.out_proj(out))
+
+
+class NanoGPTMLP(nn.Module):
+    """Feed-forward network"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.fc1 = nn.Linear(config.n_embd, config.n_inner)
+        self.fc2 = nn.Linear(config.n_inner, config.n_embd)
+        self.act = nn.GELU()
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        return self.dropout(x)
+
+
+class NanoGPTBlock(nn.Module):
+    """Transformer block with Pre-LN architecture"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.ln1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.attn = NanoGPTAttention(config)
+        self.ln2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.mlp = NanoGPTMLP(config)
+
+    def forward(self, x):
+        # Pre-norm architecture (LayerNorm before attention/MLP)
+        x = x + self.attn(self.ln1(x))
+        x = x + self.mlp(self.ln2(x))
+        return x
+
+
+class NanoGPTModel(PreTrainedModel):
+    """NanoGPT model with Pre-LN architecture"""
+    config_class = NanoGPTConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        # Embeddings
+        self.token_embedding = nn.Embedding(config.vocab_size, config.n_embd)
+        self.position_embedding = nn.Embedding(config.n_positions, config.n_embd)
+        self.drop = nn.Dropout(config.embd_pdrop)
+
+        # Transformer blocks
+        self.blocks = nn.ModuleList([NanoGPTBlock(config) for _ in range(config.n_layer)])
+
+        # Final layer norm
+        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+
+        # LM head (tied with token_embedding)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        # Tie weights
+        self.lm_head.weight = self.token_embedding.weight
+
+        # Initialize weights
+        self.apply(self._init_weights)
+
+    def _init_weights(self, module):
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+
+    def forward(
+        self,
+        input_ids=None,
+        past_key_values=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        labels=None,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None:
+            batch_size, seq_length = input_ids.size()
+        else:
+            batch_size, seq_length = inputs_embeds.size()[:-1]
+
+        if position_ids is None:
+            if input_ids is not None:
+                device = input_ids.device
+            elif inputs_embeds is not None:
+                device = inputs_embeds.device
+            else:
+                device = next(self.parameters()).device
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
+            position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
+
+        # Embeddings
+        if inputs_embeds is None:
+            inputs_embeds = self.token_embedding(input_ids)
+
+        position_embeds = self.position_embedding(position_ids)
+        hidden_states = self.drop(inputs_embeds + position_embeds)
+
+        # Transformer blocks
+        for block in self.blocks:
+            hidden_states = block(hidden_states)
+
+        # Final layer norm
+        hidden_states = self.ln_f(hidden_states)
+
+        # LM head
+        lm_logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        if not return_dict:
+            output = (lm_logits,)
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=None,
+            hidden_states=None,
+            attentions=None,
+        )
+
+
+class NanoGPTLMHeadModel(PreTrainedModel):
+    """Causal language model wrapper"""
+    config_class = NanoGPTConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = NanoGPTModel(config)
+
+    def forward(self, *args, **kwargs):
+        return self.transformer(*args, **kwargs)
+
+    def generate(self, *args, **kwargs):
+        """Use HF's generate method"""
+        # Remove unused kwargs that transformers might pass
+        kwargs.pop("attention_mask", None)
+        kwargs.pop("token_type_ids", None)
+        return super().generate(*args, **kwargs)
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
+        # Remove unused kwargs
+        kwargs.pop("attention_mask", None)
+        kwargs.pop("token_type_ids", None)
+
+        # Our model doesn't support KV caching, so we need to pass the full sequence
+        return {
+            "input_ids": input_ids,
+        }
+
+    def can_generate(self):
+        """Indicate this model can generate"""
+        return True
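
Not part of the commit, but a quick smoke test of the classes above may help when hacking on this file: instantiate the model with the values from config.json, run a forward pass with labels, and check the shapes. Illustrative only, with untrained weights:

    import torch
    from modeling_nanogpt import NanoGPTConfig, NanoGPTLMHeadModel

    config = NanoGPTConfig(n_embd=384, n_layer=8, n_head=8, n_inner=1536, n_positions=512)
    model = NanoGPTLMHeadModel(config)
    model.eval()

    input_ids = torch.randint(0, config.vocab_size, (1, 16))
    with torch.no_grad():
        out = model(input_ids=input_ids, labels=input_ids)

    print(out.logits.shape)  # torch.Size([1, 16, 50257])
    print(float(out.loss))   # roughly ln(50257) ≈ 10.8 at random init
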
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d21ad8e8646491e7510cb58cc5d542ca21db6a8e174f5399fba9546662cf317
+size 143190611
special_tokens_map.json CHANGED
@@ -1,5 +1,23 @@
 {
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
-  "unk_token": "<|endoftext|>"
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
 }
tokenizer_config.json CHANGED
@@ -1,4 +1,5 @@
 {
+  "add_bos_token": false,
   "add_prefix_space": false,
   "added_tokens_decoder": {
     "50256": {
@@ -13,8 +14,10 @@
   "bos_token": "<|endoftext|>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
+  "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 1024,
+  "pad_token": null,
   "tokenizer_class": "GPT2Tokenizer",
   "unk_token": "<|endoftext|>"
 }
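
The tokenizer itself is still plain GPT-2 BPE; the change only spells out the special tokens as full entries and adds the errors and pad_token fields. A small check, assuming the same placeholder local path as above. Note that model_max_length stays at 1024 while the model's n_positions is 512, so inputs should be truncated to 512 before a forward pass:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("./nanogpt-checkpoint")

    print(tokenizer.bos_token, tokenizer.eos_token)  # <|endoftext|> <|endoftext|>
    print(tokenizer.eos_token_id)                    # 50256

    ids = tokenizer("Once upon a time", truncation=True, max_length=512)["input_ids"]
    print(ids)
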
vocab.json CHANGED
The diff for this file is too large to render. See raw diff