Upload folder using huggingface_hub
- README.md +17 -8
- __pycache__/attention.cpython-310.pyc +0 -0
- __pycache__/bigram_model.cpython-310.pyc +0 -0
- __pycache__/conf.cpython-310.pyc +0 -0
- __pycache__/data_utils.cpython-310.pyc +0 -0
- __pycache__/tokenizer_utils.cpython-310.pyc +0 -0
- app.py +41 -0
- attention.py +68 -0
- bigram_model.py +93 -0
- conf.py +4 -0
- data_utils.py +61 -0
- flagged/log.csv +2 -0
- input.txt +0 -0
- nano_gpt_ckpts/ckpt_5k_iters.pt +3 -0
- tokenizer_utils.py +21 -0
- train_shakespeare.ipynb +301 -0
README.md
CHANGED
@@ -1,12 +1,21 @@
 ---
-title:
-emoji: 🏃
-colorFrom: purple
-colorTo: gray
-sdk: gradio
-sdk_version: 4.1.0
+title: raghunc0nano-gpt-shakespeare-demo
 app_file: app.py
-
+sdk: gradio
+sdk_version: 3.39.0
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+This is an example of a nano-gpt trained on the tiny-shakespeare text (about 1 MB). The model follows Karpathy's nano-gpt video exactly and was trained for 5000 iterations. The training and text-generation code are in the [train_shakespeare.ipynb](./train_shakespeare.ipynb) file. To generate text from the model at inference time, run the following lines:
+
+```
+context = torch.zeros((1, 1), dtype=torch.long)
+print(char_tokenizer.decode(m3.generate(context, max_new_tokens=500)[0].tolist()))
+```
+
+Here we start with a new "context" of a single zero token (standing in for a "START" token), and "max_new_tokens" is the maximum number of tokens (characters, in this demo) that will be generated. It is limited to 500 so that inference finishes on CPU in a reasonable time (around 10 s), which suits a free Hugging Face Gradio demo. Inference on GPU supports much larger values of max_new_tokens; tested up to a few thousand.
+
+The model checkpoint is in the 'nano_gpt_ckpts' dir. The hyperparameters are exactly the ones shown in the video:
+```
+vocab_size=65, n_layer=6, n_head=6, n_embed=384, block_size=256,
+bias=False, dropout=0.2
+```
+
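For reference, the snippet above can be run end to end with the helpers that ship in this repo. Below is a minimal CPU inference sketch (it assumes the repo files and the checkpoint under ./nano_gpt_ckpts are on the path; map_location="cpu" is added here so the GPU-trained checkpoint also loads on CPU-only hardware):

```
import os
import torch

from bigram_model import BigramLanguageModel
from data_utils import load_text, load_int_char_tokenizer, n_embed, n_head, n_layer, BLOCK_SIZE

# rebuild the model with the training hyperparameters, then load the 5k-iteration checkpoint
model = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE,
                            num_heads=n_head, n_layers=n_layer)
ckpt = torch.load(os.path.join("./nano_gpt_ckpts", "ckpt_5k_iters.pt"), map_location="cpu")
model.load_state_dict(ckpt["model"])
model.eval()  # turn off dropout for sampling

# the character tokenizer is rebuilt from the training text (input.txt)
char_tokenizer = load_int_char_tokenizer(load_text())

# start from a single zero token and sample 500 new characters
context = torch.zeros((1, 1), dtype=torch.long)
with torch.no_grad():
    out = model.generate(context, max_new_tokens=500)
print(char_tokenizer.decode(out[0].tolist()))
```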
__pycache__/attention.cpython-310.pyc
ADDED
Binary file (3.37 kB)

__pycache__/bigram_model.cpython-310.pyc
ADDED
Binary file (3.53 kB)

__pycache__/conf.cpython-310.pyc
ADDED
Binary file (220 Bytes)

__pycache__/data_utils.cpython-310.pyc
ADDED
Binary file (2.3 kB)

__pycache__/tokenizer_utils.cpython-310.pyc
ADDED
Binary file (1.67 kB)
app.py
ADDED
@@ -0,0 +1,41 @@
import gradio as gr

from bigram_model import BigramLanguageModel
import os
from data_utils import *
import torch

def generate_nanogpt_text():
    model = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer)
    # map_location="cpu" lets the GPU-trained checkpoint load on the CPU-only Space hardware
    ckpt = torch.load(os.path.join("./nano_gpt_ckpts", "ckpt_5k_iters.pt"), map_location="cpu")
    model.load_state_dict(ckpt['model'])
    model.eval()  # disable dropout while generating

    char_tokenizer = load_int_char_tokenizer(load_text())

    context = torch.zeros((1, 1), dtype=torch.long)
    generated_text = char_tokenizer.decode(model.generate(context, max_new_tokens=400)[0].tolist())

    return generated_text


# gr.Interface(fn=generate_nanogpt_text, inputs=gr.Button(value="Generate text!"), outputs='text').launch(share=True)


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Example of text generation with nano-gpt:

        The model checkpoint is in the 'nano_gpt_ckpts' dir. The hyperparameters are the exact same ones shown in the nano-gpt video by Karpathy, and the dataset is only 1 MB, so the generated text can be gibberish.

        Keep in mind the output is limited to 400 tokens so that inference runs in a reasonable time (about 10 s) on CPU (Hugging Face free tier).

        GPU inference can output much longer sequences.
        Click on the "Generate text" button to see the generated text.
        """)
    generate_button = gr.Button("Generate text!")
    output = gr.Textbox(label="Generated text from nano-gpt")
    generate_button.click(fn=generate_nanogpt_text, inputs=None, outputs=output, api_name='nano-gpt text generation sample')

demo.launch(share=True)
attention.py
ADDED
@@ -0,0 +1,68 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttentionHead(nn.Module):
    def __init__(self, head_size, n_embed, block_size, dropout=0.2) -> None:
        super().__init__()
        self.head_size = head_size
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)  # (B, T, C)
        q = self.query(x)  # (B, T, C)
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        v = self.value(x)  # (B, T, C)
        out = wei @ v  # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, n_embed, block_size, dropout=0.2) -> None:
        super().__init__()
        self.heads = nn.ModuleList([SelfAttentionHead(head_size, n_embed, block_size) for _ in range(num_heads)])
        # self.projection = nn.Linear(num_heads * head_size, n_embed)
        self.projection = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.projection(out))
        return out


class FeedForwardNet(nn.Module):
    def __init__(self, n_embed, dropout=0.2) -> None:
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class DecoderBlock(nn.Module):
    def __init__(self, n_embed, num_heads, block_size) -> None:
        super().__init__()
        head_size = n_embed // num_heads
        self.sa_head = MultiHeadAttention(num_heads, head_size, n_embed, block_size)
        self.ffn = FeedForwardNet(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x + self.sa_head(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x
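A quick shape check, not part of the committed files, that exercises these blocks with a random batch (hyperparameter values taken from data_utils.py):

```
import torch
from attention import MultiHeadAttention, DecoderBlock

n_embed, num_heads, block_size = 384, 6, 256
x = torch.randn(2, 16, n_embed)  # (B, T, C) with T <= block_size

mha = MultiHeadAttention(num_heads=num_heads, head_size=n_embed // num_heads,
                         n_embed=n_embed, block_size=block_size)
print(mha(x).shape)    # torch.Size([2, 16, 384])

block = DecoderBlock(n_embed=n_embed, num_heads=num_heads, block_size=block_size)
print(block(x).shape)  # residual connections keep the shape: torch.Size([2, 16, 384])
```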
bigram_model.py
ADDED
@@ -0,0 +1,93 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from data_utils import *

from attention import SelfAttentionHead, MultiHeadAttention, FeedForwardNet, DecoderBlock


class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embed, block_size, num_heads, n_layers) -> None:
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.decoder_blocks = nn.Sequential(*[DecoderBlock(n_embed, num_heads, block_size=block_size) for _ in range(n_layers)])
        self.ln_final = nn.LayerNorm(n_embed)

        ## self.sa_head = SelfAttentionHead(vocab_size, n_embed, block_size)
        # self.sa_heads = MultiHeadAttention(num_heads=4, head_size=n_embed//4, n_embed=n_embed, block_size=block_size)
        # self.ffn = FeedForwardNet(n_embed, dropout=0.2)

        self.lm_head = nn.Linear(n_embed, vocab_size)


    def forward(self, idx, targets=None):

        # idx and targets both are tensors of shape (B, T) -> B = batch_sz, T = seq_len (up to block_size, here 256)
        B, T = idx.shape
        tok_embed = self.token_embedding_table(idx)  # (B, T, C), C = n_embed (embedding dim per token)
        pos_embed = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)
        x_in = tok_embed + pos_embed
        # x_in = self.sa_heads(x_in)
        # x_in = self.ffn(x_in)
        x_in = self.ln_final(self.decoder_blocks(x_in))
        logits = self.lm_head(x_in)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # Cross entropy expects the class ("channel") dimension second, so flatten to (B*T, C)
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T), ignore_index=0)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) shaped array of indices in current context
        for _ in range(max_new_tokens):
            # limit input idx to the last "block size" tokens
            idx_cond = idx[:, -BLOCK_SIZE:]
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax for probs
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to running sequence idx
            idx = torch.cat([idx, idx_next], dim=1)  # (B, T+1)

        return idx

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.position_embedding_table.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)



if __name__ == "__main__":
    from data_utils import *
    xb, yb = get_random_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    m = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer).to(device)
    logits, loss = m(xb, yb)
    print(logits.shape)
    print(loss)
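As a sanity check on the architecture (a sketch, not one of the committed files), building the model with the hyperparameters from data_utils.py should reproduce the parameter count printed in the training notebook (10.788929 M):

```
from data_utils import n_embed, n_head, n_layer, BLOCK_SIZE
from bigram_model import BigramLanguageModel

model = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE,
                            num_heads=n_head, n_layers=n_layer)

# total parameter count; the training notebook reports 10.788929 M for these settings
print(sum(p.numel() for p in model.parameters()) / 1e6, "M parameters")

# non-embedding count: get_num_params subtracts the 256 x 384 position-embedding table
print(model.get_num_params(non_embedding=True))
```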
conf.py
ADDED
@@ -0,0 +1,4 @@
nanogpt_conf = {
    "model_name": "nanogpt",
    "text_file": "input.txt"
}
data_utils.py
ADDED
@@ -0,0 +1,61 @@
from typing import List
import torch

from tokenizer_utils import IntCharTokenizer
from conf import nanogpt_conf

BLOCK_SIZE = 256  # context length
BATCH_SIZE = 128
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iters = 100
n_embed = 384
n_head = 6
n_layer = 6
dropout = 0.2

def load_text() -> str:
    with open(nanogpt_conf["text_file"], "r") as f:
        text = f.read()
    return text

def load_int_char_tokenizer(text: str) -> IntCharTokenizer:
    return IntCharTokenizer(text)

def tokenize_char_to_int(text: str) -> List[int]:
    tokenizer = load_int_char_tokenizer(text)
    return tokenizer.encode(text)

# def decode_int_to_char(tokens: List[int]) -> str:
#     tokenizer = load_int_char_tokenizer(text)
#     return tokenizer.decode(tokens)

def load_text_as_tensor(text: str) -> torch.Tensor:
    data = torch.tensor(tokenize_char_to_int(text), dtype=torch.long)
    return data

def split_train_val(text):
    n = int(0.9 * len(text))
    train_data = text[:n]
    val_data = text[n:]

    return train_data, val_data


def get_random_batch(split):
    train_data, val_data = split_train_val(load_text_as_tensor(load_text()))
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - BLOCK_SIZE, (BATCH_SIZE, ))
    x = torch.stack([data[i: i + BLOCK_SIZE] for i in ix])
    y = torch.stack([data[i + 1: i + BLOCK_SIZE + 1] for i in ix])
    if device == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y
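A short sketch (assuming input.txt from the tiny-shakespeare download is present) showing what get_random_batch returns: the targets are the inputs shifted right by one character:

```
import torch
from data_utils import get_random_batch

xb, yb = get_random_batch('train')
print(xb.shape, yb.shape)  # torch.Size([128, 256]) torch.Size([128, 256])
# y[t] is the character that follows x[t], so the two windows overlap by BLOCK_SIZE - 1
assert torch.equal(xb[:, 1:], yb[:, :-1])
```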
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
component 0,output,flag,username,timestamp
,,,,2023-11-04 01:54:58.521219
input.txt
ADDED
The diff for this file is too large to render.
nano_gpt_ckpts/ckpt_5k_iters.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e76615f4ce234b78d6e6fcfcc2a7033239ed806084809195cd39689da29c85a4
size 139140734
tokenizer_utils.py
ADDED
@@ -0,0 +1,21 @@
import torch


class IntCharTokenizer:
    def __init__(self, text):
        self.chars, self.vocab_size = self._get_uniq_chars(text)
        self.int_to_char = {i: c for i, c in enumerate(self.chars)}
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}

    def _get_uniq_chars(self, text):
        chars = sorted(list(set(text)))
        return chars, len(chars)

    def encode(self, text):
        # enc = lambda s: [self.char_to_int[c] for c in s]
        return [self.char_to_int[c] for c in text]

    def decode(self, tokens):
        # dec = lambda s: ''.join(self.int_to_char[i] for i in s)
        return ''.join(self.int_to_char[i] for i in tokens)
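A quick round trip with the character tokenizer (the input string here is just an illustration, not repo data):

```
from tokenizer_utils import IntCharTokenizer

tok = IntCharTokenizer("hello world")  # vocab is built from the unique characters of the text
print(tok.vocab_size)                  # 8 unique characters
ids = tok.encode("hello")
print(ids)                             # list of ints, one per character
print(tok.decode(ids))                 # 'hello'
```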
train_shakespeare.ipynb
ADDED
@@ -0,0 +1,301 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from contextlib import nullcontext\n",
    "from bigram_model import BigramLanguageModel\n",
    "from tokenizer_utils import IntCharTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler\n",
    "ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]\n",
    "ctx = nullcontext() if device == 'cpu' else torch.amp.autocast(device_type=device, dtype=ptdtype)\n",
    "scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from data_utils import *\n",
    "model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embed, block_size=BLOCK_SIZE,\n",
    "                  bias=False, vocab_size=None, dropout=dropout)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([128, 256, 65])\n",
      "tensor(4.3690, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
     ]
    }
   ],
   "source": [
    "from data_utils import *\n",
    "xb, yb = get_random_batch('train')\n",
    "xb = xb.to(device)\n",
    "yb = yb.to(device)\n",
    "\n",
    "m = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer).to(device)\n",
    "logits, loss = m(xb, yb)\n",
    "print(logits.shape)\n",
    "print(loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def estimate_loss(model):\n",
    "    out = {}\n",
    "    model.eval()\n",
    "    for split in ['train', 'val']:\n",
    "        losses = torch.zeros(eval_iters)\n",
    "        for k in range(eval_iters):\n",
    "            X, Y = get_random_batch(split)\n",
    "            with ctx:\n",
    "                logits, loss = model(X, Y)\n",
    "            losses[k] = loss.item()\n",
    "        out[split] = losses.mean()\n",
    "    model.train()\n",
    "    return out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "char_tokenizer = load_int_char_tokenizer(load_text())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10.788929 M parameters\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0: train loss 4.3685, val loss 4.3640\n",
      "step 500: train loss 1.9681, val loss 2.0837\n",
      "step 1000: train loss 1.5377, val loss 1.7404\n",
      "step 1500: train loss 1.3802, val loss 1.6101\n",
      "step 2000: train loss 1.2855, val loss 1.5551\n",
      "step 2500: train loss 1.2162, val loss 1.5157\n",
      "step 3000: train loss 1.1617, val loss 1.5088\n",
      "step 3500: train loss 1.1061, val loss 1.5088\n",
      "step 4000: train loss 1.0555, val loss 1.5150\n",
      "step 4500: train loss 1.0086, val loss 1.5385\n",
      "step 4999: train loss 0.9583, val loss 1.5524\n"
     ]
    }
   ],
   "source": [
    "print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')\n",
    "\n",
    "# create a PyTorch optimizer\n",
    "optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)\n",
    "\n",
    "for iter in range(max_iters):\n",
    "\n",
    "    # every once in a while evaluate the loss on train and val sets\n",
    "    if iter % eval_interval == 0 or iter == max_iters - 1:\n",
    "        losses = estimate_loss(m)\n",
    "        print(f\"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}\")\n",
    "\n",
    "    # sample a batch of data\n",
    "    xb, yb = get_random_batch('train')\n",
    "\n",
    "    # evaluate the loss\n",
    "    logits, loss = m(xb, yb)\n",
    "    optimizer.zero_grad(set_to_none=True)\n",
    "    loss.backward()\n",
    "    optimizer.step()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "saving checkpoint to ./nano_gpt_ckpts\n"
     ]
    }
   ],
   "source": [
    "checkpoint = {\n",
    "    'model': m.state_dict(),\n",
    "    'optimizer': optimizer.state_dict(),\n",
    "    'model_args': model_args,\n",
    "    'iter_num': max_iters,\n",
    "    'best_val_loss': losses['val'],\n",
    "\n",
    "}\n",
    "out_dir = \"./nano_gpt_ckpts\"\n",
    "print(f\"saving checkpoint to {out_dir}\")\n",
    "torch.save(checkpoint, os.path.join(out_dir, 'ckpt_5k_iters.pt'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#m2 = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer).to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "GLOUCESTER: learn, like a nap. Prisoner will to my intents! with my brother! and this bloody makes off flows,--and haste tear'd your roe!--I should not be the other's.---I'ld do hear that be pupy with thear; sweet Montague,--thou as done not--So that they have nage must know,--never speak so many tears,--traightful ner-light,--with'd yet a ping tymp,--which time to stir; now still hurr'd,---water'd honour,--Pray's Coitlinius: the mountake's nobled daughter.' Sir, it is some thee on Rome is sin:--'proud him 'there;' none honest seen; forsweet must be pointed, hurls thee in men; a proud confines, foot, die, gin night, old Ratchard!--Go, good lord!--will'd you not piece, I dare not.' an't; swear by the dog, belike! mother!--How sir!-Spite! Jupiteous put o's!--God leave your lawful coward!'--for I'll dry down, you in death;'--near'---for very 'ven a day.---fa, by; 'twas his mother's disposed;--'I shall make no son,--hard him hear me,--do. Madam, or smother'd wife: and that you may part this denies.'--'--thrieks for Richmond dancerts, in free people's anointed,--O, hold: Curs, on a fiathful doom: every nurse, is I long now, never large.' quoth let return him; for an't plead the fie, his maids; he will not quarrel; 'twas this, but take within, as he learn, as and heat, it see; a gized evassages of season, imagish: yet, a very no other consulance, good den.--To fair cousin, stay! come, sir; and hath been, let it breather ring.' God; I am, trusper, I say: provided, pardone! a never lady; come in God. I'll fight with Montagues come. Why, 'twas bring you to be, if the pass off, and here, it dare, man cryield. Frow, your head A called with Gaunt; the cause. O, prettiest his pale thing, rust, and good. Thou adventure be more, Juliet, perishease: I'll take the queen, and his love.--give me note to de,--dyes help, Edward, and after Romeo!--Whence labour cann'd Warwick! was? whither? why hours! fairs! after was? stay come! your run? a happy kind!--O day, go be--hours, wrong!--ta w\n"
     ]
    }
   ],
   "source": [
    "# generate from the model\n",
    "context = torch.zeros((1, 1), dtype=torch.long, device=device)\n",
    "#print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))\n",
    "print(char_tokenizer.decode(m.generate(context, max_new_tokens=2000)[0].tolist()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<All keys matched successfully>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m3 = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer)\n",
    "ckpt = torch.load(os.path.join(\"./nano_gpt_ckpts\", \"ckpt_5k_iters.pt\"))\n",
    "m3.load_state_dict(ckpt['model'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "But Dohor, aged by! At Antigonus. You see his court! For death; a talm every hand, here shall!--So,--O, I, title now point!--Who, this I sem blind--that tark;--come boy?---O pray, peace! May, two here, do not---that I troth:----to villain leave, where was the Gallent--if I look the house,--bold Jour---whether may I go,--Mine son,---as I amiled me pized,--or so fled; 'tis a famouse,--there littenants,--If an either lawful hant ther is gone.' Sicilence, if it wer done! I have twize its sourness. P\n"
     ]
    }
   ],
   "source": [
    "context = torch.zeros((1, 1), dtype=torch.long)\n",
    "print(char_tokenizer.decode(m3.generate(context, max_new_tokens=500)[0].tolist()))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}