raghunc0 committed on
Commit
a2dad08
1 Parent(s): 7c09b8b

Upload folder using huggingface_hub

README.md CHANGED
@@ -1,12 +1,21 @@
  ---
- title: Raghunc0nano Gpt Shakespeare Demo
- emoji: 🏃
- colorFrom: purple
- colorTo: gray
- sdk: gradio
- sdk_version: 4.1.0
+ title: raghunc0nano-gpt-shakespeare-demo
  app_file: app.py
- pinned: false
+ sdk: gradio
+ sdk_version: 3.39.0
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ This is an example of a nano-gpt trained on the 1 MB tiny-Shakespeare text. The model follows Karpathy's nano-gpt video exactly and was trained for 5000 iterations. The training and text-generation code are in the [train_shakespeare.ipynb](./train_shakespeare.ipynb) file. To generate text from the model at inference time, run the following lines:
+
+ ```
+ context = torch.zeros((1, 1), dtype=torch.long)
+ print(char_tokenizer.decode(m3.generate(context, max_new_tokens=500)[0].tolist()))
+ ```
+
+ Here we start with a "context" tensor of zeros (standing in for a "START" token), and "max_new_tokens" is the maximum number of tokens (characters, in this demo) to generate. It is limited to 500 so that inference finishes in a reasonable time (around 10 s) on CPU, which suits a free Hugging Face Gradio demo. GPU inference can handle much larger values of max_new_tokens; tested up to a few thousand.
+
+ The model checkpoint is in the 'nano_gpt_ckpts' dir. The hyperparameters are exactly the ones shown in the video:
+ ```
+ vocab_size=65, n_layer=6, n_head=6, n_embed=384, block_size=256,
+ bias=False, dropout=0.2
+ ```
+
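
For reference, below is a minimal end-to-end sketch of the inference path the README describes, using only the modules added in this commit (run from the repo root; `map_location="cpu"` is an assumption added so the GPU-trained checkpoint also loads on a CPU-only machine):

```python
import os
import torch

from bigram_model import BigramLanguageModel
from data_utils import n_embed, n_head, n_layer, BLOCK_SIZE, load_text, load_int_char_tokenizer

# Rebuild the model with the hyperparameters listed above and load the 5000-iteration checkpoint.
model = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE,
                            num_heads=n_head, n_layers=n_layer)
ckpt = torch.load(os.path.join("nano_gpt_ckpts", "ckpt_5k_iters.pt"), map_location="cpu")
model.load_state_dict(ckpt["model"])
model.eval()

char_tokenizer = load_int_char_tokenizer(load_text())

# A (1, 1) tensor of zeros stands in for the "START" token; 500 tokens keeps CPU inference around 10 s.
context = torch.zeros((1, 1), dtype=torch.long)
print(char_tokenizer.decode(model.generate(context, max_new_tokens=500)[0].tolist()))
```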
 
 
__pycache__/attention.cpython-310.pyc ADDED
Binary file (3.37 kB)

__pycache__/bigram_model.cpython-310.pyc ADDED
Binary file (3.53 kB)

__pycache__/conf.cpython-310.pyc ADDED
Binary file (220 Bytes)

__pycache__/data_utils.cpython-310.pyc ADDED
Binary file (2.3 kB)

__pycache__/tokenizer_utils.cpython-310.pyc ADDED
Binary file (1.67 kB)
 
app.py ADDED
@@ -0,0 +1,41 @@
+ import gradio as gr
+
+ from bigram_model import BigramLanguageModel
+ import os
+ from data_utils import *
+ import torch
+
+ def generate_nanogpt_text():
+     model = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer)
+     ckpt = torch.load(os.path.join("./nano_gpt_ckpts", "ckpt_5k_iters.pt"), map_location="cpu")  # map_location so the GPU-saved checkpoint loads on the CPU-only Space
+     model.load_state_dict(ckpt['model'])
+
+     char_tokenizer = load_int_char_tokenizer(load_text())
+
+     context = torch.zeros((1, 1), dtype=torch.long)
+     generated_text = char_tokenizer.decode(model.generate(context, max_new_tokens=400)[0].tolist())
+
+     return generated_text
+
+
+ # gr.Interface(fn=generate_nanogpt_text, inputs=gr.Button(value="Generate text!"), outputs='text').launch(share=True)
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # Example of text generation with nano-gpt
+
+
+         The model checkpoint is in the 'nano_gpt_ckpts' dir. The hyperparameters are exactly the ones shown in the nano-gpt video by Karpathy, and the dataset is only 1 MB, so the generated text can be gibberish.
+
+         Keep in mind the output is limited to 400 tokens so that inference runs in reasonable time (about 10 s) on CPU (Hugging Face free tier).
+
+         GPU inference can output much longer sequences.
+         Click the "Generate text!" button to see the generated text.
+         """)
+     generate_button = gr.Button("Generate text!")
+     output = gr.Textbox(label="Generated text from nano-gpt")
+     generate_button.click(fn=generate_nanogpt_text, inputs=None, outputs=output, api_name='nano-gpt text generation sample')
+
+ demo.launch(share=True)
attention.py ADDED
@@ -0,0 +1,68 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class SelfAttentionHead(nn.Module):
+     def __init__(self, head_size, n_embed, block_size, dropout=0.2) -> None:
+         super().__init__()
+         self.head_size = head_size
+         self.key = nn.Linear(n_embed, head_size, bias=False)
+         self.query = nn.Linear(n_embed, head_size, bias=False)
+         self.value = nn.Linear(n_embed, head_size, bias=False)
+         self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
+
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         B, T, C = x.shape
+         k = self.key(x)    # (B, T, head_size)
+         q = self.query(x)  # (B, T, head_size)
+         wei = q @ k.transpose(-2, -1) * (C ** -0.5)  # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
+         wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # causal mask: no attention to future positions
+         wei = F.softmax(wei, dim=-1)  # (B, T, T)
+         wei = self.dropout(wei)
+         v = self.value(x)  # (B, T, head_size)
+         out = wei @ v      # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
+         return out
+
+
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, num_heads, head_size, n_embed, block_size, dropout=0.2) -> None:
+         super().__init__()
+         self.heads = nn.ModuleList([SelfAttentionHead(head_size, n_embed, block_size) for _ in range(num_heads)])
+         # self.projection = nn.Linear(num_heads * head_size, n_embed)
+         self.projection = nn.Linear(n_embed, n_embed)  # num_heads * head_size == n_embed here
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         out = torch.cat([h(x) for h in self.heads], dim=-1)  # concatenate per-head outputs along the channel dim
+         out = self.dropout(self.projection(out))
+         return out
+
+
+ class FeedForwardNet(nn.Module):
+     def __init__(self, n_embed, dropout=0.2) -> None:
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(n_embed, 4 * n_embed),
+             nn.ReLU(),
+             nn.Linear(4 * n_embed, n_embed),
+             nn.Dropout(dropout)
+         )
+
+     def forward(self, x):
+         return self.net(x)
+
+ class DecoderBlock(nn.Module):
+     def __init__(self, n_embed, num_heads, block_size) -> None:
+         super().__init__()
+         head_size = n_embed // num_heads
+         self.sa_head = MultiHeadAttention(num_heads, head_size, n_embed, block_size)
+         self.ffn = FeedForwardNet(n_embed)
+         self.ln1 = nn.LayerNorm(n_embed)
+         self.ln2 = nn.LayerNorm(n_embed)
+
+     def forward(self, x):
+         # pre-norm residual connections around attention and the feed-forward net
+         x = x + self.sa_head(self.ln1(x))
+         x = x + self.ffn(self.ln2(x))
+         return x
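
As a quick shape sanity check for these modules (not part of the commit; the values mirror the hyperparameters in data_utils.py below), a decoder block maps a (batch, time, n_embed) tensor to a tensor of the same shape:

```python
import torch
from attention import DecoderBlock

# n_embed=384, n_head=6, block_size=256, matching the repo's configuration
block = DecoderBlock(n_embed=384, num_heads=6, block_size=256)

x = torch.randn(2, 256, 384)   # (batch, time, channels); time must be <= block_size
y = block(x)                   # pre-norm residual attention + feed-forward
print(y.shape)                 # torch.Size([2, 256, 384])
```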
bigram_model.py ADDED
@@ -0,0 +1,93 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from data_utils import *
+
+ from attention import SelfAttentionHead, MultiHeadAttention, FeedForwardNet, DecoderBlock
+
+
+ class BigramLanguageModel(nn.Module):
+     def __init__(self, vocab_size, n_embed, block_size, num_heads, n_layers) -> None:
+         super().__init__()
+         self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
+         self.position_embedding_table = nn.Embedding(block_size, n_embed)
+         self.decoder_blocks = nn.Sequential(*[DecoderBlock(n_embed, num_heads, block_size=block_size) for _ in range(n_layers)])
+         self.ln_final = nn.LayerNorm(n_embed)
+
+         ## self.sa_head = SelfAttentionHead(vocab_size, n_embed, block_size)
+         # self.sa_heads = MultiHeadAttention(num_heads=4, head_size=n_embed//4, n_embed=n_embed, block_size=block_size)
+         # self.ffn = FeedForwardNet(n_embed, dropout=0.2)
+
+         self.lm_head = nn.Linear(n_embed, vocab_size)
+
+
+     def forward(self, idx, targets=None):
+
+         # idx and targets are both tensors of shape (B, T) -> B = batch size, T = sequence length (up to block_size)
+         B, T = idx.shape
+         tok_embed = self.token_embedding_table(idx)  # (B, T, n_embed)
+         pos_embed = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, n_embed)
+         x_in = tok_embed + pos_embed
+         # x_in = self.sa_heads(x_in)
+         # x_in = self.ffn(x_in)
+         x_in = self.ln_final(self.decoder_blocks(x_in))
+         logits = self.lm_head(x_in)  # (B, T, vocab_size)
+
+         if targets is None:
+             loss = None
+         else:
+             B, T, C = logits.shape
+             # cross_entropy expects the class ("channel") dimension second, hence the reshape to (B*T, C)
+             # ignore_index=0 masks the token with id 0 (the newline character) out of the loss
+             loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T), ignore_index=0)
+
+         return logits, loss
+
+     def generate(self, idx, max_new_tokens):
+         # idx is a (B, T) tensor of indices in the current context
+         for _ in range(max_new_tokens):
+             # limit the input to the last block_size tokens
+             idx_cond = idx[:, -BLOCK_SIZE:]
+             logits, loss = self(idx_cond)
+             # focus only on the last time step
+             logits = logits[:, -1, :]  # becomes (B, C)
+             # apply softmax to get probabilities
+             probs = F.softmax(logits, dim=-1)  # (B, C)
+             # sample from the distribution
+             idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
+             # append the sampled index to the running sequence
+             idx = torch.cat([idx, idx_next], dim=1)  # (B, T+1)
+
+         return idx
+
+     def get_num_params(self, non_embedding=True):
+         """
+         Return the number of parameters in the model.
+         For the non-embedding count (default), the position embedding parameters are subtracted;
+         the token embedding parameters are kept, since they feed the lm_head projection.
+         """
+         n_params = sum(p.numel() for p in self.parameters())
+         if non_embedding:
+             n_params -= self.position_embedding_table.weight.numel()
+         return n_params
+
+     def _init_weights(self, module):
+         # note: not currently wired up via self.apply(); PyTorch's default init is used
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+
+
+ if __name__ == "__main__":
+     from data_utils import *
+     xb, yb = get_random_batch('train')
+     xb = xb.to(device)
+     yb = yb.to(device)
+
+     m = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer).to(device)
+     logits, loss = m(xb, yb)
+     print(logits.shape)
+     print(loss)
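
A small sketch of what an untrained instance of this model looks like on CPU (hypothetical check, not part of the commit; the parameter count matches the notebook's 10.79 M figure once the positional embeddings are added back):

```python
import torch
from bigram_model import BigramLanguageModel
from data_utils import n_embed, n_head, n_layer, BLOCK_SIZE

model = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE,
                            num_heads=n_head, n_layers=n_layer)
print(model.get_num_params() / 1e6, "M parameters")   # ~10.69 M with positional embeddings excluded

# Forward pass on random character ids: logits over the 65-character vocabulary plus a loss.
idx = torch.randint(0, 65, (4, BLOCK_SIZE))
targets = torch.randint(0, 65, (4, BLOCK_SIZE))
logits, loss = model(idx, targets)
print(logits.shape, loss.item())   # torch.Size([4, 256, 65]), roughly ln(65) ~ 4.2 before training
```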
conf.py ADDED
@@ -0,0 +1,4 @@
+ nanogpt_conf = {
+     "model_name": "nanogpt",
+     "text_file": "input.txt"
+ }
data_utils.py ADDED
@@ -0,0 +1,61 @@
+ from typing import List
+ import torch
+
+ from tokenizer_utils import IntCharTokenizer
+ from conf import nanogpt_conf
+
+ BLOCK_SIZE = 256  # context length
+ BATCH_SIZE = 128
+ max_iters = 5000
+ eval_interval = 500
+ learning_rate = 3e-4
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ eval_iters = 100
+ n_embed = 384
+ n_head = 6
+ n_layer = 6
+ dropout = 0.2
+
+ def load_text() -> str:
+     with open(nanogpt_conf["text_file"], "r") as f:
+         text = f.read()
+     return text
+
+ def load_int_char_tokenizer(text: str) -> IntCharTokenizer:
+     return IntCharTokenizer(text)
+
+ def tokenize_char_to_int(text: str) -> List[int]:
+     tokenizer = load_int_char_tokenizer(text)
+     return tokenizer.encode(text)
+
+ # def decode_int_to_char(tokens: List[int]) -> str:
+ #     tokenizer = load_int_char_tokenizer(text)
+ #     return tokenizer.decode(tokens)
+
+ def load_text_as_tensor(text: str) -> torch.Tensor:
+     data = torch.tensor(tokenize_char_to_int(text), dtype=torch.long)
+     return data
+
+ def split_train_val(text):
+     n = int(0.9 * len(text))
+     train_data = text[:n]
+     val_data = text[n:]
+
+     return train_data, val_data
+
+
+ def get_random_batch(split):
+     train_data, val_data = split_train_val(load_text_as_tensor(load_text()))
+     data = train_data if split == 'train' else val_data
+     ix = torch.randint(len(data) - BLOCK_SIZE, (BATCH_SIZE, ))
+     x = torch.stack([data[i: i + BLOCK_SIZE] for i in ix])
+     y = torch.stack([data[i + 1: i + BLOCK_SIZE + 1] for i in ix])
+     if device == 'cuda':
+         # pin arrays x, y, which allows moving them to the GPU asynchronously (non_blocking=True)
+         x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
+     else:
+         x, y = x.to(device), y.to(device)
+     return x, y
+
+
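
A short sketch of what get_random_batch returns (assumes input.txt from this commit is in the working directory):

```python
from data_utils import get_random_batch

xb, yb = get_random_batch('train')
print(xb.shape, yb.shape)                      # torch.Size([128, 256]) each: (BATCH_SIZE, BLOCK_SIZE)
# yb is the same window shifted one character ahead, so yb[:, t] is the next character after xb[:, t]
print(bool((xb[0, 1:] == yb[0, :-1]).all()))   # True
```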
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
+ component 0,output,flag,username,timestamp
+ ,,,,2023-11-04 01:54:58.521219
input.txt ADDED
The diff for this file is too large to render. See raw diff
 
nano_gpt_ckpts/ckpt_5k_iters.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e76615f4ce234b78d6e6fcfcc2a7033239ed806084809195cd39689da29c85a4
+ size 139140734
tokenizer_utils.py ADDED
@@ -0,0 +1,21 @@
+ import torch
+
+
+ class IntCharTokenizer:
+     def __init__(self, text):
+         self.chars, self.vocab_size = self._get_uniq_chars(text)
+         self.int_to_char = {i: c for i, c in enumerate(self.chars)}
+         self.char_to_int = {c: i for i, c in enumerate(self.chars)}
+
+     def _get_uniq_chars(self, text):
+         chars = sorted(list(set(text)))
+         return chars, len(chars)
+
+     def encode(self, text):
+         # enc = lambda s: [self.char_to_int[c] for c in s]
+         return [self.char_to_int[c] for c in text]
+
+     def decode(self, tokens):
+         # dec = lambda s: ''.join(self.int_to_char[i] for i in s)
+         return ''.join(self.int_to_char[i] for i in tokens)
+
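
A short usage sketch of the character-level tokenizer (the vocabulary is just the sorted set of characters in the text it is built from):

```python
from tokenizer_utils import IntCharTokenizer

tok = IntCharTokenizer("hello world")
print(tok.vocab_size)        # 8 unique characters: ' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w'
ids = tok.encode("hello")
print(ids)                   # [3, 2, 4, 4, 5] for this tiny vocabulary
print(tok.decode(ids))       # 'hello'
```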
train_shakespeare.ipynb ADDED
@@ -0,0 +1,301 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import torch\n",
+     "from contextlib import nullcontext\n",
+     "from bigram_model import BigramLanguageModel\n",
+     "from tokenizer_utils import IntCharTokenizer"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import os"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+     "dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler\n",
+     "ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]\n",
+     "ctx = nullcontext() if device == 'cpu' else torch.amp.autocast(device_type=device, dtype=ptdtype)\n",
+     "scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from data_utils import *\n",
+     "model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embed, block_size=BLOCK_SIZE,\n",
+     "                  bias=False, vocab_size=None, dropout=dropout)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 6,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "torch.Size([128, 256, 65])\n",
+       "tensor(4.3690, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
+      ]
+     }
+    ],
+    "source": [
+     "from data_utils import *\n",
+     "xb, yb = get_random_batch('train')\n",
+     "xb = xb.to(device)\n",
+     "yb = yb.to(device)\n",
+     "\n",
+     "m = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer).to(device)\n",
+     "logits, loss = m(xb, yb)\n",
+     "print(logits.shape)\n",
+     "print(loss)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "\n",
+     "def estimate_loss(model):\n",
+     "    out = {}\n",
+     "    model.eval()\n",
+     "    for split in ['train', 'val']:\n",
+     "        losses = torch.zeros(eval_iters)\n",
+     "        for k in range(eval_iters):\n",
+     "            X, Y = get_random_batch(split)\n",
+     "            with ctx:\n",
+     "                logits, loss = model(X, Y)\n",
+     "            losses[k] = loss.item()\n",
+     "        out[split] = losses.mean()\n",
+     "    model.train()\n",
+     "    return out"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 8,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "char_tokenizer = load_int_char_tokenizer(load_text())"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 9,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "10.788929 M parameters\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "step 0: train loss 4.3685, val loss 4.3640\n",
+       "step 500: train loss 1.9681, val loss 2.0837\n",
+       "step 1000: train loss 1.5377, val loss 1.7404\n",
+       "step 1500: train loss 1.3802, val loss 1.6101\n",
+       "step 2000: train loss 1.2855, val loss 1.5551\n",
+       "step 2500: train loss 1.2162, val loss 1.5157\n",
+       "step 3000: train loss 1.1617, val loss 1.5088\n",
+       "step 3500: train loss 1.1061, val loss 1.5088\n",
+       "step 4000: train loss 1.0555, val loss 1.5150\n",
+       "step 4500: train loss 1.0086, val loss 1.5385\n",
+       "step 4999: train loss 0.9583, val loss 1.5524\n"
+      ]
+     }
+    ],
+    "source": [
+     "print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')\n",
+     "\n",
+     "# create a PyTorch optimizer\n",
+     "optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)\n",
+     "\n",
+     "for iter in range(max_iters):\n",
+     "\n",
+     "    # every once in a while evaluate the loss on train and val sets\n",
+     "    if iter % eval_interval == 0 or iter == max_iters - 1:\n",
+     "        losses = estimate_loss(m)\n",
+     "        print(f\"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}\")\n",
+     "\n",
+     "    # sample a batch of data\n",
+     "    xb, yb = get_random_batch('train')\n",
+     "\n",
+     "    # evaluate the loss\n",
+     "    logits, loss = m(xb, yb)\n",
+     "    optimizer.zero_grad(set_to_none=True)\n",
+     "    loss.backward()\n",
+     "    optimizer.step()\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 10,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "saving checkpoint to ./nano_gpt_ckpts\n"
+      ]
+     }
+    ],
+    "source": [
+     "checkpoint = {\n",
+     "    'model': m.state_dict(),\n",
+     "    'optimizer': optimizer.state_dict(),\n",
+     "    'model_args': model_args,\n",
+     "    'iter_num': max_iters,\n",
+     "    'best_val_loss': losses['val'],\n",
+     "\n",
+     "}\n",
+     "out_dir = \"./nano_gpt_ckpts\"\n",
+     "print(f\"saving checkpoint to {out_dir}\")\n",
+     "torch.save(checkpoint, os.path.join(out_dir, 'ckpt_5k_iters.pt'))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 11,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "#m2 = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer).to(device)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 12,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "\n",
+       "GLOUCESTER: learn, like a nap. Prisoner will to my intents! with my brother! and this bloody makes off flows,--and haste tear'd your roe!--I should not be the other's.---I'ld do hear that be pupy with thear; sweet Montague,--thou as done not--So that they have nage must know,--never speak so many tears,--traightful ner-light,--with'd yet a ping tymp,--which time to stir; now still hurr'd,---water'd honour,--Pray's Coitlinius: the mountake's nobled daughter.' Sir, it is some thee on Rome is sin:--'proud him 'there;' none honest seen; forsweet must be pointed, hurls thee in men; a proud confines, foot, die, gin night, old Ratchard!--Go, good lord!--will'd you not piece, I dare not.' an't; swear by the dog, belike! mother!--How sir!-Spite! Jupiteous put o's!--God leave your lawful coward!'--for I'll dry down, you in death;'--near'---for very 'ven a day.---fa, by; 'twas his mother's disposed;--'I shall make no son,--hard him hear me,--do. Madam, or smother'd wife: and that you may part this denies.'--'--thrieks for Richmond dancerts, in free people's anointed,--O, hold: Curs, on a fiathful doom: every nurse, is I long now, never large.' quoth let return him; for an't plead the fie, his maids; he will not quarrel; 'twas this, but take within, as he learn, as and heat, it see; a gized evassages of season, imagish: yet, a very no other consulance, good den.--To fair cousin, stay! come, sir; and hath been, let it breather ring.' God; I am, trusper, I say: provided, pardone! a never lady; come in God. I'll fight with Montagues come. Why, 'twas bring you to be, if the pass off, and here, it dare, man cryield. Frow, your head A called with Gaunt; the cause. O, prettiest his pale thing, rust, and good. Thou adventure be more, Juliet, perishease: I'll take the queen, and his love.--give me note to de,--dyes help, Edward, and after Romeo!--Whence labour cann'd Warwick! was? whither? why hours! fairs! after was? stay come! your run? a happy kind!--O day, go be--hours, wrong!--ta w\n"
+      ]
+     }
+    ],
+    "source": [
+     "# generate from the model\n",
+     "context = torch.zeros((1, 1), dtype=torch.long, device=device)\n",
+     "#print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))\n",
+     "print(char_tokenizer.decode(m.generate(context, max_new_tokens=2000)[0].tolist()))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 20,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "<All keys matched successfully>"
+       ]
+      },
+      "execution_count": 20,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "m3 = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer)\n",
+     "ckpt = torch.load(os.path.join(\"./nano_gpt_ckpts\", \"ckpt_5k_iters.pt\"))\n",
+     "m3.load_state_dict(ckpt['model'])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 21,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "\n",
+       "But Dohor, aged by! At Antigonus. You see his court! For death; a talm every hand, here shall!--So,--O, I, title now point!--Who, this I sem blind--that tark;--come boy?---O pray, peace! May, two here, do not---that I troth:----to villain leave, where was the Gallent--if I look the house,--bold Jour---whether may I go,--Mine son,---as I amiled me pized,--or so fled; 'tis a famouse,--there littenants,--If an either lawful hant ther is gone.' Sicilence, if it wer done! I have twize its sourness. P\n"
+      ]
+     }
+    ],
+    "source": [
+     "context = torch.zeros((1, 1), dtype=torch.long)\n",
+     "print(char_tokenizer.decode(m3.generate(context, max_new_tokens=500)[0].tolist()))\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.12"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }