Upload folder using huggingface_hub
- README.md +17 -8
- __pycache__/attention.cpython-310.pyc +0 -0
- __pycache__/bigram_model.cpython-310.pyc +0 -0
- __pycache__/conf.cpython-310.pyc +0 -0
- __pycache__/data_utils.cpython-310.pyc +0 -0
- __pycache__/tokenizer_utils.cpython-310.pyc +0 -0
- app.py +41 -0
- attention.py +68 -0
- bigram_model.py +93 -0
- conf.py +4 -0
- data_utils.py +61 -0
- flagged/log.csv +2 -0
- input.txt +0 -0
- nano_gpt_ckpts/ckpt_5k_iters.pt +3 -0
- tokenizer_utils.py +21 -0
- train_shakespeare.ipynb +301 -0
README.md
CHANGED
@@ -1,12 +1,21 @@
 ---
-title:
-emoji: 🏃
-colorFrom: purple
-colorTo: gray
-sdk: gradio
-sdk_version: 4.1.0
+title: raghunc0nano-gpt-shakespeare-demo
 app_file: app.py
-
+sdk: gradio
+sdk_version: 3.39.0
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+This is an example of a nano-gpt trained on the tiny-shakespeare text (about 1 MB). The model follows Karpathy's nano-gpt video exactly and was trained for 5000 iterations. The training and text-generation code are in the [train_shakespeare.ipynb](./train_shakespeare.ipynb) file. To generate text from the model at inference time, run the following lines:
+
+```
+context = torch.zeros((1, 1), dtype=torch.long)
+print(char_tokenizer.decode(m3.generate(context, max_new_tokens=500)[0].tolist()))
+```
+
+Here we start with a new "context" of a single zero token (standing in for a "START" token), and "max_new_tokens" is the maximum number of tokens (characters, in this demo) that will be generated. It is limited to 500 so that inference finishes on CPU in a reasonable time (around 10 s), which suits a free Hugging Face Gradio demo. Inference on GPU supports much larger values of max_new_tokens; tested up to a few thousand.
+
+The model checkpoint is in the 'nano_gpt_ckpts' dir. The hyperparameters are exactly the ones shown in the video:
+```
+vocab_size=65, n_layer=6, n_head=6, n_embed=384, block_size=256,
+bias=False, dropout=0.2
+```
+
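For reference, the snippet above can be run end to end with the helpers that ship in this repo. Below is a minimal CPU inference sketch (it assumes the repo files and the checkpoint under ./nano_gpt_ckpts are on the path; map_location="cpu" is added here so the GPU-trained checkpoint also loads on CPU-only hardware):

```
import os
import torch

from bigram_model import BigramLanguageModel
from data_utils import load_text, load_int_char_tokenizer, n_embed, n_head, n_layer, BLOCK_SIZE

# rebuild the model with the training hyperparameters, then load the 5k-iteration checkpoint
model = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE,
                            num_heads=n_head, n_layers=n_layer)
ckpt = torch.load(os.path.join("./nano_gpt_ckpts", "ckpt_5k_iters.pt"), map_location="cpu")
model.load_state_dict(ckpt["model"])
model.eval()  # turn off dropout for sampling

# the character tokenizer is rebuilt from the training text (input.txt)
char_tokenizer = load_int_char_tokenizer(load_text())

# start from a single zero token and sample 500 new characters
context = torch.zeros((1, 1), dtype=torch.long)
with torch.no_grad():
    out = model.generate(context, max_new_tokens=500)
print(char_tokenizer.decode(out[0].tolist()))
```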
__pycache__/attention.cpython-310.pyc
ADDED
Binary file (3.37 kB)

__pycache__/bigram_model.cpython-310.pyc
ADDED
Binary file (3.53 kB)

__pycache__/conf.cpython-310.pyc
ADDED
Binary file (220 Bytes)

__pycache__/data_utils.cpython-310.pyc
ADDED
Binary file (2.3 kB)

__pycache__/tokenizer_utils.cpython-310.pyc
ADDED
Binary file (1.67 kB)
app.py
ADDED
@@ -0,0 +1,41 @@
import gradio as gr

from bigram_model import BigramLanguageModel
import os
from data_utils import *
import torch

def generate_nanogpt_text():
    model = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer)
    # map_location="cpu" lets the GPU-trained checkpoint load on the CPU-only Space hardware
    ckpt = torch.load(os.path.join("./nano_gpt_ckpts", "ckpt_5k_iters.pt"), map_location="cpu")
    model.load_state_dict(ckpt['model'])
    model.eval()  # disable dropout while generating

    char_tokenizer = load_int_char_tokenizer(load_text())

    context = torch.zeros((1, 1), dtype=torch.long)
    generated_text = char_tokenizer.decode(model.generate(context, max_new_tokens=400)[0].tolist())

    return generated_text


# gr.Interface(fn=generate_nanogpt_text, inputs=gr.Button(value="Generate text!"), outputs='text').launch(share=True)


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Example of text generation with nano-gpt:

        The model checkpoint is in the 'nano_gpt_ckpts' dir. The hyperparameters are the exact same ones shown in the nano-gpt video by Karpathy, and the dataset is only 1 MB, so the generated text can be gibberish.

        Keep in mind the output is limited to 400 tokens so that inference runs in a reasonable time (about 10 s) on CPU (Hugging Face free tier).

        GPU inference can output much longer sequences.
        Click on the "Generate text" button to see the generated text.
        """)
    generate_button = gr.Button("Generate text!")
    output = gr.Textbox(label="Generated text from nano-gpt")
    generate_button.click(fn=generate_nanogpt_text, inputs=None, outputs=output, api_name='nano-gpt text generation sample')

demo.launch(share=True)
attention.py
ADDED
@@ -0,0 +1,68 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttentionHead(nn.Module):
    def __init__(self, head_size, n_embed, block_size, dropout=0.2) -> None:
        super().__init__()
        self.head_size = head_size
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)  # (B, T, C)
        q = self.query(x)  # (B, T, C)
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        v = self.value(x)  # (B, T, C)
        out = wei @ v  # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, n_embed, block_size, dropout=0.2) -> None:
        super().__init__()
        self.heads = nn.ModuleList([SelfAttentionHead(head_size, n_embed, block_size) for _ in range(num_heads)])
        # self.projection = nn.Linear(num_heads * head_size, n_embed)
        self.projection = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.projection(out))
        return out


class FeedForwardNet(nn.Module):
    def __init__(self, n_embed, dropout=0.2) -> None:
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class DecoderBlock(nn.Module):
    def __init__(self, n_embed, num_heads, block_size) -> None:
        super().__init__()
        head_size = n_embed // num_heads
        self.sa_head = MultiHeadAttention(num_heads, head_size, n_embed, block_size)
        self.ffn = FeedForwardNet(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x + self.sa_head(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x
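A quick shape check, not part of the committed files, that exercises these blocks with a random batch (hyperparameter values taken from data_utils.py):

```
import torch
from attention import MultiHeadAttention, DecoderBlock

n_embed, num_heads, block_size = 384, 6, 256
x = torch.randn(2, 16, n_embed)  # (B, T, C) with T <= block_size

mha = MultiHeadAttention(num_heads=num_heads, head_size=n_embed // num_heads,
                         n_embed=n_embed, block_size=block_size)
print(mha(x).shape)    # torch.Size([2, 16, 384])

block = DecoderBlock(n_embed=n_embed, num_heads=num_heads, block_size=block_size)
print(block(x).shape)  # residual connections keep the shape: torch.Size([2, 16, 384])
```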
bigram_model.py
ADDED
@@ -0,0 +1,93 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from data_utils import *

from attention import SelfAttentionHead, MultiHeadAttention, FeedForwardNet, DecoderBlock


class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embed, block_size, num_heads, n_layers) -> None:
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.decoder_blocks = nn.Sequential(*[DecoderBlock(n_embed, num_heads, block_size=block_size) for _ in range(n_layers)])
        self.ln_final = nn.LayerNorm(n_embed)

        ## self.sa_head = SelfAttentionHead(vocab_size, n_embed, block_size)
        # self.sa_heads = MultiHeadAttention(num_heads=4, head_size=n_embed//4, n_embed=n_embed, block_size=block_size)
        # self.ffn = FeedForwardNet(n_embed, dropout=0.2)

        self.lm_head = nn.Linear(n_embed, vocab_size)


    def forward(self, idx, targets=None):

        # idx and targets both are tensors of shape (B, T) -> B = batch_sz, T = seq_len (up to block_size, here 256)
        B, T = idx.shape
        tok_embed = self.token_embedding_table(idx)  # (B, T, C), C = n_embed (embedding dim per token)
        pos_embed = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)
        x_in = tok_embed + pos_embed
        # x_in = self.sa_heads(x_in)
        # x_in = self.ffn(x_in)
        x_in = self.ln_final(self.decoder_blocks(x_in))
        logits = self.lm_head(x_in)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # Cross entropy expects the class ("channel") dimension second, so flatten to (B*T, C)
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T), ignore_index=0)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) shaped array of indices in current context
        for _ in range(max_new_tokens):
            # limit input idx to the last "block size" tokens
            idx_cond = idx[:, -BLOCK_SIZE:]
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax for probs
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to running sequence idx
            idx = torch.cat([idx, idx_next], dim=1)  # (B, T+1)

        return idx

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.position_embedding_table.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)



if __name__ == "__main__":
    from data_utils import *
    xb, yb = get_random_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    m = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer).to(device)
    logits, loss = m(xb, yb)
    print(logits.shape)
    print(loss)
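As a sanity check on the architecture (a sketch, not one of the committed files), building the model with the hyperparameters from data_utils.py should reproduce the parameter count printed in the training notebook (10.788929 M):

```
from data_utils import n_embed, n_head, n_layer, BLOCK_SIZE
from bigram_model import BigramLanguageModel

model = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE,
                            num_heads=n_head, n_layers=n_layer)

# total parameter count; the training notebook reports 10.788929 M for these settings
print(sum(p.numel() for p in model.parameters()) / 1e6, "M parameters")

# non-embedding count: get_num_params subtracts the 256 x 384 position-embedding table
print(model.get_num_params(non_embedding=True))
```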
conf.py
ADDED
@@ -0,0 +1,4 @@
nanogpt_conf = {
    "model_name": "nanogpt",
    "text_file": "input.txt"
}
data_utils.py
ADDED
@@ -0,0 +1,61 @@
from typing import List
import torch

from tokenizer_utils import IntCharTokenizer
from conf import nanogpt_conf

BLOCK_SIZE = 256  # context length
BATCH_SIZE = 128
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iters = 100
n_embed = 384
n_head = 6
n_layer = 6
dropout = 0.2

def load_text() -> str:
    with open(nanogpt_conf["text_file"], "r") as f:
        text = f.read()
    return text

def load_int_char_tokenizer(text: str) -> IntCharTokenizer:
    return IntCharTokenizer(text)

def tokenize_char_to_int(text: str) -> List[int]:
    tokenizer = load_int_char_tokenizer(text)
    return tokenizer.encode(text)

# def decode_int_to_char(tokens: List[int]) -> str:
#     tokenizer = load_int_char_tokenizer(text)
#     return tokenizer.decode(tokens)

def load_text_as_tensor(text: str) -> torch.Tensor:
    data = torch.tensor(tokenize_char_to_int(text), dtype=torch.long)
    return data

def split_train_val(text):
    n = int(0.9 * len(text))
    train_data = text[:n]
    val_data = text[n:]

    return train_data, val_data


def get_random_batch(split):
    train_data, val_data = split_train_val(load_text_as_tensor(load_text()))
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - BLOCK_SIZE, (BATCH_SIZE, ))
    x = torch.stack([data[i: i + BLOCK_SIZE] for i in ix])
    y = torch.stack([data[i + 1: i + BLOCK_SIZE + 1] for i in ix])
    if device == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y
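A short sketch (assuming input.txt from the tiny-shakespeare download is present) showing what get_random_batch returns: the targets are the inputs shifted right by one character:

```
import torch
from data_utils import get_random_batch

xb, yb = get_random_batch('train')
print(xb.shape, yb.shape)  # torch.Size([128, 256]) torch.Size([128, 256])
# y[t] is the character that follows x[t], so the two windows overlap by BLOCK_SIZE - 1
assert torch.equal(xb[:, 1:], yb[:, :-1])
```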
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
component 0,output,flag,username,timestamp
,,,,2023-11-04 01:54:58.521219
input.txt
ADDED
The diff for this file is too large to render.
nano_gpt_ckpts/ckpt_5k_iters.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e76615f4ce234b78d6e6fcfcc2a7033239ed806084809195cd39689da29c85a4
size 139140734
tokenizer_utils.py
ADDED
@@ -0,0 +1,21 @@
import torch


class IntCharTokenizer:
    def __init__(self, text):
        self.chars, self.vocab_size = self._get_uniq_chars(text)
        self.int_to_char = {i: c for i, c in enumerate(self.chars)}
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}

    def _get_uniq_chars(self, text):
        chars = sorted(list(set(text)))
        return chars, len(chars)

    def encode(self, text):
        # enc = lambda s: [self.char_to_int[c] for c in s]
        return [self.char_to_int[c] for c in text]

    def decode(self, tokens):
        # dec = lambda s: ''.join(self.int_to_char[i] for i in s)
        return ''.join(self.int_to_char[i] for i in tokens)
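A quick round trip with the character tokenizer (the input string here is just an illustration, not repo data):

```
from tokenizer_utils import IntCharTokenizer

tok = IntCharTokenizer("hello world")  # vocab is built from the unique characters of the text
print(tok.vocab_size)                  # 8 unique characters
ids = tok.encode("hello")
print(ids)                             # list of ints, one per character
print(tok.decode(ids))                 # 'hello'
```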
train_shakespeare.ipynb
ADDED
@@ -0,0 +1,301 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from contextlib import nullcontext\n",
    "from bigram_model import BigramLanguageModel\n",
    "from tokenizer_utils import IntCharTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler\n",
    "ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]\n",
    "ctx = nullcontext() if device == 'cpu' else torch.amp.autocast(device_type=device, dtype=ptdtype)\n",
    "scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from data_utils import *\n",
    "model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embed, block_size=BLOCK_SIZE,\n",
    "                  bias=False, vocab_size=None, dropout=dropout)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([128, 256, 65])\n",
      "tensor(4.3690, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
     ]
    }
   ],
   "source": [
    "from data_utils import *\n",
    "xb, yb = get_random_batch('train')\n",
    "xb = xb.to(device)\n",
    "yb = yb.to(device)\n",
    "\n",
    "m = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer).to(device)\n",
    "logits, loss = m(xb, yb)\n",
    "print(logits.shape)\n",
    "print(loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def estimate_loss(model):\n",
    "    out = {}\n",
    "    model.eval()\n",
    "    for split in ['train', 'val']:\n",
    "        losses = torch.zeros(eval_iters)\n",
    "        for k in range(eval_iters):\n",
    "            X, Y = get_random_batch(split)\n",
    "            with ctx:\n",
    "                logits, loss = model(X, Y)\n",
    "            losses[k] = loss.item()\n",
    "        out[split] = losses.mean()\n",
    "    model.train()\n",
    "    return out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "char_tokenizer = load_int_char_tokenizer(load_text())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10.788929 M parameters\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0: train loss 4.3685, val loss 4.3640\n",
      "step 500: train loss 1.9681, val loss 2.0837\n",
      "step 1000: train loss 1.5377, val loss 1.7404\n",
      "step 1500: train loss 1.3802, val loss 1.6101\n",
      "step 2000: train loss 1.2855, val loss 1.5551\n",
      "step 2500: train loss 1.2162, val loss 1.5157\n",
      "step 3000: train loss 1.1617, val loss 1.5088\n",
      "step 3500: train loss 1.1061, val loss 1.5088\n",
      "step 4000: train loss 1.0555, val loss 1.5150\n",
      "step 4500: train loss 1.0086, val loss 1.5385\n",
      "step 4999: train loss 0.9583, val loss 1.5524\n"
     ]
    }
   ],
   "source": [
    "print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')\n",
    "\n",
    "# create a PyTorch optimizer\n",
    "optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)\n",
    "\n",
    "for iter in range(max_iters):\n",
    "\n",
    "    # every once in a while evaluate the loss on train and val sets\n",
    "    if iter % eval_interval == 0 or iter == max_iters - 1:\n",
    "        losses = estimate_loss(m)\n",
    "        print(f\"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}\")\n",
    "\n",
    "    # sample a batch of data\n",
    "    xb, yb = get_random_batch('train')\n",
    "\n",
    "    # evaluate the loss\n",
    "    logits, loss = m(xb, yb)\n",
    "    optimizer.zero_grad(set_to_none=True)\n",
    "    loss.backward()\n",
    "    optimizer.step()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "saving checkpoint to ./nano_gpt_ckpts\n"
     ]
    }
   ],
   "source": [
    "checkpoint = {\n",
    "    'model': m.state_dict(),\n",
    "    'optimizer': optimizer.state_dict(),\n",
    "    'model_args': model_args,\n",
    "    'iter_num': max_iters,\n",
    "    'best_val_loss': losses['val'],\n",
    "\n",
    "}\n",
    "out_dir = \"./nano_gpt_ckpts\"\n",
    "print(f\"saving checkpoint to {out_dir}\")\n",
    "torch.save(checkpoint, os.path.join(out_dir, 'ckpt_5k_iters.pt'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#m2 = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer).to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "GLOUCESTER: learn, like a nap. Prisoner will to my intents! with my brother! and this bloody makes off flows,--and haste tear'd your roe!--I should not be the other's.---I'ld do hear that be pupy with thear; sweet Montague,--thou as done not--So that they have nage must know,--never speak so many tears,--traightful ner-light,--with'd yet a ping tymp,--which time to stir; now still hurr'd,---water'd honour,--Pray's Coitlinius: the mountake's nobled daughter.' Sir, it is some thee on Rome is sin:--'proud him 'there;' none honest seen; forsweet must be pointed, hurls thee in men; a proud confines, foot, die, gin night, old Ratchard!--Go, good lord!--will'd you not piece, I dare not.' an't; swear by the dog, belike! mother!--How sir!-Spite! Jupiteous put o's!--God leave your lawful coward!'--for I'll dry down, you in death;'--near'---for very 'ven a day.---fa, by; 'twas his mother's disposed;--'I shall make no son,--hard him hear me,--do. Madam, or smother'd wife: and that you may part this denies.'--'--thrieks for Richmond dancerts, in free people's anointed,--O, hold: Curs, on a fiathful doom: every nurse, is I long now, never large.' quoth let return him; for an't plead the fie, his maids; he will not quarrel; 'twas this, but take within, as he learn, as and heat, it see; a gized evassages of season, imagish: yet, a very no other consulance, good den.--To fair cousin, stay! come, sir; and hath been, let it breather ring.' God; I am, trusper, I say: provided, pardone! a never lady; come in God. I'll fight with Montagues come. Why, 'twas bring you to be, if the pass off, and here, it dare, man cryield. Frow, your head A called with Gaunt; the cause. O, prettiest his pale thing, rust, and good. Thou adventure be more, Juliet, perishease: I'll take the queen, and his love.--give me note to de,--dyes help, Edward, and after Romeo!--Whence labour cann'd Warwick! was? whither? why hours! fairs! after was? stay come! your run? a happy kind!--O day, go be--hours, wrong!--ta w\n"
     ]
    }
   ],
   "source": [
    "# generate from the model\n",
    "context = torch.zeros((1, 1), dtype=torch.long, device=device)\n",
    "#print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))\n",
    "print(char_tokenizer.decode(m.generate(context, max_new_tokens=2000)[0].tolist()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<All keys matched successfully>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m3 = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer)\n",
    "ckpt = torch.load(os.path.join(\"./nano_gpt_ckpts\", \"ckpt_5k_iters.pt\"))\n",
    "m3.load_state_dict(ckpt['model'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "But Dohor, aged by! At Antigonus. You see his court! For death; a talm every hand, here shall!--So,--O, I, title now point!--Who, this I sem blind--that tark;--come boy?---O pray, peace! May, two here, do not---that I troth:----to villain leave, where was the Gallent--if I look the house,--bold Jour---whether may I go,--Mine son,---as I amiled me pized,--or so fled; 'tis a famouse,--there littenants,--If an either lawful hant ther is gone.' Sicilence, if it wer done! I have twize its sourness. P\n"
     ]
    }
   ],
   "source": [
    "context = torch.zeros((1, 1), dtype=torch.long)\n",
    "print(char_tokenizer.decode(m3.generate(context, max_new_tokens=500)[0].tolist()))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}