Spaces:

Ashish-R
/

LLMFromScratch

Sleeping

App Files Files Community

Ashish Reddy committed on Jun 10, 2025

Commit

d00fb47

1 Parent(s): 2ea3f3e

Add application file

Browse files

Files changed (6) hide show

.DS_Store +0 -0
deploy.py +33 -0
model.py +64 -0
nanogpt_model.pth +3 -0
requirements.txt +2 -0
train.py +152 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

deploy.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import torch
+import gradio as gr
+from model import Model
+from train import encoder, decoder
+# Device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load model
+model = Model().to(device)
+model.load_state_dict(torch.load("nanogpt_model.pth", map_location=device))
+model.eval()
+# Generation function
+def generate_text(prompt, max_tokens):
+    idx = torch.tensor(encoder(prompt), dtype=torch.long, device=device).unsqueeze(0)
+    generated = model.generate(idx, max_new_tokens=max_tokens)[0].tolist()
+    return decoder(generated)
+# Gradio interface
+iface = gr.Interface(
+    fn=generate_text,
+    inputs=[
+        gr.Textbox(lines=2, placeholder="Enter a prompt...", label="Prompt"),
+        gr.Slider(10, 500, value=200, step=10, label="Max Tokens")
+    ],
+    outputs=gr.Textbox(label="Generated Output"),
+    title="🧠 NanoGPT from Scratch",
+    description="A tiny GPT model trained on Shakespeare. Try your luck by giving it a prompt!"
+)
+iface.launch(share=True)

model.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import torch, torch.nn as nn, torch.nn.functional as F
+# Hyperparameters. The layer sizes below determine the parameter shapes, so
+# they must match the values used when the checkpoint was trained.
+batch_size = 64  # only used by the __main__ smoke test in this file
+max_len = 256  # context window: positional-table rows and crop length in generate()
+d_model = 384  # embedding / hidden width
+n_layer = 6 # 6 blocks in the decoder
+n_head = 6  # head count passed to every Block
+d_q = int(d_model / n_head)  # per-head width (384 / 6 = 64); not referenced in this file
+dropout = 0.2  # not referenced in this file; presumably consumed by block.py (TODO confirm)
+vocab_size = 65  # token-embedding rows and width of the output logits
+from block import Block
+class Model(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.token_embedding_table = nn.Embedding(vocab_size, d_model) # Embedding matrix size: (65, 384)
+        self.positional_embedding_table = nn.Embedding(max_len, d_model) # Position matrix size: (256, 384)
+        self.blocks = nn.Sequential(*[Block(d_model, n_head) for _ in range(n_layer)])
+        self.ln = nn.LayerNorm(d_model)
+        self.unembedding_matrix_calc = nn.Linear(d_model, vocab_size)
+    def forward(self, idx, targets=None):
+        B, S = idx.shape
+        tok_emb = self.token_embedding_table(idx) # Size of embedding: (B, S, 384)
+        pos_emb = self.positional_embedding_table(torch.arange(S, device=idx.device)) # Shape: (S, 384)
+        x = tok_emb + pos_emb
+        x = self.blocks(x) # Pass through all 6 blocks each of all 6 heads
+        x = self.ln(x)
+        logits = self.unembedding_matrix_calc(x) # --> (B, S, 384) * (384, 65) --> (B, S, 65)
+        if targets is None:
+            loss = None
+        else:
+            B, S, V = logits.shape
+            logits = logits.view(-1, V) # (B, S, V) --> (B*S, V)
+            targets = targets.view(-1) # --> (B, S) --> (B*S)
+            loss = F.cross_entropy(logits, targets) # Handles softmax interally as well (better because it does log addition which reduces errors instead of log multi)
+        return logits, loss
+    def generate(self, idx, max_new_tokens):
+        for _ in range(max_new_tokens):
+            idx_cond = idx[:, -max_len:]
+            logits, loss = self(idx_cond)
+            logits = logits[:, -1, :]
+            probs = F.softmax(logits, dim=-1)
+            idx_next = torch.multinomial(probs, num_samples=1)
+            idx = torch.cat((idx, idx_next), dim = 1)
+        return idx
+if __name__ == "__main__":
+    model = Model()
+    idx = torch.zeros((batch_size, max_len), dtype=torch.long)
+    logits, loss = model(idx, idx)
+    print("Input shape:", idx.shape)
+    print("Output logits shape:", logits.shape)
+    print("Calculated loss:", loss.item())

nanogpt_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04b000c9b4136c6badf5fd7c6bab668f7fec6b7ffc1838c6d85b9d4ef6a15fce
+size 52673259

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ torch
2	+ gradio

train.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import torch, torch.nn as nn, torch.optim as optim, torch.nn.functional as F, wandb, time
+# Hyperparameters for the training run.
+batch_size = 64  # sequences sampled per get_batch() call
+max_len = 256  # tokens per sampled sequence
+d_model = 384  # in this file only recorded in the wandb config
+n_layer = 6  # in this file only recorded in the wandb config
+n_head = 6  # in this file only recorded in the wandb config
+d_q = int(d_model / n_head)  # 384 / 6 = 64; not referenced elsewhere in this file
+dropout = 0.2  # in this file only recorded in the wandb config
+vocab_size = 65  # reassigned further down from the characters found in input.txt
+max_iters = 5000  # number of optimizer steps
+eval_interval = 500  # run estimate_loss() every this many steps
+learning_rate = 3e-4  # AdamW learning rate
+eval_iters = 200  # batches averaged per split inside estimate_loss()
+"""
+---- Device ----
+"""
+if torch.cuda.is_available():
+    device = torch.device('cuda')
+    print("Using CUDA (GPU)")
+elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
+    device = torch.device('mps')
+    print("Using MPS (Apple Silicon GPU)")
+else:
+    device = torch.device('cpu')
+    print("Using device's CPU")
+"""
+--- WandB Integration ---
+"""
+wandb.init(
+    project="nano-model-shakesphere-training",
+    config={
+        "learning_rate": learning_rate,
+        "architecture": "decoder-only-model",
+        "dataset": "tinyshakesphere",
+        "d_model": d_model,
+        "n_layer": n_layer,
+        "n_head": n_head,
+        "max_iters": max_iters,
+        "dropout": dropout
+    }
+)
+with open('input.txt', 'r', encoding='utf-8') as f:
+    text = f.read()
+chars = sorted(list(set(text))) # --> All unique characters within the text
+vocab_size = len(chars) # 65 different characters in text
+stoi = {}
+itos = {}
+for i in range(len(chars)):
+    stoi[chars[i]] = i  # Convert strings to ints
+    itos[i] = chars[i]  # Convert ints to strings
+# Take a string, and output its characters indices in a list
+def encoder(s):
+    res = []
+    for char in s:
+        res.append(stoi[char])
+    return res
+# Take a list of indices and output a string
+def decoder(l):
+    res = ""
+    for i in l:
+        res += itos[i]
+    return res
+data = torch.tensor(encoder(text), dtype=torch.long) # --> Same shape as length, i.e., number of characters
+n = int(0.9 * len(data))
+train_data = data[:n] # 90% of text
+val_data = data[n:]  # 10% of text
+def get_batch(split):
+    if split.lower() == 'train':
+        data = train_data
+    else:
+        data = val_data
+    ix = torch.randint(len(data)-max_len, (batch_size,)) # Generate batch_size=64 random numbers from 0 to len(data)-max_len
+    x = torch.stack([data[i:i+max_len] for i in ix])        # Generates 250 ids from that random number and stacks batch_size by rows, so shape[64, 256]
+    y = torch.stack([data[i+1:i+max_len+1] for i in ix])    # This is done in order to test teh real y with the later predicted y by the model using cross entropy and update weights
+    return x.to(device), y.to(device)
+"""
+--- Model Training ---
+"""
+if __name__ == "__main__":
+    from model import Model
+    model = Model()
+    m = model.to(device)
+    optimizer = optim.AdamW(
+        model.parameters(),
+        lr=learning_rate
+    )
+    @torch.no_grad
+    def estimate_loss():
+        out = {}
+        model.eval()
+        for split in ['train', 'val']:
+            losses = torch.zeros(eval_iters)
+            for k in range(eval_iters):
+                X, Y = get_batch(split)
+                logits, loss = model(X, Y)
+                losses[k] = loss.item()
+            out[split] = losses.mean()
+        model.train()
+        return out
+    for iter in range(max_iters):
+        if iter % eval_interval == 0 or iter == max_iters - 1:
+            losses = estimate_loss()
+            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+            wandb.log({
+                "iter": iter,
+                "train/loss": losses['train'],
+                "val/loss": losses['val'],
+                "lr": learning_rate
+            })
+        iter_start = time.time()
+        xb, yb = get_batch("train")
+        logits, loss = model(xb, yb)
+        optimizer.zero_grad(set_to_none=True) # Required for new resetting as after iter, new set of batches will come
+        loss.backward()                       # Required for back passing, it gives you the amount of steepness and gradient
+        optimizer.step()                      # Required for actually nudging in that given direction (Taking a plausible value of lr right now but it influences a lot)
+        iter_time = time.time() - iter_start
+        print(f"Iteration {iter} completed in {iter_time:.2f} seconds")
+        wandb.log({"iter_time": iter_time})
+    wandb.finish()
+    print("Training finished. Saving model state...")
+    torch.save(model.state_dict(), 'nanogpt_model.pth')
+    print("Model saved to nanogpt_model.pth")