Spaces:
Sleeping
Sleeping
Add small GPT Python Space
Browse files- README.md +21 -6
- app.py +88 -0
- requirements.txt +2 -0
- small_gpt/__init__.py +4 -0
- small_gpt/config.py +24 -0
- small_gpt/data.py +87 -0
- small_gpt/model.py +111 -0
- small_gpt/service.py +120 -0
- small_gpt/tokenizer.py +69 -0
- small_gpt/trainer.py +52 -0
README.md
CHANGED
|
@@ -1,12 +1,27 @@
|
|
| 1 |
---
|
| 2 |
-
title: Small
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 6.10.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Small GPT Python
|
| 3 |
+
colorFrom: indigo
|
| 4 |
+
colorTo: blue
|
|
|
|
| 5 |
sdk: gradio
|
|
|
|
| 6 |
app_file: app.py
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# Small GPT Python
|
| 12 |
+
|
| 13 |
+
This is a tiny GPT-style language model project written in Python from scratch.
|
| 14 |
+
|
| 15 |
+
## What it includes
|
| 16 |
+
|
| 17 |
+
- Word-level tokenizer
|
| 18 |
+
- Causal transformer decoder with self-attention
|
| 19 |
+
- Local CPU training loop
|
| 20 |
+
- Checkpoint save and load
|
| 21 |
+
- Gradio user interface
|
| 22 |
+
|
| 23 |
+
## Important
|
| 24 |
+
|
| 25 |
+
- No external pretrained LLM is used
|
| 26 |
+
- This is a small educational GPT-like model
|
| 27 |
+
- The first generate or train call will initialize and train the model locally
|
app.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr

from small_gpt.config import SmallGPTConfig
from small_gpt.service import SmallGPTService


# Single process-wide service instance: it lazily builds or loads the model
# on the first generate/train call.
config = SmallGPTConfig()
service = SmallGPTService(config=config)


def generate_text(prompt, max_new_tokens, temperature, top_k):
    """Gradio callback: sample a continuation; returns (output text, status)."""
    # Sliders deliver floats; coerce to the types the service expects.
    return service.generate(
        prompt=prompt,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_k=int(top_k),
    )


def train_model(extra_text, steps):
    """Gradio callback: run a local training session; returns a status string."""
    return service.train(extra_text=extra_text, steps=int(steps))


def reset_model():
    """Gradio callback: delete the checkpoint and drop in-memory state."""
    return service.reset()


with gr.Blocks(
    title="Small GPT Python",
    theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"),
) as demo:
    gr.Markdown(
        """
# Small GPT Python
A tiny GPT-style language model written in Python from scratch.

- Causal transformer decoder
- Word-level tokenizer
- No external pretrained LLM
- Local CPU training and generation
"""
    )

    # Generation tab: prompt plus sampling controls.
    with gr.Tab("Generate"):
        prompt_input = gr.Textbox(
            label="Prompt",
            value="User: hello\nAssistant:",
            lines=6,
        )
        with gr.Row():
            max_tokens_input = gr.Slider(10, 180, value=72, step=2, label="Max New Tokens")
            temperature_input = gr.Slider(0.2, 1.3, value=0.75, step=0.05, label="Temperature")
            top_k_input = gr.Slider(1, 20, value=8, step=1, label="Top-K")
        generate_button = gr.Button("Generate", variant="primary")
        output_text = gr.Textbox(label="Output", lines=10)
        output_status = gr.Textbox(label="Status", lines=4)

    # Training tab: optional extra corpus text plus a step count.
    with gr.Tab("Train"):
        extra_text_input = gr.Textbox(
            label="Extra Training Text",
            placeholder="Add more local text to continue training the small GPT model.",
            lines=10,
        )
        steps_input = gr.Slider(10, 400, value=120, step=10, label="Training Steps")
        train_button = gr.Button("Train / Continue Training", variant="primary")
        reset_button = gr.Button("Reset Model")
        train_status = gr.Textbox(label="Training Status", lines=6)

    # Wire the buttons to the callbacks above.
    generate_button.click(
        fn=generate_text,
        inputs=[prompt_input, max_tokens_input, temperature_input, top_k_input],
        outputs=[output_text, output_status],
    )

    train_button.click(
        fn=train_model,
        inputs=[extra_text_input, steps_input],
        outputs=[train_status],
    )

    reset_button.click(
        fn=reset_model,
        outputs=[train_status],
    )


if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=5.23.0
|
| 2 |
+
torch>=2.3.0
|
small_gpt/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .config import SmallGPTConfig
|
| 2 |
+
from .service import SmallGPTService
|
| 3 |
+
|
| 4 |
+
__all__ = ["SmallGPTConfig", "SmallGPTService"]
|
small_gpt/config.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass
class SmallGPTConfig:
    """Hyperparameters and filesystem paths for the small GPT project."""

    block_size: int = 48  # context window length, in tokens
    batch_size: int = 20  # sequences per training batch
    d_model: int = 96  # embedding / hidden width
    n_heads: int = 4  # attention heads (d_model must divide evenly)
    n_layers: int = 3  # transformer blocks
    dropout: float = 0.1
    learning_rate: float = 2.5e-3
    bootstrap_steps: int = 80  # training steps run on first use, before any checkpoint exists
    cpu_threads: int = 4  # torch thread cap for small CPU hosts
    seed: int = 42

    @property
    def root_dir(self) -> Path:
        """Project root: the directory that contains the ``small_gpt`` package."""
        return Path(__file__).resolve().parents[1]

    @property
    def checkpoint_path(self) -> Path:
        """Location of the saved model + tokenizer checkpoint."""
        return self.root_dir / "artifacts" / "small_gpt_checkpoint.pt"
|
small_gpt/data.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
BASE_CORPUS = """
|
| 2 |
+
System: You are a small local GPT style assistant.
|
| 3 |
+
User: hello
|
| 4 |
+
Assistant: Hello. I am a tiny GPT style language model built from scratch in Python.
|
| 5 |
+
|
| 6 |
+
User: what are you
|
| 7 |
+
Assistant: I am a local small GPT style model trained on a compact text corpus.
|
| 8 |
+
|
| 9 |
+
User: explain machine learning
|
| 10 |
+
Assistant: Machine learning builds patterns from data so models can make predictions.
|
| 11 |
+
|
| 12 |
+
User: explain deep learning
|
| 13 |
+
Assistant: Deep learning uses neural networks with multiple layers to learn useful representations.
|
| 14 |
+
|
| 15 |
+
User: explain natural language processing
|
| 16 |
+
Assistant: Natural language processing helps computers understand and generate human language.
|
| 17 |
+
|
| 18 |
+
User: what is regression
|
| 19 |
+
Assistant: Regression predicts a continuous value such as price, score, demand, or temperature.
|
| 20 |
+
|
| 21 |
+
User: what is classification
|
| 22 |
+
Assistant: Classification predicts a category or label for an example.
|
| 23 |
+
|
| 24 |
+
User: what is clustering
|
| 25 |
+
Assistant: Clustering groups similar data points without labeled targets.
|
| 26 |
+
|
| 27 |
+
User: what is preprocessing
|
| 28 |
+
Assistant: Preprocessing cleans and transforms data before modeling.
|
| 29 |
+
|
| 30 |
+
User: what is model evaluation
|
| 31 |
+
Assistant: Model evaluation measures how well a model performs on unseen data.
|
| 32 |
+
|
| 33 |
+
User: what is Python
|
| 34 |
+
Assistant: Python is a programming language known for readable code and strong libraries.
|
| 35 |
+
|
| 36 |
+
User: what is attention
|
| 37 |
+
Assistant: Attention lets a model focus on useful earlier tokens when predicting the next token.
|
| 38 |
+
|
| 39 |
+
User: what is a transformer
|
| 40 |
+
Assistant: A transformer is a neural architecture that uses attention and layered processing.
|
| 41 |
+
|
| 42 |
+
User: what is a tokenizer
|
| 43 |
+
Assistant: A tokenizer converts text into smaller units that a model can process.
|
| 44 |
+
|
| 45 |
+
User: what is overfitting
|
| 46 |
+
Assistant: Overfitting happens when a model memorizes training data and generalizes poorly.
|
| 47 |
+
|
| 48 |
+
User: what is underfitting
|
| 49 |
+
Assistant: Underfitting happens when a model is too simple to capture important patterns.
|
| 50 |
+
|
| 51 |
+
User: write a short greeting
|
| 52 |
+
Assistant: Hello and welcome. I am ready to help with a short answer.
|
| 53 |
+
|
| 54 |
+
User: write a short poem
|
| 55 |
+
Assistant: Quiet rain on window light, patient thoughts arrive at night.
|
| 56 |
+
|
| 57 |
+
User: give a coding tip
|
| 58 |
+
Assistant: Build small pieces, test often, and keep your functions clear.
|
| 59 |
+
|
| 60 |
+
User: how should I debug code
|
| 61 |
+
Assistant: Reproduce the problem, isolate the failing step, inspect values, and test one change at a time.
|
| 62 |
+
|
| 63 |
+
User: summarize good study habits
|
| 64 |
+
Assistant: Good study habits use planning, active recall, spaced review, and steady practice.
|
| 65 |
+
|
| 66 |
+
User: what is data science
|
| 67 |
+
Assistant: Data science combines programming, statistics, and domain knowledge to learn from data.
|
| 68 |
+
|
| 69 |
+
User: what is a dataset
|
| 70 |
+
Assistant: A dataset is a collection of examples or records used for analysis or training.
|
| 71 |
+
|
| 72 |
+
User: what is a feature
|
| 73 |
+
Assistant: A feature is an input variable used by a model.
|
| 74 |
+
|
| 75 |
+
User: what is a target
|
| 76 |
+
Assistant: A target is the value or label a model tries to predict.
|
| 77 |
+
|
| 78 |
+
User: what is local ai
|
| 79 |
+
Assistant: Local AI runs on your own machine so you control the code, files, and execution.
|
| 80 |
+
""".strip()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def build_training_text(extra_text: str = "") -> str:
    """Return the base corpus, optionally followed by normalized extra text.

    NOTE(review): the normalization collapses ALL whitespace (including
    newlines) in *extra_text* to single spaces, so any dialogue line
    structure in user-supplied text is flattened — confirm this is intended.
    """
    normalized = " ".join((extra_text or "").split())
    if normalized:
        return BASE_CORPUS + "\n\n" + normalized
    return BASE_CORPUS
|
small_gpt/model.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from torch import nn
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (lower-triangular) mask."""

    def __init__(self, d_model, n_heads, block_size, dropout):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        # One fused projection for query, key, and value.
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.out_proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular mask, shaped for broadcasting over (batch, heads).
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("mask", causal.view(1, 1, block_size, block_size))

    def forward(self, x):
        bsz, length, width = x.shape

        # Project once, split, and reshape to (batch, heads, seq, head_dim).
        query, key, value = self.qkv(x).chunk(3, dim=-1)
        per_head = (bsz, length, self.n_heads, self.head_dim)
        query = query.view(*per_head).transpose(1, 2)
        key = key.view(*per_head).transpose(1, 2)
        value = value.view(*per_head).transpose(1, 2)

        # Scaled dot-product scores; future positions are masked to -inf
        # before the softmax so attention stays strictly causal.
        scores = (query @ key.transpose(-2, -1)) / math.sqrt(self.head_dim)
        scores = scores.masked_fill(self.mask[:, :, :length, :length] == 0, float("-inf"))
        weights = self.dropout(torch.softmax(scores, dim=-1))

        # Mix values and merge heads back into (batch, seq, d_model).
        mixed = (weights @ value).transpose(1, 2).contiguous().view(bsz, length, width)
        return self.out_proj(mixed)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x width, GELU, project back, dropout."""

    def __init__(self, d_model, dropout):
        super().__init__()
        # Kept as a single Sequential so the state-dict key layout
        # (net.0, net.2, ...) matches saved checkpoints.
        self.net = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class GPTBlock(nn.Module):
    """One transformer decoder block: attention then feed-forward, each
    applied pre-norm with a residual connection."""

    def __init__(self, d_model, n_heads, block_size, dropout):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = CausalSelfAttention(d_model, n_heads, block_size, dropout)
        self.ln2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, dropout)

    def forward(self, x):
        # Pre-norm residual layout: normalize the input to each sublayer,
        # then add the sublayer output back onto the stream.
        x = x + self.attn(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class SmallGPTModel(nn.Module):
    """A minimal GPT-style decoder-only language model.

    Token embeddings plus learned positional embeddings feed a stack of
    causal transformer blocks; the output head is weight-tied to the
    token embedding table.
    """

    def __init__(self, vocab_size, block_size, d_model, n_heads, n_layers, dropout):
        super().__init__()
        self.block_size = block_size
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(block_size, d_model)
        self.dropout = nn.Dropout(dropout)
        self.blocks = nn.Sequential(
            *[GPTBlock(d_model, n_heads, block_size, dropout) for _ in range(n_layers)]
        )
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size, bias=False)
        # Weight tying: the output projection shares the embedding matrix.
        self.head.weight = self.token_emb.weight

    def forward(self, idx, targets=None):
        """Return (logits, loss).

        idx: (batch, seq) integer token ids with seq <= block_size.
        targets: optional (batch, seq) ids; when given, loss is the
            cross-entropy over all positions, otherwise loss is None.
        """
        batch, seq_len = idx.shape
        positions = torch.arange(seq_len, device=idx.device)
        x = self.token_emb(idx) + self.pos_emb(positions)[None, :, :]
        x = self.dropout(x)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)

        loss = None
        if targets is not None:
            loss = nn.functional.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                targets.reshape(-1),
            )
        return logits, loss

    def generate(self, idx, max_new_tokens, eos_id, temperature=1.0, top_k=8):
        """Autoregressively append up to max_new_tokens sampled ids to idx.

        Stops early once every sequence in the batch has just emitted
        eos_id. Returns the extended (batch, seq + new) id tensor.
        """
        for _ in range(max_new_tokens):
            # Crop the context to the positional-embedding capacity.
            idx_cond = idx[:, -self.block_size :]
            logits, _ = self(idx_cond)
            # Temperature scaling; the floor avoids division by zero.
            logits = logits[:, -1, :] / max(temperature, 1e-4)

            # Top-k filtering: drop everything below the k-th best logit.
            if top_k is not None and top_k > 0:
                values, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < values[:, [-1]]] = float("-inf")

            probs = torch.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_id], dim=1)
            # Fix: the original `int(next_id.item())` comparison raised for
            # batch sizes > 1; stop only when every row emitted eos_id
            # (identical behavior for the batch-size-1 callers).
            if bool((next_id == eos_id).all()):
                break
        return idx
|
small_gpt/service.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import shutil
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
from .config import SmallGPTConfig
|
| 6 |
+
from .model import SmallGPTModel
|
| 7 |
+
from .tokenizer import WordTokenizer
|
| 8 |
+
from .trainer import create_model_and_tokenizer, set_seed, train_model
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class SmallGPTService:
    """Stateful facade that owns the model, tokenizer, and checkpoint I/O."""

    def __init__(self, config: SmallGPTConfig):
        self.config = config
        # Cap torch threads so small CPU hosts are not oversubscribed.
        torch.set_num_threads(max(1, self.config.cpu_threads))
        # Both are lazily initialized by _ensure_ready / train.
        self.model = None
        self.tokenizer = None

    def generate(self, prompt: str, max_new_tokens: int, temperature: float, top_k: int):
        """Sample a continuation of *prompt*; returns (text, status message)."""
        clean_prompt = prompt or "User: hello\nAssistant:"
        self._ensure_ready()
        encoded = self.tokenizer.encode(clean_prompt, add_bos=True)
        # Shape (1, seq): generation runs with a batch of one.
        idx = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
        self.model.eval()

        with torch.inference_mode():
            output = self.model.generate(
                idx=idx,
                max_new_tokens=max_new_tokens,
                eos_id=self.tokenizer.eos_id,
                temperature=temperature,
                top_k=top_k,
            )

        text = self.tokenizer.decode(output[0].tolist())
        status = (
            f"Generated with small GPT Python. "
            f"Architecture=causal transformer, Vocab={self.tokenizer.vocab_size}, Layers={self.config.n_layers}."
        )
        return text, status

    def train(self, extra_text: str, steps: int):
        """Train (or continue training) on the base corpus plus *extra_text*."""
        steps = max(1, steps)
        checkpoint_exists = self.config.checkpoint_path.exists()
        training_text = extra_text or ""

        # Bring the previous weights into memory so they can be carried over.
        if checkpoint_exists:
            self._load_or_initialize(extra_text="")

        model, tokenizer, encoded = create_model_and_tokenizer(self.config, training_text)
        # Reuse the old weights only when the vocabulary is unchanged:
        # new extra text can grow the vocab, which changes embedding shapes.
        if checkpoint_exists and self.model is not None and self.tokenizer is not None:
            if tokenizer.stoi == self.tokenizer.stoi:
                model.load_state_dict(self.model.state_dict())

        losses = train_model(model, encoded, self.config, steps)
        self.model = model
        self.tokenizer = tokenizer
        self._save_checkpoint(extra_text=training_text)

        return (
            f"small GPT training finished.\n"
            f"Steps: {steps}\n"
            f"Start Loss: {losses[0]:.4f}\n"
            f"End Loss: {losses[-1]:.4f}\n"
            f"Checkpoint: {self.config.checkpoint_path}"
        )

    def reset(self):
        """Delete the checkpoint directory and drop the in-memory model."""
        checkpoint_dir = self.config.checkpoint_path.parent
        if checkpoint_dir.exists():
            shutil.rmtree(checkpoint_dir)
        self.model = None
        self.tokenizer = None
        return "small GPT reset complete. Next train or generate call will rebuild from scratch."

    def _ensure_ready(self):
        """Make sure a model and tokenizer exist before generation."""
        if self.model is not None and self.tokenizer is not None:
            return
        self._load_or_initialize(extra_text="")

    def _load_or_initialize(self, extra_text: str):
        """Restore model + tokenizer from the checkpoint, or bootstrap-train one."""
        checkpoint = self.config.checkpoint_path
        if checkpoint.exists():
            # NOTE(review): torch.load without weights_only=True unpickles
            # arbitrary objects — fine for a checkpoint this service wrote
            # itself, but never point it at untrusted files.
            state = torch.load(checkpoint, map_location="cpu")
            self.tokenizer = WordTokenizer.from_state_dict(state["tokenizer"])
            # Rebuild the architecture from the saved hyperparameters so the
            # state dict shapes line up even if SmallGPTConfig changed since.
            self.model = SmallGPTModel(
                vocab_size=state["config"]["vocab_size"],
                block_size=state["config"]["block_size"],
                d_model=state["config"]["d_model"],
                n_heads=state["config"]["n_heads"],
                n_layers=state["config"]["n_layers"],
                dropout=state["config"]["dropout"],
            )
            self.model.load_state_dict(state["model"])
            self.model.eval()
            return

        # First run: train a fresh model for a few bootstrap steps.
        set_seed(self.config.seed)
        self.model, self.tokenizer, encoded = create_model_and_tokenizer(self.config, extra_text)
        train_model(self.model, encoded, self.config, self.config.bootstrap_steps)
        self._save_checkpoint(extra_text=extra_text)

    def _save_checkpoint(self, extra_text: str):
        """Persist model weights, tokenizer vocab, and model config to disk."""
        checkpoint = self.config.checkpoint_path
        checkpoint.parent.mkdir(parents=True, exist_ok=True)
        torch.save(
            {
                "model": self.model.state_dict(),
                "tokenizer": self.tokenizer.state_dict(),
                "config": {
                    "vocab_size": self.tokenizer.vocab_size,
                    "block_size": self.config.block_size,
                    "d_model": self.config.d_model,
                    "n_heads": self.config.n_heads,
                    "n_layers": self.config.n_layers,
                    "dropout": self.config.dropout,
                    "extra_text": extra_text,
                },
            },
            checkpoint,
        )
|
small_gpt/tokenizer.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# Matches newlines, word-ish chunks (letters/digits/underscore/apostrophe),
# or any single non-space symbol.
TOKEN_PATTERN = re.compile(r"\n|[A-Za-z0-9_']+|[^\w\s]")


class WordTokenizer:
    """Word-level tokenizer with <pad>/<unk>/<bos>/<eos> special tokens."""

    def __init__(self):
        self.special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"]
        self.stoi = {}
        self.itos = {}

    @property
    def vocab_size(self):
        """Number of vocabulary entries, specials included."""
        return len(self.stoi)

    @property
    def bos_id(self):
        return self.stoi["<bos>"]

    @property
    def eos_id(self):
        return self.stoi["<eos>"]

    def tokenize(self, text: str):
        """Split text into newlines, word chunks, and punctuation marks."""
        return TOKEN_PATTERN.findall(text)

    def fit(self, text: str):
        """Build the vocabulary from *text*; specials get the lowest ids."""
        words = sorted(set(self.tokenize(text)))
        self.stoi = {token: token_id for token_id, token in enumerate(self.special_tokens + words)}
        self.itos = {token_id: token for token, token_id in self.stoi.items()}
        return self

    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False):
        """Map text to token ids; out-of-vocabulary tokens become <unk>."""
        unk_id = self.stoi["<unk>"]
        ids = [self.stoi.get(token, unk_id) for token in self.tokenize(text)]
        if add_bos:
            ids.insert(0, self.bos_id)
        if add_eos:
            ids.append(self.eos_id)
        return ids

    def decode(self, ids):
        """Render token ids as text, re-attaching punctuation and newlines."""
        text = ""
        for raw_id in ids:
            token = self.itos.get(int(raw_id), "<unk>")
            if token in self.special_tokens:
                continue  # specials never appear in the output
            if token == "\n":
                text = text.rstrip() + "\n"
            elif token in {".", ",", "!", "?", ":", ";"}:
                # Glue punctuation onto the preceding word.
                text = text.rstrip() + token + " "
            else:
                text += token + " "
        return text.strip()

    def state_dict(self):
        """Serializable vocabulary state (itos is derived, so only stoi)."""
        return {"stoi": self.stoi}

    @classmethod
    def from_state_dict(cls, state):
        """Rebuild a tokenizer from ``state_dict()`` output."""
        tokenizer = cls()
        tokenizer.stoi = dict(state["stoi"])
        tokenizer.itos = {token_id: token for token, token_id in tokenizer.stoi.items()}
        return tokenizer
|
small_gpt/trainer.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
from .data import build_training_text
|
| 6 |
+
from .model import SmallGPTModel
|
| 7 |
+
from .tokenizer import WordTokenizer
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def set_seed(seed: int):
    """Seed Python's and torch's RNGs for reproducible training runs."""
    random.seed(seed)
    torch.manual_seed(seed)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def create_model_and_tokenizer(config, extra_text=""):
    """Build a fresh tokenizer, encoded corpus, and model for *config*.

    Returns (model, tokenizer, encoded) where ``encoded`` is a 1-D int64
    tensor of the full training text (base corpus plus *extra_text*),
    wrapped in <bos>/<eos> markers.
    """
    text = build_training_text(extra_text)
    tokenizer = WordTokenizer().fit(text)
    encoded = tokenizer.encode(text, add_bos=True, add_eos=True)
    encoded = torch.tensor(encoded, dtype=torch.long)
    # Vocab size depends on the fitted tokenizer, so the model is built last.
    model = SmallGPTModel(
        vocab_size=tokenizer.vocab_size,
        block_size=config.block_size,
        d_model=config.d_model,
        n_heads=config.n_heads,
        n_layers=config.n_layers,
        dropout=config.dropout,
    )
    return model, tokenizer, encoded
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def build_batch(encoded, block_size, batch_size):
    """Sample a random batch of (input, shifted-target) windows.

    encoded: 1-D integer tensor of token ids.
    Returns (x, y), each (batch_size, block_size), with y shifted one
    position ahead of x.

    Raises ValueError when the corpus is too short to cut a full window
    (the original silently produced mismatched x/y widths in that case).
    """
    n_tokens = len(encoded)
    if n_tokens <= block_size:
        raise ValueError(
            f"need more than {block_size} tokens to build a batch, got {n_tokens}"
        )
    # torch.randint's high bound is exclusive; n_tokens - block_size makes
    # every valid start reachable (the original's extra -1 meant the last
    # window was never sampled).
    starts = torch.randint(0, n_tokens - block_size, (batch_size,))
    x = torch.stack([encoded[start : start + block_size] for start in starts])
    y = torch.stack([encoded[start + 1 : start + block_size + 1] for start in starts])
    return x, y
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def train_model(model, encoded, config, steps):
    """Run a plain AdamW training loop and return the per-step losses."""
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
    model.train()

    loss_history = []
    for _ in range(steps):
        inputs, targets = build_batch(encoded, config.block_size, config.batch_size)
        _, loss = model(inputs, targets=targets)

        # Standard zero-grad / backward / step cycle.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_history.append(float(loss.item()))

    return loss_history
|