darthcrawl committed
Commit 6e14144 · verified · 1 Parent(s): 2adf8d3

Add files using upload-large-folder tool

Files changed (8)
  1. README.md +120 -0
  2. config.json +12 -0
  3. config.py +50 -0
  4. meta.txt +6 -0
  5. model.py +180 -0
  6. model.safetensors +3 -0
  7. sample.py +55 -0
  8. tokenizer.json +0 -0
README.md ADDED
@@ -0,0 +1,120 @@
---
license: apache-2.0
language:
- en
library_name: pytorch
tags:
- causal-lm
- pretrained-from-scratch
- small-lm
- gpt
datasets:
- roneneldan/TinyStories
- roneneldan/TinyStoriesInstruct
- wikimedia/wikipedia
- nampdn-ai/tiny-textbooks
pipeline_tag: text-generation
---

# tiny-38m

A 37.8M-parameter decoder-only transformer pretrained from scratch on a mix of small, simple-vocabulary corpora. Pure PyTorch, single GPU, no HF Trainer, no PEFT, no distillation.

An educational artifact: it demonstrates that the modern transformer recipe (RMSNorm + RoPE + SwiGLU + SDPA) reaches coherent output at small scale on a single GPU.

## Quick start

```python
import json, sys, torch
from pathlib import Path
from huggingface_hub import snapshot_download
from tokenizers import Tokenizer
from safetensors.torch import load_file

local = snapshot_download("darthcrawl/tiny-38m")
sys.path.insert(0, local)
from config import ModelConfig
from model import GPT

cfg_dict = json.loads((Path(local) / "config.json").read_text())
valid = set(ModelConfig.__dataclass_fields__)
cfg = ModelConfig(**{k: v for k, v in cfg_dict.items() if k in valid})

model = GPT(cfg).eval()
model.load_state_dict(load_file(f"{local}/model.safetensors"), strict=False)

tok = Tokenizer.from_file(f"{local}/tokenizer.json")
eot = tok.token_to_id("<|endoftext|>")

ids = torch.tensor([tok.encode("Once upon a time, there was a small dragon").ids], dtype=torch.long)
out = model.generate(ids, max_new_tokens=200, temperature=0.8, top_k=200, eos_id=eot)
print(tok.decode(out[0].tolist()))
```

`strict=False` is required because the tied embeddings (`lm_head.weight = tok_emb.weight`) are stored only once in the checkpoint.
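
A quick way to confirm the tying survived the load (a minimal sketch, reusing `model` from the snippet above):

```python
# GPT.__init__ ties lm_head.weight to tok_emb.weight; loading the single
# stored copy with strict=False updates both views. Verify they still
# alias the same storage.
assert model.lm_head.weight.data_ptr() == model.tok_emb.weight.data_ptr()
```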

## Architecture

| | |
|---|---|
| Type | Decoder-only transformer |
| Parameters | 37.8M |
| Layers | 8 |
| Hidden dim | 512 |
| Attention heads | 8 |
| Context length | 1024 |
| Vocab size | 8192 |
| Position encoding | RoPE |
| Norm | RMSNorm (pre-norm) |
| MLP | SwiGLU |
| Attention | PyTorch SDPA, causal |
| Embedding tying | Yes |
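
The 37.8M figure checks out from the table alone; a back-of-envelope count (a sketch, with the SwiGLU hidden dim of 4 × 512 = 2048 taken from `config.json` and `model.py`):

```python
d, L, V, hidden = 512, 8, 8192, 2048  # n_embd, n_layer, vocab, SwiGLU hidden (already a multiple of 64)
per_layer = (3 * d * d                # fused QKV projection
             + d * d                  # attention output projection
             + 3 * d * hidden         # SwiGLU w1, w3, w2
             + 2 * d)                 # two RMSNorm weight vectors
total = L * per_layer + d + V * d     # blocks + final norm + tied embedding (counted once)
print(f"{total:,}")                   # 37,757,440 ≈ 37.8M
```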

## Training

| | |
|---|---|
| Source mix | `tinystories:60,tinystories_instruct:15,simple_wiki:15,tiny_textbooks:10` |
| Total train tokens | 477,521,740 |
| Best ckpt step | 19500 |
| Best val loss | 1.8847 |
| Optimizer | AdamW (β = (0.9, 0.95), wd = 0.1) |
| Peak LR | 6e-4 |
| LR schedule | Cosine, 200-step warmup |
| Batch size | 32 × grad accum 4 (effective 128) |
| Precision | bfloat16 (AMP) |
| Hardware | Single GPU |

Mix format is `name:weight,...`; `meta.txt` in this repo is the canonical record.
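
Parsing the mix string into normalized sampling weights is straightforward; a minimal sketch (the `parse_mix` helper is illustrative, not part of the upstream pipeline):

```python
def parse_mix(spec: str) -> dict[str, float]:
    # "a:60,b:40" -> {"a": 0.6, "b": 0.4}
    weights = {name: float(w) for name, w in (item.split(":") for item in spec.split(","))}
    total = sum(weights.values())
    return {name: w / total for name, w in weights.items()}

mix = parse_mix("tinystories:60,tinystories_instruct:15,simple_wiki:15,tiny_textbooks:10")
# {'tinystories': 0.6, 'tinystories_instruct': 0.15, 'simple_wiki': 0.15, 'tiny_textbooks': 0.1}
```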

## Tokenizer

Byte-level BPE trained on the same source mix. Single `tokenizer.json` (HuggingFace `tokenizers` format) with an 8192-token vocabulary. Special tokens: `<|endoftext|>` (eot/eos), `<|pad|>`.
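
A quick round trip with the `tokenizers` API (a sketch; assumes `tokenizer.json` from this repo is in the working directory):

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
enc = tok.encode("Once upon a time")
print(enc.ids)                           # byte-level BPE ids, all < 8192
print(tok.decode(enc.ids))               # "Once upon a time"
print(tok.token_to_id("<|endoftext|>"))  # 0, per meta.txt
```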

## What it can do

- Continue toddler-level English narratives in the TinyStories register.
- Produce short factual-sounding text in the simple-Wikipedia register.
- Follow basic prompt → story patterns from TinyStoriesInstruct.

## What it can't do

- General-knowledge QA, code, math, multi-turn chat, reasoning, or instructions beyond what was in the training mix.
- Out-of-distribution vocabulary. The vocab is small and the corpus is intentionally narrow.
- Reliable factuality. Even on simple-wiki-style prompts it will confabulate.

## Intended use

Education, replication, ablations, and a baseline for from-scratch pretraining experiments. Not for downstream production use.

## Limitations and bias

Inherits whatever biases live in the synthetic TinyStories corpora and Simple English Wikipedia. Outputs are not safe for any user-facing application: no safety alignment, no instruction tuning, no RLHF.

## Reproducibility

Inference code (`model.py`, `config.py`, `sample.py`) ships in this repo. The full training pipeline (tokenizer training, data prep, training loop, source mixing) lives in the upstream project.

## License

Apache 2.0 for code and weights. Training data licenses follow their respective sources (see the datasets listed in the metadata).
config.json ADDED
@@ -0,0 +1,12 @@
```json
{
  "vocab_size": 8192,
  "n_layer": 8,
  "n_head": 8,
  "n_embd": 512,
  "block_size": 1024,
  "rope_base": 10000.0,
  "mlp_mult": 4,
  "dropout": 0.0,
  "tie_embeddings": true,
  "arch": "from_scratch_gpt"
}
```
config.py ADDED
@@ -0,0 +1,50 @@
```python
from dataclasses import dataclass, asdict


@dataclass
class ModelConfig:
    vocab_size: int = 8192
    n_layer: int = 8
    n_head: int = 8
    n_embd: int = 512
    block_size: int = 1024
    rope_base: float = 10000.0
    mlp_mult: int = 4
    dropout: float = 0.0
    tie_embeddings: bool = True

    @property
    def head_dim(self) -> int:
        assert self.n_embd % self.n_head == 0
        return self.n_embd // self.n_head


@dataclass
class TrainConfig:
    out_dir: str = "checkpoints"
    data_dir: str = "data"
    tokenizer_path: str = "data/tokenizer.json"

    batch_size: int = 32
    grad_accum: int = 4
    max_steps: int = 20000
    eval_interval: int = 500
    eval_iters: int = 100
    log_interval: int = 20
    save_interval: int = 2000

    lr: float = 6e-4
    min_lr: float = 6e-5
    warmup_steps: int = 200
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0

    dtype: str = "bfloat16"
    compile: bool = True
    seed: int = 1337
    device: str = "cuda"

    def to_dict(self):
        return asdict(self)
```
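
The README's "cosine, 200-step warmup" schedule maps directly onto these fields; a minimal sketch (the `lr_at` function is illustrative; the decay endpoint, `min_lr` at `max_steps`, is an assumption):

```python
import math

from config import TrainConfig


def lr_at(step: int, cfg: TrainConfig) -> float:
    # Linear warmup to cfg.lr, then cosine decay toward cfg.min_lr.
    if step < cfg.warmup_steps:
        return cfg.lr * (step + 1) / cfg.warmup_steps
    progress = (step - cfg.warmup_steps) / max(1, cfg.max_steps - cfg.warmup_steps)
    coeff = 0.5 * (1.0 + math.cos(math.pi * min(progress, 1.0)))
    return cfg.min_lr + coeff * (cfg.lr - cfg.min_lr)


cfg = TrainConfig()
print(lr_at(0, cfg), lr_at(cfg.warmup_steps, cfg), lr_at(cfg.max_steps, cfg))
# 3e-06 (ramp start), 0.0006 (peak), 6e-05 (floor)
```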
meta.txt ADDED
@@ -0,0 +1,6 @@
```text
dtype=uint16
vocab=8192
eot=0
train_tokens=477521740
val_tokens=9456433
mix=tinystories:60,tinystories_instruct:15,simple_wiki:15,tiny_textbooks:10
```
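
`dtype=uint16` means the packed token files can be memory-mapped directly, since every id fits below 2^16; a sketch of a loader (the `data/train.bin` filename and the `get_batch` helper are assumptions, the actual naming is not recorded here):

```python
import numpy as np
import torch

# Token ids fit in uint16 because vocab=8192 < 65536.
data = np.memmap("data/train.bin", dtype=np.uint16, mode="r")


def get_batch(block_size: int = 1024, batch_size: int = 32):
    # Sample random windows; targets are the inputs shifted by one token.
    ix = np.random.randint(0, len(data) - block_size - 1, size=batch_size)
    x = torch.stack([torch.from_numpy(data[i:i + block_size].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i + 1:i + 1 + block_size].astype(np.int64)) for i in ix])
    return x, y
```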
model.py ADDED
@@ -0,0 +1,180 @@
```python
"""Decoder-only transformer with RMSNorm, RoPE, SwiGLU. Educational, modern, single-GPU."""
from __future__ import annotations

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from config import ModelConfig


class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        norm = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return self.weight * norm.to(x.dtype)


def build_rope_cache(seq_len: int, head_dim: int, base: float, device, dtype):
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
    t = torch.arange(seq_len, device=device).float()
    freqs = torch.outer(t, inv_freq)
    cos = freqs.cos().to(dtype)
    sin = freqs.sin().to(dtype)
    return cos, sin


def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: (B, H, T, D). Pair adjacent dims and rotate.
    x1, x2 = x[..., 0::2], x[..., 1::2]
    cos = cos[None, None, :x.size(-2), :]
    sin = sin[None, None, :x.size(-2), :]
    rot1 = x1 * cos - x2 * sin
    rot2 = x1 * sin + x2 * cos
    out = torch.stack((rot1, rot2), dim=-1).flatten(-2)
    return out


class CausalSelfAttention(nn.Module):
    def __init__(self, cfg: ModelConfig):
        super().__init__()
        self.n_head = cfg.n_head
        self.head_dim = cfg.head_dim
        self.qkv = nn.Linear(cfg.n_embd, 3 * cfg.n_embd, bias=False)
        self.proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.dropout = cfg.dropout

    def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape
        qkv = self.qkv(x)
        q, k, v = qkv.chunk(3, dim=-1)
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        q = apply_rope(q, cos, sin)
        k = apply_rope(k, cos, sin)

        y = F.scaled_dot_product_attention(
            q, k, v,
            is_causal=True,
            dropout_p=self.dropout if self.training else 0.0,
        )
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.proj(y)


class SwiGLU(nn.Module):
    def __init__(self, cfg: ModelConfig):
        super().__init__()
        hidden = cfg.mlp_mult * cfg.n_embd
        # Round to multiple of 64 for efficiency.
        hidden = ((hidden + 63) // 64) * 64
        self.w1 = nn.Linear(cfg.n_embd, hidden, bias=False)
        self.w3 = nn.Linear(cfg.n_embd, hidden, bias=False)
        self.w2 = nn.Linear(hidden, cfg.n_embd, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w2(F.silu(self.w1(x)) * self.w3(x))


class Block(nn.Module):
    def __init__(self, cfg: ModelConfig):
        super().__init__()
        self.norm1 = RMSNorm(cfg.n_embd)
        self.attn = CausalSelfAttention(cfg)
        self.norm2 = RMSNorm(cfg.n_embd)
        self.mlp = SwiGLU(cfg)

    def forward(self, x, cos, sin):
        x = x + self.attn(self.norm1(x), cos, sin)
        x = x + self.mlp(self.norm2(x))
        return x


class GPT(nn.Module):
    def __init__(self, cfg: ModelConfig):
        super().__init__()
        self.cfg = cfg
        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.n_embd)
        self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layer)])
        self.norm = RMSNorm(cfg.n_embd)
        self.lm_head = nn.Linear(cfg.n_embd, cfg.vocab_size, bias=False)
        if cfg.tie_embeddings:
            self.lm_head.weight = self.tok_emb.weight

        self.apply(self._init_weights)
        # Scale residual projections per GPT-2 init.
        for name, p in self.named_parameters():
            if name.endswith("proj.weight") or name.endswith("w2.weight"):
                nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * cfg.n_layer))

        self._rope_cache = None

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)

    def num_params(self, non_embedding: bool = True) -> int:
        n = sum(p.numel() for p in self.parameters())
        if non_embedding and self.cfg.tie_embeddings:
            n -= self.tok_emb.weight.numel()
        return n

    def _rope(self, T: int, device, dtype):
        if (self._rope_cache is None
                or self._rope_cache[0].size(0) < T
                or self._rope_cache[0].device != device
                or self._rope_cache[0].dtype != dtype):
            self._rope_cache = build_rope_cache(
                self.cfg.block_size, self.cfg.head_dim, self.cfg.rope_base, device, dtype,
            )
        cos, sin = self._rope_cache
        return cos[:T], sin[:T]

    def forward(self, idx: torch.Tensor, targets: torch.Tensor | None = None):
        B, T = idx.shape
        assert T <= self.cfg.block_size, f"sequence length {T} > block_size {self.cfg.block_size}"

        x = self.tok_emb(idx)
        cos, sin = self._rope(T, x.device, x.dtype)
        for block in self.blocks:
            x = block(x, cos, sin)
        x = self.norm(x)

        if targets is None:
            logits = self.lm_head(x[:, [-1], :])
            return logits, None

        logits = self.lm_head(x)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx: torch.Tensor, max_new_tokens: int,
                 temperature: float = 1.0, top_k: int | None = None,
                 eos_id: int | None = None):
        for _ in range(max_new_tokens):
            idx_cond = idx if idx.size(1) <= self.cfg.block_size else idx[:, -self.cfg.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / max(temperature, 1e-5)
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float("inf")
            probs = F.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_id), dim=1)
            if eos_id is not None and (next_id == eos_id).all():
                break
        return idx
```
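
A CPU smoke test of the module above (a sketch; with random tokens the initial loss should sit near ln(8192) ≈ 9.0):

```python
import torch

from config import ModelConfig
from model import GPT

cfg = ModelConfig()
model = GPT(cfg)
print(f"{model.num_params() / 1e6:.1f}M non-embedding params")  # ~33.6M (+4.2M tied embedding = 37.8M)

idx = torch.randint(0, cfg.vocab_size, (2, 64))
targets = torch.randint(0, cfg.vocab_size, (2, 64))
logits, loss = model(idx, targets)
print(logits.shape, loss.item())  # torch.Size([2, 64, 8192]), ~9.0 at init
```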
model.safetensors ADDED
@@ -0,0 +1,3 @@
```text
version https://git-lfs.github.com/spec/v1
oid sha256:61c6b1b608732dfd322ac3b51cfadee1382a575f23a5b1dad2064baf75447f69
size 151035216
```
sample.py ADDED
@@ -0,0 +1,55 @@
```python
"""Generate from a trained checkpoint."""
import argparse

import torch
from tokenizers import Tokenizer

from config import ModelConfig
from model import GPT


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--ckpt", type=str, default="checkpoints/best.pt")
    p.add_argument("--tokenizer", type=str, default="data/tokenizer.json")
    p.add_argument("--prompt", type=str, default="Once upon a time")
    p.add_argument("--max-new-tokens", type=int, default=256)
    p.add_argument("--temperature", type=float, default=0.8)
    p.add_argument("--top-k", type=int, default=200)
    p.add_argument("--num-samples", type=int, default=1)
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--device", type=str, default=None)
    args = p.parse_args()

    device = args.device or ("cuda" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(args.seed)

    ckpt = torch.load(args.ckpt, map_location=device, weights_only=False)
    cfg_dict = ckpt["model_cfg"]
    valid = set(ModelConfig.__dataclass_fields__)
    cfg = ModelConfig(**{k: v for k, v in cfg_dict.items() if k in valid})

    model = GPT(cfg).to(device).eval()
    model.load_state_dict(ckpt["model"])

    tok = Tokenizer.from_file(args.tokenizer)
    eot = tok.token_to_id("<|endoftext|>")

    ids = tok.encode(args.prompt).ids
    if not ids:
        ids = [eot]
    x = torch.tensor([ids], dtype=torch.long, device=device)

    for s in range(args.num_samples):
        out = model.generate(
            x, max_new_tokens=args.max_new_tokens,
            temperature=args.temperature, top_k=args.top_k, eos_id=eot,
        )[0].tolist()
        text = tok.decode(out)
        print(f"\n--- sample {s + 1} ---")
        print(text)


if __name__ == "__main__":
    main()
```
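
Example invocation, relying on the argparse defaults for the checkpoint and tokenizer paths: `python sample.py --prompt "The little robot" --num-samples 2`. The samples within one run differ because each `torch.multinomial` call advances the generator state; re-running with the same `--seed` reproduces them.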
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff