ProCreations committed
Commit a2ce935 · 1 Parent(s): 04582f0

Initial RLHF chat UI for intellite 100M

Files changed (6)
  1. .gitignore +4 -0
  2. README.md +57 -7
  3. app.py +338 -0
  4. config.py +60 -0
  5. model.py +162 -0
  6. requirements.txt +4 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__/
+ *.pyc
+ data.json
+ data.json.tmp
README.md CHANGED
@@ -1,13 +1,63 @@
  ---
- title: Intellite
- emoji: 💻
- colorFrom: indigo
- colorTo: red
+ title: intellite-100m
+ emoji: 💬
+ colorFrom: blue
+ colorTo: purple
  sdk: gradio
- sdk_version: 6.12.0
+ sdk_version: 5.0.0
  app_file: app.py
  pinned: false
- short_description: 'Intellite '
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # intellite-100M RLHF data collector
+
+ Serves the SFT-tuned intellite 100M model in a chat UI. Every assistant reply
+ gets 👍 / 👎 buttons; each rating appends one record to `data.json` with the
+ prompt, the response, and the binary reward — ready for RLHF / DPO training
+ on your Mac.
+
+ ## Setup
+
+ 1. Copy your SFT checkpoint to the Space root as **`best.pt`**
+    (or set `INTELLITE_CKPT=/path/to/file.pt` in the Space's settings → Variables).
+    Use `git lfs track "best.pt"` before committing the weights file.
+ 2. Push the Space. `app.py` loads the checkpoint once at startup.
+
+ ## Data format
+
+ `data.json` is a list of records, one per rating:
+
+ ```json
+ {
+   "ts": "2026-04-20T15:23:45",
+   "system": "You are a helpful, honest, and concise assistant.",
+   "prompt_messages": [
+     { "role": "user", "content": "..." },
+     { "role": "assistant", "content": "..." },
+     { "role": "user", "content": "..." }
+   ],
+   "response": "...",
+   "liked": true
+ }
+ ```
+
+ Each record is exactly `(prompt, response, reward∈{0,1})` — the shape any
+ preference/RL trainer expects. For DPO, group records by identical `prompt_messages`
+ and pair a `liked=true` response (chosen) with a `liked=false` one (rejected).
+ For REINFORCE/PPO, feed `liked` as a {−1, +1} or {0, 1} reward.
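+
+ A minimal pairing sketch (illustrative only, not shipped with this Space; it
+ assumes `data.json` sits in the working directory and uses only the fields
+ shown above):
+
+ ```python
+ import json
+ from collections import defaultdict
+
+ with open("data.json") as f:
+     records = json.load(f)
+
+ # Group ratings that share the same system prompt and conversation prefix.
+ groups = defaultdict(lambda: {"chosen": [], "rejected": []})
+ for r in records:
+     key = json.dumps([r["system"], r["prompt_messages"]], sort_keys=True)
+     groups[key]["chosen" if r["liked"] else "rejected"].append(r["response"])
+
+ # One DPO pair per prompt that has both a liked and a disliked response.
+ dpo_pairs = [
+     {"prompt": json.loads(k)[1], "chosen": g["chosen"][0], "rejected": g["rejected"][0]}
+     for k, g in groups.items()
+     if g["chosen"] and g["rejected"]
+ ]
+
+ # For REINFORCE/PPO, map the binary label to a scalar reward instead:
+ rewards = [1.0 if r["liked"] else -1.0 for r in records]
+ ```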
+
+ ## Downloading the data
+
+ The right-hand panel has an **⬇ Download data.json** button — one click on your
+ Mac and you've got every rating so far.
+
+ ## Clearing the data
+
+ The **Clear data.json** button empties the file on the Space. Do this after
+ pulling the file locally so you don't double-count records on the next export.
+
+ ## Notes on the free CPU tier
+
+ Generation on CPU is slow (~5–10 tok/s for 100M in fp32). If you move to the
+ paid GPU tier, the app auto-detects `cuda` and uses bf16 autocast — roughly
+ 10× faster.
app.py ADDED
@@ -0,0 +1,338 @@
+ """intellite 100M — RLHF data collector served as a Gradio app on Hugging Face Spaces.
+
+ Every assistant reply gets 👍 / 👎 buttons. When the user rates a reply,
+ the (system, prior messages, response, liked) tuple is appended to
+ data.json in the Space's working directory. A Download button exposes
+ that file so you can grab it on your Mac and use it for RL / DPO.
+
+ The SFT checkpoint is loaded from:
+     $INTELLITE_CKPT (if set), else ./best.pt at the Space root.
+ """
+
+ import json
+ import os
+ import sys
+ import threading
+ import time
+ import traceback
+ from pathlib import Path
+
+ import gradio as gr
+ import tiktoken
+ import torch
+
+ SPACE_DIR = Path(__file__).resolve().parent
+ sys.path.insert(0, str(SPACE_DIR))
+
+ from config import ModelConfig
+ from model import IntelliteGPT
+
+ # ------------------------------------------------------------------------
+ # Paths & constants
+
+ CKPT_PATH = Path(os.environ.get("INTELLITE_CKPT", SPACE_DIR / "best.pt"))
+ DATA_PATH = SPACE_DIR / "data.json"
+
+ DEFAULT_SYSTEM = "You are a helpful, honest, and concise assistant."
+ SYSTEM_TAG = "<|system|>\n"
+ USER_TAG = "<|user|>\n"
+ ASST_TAG = "<|assistant|>\n"
+ STOP_MARKERS = ("<|user|>", "<|system|>")
+
+
+ # ------------------------------------------------------------------------
+ # Model load (once, at startup)
+
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"[sys] device={DEVICE} ckpt={CKPT_PATH}")
+
+ if not CKPT_PATH.exists():
+     raise FileNotFoundError(
+         f"No checkpoint at {CKPT_PATH}. Upload your SFT best.pt to the Space "
+         f"root, or set the INTELLITE_CKPT environment variable to its path."
+     )
+
+ sd = torch.load(str(CKPT_PATH), map_location=DEVICE)
+ _fields = ModelConfig.__dataclass_fields__.keys()
+ MCFG = ModelConfig(**{k: v for k, v in sd["model_cfg"].items() if k in _fields})
+ MODEL = IntelliteGPT(MCFG).to(DEVICE)
+ MODEL.load_state_dict(sd["model"])
+ MODEL.eval()
+ TOKENS_SEEN = int(sd.get("tokens_seen", 0))
+ BEST_VAL = float(sd.get("best_val", float("nan")))
+
+ ENC = tiktoken.get_encoding("gpt2")
+ EOT = ENC.eot_token
+ N_PARAMS = MODEL.num_params()
+ print(f"[model] {N_PARAMS/1e6:.1f}M params tokens_seen={TOKENS_SEEN:,} best_val={BEST_VAL:.4f}")
+
+
+ # ------------------------------------------------------------------------
+ # Prompt templating + generation (mirrors chat.py)
+
+ def render_prompt_ids(system: str, prior_messages: list[dict], user_msg: str) -> list[int]:
+     """Encode the SFT chat template exactly as sft_prepare.py did."""
+     ids: list[int] = []
+     if system:
+         ids.extend(ENC.encode_ordinary(SYSTEM_TAG + system.strip() + "\n"))
+     # Pair prior messages into (user, assistant) turns.
+     pending_user = None
+     for m in prior_messages:
+         role = m.get("role")
+         content = (m.get("content") or "").strip()
+         if role == "user":
+             pending_user = content
+         elif role == "assistant" and pending_user is not None:
+             ids.extend(ENC.encode_ordinary(USER_TAG + pending_user + "\n"))
+             ids.extend(ENC.encode_ordinary(ASST_TAG))
+             ids.extend(ENC.encode_ordinary(content))
+             ids.append(EOT)
+             pending_user = None
+     # Current user turn + assistant opener.
+     ids.extend(ENC.encode_ordinary(USER_TAG + user_msg.strip() + "\n"))
+     ids.extend(ENC.encode_ordinary(ASST_TAG))
+     return ids
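+
+ # Illustrative note: with an empty history, the ids above decode back to
+ #   "<|system|>\n{system}\n<|user|>\n{user_msg}\n<|assistant|>\n"
+ # and each earlier assistant turn in a longer history is additionally
+ # terminated by the GPT-2 EOT token.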
+
+
+ @torch.no_grad()
+ def stream_reply(prompt_ids, max_new, temperature, top_k, top_p, rep_penalty):
+     """Yield the partial assistant reply after each new token."""
+     x = torch.tensor([prompt_ids], dtype=torch.long, device=DEVICE)
+     ctx = MCFG.seq_len
+     start = len(prompt_ids)
+     reply = ""
+
+     for _ in range(max_new):
+         xc = x[:, -ctx:]
+         if DEVICE == "cuda":
+             with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+                 logits, _ = MODEL(xc)
+         else:
+             logits, _ = MODEL(xc)
+         logits = logits[0, -1, :].float()
+
+         if rep_penalty and rep_penalty != 1.0:
+             seen = torch.unique(x[0])
+             prev = logits[seen]
+             logits[seen] = torch.where(prev > 0, prev / rep_penalty, prev * rep_penalty)
+
+         logits = logits / max(temperature, 1e-5)
+
+         if top_k and top_k > 0:
+             k = min(int(top_k), logits.numel())
+             v, _ = torch.topk(logits, k)
+             logits[logits < v[-1]] = -float("inf")
+
+         if top_p and 0.0 < top_p < 1.0:
+             sorted_logits, sorted_idx = torch.sort(logits, descending=True)
+             cum = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+             mask = cum > top_p
+             mask[1:] = mask[:-1].clone()  # shift right so the token crossing top_p is kept
+             mask[0] = False
+             logits[sorted_idx[mask]] = -float("inf")
+
+         probs = torch.softmax(logits, dim=-1)
+         next_tok = torch.multinomial(probs, num_samples=1)
+         tok_id = int(next_tok.item())
+         x = torch.cat([x, next_tok.unsqueeze(0)], dim=1)
+
+         if tok_id == EOT:
+             break
+
+         reply = ENC.decode(x[0, start:].tolist())
+
+         # Strip trailing replacement char (partial UTF-8) for nicer streaming.
+         while reply.endswith("\ufffd"):
+             reply = reply[:-1]
+
+         hit_stop = False
+         for marker in STOP_MARKERS:
+             idx = reply.find(marker)
+             if idx != -1:
+                 reply = reply[:idx]
+                 hit_stop = True
+                 break
+         if hit_stop:
+             break
+
+         yield reply.strip()
+
+     yield reply.strip()
+
+
+ # ------------------------------------------------------------------------
+ # Feedback store (data.json)
+
+ _feedback_lock = threading.Lock()
+
+
+ def _read_data() -> list:
+     if not DATA_PATH.exists():
+         return []
+     try:
+         with open(DATA_PATH) as f:
+             return json.load(f)
+     except Exception:
+         return []
+
+
+ def _write_data(items: list) -> None:
+     tmp = DATA_PATH.with_suffix(".json.tmp")
+     with open(tmp, "w") as f:
+         json.dump(items, f, indent=2, ensure_ascii=False)
+     tmp.replace(DATA_PATH)
+
+
+ if not DATA_PATH.exists():
+     _write_data([])
+
+
+ def _stats_str() -> str:
+     with _feedback_lock:
+         items = _read_data()
+     total = len(items)
+     liked = sum(1 for i in items if i.get("liked"))
+     return f"**{total}** records · 👍 {liked} · 👎 {total - liked}"
+
+
+ def save_feedback(evt: gr.LikeData, history: list, system: str) -> str:
+     """Handle a thumbs-up / thumbs-down click on a chat message."""
+     if evt.liked is None:
+         return "rating cleared (nothing saved)"
+
+     # evt.index is an int in messages mode; be defensive either way.
+     idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
+     if not isinstance(idx, int) or idx < 0 or idx >= len(history):
+         return f"bad index {evt.index!r}"
+
+     msg = history[idx]
+     if msg.get("role") != "assistant":
+         return "skipped non-assistant message"
+
+     record = {
+         "ts": time.strftime("%Y-%m-%dT%H:%M:%S"),
+         "system": (system or DEFAULT_SYSTEM).strip(),
+         "prompt_messages": history[:idx],
+         "response": msg.get("content", ""),
+         "liked": bool(evt.liked),
+     }
+     with _feedback_lock:
+         items = _read_data()
+         items.append(record)
+         _write_data(items)
+
+     verdict = "👍 good" if evt.liked else "👎 bad"
+     return f"saved {verdict} · {len(items)} records in data.json"
+
+
+ def clear_data() -> str:
+     with _feedback_lock:
+         _write_data([])
+     return "data.json cleared"
+
+
+ # ------------------------------------------------------------------------
+ # Chat callback
+
+ def chat(user_msg, history, system, max_new, temperature, top_k, top_p, rep_penalty):
+     """Stream a reply; yield updated chatbot history after each token."""
+     user_msg = (user_msg or "").strip()
+     if not user_msg:
+         yield history, ""
+         return
+
+     history = list(history) + [
+         {"role": "user", "content": user_msg},
+         {"role": "assistant", "content": ""},
+     ]
+     prior = history[:-2]
+
+     ids = render_prompt_ids(system or DEFAULT_SYSTEM, prior, user_msg)
+     room = MCFG.seq_len - int(max_new)
+     if len(ids) > room > 0:
+         ids = ids[-room:]
+
+     try:
+         for partial in stream_reply(ids, int(max_new), float(temperature),
+                                     int(top_k), float(top_p), float(rep_penalty)):
+             history[-1]["content"] = partial
+             yield history, ""
+     except Exception:
+         history[-1]["content"] = f"[error] {traceback.format_exc()}"
+         yield history, ""
+
+
+ # ------------------------------------------------------------------------
+ # UI
+
+ with gr.Blocks(title="intellite 100M — RLHF collector") as demo:
+     gr.Markdown(
+         f"# intellite 100M — RLHF data collector\n"
+         f"{MCFG.d_model}d × {MCFG.n_layers}L × {MCFG.n_heads}h "
+         f"({N_PARAMS/1e6:.1f}M params) · {TOKENS_SEEN/1e6:.0f}M SFT tokens · "
+         f"val_loss {BEST_VAL:.3f} · device `{DEVICE}` \n"
+         f"**Please rate every response with 👍 or 👎.** Every rating appends a record "
+         f"to `data.json`; grab it from the sidebar for RLHF on your Mac."
+     )
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             chatbot = gr.Chatbot(
+                 type="messages",
+                 height=520,
+                 show_copy_button=True,
+                 avatar_images=(None, None),
+             )
+             msg = gr.Textbox(
+                 placeholder="Your message — Enter to send",
+                 lines=2,
+                 show_label=False,
+                 autofocus=True,
+             )
+             with gr.Row():
+                 send_btn = gr.Button("Send", variant="primary")
+                 clear_btn = gr.Button("Clear chat")
+             feedback_status = gr.Markdown("_rate replies with 👍 / 👎_")
+
+         with gr.Column(scale=1):
+             system = gr.Textbox(value=DEFAULT_SYSTEM, label="System prompt", lines=3)
+             max_new = gr.Slider(16, 800, value=400, step=16, label="max new tokens")
+             temp = gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="temperature")
+             top_k = gr.Slider(0, 200, value=50, step=1, label="top-k (0 = off)")
+             top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top-p")
+             rep = gr.Slider(1.0, 1.5, value=1.1, step=0.01, label="repetition penalty")
+
+             gr.Markdown("### RLHF data")
+             stats_md = gr.Markdown(_stats_str())
+             download = gr.DownloadButton(
+                 label="⬇ Download data.json", value=str(DATA_PATH)
+             )
+             clear_data_btn = gr.Button("Clear data.json", variant="stop")
+
+     # Wire the chat events.
+     send_btn.click(
+         chat,
+         inputs=[msg, chatbot, system, max_new, temp, top_k, top_p, rep],
+         outputs=[chatbot, msg],
+     )
+     msg.submit(
+         chat,
+         inputs=[msg, chatbot, system, max_new, temp, top_k, top_p, rep],
+         outputs=[chatbot, msg],
+     )
+     clear_btn.click(lambda: [], None, chatbot, queue=False)
+
+     # Thumbs-up / thumbs-down → append to data.json, refresh counters.
+     chatbot.like(
+         save_feedback,
+         inputs=[chatbot, system],
+         outputs=[feedback_status],
+     ).then(lambda: _stats_str(), None, stats_md, queue=False)
+
+     clear_data_btn.click(clear_data, None, feedback_status, queue=False).then(
+         lambda: _stats_str(), None, stats_md, queue=False
+     )
+
+
+ if __name__ == "__main__":
+     demo.queue().launch()
config.py ADDED
@@ -0,0 +1,60 @@
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class ModelConfig:
+     vocab_size: int = 50304  # rounded-up GPT-2 vocab for better matmul shapes
+     d_model: int = 768
+     n_layers: int = 10
+     n_heads: int = 12  # head_dim = 64
+     d_ff: int = 2048  # canonical SwiGLU 8/3 * d_model
+     seq_len: int = 2048
+     dropout: float = 0.0
+     rope_theta: float = 10000.0
+     tie_embeddings: bool = True
+     norm_eps: float = 1e-5
+
+
+ @dataclass
+ class TrainConfig:
+     # Paths
+     data_dir: str = "data"
+     out_dir: str = "checkpoints"
+
+     # Model (mirrors ModelConfig so a single dataclass configures runs)
+     vocab_size: int = 50304
+     d_model: int = 768
+     n_layers: int = 10
+     n_heads: int = 12
+     d_ff: int = 2048
+     seq_len: int = 2048
+     dropout: float = 0.0
+
+     # Training budget
+     target_tokens: int = 1_000_000_000
+     # Memory at seq=2048 for ~100M params: keep microbatches small and use
+     # grad accumulation to keep effective batch = 32 × 2048 = 65_536 tok/step.
+     batch_size: int = 4
+     grad_accum_steps: int = 8
+
+     # Optimizer / schedule
+     learning_rate: float = 6e-4
+     min_lr_ratio: float = 0.1
+     warmup_tokens: int = 3_000_000
+     weight_decay: float = 0.1
+     beta1: float = 0.9
+     beta2: float = 0.95
+     grad_clip: float = 1.0
+
+     # Checkpoint / eval cadence (in tokens)
+     ckpt_every_tokens: int = 100_000_000
+     eval_every_tokens: int = 6_000_000
+     eval_batches: int = 50
+
+     # Logging
+     log_every_steps: int = 10
+
+     # System
+     device: str = "mps"
+     dtype: str = "bfloat16"
+     seed: int = 1337
model.py ADDED
@@ -0,0 +1,162 @@
+ """Small but modern decoder-only transformer (~100M params with the default config).
+
+ Uses RoPE, RMSNorm, SwiGLU FFN, tied embeddings, and PyTorch SDPA
+ for causal attention (which lights up MPS fast-paths where available).
+ """
+
+ import math
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from config import ModelConfig
+
+
+ def precompute_rope(head_dim: int, seq_len: int, theta: float = 10000.0, device=None):
+     inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
+     t = torch.arange(seq_len, device=device).float()
+     freqs = torch.outer(t, inv_freq)  # (T, head_dim/2)
+     return freqs.cos(), freqs.sin()
+
+
+ def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+     # x: (B, H, T, D); cos/sin: (T, D/2)
+     x1, x2 = x.chunk(2, dim=-1)
+     cos = cos[None, None, :, :]
+     sin = sin[None, None, :, :]
+     return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
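+
+ # Note: this is the "split-half" RoPE layout (rotating the first half of
+ # head_dim against the second), not the interleaved even/odd layout;
+ # a checkpoint is only compatible with the layout it was trained with.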
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, d: int, eps: float = 1e-5):
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(d))
+         self.eps = eps
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # Always compute the norm in fp32 for stability, then cast back.
+         dtype = x.dtype
+         x32 = x.float()
+         norm = torch.rsqrt(x32.pow(2).mean(-1, keepdim=True) + self.eps)
+         return (x32 * norm).to(dtype) * self.weight
+
+
+ class Attention(nn.Module):
+     def __init__(self, cfg: ModelConfig):
+         super().__init__()
+         assert cfg.d_model % cfg.n_heads == 0
+         self.n_heads = cfg.n_heads
+         self.head_dim = cfg.d_model // cfg.n_heads
+         self.qkv = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
+         self.o = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+         self.dropout = cfg.dropout
+
+     def forward(self, x, cos, sin):
+         B, T, C = x.shape
+         q, k, v = self.qkv(x).chunk(3, dim=-1)
+         q = q.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
+         k = k.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
+         v = v.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
+         q = apply_rope(q, cos[:T], sin[:T])
+         k = apply_rope(k, cos[:T], sin[:T])
+         y = F.scaled_dot_product_attention(
+             q, k, v,
+             is_causal=True,
+             dropout_p=self.dropout if self.training else 0.0,
+         )
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+         return self.o(y)
+
+
+ class SwiGLU(nn.Module):
+     def __init__(self, cfg: ModelConfig):
+         super().__init__()
+         self.w1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)  # gate
+         self.w2 = nn.Linear(cfg.d_ff, cfg.d_model, bias=False)  # down
+         self.w3 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)  # up
+
+     def forward(self, x):
+         return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+
+ class Block(nn.Module):
+     def __init__(self, cfg: ModelConfig):
+         super().__init__()
+         self.attn_norm = RMSNorm(cfg.d_model, cfg.norm_eps)
+         self.attn = Attention(cfg)
+         self.ffn_norm = RMSNorm(cfg.d_model, cfg.norm_eps)
+         self.ffn = SwiGLU(cfg)
+
+     def forward(self, x, cos, sin):
+         x = x + self.attn(self.attn_norm(x), cos, sin)
+         x = x + self.ffn(self.ffn_norm(x))
+         return x
+
+
+ class IntelliteGPT(nn.Module):
+     def __init__(self, cfg: ModelConfig):
+         super().__init__()
+         self.cfg = cfg
+         self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
+         self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layers)])
+         self.norm = RMSNorm(cfg.d_model, cfg.norm_eps)
+         self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+         if cfg.tie_embeddings:
+             self.lm_head.weight = self.tok_emb.weight
+
+         cos, sin = precompute_rope(cfg.d_model // cfg.n_heads, cfg.seq_len, cfg.rope_theta)
+         self.register_buffer("cos", cos, persistent=False)
+         self.register_buffer("sin", sin, persistent=False)
+
+         self.apply(self._init_weights)
+         # GPT-2 style: scale residual projections by 1/sqrt(2*n_layers)
+         scale = 0.02 / math.sqrt(2 * cfg.n_layers)
+         for n, p in self.named_parameters():
+             if n.endswith("attn.o.weight") or n.endswith("ffn.w2.weight"):
+                 nn.init.normal_(p, mean=0.0, std=scale)
+
+     @staticmethod
+     def _init_weights(m):
+         if isinstance(m, nn.Linear):
+             nn.init.normal_(m.weight, mean=0.0, std=0.02)
+             if m.bias is not None:
+                 nn.init.zeros_(m.bias)
+         elif isinstance(m, nn.Embedding):
+             nn.init.normal_(m.weight, mean=0.0, std=0.02)
+
+     def num_params(self, exclude_embedding: bool = False) -> int:
+         n = sum(p.numel() for p in self.parameters())
+         if exclude_embedding:
+             n -= self.tok_emb.weight.numel()
+         return n
+
+     def forward(self, idx: torch.Tensor, targets: torch.Tensor | None = None):
+         B, T = idx.shape
+         x = self.tok_emb(idx)
+         cos, sin = self.cos, self.sin
+         for block in self.blocks:
+             x = block(x, cos, sin)
+         x = self.norm(x)
+         logits = self.lm_head(x)
+         loss = None
+         if targets is not None:
+             loss = F.cross_entropy(
+                 logits.view(-1, logits.size(-1)).float(),
+                 targets.view(-1),
+                 ignore_index=-1,
+             )
+         return logits, loss
+
+     @torch.no_grad()
+     def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+         for _ in range(max_new_tokens):
+             idx_cond = idx[:, -self.cfg.seq_len:]
+             logits, _ = self(idx_cond)
+             logits = logits[:, -1, :] / max(temperature, 1e-5)
+             if top_k is not None:
+                 v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                 logits[logits < v[:, [-1]]] = -float("inf")
+             probs = F.softmax(logits, dim=-1)
+             next_tok = torch.multinomial(probs, num_samples=1)
+             idx = torch.cat([idx, next_tok], dim=1)
+         return idx
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio>=5.0.0
+ torch>=2.1.0
+ tiktoken
+ numpy