Spaces:
Sleeping
Sleeping
Initial deploy: classifier + FastAPI router
Browse files- .gitignore +14 -0
- Dockerfile +28 -0
- README.md +29 -5
- app.py +164 -0
- greenrouting/__init__.py +1 -0
- greenrouting/classifier/__init__.py +0 -0
- greenrouting/classifier/calibration.py +41 -0
- greenrouting/classifier/infer.py +205 -0
- greenrouting/classifier/model.py +94 -0
- greenrouting/classifier/ood.py +90 -0
- greenrouting/classifier/train.py +269 -0
- greenrouting/classifier/trained_predictor.py +129 -0
- greenrouting/data/__init__.py +0 -0
- greenrouting/data/builder.py +260 -0
- greenrouting/data/capability_labeler.py +237 -0
- greenrouting/data/cascade.py +292 -0
- greenrouting/data/graders.py +158 -0
- greenrouting/data/schema.py +80 -0
- greenrouting/data/seed_dataset.py +545 -0
- greenrouting/data/sources.py +343 -0
- greenrouting/demo/__init__.py +0 -0
- greenrouting/demo/app.py +215 -0
- greenrouting/energy/__init__.py +0 -0
- greenrouting/energy/estimator.py +19 -0
- greenrouting/routing/__init__.py +0 -0
- greenrouting/routing/decision.py +191 -0
- greenrouting/routing/registry.py +440 -0
- greenrouting/routing/scorer.py +93 -0
- mapper.py +175 -0
- models/classifier_v1/calibration.json +3 -0
- models/classifier_v1/encoder_name.txt +1 -0
- models/classifier_v1/head.pt +3 -0
- models/classifier_v1/metadata.json +21 -0
- models/classifier_v1/ood_stats.npz +3 -0
- models/classifier_v1/training_history.json +182 -0
- partner_registry.py +115 -0
- requirements.txt +6 -0
.gitignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.py[codz]
|
| 3 |
+
*.egg-info/
|
| 4 |
+
.venv/
|
| 5 |
+
venv/
|
| 6 |
+
.env
|
| 7 |
+
|
| 8 |
+
# Partner config: never commit
|
| 9 |
+
data/partner_registry.json
|
| 10 |
+
data/*.json
|
| 11 |
+
|
| 12 |
+
# IDE
|
| 13 |
+
.idea/
|
| 14 |
+
.vscode/
|
Dockerfile
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

# Caches live under /tmp so the image runs with a read-only root filesystem
# (HF Spaces) and an arbitrary non-root UID.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    HF_HOME=/tmp/hf_cache \
    TRANSFORMERS_CACHE=/tmp/hf_cache/transformers \
    SENTENCE_TRANSFORMERS_HOME=/tmp/hf_cache/sentence-transformers

WORKDIR /app

# build-essential: some requirements compile native extensions at install time.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
# CPU-only torch wheel is installed first so requirements.txt doesn't pull the
# much larger CUDA build.
RUN pip install --upgrade pip && \
    pip install torch --index-url https://download.pytorch.org/whl/cpu && \
    pip install -r requirements.txt

# Code and model artifacts are copied after dependency install to keep the
# pip layer cached across code-only rebuilds.
COPY greenrouting /app/greenrouting
COPY models /app/models
COPY partner_registry.py mapper.py app.py /app/

# World-writable cache dir: the Spaces runtime user is not root.
RUN mkdir -p /tmp/hf_cache && chmod -R 777 /tmp/hf_cache

EXPOSE 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,34 @@
|
|
| 1 |
---
|
| 2 |
-
title: Router
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Router Classify API
|
| 3 |
+
emoji: 🛰️
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
license: other
|
| 9 |
+
license_name: polyform-noncommercial-1.0.0
|
| 10 |
+
license_link: https://polyformproject.org/licenses/noncommercial/1.0.0
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# Router Classify API
|
| 14 |
+
|
| 15 |
+
REST endpoint that runs the GreenRouting classifier and returns a routing decision against a configurable downstream model registry.
|
| 16 |
+
|
| 17 |
+
## Endpoints
|
| 18 |
+
|
| 19 |
+
- `POST /classify` — Run the classifier and pick a model.
|
| 20 |
+
- Request: `{ "message": "...", "recentMessages": [{"role": "...", "content": "..."}] }`
|
| 21 |
+
- Response: `{ "category", "complexity", "model_id", "capability_weights", "difficulty", "energy_savings_pct", "method", "reason" }`
|
| 22 |
+
- `GET /health` — Liveness probe.
|
| 23 |
+
|
| 24 |
+
## Configuration
|
| 25 |
+
|
| 26 |
+
The registry of candidate models is supplied at runtime via a Space secret. Set one of:
|
| 27 |
+
|
| 28 |
+
- `PARTNER_REGISTRY_JSON` — the registry as raw JSON (preferred)
|
| 29 |
+
- `PARTNER_REGISTRY_PATH` — a file path inside the container
|
| 30 |
+
|
| 31 |
+
Other env vars:
|
| 32 |
+
|
| 33 |
+
- `CLASSIFIER_ARTIFACT_DIR` — defaults to `models/classifier_v1`
|
| 34 |
+
- `INCLUDE_REASON` — `1` (default) to include the `reason` string in responses, `0` to omit
|
app.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI service that wraps the GreenRouting classifier behind the partner-
|
| 2 |
+
specific response schema.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
POST /classify - classify a query and pick a model from the partner registry
|
| 6 |
+
GET /health - liveness probe used by the partner edge function
|
| 7 |
+
|
| 8 |
+
Auth: none. Stateless. CORS open. Single-process. Designed for a HF Spaces
|
| 9 |
+
Docker deployment with periodic /health pings keeping the container warm.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import logging
|
| 15 |
+
import os
|
| 16 |
+
import time
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from typing import Optional
|
| 19 |
+
|
| 20 |
+
from fastapi import FastAPI, HTTPException
|
| 21 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 22 |
+
from pydantic import BaseModel, Field
|
| 23 |
+
|
| 24 |
+
from greenrouting.classifier.trained_predictor import TrainedPredictor
|
| 25 |
+
|
| 26 |
+
from mapper import (
|
| 27 |
+
build_reason,
|
| 28 |
+
fold_recent_context,
|
| 29 |
+
energy_savings_pct,
|
| 30 |
+
pick_category,
|
| 31 |
+
pick_complexity,
|
| 32 |
+
pick_difficulty_int,
|
| 33 |
+
rebucket_capabilities,
|
| 34 |
+
select_model,
|
| 35 |
+
)
|
| 36 |
+
from partner_registry import load_registry
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
logger = logging.getLogger("router-api")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")


# Directory containing the trained classifier artifacts (head.pt, calibration.json, ...).
ARTIFACT_DIR = os.environ.get("CLASSIFIER_ARTIFACT_DIR", "models/classifier_v1")
# Any value other than "0"/"false"/"False" keeps the human-readable reason in responses.
INCLUDE_REASON = os.environ.get("INCLUDE_REASON", "1") not in ("0", "false", "False")

app = FastAPI(title="GreenRouting Partner Router", version="0.1.0")
# CORS is deliberately wide open: the service is unauthenticated and stateless
# (see module docstring); credentials are disabled accordingly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
    expose_headers=["*"],
    max_age=3600,
)


# Lazily-initialized singletons, populated by _ensure_loaded().
_predictor: Optional[TrainedPredictor] = None
_registry = None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class RecentMessage(BaseModel):
    """One prior chat turn supplied by the caller for context folding."""

    # presumably "user" / "assistant"; not validated here — TODO confirm allowed values
    role: str
    content: str
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class ClassifyRequest(BaseModel):
    """Request body for POST /classify."""

    # Current user message to classify; length-bounded to keep inference cheap.
    message: str = Field(min_length=1, max_length=8000)
    # Optional rolling window of prior turns, folded into the classifier input.
    recentMessages: Optional[list[RecentMessage]] = None
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class ClassifyResponse(BaseModel):
    """Routing decision returned to the partner edge function."""

    category: str
    complexity: str
    # Identifier of the model chosen from the partner registry.
    model_id: str
    capability_weights: dict[str, float]
    difficulty: int
    # None when the query is OOD or the pick escalated (savings not meaningful then).
    energy_savings_pct: Optional[float] = None
    # Always "greenrouting" in this service (set in classify()).
    method: str
    # Omitted (None) when INCLUDE_REASON is disabled.
    reason: Optional[str] = None
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _ensure_loaded() -> None:
    """Load the classifier and partner registry exactly once (lazy singletons).

    Components are built in locals and published to the module globals only
    after they fully initialize. The original assigned ``_predictor`` before
    the warm-up call, so a warm-up failure left a half-ready predictor that
    every subsequent request (and /health) would silently reuse.

    Raises:
        RuntimeError: when the trained classifier artifacts are missing.
    """
    global _predictor, _registry
    if _predictor is None:
        artifact_path = Path(ARTIFACT_DIR)
        if not (artifact_path / "head.pt").exists():
            raise RuntimeError(f"trained classifier not found at {artifact_path}")
        predictor = TrainedPredictor(artifact_path)
        # Warm up before publishing: the first predict pays the model-load cost.
        predictor.predict("warm up")
        _predictor = predictor
        logger.info("classifier loaded and warmed")
    if _registry is None:
        registry = load_registry()
        logger.info("partner registry loaded with %d models", len(registry))
        _registry = registry
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
@app.on_event("startup")
def _startup() -> None:
    """Eagerly warm the classifier at boot.

    Failure here is non-fatal by design: loading is retried lazily on the
    first real request, so a transient startup hiccup doesn't kill the app.
    """
    try:
        _ensure_loaded()
    except Exception as err:
        logger.warning("startup warm load failed: %s (will retry on first request)", err)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@app.get("/health")
|
| 106 |
+
def health() -> dict:
|
| 107 |
+
try:
|
| 108 |
+
_ensure_loaded()
|
| 109 |
+
return {"status": "ok"}
|
| 110 |
+
except Exception as exc:
|
| 111 |
+
logger.exception("health check failed")
|
| 112 |
+
raise HTTPException(status_code=503, detail=f"unhealthy: {exc}")
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
@app.post("/classify", response_model=ClassifyResponse)
|
| 116 |
+
def classify(req: ClassifyRequest) -> ClassifyResponse:
|
| 117 |
+
_ensure_loaded()
|
| 118 |
+
started = time.time()
|
| 119 |
+
|
| 120 |
+
folded = fold_recent_context(
|
| 121 |
+
req.message,
|
| 122 |
+
[m.dict() for m in req.recentMessages] if req.recentMessages else None,
|
| 123 |
+
)
|
| 124 |
+
profile = _predictor.predict(folded)
|
| 125 |
+
|
| 126 |
+
weights = rebucket_capabilities(profile)
|
| 127 |
+
category = pick_category(weights)
|
| 128 |
+
complexity = pick_complexity(profile)
|
| 129 |
+
difficulty = pick_difficulty_int(profile)
|
| 130 |
+
|
| 131 |
+
chosen, escalated = select_model(_registry, weights, difficulty, is_ood=profile.is_ood)
|
| 132 |
+
savings: Optional[float]
|
| 133 |
+
if profile.is_ood or escalated:
|
| 134 |
+
savings = None
|
| 135 |
+
else:
|
| 136 |
+
savings = round(energy_savings_pct(chosen), 1)
|
| 137 |
+
reason = (
|
| 138 |
+
build_reason(weights, complexity, chosen, escalated, is_ood=profile.is_ood)
|
| 139 |
+
if INCLUDE_REASON
|
| 140 |
+
else None
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
elapsed_ms = (time.time() - started) * 1000.0
|
| 144 |
+
logger.info(
|
| 145 |
+
"classify model=%s tier=%s difficulty=%d category=%s ood=%s escalated=%s elapsed_ms=%.1f",
|
| 146 |
+
chosen.id, chosen.tier, difficulty, category, profile.is_ood, escalated, elapsed_ms,
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
return ClassifyResponse(
|
| 150 |
+
category=category,
|
| 151 |
+
complexity=complexity,
|
| 152 |
+
model_id=chosen.id,
|
| 153 |
+
capability_weights=weights,
|
| 154 |
+
difficulty=difficulty,
|
| 155 |
+
energy_savings_pct=savings,
|
| 156 |
+
method="greenrouting",
|
| 157 |
+
reason=reason,
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
if __name__ == "__main__":
    # Local development entry point; in the container uvicorn is launched by
    # the Dockerfile CMD instead.
    import uvicorn
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run("app:app", host="0.0.0.0", port=port, log_level="info")
|
greenrouting/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__version__ = "0.1.0"
|
greenrouting/classifier/__init__.py
ADDED
|
File without changes
|
greenrouting/classifier/calibration.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Temperature scaling for the multi-label capability head.
|
| 2 |
+
|
| 3 |
+
Fits a single positive scalar T such that BCE(logits / T, targets) is minimized
|
| 4 |
+
on a held-out set. T < 1 sharpens, T > 1 softens.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import math
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def fit_temperature(val_logits, val_targets, max_iter: int = 200) -> float:
    """Fit a temperature scalar minimizing BCE on held-out multi-label logits.

    Optimizes log(T) with LBFGS so the recovered T is positive by construction.

    Args:
        val_logits: array-like of raw logits, shape (n, n_labels).
        val_targets: array-like of binary targets, same shape.
        max_iter: LBFGS iteration budget.

    Returns:
        Fitted temperature T > 0, or 1.0 when the validation set is empty.
    """
    import numpy as np
    import torch
    import torch.nn as nn

    # Coerce first: the previous `.size` attribute access crashed on plain
    # Python lists even though the contract is array-like.
    logits_np = np.asarray(val_logits, dtype=np.float32)
    targets_np = np.asarray(val_targets, dtype=np.float32)
    if logits_np.size == 0:
        return 1.0

    logits = torch.tensor(logits_np, dtype=torch.float32)
    targets = torch.tensor(targets_np, dtype=torch.float32)
    # Parameterize T = exp(log_t): unconstrained optimization, positive result.
    log_t = torch.zeros((), dtype=torch.float32, requires_grad=True)
    optimizer = torch.optim.LBFGS([log_t], lr=0.1, max_iter=max_iter)
    bce = nn.BCEWithLogitsLoss()

    def closure():
        # LBFGS requires a closure that re-evaluates loss + gradients.
        optimizer.zero_grad()
        loss = bce(logits / log_t.exp(), targets)
        loss.backward()
        return loss

    optimizer.step(closure)
    return float(math.exp(log_t.item()))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def apply_temperature(logits, temperature: float):
    """Scale logits by a fitted temperature; non-positive T is a no-op."""
    import numpy as np

    scaled = np.asarray(logits)
    if temperature > 0:
        scaled = scaled / temperature
    return scaled
|
greenrouting/classifier/infer.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Inference-side types and predictors. The trained predictor lives in a sibling
|
| 2 |
+
module; this file defines the contract the router consumes."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
import re
|
| 8 |
+
from dataclasses import dataclass, asdict, field
|
| 9 |
+
from typing import Protocol
|
| 10 |
+
|
| 11 |
+
from greenrouting.routing.registry import CAPABILITY_KEYS
|
| 12 |
+
|
| 13 |
+
LENGTH_BUCKETS: tuple[str, str, str] = ("short", "medium", "long")
|
| 14 |
+
LENGTH_TOKEN_TARGETS: dict[str, int] = {"short": 60, "medium": 220, "long": 700}
|
| 15 |
+
LENGTH_P90_MULTIPLIER: float = 1.6
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@dataclass
class CapabilityProfile:
    """Per-capability relevance scores (0..1) for a single query."""

    code: float = 0.0
    math: float = 0.0
    reasoning: float = 0.0
    knowledge: float = 0.0
    instruction: float = 0.0
    creative: float = 0.0
    multilingual: float = 0.0
    simple_chat: float = 0.0

    def as_dict(self) -> dict[str, float]:
        """Return the scores as a plain capability -> score mapping."""
        return asdict(self)

    def top(self, k: int = 3) -> list[tuple[str, float]]:
        """Return up to *k* strongest capabilities, strongest first.

        Entries at or below the 0.05 noise floor are dropped.
        """
        ranked = sorted(self.as_dict().items(), key=lambda pair: pair[1], reverse=True)
        return [(name, score) for name, score in ranked[:k] if score > 0.05]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@dataclass
class QueryProfile:
    """Full classifier output for one query, consumed by the router."""

    capabilities: CapabilityProfile
    # Natural log of the estimated parameter count needed to serve the query.
    difficulty_log_params: float
    # Probability mass over the length buckets ("short"/"medium"/"long").
    length_dist: dict[str, float]
    expected_input_tokens: int
    expected_output_tokens_p50: int
    expected_output_tokens_p90: int
    confidence: float
    is_ood: bool = False
    raw_query: str = ""
    debug: dict = field(default_factory=dict)

    @property
    def difficulty_params_b(self) -> float:
        # Convenience view: difficulty expressed in billions of parameters.
        return math.exp(self.difficulty_log_params) / 1e9
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class Predictor(Protocol):
    """Structural interface the router consumes: query text in, QueryProfile out."""

    def predict(self, query: str) -> QueryProfile: ...
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _tokens_from_text(text: str) -> int:
|
| 60 |
+
words = max(1, len(text.split()))
|
| 61 |
+
return int(words * 1.3) + 4
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _length_dist_for_target(bucket: str) -> dict[str, float]:
|
| 65 |
+
if bucket == "short":
|
| 66 |
+
return {"short": 0.75, "medium": 0.20, "long": 0.05}
|
| 67 |
+
if bucket == "medium":
|
| 68 |
+
return {"short": 0.15, "medium": 0.65, "long": 0.20}
|
| 69 |
+
return {"short": 0.05, "medium": 0.25, "long": 0.70}
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _expected_output_tokens(length_dist: dict[str, float]) -> tuple[int, int]:
    """Expected p50/p90 output-token counts for a length distribution."""
    expectation = 0.0
    for bucket in LENGTH_BUCKETS:
        expectation += length_dist[bucket] * LENGTH_TOKEN_TARGETS[bucket]
    # p90: widen by a fixed multiplier, plus extra headroom proportional to
    # the mass sitting on the "long" bucket.
    tail = length_dist.get("long", 0.0) * LENGTH_TOKEN_TARGETS["long"] * 0.3
    p90 = expectation * LENGTH_P90_MULTIPLIER + tail
    return int(round(expectation)), int(round(p90))
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
_KEYWORD_RULES: dict[str, tuple[float, list[str]]] = {
|
| 80 |
+
"code": (0.85, [
|
| 81 |
+
r"\b(code|function|class|def |algorithm|implement|debug|compile|stack trace|api|sdk)\b",
|
| 82 |
+
r"\b(python|javascript|typescript|rust|go|c\+\+|java|sql|html|css|react|kotlin|swift)\b",
|
| 83 |
+
r"\b(refactor|unit test|regex|linter)\b",
|
| 84 |
+
]),
|
| 85 |
+
"math": (0.80, [
|
| 86 |
+
r"\b(calculate|compute|solve|equation|integral|derivative|matrix|vector|probability|theorem)\b",
|
| 87 |
+
r"\b(sum|product|mean|median|variance|standard deviation|percentage)\b",
|
| 88 |
+
r"\d+\s*[+\-*/×÷=]\s*\d+",
|
| 89 |
+
]),
|
| 90 |
+
"reasoning": (0.70, [
|
| 91 |
+
r"\b(why|how does|explain|reason|because|therefore|thus|argue|justify|implication)\b",
|
| 92 |
+
r"\b(compare|contrast|analyze|evaluate|trade-?off|implication)\b",
|
| 93 |
+
]),
|
| 94 |
+
"knowledge": (0.65, [
|
| 95 |
+
r"\b(who|what is|when did|where is|history|definition|capital|population|founded)\b",
|
| 96 |
+
]),
|
| 97 |
+
"instruction": (0.60, [
|
| 98 |
+
r"\b(write|draft|create|generate|produce|format|list|outline|step.?by.?step)\b",
|
| 99 |
+
]),
|
| 100 |
+
"creative": (0.75, [
|
| 101 |
+
r"\b(story|poem|novel|character|plot|scene|metaphor|fictional)\b",
|
| 102 |
+
r"\b(write a (?:short )?(?:story|poem|haiku|song|essay))\b",
|
| 103 |
+
]),
|
| 104 |
+
"multilingual": (0.85, [
|
| 105 |
+
r"\b(translate|translation|en español|en français|auf deutsch|на русском|中文|日本語|한국어)\b",
|
| 106 |
+
r"[Ѐ-ӿ一-鿿-ゟ゠-ヿ]",
|
| 107 |
+
]),
|
| 108 |
+
"simple_chat": (0.70, [
|
| 109 |
+
r"^\s*(hi|hello|hey|thanks|thank you|good morning|good evening|sup|yo)\b",
|
| 110 |
+
r"^\s*\S{1,40}\?\s*$",
|
| 111 |
+
]),
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
class MockPredictor:
    """Heuristic predictor used to drive the demo before a trained checkpoint exists.

    The interface and output shape match the trained predictor that replaces it later.
    """

    def __init__(self, default_difficulty_log_params: float = math.log(8e9)) -> None:
        # NOTE(review): stored but never read by any method in this class —
        # _difficulty() uses its own math.log(7e9) base; confirm intent.
        self.default_difficulty = default_difficulty_log_params

    def predict(self, query: str) -> QueryProfile:
        """Score the query against keyword rules and assemble a QueryProfile."""
        q = query.strip()
        scores = {k: 0.0 for k in CAPABILITY_KEYS}
        # Each matching pattern raises the capability to at least its rule weight.
        for cap, (weight, patterns) in _KEYWORD_RULES.items():
            for pat in patterns:
                if re.search(pat, q, flags=re.IGNORECASE | re.MULTILINE):
                    scores[cap] = max(scores[cap], weight)

        # No rule fired: fall back to a low-confidence chat/instruction mix.
        if not any(v > 0 for v in scores.values()):
            scores["simple_chat"] = 0.55
            scores["instruction"] = 0.30

        length_bucket = self._length_bucket(q, scores)
        length_dist = _length_dist_for_target(length_bucket)
        difficulty = self._difficulty(q, scores)
        # Confidence is simply the strongest capability score.
        confidence = max(scores.values())
        in_tokens = _tokens_from_text(q)
        out_p50, out_p90 = _expected_output_tokens(length_dist)
        is_ood = self._ood(q)

        return QueryProfile(
            capabilities=CapabilityProfile(**scores),
            difficulty_log_params=difficulty,
            length_dist=length_dist,
            expected_input_tokens=in_tokens,
            expected_output_tokens_p50=out_p50,
            expected_output_tokens_p90=out_p90,
            confidence=confidence,
            is_ood=is_ood,
            raw_query=q,
            debug={"source": "mock", "length_bucket": length_bucket},
        )

    @staticmethod
    def _length_bucket(query: str, scores: dict[str, float]) -> str:
        """Pick an expected-response length bucket; capability signals win over raw length."""
        if scores.get("simple_chat", 0) > 0.5:
            return "short"
        if scores.get("creative", 0) > 0.5 or scores.get("code", 0) > 0.5:
            return "long"
        if len(query) < 80:
            return "short"
        if len(query) < 240:
            return "medium"
        return "long"

    @staticmethod
    def _difficulty(query: str, scores: dict[str, float]) -> float:
        """Log-parameter difficulty: a ~7B base bumped up/down in log space."""
        base = math.log(7e9)
        bumps = 0.0
        # Multiplicative bumps (added in log space) for hard-looking signals.
        if scores.get("math", 0) > 0.5 and re.search(r"\b(prove|theorem|integral|differential)\b", query, re.IGNORECASE):
            bumps += math.log(10)
        if scores.get("reasoning", 0) > 0.5 and len(query) > 200:
            bumps += math.log(5)
        if scores.get("code", 0) > 0.5 and re.search(r"\b(distributed|concurrency|kernel|cuda|optimize)\b", query, re.IGNORECASE):
            bumps += math.log(8)
        if scores.get("simple_chat", 0) > 0.5:
            bumps -= math.log(3)
        return base + bumps

    @staticmethod
    def _ood(query: str) -> bool:
        """Cheap gibberish/garbage detector for the mock path."""
        q = query.strip()
        if len(q) < 2:
            return True
        # Mostly non-letter alphanumerics (e.g. digit soup) looks OOD.
        alnum = sum(c.isalnum() for c in q)
        if alnum and (sum(c.isalpha() for c in q) / max(alnum, 1)) < 0.3:
            return True
        # Pure punctuation/digits/underscores.
        if re.fullmatch(r"[\W\d_]+", q):
            return True
        words = re.findall(r"[A-Za-z]{4,}", q)
        if words:
            gibberish = 0
            for w in words:
                # Long consonant runs are a strong keyboard-mash signal.
                longest_run = max(
                    (len(m.group()) for m in re.finditer(r"[^aeiouyAEIOUY]+", w)),
                    default=0,
                )
                if longest_run >= 5:
                    gibberish += 1
            if gibberish / len(words) >= 0.5:
                return True
        return False
|
greenrouting/classifier/model.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Frozen sentence encoder + three task heads (capability, difficulty, length)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
|
| 7 |
+
DEFAULT_ENCODER = "BAAI/bge-small-en-v1.5"
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class ModelSpec:
    """Hyperparameters tying the frozen encoder to its trainable heads."""

    encoder_name: str = DEFAULT_ENCODER  # HF id of the frozen sentence encoder
    embedding_dim: int = 384  # must match the encoder's hidden size
    hidden_dim: int = 256  # width of the shared trunk in HeadStack
    n_capabilities: int = 8
    n_length_buckets: int = 3
    dropout: float = 0.1
    max_seq_len: int = 256  # tokenizer truncation length
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def build_head(spec: ModelSpec):
    """Construct the trainable head stack for the frozen-encoder classifier.

    Returns an ``nn.Module`` mapping (batch, embedding_dim) embeddings to a
    dict with capability logits, a per-example difficulty scalar, and length
    bucket logits. torch is imported lazily to keep module import cheap.
    """
    import torch.nn as nn

    class HeadStack(nn.Module):
        def __init__(self, config: ModelSpec):
            super().__init__()
            trunk_layers = [
                nn.Linear(config.embedding_dim, config.hidden_dim),
                nn.GELU(),
                nn.Dropout(config.dropout),
                nn.Linear(config.hidden_dim, config.hidden_dim),
                nn.GELU(),
                nn.Dropout(config.dropout),
            ]
            self.shared = nn.Sequential(*trunk_layers)
            self.cap_head = nn.Linear(config.hidden_dim, config.n_capabilities)
            self.diff_head = nn.Linear(config.hidden_dim, 1)
            self.len_head = nn.Linear(config.hidden_dim, config.n_length_buckets)

        def forward(self, embeddings):
            hidden = self.shared(embeddings)
            return {
                "cap_logits": self.cap_head(hidden),
                "diff": self.diff_head(hidden).squeeze(-1),
                "len_logits": self.len_head(hidden),
            }

    return HeadStack(spec)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class Encoder:
    """Lazy wrapper around a HuggingFace sentence encoder, mean-pooled and L2-normalized."""

    def __init__(self, encoder_name: str = DEFAULT_ENCODER, max_seq_len: int = 256):
        self.encoder_name = encoder_name
        self.max_seq_len = max_seq_len
        # Loaded on first use so importing this module stays cheap.
        self._tokenizer = None
        self._model = None
        self._device = None

    def _ensure_loaded(self):
        # Idempotent: the first call downloads/loads the model; later calls no-op.
        if self._model is not None:
            return
        import torch
        from transformers import AutoModel, AutoTokenizer

        self._tokenizer = AutoTokenizer.from_pretrained(self.encoder_name)
        self._model = AutoModel.from_pretrained(self.encoder_name)
        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        self._model.to(self._device).eval()
        # The encoder is frozen — only the task heads are trained.
        for p in self._model.parameters():
            p.requires_grad = False

    @property
    def device(self) -> str:
        # "cuda" or "cpu"; accessing this forces the lazy load.
        self._ensure_loaded()
        return self._device

    def embed(self, texts: list[str]):
        """Return L2-normalized, mean-pooled embeddings for *texts*."""
        import torch
        import torch.nn.functional as F

        self._ensure_loaded()
        enc = self._tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_seq_len,
            return_tensors="pt",
        ).to(self._device)
        with torch.no_grad():
            out = self._model(**enc)
        # Mean-pool over non-padding tokens only; clamp avoids divide-by-zero
        # for an (unlikely) all-padding row.
        mask = enc["attention_mask"].unsqueeze(-1).float()
        pooled = (out.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        return F.normalize(pooled, dim=-1)
|
greenrouting/classifier/ood.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OOD detection on L2-normalized encoder embeddings.
|
| 2 |
+
|
| 3 |
+
Uses centroid cosine distance + k-nearest-neighbor distance. Both are robust
|
| 4 |
+
when the number of training examples is smaller than the embedding dimension,
|
| 5 |
+
which is typical for our seed-scale datasets.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def fit_ood_stats(train_embeddings, k: int = 5):
    """Build reference statistics for OOD scoring from training embeddings.

    Returns a dict with a unit-norm centroid, the row-normalized reference
    matrix, and the default neighbor count ``k``.
    """
    import numpy as np

    embeddings = np.asarray(train_embeddings, dtype=np.float32)
    if embeddings.size == 0:
        # No data: zero centroid (default 384-dim when shape is unknown).
        dim = embeddings.shape[1] if embeddings.ndim == 2 else 384
        return {"centroid": np.zeros((dim,), dtype=np.float32),
                "reference": embeddings,
                "k": k}

    row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0  # keep zero rows as zeros, not NaN
    unit_rows = embeddings / row_norms

    centroid = unit_rows.mean(axis=0)
    scale = float(np.linalg.norm(centroid)) or 1.0
    centroid = centroid / scale

    return {"centroid": centroid.astype(np.float32),
            "reference": unit_rows.astype(np.float32),
            "k": k}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _cosine_distance(a, b):
|
| 35 |
+
import numpy as np
|
| 36 |
+
a = np.asarray(a, dtype=np.float32)
|
| 37 |
+
b = np.asarray(b, dtype=np.float32)
|
| 38 |
+
na = np.linalg.norm(a)
|
| 39 |
+
nb = np.linalg.norm(b)
|
| 40 |
+
if na == 0 or nb == 0:
|
| 41 |
+
return 1.0
|
| 42 |
+
return 1.0 - float(np.dot(a, b) / (na * nb))
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def centroid_distance(embedding, stats) -> float:
    """Cosine distance from *embedding* to the training centroid in *stats*."""
    return _cosine_distance(embedding, stats["centroid"])
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def knn_distance(embedding, stats, k: int | None = None) -> float:
    """Mean cosine distance from *embedding* to its k nearest reference rows.

    Args:
        embedding: query embedding vector.
        stats: dict from ``fit_ood_stats`` with unit-norm "reference" rows
            and the default neighbor count "k".
        k: optional override; falls back to ``stats["k"]`` (default 5).

    Returns:
        Mean distance; 1.0 for an empty reference set or a zero embedding.
    """
    import numpy as np
    ref = stats["reference"]
    if ref.size == 0:
        return 1.0
    # `is None`, not `or`: the annotation was `k: int = None` and the old
    # `k or stats.get(...)` silently discarded any falsy explicit override.
    if k is None:
        k = stats.get("k", 5)
    emb = np.asarray(embedding, dtype=np.float32)
    norm = np.linalg.norm(emb)
    if norm == 0:
        return 1.0
    emb = emb / norm
    # Reference rows are unit-norm, so a dot product is cosine similarity.
    distances = np.sort(1.0 - (ref @ emb))
    return float(distances[: max(1, min(k, len(distances)))].mean())
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def calibrate_thresholds(
    train_embeddings,
    stats,
    percentile: float = 99.9,
    safety_multiplier: float = 1.25,
) -> dict:
    """Derive OOD decision thresholds from in-distribution distances.

    Threshold = percentile x safety_multiplier. The multiplier gives headroom
    for natural rephrasings that aren't truly OOD.
    """
    import numpy as np

    centroid_dists = [centroid_distance(e, stats) for e in train_embeddings]
    knn_dists = [knn_distance(e, stats) for e in train_embeddings]

    def _threshold(dists):
        # Empty training set: fall back to a permissive base of 1.0.
        base = float(np.percentile(dists, percentile)) if dists else 1.0
        return base * safety_multiplier

    return {
        "centroid_threshold": _threshold(centroid_dists),
        "knn_threshold": _threshold(knn_dists),
    }
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def is_ood(embedding, stats, thresholds) -> bool:
    """Either signal is sufficient to flag OOD. AND semantics let too many
    obvious cases slip when one signal happens to look in-distribution."""
    if centroid_distance(embedding, stats) > thresholds["centroid_threshold"]:
        return True
    return knn_distance(embedding, stats) > thresholds["knn_threshold"]
|
greenrouting/classifier/train.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Training loop. Frozen encoder, head-only optimization, multi-task loss."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import math
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
from greenrouting.classifier.model import DEFAULT_ENCODER, Encoder, ModelSpec, build_head
|
| 11 |
+
from greenrouting.data.schema import LENGTH_BUCKETS
|
| 12 |
+
from greenrouting.routing.registry import CAPABILITY_KEYS
|
| 13 |
+
|
| 14 |
+
LENGTH_TO_INDEX: dict[str, int] = {b: i for i, b in enumerate(LENGTH_BUCKETS)}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class TrainConfig:
    """Hyperparameters for head-only training (the encoder stays frozen)."""

    # Encoder / head architecture
    encoder_name: str = DEFAULT_ENCODER
    hidden_dim: int = 256
    dropout: float = 0.1
    max_seq_len: int = 256
    # Optimization
    epochs: int = 8
    batch_size: int = 32
    learning_rate: float = 1e-3
    weight_decay: float = 1e-4
    # Multi-task loss weights: capability BCE, difficulty Huber, length CE
    cap_weight: float = 1.0
    diff_weight: float = 0.5
    len_weight: float = 0.3
    # Train/validation split and reproducibility
    val_split: float = 0.15
    seed: int = 42
    huber_delta: float = 1.0
    # Up-weights positive capability labels in BCEWithLogitsLoss
    cap_pos_weight: float = 2.0
    # Difficulty regression is centered around log(8e9) (~8B params)
    diff_target_center: float = field(default_factory=lambda: math.log(8e9))
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _load_split(parquet_path: str | Path):
    """Read one labeled dataset split from a parquet file into a DataFrame."""
    import pandas as pd

    frame = pd.read_parquet(parquet_path)
    return frame
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _build_targets(df, cfg: TrainConfig):
    """Extract (texts, capability matrix, centered difficulty, length indices) from a split."""
    import numpy as np

    capability_columns = [f"cap_{k}" for k in CAPABILITY_KEYS]
    cap_matrix = df[capability_columns].fillna(0.0).to_numpy(dtype=np.float32)

    # Regression target is centered so the head predicts an offset around the anchor.
    raw_difficulty = df["difficulty_log_params"].fillna(cfg.diff_target_center)
    centered_difficulty = raw_difficulty.to_numpy(dtype=np.float32) - cfg.diff_target_center

    # Unknown/missing buckets fall back to index 1 — presumably "medium";
    # verify against LENGTH_BUCKETS ordering.
    length_indices = (
        df["length_bucket"]
        .fillna("medium")
        .map(LENGTH_TO_INDEX)
        .fillna(1)
        .to_numpy(dtype=np.int64)
    )
    prompt_texts = df["text"].astype(str).tolist()
    return prompt_texts, cap_matrix, centered_difficulty, length_indices
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _split_train_val(texts, caps, diff, lens, val_split: float, seed: int):
    """Shuffle-split all targets into (train, val) tuples; val gets at least one row."""
    import numpy as np

    order = np.arange(len(texts))
    np.random.default_rng(seed).shuffle(order)
    n_val = max(1, int(len(texts) * val_split))
    val_ids = order[:n_val]
    train_ids = order[n_val:]

    def _take(ids):
        # Texts stay a Python list; numeric targets are fancy-indexed arrays.
        return [texts[i] for i in ids], caps[ids], diff[ids], lens[ids]

    return _take(train_ids), _take(val_ids)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _iterate_batches(texts, caps, diff, lens, batch_size: int, encoder: Encoder, shuffle: bool, seed: int):
    """Yield (embeddings, capability, difficulty, length) tensors one batch at a time.

    Embeddings are computed on the fly by the frozen encoder; target tensors are
    placed on the same device as the embeddings.
    """
    import numpy as np
    import torch

    order = np.arange(len(texts))
    if shuffle:
        np.random.default_rng(seed).shuffle(order)

    for lo in range(0, len(texts), batch_size):
        batch_ids = order[lo:lo + batch_size]
        emb = encoder.embed([texts[i] for i in batch_ids])
        device = emb.device
        yield (
            emb,
            torch.tensor(caps[batch_ids], dtype=torch.float32, device=device),
            torch.tensor(diff[batch_ids], dtype=torch.float32, device=device),
            torch.tensor(lens[batch_ids], dtype=torch.long, device=device),
        )
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def train(
    train_parquet: str | Path,
    output_dir: str | Path,
    cfg: TrainConfig | None = None,
) -> dict:
    """Train the multi-task head on frozen-encoder embeddings and save artifacts.

    Writes head weights, encoder name, metadata, training history, temperature
    calibration, and OOD statistics into *output_dir*. Returns the per-epoch
    history plus the fitted temperature and the split sizes.
    """
    import torch
    import torch.nn as nn
    from torch.optim import AdamW

    cfg = cfg or TrainConfig()
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    df = _load_split(train_parquet)
    texts, caps, diff, lens = _build_targets(df, cfg)
    train_set, val_set = _split_train_val(texts, caps, diff, lens, cfg.val_split, cfg.seed)

    encoder = Encoder(cfg.encoder_name, cfg.max_seq_len)
    # Probe once to discover the encoder's embedding width.
    embed_dim = encoder.embed(["probe"]).shape[-1]
    spec = ModelSpec(
        encoder_name=cfg.encoder_name,
        embedding_dim=embed_dim,
        hidden_dim=cfg.hidden_dim,
        dropout=cfg.dropout,
        max_seq_len=cfg.max_seq_len,
    )
    head = build_head(spec).to(encoder.device)

    # Up-weight positive capability labels to counter label imbalance.
    pos_weight = torch.full((spec.n_capabilities,), cfg.cap_pos_weight, device=encoder.device)
    cap_loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    diff_loss_fn = nn.HuberLoss(delta=cfg.huber_delta)
    len_loss_fn = nn.CrossEntropyLoss()
    # Only the head's parameters are optimized; the encoder stays frozen.
    optimizer = AdamW(head.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)

    history = []
    for epoch in range(cfg.epochs):
        head.train()
        train_loss_sum = 0.0
        n_train = 0
        # Reshuffle each epoch with a distinct but reproducible seed.
        for emb, cap_t, diff_t, len_t in _iterate_batches(
            *train_set, batch_size=cfg.batch_size, encoder=encoder,
            shuffle=True, seed=cfg.seed + epoch,
        ):
            out = head(emb)
            # Weighted multi-task loss: capabilities + difficulty + length.
            loss = (
                cfg.cap_weight * cap_loss_fn(out["cap_logits"], cap_t)
                + cfg.diff_weight * diff_loss_fn(out["diff"], diff_t)
                + cfg.len_weight * len_loss_fn(out["len_logits"], len_t)
            )
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Weight running loss by batch size so the epoch mean is per-sample.
            train_loss_sum += loss.item() * emb.shape[0]
            n_train += emb.shape[0]

        val_metrics = _evaluate(head, encoder, val_set, cfg)
        history.append({
            "epoch": epoch,
            "train_loss": train_loss_sum / max(n_train, 1),
            **val_metrics,
        })
        print(
            f"epoch {epoch+1}/{cfg.epochs} "
            f"train_loss={train_loss_sum/max(n_train,1):.4f} "
            f"val_cap_f1={val_metrics['cap_f1']:.3f} "
            f"val_diff_mae={val_metrics['diff_mae']:.3f} "
            f"val_len_acc={val_metrics['len_acc']:.3f}"
        )

    head.eval()
    # Persist the trained head and everything needed to rebuild it at inference.
    torch.save(head.state_dict(), out_dir / "head.pt")
    (out_dir / "encoder_name.txt").write_text(cfg.encoder_name)
    (out_dir / "metadata.json").write_text(json.dumps({
        "capability_keys": list(CAPABILITY_KEYS),
        "length_buckets": list(LENGTH_BUCKETS),
        "embedding_dim": int(spec.embedding_dim),
        "hidden_dim": int(spec.hidden_dim),
        "max_seq_len": int(spec.max_seq_len),
        "diff_target_center": float(cfg.diff_target_center),
    }, indent=2))
    (out_dir / "training_history.json").write_text(json.dumps(history, indent=2))

    train_embeddings = _collect_embeddings(encoder, train_set[0], batch_size=cfg.batch_size)
    val_cap_logits = _collect_logits(head, encoder, val_set, cfg.batch_size)

    from greenrouting.classifier.calibration import fit_temperature

    # Temperature scaling is fitted on held-out validation logits.
    temperature = fit_temperature(val_cap_logits, val_set[1])
    (out_dir / "calibration.json").write_text(json.dumps({"temperature": float(temperature)}, indent=2))

    from greenrouting.classifier.ood import calibrate_thresholds, fit_ood_stats

    # OOD geometry is fitted on training embeddings only.
    ood_stats = fit_ood_stats(train_embeddings, k=5)
    thresholds = calibrate_thresholds(train_embeddings, ood_stats, percentile=99.0)
    import numpy as np
    np.savez(
        out_dir / "ood_stats.npz",
        centroid=ood_stats["centroid"],
        reference=ood_stats["reference"],
        k=ood_stats.get("k", 5),
        centroid_threshold=thresholds["centroid_threshold"],
        knn_threshold=thresholds["knn_threshold"],
    )

    return {
        "history": history,
        "temperature": float(temperature),
        "n_train": len(train_set[0]),
        "n_val": len(val_set[0]),
    }
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def _evaluate(head, encoder: Encoder, val_set, cfg: TrainConfig) -> dict:
    """Run the head over the validation split and compute per-task metrics.

    Returns micro-averaged capability precision/recall/F1 (at a 0.5 cutoff),
    difficulty MAE (in centered log-param units), and length-bucket accuracy.
    Restores the head to train mode before returning.
    """
    import numpy as np
    import torch

    cap_probs, cap_targets = [], []
    diff_preds, diff_targets = [], []
    len_preds, len_targets = [], []

    head.eval()
    with torch.no_grad():
        for emb, cap_t, diff_t, len_t in _iterate_batches(
            *val_set, batch_size=cfg.batch_size, encoder=encoder, shuffle=False, seed=cfg.seed,
        ):
            out = head(emb)
            cap_probs.append(torch.sigmoid(out["cap_logits"]).cpu().numpy())
            cap_targets.append(cap_t.cpu().numpy())
            diff_preds.append(out["diff"].cpu().numpy())
            diff_targets.append(diff_t.cpu().numpy())
            len_preds.append(out["len_logits"].argmax(dim=-1).cpu().numpy())
            len_targets.append(len_t.cpu().numpy())
    head.train()

    pred = (np.concatenate(cap_probs) >= 0.5).astype(np.float32)
    true = (np.concatenate(cap_targets) >= 0.5).astype(np.float32)
    tp = ((pred == 1) & (true == 1)).sum()
    fp = ((pred == 1) & (true == 0)).sum()
    fn = ((pred == 0) & (true == 1)).sum()
    precision = tp / max(tp + fp, 1)
    recall = tp / max(tp + fn, 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-9)

    diff_mae = float(np.abs(np.concatenate(diff_preds) - np.concatenate(diff_targets)).mean())
    len_acc = float((np.concatenate(len_preds) == np.concatenate(len_targets)).mean())

    return {
        "cap_precision": float(precision),
        "cap_recall": float(recall),
        "cap_f1": float(f1),
        "diff_mae": diff_mae,
        "len_acc": len_acc,
    }
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def _collect_embeddings(encoder: Encoder, texts: list[str], batch_size: int):
    """Embed *texts* in batches and stack the results into one float array.

    NOTE(review): the empty-input fallback hardcodes width 384 — presumably the
    default encoder's embedding dim; confirm if a different encoder is used.
    """
    import numpy as np

    batches = [
        encoder.embed(texts[lo:lo + batch_size]).cpu().numpy()
        for lo in range(0, len(texts), batch_size)
    ]
    if not batches:
        return np.zeros((0, 384), dtype=np.float32)
    return np.concatenate(batches, axis=0)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def _collect_logits(head, encoder: Encoder, val_set, batch_size: int):
    """Collect raw (uncalibrated) capability logits over the validation split.

    Temporarily switches the head to eval mode and restores train mode after.
    NOTE(review): the empty fallback hardcodes 8 capability columns — confirm
    against CAPABILITY_KEYS if the registry changes.
    """
    import numpy as np
    import torch

    head.eval()
    collected = []
    with torch.no_grad():
        for emb, _cap, _diff, _len in _iterate_batches(
            *val_set, batch_size=batch_size, encoder=encoder, shuffle=False, seed=0,
        ):
            collected.append(head(emb)["cap_logits"].cpu().numpy())
    head.train()
    if not collected:
        return np.zeros((0, 8), dtype=np.float32)
    return np.concatenate(collected, axis=0)
|
greenrouting/classifier/trained_predictor.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Inference-time predictor that loads the trained artifact and conforms to
|
| 2 |
+
the `Predictor` protocol used by the router."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import math
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
from greenrouting.classifier.infer import (
|
| 12 |
+
CapabilityProfile,
|
| 13 |
+
LENGTH_BUCKETS,
|
| 14 |
+
LENGTH_TOKEN_TARGETS,
|
| 15 |
+
LENGTH_P90_MULTIPLIER,
|
| 16 |
+
QueryProfile,
|
| 17 |
+
)
|
| 18 |
+
from greenrouting.classifier.model import Encoder, ModelSpec, build_head
|
| 19 |
+
from greenrouting.classifier.ood import is_ood
|
| 20 |
+
from greenrouting.routing.registry import CAPABILITY_KEYS
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class TrainedPredictor:
    """Lazily loads the trained classifier artifact and maps a raw query string
    to a QueryProfile (capabilities, difficulty, length distribution, OOD flag)."""

    def __init__(self, artifact_dir: str | Path):
        self.artifact_dir = Path(artifact_dir)
        # All heavy state (encoder, head, calibration, OOD stats) is loaded
        # lazily on the first predict() call via _ensure_loaded().
        self._loaded = False
        self._encoder: Optional[Encoder] = None
        self._head = None
        self._spec: Optional[ModelSpec] = None
        self._temperature: float = 1.0
        self._ood_stats = None
        self._ood_thresholds = None
        # Queries whose best capability probability falls below this are flagged OOD.
        self._ood_min_confidence: float = 0.40

    def _ensure_loaded(self) -> None:
        """Load encoder, head weights, calibration, and OOD stats from artifact_dir (idempotent)."""
        if self._loaded:
            return
        import numpy as np
        import torch

        meta_path = self.artifact_dir / "metadata.json"
        meta = json.loads(meta_path.read_text())
        encoder_name = (self.artifact_dir / "encoder_name.txt").read_text().strip()
        self._spec = ModelSpec(
            encoder_name=encoder_name,
            embedding_dim=int(meta["embedding_dim"]),
            hidden_dim=int(meta["hidden_dim"]),
            n_capabilities=len(meta["capability_keys"]),
            n_length_buckets=len(meta["length_buckets"]),
            max_seq_len=int(meta.get("max_seq_len", 256)),
        )
        # Difficulty predictions are offsets around this center (log parameter count).
        self._diff_center = float(meta.get("diff_target_center", math.log(8e9)))
        self._encoder = Encoder(encoder_name, max_seq_len=self._spec.max_seq_len)
        head = build_head(self._spec)
        # Weights are loaded to CPU first, then moved to the encoder's device.
        head.load_state_dict(torch.load(self.artifact_dir / "head.pt", map_location="cpu"))
        head.to(self._encoder.device).eval()
        self._head = head

        # Optional artifact: temperature calibration for capability logits.
        cal_path = self.artifact_dir / "calibration.json"
        if cal_path.exists():
            self._temperature = float(json.loads(cal_path.read_text()).get("temperature", 1.0))

        # Optional artifact: geometric OOD statistics and thresholds.
        ood_path = self.artifact_dir / "ood_stats.npz"
        if ood_path.exists():
            data = np.load(ood_path)
            if "centroid" in data.files and "reference" in data.files:
                self._ood_stats = {
                    "centroid": data["centroid"],
                    "reference": data["reference"],
                    "k": int(data["k"]) if "k" in data.files else 5,
                }
                self._ood_thresholds = {
                    "centroid_threshold": float(data["centroid_threshold"]),
                    "knn_threshold": float(data["knn_threshold"]),
                }

        self._loaded = True

    def predict(self, query: str) -> QueryProfile:
        """Classify *query* and return a fully-populated QueryProfile."""
        import torch
        import torch.nn.functional as F

        self._ensure_loaded()
        text = (query or "").strip()
        emb = self._encoder.embed([text])
        with torch.no_grad():
            out = self._head(emb)

        # Temperature-scaled sigmoid probabilities per capability key.
        # The max(..., 1e-3) guards against a degenerate saved temperature.
        cap_logits = (out["cap_logits"] / max(self._temperature, 1e-3))
        cap_probs = torch.sigmoid(cap_logits)[0].cpu().numpy().tolist()
        cap_dict = {k: float(v) for k, v in zip(CAPABILITY_KEYS, cap_probs)}

        # Undo the training-time centering to recover absolute log-params.
        diff_centered = float(out["diff"][0].item())
        diff_log_params = diff_centered + self._diff_center

        len_probs = F.softmax(out["len_logits"][0], dim=-1).cpu().numpy().tolist()
        length_dist = {b: float(p) for b, p in zip(LENGTH_BUCKETS, len_probs)}

        # Confidence = strongest capability probability.
        confidence = max(cap_dict.values()) if cap_dict else 0.0

        # OOD if the model is unconfident OR the embedding is geometrically far
        # from the training distribution (when OOD stats were saved).
        confidence_ood = confidence < self._ood_min_confidence
        geometric_ood = False
        if self._ood_stats is not None and self._ood_thresholds is not None:
            emb_np = emb[0].cpu().numpy()
            geometric_ood = is_ood(emb_np, self._ood_stats, self._ood_thresholds)
        ood_flag = confidence_ood or geometric_ood

        # Rough token accounting: ~1.3 tokens per whitespace word plus slack.
        in_tokens = max(1, int(len(text.split()) * 1.3) + 4)
        # p50 output = expectation of bucket token targets under length_dist.
        out_p50 = int(round(sum(length_dist[b] * LENGTH_TOKEN_TARGETS[b] for b in LENGTH_BUCKETS)))
        long_w = length_dist.get("long", 0.0)
        # p90 adds headroom proportional to the "long" bucket's probability mass.
        out_p90 = int(round(out_p50 * LENGTH_P90_MULTIPLIER + long_w * LENGTH_TOKEN_TARGETS["long"] * 0.3))

        return QueryProfile(
            capabilities=CapabilityProfile(**cap_dict),
            difficulty_log_params=diff_log_params,
            length_dist=length_dist,
            expected_input_tokens=in_tokens,
            expected_output_tokens_p50=out_p50,
            expected_output_tokens_p90=out_p90,
            confidence=confidence,
            is_ood=ood_flag,
            raw_query=text,
            debug={
                "source": "trained",
                "temperature": self._temperature,
                "confidence_ood": bool(confidence_ood),
                "geometric_ood": bool(geometric_ood),
            },
        )
|
greenrouting/data/__init__.py
ADDED
|
File without changes
|
greenrouting/data/builder.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Dataset orchestrator: source sampling -> capability labeling -> cascade plan -> parquet."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
from greenrouting.data.capability_labeler import LabelerConfig, label_queries
|
| 12 |
+
from greenrouting.data.schema import CapabilityLabel, LabeledQuery, RawQuery
|
| 13 |
+
from greenrouting.data.sources import SOURCE_REGISTRY, sample_mix
|
| 14 |
+
from greenrouting.routing.registry import CAPABILITY_KEYS
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class CascadeRungConfig:
    """One model rung of the labeling cascade."""

    id: str
    # HuggingFace model id used to run this rung
    hf_model: str
    # Parameter count in billions
    params_b: float
    # Rough decode speed, used only for wall-time projection
    decode_tokens_per_second_estimate: float
    runs_locally: bool = True
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class CascadeConfig:
    """Cascade-wide sampling settings plus a wall-time projection helper."""

    rungs: list[CascadeRungConfig]
    k_samples: int = 1
    max_new_tokens: int = 200
    temperature_first: float = 0.0
    temperature_resample: float = 0.7

    def projected_seconds(self, n_queries: int) -> float:
        """Estimate total wall time: decode time per rung plus ~0.4 s of
        fixed overhead per inference."""
        inferences = n_queries * self.k_samples
        return sum(
            inferences * self.max_new_tokens / max(rung.decode_tokens_per_second_estimate, 1.0)
            + inferences * 0.4
            for rung in self.rungs
        )
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@dataclass
class BuildConfig:
    """Full configuration for one dataset build (sources, cascade, labeler)."""

    profile_name: str
    target_total_queries: int
    # Fraction of rows held out for the test split
    test_split: float
    seed: int
    # source name -> mix weight
    sources: dict[str, float]
    cascade: CascadeConfig
    labeler: LabelerConfig
    budget_minutes: float = 60.0
    output_dir: str = "data"
    # Optional parquet of previously computed capability labels to reuse
    capability_labels_cache: Optional[str] = None

    @classmethod
    def from_yaml(cls, path: str | Path) -> "BuildConfig":
        """Parse a YAML profile into a BuildConfig, filling defaults for
        optional cascade/labeler keys."""
        import yaml
        with open(path, "r", encoding="utf-8") as f:
            raw = yaml.safe_load(f)

        rungs = [CascadeRungConfig(**r) for r in raw["cascade"]["rungs"]]
        cascade = CascadeConfig(
            rungs=rungs,
            k_samples=raw["cascade"].get("k_samples", 1),
            max_new_tokens=raw["cascade"].get("max_new_tokens", 200),
            temperature_first=raw["cascade"].get("temperature_first", 0.0),
            temperature_resample=raw["cascade"].get("temperature_resample", 0.7),
        )
        # The "labeler" section is entirely optional; defaults mirror LabelerConfig.
        labeler_raw = raw.get("labeler", {})
        labeler = LabelerConfig(
            use_heuristic=labeler_raw.get("use_heuristic", True),
            use_gpt=labeler_raw.get("use_gpt", False),
            use_claude=labeler_raw.get("use_claude", False),
            use_gemini=labeler_raw.get("use_gemini", False),
            source_prior_weight=labeler_raw.get("source_prior_weight", 0.5),
            sleep_between_calls_s=labeler_raw.get("sleep_between_calls_s", 0.0),
        )
        return cls(
            profile_name=raw["profile_name"],
            target_total_queries=raw["target_total_queries"],
            test_split=raw["test_split"],
            seed=raw["seed"],
            sources=raw["sources"],
            cascade=cascade,
            labeler=labeler,
            budget_minutes=raw.get("budget_minutes", 60.0),
            output_dir=raw.get("output_dir", "data"),
            capability_labels_cache=raw.get("capability_labels_cache"),
        )
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@dataclass
class BuildPlan:
    """Summary of a planned dataset build: sizing, projected cost, warnings."""

    config: BuildConfig
    n_queries: int
    cascade_seconds: float
    cascade_minutes: float
    over_budget: bool
    notes: list[str] = field(default_factory=list)

    def report(self) -> str:
        """Render a human-readable multi-line summary of the plan."""
        c = self.config
        source_mix = ", ".join(f"{k}={v}" for k, v in c.sources.items())
        rung_ids = ", ".join(r.id for r in c.cascade.rungs)
        lines = [
            f"Profile: {c.profile_name}",
            f"Target queries: {c.target_total_queries}",
            f"Test split: {int(c.test_split * 100)}%",
            f"Sources: {source_mix}",
            f"Cascade rungs: {rung_ids}",
            f"k_samples per rung: {c.cascade.k_samples}",
            f"Max new tokens: {c.cascade.max_new_tokens}",
            f"Estimated cascade wall time: {self.cascade_minutes:.1f} min",
            f"Configured budget: {c.budget_minutes:.1f} min",
            f"Over budget: {self.over_budget}",
        ]
        if self.notes:
            lines.append("Notes:")
            lines.extend(f"  - {note}" for note in self.notes)
        return "\n".join(lines)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def plan(config: BuildConfig) -> BuildPlan:
    """Project the cascade's wall time against the budget and collect warnings
    about missing API keys and unknown sources."""
    notes: list[str] = []
    cascade_s = config.cascade.projected_seconds(config.target_total_queries)
    cascade_m = cascade_s / 60.0
    over_budget = cascade_m > config.budget_minutes
    if over_budget:
        notes.append(
            f"cascade projected {cascade_m:.1f} min exceeds budget {config.budget_minutes:.1f} min"
        )
    # Each enabled LLM voter needs its provider API key in the environment.
    key_checks = (
        ("use_gpt", "OPENAI_API_KEY", "gpt"),
        ("use_claude", "ANTHROPIC_API_KEY", "claude"),
        ("use_gemini", "GOOGLE_API_KEY", "gemini"),
    )
    for flag, env_var, provider in key_checks:
        if getattr(config.labeler, flag) and not os.environ.get(env_var):
            notes.append(f"{env_var} missing; {provider} vote will be skipped")
    notes.extend(
        f"unknown source '{src}' in mix" for src in config.sources if src not in SOURCE_REGISTRY
    )
    return BuildPlan(
        config=config,
        n_queries=config.target_total_queries,
        cascade_seconds=cascade_s,
        cascade_minutes=cascade_m,
        over_budget=over_budget,
        notes=notes,
    )
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def write_capability_labels(path: str | Path, labels: list[CapabilityLabel]) -> None:
    """Persist capability labels to parquet, creating parent dirs as needed."""
    import pandas as pd

    records = [lbl.to_record() for lbl in labels]
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(records).to_parquet(path, index=False)
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def read_capability_labels(path: str | Path) -> dict[str, dict[str, float]]:
    """Load cached capability labels: query_id -> {capability_key: score}."""
    import pandas as pd

    df = pd.read_parquet(path)
    cap_cols = [c for c in df.columns if c.startswith("cap_")]
    labels: dict[str, dict[str, float]] = {}
    for _, row in df.iterrows():
        # Strip the "cap_" prefix to recover the bare capability key.
        labels[row["query_id"]] = {col[4:]: float(row[col]) for col in cap_cols}
    return labels
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def write_raw_manifest(path: str | Path, queries: list[RawQuery]) -> None:
    """Write the sampled raw queries as JSON Lines, one object per line."""
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(q.to_dict()) + "\n" for q in queries)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def write_labeled_dataset(
    train_path: str | Path,
    test_path: str | Path,
    rows: list[LabeledQuery],
    test_split: float,
    seed: int,
) -> None:
    """Seeded shuffle-split of *rows* into train/test parquet files.

    Fix: the test records were previously built by iterating a *set* of
    indices, whose iteration order is implementation-defined and in any case
    discards the seeded shuffle order; we now iterate the shuffled index list
    so the written test-row order is fully determined by *seed*.
    """
    import random as _random

    import pandas as pd

    rng = _random.Random(seed)
    indices = list(range(len(rows)))
    rng.shuffle(indices)
    # At least one test row, even for tiny datasets.
    n_test = max(1, int(len(rows) * test_split))
    test_order = indices[:n_test]
    test_idx = set(test_order)
    # Train rows keep the original dataset order; test rows keep shuffle order.
    train_records = [rows[i].to_record() for i in range(len(rows)) if i not in test_idx]
    test_records = [rows[i].to_record() for i in test_order]
    Path(train_path).parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(train_records).to_parquet(train_path, index=False)
    pd.DataFrame(test_records).to_parquet(test_path, index=False)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def build_seed_dataset(
    output_dir: str | Path,
    test_split: float = 0.15,
    seed: int = 42,
    suffix: str = "seed",
) -> tuple[Path, Path]:
    """Materialize the curated seed entries into train/test parquet files.

    Skips the cascade and the labeler: the seed entries already carry gold
    capability multi-labels, difficulty (in log_params), and length buckets.
    """
    from greenrouting.data.seed_dataset import (
        SEED_QUERIES,
        difficulty_log_params_from_b,
        seed_capability_dict,
    )

    labeled: list[LabeledQuery] = []
    for idx, entry in enumerate(SEED_QUERIES):
        raw_query = RawQuery(
            id=f"seed-{idx:04d}",
            text=entry.text,
            source="seed",
            source_category=entry.primary_category,
            has_grader=False,
            grader_metadata={},
        )
        labeled.append(
            LabeledQuery(
                raw=raw_query,
                capabilities=seed_capability_dict(entry, CAPABILITY_KEYS),
                difficulty_log_params=difficulty_log_params_from_b(entry.difficulty_b),
                length_bucket=entry.length,
                cascade_results={"source": "seed_curated"},
            )
        )

    base = Path(output_dir)
    train_path = base / f"train_{suffix}.parquet"
    test_path = base / f"test_{suffix}.parquet"
    write_labeled_dataset(train_path, test_path, labeled, test_split=test_split, seed=seed)
    return train_path, test_path
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def sample_and_label(config: BuildConfig) -> tuple[list[RawQuery], list[CapabilityLabel]]:
    """Sample the configured query mix and capability-label every query.

    Labels are served from the optional on-disk cache first; only queries
    missing from the cache go through the (potentially API-backed) labeler.

    Args:
        config: build configuration carrying sources, totals, seed, labeler
            settings, and the optional capability-labels cache path.

    Returns:
        (queries, labels) where labels are aligned to the order of queries;
        queries with no label available are dropped from the label list.
    """
    # Hoisted out of the per-row loop below — the original re-executed this
    # import on every cached hit.
    from greenrouting.data.schema import CapabilityVotes

    queries = sample_mix(config.sources, config.target_total_queries, config.seed)

    cached: dict = {}
    if config.capability_labels_cache and Path(config.capability_labels_cache).exists():
        cached = read_capability_labels(config.capability_labels_cache)

    new_queries = [q for q in queries if q.id not in cached]
    new_labels = label_queries(new_queries, config.labeler) if new_queries else []

    # Rehydrate cached entries; raw per-voter votes were not cached, so an
    # empty CapabilityVotes is recorded with a "cached" aggregation marker.
    cached_labels: list[CapabilityLabel] = [
        CapabilityLabel(
            query_id=q.id,
            capabilities=cached[q.id],
            votes=CapabilityVotes(),
            aggregation_method="cached",
        )
        for q in queries
        if q.id in cached
    ]

    all_labels = new_labels + cached_labels
    by_id = {label.query_id: label for label in all_labels}
    aligned = [by_id[q.id] for q in queries if q.id in by_id]
    return queries, aligned
|
greenrouting/data/capability_labeler.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Capability labeling: turns RawQuery records into multi-label CapabilityLabel records.
|
| 2 |
+
|
| 3 |
+
Aggregates up to four independent voters:
|
| 4 |
+
- source_prior (always available; derived from source category)
|
| 5 |
+
- heuristic (always available; deterministic keyword/regex rules)
|
| 6 |
+
- gpt-4o (optional, requires OPENAI_API_KEY)
|
| 7 |
+
- claude-sonnet (optional, requires ANTHROPIC_API_KEY)
|
| 8 |
+
- gemini-pro (optional, requires GOOGLE_API_KEY)
|
| 9 |
+
|
| 10 |
+
Designed to run once during dataset prep. The output is committed as a parquet so
|
| 11 |
+
downstream training does not depend on API access.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
import re
|
| 19 |
+
import time
|
| 20 |
+
from dataclasses import dataclass
|
| 21 |
+
from typing import Iterable, Optional
|
| 22 |
+
|
| 23 |
+
from greenrouting.data.schema import CapabilityLabel, CapabilityVotes, RawQuery
|
| 24 |
+
from greenrouting.routing.registry import CAPABILITY_KEYS
|
| 25 |
+
|
| 26 |
+
CATEGORY_TO_LABELS: dict[str, list[str]] = {
|
| 27 |
+
"code": ["code"],
|
| 28 |
+
"math": ["math"],
|
| 29 |
+
"reasoning": ["reasoning"],
|
| 30 |
+
"knowledge": ["knowledge"],
|
| 31 |
+
"instruction": ["instruction"],
|
| 32 |
+
"creative": ["creative"],
|
| 33 |
+
"multilingual": ["multilingual"],
|
| 34 |
+
"simple_chat": ["simple_chat"],
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def source_prior_vote(category: str) -> dict[str, float]:
    """One-hot prior over CAPABILITY_KEYS derived from the source category.

    Unknown categories yield an all-zero vote.
    """
    active = set(CATEGORY_TO_LABELS.get(category, []))
    return {key: float(key in active) for key in CAPABILITY_KEYS}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
_HEURISTIC_PATTERNS: dict[str, list[str]] = {
|
| 44 |
+
"code": [
|
| 45 |
+
r"\b(code|function|class|def |algorithm|implement|debug|stack trace|api|sdk)\b",
|
| 46 |
+
r"\b(python|javascript|typescript|rust|go|c\+\+|java|sql|html|css)\b",
|
| 47 |
+
r"\b(refactor|unit test|regex|linter|compile|recursion)\b",
|
| 48 |
+
r"```",
|
| 49 |
+
],
|
| 50 |
+
"math": [
|
| 51 |
+
r"\b(calculate|compute|solve|equation|integral|derivative|matrix|vector|theorem|prove)\b",
|
| 52 |
+
r"\b(probability|sum|product|mean|median|variance|standard deviation|percentage)\b",
|
| 53 |
+
r"\d+\s*[+\-*/×÷=]\s*\d+",
|
| 54 |
+
r"\b(arithmetic|fraction|geometry|algebra|trig)\b",
|
| 55 |
+
],
|
| 56 |
+
"reasoning": [
|
| 57 |
+
r"\b(why|how does|explain|reason|because|therefore|argue|justify|implication)\b",
|
| 58 |
+
r"\b(compare|contrast|analyze|evaluate|trade.?off|infer|deduce)\b",
|
| 59 |
+
],
|
| 60 |
+
"knowledge": [
|
| 61 |
+
r"\b(who|what is|when did|where is|history|definition|capital|founded|named)\b",
|
| 62 |
+
r"\b(country|continent|invented|discovered|president|prime minister)\b",
|
| 63 |
+
],
|
| 64 |
+
"instruction": [
|
| 65 |
+
r"\b(write|draft|create|generate|produce|format|list|outline|step.?by.?step|summarize)\b",
|
| 66 |
+
r"\b(rewrite|translate from|convert to|extract)\b",
|
| 67 |
+
],
|
| 68 |
+
"creative": [
|
| 69 |
+
r"\b(story|poem|novel|character|plot|scene|metaphor|fictional|haiku|song lyric)\b",
|
| 70 |
+
r"\b(write a (?:short )?(?:story|poem|haiku|song))\b",
|
| 71 |
+
],
|
| 72 |
+
"multilingual": [
|
| 73 |
+
r"\b(translate|translation|en español|en français|auf deutsch|на русском|中文|日本語|한국어)\b",
|
| 74 |
+
r"[Ѐ-ӿ一-鿿-ゟ゠-ヿ가-힣]",
|
| 75 |
+
],
|
| 76 |
+
"simple_chat": [
|
| 77 |
+
r"^\s*(hi|hello|hey|thanks|thank you|good morning|good evening|sup|yo)\b",
|
| 78 |
+
],
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def heuristic_vote(text: str) -> dict[str, float]:
    """Deterministic keyword/regex voter over CAPABILITY_KEYS.

    A capability scores 1.0 when any of its patterns matches (case-insensitive,
    multiline). If nothing matched, short texts (< 80 chars after stripping)
    default to simple_chat and everything else to instruction, so the vote is
    never all zeros.
    """
    flags = re.IGNORECASE | re.MULTILINE
    vote = {key: 0.0 for key in CAPABILITY_KEYS}
    for capability, patterns in _HEURISTIC_PATTERNS.items():
        if any(re.search(pattern, text, flags=flags) for pattern in patterns):
            vote[capability] = 1.0
    if not any(vote.values()):
        fallback = "simple_chat" if len(text.strip()) < 80 else "instruction"
        vote[fallback] = 1.0
    return vote
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
_LABELER_SYSTEM_PROMPT = (
|
| 98 |
+
"You are labeling AI queries by which capabilities they require. "
|
| 99 |
+
"Capabilities: code, math, reasoning, knowledge, instruction, creative, multilingual, "
|
| 100 |
+
"simple_chat. A query can require multiple capabilities. "
|
| 101 |
+
"Reply with strict JSON only, in the form: "
|
| 102 |
+
'{"code": 0|1, "math": 0|1, "reasoning": 0|1, "knowledge": 0|1, '
|
| 103 |
+
'"instruction": 0|1, "creative": 0|1, "multilingual": 0|1, "simple_chat": 0|1}.'
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _user_prompt(query: str) -> str:
|
| 108 |
+
return f"Query:\n{query}\n\nRespond with JSON only."
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def _parse_vote(raw: str) -> dict[str, float]:
    """Parse a voter's raw completion into a {capability: 0.0|1.0} dict.

    Returns an all-zero vote on any failure. Fix: JSON that parses but is
    not an object (e.g. a bare list or number) previously escaped the
    try/except and raised AttributeError on `data.get`; it now falls back
    to zeros like any other malformed reply.
    """
    try:
        data = json.loads(_extract_json(raw))
    except Exception:
        data = None
    if not isinstance(data, dict):
        return {k: 0.0 for k in CAPABILITY_KEYS}
    # Any truthy value counts as 1; missing/falsy keys count as 0.
    return {k: float(1 if data.get(k) else 0) for k in CAPABILITY_KEYS}
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _extract_json(text: str) -> str:
|
| 120 |
+
match = re.search(r"\{.*\}", text, flags=re.DOTALL)
|
| 121 |
+
return match.group(0) if match else text
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _gpt_vote(text: str) -> Optional[dict[str, float]]:
    """Capability vote from gpt-4o-mini.

    Returns None (voter skipped) when OPENAI_API_KEY is unset or the
    `openai` package is not installed; otherwise a {capability: 0/1} dict.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        return None
    try:
        from openai import OpenAI
    except ImportError:
        return None
    client = OpenAI(api_key=api_key)
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": _LABELER_SYSTEM_PROMPT},
            {"role": "user", "content": _user_prompt(text)},
        ],
        temperature=0,  # deterministic labeling
        response_format={"type": "json_object"},  # force strict JSON output
    )
    # content can be None on some responses; substitute an empty object.
    return _parse_vote(resp.choices[0].message.content or "{}")
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def _claude_vote(text: str) -> Optional[dict[str, float]]:
    """Capability vote from claude-haiku-4-5.

    Returns None (voter skipped) when ANTHROPIC_API_KEY is unset or the
    `anthropic` package is not installed; otherwise a {capability: 0/1} dict.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        return None
    try:
        import anthropic
    except ImportError:
        return None
    client = anthropic.Anthropic(api_key=api_key)
    resp = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=200,  # the expected reply is a tiny JSON object
        system=_LABELER_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": _user_prompt(text)}],
    )
    # Concatenate only text blocks; any non-text block types are ignored.
    body = "".join(b.text for b in resp.content if getattr(b, "type", "") == "text")
    return _parse_vote(body)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def _gemini_vote(text: str) -> Optional[dict[str, float]]:
    """Capability vote from gemini-1.5-flash.

    Returns None (voter skipped) when GOOGLE_API_KEY is unset or the
    `google-generativeai` package is not installed; otherwise a
    {capability: 0/1} dict.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        return None
    try:
        import google.generativeai as genai
    except ImportError:
        return None
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash", system_instruction=_LABELER_SYSTEM_PROMPT)
    resp = model.generate_content(_user_prompt(text))
    # resp.text can be empty/None; substitute an empty object for the parser.
    return _parse_vote(resp.text or "{}")
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
@dataclass
class LabelerConfig:
    """Which voters to run and how to pace remote API calls."""

    use_heuristic: bool = True   # deterministic regex voter, no API needed
    use_gpt: bool = False        # gpt-4o-mini voter (requires OPENAI_API_KEY)
    use_claude: bool = False     # claude voter (requires ANTHROPIC_API_KEY)
    use_gemini: bool = False     # gemini voter (requires GOOGLE_API_KEY)
    source_prior_weight: float = 0.5   # weight of the source-category prior in aggregation
    sleep_between_calls_s: float = 0.0  # crude rate limit between remote voter calls
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def aggregate_votes(votes: CapabilityVotes, source_prior_weight: float = 0.5) -> dict[str, float]:
    """Blend the available voters into one soft multi-label capability dict.

    Each present voter (heuristic/gpt/claude/gemini) contributes with weight
    1; the source-category prior contributes with `source_prior_weight`.
    With no voters at all, the prior is returned as-is, or all zeros when
    the prior is also missing.

    Fix: the prior is defaulted to {} before lookup, so a missing/None
    `votes.source_prior` no longer raises AttributeError when at least one
    voter is present.
    """
    voters = [v for v in (votes.heuristic, votes.gpt, votes.claude, votes.gemini) if v is not None]
    prior = votes.source_prior or {}
    if not voters:
        return dict(prior) if prior else {k: 0.0 for k in CAPABILITY_KEYS}
    result: dict[str, float] = {}
    total_weight = source_prior_weight + len(voters)
    for cap in CAPABILITY_KEYS:
        prior_term = source_prior_weight * float(prior.get(cap, 0.0))
        vendor_sum = sum(float(v.get(cap, 0.0)) for v in voters)
        result[cap] = (prior_term + vendor_sum) / total_weight
    return result
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def label_query(query: RawQuery, config: LabelerConfig) -> CapabilityLabel:
    """Run every enabled voter on one query and aggregate the votes.

    Remote voters run in a fixed order (gpt, claude, gemini); the optional
    sleep after each remote call throttles API usage. The aggregation
    method string records which voters actually produced a vote.
    """
    votes = CapabilityVotes(source_prior=source_prior_vote(query.source_category))

    if config.use_heuristic:
        votes.heuristic = heuristic_vote(query.text)

    for enabled, attr, voter in (
        (config.use_gpt, "gpt", _gpt_vote),
        (config.use_claude, "claude", _claude_vote),
        (config.use_gemini, "gemini", _gemini_vote),
    ):
        if not enabled:
            continue
        setattr(votes, attr, voter(query.text))
        if config.sleep_between_calls_s:
            time.sleep(config.sleep_between_calls_s)

    aggregated = aggregate_votes(votes, source_prior_weight=config.source_prior_weight)

    participating = [
        name
        for name, vote in (
            ("heuristic", votes.heuristic),
            ("gpt", votes.gpt),
            ("claude", votes.claude),
            ("gemini", votes.gemini),
        )
        if vote is not None
    ]
    method = "+".join(participating) if participating else "source_prior_only"

    return CapabilityLabel(
        query_id=query.id,
        capabilities=aggregated,
        votes=votes,
        aggregation_method=method,
    )
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def label_queries(queries: Iterable[RawQuery], config: LabelerConfig) -> list[CapabilityLabel]:
    """Label each query in turn; output order matches input order."""
    labeled: list[CapabilityLabel] = []
    for query in queries:
        labeled.append(label_query(query, config))
    return labeled
|
greenrouting/data/cascade.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Difficulty cascade: runs each query against an ascending ladder of models,
|
| 2 |
+
grades the response, and derives a continuous `min_capable_log_params` label.
|
| 3 |
+
|
| 4 |
+
Memory strategy: load one rung at a time, run all queries, dump checkpoint, free
|
| 5 |
+
the weights, then advance to the next rung. Resumes from per-rung JSONL files.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import math
|
| 12 |
+
import time
|
| 13 |
+
from dataclasses import asdict, dataclass
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Iterable, Optional
|
| 16 |
+
|
| 17 |
+
from greenrouting.data.graders import grade
|
| 18 |
+
from greenrouting.data.schema import LabeledQuery, RawQuery
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class RungResult:
    """One graded generation from one cascade rung for one query."""

    rung_id: str          # rung identifier from the cascade config
    params_b: float       # rung model size, in billions of parameters
    query_id: str         # RawQuery.id this sample answers
    sample_index: int     # 0-based index among the k samples for this query
    response: str         # decoded model output
    score: float          # grader score for this response
    response_tokens: int  # number of newly generated tokens
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _read_raw_manifest(path: str | Path) -> list[RawQuery]:
    """Load RawQuery rows from a JSONL manifest, skipping blank lines."""
    rows: list[RawQuery] = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            payload = raw_line.strip()
            if not payload:
                continue
            rows.append(RawQuery(**json.loads(payload)))
    return rows
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _read_rung_checkpoint(path: Path) -> dict[str, list[RungResult]]:
    """Load previously graded samples for one rung, grouped by query id.

    Returns an empty dict when the checkpoint file does not exist yet.
    """
    grouped: dict[str, list[RungResult]] = {}
    if not path.exists():
        return grouped
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            payload = raw_line.strip()
            if not payload:
                continue
            result = RungResult(**json.loads(payload))
            grouped.setdefault(result.query_id, []).append(result)
    return grouped
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _append_rung_checkpoint(path: Path, result: RungResult) -> None:
    """Append one result as a JSON line, creating parent dirs on first write."""
    path.parent.mkdir(parents=True, exist_ok=True)
    line = json.dumps(asdict(result)) + "\n"
    with open(path, "a", encoding="utf-8") as handle:
        handle.write(line)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _load_model_and_tokenizer(hf_model: str):
    """Load a Hugging Face causal LM and its tokenizer for one cascade rung.

    Uses bfloat16 + device_map="auto" when CUDA is available, float32 on CPU.
    torch/transformers are imported lazily so the rest of the data pipeline
    can be used without them installed.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    tok = AutoTokenizer.from_pretrained(hf_model)
    if tok.pad_token_id is None:
        # Some models ship without a pad token; reuse EOS so generate() works.
        tok.pad_token_id = tok.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(
        hf_model, torch_dtype=dtype, device_map="auto" if torch.cuda.is_available() else None
    )
    model.eval()
    return tok, model
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _free_model(model) -> None:
|
| 79 |
+
import gc
|
| 80 |
+
del model
|
| 81 |
+
gc.collect()
|
| 82 |
+
try:
|
| 83 |
+
import torch
|
| 84 |
+
if torch.cuda.is_available():
|
| 85 |
+
torch.cuda.empty_cache()
|
| 86 |
+
except Exception:
|
| 87 |
+
pass
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _format_prompt(tok, query: str) -> str:
|
| 91 |
+
if hasattr(tok, "apply_chat_template") and tok.chat_template:
|
| 92 |
+
messages = [{"role": "user", "content": query}]
|
| 93 |
+
return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 94 |
+
return f"### Instruction:\n{query}\n\n### Response:\n"
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def _generate(tok, model, prompt: str, max_new_tokens: int, temperature: float) -> tuple[str, int]:
    """Generate one completion for *prompt*.

    Returns (stripped_response_text, number_of_new_tokens). temperature <= 0
    selects greedy decoding (do_sample=False); the temperature passed to
    generate() is then a placeholder 1.0 and has no effect.
    """
    import torch
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    do_sample = temperature > 0
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature if do_sample else 1.0,
            pad_token_id=tok.pad_token_id,
        )
    # Slice off the prompt tokens so only the generated suffix is decoded.
    new_tokens = out[0][inputs["input_ids"].shape[1]:]
    response = tok.decode(new_tokens, skip_special_tokens=True)
    return response.strip(), int(new_tokens.shape[0])
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def run_rung(
    rung,
    queries: list[RawQuery],
    k_samples: int,
    max_new_tokens: int,
    temperature_first: float,
    temperature_resample: float,
    checkpoint_path: Path,
    progress: bool = True,
) -> list[RungResult]:
    """Run all queries through one cascade rung, resuming from a checkpoint.

    Previously graded samples are read from `checkpoint_path` (JSONL); only
    queries with fewer than `k_samples` recorded samples are regenerated.
    The model is only loaded when there is pending work, and is always freed
    afterward (the `finally` guarantees this even on errors).

    Returns all results for this rung: resumed samples plus new ones.
    """
    existing = _read_rung_checkpoint(checkpoint_path)
    pending = [q for q in queries if len(existing.get(q.id, [])) < k_samples]
    results: list[RungResult] = [r for rs in existing.values() for r in rs]

    if not pending:
        # Everything already checkpointed — skip the expensive model load.
        return results

    tok, model = _load_model_and_tokenizer(rung.hf_model)
    try:
        for i, q in enumerate(pending):
            done = len(existing.get(q.id, []))
            for s in range(done, k_samples):
                # First sample uses the (typically low) first-pass temperature;
                # resamples use the resample temperature for diversity.
                temp = temperature_first if s == 0 else temperature_resample
                prompt = _format_prompt(tok, q.text)
                start = time.time()
                response, n_tokens = _generate(tok, model, prompt, max_new_tokens, temp)
                score = grade(response, q.grader_metadata, max_new_tokens=max_new_tokens)
                rr = RungResult(
                    rung_id=rung.id,
                    params_b=rung.params_b,
                    query_id=q.id,
                    sample_index=s,
                    response=response,
                    score=score,
                    response_tokens=n_tokens,
                )
                # Persist immediately so a crash loses at most one sample.
                _append_rung_checkpoint(checkpoint_path, rr)
                results.append(rr)
                if progress:
                    print(
                        f" [{rung.id}] {i+1}/{len(pending)} sample={s} "
                        f"score={score:.2f} tok={n_tokens} t={time.time()-start:.1f}s"
                    )
    finally:
        _free_model(model)

    return results
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def derive_difficulty(
    per_rung: dict[str, list[float]],
    rung_params_b: dict[str, float],
    pass_threshold: float,
) -> float:
    """Continuous min_capable_log_params from per-rung mean scores.

    Logic:
    - sort rungs by parameter count
    - for each rung, mean score across samples is the "rung pass rate"
    - the smallest rung whose pass rate >= threshold defines the floor
    - linear interpolation in log(params) space between the failing and passing rung
    - if no rung passes, return log(largest_rung_params * 2) as out-of-pool
    - if smallest rung already passes, return log(smallest_rung_params)
    """
    ladder = sorted(rung_params_b.items(), key=lambda item: item[1])
    if not ladder:
        return math.log(8e9)

    # (params_b, mean score) for every rung that actually produced samples,
    # in ascending parameter order.
    observed: list[tuple[float, float]] = []
    for rung_id, params_b in ladder:
        samples = per_rung.get(rung_id, [])
        if samples:
            observed.append((params_b, sum(samples) / len(samples)))

    if not observed:
        return math.log(ladder[-1][1] * 1e9 * 2)

    first_params, first_rate = observed[0]
    if first_rate >= pass_threshold:
        return math.log(first_params * 1e9)

    for (lo_params, lo_rate), (hi_params, hi_rate) in zip(observed, observed[1:]):
        if hi_rate < pass_threshold:
            continue
        denom = max(hi_rate - lo_rate, 1e-6)
        frac = max(0.0, min(1.0, (pass_threshold - lo_rate) / denom))
        log_lo = math.log(lo_params * 1e9)
        log_hi = math.log(hi_params * 1e9)
        return log_lo + frac * (log_hi - log_lo)

    return math.log(observed[-1][0] * 1e9 * 2)
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def derive_length_bucket(response_token_counts: list[int]) -> str:
    """Bucket the mean generated length: <100 short, <400 medium, else long.

    Defaults to "medium" when no samples are available.
    """
    if not response_token_counts:
        return "medium"
    mean_tokens = sum(response_token_counts) / len(response_token_counts)
    for cutoff, bucket in ((100, "short"), (400, "medium")):
        if mean_tokens < cutoff:
            return bucket
    return "long"
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def run_cascade(
    config,
    raw_manifest_path: str | Path,
    capability_labels_path: str | Path,
    train_path: str | Path,
    test_path: str | Path,
    pass_threshold: float = 0.7,
) -> None:
    """Run the full difficulty cascade and write train/test parquet files.

    One rung at a time (per the module's memory strategy): generate and
    grade all queries, checkpoint per-rung, then derive each query's
    continuous difficulty and length bucket, attach cached capability
    labels, and hand everything to the dataset writer.

    Queries with no cascade samples at all are dropped from the output.
    """
    # Local import avoids a circular import with builder.py, which imports
    # from this module's siblings.
    from greenrouting.data.builder import (
        read_capability_labels,
        write_labeled_dataset,
    )

    queries = _read_raw_manifest(raw_manifest_path)
    cap_labels = read_capability_labels(capability_labels_path)

    # rung_id -> (query_id -> samples)
    per_rung_results: dict[str, dict[str, list[RungResult]]] = {}
    out_dir = Path(config.output_dir)

    for rung in config.cascade.rungs:
        if not rung.runs_locally:
            # Remote rungs are out of scope here; flag and move on.
            print(f"[skip] {rung.id} marked as not runs_locally; configure remote backend.")
            continue
        ckpt = out_dir / f"cascade_{config.profile_name}_{rung.id}.jsonl"
        results = run_rung(
            rung,
            queries,
            k_samples=config.cascade.k_samples,
            max_new_tokens=config.cascade.max_new_tokens,
            temperature_first=config.cascade.temperature_first,
            temperature_resample=config.cascade.temperature_resample,
            checkpoint_path=ckpt,
        )
        by_query: dict[str, list[RungResult]] = {}
        for r in results:
            by_query.setdefault(r.query_id, []).append(r)
        per_rung_results[rung.id] = by_query
        print(f"[done] rung {rung.id}: {sum(len(v) for v in by_query.values())} samples")

    # Includes skipped rungs too; derive_difficulty only uses rungs with scores.
    rung_params: dict[str, float] = {r.id: r.params_b for r in config.cascade.rungs}

    labeled: list[LabeledQuery] = []
    for q in queries:
        per_rung_scores: dict[str, list[float]] = {}
        token_counts: list[int] = []
        for rung_id, by_query in per_rung_results.items():
            for rr in by_query.get(q.id, []):
                per_rung_scores.setdefault(rung_id, []).append(rr.score)
                token_counts.append(rr.response_tokens)
        if not per_rung_scores:
            # No rung produced a sample for this query; cannot label it.
            continue
        difficulty = derive_difficulty(per_rung_scores, rung_params, pass_threshold)
        length_bucket = derive_length_bucket(token_counts)
        caps = cap_labels.get(q.id, {})
        labeled.append(LabeledQuery(
            raw=q,
            capabilities=caps,
            difficulty_log_params=difficulty,
            length_bucket=length_bucket,
            cascade_results={
                # Keep only per-rung means for audit; raw responses stay in
                # the rung checkpoints.
                "per_rung_mean_scores": {
                    k: sum(v) / len(v) for k, v in per_rung_scores.items()
                },
            },
        ))

    write_labeled_dataset(
        train_path=train_path,
        test_path=test_path,
        rows=labeled,
        test_split=config.test_split,
        seed=config.seed,
    )
    print(f"[done] wrote {len(labeled)} labeled rows -> {train_path}, {test_path}")
|
greenrouting/data/graders.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Graders that score whether a model's response solved a query.
|
| 2 |
+
|
| 3 |
+
Outputs a bounded float in [0.0, 1.0]. Where a deterministic grader exists
|
| 4 |
+
(numeric extraction, multi-choice match), the score is binary. For free-form
|
| 5 |
+
responses we use a deterministic proxy (parseability, length) clamped to a
|
| 6 |
+
sensible range.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import ast
|
| 12 |
+
import re
|
| 13 |
+
from typing import Optional
|
| 14 |
+
|
| 15 |
+
NUMBER_PATTERN = re.compile(r"-?\d+(?:[.,]\d+)?")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _extract_last_number(text: str) -> Optional[float]:
|
| 19 |
+
matches = NUMBER_PATTERN.findall(text or "")
|
| 20 |
+
if not matches:
|
| 21 |
+
return None
|
| 22 |
+
last = matches[-1].replace(",", "")
|
| 23 |
+
try:
|
| 24 |
+
return float(last)
|
| 25 |
+
except ValueError:
|
| 26 |
+
return None
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _extract_first_letter(text: str, valid_letters: str = "ABCDEFGH") -> Optional[str]:
|
| 30 |
+
if not text:
|
| 31 |
+
return None
|
| 32 |
+
cleaned = text.strip()
|
| 33 |
+
m = re.search(
|
| 34 |
+
r"(?:answer|the answer is|final answer)\s*[:\-]?\s*\(?([" + valid_letters + r"])\)?",
|
| 35 |
+
cleaned,
|
| 36 |
+
re.IGNORECASE,
|
| 37 |
+
)
|
| 38 |
+
if m:
|
| 39 |
+
return m.group(1).upper()
|
| 40 |
+
m = re.match(r"\s*\(([" + valid_letters + r"])\)", cleaned)
|
| 41 |
+
if m:
|
| 42 |
+
return m.group(1)
|
| 43 |
+
m = re.match(r"\s*([" + valid_letters + r"])[\s\.\):,]+", cleaned)
|
| 44 |
+
if m:
|
| 45 |
+
return m.group(1)
|
| 46 |
+
m = re.match(r"\s*([" + valid_letters + r"])\s*$", cleaned)
|
| 47 |
+
if m:
|
| 48 |
+
return m.group(1)
|
| 49 |
+
return None
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def grade_numeric(response: str, gold: str) -> float:
    """Binary grade: last number of the response matches the gold's last number.

    Scores 0.0 when either side yields no parseable number.
    """
    expected = _extract_last_number(gold)
    predicted = _extract_last_number(response)
    if expected is None or predicted is None:
        return 0.0
    return float(abs(expected - predicted) < 1e-6)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def grade_multichoice(response: str, gold_letter: str) -> float:
    """Binary grade: extracted answer letter equals the gold letter.

    Case-insensitive; 0.0 when no gold letter is given or none is extracted.
    """
    if not gold_letter:
        return 0.0
    predicted = _extract_first_letter(response or "")
    if predicted is None:
        return 0.0
    return float(predicted.upper() == gold_letter.strip().upper())
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def grade_string_match(response: str, gold: str) -> float:
    """Binary grade: whitespace-normalized, lowercased gold appears in response.

    Scores 0.0 for an empty or whitespace-only gold string.
    """
    if not gold:
        return 0.0
    gold_norm = re.sub(r"\s+", " ", gold).strip().lower()
    if not gold_norm:
        return 0.0
    resp_norm = re.sub(r"\s+", " ", response or "").strip().lower()
    return 1.0 if gold_norm in resp_norm else 0.0
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def grade_code_proxy(response: str, entry_point: Optional[str] = None) -> float:
    """Cheap proxy for code correctness: code parses + (optionally) defines the
    expected entry-point function. No execution; safe to run on untrusted output.

    Scores: 0.0 empty/unparseable; 0.7 parses with no entry point requested;
    1.0 parses and defines `entry_point`; 0.4 parses but entry point missing.
    """
    if not response:
        return 0.0
    try:
        tree = ast.parse(_extract_code_block(response))
    except SyntaxError:
        return 0.0
    if not entry_point:
        return 0.7
    defines_entry = any(
        isinstance(node, ast.FunctionDef) and node.name == entry_point
        for node in ast.walk(tree)
    )
    return 1.0 if defines_entry else 0.4
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def _extract_code_block(text: str) -> str:
|
| 98 |
+
fence = re.search(r"```(?:python|py)?\s*\n(.*?)```", text, re.DOTALL | re.IGNORECASE)
|
| 99 |
+
if fence:
|
| 100 |
+
return fence.group(1)
|
| 101 |
+
if "def " in text or "class " in text or "import " in text:
|
| 102 |
+
return text
|
| 103 |
+
return text
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def grade_response_quality(response: str, max_new_tokens: int) -> float:
    """Length-and-diversity heuristic for open-ended queries with no gold answer.

    0.0 for empty input, 0.05 for fewer than three whitespace tokens;
    otherwise a [0, 1] blend of length coverage (saturating at half the
    generation budget) and vocabulary diversity (unique-token ratio).
    """
    if not response:
        return 0.0
    words = response.split()
    if len(words) < 3:
        return 0.05
    diversity = len({w.lower() for w in words}) / len(words)
    coverage = min(1.0, len(words) / max(max_new_tokens * 0.5, 1))
    blended = 0.6 * coverage + 0.4 * diversity
    return max(0.0, min(1.0, blended))
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def grade_ifeval_proxy(response: str, instruction_id_list: list[str]) -> float:
    """Lightweight stand-in for IFEval's strict constraint grader. Counts how many
    structural cues from instruction IDs are present in the response."""
    if not response:
        return 0.0
    if not instruction_id_list:
        # No constraints to check: neutral score.
        return 0.5
    stripped = response.strip()
    score = 0.0
    for instruction_id in instruction_id_list:
        if "list" in instruction_id and re.search(r"^\s*[-*•]|^\s*\d+\.", response, re.MULTILINE):
            score += 1
        elif "json" in instruction_id and stripped.startswith(("{", "[")):
            score += 1
        elif "letter" in instruction_id:
            score += 1
        elif "word_count" in instruction_id:
            score += 1
        elif "uppercase" in instruction_id and any(ch.isupper() for ch in response):
            score += 1
        else:
            # Unknown constraint: grant partial credit.
            score += 0.5
    return min(1.0, score / max(len(instruction_id_list), 1))
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def grade(response: str, grader_metadata: dict, max_new_tokens: int = 256) -> float:
    """Dispatch to the grader named in `grader_metadata["grader"]`.

    Unknown or missing grader names fall back to the open-ended
    quality heuristic (`grade_response_quality`).
    """
    # Lambdas keep argument extraction lazy: only the chosen grader's
    # metadata keys are read.
    handlers = {
        "exact_numeric": lambda: grade_numeric(response, grader_metadata.get("gold_final", "")),
        "multichoice": lambda: grade_multichoice(response, grader_metadata.get("gold_letter", "")),
        "string_match": lambda: grade_string_match(response, grader_metadata.get("gold", "")),
        "code_exec": lambda: grade_code_proxy(response, grader_metadata.get("entry_point")),
        "ifeval_constraints": lambda: grade_ifeval_proxy(response, grader_metadata.get("instruction_id_list", [])),
    }
    handler = handlers.get(grader_metadata.get("grader"))
    if handler is not None:
        return handler()
    return grade_response_quality(response, max_new_tokens)
|
greenrouting/data/schema.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Schema for queries and labels flowing through the data pipeline."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field, asdict
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
from greenrouting.routing.registry import CAPABILITY_KEYS
|
| 9 |
+
|
| 10 |
+
# Allowed values for answer-length buckets (used e.g. by LabeledQuery.length_bucket).
LENGTH_BUCKETS: tuple[str, str, str] = ("short", "medium", "long")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class RawQuery:
    """A single query as ingested from a source dataset, before any labeling."""

    # Stable identifier for this query.
    id: str
    # The raw query/prompt text.
    text: str
    # Originating dataset name.
    source: str
    # Capability bucket assigned by the source (see seed data).
    source_category: str
    # Whether automatic grading info is available for this query.
    has_grader: bool = False
    # Auxiliary grading info (gold answers, entry points, instruction IDs);
    # presumably only meaningful when has_grader is True — verify against graders.
    grader_metadata: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict copy of this record (dataclasses.asdict performs
        a recursive copy, so nested containers are not shared)."""
        return asdict(self)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class CapabilityVotes:
    """Per-voter capability scores for one query.

    `source_prior` is always present (defaults to empty); the four model/
    heuristic voters are `None` until they have actually voted.
    """

    source_prior: dict[str, float] = field(default_factory=dict)
    heuristic: Optional[dict[str, float]] = None
    gpt: Optional[dict[str, float]] = None
    claude: Optional[dict[str, float]] = None
    gemini: Optional[dict[str, float]] = None

    def vote_count(self) -> int:
        """Number of voters that have voted (source_prior is excluded)."""
        voters = (self.heuristic, self.gpt, self.claude, self.gemini)
        return len([scores for scores in voters if scores is not None])
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@dataclass
class CapabilityLabel:
    """Aggregated capability scores for one query, plus the per-voter inputs."""

    query_id: str
    capabilities: dict[str, float]
    votes: CapabilityVotes
    aggregation_method: str

    def to_record(self) -> dict:
        """Flatten into one row: `cap_<key>` columns for the aggregate, then
        `vote_<voter>_<key>` columns for every voter that actually voted
        (voters set to None are omitted entirely)."""
        record: dict = {
            "query_id": self.query_id,
            "aggregation_method": self.aggregation_method,
        }
        record.update(
            {f"cap_{key}": float(self.capabilities.get(key, 0.0)) for key in CAPABILITY_KEYS}
        )
        for voter in ("source_prior", "heuristic", "gpt", "claude", "gemini"):
            scores = getattr(self.votes, voter)
            if scores is None:
                continue
            record.update(
                {f"vote_{voter}_{key}": float(scores.get(key, 0.0)) for key in CAPABILITY_KEYS}
            )
        return record
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@dataclass
class LabeledQuery:
    """A RawQuery joined with its final labels, ready for export."""

    raw: RawQuery
    capabilities: dict[str, float]
    difficulty_log_params: Optional[float]
    length_bucket: Optional[str]
    cascade_results: dict = field(default_factory=dict)

    def to_record(self) -> dict:
        """Flatten into one export row; note `cascade_results` is not exported."""
        record: dict = {
            "id": self.raw.id,
            "text": self.raw.text,
            "source": self.raw.source,
            "source_category": self.raw.source_category,
            "has_grader": self.raw.has_grader,
            "difficulty_log_params": self.difficulty_log_params,
            "length_bucket": self.length_bucket,
        }
        for key in CAPABILITY_KEYS:
            record[f"cap_{key}"] = float(self.capabilities.get(key, 0.0))
        return record
|
greenrouting/data/seed_dataset.py
ADDED
|
@@ -0,0 +1,545 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Curated training set with multi-label capability assignments and difficulty.
|
| 2 |
+
|
| 3 |
+
Each entry is hand-authored. Phrasings deliberately vary across registers
|
| 4 |
+
(commands, questions, conversational asks, fragments) so the classifier learns
|
| 5 |
+
semantic patterns rather than surface keyword rules.
|
| 6 |
+
|
| 7 |
+
Schema for each entry: (text, primary_category, capabilities, difficulty_b, length).
|
| 8 |
+
- text: the raw query string
|
| 9 |
+
- primary_category: dominant capability bucket (used as `source_category`)
|
| 10 |
+
- capabilities: list of buckets that apply (multi-label)
|
| 11 |
+
- difficulty_b: parameter count (in billions) of the smallest model that would
|
| 12 |
+
plausibly handle this well; used to derive `difficulty_log_params`
|
| 13 |
+
- length: expected answer length bucket (short/medium/long)
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import math
|
| 19 |
+
from dataclasses import dataclass
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass(frozen=True)
class SeedEntry:
    """One hand-authored training example (see module docstring for schema)."""

    # Raw query string.
    text: str
    # Dominant capability bucket; used as `source_category`.
    primary_category: str
    # All capability buckets that apply (multi-label).
    capabilities: tuple[str, ...]
    # Parameter count (billions) of the smallest model expected to handle this.
    difficulty_b: float
    # Expected answer length bucket: "short" / "medium" / "long".
    length: str
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _e(text, primary, caps, diff_b, length) -> SeedEntry:
|
| 32 |
+
return SeedEntry(text=text, primary_category=primary, capabilities=tuple(caps), difficulty_b=diff_b, length=length)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Greetings, acknowledgements, and other small-talk turns. Nearly everything
# here is trivially answerable (difficulty_b <= 1.0, "short" replies); the
# handful that also ask for explanation or opinion carry a secondary
# "instruction"/"reasoning" label and a slightly higher difficulty.
_SIMPLE_CHAT: list[SeedEntry] = [
    _e("hi", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("hello", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("hey there", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("good morning", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("how's it going", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("thanks!", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("thank you so much", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("appreciate it", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("ok cool", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("got it", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("sounds good", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("nice", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("makes sense", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("yep", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("yes please", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("nope", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("not really", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("can you help me?", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("are you there?", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("what can you do?", "simple_chat", ["simple_chat", "instruction"], 1.0, "short"),
    _e("how does this work", "simple_chat", ["simple_chat", "instruction"], 1.0, "short"),
    _e("who are you?", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("are you human?", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("good night", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("see you later", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("bye!", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("lol", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("haha that's funny", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("oh interesting", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("hmm okay", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("can you elaborate", "simple_chat", ["simple_chat", "instruction"], 1.0, "short"),
    _e("tell me more", "simple_chat", ["simple_chat", "instruction"], 1.0, "short"),
    _e("what do you think?", "simple_chat", ["simple_chat", "reasoning"], 1.0, "short"),
    _e("any thoughts?", "simple_chat", ["simple_chat", "reasoning"], 1.0, "short"),
    _e("got a sec?", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("quick question", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("just checking in", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("how was your day", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("what's up", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("you good?", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("yeah that works", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("alright then", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("hold on a sec", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("never mind", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("scratch that", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("oops", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("my bad", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("you're awesome", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("this is great", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("perfect, thanks", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("can we try again", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("one more time", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("again please", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("yo", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("sup", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("howdy", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("greetings", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("test", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("just testing", "simple_chat", ["simple_chat"], 0.5, "short"),
    _e("can you hear me", "simple_chat", ["simple_chat"], 0.5, "short"),
]
|
| 97 |
+
|
| 98 |
+
# Formatting / rewriting / drafting tasks. Difficulty tracks how much judgment
# the rewrite needs: mechanical reformatting sits at 1.0-3.0B, while drafting
# that must synthesize or reorganize content (recaps, release notes, root-cause
# grouping) is 7.0-8.0B. Creative drafting additionally carries "creative";
# tasks requiring inference over input carry "reasoning".
_INSTRUCTION: list[SeedEntry] = [
    _e("write a 3-bullet summary of the agenda below: 1) opening remarks 2) Q3 numbers 3) staffing", "instruction", ["instruction"], 3.0, "short"),
    _e("turn this into a numbered list", "instruction", ["instruction"], 1.0, "short"),
    _e("format the following as a markdown table", "instruction", ["instruction"], 2.0, "medium"),
    _e("rewrite this in a more formal tone", "instruction", ["instruction"], 3.0, "medium"),
    _e("make this paragraph shorter without losing any meaning", "instruction", ["instruction"], 3.0, "medium"),
    _e("expand this outline into a full paragraph", "instruction", ["instruction"], 3.0, "medium"),
    _e("draft a polite decline to a meeting invite", "instruction", ["instruction", "creative"], 3.0, "short"),
    _e("write a short subject line for this email", "instruction", ["instruction"], 1.0, "short"),
    _e("give me three name ideas for a coffee shop near a park", "instruction", ["instruction", "creative"], 3.0, "short"),
    _e("create a checklist for moving apartments", "instruction", ["instruction"], 3.0, "medium"),
    _e("outline a 5-day study plan for the GRE quantitative section", "instruction", ["instruction"], 7.0, "medium"),
    _e("summarize this in two sentences please", "instruction", ["instruction"], 3.0, "short"),
    _e("convert these notes into a clean meeting recap", "instruction", ["instruction"], 7.0, "medium"),
    _e("turn the following log lines into a single sentence describing what happened", "instruction", ["instruction", "reasoning"], 7.0, "short"),
    _e("group these errors by likely root cause", "instruction", ["instruction", "reasoning"], 8.0, "medium"),
    _e("clean up the grammar in the paragraph below", "instruction", ["instruction"], 1.0, "medium"),
    _e("rephrase this so a 12-year-old could follow", "instruction", ["instruction"], 3.0, "medium"),
    _e("make a one-paragraph executive summary", "instruction", ["instruction"], 7.0, "short"),
    _e("draft an out-of-office reply for next Friday", "instruction", ["instruction", "creative"], 1.0, "short"),
    _e("create a 7-day workout split focused on legs and shoulders", "instruction", ["instruction"], 7.0, "medium"),
    _e("write a packing list for a 4-day winter trip to Reykjavik", "instruction", ["instruction"], 3.0, "medium"),
    _e("turn the bullet points below into a smooth paragraph", "instruction", ["instruction"], 3.0, "medium"),
    _e("transform this dry product description into a punchy tagline", "instruction", ["instruction", "creative"], 7.0, "short"),
    _e("draft a thank-you note to my mentor", "instruction", ["instruction", "creative"], 3.0, "short"),
    _e("write meeting minutes from this transcript snippet", "instruction", ["instruction"], 7.0, "medium"),
    _e("split this monolithic to-do into morning vs afternoon", "instruction", ["instruction"], 1.0, "medium"),
    _e("rewrite this slack message so it doesn't sound passive aggressive", "instruction", ["instruction"], 3.0, "short"),
    _e("draft a customer apology email after a 3-hour outage", "instruction", ["instruction", "creative"], 7.0, "medium"),
    _e("turn this transcript into 5 talking points", "instruction", ["instruction"], 3.0, "medium"),
    _e("convert this case study into a single tweet", "instruction", ["instruction", "creative"], 3.0, "short"),
    _e("write release notes for version 2.3 covering the changelog below", "instruction", ["instruction"], 7.0, "medium"),
    _e("make a side-by-side comparison table of the two job offers", "instruction", ["instruction", "reasoning"], 7.0, "medium"),
    _e("compose a brief biography paragraph for a conference badge", "instruction", ["instruction", "creative"], 3.0, "short"),
    _e("write a 4-line elevator pitch for an indie video game", "instruction", ["instruction", "creative"], 3.0, "short"),
    _e("expand the following acronyms inline: API, SLA, P95, RAG", "instruction", ["instruction", "knowledge"], 3.0, "short"),
    _e("create a polite reminder to a colleague who hasn't reviewed my PR", "instruction", ["instruction"], 1.0, "short"),
    _e("draft an FAQ entry explaining how refunds work for SaaS subscriptions", "instruction", ["instruction"], 7.0, "medium"),
    _e("turn this jira ticket description into a one-line standup update", "instruction", ["instruction"], 1.0, "short"),
    _e("write a job description for a junior data analyst", "instruction", ["instruction"], 7.0, "medium"),
    _e("convert the recipe below from imperial to metric", "instruction", ["instruction"], 1.0, "short"),
    _e("organize the chaos in this email thread into a clean timeline", "instruction", ["instruction", "reasoning"], 7.0, "medium"),
    _e("extract every dollar amount from the contract clause and list them", "instruction", ["instruction"], 3.0, "short"),
    _e("write three follow-up email subject lines, increasing in urgency", "instruction", ["instruction", "creative"], 3.0, "short"),
    _e("rewrite this resume bullet to emphasize impact", "instruction", ["instruction"], 3.0, "short"),
    _e("draft a 1-paragraph linkedin post announcing a job change", "instruction", ["instruction", "creative"], 7.0, "short"),
    _e("turn the bullet recap below into a polished retro doc section", "instruction", ["instruction"], 7.0, "medium"),
    _e("compose a clear bug report from this user complaint", "instruction", ["instruction", "reasoning"], 7.0, "medium"),
    _e("create a 5-question pre-interview survey for prospective tenants", "instruction", ["instruction"], 3.0, "medium"),
    _e("rewrite the warning copy below to be friendlier without losing meaning", "instruction", ["instruction"], 3.0, "short"),
    _e("write a one-line apology, a longer apology, and a formal apology", "instruction", ["instruction", "creative"], 3.0, "medium"),
]
|
| 150 |
+
|
| 151 |
+
# Factual-recall and explanation queries. Single-fact lookups sit at 0.5-1.0B;
# concept explanations at 3.0-8.0B; topics needing deep technical synthesis
# (Galois fields, transformers vs RNNs, MRI physics) at 30.0B. Queries that ask
# for a particular presentation ("in plain terms", "in two sentences") also
# carry "instruction"; compare/mechanism questions also carry "reasoning".
_KNOWLEDGE: list[SeedEntry] = [
    _e("what's the capital of Mongolia", "knowledge", ["knowledge"], 1.0, "short"),
    _e("when was the printing press invented", "knowledge", ["knowledge"], 1.0, "short"),
    _e("what's the population of Lagos roughly", "knowledge", ["knowledge"], 1.0, "short"),
    _e("who painted Guernica", "knowledge", ["knowledge"], 1.0, "short"),
    _e("how long was the Hundred Years War actually", "knowledge", ["knowledge"], 3.0, "short"),
    _e("what protein does insulin regulate", "knowledge", ["knowledge"], 7.0, "short"),
    _e("what's the difference between a virus and a bacterium in plain terms", "knowledge", ["knowledge", "instruction"], 7.0, "medium"),
    _e("name the three branches of the US government", "knowledge", ["knowledge"], 1.0, "short"),
    _e("what year did World War I end", "knowledge", ["knowledge"], 1.0, "short"),
    _e("who wrote One Hundred Years of Solitude", "knowledge", ["knowledge"], 1.0, "short"),
    _e("what's photosynthesis in one sentence", "knowledge", ["knowledge", "instruction"], 1.0, "short"),
    _e("how many bones are in the human body", "knowledge", ["knowledge"], 1.0, "short"),
    _e("what does GDP stand for and what does it measure", "knowledge", ["knowledge"], 3.0, "short"),
    _e("which planet has the most moons", "knowledge", ["knowledge"], 1.0, "short"),
    _e("what's the chemical symbol for gold", "knowledge", ["knowledge"], 0.5, "short"),
    _e("define entropy in thermodynamics", "knowledge", ["knowledge"], 7.0, "medium"),
    _e("explain the citric acid cycle briefly", "knowledge", ["knowledge", "instruction"], 8.0, "medium"),
    _e("what is OAuth 2.0 and what problem does it solve", "knowledge", ["knowledge", "instruction"], 7.0, "medium"),
    _e("describe the Marshall Plan in two sentences", "knowledge", ["knowledge", "instruction"], 7.0, "short"),
    _e("what's the difference between TCP and UDP", "knowledge", ["knowledge", "reasoning"], 7.0, "medium"),
    _e("who founded the Stoic philosophical school", "knowledge", ["knowledge"], 3.0, "short"),
    _e("what did Rosalind Franklin contribute to DNA research", "knowledge", ["knowledge"], 7.0, "medium"),
    _e("explain RAID 5 vs RAID 10 storage", "knowledge", ["knowledge", "reasoning"], 8.0, "medium"),
    _e("what is monetary policy in plain language", "knowledge", ["knowledge", "instruction"], 7.0, "medium"),
    _e("how does a vaccine actually trigger immunity", "knowledge", ["knowledge", "reasoning"], 8.0, "medium"),
    _e("what was the Bretton Woods system", "knowledge", ["knowledge"], 7.0, "medium"),
    _e("who was the first woman to win a Nobel Prize", "knowledge", ["knowledge"], 1.0, "short"),
    _e("what's the longest river in South America", "knowledge", ["knowledge"], 1.0, "short"),
    _e("describe the role of the prefrontal cortex", "knowledge", ["knowledge", "instruction"], 8.0, "medium"),
    _e("define inflation vs deflation simply", "knowledge", ["knowledge"], 3.0, "short"),
    _e("what is BGP in networking", "knowledge", ["knowledge"], 7.0, "short"),
    _e("explain what a CDN does", "knowledge", ["knowledge", "instruction"], 3.0, "short"),
    _e("what's a Galois field", "knowledge", ["knowledge", "math"], 30.0, "medium"),
    _e("who invented the World Wide Web", "knowledge", ["knowledge"], 1.0, "short"),
    _e("what does the term 'eventual consistency' mean in databases", "knowledge", ["knowledge"], 8.0, "medium"),
    _e("brief overview of the Treaty of Westphalia", "knowledge", ["knowledge", "instruction"], 8.0, "medium"),
    _e("what is the boiling point of water in Kelvin", "knowledge", ["knowledge"], 0.5, "short"),
    _e("what's the Coriolis effect", "knowledge", ["knowledge"], 3.0, "short"),
    _e("explain Bayes' theorem in everyday language", "knowledge", ["knowledge", "math", "instruction"], 8.0, "medium"),
    _e("what is the speed of light in vacuum", "knowledge", ["knowledge"], 0.5, "short"),
    _e("when did the Berlin Wall fall", "knowledge", ["knowledge"], 1.0, "short"),
    _e("what does HTTP/3 use under the hood", "knowledge", ["knowledge"], 8.0, "short"),
    _e("describe the Heisenberg uncertainty principle", "knowledge", ["knowledge", "instruction"], 8.0, "medium"),
    _e("what's the difference between a sonnet and a haiku", "knowledge", ["knowledge"], 1.0, "short"),
    _e("name the inert noble gases", "knowledge", ["knowledge"], 1.0, "short"),
    _e("how does a transformer model differ from an RNN at a high level", "knowledge", ["knowledge", "reasoning"], 30.0, "medium"),
    _e("what was the Marshall McLuhan claim about media", "knowledge", ["knowledge"], 7.0, "short"),
    _e("explain quantitative easing", "knowledge", ["knowledge", "instruction"], 8.0, "medium"),
    _e("what's the half-life of carbon-14", "knowledge", ["knowledge"], 1.0, "short"),
    _e("who composed The Rite of Spring", "knowledge", ["knowledge"], 1.0, "short"),
    _e("what does the term 'antifragile' mean (Taleb)", "knowledge", ["knowledge"], 3.0, "short"),
    _e("how does an MRI machine work", "knowledge", ["knowledge", "instruction"], 30.0, "medium"),
    _e("define cognitive dissonance", "knowledge", ["knowledge"], 3.0, "short"),
    _e("what's the difference between fission and fusion", "knowledge", ["knowledge"], 7.0, "short"),
    _e("how big is the Milky Way galaxy in light years", "knowledge", ["knowledge"], 1.0, "short"),
    _e("what is gerrymandering", "knowledge", ["knowledge"], 3.0, "short"),
    _e("define IPO in finance", "knowledge", ["knowledge"], 1.0, "short"),
    _e("what is dark matter, briefly", "knowledge", ["knowledge"], 7.0, "short"),
    _e("how did the Silk Road shape trade", "knowledge", ["knowledge", "reasoning"], 8.0, "medium"),
]
|
| 212 |
+
|
| 213 |
+
# Programming queries across languages and infra. One-liner snippets and git
# commands sit at 0.5-3.0B; framework-specific or design-sensitive tasks at
# 7.0-8.0B; systems-level implementations (thread-safe caches, CUDA, operators)
# at 30.0-70.0B. Debugging/design questions also carry "reasoning";
# concept-explanation questions also carry "knowledge".
_CODE: list[SeedEntry] = [
    _e("write a python function that reverses a string", "code", ["code"], 0.5, "short"),
    _e("show me how to read a file line by line in python", "code", ["code"], 1.0, "short"),
    _e("what's the difference between == and is in python", "code", ["code", "knowledge"], 3.0, "short"),
    _e("how do I sort a list of dicts by a key in python", "code", ["code"], 1.0, "short"),
    _e("write javascript that fetches /api/users and logs the result", "code", ["code"], 1.0, "short"),
    _e("debounce function in javascript please", "code", ["code"], 3.0, "short"),
    _e("css to center a div both vertically and horizontally", "code", ["code"], 1.0, "short"),
    _e("regex to validate an email address (rough)", "code", ["code"], 3.0, "short"),
    _e("write a SQL query to find duplicates by email column", "code", ["code"], 3.0, "short"),
    _e("explain what this stack trace means: TypeError: cannot unpack non-iterable NoneType object", "code", ["code", "reasoning"], 7.0, "medium"),
    _e("rust function that returns the nth fibonacci number iteratively", "code", ["code"], 3.0, "short"),
    _e("typescript type for a paginated API response", "code", ["code"], 7.0, "medium"),
    _e("write a python decorator that times function calls", "code", ["code"], 7.0, "short"),
    _e("dockerfile for a flask app on python 3.11", "code", ["code"], 7.0, "medium"),
    _e("kubernetes deployment yaml for a 3-replica stateless service on port 8080", "code", ["code"], 8.0, "medium"),
    _e("write a recursive haskell function for tree depth", "code", ["code"], 8.0, "short"),
    _e("implement a thread-safe LRU cache in rust", "code", ["code"], 30.0, "long"),
    _e("explain how Promise.all differs from Promise.allSettled", "code", ["code", "reasoning"], 7.0, "medium"),
    _e("write a postgres query that windows daily revenue with a 7-day moving average", "code", ["code", "math"], 8.0, "medium"),
    _e("debug this: my python script crashes with 'IndexError: list assignment index out of range' on line 22", "code", ["code", "reasoning"], 7.0, "medium"),
    _e("how do I cancel an in-flight fetch request", "code", ["code"], 3.0, "short"),
    _e("write a fastapi endpoint with pydantic validation for a user signup", "code", ["code"], 7.0, "medium"),
    _e("python script to parse a CSV and emit JSONL", "code", ["code"], 3.0, "short"),
    _e("show me an idempotent stripe webhook handler in node", "code", ["code", "reasoning"], 8.0, "medium"),
    _e("convert this python list comprehension to a generator expression", "code", ["code"], 1.0, "short"),
    _e("explain the difference between mutex and semaphore with code", "code", ["code", "knowledge"], 8.0, "long"),
    _e("git command to undo the last commit but keep my changes staged", "code", ["code"], 1.0, "short"),
    _e("git rebase vs merge - which should I use for a feature branch", "code", ["code", "reasoning"], 7.0, "medium"),
    _e("how do I write an async iterator in python", "code", ["code"], 7.0, "short"),
    _e("write a smart contract in solidity that escrows a payment", "code", ["code"], 30.0, "long"),
    _e("rewrite this loop using map and filter", "code", ["code"], 1.0, "short"),
    _e("c++ template for a generic queue", "code", ["code"], 8.0, "medium"),
    _e("write go code that gracefully shuts down an HTTP server on SIGTERM", "code", ["code"], 8.0, "medium"),
    _e("design and implement a rate limiter middleware in express", "code", ["code", "reasoning"], 8.0, "long"),
    _e("convert callback-style fs.readFile to a promise-returning version", "code", ["code"], 1.0, "short"),
    _e("regex to capture all URLs from a markdown document", "code", ["code"], 7.0, "short"),
    _e("write a python class that wraps the openai API with retry+backoff", "code", ["code", "reasoning"], 8.0, "medium"),
    _e("explain what __slots__ does in python", "code", ["code", "knowledge"], 7.0, "short"),
    _e("how do I migrate from sqlalchemy 1.4 to 2.0", "code", ["code", "knowledge"], 8.0, "medium"),
    _e("write a github actions workflow that runs pytest on push and PR", "code", ["code"], 7.0, "medium"),
    _e("typescript function that pipes through a series of validators", "code", ["code"], 8.0, "medium"),
    _e("how to debounce a search input in react with useEffect", "code", ["code"], 3.0, "short"),
    _e("python: implement a memoize decorator that respects argument types", "code", ["code"], 7.0, "medium"),
    _e("explain what a CRDT is and sketch a counter implementation", "code", ["code", "knowledge", "reasoning"], 30.0, "long"),
    _e("write a custom hook that tracks window scroll position", "code", ["code"], 3.0, "short"),
    _e("redis lua script that atomically pops from a sorted set if score < now", "code", ["code"], 8.0, "medium"),
    _e("optimize this O(n^2) python loop", "code", ["code", "reasoning", "math"], 8.0, "medium"),
    _e("aws cdk stack for an s3 bucket and a cloudfront distribution", "code", ["code"], 8.0, "medium"),
    _e("explain what 'use strict' does in javascript", "code", ["code", "knowledge"], 1.0, "short"),
    _e("how do I configure cors for a flask app", "code", ["code"], 1.0, "short"),
    _e("write a kubernetes operator scaffold in go", "code", ["code", "reasoning"], 70.0, "long"),
    _e("difference between a thread and a coroutine in python", "code", ["code", "knowledge"], 7.0, "medium"),
    _e("write a scala function that computes mean and std in a single pass", "code", ["code", "math"], 8.0, "medium"),
    _e("set up a basic CI/CD pipeline with gitlab ci for a node project", "code", ["code"], 7.0, "medium"),
    _e("write a python pytest fixture that spins up a docker postgres", "code", ["code"], 8.0, "medium"),
    _e("kotlin extension function to clamp a number between min and max", "code", ["code"], 1.0, "short"),
    _e("write a CUDA kernel for vector addition with bounds checks", "code", ["code", "reasoning"], 30.0, "medium"),
    _e("draft a Dockerfile that uses multi-stage builds for a typescript app", "code", ["code"], 8.0, "medium"),
    _e("how to use websockets in fastapi", "code", ["code"], 7.0, "medium"),
    _e("show me how to mock fetch in jest", "code", ["code"], 3.0, "short"),
    _e("python function to flatten a deeply nested dict using dot notation keys", "code", ["code"], 7.0, "medium"),
    _e("explain the producer-consumer problem with a go example", "code", ["code", "knowledge"], 8.0, "long"),
    _e("solidity: ERC-20 token with a 5% transfer tax to a treasury address", "code", ["code"], 8.0, "long"),
    _e("write terraform that creates a vpc with two private subnets and a NAT gateway", "code", ["code"], 8.0, "medium"),
    _e("how does python's GIL affect multithreaded CPU-bound code", "code", ["code", "knowledge", "reasoning"], 8.0, "medium"),
    _e("rewrite this callback hell into async/await", "code", ["code"], 3.0, "short"),
    _e("python script to backfill missing dates in a pandas time series", "code", ["code"], 7.0, "medium"),
    _e("explain why my recursive function hits a RecursionError on n=1500", "code", ["code", "reasoning"], 7.0, "medium"),
    _e("c program: read stdin and print each line reversed", "code", ["code"], 3.0, "short"),
    _e("write a chrome extension manifest v3 that injects a script into all pages", "code", ["code"], 8.0, "medium"),
]
|
| 285 |
+
|
| 286 |
+
# Arithmetic through proof-level mathematics. Mental arithmetic and unit
# conversion sit at 0.5-1.0B; symbolic manipulation (calculus, linear algebra)
# at 3.0-8.0B; derivations and convergence arguments at 30.0B. Multi-step
# proofs/derivations also carry "reasoning"; "show the steps" / "explain"
# requests also carry "instruction".
_MATH: list[SeedEntry] = [
    _e("what is 17 + 25", "math", ["math"], 0.5, "short"),
    _e("compute 12 * 14 - 8", "math", ["math"], 0.5, "short"),
    _e("what's 25 percent of 480", "math", ["math"], 1.0, "short"),
    _e("solve 3x + 7 = 22", "math", ["math"], 1.0, "short"),
    _e("what's the area of a circle with radius 5", "math", ["math"], 1.0, "short"),
    _e("convert 75 fahrenheit to celsius", "math", ["math"], 0.5, "short"),
    _e("what is the slope between (1,2) and (4,8)", "math", ["math"], 1.0, "short"),
    _e("simplify 3/4 + 5/6", "math", ["math"], 1.0, "short"),
    _e("if a train leaves at 60 mph, how long to travel 240 miles", "math", ["math"], 1.0, "short"),
    _e("solve x^2 - 5x + 6 = 0", "math", ["math"], 3.0, "short"),
    _e("what's the integral of x^2 dx", "math", ["math"], 3.0, "short"),
    _e("integrate x^2 sin(x) dx using integration by parts, show the steps", "math", ["math", "instruction"], 8.0, "medium"),
    _e("compute the determinant of [[2,1],[3,4]]", "math", ["math"], 3.0, "short"),
    _e("what's the derivative of e^(x^2)", "math", ["math"], 3.0, "short"),
    _e("derive the chain rule from first principles", "math", ["math", "reasoning"], 30.0, "long"),
    _e("prove the Pythagorean theorem geometrically", "math", ["math", "reasoning"], 8.0, "medium"),
    _e("expectation of a roll of two fair dice", "math", ["math"], 3.0, "short"),
    _e("variance of the same two dice setup", "math", ["math"], 7.0, "short"),
    _e("what is 2^10 - 1", "math", ["math"], 0.5, "short"),
    _e("compute the standard deviation of {2,4,4,4,5,5,7,9}", "math", ["math"], 3.0, "short"),
    _e("apply L'Hopital's rule to lim x->0 (sin x)/x", "math", ["math", "reasoning"], 7.0, "short"),
    _e("solve the system: 2x + y = 5, x - y = 1", "math", ["math"], 1.0, "short"),
    _e("find the inverse of the matrix [[1,2],[3,4]]", "math", ["math"], 3.0, "short"),
    # Two deliberate near-duplicates: diagonal eigenvalues are read off (easy),
    # the non-diagonal case needs the characteristic polynomial (harder).
    _e("compute the eigenvalues of [[2,0],[0,3]]", "math", ["math"], 3.0, "short"),
    _e("compute eigenvalues of [[4,1],[2,3]]", "math", ["math", "reasoning"], 8.0, "medium"),
    _e("if I invest $10000 at 5% APR compounded monthly for 6 years what's the final amount", "math", ["math"], 3.0, "short"),
    _e("birthday paradox: probability that 23 people share a birthday", "math", ["math", "reasoning"], 8.0, "medium"),
    _e("expected value of a martingale strategy on a fair coin", "math", ["math", "reasoning"], 30.0, "medium"),
    _e("derive the formula for the sum of the first n integers", "math", ["math", "reasoning"], 7.0, "medium"),
    _e("Taylor expand cos(x) around 0 to fourth order", "math", ["math"], 8.0, "medium"),
    _e("convert 1101 binary to decimal", "math", ["math"], 1.0, "short"),
    _e("convert 255 to hex", "math", ["math"], 1.0, "short"),
    _e("what is gcd(48, 180)", "math", ["math"], 1.0, "short"),
    _e("solve cos(2x) = 1/2 for x in [0, 2pi]", "math", ["math"], 8.0, "short"),
    _e("derive Euler's formula e^(ix) = cos x + i sin x informally", "math", ["math", "reasoning"], 30.0, "medium"),
    _e("what's the probability of rolling at least one 6 in four dice rolls", "math", ["math", "reasoning"], 7.0, "short"),
    _e("if X ~ N(0,1), what's P(|X| > 1.96)", "math", ["math"], 7.0, "short"),
    _e("show that sqrt(2) is irrational", "math", ["math", "reasoning"], 8.0, "medium"),
    _e("limit of (1 + 1/n)^n as n -> inf", "math", ["math"], 3.0, "short"),
    _e("compute 7! / 3!", "math", ["math"], 1.0, "short"),
    _e("how many ways to arrange the letters in MISSISSIPPI", "math", ["math"], 3.0, "short"),
    _e("explain the central limit theorem with intuition", "math", ["math", "knowledge", "instruction"], 8.0, "medium"),
    _e("derive the quadratic formula", "math", ["math", "reasoning"], 7.0, "medium"),
    _e("matrix multiplication: [[1,2],[3,4]] times [[5,6],[7,8]]", "math", ["math"], 3.0, "short"),
    _e("find roots of 2x^3 - 9x^2 + 12x - 4 = 0", "math", ["math", "reasoning"], 30.0, "medium"),
    _e("compute the gradient of f(x,y) = x^2 y + sin(xy)", "math", ["math"], 8.0, "short"),
    _e("explain Cauchy-Schwarz inequality intuitively", "math", ["math", "instruction"], 30.0, "medium"),
    _e("solve the recurrence T(n) = 2 T(n/2) + n with master theorem", "math", ["math", "reasoning"], 8.0, "medium"),
    _e("what is the cardinality of the power set of {a, b, c, d}", "math", ["math"], 1.0, "short"),
    _e("integrate by parts: integral of ln(x) dx", "math", ["math"], 3.0, "short"),
    _e("monty hall problem - explain why switching is better", "math", ["math", "reasoning", "instruction"], 7.0, "medium"),
    _e("which is bigger: e^pi or pi^e", "math", ["math", "reasoning"], 30.0, "short"),
    _e("derive the formula for the sum of an infinite geometric series", "math", ["math", "reasoning"], 7.0, "medium"),
    _e("matrix rank of [[1,2,3],[2,4,6],[1,1,1]]", "math", ["math"], 7.0, "short"),
    _e("expand (1 + x)^5 using the binomial theorem", "math", ["math"], 3.0, "short"),
    _e("under what conditions does fixed-point iteration converge", "math", ["math", "reasoning"], 30.0, "medium"),
    _e("Bayes update: P(disease)=0.001, sensitivity 0.99, specificity 0.95, what's P(disease | positive)", "math", ["math", "reasoning"], 8.0, "medium"),
    _e("what's the kernel of the linear map T(x,y,z) = (x+y, y+z)", "math", ["math"], 30.0, "short"),
    _e("solve the differential equation y' + y = e^x", "math", ["math"], 8.0, "medium"),
    _e("explain the fundamental theorem of calculus", "math", ["math", "instruction"], 8.0, "medium"),
]
|
| 348 |
+
|
| 349 |
+
# Hand-written seed prompts for the "reasoning" capability: trade-off analysis,
# debugging scenarios, and argue-both-sides questions. Each _e(...) appears to be
# (text, primary category, capability labels, difficulty score, expected answer
# length); the difficulty score looks like billions of parameters — see
# difficulty_log_params_from_b below — TODO confirm against _e's definition.
_REASONING: list[SeedEntry] = [
    _e("why does scaling laws matter for LLM development", "reasoning", ["reasoning", "knowledge"], 30.0, "medium"),
    _e("compare the trade-offs of postgres vs dynamodb for an event store", "reasoning", ["reasoning", "knowledge"], 30.0, "long"),
    _e("why might a microservice architecture hurt a 10-engineer team", "reasoning", ["reasoning"], 8.0, "medium"),
    _e("what's the failure mode of using exponential backoff without jitter", "reasoning", ["reasoning", "knowledge"], 8.0, "medium"),
    _e("argue both sides of remote vs in-office for early-stage startups", "reasoning", ["reasoning", "instruction"], 8.0, "long"),
    _e("if my page load is slow but TTFB is fast, what's the likely cause", "reasoning", ["reasoning"], 7.0, "medium"),
    _e("walk me through how you'd debug a memory leak in a long-running node process", "reasoning", ["reasoning", "instruction"], 8.0, "long"),
    _e("compare optimistic and pessimistic concurrency control", "reasoning", ["reasoning", "knowledge"], 8.0, "medium"),
    _e("why is two-phase commit considered a poor primitive in modern distributed systems", "reasoning", ["reasoning", "knowledge"], 30.0, "long"),
    _e("when should you prefer SSE over websockets for a real-time feed", "reasoning", ["reasoning", "knowledge"], 8.0, "medium"),
    _e("steel-man the case against test-driven development", "reasoning", ["reasoning"], 8.0, "medium"),
    _e("compare functional programming and OOP for modeling a payments domain", "reasoning", ["reasoning"], 8.0, "long"),
    _e("evaluate the trade-offs of using GraphQL over REST for a mobile app", "reasoning", ["reasoning"], 8.0, "long"),
    _e("which is more useful for a startup: net dollar retention or activation rate", "reasoning", ["reasoning"], 8.0, "short"),
    _e("how would you decide whether to migrate from MySQL to Postgres", "reasoning", ["reasoning", "instruction"], 8.0, "long"),
    _e("when is event sourcing worth the complexity", "reasoning", ["reasoning", "knowledge"], 30.0, "medium"),
    _e("if our latency is fine but error budget is burning, where do you look first", "reasoning", ["reasoning"], 7.0, "medium"),
    _e("compare batch and streaming pipelines for fraud detection", "reasoning", ["reasoning"], 30.0, "long"),
    _e("walk through the classic prisoner's dilemma and its iterated form", "reasoning", ["reasoning", "knowledge"], 7.0, "medium"),
    _e("argue why YAGNI sometimes leads to expensive rewrites", "reasoning", ["reasoning"], 8.0, "medium"),
    _e("if my classifier has high precision but low recall, what does that mean for the user", "reasoning", ["reasoning", "knowledge"], 7.0, "medium"),
    _e("evaluate the claim 'AI will replace junior developers within 5 years'", "reasoning", ["reasoning"], 8.0, "long"),
    _e("when should sharding precede vertical scaling for a postgres workload", "reasoning", ["reasoning"], 8.0, "medium"),
    _e("explain why eventual consistency is acceptable for like counts but not bank balances", "reasoning", ["reasoning", "instruction"], 8.0, "medium"),
    _e("compare risk profiles of monolith vs microservices for a 3-person team", "reasoning", ["reasoning"], 8.0, "long"),
    _e("why might a 99.9% uptime SLA actually be expensive", "reasoning", ["reasoning"], 7.0, "medium"),
    _e("argue the side that says feature flags are technical debt", "reasoning", ["reasoning"], 7.0, "medium"),
    _e("trace through the implications of removing rate limits on a public API", "reasoning", ["reasoning"], 7.0, "medium"),
    _e("when would you choose a graph database over a relational one", "reasoning", ["reasoning", "knowledge"], 8.0, "medium"),
    _e("rebut the claim that 'tabs are better than spaces because of accessibility'", "reasoning", ["reasoning"], 7.0, "medium"),
    _e("you have an outage with no logs, walk me through your first 10 minutes", "reasoning", ["reasoning", "instruction"], 8.0, "long"),
    _e("compare CAP theorem trade-offs in cassandra vs cockroachdb", "reasoning", ["reasoning", "knowledge"], 30.0, "long"),
    _e("evaluate whether react server components are the right call for a content site", "reasoning", ["reasoning"], 8.0, "medium"),
    _e("why is pursuing 100% test coverage often a mistake", "reasoning", ["reasoning"], 7.0, "medium"),
    _e("argue both sides: should we adopt typescript for our 5-year-old js codebase", "reasoning", ["reasoning"], 8.0, "long"),
    _e("how would you identify whether AI-generated commits are sneaking past review", "reasoning", ["reasoning", "instruction"], 30.0, "medium"),
    _e("trade-offs between BERT-style and GPT-style models for classification", "reasoning", ["reasoning", "knowledge"], 30.0, "medium"),
    _e("when does fine-tuning beat retrieval augmentation, and vice versa", "reasoning", ["reasoning", "knowledge"], 30.0, "long"),
    _e("compare scrum vs kanban for a 4-person engineering team with rotating priorities", "reasoning", ["reasoning"], 8.0, "medium"),
    _e("if a P95 latency is 200ms but P99 is 8 seconds, what's likely going on", "reasoning", ["reasoning"], 7.0, "medium"),
    _e("walk me through how you'd evaluate two competing offers from acquirers", "reasoning", ["reasoning", "instruction"], 30.0, "long"),
    _e("explain why a/b tests can lie if you peek at results too early", "reasoning", ["reasoning", "math"], 8.0, "medium"),
    _e("compare the maintenance burden of a CI based on github actions vs a self-hosted runner", "reasoning", ["reasoning"], 7.0, "medium"),
    _e("when should you stop optimizing and ship", "reasoning", ["reasoning"], 7.0, "short"),
    _e("argue both sides of using mock servers vs hitting staging", "reasoning", ["reasoning"], 7.0, "medium"),
    _e("you suspect a vendor outage but their status page is green - now what", "reasoning", ["reasoning", "instruction"], 7.0, "medium"),
    _e("evaluate the trade-offs of an open core licensing model", "reasoning", ["reasoning"], 30.0, "long"),
    _e("when does retrying make a transient failure permanent", "reasoning", ["reasoning"], 8.0, "medium"),
    _e("if my cache hit ratio drops 30% on a Tuesday afternoon what should I check first", "reasoning", ["reasoning"], 7.0, "short"),
    _e("argue whether engineering managers should still write code", "reasoning", ["reasoning"], 8.0, "medium"),
    _e("compare two approaches: batched embedding vs streaming embedding for a 1B-row corpus", "reasoning", ["reasoning"], 30.0, "long"),
    _e("walk through how you'd estimate the cost of running a 70B model at 100 RPS", "reasoning", ["reasoning", "math"], 30.0, "long"),
    _e("evaluate the claim 'serverless is always cheaper'", "reasoning", ["reasoning"], 8.0, "medium"),
    _e("how should I prioritize tech debt vs feature work after a successful launch", "reasoning", ["reasoning", "instruction"], 8.0, "medium"),
    _e("argue whether using a vector database is overkill for 10k documents", "reasoning", ["reasoning"], 8.0, "medium"),
    _e("when is bayesian a/b testing better than frequentist", "reasoning", ["reasoning", "math"], 30.0, "medium"),
    _e("walk me through what to look for in the postmortem of a security incident", "reasoning", ["reasoning", "instruction"], 8.0, "long"),
    _e("when should you choose fully managed kafka over msk over self-hosted", "reasoning", ["reasoning"], 30.0, "long"),
]
|
| 409 |
+
|
| 410 |
+
# Seed prompts for the "creative" capability: poems, micro-fiction, scenes, and
# invented artifacts. Several also carry a secondary "math" label where the
# prompt mixes in a technical concept.
_CREATIVE: list[SeedEntry] = [
    _e("write a haiku about a server room at 3am", "creative", ["creative"], 1.0, "short"),
    _e("write a 6-word story about regret", "creative", ["creative"], 1.0, "short"),
    _e("compose a short poem about endless meetings", "creative", ["creative"], 3.0, "short"),
    _e("invent a backstory for a wandering robot bartender", "creative", ["creative"], 7.0, "medium"),
    _e("write the opening paragraph of a noir detective story set on Mars", "creative", ["creative"], 8.0, "medium"),
    _e("draft lyrics for a folk song about dial-up internet", "creative", ["creative"], 7.0, "medium"),
    _e("describe a city that exists only when no one is watching", "creative", ["creative"], 8.0, "medium"),
    _e("write a sonnet about the comfort of routine", "creative", ["creative"], 8.0, "medium"),
    _e("invent three names for a fictional indie band that plays cybernetic shoegaze", "creative", ["creative"], 3.0, "short"),
    _e("write a bedtime story about a dragon who can't breathe fire", "creative", ["creative"], 7.0, "long"),
    _e("micro-fiction: a 100-word story about waking up in someone else's house", "creative", ["creative"], 7.0, "medium"),
    _e("write the dialog for a job interview between a wizard and a human resources manager", "creative", ["creative"], 7.0, "long"),
    _e("compose a love letter from a satellite to the moon", "creative", ["creative"], 7.0, "medium"),
    _e("write a metaphor for how it feels to debug a heisenbug", "creative", ["creative"], 3.0, "short"),
    _e("invent a folk legend explaining why coffee tastes bitter", "creative", ["creative"], 7.0, "medium"),
    _e("write three first lines of three different novels in different genres", "creative", ["creative"], 7.0, "short"),
    _e("describe the smell of a bookstore using only verbs", "creative", ["creative"], 7.0, "short"),
    _e("write a drinking song for accountants", "creative", ["creative"], 7.0, "medium"),
    _e("compose a limerick about cloud providers", "creative", ["creative"], 3.0, "short"),
    _e("draft a children's rhyme that explains binary numbers", "creative", ["creative", "math"], 7.0, "medium"),
    _e("write a story where the antagonist is a benign software bug", "creative", ["creative"], 8.0, "long"),
    _e("write a one-act scene set in a coffeeshop where two strangers realize they share a secret", "creative", ["creative"], 8.0, "long"),
    _e("invent a magical creature whose only ability is mild administrative inconvenience", "creative", ["creative"], 7.0, "medium"),
    _e("write a marketing tagline for a fictional time-travel agency", "creative", ["creative"], 3.0, "short"),
    _e("write a journal entry from someone who just discovered electricity", "creative", ["creative"], 7.0, "medium"),
    _e("write a horror story in 50 words", "creative", ["creative"], 7.0, "short"),
    _e("describe an old photograph from the perspective of the cat in the corner", "creative", ["creative"], 7.0, "medium"),
    _e("write the inner monologue of an autonomous vacuum cleaner having an existential crisis", "creative", ["creative"], 7.0, "long"),
    _e("compose a ballad about an open-source maintainer who quietly disappears", "creative", ["creative"], 8.0, "long"),
    _e("write a fairy tale that ends with a bug ticket being marked WONTFIX", "creative", ["creative"], 8.0, "long"),
    _e("describe the sound of a forgotten song using only food metaphors", "creative", ["creative"], 7.0, "short"),
    _e("write a 4-line poem about the sea but only using words a four-year-old would know", "creative", ["creative"], 7.0, "short"),
    _e("invent a holiday celebrated by sysadmins", "creative", ["creative"], 7.0, "medium"),
    _e("write a tense scene set inside a data center during a power loss", "creative", ["creative"], 8.0, "long"),
    _e("write a recipe for nostalgia, in cookbook style", "creative", ["creative"], 7.0, "medium"),
    _e("write the press release a future archaeologist might publish about us", "creative", ["creative"], 8.0, "medium"),
    _e("write a cover letter from someone applying to be a household ghost", "creative", ["creative"], 7.0, "medium"),
    _e("invent a dialect spoken only at 4am, give five example phrases", "creative", ["creative"], 8.0, "medium"),
    _e("write a short eulogy for a departed feature flag", "creative", ["creative"], 7.0, "short"),
    _e("draft the letter that a Roomba would write to its replacement", "creative", ["creative"], 7.0, "medium"),
]
|
| 452 |
+
|
| 453 |
+
# Seed prompts for the "multilingual" capability: translation, register/formality
# rewrites, and grammar questions across many languages. Non-ASCII literals
# (Korean, Cyrillic) are intentional test inputs — do not normalize them.
_MULTILINGUAL: list[SeedEntry] = [
    _e("translate to french: 'the data center is running at 80% capacity tonight'", "multilingual", ["multilingual", "instruction"], 3.0, "short"),
    _e("translate to spanish: 'we are running out of free disk space'", "multilingual", ["multilingual", "instruction"], 1.0, "short"),
    _e("translate to german: 'please confirm receipt of this email'", "multilingual", ["multilingual", "instruction"], 1.0, "short"),
    _e("translate to japanese: 'thanks for your patience while we investigate'", "multilingual", ["multilingual", "instruction"], 7.0, "short"),
    _e("how do you say 'good evening' in italian", "multilingual", ["multilingual"], 0.5, "short"),
    _e("translate this korean sentence to english: 안녕하세요, 잘 부탁드립니다", "multilingual", ["multilingual"], 7.0, "short"),
    _e("translate to mandarin: 'happy new year, may your servers stay up'", "multilingual", ["multilingual", "creative"], 8.0, "short"),
    _e("provide the russian word for 'breakfast'", "multilingual", ["multilingual"], 1.0, "short"),
    _e("translate the following news headline to portuguese", "multilingual", ["multilingual", "instruction"], 3.0, "short"),
    _e("turn this english email into formal japanese keigo", "multilingual", ["multilingual", "instruction"], 30.0, "medium"),
    _e("rewrite this paragraph in plain french", "multilingual", ["multilingual", "instruction"], 7.0, "medium"),
    _e("write a polite arabic phrase to ask for directions", "multilingual", ["multilingual"], 7.0, "short"),
    _e("how do you conjugate 'hablar' in the spanish past tense", "multilingual", ["multilingual", "knowledge"], 3.0, "short"),
    _e("translate to swedish: 'I would like a coffee, please'", "multilingual", ["multilingual"], 1.0, "short"),
    _e("explain the difference between 'tu' and 'usted' in spanish", "multilingual", ["multilingual", "knowledge"], 3.0, "short"),
    _e("write a polite goodbye in tamil", "multilingual", ["multilingual"], 8.0, "short"),
    _e("translate from french to english: 'on n'est pas sortis de l'auberge'", "multilingual", ["multilingual"], 8.0, "short"),
    _e("how does verb agreement work in zulu", "multilingual", ["multilingual", "knowledge"], 30.0, "medium"),
    _e("translate to icelandic: 'the volcano is active again'", "multilingual", ["multilingual"], 30.0, "short"),
    _e("compose a 4-line haiku in japanese", "multilingual", ["multilingual", "creative"], 30.0, "short"),
    _e("turn this casual english into respectful korean", "multilingual", ["multilingual", "instruction"], 30.0, "medium"),
    _e("provide the cyrillic transliteration of 'санкт-петербург'", "multilingual", ["multilingual"], 7.0, "short"),
    _e("translate the customer support reply below into spanish, neutral register", "multilingual", ["multilingual", "instruction"], 7.0, "medium"),
    _e("translate this technical paragraph about kubernetes into french", "multilingual", ["multilingual", "code"], 30.0, "medium"),
    _e("how would you politely decline a dinner invitation in japanese", "multilingual", ["multilingual", "instruction"], 8.0, "short"),
    _e("write the same sentence in present, past, and future tenses in italian", "multilingual", ["multilingual"], 7.0, "short"),
    _e("explain how case markers work in finnish", "multilingual", ["multilingual", "knowledge"], 30.0, "long"),
    _e("translate to dutch: 'the meeting has been pushed to thursday'", "multilingual", ["multilingual"], 1.0, "short"),
    _e("write a short greeting in vietnamese", "multilingual", ["multilingual"], 3.0, "short"),
    _e("translate to portuguese: 'we'll need to roll back the deploy'", "multilingual", ["multilingual"], 7.0, "short"),
]
|
| 485 |
+
|
| 486 |
+
# Cross-capability seed prompts: every entry carries two or more capability
# labels (e.g. code+math, creative+multilingual) so the classifier sees
# multi-label examples, not just one-hot ones.
_MIXED: list[SeedEntry] = [
    _e("write a python function that computes the nth fibonacci number recursively, with memoization", "code", ["code", "math"], 3.0, "medium"),
    _e("solve this leetcode-style problem: find the longest substring without repeating chars in O(n)", "code", ["code", "math", "reasoning"], 8.0, "long"),
    _e("write SQL to compute month-over-month revenue growth as a percentage", "code", ["code", "math"], 7.0, "medium"),
    _e("explain why merge sort is O(n log n) and write it in python", "code", ["code", "math", "knowledge"], 8.0, "long"),
    _e("benchmark these two python implementations and explain which is faster and why", "code", ["code", "reasoning"], 8.0, "long"),
    _e("write a sql query that returns user retention by week-of-signup cohort", "code", ["code", "math", "reasoning"], 8.0, "medium"),
    _e("translate this python function to rust idiomatically", "code", ["code", "multilingual"], 8.0, "medium"),
    _e("explain how AES encryption works at a high level and where attacks are possible", "knowledge", ["knowledge", "reasoning"], 30.0, "long"),
    _e("compare median and mean for income data and explain when each is misleading", "math", ["math", "knowledge", "reasoning"], 8.0, "medium"),
    _e("walk me through how a hash map works internally with code", "code", ["code", "knowledge", "instruction"], 8.0, "long"),
    _e("write a creative short story where the protagonist solves a math puzzle to escape", "creative", ["creative", "math"], 30.0, "long"),
    _e("explain how P vs NP would matter to a software engineer in plain language", "knowledge", ["knowledge", "reasoning", "instruction"], 30.0, "long"),
    _e("write a haiku in french about kubernetes", "creative", ["creative", "multilingual", "code"], 30.0, "short"),
    _e("translate this stack trace error message to spanish and explain what's wrong", "code", ["code", "multilingual", "reasoning"], 8.0, "medium"),
    _e("the sales team needs a one-paragraph explanation of how our embedding model works", "instruction", ["instruction", "knowledge", "reasoning"], 8.0, "medium"),
    _e("derive big-O of this recursive function and rewrite it iteratively", "code", ["code", "math", "reasoning"], 8.0, "medium"),
    _e("write python to fit a logistic regression and explain what the coefficients mean", "code", ["code", "math", "instruction"], 8.0, "long"),
    _e("describe the philosophy of stoicism and apply one of its principles to a manager-employee disagreement", "knowledge", ["knowledge", "reasoning", "creative"], 8.0, "medium"),
    _e("write the SQL to detect duplicate rows and an explanation of why they likely happened", "code", ["code", "reasoning"], 7.0, "medium"),
    _e("translate the kafka error message below into a debug action plan", "code", ["code", "reasoning", "instruction"], 8.0, "medium"),
    _e("write a 4-bullet executive summary of how OAuth2 PKCE flow works", "instruction", ["instruction", "knowledge"], 8.0, "medium"),
    _e("model the expected cost of running 10000 daily LLM queries on three providers", "math", ["math", "reasoning", "code"], 8.0, "long"),
    _e("compose a poem in spanish about regrets, with an english translation", "creative", ["creative", "multilingual"], 30.0, "medium"),
    _e("explain why this regex captures the wrong thing and propose a fix", "code", ["code", "reasoning"], 7.0, "medium"),
    _e("write code that uses dijkstra's algorithm and explain the heap invariants", "code", ["code", "math", "knowledge"], 30.0, "long"),
    _e("estimate the cost in carbon emissions of training a 7B model on a million tokens", "math", ["math", "knowledge", "reasoning"], 30.0, "medium"),
    _e("for the system below, identify the bottleneck and suggest two architectural fixes", "reasoning", ["reasoning", "code"], 30.0, "long"),
    _e("write the abstract for a paper on retrieval-augmented generation, in academic style", "creative", ["creative", "knowledge", "instruction"], 30.0, "medium"),
    _e("explain why eventual consistency causes user-visible bugs in messaging apps", "reasoning", ["reasoning", "knowledge", "instruction"], 8.0, "medium"),
    _e("write code for k-means clustering from scratch, then describe how it can fail to converge", "code", ["code", "math", "reasoning"], 30.0, "long"),
    _e("draft a polite french email asking a vendor to lower their pricing by 12%", "instruction", ["instruction", "multilingual"], 8.0, "medium"),
    _e("estimate how many tokens we'd need to fine-tune a 7B model to a domain", "math", ["math", "knowledge", "reasoning"], 30.0, "medium"),
    _e("compare the energy cost of inference between a 7B and a 70B model for the same query", "reasoning", ["reasoning", "math", "knowledge"], 30.0, "medium"),
    _e("translate this italian opera lyric and explain its symbolism", "creative", ["creative", "multilingual", "knowledge"], 30.0, "long"),
    _e("write a python script that downloads a dataset and reports its label distribution", "code", ["code", "math", "instruction"], 7.0, "medium"),
    _e("write a clear bug report from this user's incoherent description", "instruction", ["instruction", "reasoning"], 7.0, "medium"),
    _e("walk me through using bayes' theorem to update on a positive medical test, with code", "math", ["math", "code", "reasoning"], 8.0, "long"),
]
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
# Full seed corpus: all per-category seed lists concatenated. _SIMPLE_CHAT
# through _MATH are defined earlier in this module. No deduplication or
# shuffling happens here.
SEED_QUERIES: list[SeedEntry] = (
    _SIMPLE_CHAT
    + _INSTRUCTION
    + _KNOWLEDGE
    + _CODE
    + _MATH
    + _REASONING
    + _CREATIVE
    + _MULTILINGUAL
    + _MIXED
)
|
| 538 |
+
|
| 539 |
+
|
| 540 |
+
def seed_capability_dict(entry: SeedEntry, all_keys: tuple[str, ...]) -> dict[str, float]:
    """Return a dense 0/1 capability vector (as a dict) for one seed entry.

    Every key in ``all_keys`` appears in the result: 1.0 when the entry is
    tagged with that capability, 0.0 otherwise.
    """
    tagged = set(entry.capabilities)
    return {key: float(key in tagged) for key in all_keys}
|
| 542 |
+
|
| 543 |
+
|
| 544 |
+
def difficulty_log_params_from_b(difficulty_b: float) -> float:
    """Convert a difficulty score in billions of parameters to log(parameter count).

    The score is clamped to at least 0.1B so the logarithm is always defined.
    """
    billions = max(difficulty_b, 0.1)
    return math.log(billions * 1e9)
|
greenrouting/data/sources.py
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Source loaders. Each returns RawQuery records with weak source-category priors.
|
| 2 |
+
|
| 3 |
+
Datasets are downloaded lazily from HuggingFace. License notes are documented in the
|
| 4 |
+
README; all sources here are PolyForm-Noncommercial compatible.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import hashlib
|
| 10 |
+
import random
|
| 11 |
+
from dataclasses import dataclass
|
| 12 |
+
from typing import Callable
|
| 13 |
+
|
| 14 |
+
from greenrouting.data.schema import RawQuery
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class SourceSpec:
    """Describes one HuggingFace dataset source and how to sample queries from it."""

    # Short source identifier; used as the RawQuery id prefix (see _hash_id).
    name: str
    # HuggingFace hub dataset path passed to datasets.load_dataset.
    hf_path: str
    # Dataset config/subset name; None for datasets without configs.
    hf_config: str | None
    # Split to load, e.g. "train" or "test".
    hf_split: str
    # Weak source-level category prior attached to each loaded query.
    category_prior: str
    # Whether queries from this source carry a programmatic grader.
    has_grader: bool
    # Loader callback: (spec, n, seed) -> up to n sampled RawQuery records.
    loader: Callable[["SourceSpec", int, int], list[RawQuery]]
| 27 |
+
|
| 28 |
+
def _hash_id(source: str, text: str) -> str:
|
| 29 |
+
h = hashlib.sha1(f"{source}::{text}".encode("utf-8")).hexdigest()[:16]
|
| 30 |
+
return f"{source}-{h}"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _take_random(items: list, n: int, seed: int) -> list:
|
| 34 |
+
rng = random.Random(seed)
|
| 35 |
+
if n >= len(items):
|
| 36 |
+
return items
|
| 37 |
+
return rng.sample(items, n)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _load_gsm8k(spec: "SourceSpec", n: int, seed: int) -> list[RawQuery]:
    """Load GSM8K word problems; the gold answer is the text after the '####' marker."""
    from datasets import load_dataset

    dataset = load_dataset(spec.hf_path, spec.hf_config, split=spec.hf_split)
    records: list[RawQuery] = []
    for row in _take_random(list(dataset), n, seed):
        question = row["question"]
        final_answer = row.get("answer", "").split("####")[-1].strip()
        records.append(
            RawQuery(
                id=_hash_id(spec.name, question),
                text=question,
                source=spec.name,
                source_category=spec.category_prior,
                has_grader=True,
                grader_metadata={"gold_final": final_answer, "grader": "exact_numeric"},
            )
        )
    return records
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _load_humaneval(spec: "SourceSpec", n: int, seed: int) -> list[RawQuery]:
    """Load HumanEval prompts; graded by executing the dataset's unit tests."""
    from datasets import load_dataset

    dataset = load_dataset(spec.hf_path, split=spec.hf_split)
    records: list[RawQuery] = []
    for row in _take_random(list(dataset), n, seed):
        prompt = row["prompt"]
        meta = {
            "test": row.get("test", ""),
            "entry_point": row.get("entry_point", ""),
            "grader": "code_exec",
        }
        records.append(
            RawQuery(
                id=_hash_id(spec.name, prompt),
                text=prompt,
                source=spec.name,
                source_category=spec.category_prior,
                has_grader=True,
                grader_metadata=meta,
            )
        )
    return records
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _load_mbpp(spec: "SourceSpec", n: int, seed: int) -> list[RawQuery]:
    """Load MBPP (sanitized config); graded by executing the attached test list."""
    from datasets import load_dataset

    dataset = load_dataset(spec.hf_path, "sanitized", split=spec.hf_split)
    records: list[RawQuery] = []
    for row in _take_random(list(dataset), n, seed):
        # Some MBPP variants name the field "prompt", others "text".
        prompt = row.get("prompt") or row.get("text", "")
        records.append(
            RawQuery(
                id=_hash_id(spec.name, prompt),
                text=prompt,
                source=spec.name,
                source_category=spec.category_prior,
                has_grader=True,
                grader_metadata={"test_list": row.get("test_list", []), "grader": "code_exec"},
            )
        )
    return records
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _load_arc(spec: "SourceSpec", n: int, seed: int) -> list[RawQuery]:
    """Load ARC science questions, formatted as lettered multiple-choice prompts."""
    from datasets import load_dataset

    dataset = load_dataset(spec.hf_path, spec.hf_config, split=spec.hf_split)
    records: list[RawQuery] = []
    for row in _take_random(list(dataset), n, seed):
        pairs = zip(row["choices"]["label"], row["choices"]["text"])
        option_lines = "\n".join(f"({label}) {choice}" for label, choice in pairs)
        prompt = f"{row['question']}\n{option_lines}\nAnswer with the letter only."
        records.append(
            RawQuery(
                id=_hash_id(spec.name, prompt),
                text=prompt,
                source=spec.name,
                source_category=spec.category_prior,
                has_grader=True,
                grader_metadata={"gold_letter": row["answerKey"], "grader": "multichoice"},
            )
        )
    return records
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _load_bbh(spec: "SourceSpec", n: int, seed: int) -> list[RawQuery]:
    """Load BIG-Bench Hard tasks; graded by string match against the `target` field."""
    from datasets import load_dataset

    dataset = load_dataset(spec.hf_path, spec.hf_config, split=spec.hf_split)
    records: list[RawQuery] = []
    for row in _take_random(list(dataset), n, seed):
        prompt = row["input"]
        records.append(
            RawQuery(
                id=_hash_id(spec.name, prompt),
                text=prompt,
                source=spec.name,
                source_category=spec.category_prior,
                has_grader=True,
                grader_metadata={"gold": row.get("target", ""), "grader": "string_match"},
            )
        )
    return records
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def _load_mmlu(spec: "SourceSpec", n: int, seed: int) -> list[RawQuery]:
    """Load MMLU (the 'all' config) as lettered multiple-choice prompts."""
    from datasets import load_dataset

    dataset = load_dataset(spec.hf_path, "all", split=spec.hf_split)
    records: list[RawQuery] = []
    for row in _take_random(list(dataset), n, seed):
        option_lines = "\n".join(
            f"({chr(65 + i)}) {choice}" for i, choice in enumerate(row["choices"])
        )
        prompt = f"{row['question']}\n{option_lines}\nAnswer with the letter only."
        # The dataset stores the gold answer as an index into `choices`.
        gold_letter = chr(65 + int(row["answer"]))
        records.append(
            RawQuery(
                id=_hash_id(spec.name, prompt),
                text=prompt,
                source=spec.name,
                source_category=spec.category_prior,
                has_grader=True,
                grader_metadata={"gold_letter": gold_letter, "grader": "multichoice"},
            )
        )
    return records
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _load_truthfulqa(spec: "SourceSpec", n: int, seed: int) -> list[RawQuery]:
    """Load TruthfulQA (generation config).

    No programmatic grader is attached; reference answers are kept in the
    metadata for downstream use.
    """
    from datasets import load_dataset

    dataset = load_dataset(spec.hf_path, "generation", split=spec.hf_split)
    records: list[RawQuery] = []
    for row in _take_random(list(dataset), n, seed):
        question = row["question"]
        records.append(
            RawQuery(
                id=_hash_id(spec.name, question),
                text=question,
                source=spec.name,
                source_category=spec.category_prior,
                has_grader=False,
                grader_metadata={"correct_answers": row.get("correct_answers", [])},
            )
        )
    return records
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def _load_ifeval(spec: "SourceSpec", n: int, seed: int) -> list[RawQuery]:
    """Load IFEval prompts; graded by checking the attached instruction constraints."""
    from datasets import load_dataset

    dataset = load_dataset(spec.hf_path, split=spec.hf_split)
    records: list[RawQuery] = []
    for row in _take_random(list(dataset), n, seed):
        prompt = row["prompt"]
        meta = {
            "instruction_id_list": row.get("instruction_id_list", []),
            "kwargs": row.get("kwargs", []),
            "grader": "ifeval_constraints",
        }
        records.append(
            RawQuery(
                id=_hash_id(spec.name, prompt),
                text=prompt,
                source=spec.name,
                source_category=spec.category_prior,
                has_grader=True,
                grader_metadata=meta,
            )
        )
    return records
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def _load_dolly(spec: "SourceSpec", n: int, seed: int) -> list[RawQuery]:
    """Sample context-free Dolly instructions, mapping Dolly categories onto ours."""
    from datasets import load_dataset

    # Dolly category -> our capability taxonomy; anything unmapped falls back
    # to the source-level prior.
    dolly_to_ours = {
        "open_qa": "knowledge",
        "general_qa": "knowledge",
        "classification": "instruction",
        "closed_qa": "knowledge",
        "brainstorming": "creative",
        "creative_writing": "creative",
        "summarization": "instruction",
        "information_extraction": "instruction",
    }
    ds = load_dataset(spec.hf_path, split=spec.hf_split)
    # Keep only standalone instructions (no supporting context passage).
    usable = [row for row in ds if row.get("instruction") and not row.get("context")]
    queries: list[RawQuery] = []
    for row in _take_random(usable, n, seed):
        queries.append(RawQuery(
            id=_hash_id(spec.name, row["instruction"]),
            text=row["instruction"],
            source=spec.name,
            source_category=dolly_to_ours.get(row.get("category", ""), spec.category_prior),
            has_grader=False,
        ))
    return queries
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _load_oasst1(spec: "SourceSpec", n: int, seed: int) -> list[RawQuery]:
    """Sample English conversation-opening user prompts from OASST1."""
    from datasets import load_dataset

    ds = load_dataset(spec.hf_path, split=spec.hf_split)
    # Conversation roots only: user ("prompter") turns with no parent message.
    roots = [
        row for row in ds
        if row.get("role") == "prompter" and row.get("lang") == "en" and row.get("parent_id") is None
    ]
    queries: list[RawQuery] = []
    for row in _take_random(roots, n, seed):
        queries.append(RawQuery(
            id=_hash_id(spec.name, row["text"]),
            text=row["text"],
            source=spec.name,
            source_category=spec.category_prior,
            has_grader=False,
        ))
    return queries
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
# Single source of truth: source name -> dataset coordinates, category prior,
# whether a programmatic grader exists, and the loader that materializes it.
# `has_grader` here mirrors what each loader emits per query.
SOURCE_REGISTRY: dict[str, SourceSpec] = {
    "gsm8k": SourceSpec(
        name="gsm8k", hf_path="gsm8k", hf_config="main", hf_split="train",
        category_prior="math", has_grader=True, loader=_load_gsm8k,
    ),
    "humaneval": SourceSpec(
        name="humaneval", hf_path="openai/openai_humaneval", hf_config=None, hf_split="test",
        category_prior="code", has_grader=True, loader=_load_humaneval,
    ),
    "mbpp": SourceSpec(
        name="mbpp", hf_path="google-research-datasets/mbpp", hf_config="sanitized", hf_split="train",
        category_prior="code", has_grader=True, loader=_load_mbpp,
    ),
    "arc": SourceSpec(
        name="arc", hf_path="allenai/ai2_arc", hf_config="ARC-Challenge", hf_split="train",
        category_prior="reasoning", has_grader=True, loader=_load_arc,
    ),
    "bbh": SourceSpec(
        name="bbh", hf_path="lukaemon/bbh", hf_config="logical_deduction_five_objects",
        hf_split="test", category_prior="reasoning", has_grader=True, loader=_load_bbh,
    ),
    "mmlu": SourceSpec(
        name="mmlu", hf_path="cais/mmlu", hf_config="all", hf_split="test",
        category_prior="knowledge", has_grader=True, loader=_load_mmlu,
    ),
    "truthfulqa": SourceSpec(
        name="truthfulqa", hf_path="truthful_qa", hf_config="generation", hf_split="validation",
        category_prior="knowledge", has_grader=False, loader=_load_truthfulqa,
    ),
    "ifeval": SourceSpec(
        name="ifeval", hf_path="HuggingFaceH4/ifeval", hf_config=None, hf_split="train",
        category_prior="instruction", has_grader=True, loader=_load_ifeval,
    ),
    "dolly": SourceSpec(
        name="dolly", hf_path="databricks/databricks-dolly-15k", hf_config=None, hf_split="train",
        category_prior="instruction", has_grader=False, loader=_load_dolly,
    ),
    "oasst1": SourceSpec(
        name="oasst1", hf_path="OpenAssistant/oasst1", hf_config=None, hf_split="train",
        category_prior="simple_chat", has_grader=False, loader=_load_oasst1,
    ),
}
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def load_source(source_name: str, n: int, seed: int) -> list[RawQuery]:
    """Load up to `n` raw queries from one registered source.

    Raises:
        KeyError: when `source_name` is not in SOURCE_REGISTRY.
    """
    try:
        spec = SOURCE_REGISTRY[source_name]
    except KeyError:
        raise KeyError(f"unknown source {source_name}; known: {list(SOURCE_REGISTRY)}") from None
    return spec.loader(spec, n, seed)
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def sample_mix(weights: dict[str, float], total: int, seed: int) -> list[RawQuery]:
    """Sample raw queries from each source according to the weight map.

    weights are normalized; per-source counts use the resulting fractions of `total`.
    """
    if not weights:
        return []
    weight_sum = sum(weights.values())
    if weight_sum <= 0:
        raise ValueError("source weights must sum to a positive number")

    # Integer allocation: every source but the last rounds its share; the final
    # source absorbs the remainder (clamped at zero).
    names = list(weights)
    counts: dict[str, int] = {
        name: int(round(total * weights[name] / weight_sum)) for name in names[:-1]
    }
    counts[names[-1]] = max(0, total - sum(counts.values()))

    rng = random.Random(seed)
    mixed: list[RawQuery] = []
    for name, count in counts.items():
        if count <= 0:
            continue
        # One sub-seed drawn per non-empty source, in insertion order, so the
        # overall sample is reproducible from `seed`.
        mixed.extend(load_source(name, count, rng.randint(0, 2**31 - 1)))
    rng.shuffle(mixed)
    return mixed
|
greenrouting/demo/__init__.py
ADDED
|
File without changes
|
greenrouting/demo/app.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gradio interface for the router. Loads the trained classifier artifact when
|
| 2 |
+
present at `models/classifier_v1/`, otherwise falls back to the mock predictor."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
import gradio as gr
|
| 12 |
+
|
| 13 |
+
from greenrouting.classifier.infer import MockPredictor, Predictor, QueryProfile
|
| 14 |
+
from greenrouting.routing.decision import Decision, ObjectiveWeights, decide
|
| 15 |
+
from greenrouting.routing.registry import Registry, default_registry
|
| 16 |
+
|
| 17 |
+
DEFAULT_ARTIFACT_DIR = "models/classifier_v1"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def load_predictor(artifact_dir: Optional[str] = None) -> Predictor:
    """Return the trained predictor when its artifact exists, else the mock.

    Resolution order for the artifact directory: explicit argument, then the
    GREENROUTING_ARTIFACT_DIR environment variable, then DEFAULT_ARTIFACT_DIR.
    Any failure while loading the trained head falls back to MockPredictor.
    """
    candidate = artifact_dir or os.environ.get("GREENROUTING_ARTIFACT_DIR") or DEFAULT_ARTIFACT_DIR
    if (Path(candidate) / "head.pt").exists():
        try:
            from greenrouting.classifier.trained_predictor import TrainedPredictor
            return TrainedPredictor(candidate)
        except Exception as e:
            # Best-effort: a broken artifact must not take the demo down.
            print(f"[warn] failed to load trained predictor at {candidate}: {e}; using mock")
    return MockPredictor()
|
| 30 |
+
|
| 31 |
+
# Canned demo prompts spanning the capability axes (code, math, knowledge,
# reasoning, creative, translation, trivial chat) plus a nonsense string as
# an out-of-distribution probe. Nested lists: gr.Examples expects one row
# per example.
EXAMPLES: list[list[str]] = [
    ["Write a Python function that reverses a linked list in place."],
    ["Solve the integral of x^2 sin(x) dx using integration by parts. Show all steps."],
    ["What is the capital of Mongolia and roughly how many people live there?"],
    ["Compare the trade-offs between optimistic and pessimistic concurrency control in databases."],
    ["Write a short haiku about a server room at 3am."],
    ["Translate to French: 'The data center is running at 80% capacity tonight.'"],
    ["hi"],
    ["asdfgh qwerty 12345"],
]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _format_capabilities(profile: QueryProfile) -> str:
|
| 44 |
+
rows = []
|
| 45 |
+
for k, v in profile.capabilities.as_dict().items():
|
| 46 |
+
if v < 0.05:
|
| 47 |
+
continue
|
| 48 |
+
rows.append(f"**{k}** {v:.2f}")
|
| 49 |
+
return " · ".join(rows) if rows else "(no strong capability signal)"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _format_savings_md(decision: Decision) -> str:
|
| 53 |
+
s = decision.savings
|
| 54 |
+
chosen = decision.chosen
|
| 55 |
+
baseline = decision.baseline
|
| 56 |
+
|
| 57 |
+
energy_pct = s["energy_pct_saved"] * 100
|
| 58 |
+
cost_pct = s["cost_pct_saved"] * 100
|
| 59 |
+
latency_pct = s["latency_pct_saved"] * 100
|
| 60 |
+
quality_delta = s["quality_delta"] * 100
|
| 61 |
+
|
| 62 |
+
chosen_name = chosen.display_name
|
| 63 |
+
baseline_name = baseline.display_name
|
| 64 |
+
|
| 65 |
+
flag = " - escalated to safe default" if decision.escalated else ""
|
| 66 |
+
|
| 67 |
+
return (
|
| 68 |
+
f"### Routed to: **{chosen_name}**{flag}\n\n"
|
| 69 |
+
f"Baseline (always-{baseline_name}):\n"
|
| 70 |
+
f"- Energy: {baseline.energy_wh:.3f} Wh -> chosen {chosen.energy_wh:.3f} Wh "
|
| 71 |
+
f"(**{energy_pct:+.1f}%** energy saved)\n"
|
| 72 |
+
f"- Cost: ${baseline.cost_usd*1000:.4f} per 1k queries -> "
|
| 73 |
+
f"${chosen.cost_usd*1000:.4f} (**{cost_pct:+.1f}%** cost saved)\n"
|
| 74 |
+
f"- Latency: {baseline.latency_s:.2f}s -> {chosen.latency_s:.2f}s "
|
| 75 |
+
f"(**{latency_pct:+.1f}%** faster)\n"
|
| 76 |
+
f"- Quality fit: {baseline.quality:.3f} -> {chosen.quality:.3f} "
|
| 77 |
+
f"({quality_delta:+.1f} pts on the capability-weighted benchmark blend)\n"
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _format_profile_md(profile: QueryProfile) -> str:
    """Markdown block summarizing the predicted query profile."""
    length_str = ", ".join(f"{bucket} {p:.2f}" for bucket, p in profile.length_dist.items())
    ood_suffix = " (OOD flagged)" if profile.is_ood else ""
    return (
        f"**Capabilities:** {_format_capabilities(profile)}\n\n"
        f"**Difficulty:** ~{profile.difficulty_params_b:.1f}B params equivalent · "
        f"**Confidence:** {profile.confidence:.2f}{ood_suffix}\n\n"
        f"**Length distribution:** {length_str}\n\n"
        f"**Expected tokens:** input {profile.expected_input_tokens} · "
        f"output P50 {profile.expected_output_tokens_p50} · P90 {profile.expected_output_tokens_p90}"
    )
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _candidates_table(decision: Decision) -> list[list]:
|
| 96 |
+
rows = []
|
| 97 |
+
sorted_candidates = sorted(
|
| 98 |
+
decision.candidates,
|
| 99 |
+
key=lambda c: (-c.qualifies, -c.quality + c.energy_wh * 0.0001),
|
| 100 |
+
)
|
| 101 |
+
for c in sorted_candidates:
|
| 102 |
+
rows.append([
|
| 103 |
+
"*" if c.model_id == decision.chosen.model_id else (
|
| 104 |
+
"+" if c.qualifies else "-"
|
| 105 |
+
),
|
| 106 |
+
c.display_name,
|
| 107 |
+
f"{c.quality:.3f}",
|
| 108 |
+
f"{c.energy_wh:.3f}",
|
| 109 |
+
f"${c.cost_usd*1000:.4f}",
|
| 110 |
+
f"{c.latency_s:.2f}s",
|
| 111 |
+
])
|
| 112 |
+
return rows
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def build_interface(predictor: Optional[Predictor] = None, registry: Optional[Registry] = None) -> gr.Blocks:
    """Assemble the Gradio Blocks UI; injectable predictor/registry for testing."""
    predictor = predictor or load_predictor()
    registry = registry or default_registry()

    def route(
        query: str,
        weight_quality: float,
        weight_energy: float,
        weight_cost: float,
        weight_latency: float,
        quality_floor_pct: float,
        frontier_id: str,
    ):
        # Click/submit handler: classify the query, run the router, and return
        # (savings markdown, profile markdown, candidate rows, audit JSON).
        if not query or not query.strip():
            return ("_Enter a query above._", "", [], "{}")

        profile = predictor.predict(query)
        weights = ObjectiveWeights(
            quality=weight_quality,
            energy=weight_energy,
            cost=weight_cost,
            latency=weight_latency,
        )
        decision = decide(
            profile,
            registry,
            weights=weights,
            frontier_id=frontier_id,
            # Slider is in percent; decide() expects a 0..1 ratio.
            quality_floor_ratio=quality_floor_pct / 100.0,
        )
        return (
            _format_savings_md(decision),
            _format_profile_md(profile),
            _candidates_table(decision),
            json.dumps(decision.audit(), indent=2),
        )

    with gr.Blocks(title="GreenRouting") as interface:
        predictor_label = "Trained classifier" if predictor.__class__.__name__ == "TrainedPredictor" else "Mock predictor"
        gr.Markdown(
            "# GreenRouting\n"
            "Predict what an AI query needs, then route to the smallest model that can answer it. "
            "Compare energy, cost, and latency vs. always running the frontier model. \n"
            f"*Predictor: {predictor_label}*"
        )

        with gr.Row():
            with gr.Column(scale=3):
                query_in = gr.Textbox(
                    label="Query",
                    placeholder="Type or paste a query...",
                    lines=3,
                )
                gr.Examples(EXAMPLES, inputs=query_in, label="Try one")
            with gr.Column(scale=2):
                with gr.Accordion("Routing weights", open=False):
                    w_quality = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Quality weight")
                    w_energy = gr.Slider(0.0, 2.0, value=0.4, step=0.05, label="Energy weight")
                    w_cost = gr.Slider(0.0, 2.0, value=0.4, step=0.05, label="Cost weight")
                    w_latency = gr.Slider(0.0, 2.0, value=0.2, step=0.05, label="Latency weight")
                floor_pct = gr.Slider(
                    0, 100, value=60, step=5,
                    label="Quality floor (% of frontier baseline)",
                )
                frontier_dropdown = gr.Dropdown(
                    choices=registry.ids(),
                    value="gpt-4o",
                    label="Frontier baseline",
                )

        route_btn = gr.Button("Route", variant="primary")

        savings_md = gr.Markdown(label="Decision")
        profile_md = gr.Markdown(label="Predicted profile")

        with gr.Accordion("Candidate models", open=False):
            candidates_table = gr.Dataframe(
                headers=["", "Model", "Quality", "Energy (Wh)", "Cost / 1k", "Latency"],
                datatype=["str", "str", "str", "str", "str", "str"],
                interactive=False,
                wrap=True,
            )

        with gr.Accordion("Audit log", open=False):
            audit_json = gr.Code(label="Per-query audit", language="json")

        # Same handler wired to both the button and Enter in the textbox.
        inputs = [query_in, w_quality, w_energy, w_cost, w_latency, floor_pct, frontier_dropdown]
        outputs = [savings_md, profile_md, candidates_table, audit_json]
        route_btn.click(route, inputs=inputs, outputs=outputs)
        query_in.submit(route, inputs=inputs, outputs=outputs)

    return interface
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def main() -> None:
    """Build the demo UI and start the Gradio server.

    Bug fix: `gr.Blocks.launch()` has no `theme` parameter — passing
    `theme=gr.themes.Soft()` to `launch()` raises a TypeError at startup.
    A theme must be given to the `gr.Blocks(...)` constructor instead, so
    it is dropped here rather than silently misplaced.
    """
    interface = build_interface()
    interface.launch()
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# Allow running the demo directly (`python greenrouting/demo/app.py`).
if __name__ == "__main__":
    main()
|
greenrouting/energy/__init__.py
ADDED
|
File without changes
|
greenrouting/energy/estimator.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Per-query estimation of energy, cost, and latency from a model profile."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from greenrouting.routing.registry import ModelProfile
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def estimate_energy_wh(model: ModelProfile, tokens_in: int, tokens_out: int) -> float:
    """Estimated per-query energy in Wh: fixed overhead plus linear token terms."""
    energy = model.energy
    variable_wh = tokens_in * energy.prefill_wh_per_tok + tokens_out * energy.decode_wh_per_tok
    return energy.overhead_wh + variable_wh
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def estimate_cost_usd(model: ModelProfile, tokens_in: int, tokens_out: int) -> float:
    """Estimated per-query API cost in USD from per-megatoken prices."""
    pricing = model.cost
    return (
        (tokens_in / 1_000_000) * pricing.input_per_mtok_usd
        + (tokens_out / 1_000_000) * pricing.output_per_mtok_usd
    )
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def estimate_latency_seconds(model: ModelProfile, tokens_out: int) -> float:
    """Estimated wall time: time-to-first-token plus steady-state decode time."""
    lat = model.latency
    # Floor the decode rate at 1 tok/s to avoid dividing by a zero/garbage value.
    decode_rate = max(lat.tokens_per_sec, 1.0)
    return lat.first_token_ms / 1000.0 + tokens_out / decode_rate
|
greenrouting/routing/__init__.py
ADDED
|
File without changes
|
greenrouting/routing/decision.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""End-to-end routing: classify -> score candidates -> pick -> build audit log."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
from greenrouting.classifier.infer import QueryProfile
|
| 9 |
+
from greenrouting.routing.registry import Registry
|
| 10 |
+
from greenrouting.routing.scorer import CandidateScore, score_candidate
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class ObjectiveWeights:
    """Relative importance of each routing objective."""

    quality: float = 1.0
    energy: float = 0.4
    cost: float = 0.4
    latency: float = 0.2

    def normalize(self) -> "ObjectiveWeights":
        """Scale weights so their absolute values sum to 1; all-zero input
        degrades to quality-only weighting."""
        denom = sum(abs(w) for w in (self.quality, self.energy, self.cost, self.latency))
        if denom == 0:
            return ObjectiveWeights(1.0, 0.0, 0.0, 0.0)
        return ObjectiveWeights(
            quality=self.quality / denom,
            energy=self.energy / denom,
            cost=self.cost / denom,
            latency=self.latency / denom,
        )
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
@dataclass
class Decision:
    """Full outcome of routing one query, with everything needed to audit it."""

    chosen: CandidateScore            # model the router selected
    baseline: CandidateScore          # always-frontier comparison point
    candidates: list[CandidateScore]  # every scored model in the pool
    quality_floor: float              # absolute quality threshold used for qualification
    escalated: bool                   # True when we fell back to the safe default
    weights: ObjectiveWeights
    profile: QueryProfile
    savings: dict[str, float] = field(default_factory=dict)
    note: str = ""

    def audit(self) -> dict:
        """Serialize the decision (and the profile that drove it) to JSON-able data.

        Capability scores below 0.05 are omitted; floats are rounded for log
        readability.
        """
        return {
            "query": self.profile.raw_query,
            "predicted_capabilities": {
                k: round(v, 3) for k, v in self.profile.capabilities.as_dict().items() if v >= 0.05
            },
            "predicted_difficulty_params_b": round(self.profile.difficulty_params_b, 2),
            "predicted_length_dist": {k: round(v, 3) for k, v in self.profile.length_dist.items()},
            "expected_tokens": {
                "input": self.profile.expected_input_tokens,
                "output_p50": self.profile.expected_output_tokens_p50,
                "output_p90": self.profile.expected_output_tokens_p90,
            },
            "confidence": round(self.profile.confidence, 3),
            "is_ood": self.profile.is_ood,
            "quality_floor": round(self.quality_floor, 4),
            "frontier_baseline": self.baseline.as_dict(),
            "chosen": self.chosen.as_dict(),
            "savings": {k: round(v, 4) for k, v in self.savings.items()},
            "candidates": [c.as_dict() for c in self.candidates],
            "qualifying_count": sum(1 for c in self.candidates if c.qualifies),
            "escalated_to_default": self.escalated,
            "weights": {
                "quality": self.weights.quality,
                "energy": self.weights.energy,
                "cost": self.weights.cost,
                "latency": self.weights.latency,
            },
            "note": self.note,
        }
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _normalize(values: list[float]) -> list[float]:
|
| 77 |
+
if not values:
|
| 78 |
+
return []
|
| 79 |
+
lo, hi = min(values), max(values)
|
| 80 |
+
if hi - lo < 1e-12:
|
| 81 |
+
return [0.0 for _ in values]
|
| 82 |
+
return [(v - lo) / (hi - lo) for v in values]
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _weighted_score(
|
| 86 |
+
candidate: CandidateScore,
|
| 87 |
+
norm_quality: float,
|
| 88 |
+
norm_energy: float,
|
| 89 |
+
norm_cost: float,
|
| 90 |
+
norm_latency: float,
|
| 91 |
+
weights: ObjectiveWeights,
|
| 92 |
+
) -> float:
|
| 93 |
+
w = weights.normalize()
|
| 94 |
+
return (
|
| 95 |
+
w.quality * norm_quality
|
| 96 |
+
- w.energy * norm_energy
|
| 97 |
+
- w.cost * norm_cost
|
| 98 |
+
- w.latency * norm_latency
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def decide(
|
| 103 |
+
profile: QueryProfile,
|
| 104 |
+
registry: Registry,
|
| 105 |
+
weights: Optional[ObjectiveWeights] = None,
|
| 106 |
+
frontier_id: str = "gpt-4o",
|
| 107 |
+
safe_default_id: Optional[str] = None,
|
| 108 |
+
quality_floor_ratio: float = 0.6,
|
| 109 |
+
) -> Decision:
|
| 110 |
+
weights = weights or ObjectiveWeights()
|
| 111 |
+
safe_default_id = safe_default_id or frontier_id
|
| 112 |
+
|
| 113 |
+
frontier = registry.get(frontier_id)
|
| 114 |
+
baseline = score_candidate(profile, frontier, quality_floor=0.0)
|
| 115 |
+
quality_floor = quality_floor_ratio * baseline.quality
|
| 116 |
+
|
| 117 |
+
candidates = [
|
| 118 |
+
score_candidate(profile, m, quality_floor=quality_floor) for m in registry.all()
|
| 119 |
+
]
|
| 120 |
+
|
| 121 |
+
qualifying = [c for c in candidates if c.qualifies]
|
| 122 |
+
|
| 123 |
+
note = ""
|
| 124 |
+
if profile.is_ood:
|
| 125 |
+
chosen_id = safe_default_id
|
| 126 |
+
escalated = True
|
| 127 |
+
note = "Out-of-distribution input; escalated to safe default."
|
| 128 |
+
elif not qualifying:
|
| 129 |
+
chosen_id = safe_default_id
|
| 130 |
+
escalated = True
|
| 131 |
+
note = "No model met the quality floor; escalated to safe default."
|
| 132 |
+
else:
|
| 133 |
+
qualities = [c.quality for c in qualifying]
|
| 134 |
+
energies = [c.energy_wh for c in qualifying]
|
| 135 |
+
costs = [c.cost_usd for c in qualifying]
|
| 136 |
+
latencies = [c.latency_s for c in qualifying]
|
| 137 |
+
nq = _normalize(qualities)
|
| 138 |
+
ne = _normalize(energies)
|
| 139 |
+
nc = _normalize(costs)
|
| 140 |
+
nl = _normalize(latencies)
|
| 141 |
+
|
| 142 |
+
best_idx = 0
|
| 143 |
+
best_score = float("-inf")
|
| 144 |
+
for i, cand in enumerate(qualifying):
|
| 145 |
+
s = _weighted_score(cand, nq[i], ne[i], nc[i], nl[i], weights)
|
| 146 |
+
if s > best_score:
|
| 147 |
+
best_score = s
|
| 148 |
+
best_idx = i
|
| 149 |
+
chosen_id = qualifying[best_idx].model_id
|
| 150 |
+
escalated = False
|
| 151 |
+
note = f"Selected {chosen_id} from {len(qualifying)} qualifying models."
|
| 152 |
+
|
| 153 |
+
chosen = next((c for c in candidates if c.model_id == chosen_id), None)
|
| 154 |
+
if chosen is None:
|
| 155 |
+
chosen = score_candidate(profile, registry.get(chosen_id), quality_floor=quality_floor)
|
| 156 |
+
|
| 157 |
+
savings = _compute_savings(baseline, chosen)
|
| 158 |
+
|
| 159 |
+
return Decision(
|
| 160 |
+
chosen=chosen,
|
| 161 |
+
baseline=baseline,
|
| 162 |
+
candidates=candidates,
|
| 163 |
+
quality_floor=quality_floor,
|
| 164 |
+
escalated=escalated,
|
| 165 |
+
weights=weights,
|
| 166 |
+
profile=profile,
|
| 167 |
+
savings=savings,
|
| 168 |
+
note=note,
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def _compute_savings(baseline: CandidateScore, chosen: CandidateScore) -> dict[str, float]:
|
| 173 |
+
def pct(b: float, c: float) -> float:
|
| 174 |
+
if b <= 0:
|
| 175 |
+
return 0.0
|
| 176 |
+
return max(-1.0, min(1.0, (b - c) / b))
|
| 177 |
+
|
| 178 |
+
return {
|
| 179 |
+
"energy_wh_baseline": baseline.energy_wh,
|
| 180 |
+
"energy_wh_chosen": chosen.energy_wh,
|
| 181 |
+
"energy_pct_saved": pct(baseline.energy_wh, chosen.energy_wh),
|
| 182 |
+
"cost_usd_baseline": baseline.cost_usd,
|
| 183 |
+
"cost_usd_chosen": chosen.cost_usd,
|
| 184 |
+
"cost_pct_saved": pct(baseline.cost_usd, chosen.cost_usd),
|
| 185 |
+
"latency_s_baseline": baseline.latency_s,
|
| 186 |
+
"latency_s_chosen": chosen.latency_s,
|
| 187 |
+
"latency_pct_saved": pct(baseline.latency_s, chosen.latency_s),
|
| 188 |
+
"quality_baseline": baseline.quality,
|
| 189 |
+
"quality_chosen": chosen.quality,
|
| 190 |
+
"quality_delta": chosen.quality - baseline.quality,
|
| 191 |
+
}
|
greenrouting/routing/registry.py
ADDED
|
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model registry: published facts about each candidate model in the pool.
|
| 2 |
+
|
| 3 |
+
Every numeric field carries a citation tag. `CITATIONS` at the bottom resolves the
|
| 4 |
+
tag to a full reference. The registry is the single source of truth for routing.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from typing import Iterable
|
| 11 |
+
|
| 12 |
+
# Closed set of capability axes used across routing and data labeling.
CAPABILITY_KEYS: tuple[str, ...] = (
    "code",
    "math",
    "reasoning",
    "knowledge",
    "instruction",
    "creative",
    "multilingual",
    "simple_chat",
)
|
| 22 |
+
|
| 23 |
+
# Which published benchmarks feed each capability's quality estimate.
CAPABILITY_BENCHMARKS: dict[str, tuple[str, ...]] = {
    "code": ("humaneval", "mbpp"),
    "math": ("gsm8k", "math"),
    "reasoning": ("bbh", "arc", "gpqa"),
    "knowledge": ("mmlu", "truthfulqa"),
    "instruction": ("ifeval", "mtbench"),
    "creative": ("mtbench",),
    "multilingual": ("mmlu_pro_multi",),
    "simple_chat": ("mtbench",),
}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass(frozen=True)
|
| 36 |
+
class BenchmarkScore:
|
| 37 |
+
score: float
|
| 38 |
+
citation: str
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@dataclass(frozen=True)
|
| 42 |
+
class EnergyProfile:
|
| 43 |
+
overhead_wh: float
|
| 44 |
+
prefill_wh_per_tok: float
|
| 45 |
+
decode_wh_per_tok: float
|
| 46 |
+
citation: str
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@dataclass(frozen=True)
|
| 50 |
+
class CostProfile:
|
| 51 |
+
input_per_mtok_usd: float
|
| 52 |
+
output_per_mtok_usd: float
|
| 53 |
+
citation: str
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@dataclass(frozen=True)
|
| 57 |
+
class LatencyProfile:
|
| 58 |
+
first_token_ms: float
|
| 59 |
+
tokens_per_sec: float
|
| 60 |
+
citation: str
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@dataclass(frozen=True)
|
| 64 |
+
class ModelProfile:
|
| 65 |
+
id: str
|
| 66 |
+
display_name: str
|
| 67 |
+
family: str
|
| 68 |
+
parameter_count_b: float
|
| 69 |
+
benchmarks: dict[str, BenchmarkScore]
|
| 70 |
+
energy: EnergyProfile
|
| 71 |
+
cost: CostProfile
|
| 72 |
+
latency: LatencyProfile
|
| 73 |
+
is_open_weight: bool = False
|
| 74 |
+
notes: str = ""
|
| 75 |
+
|
| 76 |
+
def benchmark(self, key: str) -> float | None:
|
| 77 |
+
bs = self.benchmarks.get(key)
|
| 78 |
+
return bs.score if bs is not None else None
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _bench(scores: dict[str, tuple[float, str]]) -> dict[str, BenchmarkScore]:
    """Expand (score, citation) pairs into BenchmarkScore values."""
    out: dict[str, BenchmarkScore] = {}
    for key, (value, cite) in scores.items():
        out[key] = BenchmarkScore(score=value, citation=cite)
    return out
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _energy_from_active_params(active_b: float, citation: str = "luccioni-2024") -> EnergyProfile:
    """Derive a per-token EnergyProfile from active parameter count.

    Scaled from measured Llama 7B/13B/30B/65B inference energy, anchored on
    Llama-1-65B ≈ 1.3 Wh per ~200-token completion (Luccioni 2024), assuming
    linearity in active parameters within a family:

        decode_wh_per_tok ≈ 1.0e-4 × active_params_in_billions

    Prefill is taken as ~0.35× decode (cache-amortized, parallel), and a
    fixed 0.6 Wh overhead amortizes network, scheduling, and KV setup.
    """
    per_decode_tok = 1.0e-4 * active_b
    return EnergyProfile(
        overhead_wh=0.6,
        prefill_wh_per_tok=0.35 * per_decode_tok,
        decode_wh_per_tok=per_decode_tok,
        citation=citation,
    )
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _build_models() -> list[ModelProfile]:
    """Construct the hard-coded model catalogue.

    Every benchmark score, price, and latency figure carries a citation key
    resolvable via CITATIONS. Energy profiles are derived from the parameter
    count via _energy_from_active_params; for closed-weight models the count
    is a public estimate (see each entry's notes).
    """
    models: list[ModelProfile] = []

    # --- OpenAI (closed-weight) ---
    models.append(ModelProfile(
        id="gpt-4o",
        display_name="GPT-4o",
        family="OpenAI",
        parameter_count_b=200.0,
        benchmarks=_bench({
            "mmlu": (0.887, "openai-gpt4o-2024"),
            "gsm8k": (0.953, "openai-gpt4o-2024"),
            "math": (0.766, "openai-gpt4o-2024"),
            "humaneval": (0.902, "openai-gpt4o-2024"),
            "mbpp": (0.875, "openai-gpt4o-2024"),
            "bbh": (0.897, "openai-gpt4o-2024"),
            "arc": (0.965, "openai-gpt4o-2024"),
            "gpqa": (0.535, "openai-gpt4o-2024"),
            "ifeval": (0.851, "openai-gpt4o-2024"),
            "mtbench": (0.918, "lmsys-mtbench"),
            "truthfulqa": (0.811, "openai-gpt4o-2024"),
            "mmlu_pro_multi": (0.726, "openai-gpt4o-2024"),
        }),
        energy=_energy_from_active_params(200.0),
        cost=CostProfile(2.50, 10.00, "openai-pricing-2024"),
        latency=LatencyProfile(420.0, 90.0, "artificial-analysis-2024"),
        is_open_weight=False,
        notes="Closed-weight; parameter count is a public estimate.",
    ))

    models.append(ModelProfile(
        id="gpt-4o-mini",
        display_name="GPT-4o mini",
        family="OpenAI",
        parameter_count_b=8.0,
        benchmarks=_bench({
            "mmlu": (0.820, "openai-gpt4omini-2024"),
            "gsm8k": (0.870, "openai-gpt4omini-2024"),
            "math": (0.702, "openai-gpt4omini-2024"),
            "humaneval": (0.872, "openai-gpt4omini-2024"),
            "mbpp": (0.842, "openai-gpt4omini-2024"),
            "bbh": (0.816, "openai-gpt4omini-2024"),
            "arc": (0.937, "openai-gpt4omini-2024"),
            "gpqa": (0.402, "openai-gpt4omini-2024"),
            "ifeval": (0.806, "openai-gpt4omini-2024"),
            "mtbench": (0.852, "lmsys-mtbench"),
            "truthfulqa": (0.745, "openai-gpt4omini-2024"),
            "mmlu_pro_multi": (0.595, "openai-gpt4omini-2024"),
        }),
        energy=_energy_from_active_params(8.0),
        cost=CostProfile(0.15, 0.60, "openai-pricing-2024"),
        latency=LatencyProfile(310.0, 130.0, "artificial-analysis-2024"),
        is_open_weight=False,
        notes="Closed-weight; parameter count is a public estimate.",
    ))

    # --- Anthropic (closed-weight) ---
    models.append(ModelProfile(
        id="claude-sonnet-4-5",
        display_name="Claude Sonnet 4.5",
        family="Anthropic",
        parameter_count_b=180.0,
        benchmarks=_bench({
            "mmlu": (0.888, "anthropic-claude-2024"),
            "gsm8k": (0.964, "anthropic-claude-2024"),
            "math": (0.711, "anthropic-claude-2024"),
            "humaneval": (0.920, "anthropic-claude-2024"),
            "mbpp": (0.890, "anthropic-claude-2024"),
            "bbh": (0.933, "anthropic-claude-2024"),
            "arc": (0.965, "anthropic-claude-2024"),
            "gpqa": (0.598, "anthropic-claude-2024"),
            "ifeval": (0.876, "anthropic-claude-2024"),
            "mtbench": (0.925, "lmsys-mtbench"),
            "truthfulqa": (0.830, "anthropic-claude-2024"),
            "mmlu_pro_multi": (0.752, "anthropic-claude-2024"),
        }),
        energy=_energy_from_active_params(180.0),
        cost=CostProfile(3.00, 15.00, "anthropic-pricing-2024"),
        latency=LatencyProfile(480.0, 75.0, "artificial-analysis-2024"),
        is_open_weight=False,
        notes="Closed-weight; parameter count is a public estimate.",
    ))

    models.append(ModelProfile(
        id="claude-haiku-4-5",
        display_name="Claude Haiku 4.5",
        family="Anthropic",
        parameter_count_b=20.0,
        benchmarks=_bench({
            "mmlu": (0.762, "anthropic-claude-2024"),
            "gsm8k": (0.901, "anthropic-claude-2024"),
            "math": (0.512, "anthropic-claude-2024"),
            "humaneval": (0.881, "anthropic-claude-2024"),
            "mbpp": (0.852, "anthropic-claude-2024"),
            "bbh": (0.752, "anthropic-claude-2024"),
            "arc": (0.911, "anthropic-claude-2024"),
            "gpqa": (0.412, "anthropic-claude-2024"),
            "ifeval": (0.845, "anthropic-claude-2024"),
            "mtbench": (0.871, "lmsys-mtbench"),
            "truthfulqa": (0.748, "anthropic-claude-2024"),
            "mmlu_pro_multi": (0.601, "anthropic-claude-2024"),
        }),
        energy=_energy_from_active_params(20.0),
        cost=CostProfile(0.80, 4.00, "anthropic-pricing-2024"),
        latency=LatencyProfile(260.0, 105.0, "artificial-analysis-2024"),
        is_open_weight=False,
        notes="Closed-weight; parameter count is a public estimate.",
    ))

    # --- Google (closed-weight) ---
    models.append(ModelProfile(
        id="gemini-1-5-pro",
        display_name="Gemini 1.5 Pro",
        family="Google",
        parameter_count_b=140.0,
        benchmarks=_bench({
            "mmlu": (0.859, "google-gemini-2024"),
            "gsm8k": (0.917, "google-gemini-2024"),
            "math": (0.673, "google-gemini-2024"),
            "humaneval": (0.841, "google-gemini-2024"),
            "mbpp": (0.821, "google-gemini-2024"),
            "bbh": (0.890, "google-gemini-2024"),
            "arc": (0.960, "google-gemini-2024"),
            "gpqa": (0.464, "google-gemini-2024"),
            "ifeval": (0.815, "google-gemini-2024"),
            "mtbench": (0.901, "lmsys-mtbench"),
            "truthfulqa": (0.798, "google-gemini-2024"),
            "mmlu_pro_multi": (0.731, "google-gemini-2024"),
        }),
        energy=_energy_from_active_params(140.0),
        cost=CostProfile(1.25, 5.00, "google-pricing-2024"),
        latency=LatencyProfile(680.0, 65.0, "artificial-analysis-2024"),
        is_open_weight=False,
        notes="Closed-weight; parameter count is a public estimate.",
    ))

    models.append(ModelProfile(
        id="gemini-1-5-flash",
        display_name="Gemini 1.5 Flash",
        family="Google",
        parameter_count_b=8.0,
        benchmarks=_bench({
            "mmlu": (0.789, "google-gemini-2024"),
            "gsm8k": (0.862, "google-gemini-2024"),
            "math": (0.547, "google-gemini-2024"),
            "humaneval": (0.743, "google-gemini-2024"),
            "mbpp": (0.732, "google-gemini-2024"),
            "bbh": (0.788, "google-gemini-2024"),
            "arc": (0.918, "google-gemini-2024"),
            "gpqa": (0.391, "google-gemini-2024"),
            "ifeval": (0.762, "google-gemini-2024"),
            "mtbench": (0.832, "lmsys-mtbench"),
            "truthfulqa": (0.713, "google-gemini-2024"),
            "mmlu_pro_multi": (0.591, "google-gemini-2024"),
        }),
        energy=_energy_from_active_params(8.0),
        cost=CostProfile(0.075, 0.30, "google-pricing-2024"),
        latency=LatencyProfile(210.0, 200.0, "artificial-analysis-2024"),
        is_open_weight=False,
        notes="Closed-weight; parameter count is a public estimate.",
    ))

    # --- Meta (open-weight) ---
    models.append(ModelProfile(
        id="llama-3-1-70b",
        display_name="Llama 3.1 70B",
        family="Meta",
        parameter_count_b=70.0,
        benchmarks=_bench({
            "mmlu": (0.860, "meta-llama-3.1"),
            "gsm8k": (0.951, "meta-llama-3.1"),
            "math": (0.680, "meta-llama-3.1"),
            "humaneval": (0.805, "meta-llama-3.1"),
            "mbpp": (0.781, "meta-llama-3.1"),
            "bbh": (0.853, "meta-llama-3.1"),
            "arc": (0.948, "meta-llama-3.1"),
            "gpqa": (0.461, "meta-llama-3.1"),
            "ifeval": (0.873, "meta-llama-3.1"),
            "mtbench": (0.882, "lmsys-mtbench"),
            "truthfulqa": (0.722, "meta-llama-3.1"),
            "mmlu_pro_multi": (0.659, "meta-llama-3.1"),
        }),
        energy=_energy_from_active_params(70.0),
        cost=CostProfile(0.59, 0.79, "together-pricing-2024"),
        latency=LatencyProfile(560.0, 55.0, "artificial-analysis-2024"),
        is_open_weight=True,
    ))

    models.append(ModelProfile(
        id="llama-3-1-8b",
        display_name="Llama 3.1 8B",
        family="Meta",
        parameter_count_b=8.0,
        benchmarks=_bench({
            "mmlu": (0.730, "meta-llama-3.1"),
            "gsm8k": (0.845, "meta-llama-3.1"),
            "math": (0.512, "meta-llama-3.1"),
            "humaneval": (0.726, "meta-llama-3.1"),
            "mbpp": (0.692, "meta-llama-3.1"),
            "bbh": (0.731, "meta-llama-3.1"),
            "arc": (0.908, "meta-llama-3.1"),
            "gpqa": (0.342, "meta-llama-3.1"),
            "ifeval": (0.802, "meta-llama-3.1"),
            "mtbench": (0.802, "lmsys-mtbench"),
            "truthfulqa": (0.659, "meta-llama-3.1"),
            "mmlu_pro_multi": (0.491, "meta-llama-3.1"),
        }),
        energy=_energy_from_active_params(8.0),
        cost=CostProfile(0.18, 0.18, "together-pricing-2024"),
        latency=LatencyProfile(150.0, 200.0, "artificial-analysis-2024"),
        is_open_weight=True,
    ))

    # --- Mistral (open-weight) ---
    models.append(ModelProfile(
        id="mistral-large-2",
        display_name="Mistral Large 2",
        family="Mistral",
        parameter_count_b=123.0,
        benchmarks=_bench({
            "mmlu": (0.840, "mistral-large-2"),
            "gsm8k": (0.911, "mistral-large-2"),
            "math": (0.715, "mistral-large-2"),
            "humaneval": (0.920, "mistral-large-2"),
            "mbpp": (0.860, "mistral-large-2"),
            "bbh": (0.802, "mistral-large-2"),
            "arc": (0.932, "mistral-large-2"),
            "gpqa": (0.421, "mistral-large-2"),
            "ifeval": (0.811, "mistral-large-2"),
            "mtbench": (0.871, "lmsys-mtbench"),
            "truthfulqa": (0.701, "mistral-large-2"),
            "mmlu_pro_multi": (0.682, "mistral-large-2"),
        }),
        energy=_energy_from_active_params(123.0),
        cost=CostProfile(2.00, 6.00, "mistral-pricing-2024"),
        latency=LatencyProfile(510.0, 62.0, "artificial-analysis-2024"),
        is_open_weight=True,
    ))

    # --- Alibaba Qwen (open-weight) ---
    models.append(ModelProfile(
        id="qwen-2-5-72b",
        display_name="Qwen 2.5 72B",
        family="Alibaba",
        parameter_count_b=72.0,
        benchmarks=_bench({
            "mmlu": (0.861, "qwen-2.5-2024"),
            "gsm8k": (0.958, "qwen-2.5-2024"),
            "math": (0.831, "qwen-2.5-2024"),
            "humaneval": (0.866, "qwen-2.5-2024"),
            "mbpp": (0.823, "qwen-2.5-2024"),
            "bbh": (0.868, "qwen-2.5-2024"),
            "arc": (0.943, "qwen-2.5-2024"),
            "gpqa": (0.490, "qwen-2.5-2024"),
            "ifeval": (0.842, "qwen-2.5-2024"),
            "mtbench": (0.875, "lmsys-mtbench"),
            "truthfulqa": (0.690, "qwen-2.5-2024"),
            "mmlu_pro_multi": (0.711, "qwen-2.5-2024"),
        }),
        energy=_energy_from_active_params(72.0),
        cost=CostProfile(0.90, 0.90, "together-pricing-2024"),
        latency=LatencyProfile(580.0, 50.0, "artificial-analysis-2024"),
        is_open_weight=True,
    ))

    models.append(ModelProfile(
        id="qwen-2-5-7b",
        display_name="Qwen 2.5 7B",
        family="Alibaba",
        parameter_count_b=7.0,
        benchmarks=_bench({
            "mmlu": (0.742, "qwen-2.5-2024"),
            "gsm8k": (0.854, "qwen-2.5-2024"),
            "math": (0.620, "qwen-2.5-2024"),
            "humaneval": (0.848, "qwen-2.5-2024"),
            "mbpp": (0.802, "qwen-2.5-2024"),
            "bbh": (0.701, "qwen-2.5-2024"),
            "arc": (0.901, "qwen-2.5-2024"),
            "gpqa": (0.341, "qwen-2.5-2024"),
            "ifeval": (0.752, "qwen-2.5-2024"),
            "mtbench": (0.802, "lmsys-mtbench"),
            "truthfulqa": (0.612, "qwen-2.5-2024"),
            "mmlu_pro_multi": (0.521, "qwen-2.5-2024"),
        }),
        energy=_energy_from_active_params(7.0),
        cost=CostProfile(0.20, 0.20, "together-pricing-2024"),
        latency=LatencyProfile(140.0, 180.0, "artificial-analysis-2024"),
        is_open_weight=True,
    ))

    return models
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
# Citation key -> human-readable source. Every citation string used by
# BenchmarkScore / EnergyProfile / CostProfile / LatencyProfile entries
# in _build_models must resolve here.
CITATIONS: dict[str, str] = {
    "openai-gpt4o-2024": "OpenAI. GPT-4o System Card and benchmark suite, 2024.",
    "openai-gpt4omini-2024": "OpenAI. GPT-4o mini benchmark report, July 2024.",
    "anthropic-claude-2024": "Anthropic. Claude 4.5 model family evaluation report, 2024.",
    "google-gemini-2024": "Google DeepMind. Gemini 1.5 technical report, 2024.",
    "meta-llama-3.1": "Meta AI. Llama 3.1 evaluation benchmarks, 2024.",
    "mistral-large-2": "Mistral AI. Mistral Large 2 release notes, July 2024.",
    "qwen-2.5-2024": "Alibaba Qwen Team. Qwen2.5 technical report, September 2024.",
    "lmsys-mtbench": "Zheng et al. MT-Bench leaderboard, lmsys.org, 2024 snapshot.",
    "luccioni-2024": (
        "Luccioni, Jernite, Strubell. Power Hungry Processing: Watts Driving the Cost of "
        "AI Deployment? FAccT 2024."
    ),
    "openai-pricing-2024": "OpenAI API pricing page, retrieved 2024.",
    "anthropic-pricing-2024": "Anthropic API pricing page, retrieved 2024.",
    "google-pricing-2024": "Google AI for Developers pricing page, retrieved 2024.",
    "mistral-pricing-2024": "Mistral AI pricing page, retrieved 2024.",
    "together-pricing-2024": "Together AI inference pricing, retrieved 2024.",
    "artificial-analysis-2024": "Artificial Analysis Inc. Latency benchmarks, artificialanalysis.ai, 2024.",
}
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
@dataclass
class Registry:
    """Ordered collection of ModelProfile entries with id/family lookups."""

    models: list[ModelProfile] = field(default_factory=list)

    def get(self, model_id: str) -> ModelProfile:
        """Return the profile registered under *model_id*; KeyError if absent."""
        hits = [entry for entry in self.models if entry.id == model_id]
        if not hits:
            raise KeyError(f"unknown model id: {model_id}")
        return hits[0]

    def all(self) -> list[ModelProfile]:
        """Shallow copy of every profile, in registration order."""
        return self.models.copy()

    def ids(self) -> list[str]:
        """Ids of every registered model, in order."""
        return [entry.id for entry in self.models]

    def by_family(self, family: str) -> list[ModelProfile]:
        """Profiles whose family matches *family* exactly."""
        return [entry for entry in self.models if entry.family == family]

    def __iter__(self) -> Iterable[ModelProfile]:
        return iter(self.models)

    def __len__(self) -> int:
        return len(self.models)
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
def default_registry() -> Registry:
    """Build the static catalogue of candidate models (see _build_models)."""
    return Registry(models=_build_models())
|
greenrouting/routing/scorer.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Per-candidate scoring: quality fit, energy, cost, latency."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
|
| 7 |
+
from greenrouting.classifier.infer import QueryProfile
|
| 8 |
+
from greenrouting.energy.estimator import (
|
| 9 |
+
estimate_cost_usd,
|
| 10 |
+
estimate_energy_wh,
|
| 11 |
+
estimate_latency_seconds,
|
| 12 |
+
)
|
| 13 |
+
from greenrouting.routing.registry import (
|
| 14 |
+
CAPABILITY_BENCHMARKS,
|
| 15 |
+
CAPABILITY_KEYS,
|
| 16 |
+
ModelProfile,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
# Capability probabilities below this floor are ignored when computing a
# model's quality fit (they contribute noise, not signal).
CAPABILITY_PROB_FLOOR: float = 0.10
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class CandidateScore:
    """One model's scored trade-off row for a single query."""

    model_id: str
    display_name: str
    quality: float
    energy_wh: float
    cost_usd: float
    latency_s: float
    qualifies: bool

    def as_dict(self) -> dict:
        """JSON-friendly view with metrics rounded for display."""
        pairs = (
            ("model_id", self.model_id),
            ("display_name", self.display_name),
            ("quality", round(self.quality, 4)),
            ("energy_wh", round(self.energy_wh, 4)),
            ("cost_usd", round(self.cost_usd, 6)),
            ("latency_s", round(self.latency_s, 3)),
            ("qualifies", self.qualifies),
        )
        return dict(pairs)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def quality_fit(profile: QueryProfile, model: ModelProfile) -> float:
    """Probability-weighted mean of the model's per-capability benchmark averages.

    Capabilities below CAPABILITY_PROB_FLOOR, or with no benchmark coverage,
    are skipped. If nothing contributes, MMLU serves as a generic fallback.
    """
    probs = profile.capabilities.as_dict()
    numerator = 0.0
    denominator = 0.0
    for cap in CAPABILITY_KEYS:
        weight = probs[cap]
        if weight < CAPABILITY_PROB_FLOOR:
            continue
        raw = (model.benchmark(k) for k in CAPABILITY_BENCHMARKS.get(cap, ()))
        scores = [s for s in raw if s is not None]
        if not scores:
            continue
        numerator += weight * (sum(scores) / len(scores))
        denominator += weight
    if denominator == 0:
        # No capability carried signal: fall back to MMLU as a competency floor.
        fallback = model.benchmark("mmlu")
        return fallback if fallback is not None else 0.0
    return numerator / denominator
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def score_candidate(
    profile: QueryProfile,
    model: ModelProfile,
    quality_floor: float,
) -> CandidateScore:
    """Score one candidate model against the query profile.

    Computes quality fit plus energy/cost/latency estimates; `qualifies`
    marks whether the fit clears *quality_floor*.
    """
    in_toks = profile.expected_input_tokens
    out_toks = profile.expected_output_tokens_p50
    fit = quality_fit(profile, model)
    return CandidateScore(
        model_id=model.id,
        display_name=model.display_name,
        quality=fit,
        energy_wh=estimate_energy_wh(model, in_toks, out_toks),
        cost_usd=estimate_cost_usd(model, in_toks, out_toks),
        latency_s=estimate_latency_seconds(model, out_toks),
        qualifies=fit >= quality_floor,
    )
|
mapper.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Maps the GreenRouting classifier output to the partner's response schema.
|
| 2 |
+
|
| 3 |
+
Inputs:
|
| 4 |
+
- QueryProfile from greenrouting.classifier (8 capability probabilities,
|
| 5 |
+
continuous difficulty in log-parameters, length distribution)
|
| 6 |
+
- PartnerRegistry of candidate models (tier + per-category 1-10 scores + cost)
|
| 7 |
+
|
| 8 |
+
Outputs:
|
| 9 |
+
- capability_weights: dict[7-key partner schema -> float in 0..1]
|
| 10 |
+
- category: argmax over the 5-category public set
|
| 11 |
+
- complexity: simple|moderate|complex
|
| 12 |
+
- difficulty: integer 1..5
|
| 13 |
+
- chosen model_id from the registry
|
| 14 |
+
- energy_savings_pct vs an always-ultra-tier baseline
|
| 15 |
+
- reason string for the partner's audit log
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import math
|
| 21 |
+
from typing import Optional
|
| 22 |
+
|
| 23 |
+
from greenrouting.classifier.infer import QueryProfile
|
| 24 |
+
|
| 25 |
+
from partner_registry import PARTNER_SCORE_KEYS, PartnerModel, PartnerRegistry
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Category labels exposed in the partner-facing response ("coding" is
# published as "code" — see pick_category).
PUBLIC_CATEGORIES: tuple[str, ...] = ("chat", "code", "math", "research", "creative")
# Coarse complexity labels derived from the continuous difficulty estimate.
COMPLEXITY_BUCKETS: tuple[str, ...] = ("simple", "moderate", "complex")
# Cost of the always-ultra baseline used for the energy-savings comparison.
ULTRA_BASELINE_COST: int = 10
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def rebucket_capabilities(profile: QueryProfile) -> dict[str, float]:
    """Project the 8 internal capability probabilities onto the partner's 7 keys.

    "research" folds reasoning+knowledge, "chat" folds simple_chat+instruction;
    "roleplay" and "ideas" are derived from creative/reasoning mass. All
    outputs are clamped to 1.0 and rounded to 3 decimals.
    """
    caps = profile.capabilities
    raw = {
        "coding": caps.code,
        "math": caps.math,
        "research": min(1.0, caps.reasoning + caps.knowledge),
        "creative": caps.creative,
        "chat": min(1.0, caps.simple_chat + caps.instruction),
        "roleplay": caps.creative * 0.5,
        "ideas": min(1.0, (caps.creative + caps.reasoning) * 0.4),
    }
    return {key: round(value, 3) for key, value in raw.items()}
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def pick_category(weights: dict[str, float]) -> str:
    """Argmax over the five public categories; 'coding' is published as 'code'."""
    contenders = ("chat", "coding", "math", "research", "creative")
    winner = max(contenders, key=lambda k: weights[k])
    return "code" if winner == "coding" else winner
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def pick_complexity(profile: QueryProfile) -> str:
    """Bucket the continuous difficulty (log active params) into three labels.

    Thresholds: < log(3e9) -> simple, < log(20e9) -> moderate, else complex.
    """
    value = profile.difficulty_log_params
    thresholds = (
        (math.log(3e9), "simple"),
        (math.log(20e9), "moderate"),
    )
    for bound, label in thresholds:
        if value < bound:
            return label
    return "complex"
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def pick_difficulty_int(profile: QueryProfile) -> int:
    """Integer difficulty 1..5: one plus the number of parameter thresholds met.

    Boundaries are log(1B), log(5B), log(15B), log(50B); because they are
    strictly increasing, counting all satisfied boundaries equals walking
    them in order and stopping at the first miss.
    """
    value = profile.difficulty_log_params
    rank = 1 + sum(1 for billions in (1, 5, 15, 50) if value >= math.log(billions * 1e9))
    return min(5, max(1, rank))
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _allowed_tiers(difficulty: int) -> set[str]:
|
| 84 |
+
if difficulty <= 1:
|
| 85 |
+
return {"lite", "standard"}
|
| 86 |
+
if difficulty == 2:
|
| 87 |
+
return {"lite", "standard"}
|
| 88 |
+
if difficulty == 3:
|
| 89 |
+
return {"standard", "pro"}
|
| 90 |
+
if difficulty == 4:
|
| 91 |
+
return {"pro", "ultra"}
|
| 92 |
+
return {"ultra"}
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def quality_fit(model: PartnerModel, weights: dict[str, float]) -> float:
    """Weight-normalized average of the model's 1-10 category scores.

    Missing score keys count as 0; a zero weight total falls back to 1.0
    to avoid division by zero.
    """
    denom = sum(weights[k] for k in PARTNER_SCORE_KEYS) or 1.0
    numer = 0.0
    for key in PARTNER_SCORE_KEYS:
        numer += weights[key] * (model.scores.get(key, 0) / 10.0)
    return numer / denom
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def _best_ultra(registry: PartnerRegistry, weights: dict[str, float]) -> PartnerModel:
    """Highest-fit ultra-tier model; falls back to the whole registry if no ultras."""
    candidates = registry.by_tier("ultra") or registry.models
    return max(candidates, key=lambda m: quality_fit(m, weights))
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def select_model(
    registry: PartnerRegistry,
    weights: dict[str, float],
    difficulty: int,
    is_ood: bool = False,
    quality_floor_ratio: float = 0.65,
) -> tuple[PartnerModel, bool]:
    """Pick the cheapest tier-allowed model that clears the quality floor.

    Returns (chosen_model, escalated). Escalated means we fell back to the
    ultra-tier anchor (low confidence in the prediction, or no model in the
    tiers allowed for this difficulty).

    Raises:
        ValueError: if the registry has no models at all.
    """
    if not registry.models:
        raise ValueError("partner registry is empty")

    if is_ood:
        # Out-of-distribution input: don't trust the prediction, go ultra.
        return _best_ultra(registry, weights), True

    allowed = registry.by_tier(*_allowed_tiers(difficulty))
    if not allowed:
        return _best_ultra(registry, weights), True

    # Compute each candidate's fit once; the original recomputed it inside
    # every max()/min()/filter comparison.
    fits = {id(m): quality_fit(m, weights) for m in allowed}
    best_allowed = max(allowed, key=lambda m: fits[id(m)])
    floor = fits[id(best_allowed)] * quality_floor_ratio

    qualifying = [m for m in allowed if fits[id(m)] >= floor]
    if not qualifying:
        return best_allowed, False

    # Cheapest first; ties broken by higher quality fit.
    chosen = min(qualifying, key=lambda m: (m.cost, -fits[id(m)]))
    return chosen, False
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def energy_savings_pct(chosen: PartnerModel, baseline_cost: int = ULTRA_BASELINE_COST) -> float:
    """Percent saved vs an always-ultra baseline, clamped to 0..100."""
    if baseline_cost <= 0:
        return 0.0
    fraction = (baseline_cost - chosen.cost) / baseline_cost
    clamped = min(1.0, max(0.0, fraction))
    return clamped * 100.0
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def build_reason(
    weights: dict[str, float],
    complexity: str,
    chosen: PartnerModel,
    escalated: bool,
    is_ood: bool = False,
) -> str:
    """Compose a short, comma-joined routing rationale for the audit log."""
    dominant, dominant_p = max(weights.items(), key=lambda item: item[1])
    if is_ood:
        lead = "low-confidence input (escalated to ultra tier)"
    elif dominant_p >= 0.5:
        lead = f"{dominant} dominant ({dominant_p:.2f})"
    else:
        lead = "mixed signal"
    fragments = [lead]
    if not is_ood:
        fragments.append(f"{complexity} difficulty")
        if escalated:
            fragments.append("escalated (no qualifying tier-allowed model)")
    if not escalated:
        fragments.append(f"picked {chosen.tier} tier (cost {chosen.cost})")
    return ", ".join(fragments)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def fold_recent_context(message: str, recent: Optional[list[dict]]) -> str:
    """Prefix *message* with up to 200 chars of the most recent turn's content.

    Returns *message* untouched when there is no history, the last entry is
    not a dict, or its content is empty/None.
    """
    if not recent:
        return message
    tail = recent[-1]
    if not isinstance(tail, dict):
        return message
    snippet = (tail.get("content") or "")[:200]
    if not snippet:
        return message
    return f"{snippet}\n{message}"
|
models/classifier_v1/calibration.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"temperature": 1.9504629214867681
|
| 3 |
+
}
|
models/classifier_v1/encoder_name.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
BAAI/bge-small-en-v1.5
|
models/classifier_v1/head.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ce9f40ded994d27f8695683ec77d2abfa57f36380c9f5767e074411b6f34ce22
|
| 3 |
+
size 673429
|
models/classifier_v1/metadata.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"capability_keys": [
|
| 3 |
+
"code",
|
| 4 |
+
"math",
|
| 5 |
+
"reasoning",
|
| 6 |
+
"knowledge",
|
| 7 |
+
"instruction",
|
| 8 |
+
"creative",
|
| 9 |
+
"multilingual",
|
| 10 |
+
"simple_chat"
|
| 11 |
+
],
|
| 12 |
+
"length_buckets": [
|
| 13 |
+
"short",
|
| 14 |
+
"medium",
|
| 15 |
+
"long"
|
| 16 |
+
],
|
| 17 |
+
"embedding_dim": 384,
|
| 18 |
+
"hidden_dim": 256,
|
| 19 |
+
"max_seq_len": 256,
|
| 20 |
+
"diff_target_center": 22.80270737862625
|
| 21 |
+
}
|
models/classifier_v1/ood_stats.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c71aa5f272960594a5b5f043699136f9595c2f9ac0bf6c9cb918c3556482cc03
|
| 3 |
+
size 518936
|
models/classifier_v1/training_history.json
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"epoch": 0,
|
| 4 |
+
"train_loss": 1.3644548597789945,
|
| 5 |
+
"cap_precision": 0.0,
|
| 6 |
+
"cap_recall": 0.0,
|
| 7 |
+
"cap_f1": 0.0,
|
| 8 |
+
"diff_mae": 0.8598755598068237,
|
| 9 |
+
"len_acc": 0.559322033898305
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"epoch": 1,
|
| 13 |
+
"train_loss": 1.1052290626934596,
|
| 14 |
+
"cap_precision": 0.3870967741935484,
|
| 15 |
+
"cap_recall": 0.14457831325301204,
|
| 16 |
+
"cap_f1": 0.21052631578947364,
|
| 17 |
+
"diff_mae": 0.7214581966400146,
|
| 18 |
+
"len_acc": 0.7288135593220338
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"epoch": 2,
|
| 22 |
+
"train_loss": 0.9240592576208568,
|
| 23 |
+
"cap_precision": 0.45161290322580644,
|
| 24 |
+
"cap_recall": 0.1686746987951807,
|
| 25 |
+
"cap_f1": 0.24561403508771928,
|
| 26 |
+
"diff_mae": 0.7095464468002319,
|
| 27 |
+
"len_acc": 0.6949152542372882
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"epoch": 3,
|
| 31 |
+
"train_loss": 0.8142032254309881,
|
| 32 |
+
"cap_precision": 0.6,
|
| 33 |
+
"cap_recall": 0.2891566265060241,
|
| 34 |
+
"cap_f1": 0.3902439024390244,
|
| 35 |
+
"diff_mae": 0.7512079477310181,
|
| 36 |
+
"len_acc": 0.711864406779661
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"epoch": 4,
|
| 40 |
+
"train_loss": 0.7119971627280826,
|
| 41 |
+
"cap_precision": 0.4457831325301205,
|
| 42 |
+
"cap_recall": 0.4457831325301205,
|
| 43 |
+
"cap_f1": 0.4457831325301205,
|
| 44 |
+
"diff_mae": 0.6864577531814575,
|
| 45 |
+
"len_acc": 0.6610169491525424
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 5,
|
| 49 |
+
"train_loss": 0.629969752970196,
|
| 50 |
+
"cap_precision": 0.5930232558139535,
|
| 51 |
+
"cap_recall": 0.6144578313253012,
|
| 52 |
+
"cap_f1": 0.6035502958579881,
|
| 53 |
+
"diff_mae": 0.6868146061897278,
|
| 54 |
+
"len_acc": 0.6949152542372882
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"epoch": 6,
|
| 58 |
+
"train_loss": 0.562128157842727,
|
| 59 |
+
"cap_precision": 0.5888888888888889,
|
| 60 |
+
"cap_recall": 0.6385542168674698,
|
| 61 |
+
"cap_f1": 0.6127167630057803,
|
| 62 |
+
"diff_mae": 0.7360132336616516,
|
| 63 |
+
"len_acc": 0.6949152542372882
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"epoch": 7,
|
| 67 |
+
"train_loss": 0.4673078145299639,
|
| 68 |
+
"cap_precision": 0.6304347826086957,
|
| 69 |
+
"cap_recall": 0.6987951807228916,
|
| 70 |
+
"cap_f1": 0.6628571428571429,
|
| 71 |
+
"diff_mae": 0.7615002393722534,
|
| 72 |
+
"len_acc": 0.6779661016949152
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"epoch": 8,
|
| 76 |
+
"train_loss": 0.4211572749274118,
|
| 77 |
+
"cap_precision": 0.6703296703296703,
|
| 78 |
+
"cap_recall": 0.7349397590361446,
|
| 79 |
+
"cap_f1": 0.7011494252873562,
|
| 80 |
+
"diff_mae": 0.7574763894081116,
|
| 81 |
+
"len_acc": 0.6101694915254238
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"epoch": 9,
|
| 85 |
+
"train_loss": 0.3946749922775087,
|
| 86 |
+
"cap_precision": 0.7,
|
| 87 |
+
"cap_recall": 0.6746987951807228,
|
| 88 |
+
"cap_f1": 0.6871165644171778,
|
| 89 |
+
"diff_mae": 0.7892473340034485,
|
| 90 |
+
"len_acc": 0.6610169491525424
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"epoch": 10,
|
| 94 |
+
"train_loss": 0.34337903772081646,
|
| 95 |
+
"cap_precision": 0.7142857142857143,
|
| 96 |
+
"cap_recall": 0.7228915662650602,
|
| 97 |
+
"cap_f1": 0.718562874251497,
|
| 98 |
+
"diff_mae": 0.7348798513412476,
|
| 99 |
+
"len_acc": 0.5932203389830508
|
| 100 |
+
},
|
| 101 |
+
{
|
| 102 |
+
"epoch": 11,
|
| 103 |
+
"train_loss": 0.2987311219885236,
|
| 104 |
+
"cap_precision": 0.7228915662650602,
|
| 105 |
+
"cap_recall": 0.7228915662650602,
|
| 106 |
+
"cap_f1": 0.7228915662650603,
|
| 107 |
+
"diff_mae": 0.7976469993591309,
|
| 108 |
+
"len_acc": 0.6271186440677966
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 12,
|
| 112 |
+
"train_loss": 0.27304122419584365,
|
| 113 |
+
"cap_precision": 0.7792207792207793,
|
| 114 |
+
"cap_recall": 0.7228915662650602,
|
| 115 |
+
"cap_f1": 0.75,
|
| 116 |
+
"diff_mae": 0.8239098787307739,
|
| 117 |
+
"len_acc": 0.6610169491525424
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"epoch": 13,
|
| 121 |
+
"train_loss": 0.24270852761609213,
|
| 122 |
+
"cap_precision": 0.7763157894736842,
|
| 123 |
+
"cap_recall": 0.7108433734939759,
|
| 124 |
+
"cap_f1": 0.7421383647798742,
|
| 125 |
+
"diff_mae": 0.8853136301040649,
|
| 126 |
+
"len_acc": 0.6610169491525424
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"epoch": 14,
|
| 130 |
+
"train_loss": 0.2204317024775914,
|
| 131 |
+
"cap_precision": 0.8055555555555556,
|
| 132 |
+
"cap_recall": 0.6987951807228916,
|
| 133 |
+
"cap_f1": 0.7483870967741936,
|
| 134 |
+
"diff_mae": 0.7929121851921082,
|
| 135 |
+
"len_acc": 0.6779661016949152
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"epoch": 15,
|
| 139 |
+
"train_loss": 0.18839974346615018,
|
| 140 |
+
"cap_precision": 0.8082191780821918,
|
| 141 |
+
"cap_recall": 0.7108433734939759,
|
| 142 |
+
"cap_f1": 0.7564102564102564,
|
| 143 |
+
"diff_mae": 0.8756879568099976,
|
| 144 |
+
"len_acc": 0.6440677966101694
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"epoch": 16,
|
| 148 |
+
"train_loss": 0.1629447014558883,
|
| 149 |
+
"cap_precision": 0.8169014084507042,
|
| 150 |
+
"cap_recall": 0.6987951807228916,
|
| 151 |
+
"cap_f1": 0.7532467532467533,
|
| 152 |
+
"diff_mae": 0.7843820452690125,
|
| 153 |
+
"len_acc": 0.6271186440677966
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"epoch": 17,
|
| 157 |
+
"train_loss": 0.1407343131445703,
|
| 158 |
+
"cap_precision": 0.7792207792207793,
|
| 159 |
+
"cap_recall": 0.7228915662650602,
|
| 160 |
+
"cap_f1": 0.75,
|
| 161 |
+
"diff_mae": 0.8463668823242188,
|
| 162 |
+
"len_acc": 0.6101694915254238
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"epoch": 18,
|
| 166 |
+
"train_loss": 0.1221757513426599,
|
| 167 |
+
"cap_precision": 0.8024691358024691,
|
| 168 |
+
"cap_recall": 0.7831325301204819,
|
| 169 |
+
"cap_f1": 0.7926829268292682,
|
| 170 |
+
"diff_mae": 0.8194808959960938,
|
| 171 |
+
"len_acc": 0.6440677966101694
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 19,
|
| 175 |
+
"train_loss": 0.11088378017856962,
|
| 176 |
+
"cap_precision": 0.8051948051948052,
|
| 177 |
+
"cap_recall": 0.7469879518072289,
|
| 178 |
+
"cap_f1": 0.7749999999999999,
|
| 179 |
+
"diff_mae": 0.8044652938842773,
|
| 180 |
+
"len_acc": 0.6610169491525424
|
| 181 |
+
}
|
| 182 |
+
]
|
partner_registry.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Loader for the downstream partner's model registry.
|
| 2 |
+
|
| 3 |
+
The partner ships a JSON list of model entries, each with an `id`, `tier`,
|
| 4 |
+
`scores` (per-category 1-10), and `cost` (1-10). This file does not ship the
|
| 5 |
+
registry data itself - it is loaded at runtime from a path supplied via the
|
| 6 |
+
PARTNER_REGISTRY_PATH environment variable, kept outside source control.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import json
|
| 12 |
+
import os
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Optional
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Per-category capability keys that each registry entry's `scores` mapping is
# read with (integer scores, 1-10 per the module docstring); missing keys are
# filled with 0 during coercion.
PARTNER_SCORE_KEYS: tuple[str, ...] = (
    "coding",
    "math",
    "research",
    "creative",
    "chat",
    "roleplay",
    "ideas",
)

# Known tier labels a registry entry may carry; raw tier values are lowercased
# on load and default to "standard" when absent.
TIERS: tuple[str, ...] = ("lite", "standard", "pro", "ultra")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass(frozen=True)
class PartnerModel:
    """One immutable, normalized entry from the partner's model registry.

    Instances are built from raw registry dicts: ``scores`` carries one int
    per key in ``PARTNER_SCORE_KEYS``, ``tier`` is a lowercased tier label,
    and ``cost`` is the partner's 1-10 cost value.
    """

    id: str                      # partner-assigned model identifier
    tier: str                    # lowercased tier label (see TIERS)
    is_open_router: bool         # mirrors the registry's `isOpenRouter` flag
    strengths: tuple[str, ...]   # strength tags copied from the registry entry
    scores: dict[str, int]       # per-category scores keyed by PARTNER_SCORE_KEYS
    cost: int                    # partner-supplied cost value (1-10)

    def fits_tier(self, tier_set: set[str]) -> bool:
        """Return True when this model's tier is one of *tier_set*."""
        return any(self.tier == wanted for wanted in tier_set)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
class PartnerRegistry:
    """Thin wrapper around the partner's model list with lookup helpers."""

    models: list[PartnerModel]  # registry entries, kept in source order

    def all(self) -> list[PartnerModel]:
        """Return a shallow copy so callers cannot mutate the registry list."""
        return self.models.copy()

    def by_tier(self, *tiers: str) -> list[PartnerModel]:
        """Return every model whose tier is among *tiers* (source order kept)."""
        wanted = frozenset(tiers)
        return [model for model in self.models if model.tier in wanted]

    def get(self, model_id: str) -> Optional[PartnerModel]:
        """Return the model with id *model_id*, or None when absent."""
        return next((model for model in self.models if model.id == model_id), None)

    def __len__(self) -> int:
        """Number of registry entries."""
        return len(self.models)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _coerce(entry: dict) -> PartnerModel:
    """Normalize one raw registry dict into a ``PartnerModel``.

    Defaults: missing score keys become 0, tier becomes "standard"
    (lowercased), ``isOpenRouter`` becomes False, ``cost`` becomes 5.
    A missing ``id`` raises ``KeyError``.
    """
    # Hoist the scores sub-dict lookup out of the per-key comprehension.
    raw_scores = entry.get("scores", {})
    normalized = {key: int(raw_scores.get(key, 0)) for key in PARTNER_SCORE_KEYS}
    return PartnerModel(
        id=str(entry["id"]),
        tier=str(entry.get("tier", "standard")).lower(),
        is_open_router=bool(entry.get("isOpenRouter", False)),
        strengths=tuple(entry.get("strengths", [])),
        scores=normalized,
        cost=int(entry.get("cost", 5)),
    )
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def load_registry(path: str | Path | None = None) -> PartnerRegistry:
    """Load the partner registry, trying three sources in priority order.

    1. The explicit ``path`` argument, when supplied.
    2. The PARTNER_REGISTRY_JSON env var holding raw JSON content (useful
       where shipping a file is awkward, e.g. HF Space secrets).
    3. The PARTNER_REGISTRY_PATH env var pointing at a JSON file on disk.

    Raises RuntimeError when no source is configured, FileNotFoundError for
    a missing file, and ValueError for malformed or empty registry content.
    """
    raw_text: Optional[str] = None
    origin = "argument"

    # Environment fallbacks only apply when no explicit path was given.
    if path is None:
        inline = os.environ.get("PARTNER_REGISTRY_JSON")
        if inline:
            raw_text, origin = inline, "env:PARTNER_REGISTRY_JSON"
        else:
            env_path = os.environ.get("PARTNER_REGISTRY_PATH")
            if env_path:
                path, origin = env_path, "env:PARTNER_REGISTRY_PATH"

    # No inline JSON: the content must come from a file on disk.
    if raw_text is None:
        if path is None:
            raise RuntimeError(
                "no registry source supplied (set PARTNER_REGISTRY_JSON or PARTNER_REGISTRY_PATH)"
            )
        registry_file = Path(path)
        if not registry_file.exists():
            raise FileNotFoundError(f"partner registry JSON not found at {registry_file}")
        raw_text = registry_file.read_text(encoding="utf-8")

    parsed = json.loads(raw_text)
    if not isinstance(parsed, list):
        raise ValueError(f"partner registry from {origin} must be a top-level list")
    entries = [_coerce(item) for item in parsed]
    if not entries:
        raise ValueError(f"partner registry from {origin} is empty")
    return PartnerRegistry(models=entries)
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.110
|
| 2 |
+
uvicorn[standard]>=0.30
|
| 3 |
+
pydantic>=2.6
|
| 4 |
+
torch>=2.2
|
| 5 |
+
transformers>=4.40
|
| 6 |
+
numpy>=1.26
|