Emre Sarigöl committed on
Commit
80f99be
·
1 Parent(s): ef90a4e

Deploy GURMA.ai Dashboard - 2026-04-09 09:10

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. eval_tab.py +224 -2
  3. sota_agent.py +3 -3
app.py CHANGED
@@ -70,7 +70,7 @@ DATA_FILE = DATA_DIR / "competitors.json"
70
  PRIVATE_DATASET_REPO = os.getenv("PRIVATE_DATASET_REPO", "").strip()
71
  PRIVATE_DATASET_REVISION = os.getenv("PRIVATE_DATASET_REVISION", "main").strip() or "main"
72
  PRIVATE_DATA_LOCAL_DIR = Path(os.getenv("PRIVATE_DATA_LOCAL_DIR", "/tmp/gurma-private-data"))
73
- PRIVATE_DATA_FORCE_DOWNLOAD = os.getenv("PRIVATE_DATA_FORCE_DOWNLOAD", "1").strip() in {"1", "true", "True"}
74
 
75
  # Access key from environment (set in HF Space secrets)
76
  ACCESS_KEY = os.getenv("ACCESS_KEY", "")
 
70
  PRIVATE_DATASET_REPO = os.getenv("PRIVATE_DATASET_REPO", "").strip()
71
  PRIVATE_DATASET_REVISION = os.getenv("PRIVATE_DATASET_REVISION", "main").strip() or "main"
72
  PRIVATE_DATA_LOCAL_DIR = Path(os.getenv("PRIVATE_DATA_LOCAL_DIR", "/tmp/gurma-private-data"))
73
+ PRIVATE_DATA_FORCE_DOWNLOAD = os.getenv("PRIVATE_DATA_FORCE_DOWNLOAD", "0").strip() in {"1", "true", "True"}
74
 
75
  # Access key from environment (set in HF Space secrets)
76
  ACCESS_KEY = os.getenv("ACCESS_KEY", "")
eval_tab.py CHANGED
@@ -30,6 +30,7 @@ else:
30
  EXPERIMENTS_DIR = DATA_ROOT / "experiments"
31
  TRAINING_DIR = DATA_ROOT / "training"
32
  ADAPTERS_DIR = DATA_ROOT / "adapters"
 
33
 
34
  # MLX model → HF Hub model for Inference API
35
  MODEL_HF_MAP = {
@@ -376,10 +377,15 @@ fine-tuned version.
376
  """)
377
 
378
  # --- Base Model ---
 
379
  model_short = model_id.split("/")[-1]
380
  is_moe = "A3B" in model_short or "MoE" in model_short
381
- arch_desc = ("a Mixture-of-Experts model (30B total, 3B active per token)"
382
- if is_moe else "a dense transformer model")
 
 
 
 
383
  st.markdown(f"""
384
  **Base model** — `{model_short}`
385
 
@@ -468,6 +474,30 @@ converted the raw sensor data into structured clinical Q&A pairs across
468
  height=min(35 * (len(rows) + 1), 300),
469
  )
470
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
471
  # --- How to Read Scores ---
472
  st.markdown("---")
473
  st.markdown("""
@@ -1552,6 +1582,195 @@ def _render_baseline_comparison(bench_data: dict, bench_map: dict,
1552
  st.markdown(table_html, unsafe_allow_html=True)
1553
 
1554
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1555
  # ============================================================
1556
  # Main Entry Point
1557
  # ============================================================
@@ -1705,3 +1924,6 @@ def render_eval_tab():
1705
 
1706
  # --- Sample Browser ---
1707
  _render_sample_browser(examples, bench_data, has_adapted)
 
 
 
 
30
  EXPERIMENTS_DIR = DATA_ROOT / "experiments"
31
  TRAINING_DIR = DATA_ROOT / "training"
32
  ADAPTERS_DIR = DATA_ROOT / "adapters"
33
+ RESULTS_TSV = DATA_ROOT.parent / "src" / "models" / "results.tsv"
34
 
35
  # MLX model → HF Hub model for Inference API
36
  MODEL_HF_MAP = {
 
377
  """)
378
 
379
  # --- Base Model ---
380
+ import re as _re
381
  model_short = model_id.split("/")[-1]
382
  is_moe = "A3B" in model_short or "MoE" in model_short
383
+ if is_moe:
384
+ _m = _re.search(r'(\d+(?:\.\d+)?)B-A3B', model_short)
385
+ _total = _m.group(1) if _m else "?"
386
+ arch_desc = f"a Mixture-of-Experts model ({_total}B total, 3B active per token)"
387
+ else:
388
+ arch_desc = "a dense transformer model"
389
  st.markdown(f"""
390
  **Base model** — `{model_short}`
391
 
 
474
  height=min(35 * (len(rows) + 1), 300),
475
  )
476
 
477
+ # --- Evaluation Set ---
478
+ st.markdown("---")
479
+ st.markdown("""
480
+ **Evaluation set** — `rehab_public_v1_eval` · 533 examples (514 unique prompts)
481
+
482
+ Held-out test set derived from the same Zenodo source, never seen during training.
483
+ Each question is routed to the adapter trained for that task:
484
+ """)
485
+ eval_rows = [
486
+ {"Category": "Trajectory Prediction", "Examples": 66, "Adapter": "prediction_trajectory"},
487
+ {"Category": "FAC Forecasting", "Examples": 66, "Adapter": "prediction_fac"},
488
+ {"Category": "Speed Prediction", "Examples": 66, "Adapter": "prediction_speed"},
489
+ {"Category": "Risk Assessment", "Examples": 66, "Adapter": "prediction_risk"},
490
+ {"Category": "Clinical Reporting", "Examples": 88, "Adapter": "general (fallback)"},
491
+ {"Category": "Param Interpretation", "Examples": 71, "Adapter": "general (fallback)"},
492
+ {"Category": "Progress Analysis", "Examples": 110, "Adapter": "general (fallback)"},
493
+ ]
494
+ st.dataframe(
495
+ pd.DataFrame(eval_rows),
496
+ hide_index=True,
497
+ width="stretch",
498
+ height=35 * (len(eval_rows) + 1),
499
+ )
500
+
501
  # --- How to Read Scores ---
502
  st.markdown("---")
503
  st.markdown("""
 
1582
  st.markdown(table_html, unsafe_allow_html=True)
1583
 
1584
 
1585
+ # ============================================================
1586
+ # MLP Champion vs LLM+LoRA Cross-Architecture Comparison
1587
+ # ============================================================
1588
+
1589
+ _MLP_VAL_N = 16 # held-out val pairs used for MLP evaluation
1590
+
1591
+
1592
+ def _load_mlp_champion() -> dict | None:
1593
+ """Parse results.tsv and return the row with the lowest val_metric
1594
+ that also has extended prediction metrics (fac_exact_acc etc.)."""
1595
+ if not RESULTS_TSV.exists():
1596
+ return None
1597
+ champion = None
1598
+ best_val = float("inf")
1599
+ try:
1600
+ with open(RESULTS_TSV) as f:
1601
+ for line in f:
1602
+ line = line.strip()
1603
+ if not line or line.startswith("timestamp"):
1604
+ continue
1605
+ parts = line.split("\t")
1606
+ if len(parts) < 11:
1607
+ continue
1608
+ try:
1609
+ val_metric = float(parts[2])
1610
+ except ValueError:
1611
+ continue
1612
+ if val_metric < best_val and len(parts) >= 17:
1613
+ best_val = val_metric
1614
+ champion = {
1615
+ "exp_id": parts[1],
1616
+ "val_metric": val_metric,
1617
+ "MAE_fac": float(parts[3]),
1618
+ "RMSE_speed": float(parts[4]),
1619
+ "inference_ms": float(parts[5]),
1620
+ "n_params": int(parts[6]),
1621
+ "fac_exact": float(parts[10]),
1622
+ "fac_dir": float(parts[12]),
1623
+ "fac_err": float(parts[3]), # MAE_fac doubles as fac_err
1624
+ "speed_mae": float(parts[13]),
1625
+ "speed_r2": float(parts[14]),
1626
+ "speed_dir": float(parts[15]),
1627
+ "notes": parts[16] if len(parts) > 16 else "",
1628
+ }
1629
+ except Exception:
1630
+ return None
1631
+ return champion
1632
+
1633
+
1634
def _render_mlp_comparison(bench_map: dict, all_keys: list[str]):
    """Collapsible panel: MLP champion prediction metrics vs LoRA models.

    Purely additive — does not modify any existing rendering path.
    Both sides are loaded independently from results.tsv and bench_map.

    Args:
        bench_map: Mapping of run key -> indexable pair; only ``[1]`` (the
            benchmark result dict) is read here. (Assumed from usage —
            confirm against the caller.)
        all_keys: bench_map keys to consider, in display order.

    Renders nothing (returns early) when there is no MLP champion row or
    no qualifying LoRA run.
    """
    champion = _load_mlp_champion()
    if not champion:
        return  # no results.tsv / no qualifying row — hide the panel entirely

    # Only runs evaluated on this exact test-set size are comparable columns.
    CURRENT_TEST_SIZE = 533

    # Collect all adapted LoRA runs that have prediction metrics
    lora_models = []
    for k in all_keys:
        d = bench_map[k][1]
        agg = d.get("aggregate", {})
        if "adapted" not in agg:
            continue  # base-model-only run
        if d.get("test_examples") != CURRENT_TEST_SIZE:
            continue  # older / differently-sized eval run
        pred = agg["adapted"].get("prediction", {})
        if not any(v is not None for v in pred.values()):
            continue  # no prediction metrics recorded for this run
        model_id = d.get("model", "?")
        model_short = model_id.split("/")[-1] if "/" in model_id else model_id
        # Strip quantization / variant suffixes for a compact column header.
        for suffix in ["-4bit", "-8bit", "-MLX-4bit", "-MLX-8bit", "-textonly"]:
            model_short = model_short.replace(suffix, "")
        ts = d.get("timestamp", "")[:10]  # keep the date part only
        tag = "routed LoRA" if bool(d.get("routing")) else "LoRA"
        lora_models.append({
            # "\n" renders as a line break via white-space:pre-line in the <th>
            "label": f"{model_short}\n({tag}, {ts})",
            "n_test": d.get("test_examples"),
            "fac_exact": pred.get("fac_exact_match"),
            "fac_dir": pred.get("fac_direction_accuracy"),
            "fac_err": pred.get("fac_mean_error"),
            "speed_mae": pred.get("speed_mean_abs_error"),
            "speed_dir": pred.get("speed_direction_accuracy"),
        })

    if not lora_models:
        return

    with st.expander("MLP Champion vs LLM+LoRA — cross-architecture comparison",
                     expanded=False):
        st.caption(
            "**Caveat:** different test sets and task formulations — not a direct A/B. "
            f"MLP uses {_MLP_VAL_N} val examples (structured tabular input); "
            f"LLM models use {CURRENT_TEST_SIZE} test examples (natural language Q&A)."
        )

        # (display label, metric key, format spec, higher-is-better flag)
        metric_defs = [
            ("FAC Exact Match", "fac_exact", "{:.0%}", True),
            ("FAC Direction Acc", "fac_dir", "{:.0%}", True),
            ("FAC Mean Error", "fac_err", "{:.3f}", False),
            ("Speed MAE (m/s)", "speed_mae", "{:.3f}", False),
            ("Speed Direction Acc","speed_dir", "{:.0%}", True),
        ]

        mlp_vals = {k: champion.get(k) for k in
                    ("fac_exact", "fac_dir", "fac_err", "speed_mae", "speed_dir")}

        def _fmt(val, fmt_str):
            # Render a metric value: em-dash for missing, str() fallback
            # when the format spec does not fit the value's type.
            if val is None:
                return "—"
            try:
                return fmt_str.format(val)
            except Exception:
                return str(val)

        header_cols = "".join(
            f'<th style="padding:6px 10px;text-align:center;font-weight:400;'
            f'color:#aaa;font-size:0.82em;white-space:pre-line;">{m["label"]}</th>'
            for m in lora_models
        )
        mlp_label = f"MLP champion\n({champion['exp_id']})"

        html_rows = []
        for label, key, fmt, higher_better in metric_defs:
            mlp_val = mlp_vals.get(key)
            mlp_str = _fmt(mlp_val, fmt)
            cells = ""
            for m in lora_models:
                bl_val = m.get(key)
                bl_str = _fmt(bl_val, fmt)
                # Grey out a LoRA cell when the MLP champion beats it;
                # missing values on either side never count as a win.
                is_winner = False
                if mlp_val is not None and bl_val is not None:
                    is_winner = mlp_val > bl_val if higher_better else mlp_val < bl_val
                color = "#888" if is_winner else "#e0e0e0"
                cells += (
                    f'<td style="padding:6px 10px;text-align:center;'
                    f'color:{color};font-size:0.9em;">{bl_str}</td>'
                )
            # MLP column is highlighted green and pinned as the last column.
            mlp_cell = (
                f'<td style="padding:6px 10px;text-align:center;'
                f'color:#198754;font-weight:600;font-size:0.9em;'
                f'border-left:2px solid #198754;">{mlp_str}</td>'
            )
            html_rows.append(
                f'<tr>'
                f'<td style="padding:6px 10px;color:#ccc;font-size:0.85em;">{label}</td>'
                f'{cells}{mlp_cell}'
                f'</tr>'
            )

        table_html = f"""
<div style="margin:0.8rem 0 0.5rem 0;">
<table style="width:100%;border-collapse:collapse;border:1px solid #333;
border-radius:6px;overflow:hidden;">
<thead>
<tr style="border-bottom:1px solid #333;">
<th style="padding:6px 10px;text-align:left;color:#888;
font-size:0.82em;font-weight:400;">Metric</th>
{header_cols}
<th style="padding:6px 10px;text-align:center;font-weight:600;
color:#198754;font-size:0.82em;white-space:pre-line;
border-left:2px solid #198754;">{mlp_label}</th>
</tr>
</thead>
<tbody>
{"".join(html_rows)}
</tbody>
</table>
</div>
"""
        st.markdown(table_html, unsafe_allow_html=True)

        # chr(10) == "\n": take only the first label line (the model name).
        n_test_caption = " · ".join(
            f"{m['label'].split(chr(10))[0].strip()}: n={m['n_test']}"
            for m in lora_models
        )
        st.caption(
            f"Test set sizes — {n_test_caption} · "
            f"MLP champion ({champion['exp_id']}): n={_MLP_VAL_N} val | "
            f"val_metric={champion['val_metric']:.4f}, "
            f"RMSE_speed={champion['RMSE_speed']:.3f} m/s, "
            f"params={champion['n_params']:,}"
        )
1772
+
1773
+
1774
  # ============================================================
1775
  # Main Entry Point
1776
  # ============================================================
 
1924
 
1925
  # --- Sample Browser ---
1926
  _render_sample_browser(examples, bench_data, has_adapted)
1927
+
1928
+ # --- MLP vs LoRA cross-architecture comparison (additive, collapsed) ---
1929
+ _render_mlp_comparison(bench_map, all_keys)
sota_agent.py CHANGED
@@ -43,7 +43,7 @@ GURMA_CONTEXT = {
43
  "domain": "Rehabilitation robotics AI — high-precision medical domain",
44
  "data_moat": "15 years of patient outcome data from BAMA Teknoloji "
45
  "(gait dynamics, EMG signals, recovery outcomes — not just motion data)",
46
- "products": ["RoboGate (stationary gait rehab robot)", "FreeGate (5-axis mobile exoskeleton)"],
47
  "architecture": "Privacy-first edge computing — no cloud data exposure",
48
  "regulatory": "EU AI Act (high-risk), MDR, ISO 13485, GDPR/KVKK — 80% safety focus from day one",
49
  "precision_requirement": (
@@ -190,7 +190,7 @@ INITIAL_KNOWLEDGE_BASE = {
190
  "name": "DeepSeek-V3.2",
191
  "params": "varies",
192
  "why": "Sparse attention architecture, efficient inference",
193
- "gurma_fit": "Sparse attention promising for edge deployment on RoboGate/FreeGate",
194
  "status": "watch",
195
  "added": "2026-02-06",
196
  "source": "Lex Fridman Podcast #490",
@@ -246,7 +246,7 @@ INITIAL_KNOWLEDGE_BASE = {
246
  "name": "Sparse Attention",
247
  "category": "efficiency",
248
  "why": "Lightweight token selection indexer; efficient inference for edge deployment",
249
- "gurma_fit": "Could enable on-device models for RoboGate/FreeGate with privacy-first architecture",
250
  "priority": "medium",
251
  "added": "2026-02-06",
252
  "source": "Lex Fridman Podcast #490",
 
43
  "domain": "Rehabilitation robotics AI — high-precision medical domain",
44
  "data_moat": "15 years of patient outcome data from BAMA Teknoloji "
45
  "(gait dynamics, EMG signals, recovery outcomes — not just motion data)",
46
+ "products": ["RoboGait (stationary gait rehab robot)", "FreeGait (5-axis mobile exoskeleton)"],
47
  "architecture": "Privacy-first edge computing — no cloud data exposure",
48
  "regulatory": "EU AI Act (high-risk), MDR, ISO 13485, GDPR/KVKK — 80% safety focus from day one",
49
  "precision_requirement": (
 
190
  "name": "DeepSeek-V3.2",
191
  "params": "varies",
192
  "why": "Sparse attention architecture, efficient inference",
193
+ "gurma_fit": "Sparse attention promising for edge deployment on RoboGait/FreeGait",
194
  "status": "watch",
195
  "added": "2026-02-06",
196
  "source": "Lex Fridman Podcast #490",
 
246
  "name": "Sparse Attention",
247
  "category": "efficiency",
248
  "why": "Lightweight token selection indexer; efficient inference for edge deployment",
249
+ "gurma_fit": "Could enable on-device models for RoboGait/FreeGait with privacy-first architecture",
250
  "priority": "medium",
251
  "added": "2026-02-06",
252
  "source": "Lex Fridman Podcast #490",