Emre Sarigöl committed on
Commit
80f99be
·
1 Parent(s): ef90a4e

Deploy GURMA.ai Dashboard - 2026-04-09 09:10

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. eval_tab.py +224 -2
  3. sota_agent.py +3 -3
app.py CHANGED
@@ -70,7 +70,7 @@ DATA_FILE = DATA_DIR / "competitors.json"
70
  PRIVATE_DATASET_REPO = os.getenv("PRIVATE_DATASET_REPO", "").strip()
71
  PRIVATE_DATASET_REVISION = os.getenv("PRIVATE_DATASET_REVISION", "main").strip() or "main"
72
  PRIVATE_DATA_LOCAL_DIR = Path(os.getenv("PRIVATE_DATA_LOCAL_DIR", "/tmp/gurma-private-data"))
73
- PRIVATE_DATA_FORCE_DOWNLOAD = os.getenv("PRIVATE_DATA_FORCE_DOWNLOAD", "1").strip() in {"1", "true", "True"}
74
 
75
  # Access key from environment (set in HF Space secrets)
76
  ACCESS_KEY = os.getenv("ACCESS_KEY", "")
 
70
  PRIVATE_DATASET_REPO = os.getenv("PRIVATE_DATASET_REPO", "").strip()
71
  PRIVATE_DATASET_REVISION = os.getenv("PRIVATE_DATASET_REVISION", "main").strip() or "main"
72
  PRIVATE_DATA_LOCAL_DIR = Path(os.getenv("PRIVATE_DATA_LOCAL_DIR", "/tmp/gurma-private-data"))
73
+ PRIVATE_DATA_FORCE_DOWNLOAD = os.getenv("PRIVATE_DATA_FORCE_DOWNLOAD", "0").strip() in {"1", "true", "True"}
74
 
75
  # Access key from environment (set in HF Space secrets)
76
  ACCESS_KEY = os.getenv("ACCESS_KEY", "")
eval_tab.py CHANGED
@@ -30,6 +30,7 @@ else:
30
  EXPERIMENTS_DIR = DATA_ROOT / "experiments"
31
  TRAINING_DIR = DATA_ROOT / "training"
32
  ADAPTERS_DIR = DATA_ROOT / "adapters"
 
33
 
34
  # MLX model → HF Hub model for Inference API
35
  MODEL_HF_MAP = {
@@ -376,10 +377,15 @@ fine-tuned version.
376
  """)
377
 
378
  # --- Base Model ---
 
379
  model_short = model_id.split("/")[-1]
380
  is_moe = "A3B" in model_short or "MoE" in model_short
381
- arch_desc = ("a Mixture-of-Experts model (30B total, 3B active per token)"
382
- if is_moe else "a dense transformer model")
 
 
 
 
383
  st.markdown(f"""
384
  **Base model** — `{model_short}`
385
 
@@ -468,6 +474,30 @@ converted the raw sensor data into structured clinical Q&A pairs across
468
  height=min(35 * (len(rows) + 1), 300),
469
  )
470
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
471
  # --- How to Read Scores ---
472
  st.markdown("---")
473
  st.markdown("""
@@ -1552,6 +1582,195 @@ def _render_baseline_comparison(bench_data: dict, bench_map: dict,
1552
  st.markdown(table_html, unsafe_allow_html=True)
1553
 
1554
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1555
  # ============================================================
1556
  # Main Entry Point
1557
  # ============================================================
@@ -1705,3 +1924,6 @@ def render_eval_tab():
1705
 
1706
  # --- Sample Browser ---
1707
  _render_sample_browser(examples, bench_data, has_adapted)
 
 
 
 
30
  EXPERIMENTS_DIR = DATA_ROOT / "experiments"
31
  TRAINING_DIR = DATA_ROOT / "training"
32
  ADAPTERS_DIR = DATA_ROOT / "adapters"
33
+ RESULTS_TSV = DATA_ROOT.parent / "src" / "models" / "results.tsv"
34
 
35
  # MLX model → HF Hub model for Inference API
36
  MODEL_HF_MAP = {
 
377
  """)
378
 
379
  # --- Base Model ---
380
+ import re as _re
381
  model_short = model_id.split("/")[-1]
382
  is_moe = "A3B" in model_short or "MoE" in model_short
383
+ if is_moe:
384
+ _m = _re.search(r'(\d+(?:\.\d+)?)B-A3B', model_short)
385
+ _total = _m.group(1) if _m else "?"
386
+ arch_desc = f"a Mixture-of-Experts model ({_total}B total, 3B active per token)"
387
+ else:
388
+ arch_desc = "a dense transformer model"
389
  st.markdown(f"""
390
  **Base model** — `{model_short}`
391
 
 
474
  height=min(35 * (len(rows) + 1), 300),
475
  )
476
 
477
+ # --- Evaluation Set ---
478
+ st.markdown("---")
479
+ st.markdown("""
480
+ **Evaluation set** — `rehab_public_v1_eval` · 533 examples (514 unique prompts)
481
+
482
+ Held-out test set derived from the same Zenodo source, never seen during training.
483
+ Each question is routed to the adapter trained for that task:
484
+ """)
485
+ eval_rows = [
486
+ {"Category": "Trajectory Prediction", "Examples": 66, "Adapter": "prediction_trajectory"},
487
+ {"Category": "FAC Forecasting", "Examples": 66, "Adapter": "prediction_fac"},
488
+ {"Category": "Speed Prediction", "Examples": 66, "Adapter": "prediction_speed"},
489
+ {"Category": "Risk Assessment", "Examples": 66, "Adapter": "prediction_risk"},
490
+ {"Category": "Clinical Reporting", "Examples": 88, "Adapter": "general (fallback)"},
491
+ {"Category": "Param Interpretation", "Examples": 71, "Adapter": "general (fallback)"},
492
+ {"Category": "Progress Analysis", "Examples": 110, "Adapter": "general (fallback)"},
493
+ ]
494
+ st.dataframe(
495
+ pd.DataFrame(eval_rows),
496
+ hide_index=True,
497
+ width="stretch",
498
+ height=35 * (len(eval_rows) + 1),
499
+ )
500
+
501
  # --- How to Read Scores ---
502
  st.markdown("---")
503
  st.markdown("""
 
1582
  st.markdown(table_html, unsafe_allow_html=True)
1583
 
1584
 
1585
+ # ============================================================
1586
+ # MLP Champion vs LLM+LoRA Cross-Architecture Comparison
1587
+ # ============================================================
1588
+
1589
+ _MLP_VAL_N = 16 # held-out val pairs used for MLP evaluation
1590
+
1591
+
1592
+ def _load_mlp_champion() -> dict | None:
1593
+ """Parse results.tsv and return the row with the lowest val_metric
1594
+ that also has extended prediction metrics (fac_exact_acc etc.)."""
1595
+ if not RESULTS_TSV.exists():
1596
+ return None
1597
+ champion = None
1598
+ best_val = float("inf")
1599
+ try:
1600
+ with open(RESULTS_TSV) as f:
1601
+ for line in f:
1602
+ line = line.strip()
1603
+ if not line or line.startswith("timestamp"):
1604
+ continue
1605
+ parts = line.split("\t")
1606
+ if len(parts) < 11:
1607
+ continue
1608
+ try:
1609
+ val_metric = float(parts[2])
1610
+ except ValueError:
1611
+ continue
1612
+ if val_metric < best_val and len(parts) >= 17:
1613
+ best_val = val_metric
1614
+ champion = {
1615
+ "exp_id": parts[1],
1616
+ "val_metric": val_metric,
1617
+ "MAE_fac": float(parts[3]),
1618
+ "RMSE_speed": float(parts[4]),
1619
+ "inference_ms": float(parts[5]),
1620
+ "n_params": int(parts[6]),
1621
+ "fac_exact": float(parts[10]),
1622
+ "fac_dir": float(parts[12]),
1623
+ "fac_err": float(parts[3]), # MAE_fac doubles as fac_err
1624
+ "speed_mae": float(parts[13]),
1625
+ "speed_r2": float(parts[14]),
1626
+ "speed_dir": float(parts[15]),
1627
+ "notes": parts[16] if len(parts) > 16 else "",
1628
+ }
1629
+ except Exception:
1630
+ return None
1631
+ return champion
1632
+
1633
+
1634
def _render_mlp_comparison(bench_map: dict, all_keys: list[str]):
    """Collapsible panel: MLP champion prediction metrics vs LoRA models.

    Purely additive — does not modify any existing rendering path.
    Both sides are loaded independently from results.tsv and bench_map.

    Args:
        bench_map: Mapping of run key -> indexable pair; only ``[1]`` (the
            benchmark result dict) is read here. (Assumed from usage —
            confirm against the caller.)
        all_keys: bench_map keys to consider, in display order.

    Renders nothing (returns early) when there is no MLP champion row or
    no qualifying LoRA run.
    """
    champion = _load_mlp_champion()
    if not champion:
        return  # no results.tsv / no qualifying row — hide the panel entirely

    # Only runs evaluated on this exact test-set size are comparable columns.
    CURRENT_TEST_SIZE = 533

    # Collect all adapted LoRA runs that have prediction metrics
    lora_models = []
    for k in all_keys:
        d = bench_map[k][1]
        agg = d.get("aggregate", {})
        if "adapted" not in agg:
            continue  # base-model-only run
        if d.get("test_examples") != CURRENT_TEST_SIZE:
            continue  # older / differently-sized eval run
        pred = agg["adapted"].get("prediction", {})
        if not any(v is not None for v in pred.values()):
            continue  # no prediction metrics recorded for this run
        model_id = d.get("model", "?")
        model_short = model_id.split("/")[-1] if "/" in model_id else model_id
        # Strip quantization / variant suffixes for a compact column header.
        for suffix in ["-4bit", "-8bit", "-MLX-4bit", "-MLX-8bit", "-textonly"]:
            model_short = model_short.replace(suffix, "")
        ts = d.get("timestamp", "")[:10]  # keep the date part only
        tag = "routed LoRA" if bool(d.get("routing")) else "LoRA"
        lora_models.append({
            # "\n" renders as a line break via white-space:pre-line in the <th>
            "label": f"{model_short}\n({tag}, {ts})",
            "n_test": d.get("test_examples"),
            "fac_exact": pred.get("fac_exact_match"),
            "fac_dir": pred.get("fac_direction_accuracy"),
            "fac_err": pred.get("fac_mean_error"),
            "speed_mae": pred.get("speed_mean_abs_error"),
            "speed_dir": pred.get("speed_direction_accuracy"),
        })

    if not lora_models:
        return

    with st.expander("MLP Champion vs LLM+LoRA — cross-architecture comparison",
                     expanded=False):
        st.caption(
            "**Caveat:** different test sets and task formulations — not a direct A/B. "
            f"MLP uses {_MLP_VAL_N} val examples (structured tabular input); "
            f"LLM models use {CURRENT_TEST_SIZE} test examples (natural language Q&A)."
        )

        # (display label, metric key, format spec, higher-is-better flag)
        metric_defs = [
            ("FAC Exact Match", "fac_exact", "{:.0%}", True),
            ("FAC Direction Acc", "fac_dir", "{:.0%}", True),
            ("FAC Mean Error", "fac_err", "{:.3f}", False),
            ("Speed MAE (m/s)", "speed_mae", "{:.3f}", False),
            ("Speed Direction Acc","speed_dir", "{:.0%}", True),
        ]

        mlp_vals = {k: champion.get(k) for k in
                    ("fac_exact", "fac_dir", "fac_err", "speed_mae", "speed_dir")}

        def _fmt(val, fmt_str):
            # Render a metric value: em-dash for missing, str() fallback
            # when the format spec does not fit the value's type.
            if val is None:
                return "—"
            try:
                return fmt_str.format(val)
            except Exception:
                return str(val)

        header_cols = "".join(
            f'<th style="padding:6px 10px;text-align:center;font-weight:400;'
            f'color:#aaa;font-size:0.82em;white-space:pre-line;">{m["label"]}</th>'
            for m in lora_models
        )
        mlp_label = f"MLP champion\n({champion['exp_id']})"

        html_rows = []
        for label, key, fmt, higher_better in metric_defs:
            mlp_val = mlp_vals.get(key)
            mlp_str = _fmt(mlp_val, fmt)
            cells = ""
            for m in lora_models:
                bl_val = m.get(key)
                bl_str = _fmt(bl_val, fmt)
                # Grey out a LoRA cell when the MLP champion beats it;
                # missing values on either side never count as a win.
                is_winner = False
                if mlp_val is not None and bl_val is not None:
                    is_winner = mlp_val > bl_val if higher_better else mlp_val < bl_val
                color = "#888" if is_winner else "#e0e0e0"
                cells += (
                    f'<td style="padding:6px 10px;text-align:center;'
                    f'color:{color};font-size:0.9em;">{bl_str}</td>'
                )
            # MLP column is highlighted green and pinned as the last column.
            mlp_cell = (
                f'<td style="padding:6px 10px;text-align:center;'
                f'color:#198754;font-weight:600;font-size:0.9em;'
                f'border-left:2px solid #198754;">{mlp_str}</td>'
            )
            html_rows.append(
                f'<tr>'
                f'<td style="padding:6px 10px;color:#ccc;font-size:0.85em;">{label}</td>'
                f'{cells}{mlp_cell}'
                f'</tr>'
            )

        table_html = f"""
<div style="margin:0.8rem 0 0.5rem 0;">
<table style="width:100%;border-collapse:collapse;border:1px solid #333;
border-radius:6px;overflow:hidden;">
<thead>
<tr style="border-bottom:1px solid #333;">
<th style="padding:6px 10px;text-align:left;color:#888;
font-size:0.82em;font-weight:400;">Metric</th>
{header_cols}
<th style="padding:6px 10px;text-align:center;font-weight:600;
color:#198754;font-size:0.82em;white-space:pre-line;
border-left:2px solid #198754;">{mlp_label}</th>
</tr>
</thead>
<tbody>
{"".join(html_rows)}
</tbody>
</table>
</div>
"""
        st.markdown(table_html, unsafe_allow_html=True)

        # chr(10) == "\n": take only the first label line (the model name).
        n_test_caption = " · ".join(
            f"{m['label'].split(chr(10))[0].strip()}: n={m['n_test']}"
            for m in lora_models
        )
        st.caption(
            f"Test set sizes — {n_test_caption} · "
            f"MLP champion ({champion['exp_id']}): n={_MLP_VAL_N} val | "
            f"val_metric={champion['val_metric']:.4f}, "
            f"RMSE_speed={champion['RMSE_speed']:.3f} m/s, "
            f"params={champion['n_params']:,}"
        )
1772
+
1773
+
1774
  # ============================================================
1775
  # Main Entry Point
1776
  # ============================================================
 
1924
 
1925
  # --- Sample Browser ---
1926
  _render_sample_browser(examples, bench_data, has_adapted)
1927
+
1928
+ # --- MLP vs LoRA cross-architecture comparison (additive, collapsed) ---
1929
+ _render_mlp_comparison(bench_map, all_keys)
sota_agent.py CHANGED
@@ -43,7 +43,7 @@ GURMA_CONTEXT = {
43
  "domain": "Rehabilitation robotics AI — high-precision medical domain",
44
  "data_moat": "15 years of patient outcome data from BAMA Teknoloji "
45
  "(gait dynamics, EMG signals, recovery outcomes — not just motion data)",
46
- "products": ["RoboGate (stationary gait rehab robot)", "FreeGate (5-axis mobile exoskeleton)"],
47
  "architecture": "Privacy-first edge computing — no cloud data exposure",
48
  "regulatory": "EU AI Act (high-risk), MDR, ISO 13485, GDPR/KVKK — 80% safety focus from day one",
49
  "precision_requirement": (
@@ -190,7 +190,7 @@ INITIAL_KNOWLEDGE_BASE = {
190
  "name": "DeepSeek-V3.2",
191
  "params": "varies",
192
  "why": "Sparse attention architecture, efficient inference",
193
- "gurma_fit": "Sparse attention promising for edge deployment on RoboGate/FreeGate",
194
  "status": "watch",
195
  "added": "2026-02-06",
196
  "source": "Lex Fridman Podcast #490",
@@ -246,7 +246,7 @@ INITIAL_KNOWLEDGE_BASE = {
246
  "name": "Sparse Attention",
247
  "category": "efficiency",
248
  "why": "Lightweight token selection indexer; efficient inference for edge deployment",
249
- "gurma_fit": "Could enable on-device models for RoboGate/FreeGate with privacy-first architecture",
250
  "priority": "medium",
251
  "added": "2026-02-06",
252
  "source": "Lex Fridman Podcast #490",
 
43
  "domain": "Rehabilitation robotics AI — high-precision medical domain",
44
  "data_moat": "15 years of patient outcome data from BAMA Teknoloji "
45
  "(gait dynamics, EMG signals, recovery outcomes — not just motion data)",
46
+ "products": ["RoboGait (stationary gait rehab robot)", "FreeGait (5-axis mobile exoskeleton)"],
47
  "architecture": "Privacy-first edge computing — no cloud data exposure",
48
  "regulatory": "EU AI Act (high-risk), MDR, ISO 13485, GDPR/KVKK — 80% safety focus from day one",
49
  "precision_requirement": (
 
190
  "name": "DeepSeek-V3.2",
191
  "params": "varies",
192
  "why": "Sparse attention architecture, efficient inference",
193
+ "gurma_fit": "Sparse attention promising for edge deployment on RoboGait/FreeGait",
194
  "status": "watch",
195
  "added": "2026-02-06",
196
  "source": "Lex Fridman Podcast #490",
 
246
  "name": "Sparse Attention",
247
  "category": "efficiency",
248
  "why": "Lightweight token selection indexer; efficient inference for edge deployment",
249
+ "gurma_fit": "Could enable on-device models for RoboGait/FreeGait with privacy-first architecture",
250
  "priority": "medium",
251
  "added": "2026-02-06",
252
  "source": "Lex Fridman Podcast #490",