XcodeAddy committed on
Commit
4797955
·
1 Parent(s): 2c0b609

Build SENTINEL phase 3 judge demo UI

Browse files
Files changed (2) hide show
  1. app.py +8 -0
  2. static/index.html +507 -22
app.py CHANGED
@@ -88,6 +88,14 @@ def baseline_comparison_chart():
88
  return FileResponse(chart_path, media_type="image/png")
89
 
90
 
 
 
 
 
 
 
 
 
91
  @app.get("/api")
92
  def api_root():
93
  return {
 
88
  return FileResponse(chart_path, media_type="image/png")
89
 
90
 
91
@app.get("/assets/evaluation_results.json")
def evaluation_results():
    """Serve the saved evaluation results as a JSON file.

    Returns:
        A ``FileResponse`` for ``evaluation_results.json`` inside
        ``_OUTPUTS_DIR``, sent with the ``application/json`` media type.

    Raises:
        HTTPException: 404 when the path is missing or is not a regular file.
    """
    results_path = _OUTPUTS_DIR / "evaluation_results.json"
    # is_file() instead of exists(): a directory at this path would pass an
    # exists() check and only fail later while the file response is built,
    # instead of producing the intended 404.
    if not results_path.is_file():
        raise HTTPException(status_code=404, detail="Evaluation results not found.")
    return FileResponse(results_path, media_type="application/json")
97
+
98
+
99
  @app.get("/api")
100
  def api_root():
101
  return {
static/index.html CHANGED
@@ -193,6 +193,50 @@
193
  flex-wrap: wrap;
194
  }
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  .console {
197
  width: min(1540px, 100%);
198
  margin: 0 auto;
@@ -201,12 +245,14 @@
201
  gap: 14px;
202
  grid-template-columns: minmax(420px, 1.35fr) minmax(340px, 0.85fr);
203
  grid-template-areas:
 
204
  "theater command"
205
  "mission playground"
206
  "trust playground"
207
- "story readiness"
208
  "proof events"
209
- "flow themes";
 
210
  align-items: start;
211
  }
212
 
@@ -219,18 +265,24 @@
219
  overflow: hidden;
220
  }
221
 
 
222
  .theater { grid-area: theater; }
223
  .command { grid-area: command; }
224
  .mission { grid-area: mission; }
225
  .trust { grid-area: trust; }
226
  .playground { grid-area: playground; }
227
  .story { grid-area: story; }
 
228
  .readiness { grid-area: readiness; }
229
  .proof { grid-area: proof; }
230
  .events { grid-area: events; }
231
  .flow { grid-area: flow; }
232
  .themes { grid-area: themes; }
233
 
 
 
 
 
234
  .section-head {
235
  min-height: 54px;
236
  display: flex;
@@ -254,6 +306,92 @@
254
  padding: 15px;
255
  }
256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  .chips {
258
  display: flex;
259
  flex-wrap: wrap;
@@ -904,6 +1042,57 @@
904
  background: rgba(10, 12, 8, 0.35);
905
  }
906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
907
  .readiness-list {
908
  display: grid;
909
  gap: 10px;
@@ -1062,12 +1251,14 @@
1062
  .console {
1063
  grid-template-columns: 1fr;
1064
  grid-template-areas:
 
1065
  "theater"
1066
  "command"
1067
  "mission"
1068
  "trust"
1069
  "playground"
1070
  "story"
 
1071
  "readiness"
1072
  "proof"
1073
  "events"
@@ -1095,10 +1286,14 @@
1095
 
1096
  .stage-topline,
1097
  .outcome-strip,
 
 
1098
  .proof-grid,
1099
  .json-grid,
1100
  .playground-meta,
1101
  .story-grid,
 
 
1102
  .flow-line,
1103
  .theme-grid,
1104
  .stats-grid {
@@ -1146,6 +1341,20 @@
1146
  width: 100%;
1147
  }
1148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1149
  .specialist-grid {
1150
  grid-template-columns: 1fr;
1151
  }
@@ -1180,11 +1389,86 @@
1180
  <input id="seedInput" aria-label="Seed" type="number" value="42">
1181
  <button id="resetBtn" class="primary" type="button">Reset Episode</button>
1182
  <button id="swapBtn" class="warn" type="button">Swap Profiles</button>
1183
- <button id="autoBtn" type="button">Auto Policy</button>
1184
  </div>
1185
  </header>
1186
 
 
 
 
 
 
 
 
 
 
 
 
1187
  <main class="console">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1188
  <section class="theater">
1189
  <div class="section-head">
1190
  <h2>Live Trust Theater</h2>
@@ -1399,7 +1683,7 @@
1399
  <div class="story-lane before">
1400
  <div class="story-title">
1401
  <strong>Without SENTINEL</strong>
1402
- <span class="story-score">task3 random 0.699</span>
1403
  </div>
1404
  <div class="story-flow">
1405
  <div class="story-step">All public slots start near the same trust. The orchestrator delegates with weak evidence.</div>
@@ -1412,7 +1696,7 @@
1412
  <div class="story-lane after">
1413
  <div class="story-title">
1414
  <strong>With SENTINEL</strong>
1415
- <span class="story-score">task3 heuristic 0.784</span>
1416
  </div>
1417
  <div class="story-flow">
1418
  <div class="story-step">Behavior updates the TrustLedger after every step, so public slots diverge quickly.</div>
@@ -1426,6 +1710,56 @@
1426
  </div>
1427
  </section>
1428
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1429
  <section class="readiness">
1430
  <div class="section-head">
1431
  <h2>Hackathon Readiness</h2>
@@ -1463,23 +1797,23 @@
1463
  <div class="baseline-table">
1464
  <div class="baseline-row">
1465
  <span>Random</span>
1466
- <div class="mini-bar"><span style="width:71.4%;background:#ff5f45"></span></div>
1467
- <strong>0.714</strong>
1468
  </div>
1469
  <div class="baseline-row">
1470
  <span>Heuristic</span>
1471
- <div class="mini-bar"><span style="width:81.6%;background:#73a7ff"></span></div>
1472
- <strong>0.816</strong>
1473
  </div>
1474
  <div class="baseline-row">
1475
  <span>Oracle-lite</span>
1476
- <div class="mini-bar"><span style="width:87.2%;background:#27e0a1"></span></div>
1477
- <strong>0.872</strong>
1478
  </div>
1479
  <div class="baseline-row">
1480
  <span>T3 detect</span>
1481
- <div class="mini-bar"><span style="width:73.5%;background:#f5ba41"></span></div>
1482
- <strong>0.735</strong>
1483
  </div>
1484
  </div>
1485
  <div class="chart-frame">
@@ -1574,7 +1908,10 @@
1574
  events: [],
1575
  lastRequest: null,
1576
  lastResult: null,
1577
- lastMode: "reset()"
 
 
 
1578
  };
1579
 
1580
  const el = {
@@ -1585,6 +1922,13 @@
1585
  swapBtn: document.getElementById("swapBtn"),
1586
  swapPanelBtn: document.getElementById("swapPanelBtn"),
1587
  autoBtn: document.getElementById("autoBtn"),
 
 
 
 
 
 
 
1588
  specialistSelect: document.getElementById("specialistSelect"),
1589
  recommendChip: document.getElementById("recommendChip"),
1590
  recommendText: document.getElementById("recommendText"),
@@ -1617,7 +1961,23 @@
1617
  leadMove: document.getElementById("leadMove"),
1618
  stageMove: document.getElementById("stageMove"),
1619
  stageSignals: document.getElementById("stageSignals"),
1620
- rewardText: document.getElementById("rewardText")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1621
  };
1622
 
1623
  function trustColor(value) {
@@ -1656,6 +2016,111 @@
1656
  return {type: "delegate", specialist: best, trust};
1657
  }
1658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1659
  function renderTrust() {
1660
  const trust = state.observation?.trust_snapshot || Object.fromEntries(ids.map(id => [id, 0.5]));
1661
  const values = ids.map(id => Number(trust[id] ?? 0.5));
@@ -1766,6 +2231,9 @@
1766
  el.selfBtn.disabled = disabled;
1767
  el.skipBtn.disabled = disabled;
1768
  el.applyRecommendBtn.disabled = disabled;
 
 
 
1769
  }
1770
 
1771
  function render(result) {
@@ -1908,11 +2376,11 @@
1908
  }
1909
  }
1910
 
1911
- async function autoRun() {
1912
  if (!state.observation || state.done) await resetEpisode();
1913
  let guard = 0;
1914
  while (!state.done && guard < 70) {
1915
- const move = recommendedMove();
1916
  await stepEpisode(move.type, move.specialist);
1917
  guard += 1;
1918
  await new Promise(resolve => setTimeout(resolve, 150));
@@ -1924,23 +2392,40 @@
1924
  await stepEpisode(move.type, move.specialist);
1925
  }
1926
 
1927
- async function swapProfiles() {
 
 
 
 
 
 
1928
  const nextSeed = Number(el.seedInput.value || 0) + 1;
1929
  el.seedInput.value = String(nextSeed);
1930
  await resetEpisode();
 
 
 
1931
  }
1932
 
1933
  el.resetBtn.addEventListener("click", resetEpisode);
1934
  el.resetPanelBtn.addEventListener("click", resetEpisode);
1935
- el.swapBtn.addEventListener("click", swapProfiles);
1936
- el.swapPanelBtn.addEventListener("click", swapProfiles);
1937
  el.delegateBtn.addEventListener("click", () => stepEpisode("delegate"));
1938
  el.verifyBtn.addEventListener("click", () => stepEpisode("verify"));
1939
  el.selfBtn.addEventListener("click", () => stepEpisode("solve_independently"));
1940
  el.skipBtn.addEventListener("click", () => stepEpisode("skip"));
1941
- el.autoBtn.addEventListener("click", autoRun);
1942
  el.applyRecommendBtn.addEventListener("click", applyRecommendation);
1943
-
 
 
 
 
 
 
 
 
1944
  render();
1945
  resetEpisode();
1946
  </script>
 
193
  flex-wrap: wrap;
194
  }
195
 
196
+ .modebar {
197
+ padding: 0 22px 14px;
198
+ border-bottom: 1px solid #232920;
199
+ background: rgba(7, 8, 6, 0.9);
200
+ backdrop-filter: blur(14px);
201
+ }
202
+
203
+ .modebar-inner {
204
+ width: min(1540px, 100%);
205
+ margin: 0 auto;
206
+ display: flex;
207
+ align-items: center;
208
+ justify-content: space-between;
209
+ gap: 14px;
210
+ flex-wrap: wrap;
211
+ }
212
+
213
+ .view-tabs {
214
+ display: flex;
215
+ align-items: center;
216
+ gap: 8px;
217
+ flex-wrap: wrap;
218
+ }
219
+
220
+ .view-tab {
221
+ min-height: 38px;
222
+ border-radius: 999px;
223
+ padding: 0 14px;
224
+ background: #10140f;
225
+ }
226
+
227
+ .view-tab.active {
228
+ border-color: rgba(39, 224, 161, 0.72);
229
+ background: linear-gradient(180deg, #1d6a53, #133c30);
230
+ color: #effff7;
231
+ }
232
+
233
+ .view-copy {
234
+ color: var(--muted);
235
+ font-size: 13px;
236
+ line-height: 1.4;
237
+ max-width: 760px;
238
+ }
239
+
240
  .console {
241
  width: min(1540px, 100%);
242
  margin: 0 auto;
 
245
  gap: 14px;
246
  grid-template-columns: minmax(420px, 1.35fr) minmax(340px, 0.85fr);
247
  grid-template-areas:
248
+ "hero hero"
249
  "theater command"
250
  "mission playground"
251
  "trust playground"
252
+ "story judge"
253
  "proof events"
254
+ "flow themes"
255
+ "readiness readiness";
256
  align-items: start;
257
  }
258
 
 
265
  overflow: hidden;
266
  }
267
 
268
+ .hero { grid-area: hero; }
269
  .theater { grid-area: theater; }
270
  .command { grid-area: command; }
271
  .mission { grid-area: mission; }
272
  .trust { grid-area: trust; }
273
  .playground { grid-area: playground; }
274
  .story { grid-area: story; }
275
+ .judge { grid-area: judge; }
276
  .readiness { grid-area: readiness; }
277
  .proof { grid-area: proof; }
278
  .events { grid-area: events; }
279
  .flow { grid-area: flow; }
280
  .themes { grid-area: themes; }
281
 
282
+ .section-hidden {
283
+ display: none;
284
+ }
285
+
286
  .section-head {
287
  min-height: 54px;
288
  display: flex;
 
306
  padding: 15px;
307
  }
308
 
309
+ .hero-grid {
310
+ display: grid;
311
+ grid-template-columns: minmax(0, 1.1fr) minmax(320px, 0.9fr);
312
+ gap: 13px;
313
+ }
314
+
315
+ .hero-panel {
316
+ min-height: 208px;
317
+ border: 1px solid #394132;
318
+ border-radius: 8px;
319
+ padding: 15px;
320
+ background: var(--panel-2);
321
+ }
322
+
323
+ .hero-panel.primary {
324
+ border-color: rgba(39, 224, 161, 0.42);
325
+ background:
326
+ linear-gradient(180deg, rgba(39, 224, 161, 0.12), transparent 45%),
327
+ var(--panel-2);
328
+ }
329
+
330
+ .hero-panel h3,
331
+ .judge-card h3 {
332
+ font-size: 18px;
333
+ color: var(--cream);
334
+ margin: 0 0 10px 0;
335
+ }
336
+
337
+ .hero-panel p {
338
+ color: #e8e1ca;
339
+ font-size: 14px;
340
+ line-height: 1.55;
341
+ }
342
+
343
+ .hero-callouts,
344
+ .hero-steps,
345
+ .judge-list {
346
+ display: grid;
347
+ gap: 9px;
348
+ margin-top: 14px;
349
+ }
350
+
351
+ .hero-callout,
352
+ .hero-step,
353
+ .judge-step {
354
+ min-height: 52px;
355
+ border: 1px solid #394132;
356
+ border-radius: 8px;
357
+ padding: 11px 12px;
358
+ background: #0d100b;
359
+ color: #ebe5cf;
360
+ line-height: 1.42;
361
+ }
362
+
363
+ .hero-callout strong,
364
+ .hero-step strong,
365
+ .judge-step strong {
366
+ display: block;
367
+ margin-bottom: 4px;
368
+ color: var(--cream);
369
+ font-size: 13px;
370
+ }
371
+
372
+ .hero-stats {
373
+ margin-top: 14px;
374
+ display: grid;
375
+ grid-template-columns: repeat(3, minmax(0, 1fr));
376
+ gap: 10px;
377
+ }
378
+
379
+ .hero-stat {
380
+ min-height: 80px;
381
+ border: 1px solid #394132;
382
+ border-radius: 8px;
383
+ padding: 11px;
384
+ background: #0d100b;
385
+ }
386
+
387
+ .hero-stat .label {
388
+ margin-bottom: 7px;
389
+ }
390
+
391
+ .hero-stat .value {
392
+ font-size: 22px;
393
+ }
394
+
395
  .chips {
396
  display: flex;
397
  flex-wrap: wrap;
 
1042
  background: rgba(10, 12, 8, 0.35);
1043
  }
1044
 
1045
+ .judge-grid {
1046
+ display: grid;
1047
+ gap: 12px;
1048
+ }
1049
+
1050
+ .judge-stats {
1051
+ display: grid;
1052
+ grid-template-columns: repeat(3, minmax(0, 1fr));
1053
+ gap: 10px;
1054
+ }
1055
+
1056
+ .judge-card {
1057
+ min-height: 132px;
1058
+ border: 1px solid #394132;
1059
+ border-radius: 8px;
1060
+ padding: 13px;
1061
+ background: var(--panel-2);
1062
+ }
1063
+
1064
+ .judge-card.good {
1065
+ border-color: rgba(39, 224, 161, 0.4);
1066
+ background: var(--jade-soft);
1067
+ }
1068
+
1069
+ .judge-card.warn {
1070
+ border-color: rgba(245, 186, 65, 0.4);
1071
+ background: var(--amber-soft);
1072
+ }
1073
+
1074
+ .judge-card.bad {
1075
+ border-color: rgba(255, 95, 69, 0.4);
1076
+ background: var(--flame-soft);
1077
+ }
1078
+
1079
+ .judge-card .value {
1080
+ font-size: 28px;
1081
+ margin-top: 8px;
1082
+ }
1083
+
1084
+ .judge-card .muted {
1085
+ display: block;
1086
+ margin-top: 6px;
1087
+ line-height: 1.4;
1088
+ }
1089
+
1090
+ .judge-actions {
1091
+ display: grid;
1092
+ grid-template-columns: repeat(3, minmax(0, 1fr));
1093
+ gap: 10px;
1094
+ }
1095
+
1096
  .readiness-list {
1097
  display: grid;
1098
  gap: 10px;
 
1251
  .console {
1252
  grid-template-columns: 1fr;
1253
  grid-template-areas:
1254
+ "hero"
1255
  "theater"
1256
  "command"
1257
  "mission"
1258
  "trust"
1259
  "playground"
1260
  "story"
1261
+ "judge"
1262
  "readiness"
1263
  "proof"
1264
  "events"
 
1286
 
1287
  .stage-topline,
1288
  .outcome-strip,
1289
+ .hero-grid,
1290
+ .hero-stats,
1291
  .proof-grid,
1292
  .json-grid,
1293
  .playground-meta,
1294
  .story-grid,
1295
+ .judge-stats,
1296
+ .judge-actions,
1297
  .flow-line,
1298
  .theme-grid,
1299
  .stats-grid {
 
1341
  width: 100%;
1342
  }
1343
 
1344
+ .modebar {
1345
+ padding: 0 13px 12px;
1346
+ }
1347
+
1348
+ .view-tabs {
1349
+ width: 100%;
1350
+ flex-direction: column;
1351
+ align-items: stretch;
1352
+ }
1353
+
1354
+ .view-tab {
1355
+ width: 100%;
1356
+ }
1357
+
1358
  .specialist-grid {
1359
  grid-template-columns: 1fr;
1360
  }
 
1389
  <input id="seedInput" aria-label="Seed" type="number" value="42">
1390
  <button id="resetBtn" class="primary" type="button">Reset Episode</button>
1391
  <button id="swapBtn" class="warn" type="button">Swap Profiles</button>
1392
+ <button id="autoBtn" type="button">Heuristic Auto</button>
1393
  </div>
1394
  </header>
1395
 
1396
+ <div class="modebar">
1397
+ <div class="modebar-inner">
1398
+ <div class="view-tabs">
1399
+ <button id="viewOverviewBtn" class="view-tab active" type="button">Overview</button>
1400
+ <button id="viewPlaygroundBtn" class="view-tab" type="button">Playground</button>
1401
+ <button id="viewJudgeBtn" class="view-tab" type="button">Judge Demo</button>
1402
+ </div>
1403
+ <div id="viewCopy" class="view-copy">Overview turns the environment into a judge-readable system story: the problem, the learning signal, and the live failure mode it fixes.</div>
1404
+ </div>
1405
+ </div>
1406
+
1407
  <main class="console">
1408
+ <section class="hero">
1409
+ <div class="section-head">
1410
+ <h2>System Overview</h2>
1411
+ <div class="chips">
1412
+ <span class="chip live">reset → step → state</span>
1413
+ <span class="chip">OpenEnv compatible</span>
1414
+ <span class="chip warn">skill, not identity</span>
1415
+ </div>
1416
+ </div>
1417
+ <div class="body">
1418
+ <div class="hero-grid">
1419
+ <div class="hero-panel primary">
1420
+ <h3>What SENTINEL actually teaches</h3>
1421
+ <p>SENTINEL is not training a specialist to solve one domain task. It trains the orchestrator to decide who to trust, when to verify, when to self-solve, and how to recover when one public slot turns unreliable or adversarial inside a long multi-agent task graph.</p>
1422
+ <div class="hero-callouts">
1423
+ <div class="hero-callout">
1424
+ <strong>Observation model</strong>
1425
+ The orchestrator only sees behavior: public slots, trust scores, stakes, step budget, and outcomes.
1426
+ </div>
1427
+ <div class="hero-callout">
1428
+ <strong>Core novelty</strong>
1429
+ Hidden specialist profiles reshuffle every reset, so the agent cannot memorize that S2 or S3 is dangerous.
1430
+ </div>
1431
+ <div class="hero-callout">
1432
+ <strong>Judge takeaway</strong>
1433
+ This environment turns blind agent-to-agent trust into a trainable oversight skill.
1434
+ </div>
1435
+ </div>
1436
+ <div class="hero-stats">
1437
+ <div class="hero-stat">
1438
+ <div class="label">Random overall</div>
1439
+ <div id="heroRandomScore" class="value">0.714</div>
1440
+ </div>
1441
+ <div class="hero-stat">
1442
+ <div class="label">Heuristic overall</div>
1443
+ <div id="heroHeuristicScore" class="value">0.816</div>
1444
+ </div>
1445
+ <div class="hero-stat">
1446
+ <div class="label">Task 3 detect</div>
1447
+ <div id="heroDetectionScore" class="value">0.735</div>
1448
+ </div>
1449
+ </div>
1450
+ </div>
1451
+ <div class="hero-panel">
1452
+ <h3>How to test this fast</h3>
1453
+ <div class="hero-steps">
1454
+ <div class="hero-step">
1455
+ <strong>1. Overview mode</strong>
1456
+ Read the before/after lanes and reward proof. This tells the story in judge language.
1457
+ </div>
1458
+ <div class="hero-step">
1459
+ <strong>2. Playground mode</strong>
1460
+ Reset an episode, click Heuristic Auto, and watch the API payloads, trust bars, and reward stream update.
1461
+ </div>
1462
+ <div class="hero-step">
1463
+ <strong>3. Judge Demo mode</strong>
1464
+ Run Random, then Heuristic, then Swap + Replay. That is the live finale sequence.
1465
+ </div>
1466
+ </div>
1467
+ </div>
1468
+ </div>
1469
+ </div>
1470
+ </section>
1471
+
1472
  <section class="theater">
1473
  <div class="section-head">
1474
  <h2>Live Trust Theater</h2>
 
1683
  <div class="story-lane before">
1684
  <div class="story-title">
1685
  <strong>Without SENTINEL</strong>
1686
+ <span id="storyBeforeScore" class="story-score">task3 random 0.699</span>
1687
  </div>
1688
  <div class="story-flow">
1689
  <div class="story-step">All public slots start near the same trust. The orchestrator delegates with weak evidence.</div>
 
1696
  <div class="story-lane after">
1697
  <div class="story-title">
1698
  <strong>With SENTINEL</strong>
1699
+ <span id="storyAfterScore" class="story-score">task3 heuristic 0.784</span>
1700
  </div>
1701
  <div class="story-flow">
1702
  <div class="story-step">Behavior updates the TrustLedger after every step, so public slots diverge quickly.</div>
 
1710
  </div>
1711
  </section>
1712
 
1713
+ <section class="judge">
1714
+ <div class="section-head">
1715
+ <h2>Judge Demo Rail</h2>
1716
+ <div class="chips">
1717
+ <span class="chip live">3-minute flow</span>
1718
+ <span class="chip">one-click policies</span>
1719
+ </div>
1720
+ </div>
1721
+ <div class="body">
1722
+ <div class="judge-grid">
1723
+ <div class="judge-stats">
1724
+ <div class="judge-card bad">
1725
+ <div class="label">Random baseline</div>
1726
+ <div id="judgeRandomScore" class="value">0.714</div>
1727
+ <span class="muted">Blind delegation baseline. Good enough to move, weak at skepticism.</span>
1728
+ </div>
1729
+ <div class="judge-card warn">
1730
+ <div class="label">Heuristic policy</div>
1731
+ <div id="judgeHeuristicScore" class="value">0.816</div>
1732
+ <span class="muted">Trust-weighted routing plus verification at risky gates.</span>
1733
+ </div>
1734
+ <div class="judge-card good">
1735
+ <div class="label">Task 3 detection</div>
1736
+ <div id="judgeDetectionScore" class="value">0.735</div>
1737
+ <span class="muted">Adversarial detections before poison can cascade into later nodes.</span>
1738
+ </div>
1739
+ </div>
1740
+ <div class="judge-actions">
1741
+ <button id="randomPolicyBtn" class="danger" type="button">Run Random</button>
1742
+ <button id="heuristicPolicyBtn" class="primary" type="button">Run Heuristic</button>
1743
+ <button id="judgeSwapBtn" class="warn" type="button">Swap + Replay</button>
1744
+ </div>
1745
+ <div class="judge-list">
1746
+ <div class="judge-step">
1747
+ <strong>Step 1 — show the failure</strong>
1748
+ Run Random to show how similar-looking trust scores lead to brittle routing and weak detection.
1749
+ </div>
1750
+ <div class="judge-step">
1751
+ <strong>Step 2 — show the learned behavior</strong>
1752
+ Run Heuristic to show trust divergence, verification at risky gates, and cleaner recovery.
1753
+ </div>
1754
+ <div class="judge-step">
1755
+ <strong>Step 3 — show generalization</strong>
1756
+ Hit Swap + Replay so hidden roles reshuffle and the orchestrator has to learn from fresh evidence again.
1757
+ </div>
1758
+ </div>
1759
+ </div>
1760
+ </div>
1761
+ </section>
1762
+
1763
  <section class="readiness">
1764
  <div class="section-head">
1765
  <h2>Hackathon Readiness</h2>
 
1797
  <div class="baseline-table">
1798
  <div class="baseline-row">
1799
  <span>Random</span>
1800
+ <div class="mini-bar"><span id="proofRandomBar" style="width:71.4%;background:#ff5f45"></span></div>
1801
+ <strong id="proofRandomScore">0.714</strong>
1802
  </div>
1803
  <div class="baseline-row">
1804
  <span>Heuristic</span>
1805
+ <div class="mini-bar"><span id="proofHeuristicBar" style="width:81.6%;background:#73a7ff"></span></div>
1806
+ <strong id="proofHeuristicScore">0.816</strong>
1807
  </div>
1808
  <div class="baseline-row">
1809
  <span>Oracle-lite</span>
1810
+ <div class="mini-bar"><span id="proofOracleBar" style="width:87.2%;background:#27e0a1"></span></div>
1811
+ <strong id="proofOracleScore">0.872</strong>
1812
  </div>
1813
  <div class="baseline-row">
1814
  <span>T3 detect</span>
1815
+ <div class="mini-bar"><span id="proofDetectBar" style="width:73.5%;background:#f5ba41"></span></div>
1816
+ <strong id="proofDetectScore">0.735</strong>
1817
  </div>
1818
  </div>
1819
  <div class="chart-frame">
 
1908
  events: [],
1909
  lastRequest: null,
1910
  lastResult: null,
1911
+ lastMode: "reset()",
1912
+ view: "overview",
1913
+ evaluation: null,
1914
+ demoPolicy: "heuristic"
1915
  };
1916
 
1917
  const el = {
 
1922
  swapBtn: document.getElementById("swapBtn"),
1923
  swapPanelBtn: document.getElementById("swapPanelBtn"),
1924
  autoBtn: document.getElementById("autoBtn"),
1925
+ viewOverviewBtn: document.getElementById("viewOverviewBtn"),
1926
+ viewPlaygroundBtn: document.getElementById("viewPlaygroundBtn"),
1927
+ viewJudgeBtn: document.getElementById("viewJudgeBtn"),
1928
+ viewCopy: document.getElementById("viewCopy"),
1929
+ randomPolicyBtn: document.getElementById("randomPolicyBtn"),
1930
+ heuristicPolicyBtn: document.getElementById("heuristicPolicyBtn"),
1931
+ judgeSwapBtn: document.getElementById("judgeSwapBtn"),
1932
  specialistSelect: document.getElementById("specialistSelect"),
1933
  recommendChip: document.getElementById("recommendChip"),
1934
  recommendText: document.getElementById("recommendText"),
 
1961
  leadMove: document.getElementById("leadMove"),
1962
  stageMove: document.getElementById("stageMove"),
1963
  stageSignals: document.getElementById("stageSignals"),
1964
+ rewardText: document.getElementById("rewardText"),
1965
+ heroRandomScore: document.getElementById("heroRandomScore"),
1966
+ heroHeuristicScore: document.getElementById("heroHeuristicScore"),
1967
+ heroDetectionScore: document.getElementById("heroDetectionScore"),
1968
+ storyBeforeScore: document.getElementById("storyBeforeScore"),
1969
+ storyAfterScore: document.getElementById("storyAfterScore"),
1970
+ judgeRandomScore: document.getElementById("judgeRandomScore"),
1971
+ judgeHeuristicScore: document.getElementById("judgeHeuristicScore"),
1972
+ judgeDetectionScore: document.getElementById("judgeDetectionScore"),
1973
+ proofRandomBar: document.getElementById("proofRandomBar"),
1974
+ proofRandomScore: document.getElementById("proofRandomScore"),
1975
+ proofHeuristicBar: document.getElementById("proofHeuristicBar"),
1976
+ proofHeuristicScore: document.getElementById("proofHeuristicScore"),
1977
+ proofOracleBar: document.getElementById("proofOracleBar"),
1978
+ proofOracleScore: document.getElementById("proofOracleScore"),
1979
+ proofDetectBar: document.getElementById("proofDetectBar"),
1980
+ proofDetectScore: document.getElementById("proofDetectScore")
1981
  };
1982
 
1983
  function trustColor(value) {
 
2016
  return {type: "delegate", specialist: best, trust};
2017
  }
2018
 
2019
/**
 * Build a "delegate" move aimed at a randomly chosen specialist.
 *
 * With no observation loaded, returns a delegate move to "S0" at trust 0.5.
 * Otherwise picks from `available_specialists` (or the global `ids` list when
 * that field is missing) and reports the trust recorded for the chosen slot,
 * defaulting to 0.5 when the snapshot has no entry for it.
 */
function randomMove() {
  const observation = state.observation;
  if (!observation) {
    return {type: "delegate", specialist: "S0", trust: 0.5};
  }

  const pool = observation.available_specialists || ids;
  const index = Math.floor(Math.random() * pool.length);
  // An empty pool yields undefined here, so fall back to "S0".
  const specialist = pool[index] || "S0";
  const trust = observation.trust_snapshot?.[specialist] ?? 0.5;
  return {type: "delegate", specialist, trust};
}
2026
+
2027
/**
 * Switch the page to one of the view modes ("overview", "playground", "judge").
 *
 * Records the view on `state`, hides every <section> that is not listed for
 * that view, highlights the matching tab button, and swaps the explanatory
 * copy (falling back to the overview copy for an unrecognised view).
 */
function setView(view) {
  // Section class name -> views in which that section stays visible.
  const visibleIn = {
    hero: ["overview", "judge"],
    theater: ["playground", "judge"],
    command: ["playground", "judge"],
    mission: ["playground", "judge"],
    trust: ["playground", "judge"],
    playground: ["playground", "judge"],
    story: ["overview", "judge"],
    judge: ["judge"],
    readiness: ["overview"],
    proof: ["overview", "judge"],
    events: ["playground", "judge"],
    flow: ["overview"],
    themes: ["overview"]
  };

  const blurbs = {
    overview: "Overview turns the environment into a judge-readable system story: the problem, the learning signal, and the live failure mode it fixes.",
    playground: "Playground is the backend-visible mode: every reset() and step() payload is shown so you can understand exactly what the environment returns.",
    judge: "Judge Demo is the fast pitch mode: show baseline failure, show heuristic recovery, then swap profiles to prove the agent learned a skill instead of an identity."
  };

  state.view = view;

  for (const [name, views] of Object.entries(visibleIn)) {
    const section = document.querySelector(`section.${name}`);
    // Sections missing from the DOM are simply skipped.
    if (section) {
      section.classList.toggle("section-hidden", !views.includes(view));
    }
  }

  const tabs = [
    [el.viewOverviewBtn, "overview"],
    [el.viewPlaygroundBtn, "playground"],
    [el.viewJudgeBtn, "judge"]
  ];
  for (const [button, key] of tabs) {
    button.classList.toggle("active", view === key);
  }

  if (el.viewCopy) {
    el.viewCopy.textContent = blurbs[view] || blurbs.overview;
  }
}
2064
+
2065
/**
 * Fetch the evaluation results JSON, store it on `state.evaluation`, and
 * re-render the metric widgets.
 *
 * Best-effort: a failed request, a non-OK status, or a parse/render error is
 * logged with console.warn and otherwise ignored, leaving the page's existing
 * values in place.
 */
async function loadEvaluation() {
  try {
    const res = await fetch("/assets/evaluation_results.json");
    if (!res.ok) {
      throw new Error("evaluation asset missing");
    }
    const payload = await res.json();
    state.evaluation = payload;
    renderEvaluation();
  } catch (err) {
    console.warn("Failed to load evaluation results", err);
  }
}
2075
+
2076
/**
 * Write a metric into `node.textContent`, formatted with a fixed number of
 * decimals (3 by default).
 *
 * Does nothing when the node is missing or the value is undefined, null, or
 * not convertible to a number, so the node keeps whatever text it already had.
 */
function setMetricText(node, value, digits = 3) {
  if (!node) return;
  if (value === undefined || value === null) return;

  const numeric = Number(value);
  if (Number.isNaN(numeric)) return;

  node.textContent = numeric.toFixed(digits);
}
2080
+
2081
/**
 * Set a bar element's width from a metric, treating the value as a fraction:
 * it is multiplied by 100 and clamped to the 0-100 range before being written
 * as a percentage.
 *
 * Does nothing when the node is missing or the value is undefined, null, or
 * not convertible to a number.
 */
function setMetricBar(node, value) {
  if (!node) return;
  if (value === undefined || value === null) return;

  const numeric = Number(value);
  if (Number.isNaN(numeric)) return;

  const scaled = numeric * 100;
  const percent = scaled < 0 ? 0 : (scaled > 100 ? 100 : scaled);
  node.style.width = `${percent}%`;
}
2085
+
2086
/**
 * Push the loaded evaluation results into the hero, story, judge, and proof
 * widgets.
 *
 * Reads `state.evaluation` and returns immediately when nothing is loaded.
 * Overall figures come from `summary.random`, `summary.heuristic`, and
 * `summary.oracle_lite`; task-3 figures come from `by_task.task3`. Any metric
 * that is missing, null, or non-numeric is skipped, so the corresponding
 * element keeps the value already present in the markup.
 */
function renderEvaluation() {
  const data = state.evaluation;
  if (!data) return;

  const overall = data.summary || {};
  const task3 = data.by_task?.task3 || {};
  const random = overall.random || {};
  const heuristic = overall.heuristic || {};
  const oracle = overall.oracle_lite || {};
  const task3Random = task3.random || {};
  const task3Heuristic = task3.heuristic || {};

  // Story labels carry a text prefix, so they cannot go through setMetricText,
  // but they apply the same guard: checking only `!== undefined` would render
  // a JSON null as "0.000" and a non-numeric value as "NaN".
  const setStoryLabel = (node, prefix, value) => {
    if (!node || value === undefined || value === null || Number.isNaN(Number(value))) return;
    node.textContent = `${prefix} ${Number(value).toFixed(3)}`;
  };

  setMetricText(el.heroRandomScore, random.avg_score);
  setMetricText(el.heroHeuristicScore, heuristic.avg_score);
  setMetricText(el.heroDetectionScore, task3Heuristic.avg_detection_rate);

  setStoryLabel(el.storyBeforeScore, "task3 random", task3Random.avg_score);
  setStoryLabel(el.storyAfterScore, "task3 heuristic", task3Heuristic.avg_score);

  setMetricText(el.judgeRandomScore, random.avg_score);
  setMetricText(el.judgeHeuristicScore, heuristic.avg_score);
  setMetricText(el.judgeDetectionScore, task3Heuristic.avg_detection_rate);

  setMetricBar(el.proofRandomBar, random.avg_score);
  setMetricBar(el.proofHeuristicBar, heuristic.avg_score);
  setMetricBar(el.proofOracleBar, oracle.avg_score);
  setMetricBar(el.proofDetectBar, task3Heuristic.avg_detection_rate);

  setMetricText(el.proofRandomScore, random.avg_score);
  setMetricText(el.proofHeuristicScore, heuristic.avg_score);
  setMetricText(el.proofOracleScore, oracle.avg_score);
  setMetricText(el.proofDetectScore, task3Heuristic.avg_detection_rate);
}
2123
+
2124
  function renderTrust() {
2125
  const trust = state.observation?.trust_snapshot || Object.fromEntries(ids.map(id => [id, 0.5]));
2126
  const values = ids.map(id => Number(trust[id] ?? 0.5));
 
2231
  el.selfBtn.disabled = disabled;
2232
  el.skipBtn.disabled = disabled;
2233
  el.applyRecommendBtn.disabled = disabled;
2234
+ if (el.randomPolicyBtn) el.randomPolicyBtn.disabled = state.running;
2235
+ if (el.heuristicPolicyBtn) el.heuristicPolicyBtn.disabled = state.running;
2236
+ if (el.judgeSwapBtn) el.judgeSwapBtn.disabled = state.running;
2237
  }
2238
 
2239
  function render(result) {
 
2376
  }
2377
  }
2378
 
2379
+ async function autoRun(policy = state.demoPolicy) {
2380
  if (!state.observation || state.done) await resetEpisode();
2381
  let guard = 0;
2382
  while (!state.done && guard < 70) {
2383
+ const move = policy === "random" ? randomMove() : recommendedMove();
2384
  await stepEpisode(move.type, move.specialist);
2385
  guard += 1;
2386
  await new Promise(resolve => setTimeout(resolve, 150));
 
2392
  await stepEpisode(move.type, move.specialist);
2393
  }
2394
 
2395
/**
 * Run one demo episode from a fresh reset under the given policy.
 *
 * The policy is also stored in `state.demoPolicy`, which is what autoRun()
 * defaults to and what the judge "Swap + Replay" button replays.
 */
async function runPolicy(policy) {
  state.demoPolicy = policy;

  // Always start from a clean episode before letting the policy play it out.
  await resetEpisode();
  await autoRun(policy);
}
2400
+
2401
/**
 * Advance the seed input by one and reset the episode with it.
 *
 * When a policy is supplied, the new episode is then auto-played with that
 * policy; with the default (null) the function stops after the reset.
 */
async function swapProfiles(policy = null) {
  // An empty seed field counts as 0, so the first swap moves it to 1.
  const currentSeed = Number(el.seedInput.value || 0);
  el.seedInput.value = String(currentSeed + 1);

  await resetEpisode();
  if (!policy) return;

  await autoRun(policy);
}
2409
 
2410
  el.resetBtn.addEventListener("click", resetEpisode);
2411
  el.resetPanelBtn.addEventListener("click", resetEpisode);
2412
+ el.swapBtn.addEventListener("click", () => swapProfiles());
2413
+ el.swapPanelBtn.addEventListener("click", () => swapProfiles());
2414
  el.delegateBtn.addEventListener("click", () => stepEpisode("delegate"));
2415
  el.verifyBtn.addEventListener("click", () => stepEpisode("verify"));
2416
  el.selfBtn.addEventListener("click", () => stepEpisode("solve_independently"));
2417
  el.skipBtn.addEventListener("click", () => stepEpisode("skip"));
2418
+ el.autoBtn.addEventListener("click", () => autoRun("heuristic"));
2419
  el.applyRecommendBtn.addEventListener("click", applyRecommendation);
2420
+ el.viewOverviewBtn.addEventListener("click", () => setView("overview"));
2421
+ el.viewPlaygroundBtn.addEventListener("click", () => setView("playground"));
2422
+ el.viewJudgeBtn.addEventListener("click", () => setView("judge"));
2423
+ el.randomPolicyBtn.addEventListener("click", () => runPolicy("random"));
2424
+ el.heuristicPolicyBtn.addEventListener("click", () => runPolicy("heuristic"));
2425
+ el.judgeSwapBtn.addEventListener("click", () => swapProfiles(state.demoPolicy));
2426
+
2427
+ setView("overview");
2428
+ loadEvaluation();
2429
  render();
2430
  resetEpisode();
2431
  </script>