| <!DOCTYPE html> |
| <html lang="en"> |
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| <title>ellamind base-eval</title> |
| <script src="https://cdn.plot.ly/plotly-2.35.2.min.js" charset="utf-8"></script> |
| <script type="importmap"> |
| { |
| "imports": { |
| "@huggingface/hub": "https://cdn.jsdelivr.net/npm/@huggingface/hub@0.21.0/+esm" |
| } |
| } |
| </script> |
| <style> |
| * { box-sizing: border-box; margin: 0; padding: 0; } |
| body { |
| font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; |
| background: #f8f9fa; |
| color: #1a1a2e; |
| padding: 24px; |
| } |
| |
| |
| .page-header { |
| text-align: center; |
| margin-bottom: 24px; |
| } |
| .page-header h1 { |
| font-size: 1.5rem; |
| font-weight: 600; |
| color: #1a1a2e; |
| } |
| .btn { |
| padding: 8px 16px; |
| border: 1px solid #dee2e6; |
| border-radius: 6px; |
| background: #fff; |
| font-size: 0.875rem; |
| color: #495057; |
| cursor: pointer; |
| transition: background 0.15s; |
| } |
| .btn:hover { background: #e9ecef; } |
| .btn-primary { |
| background: #4361ee; |
| color: #fff; |
| border-color: #4361ee; |
| } |
| .btn-primary:hover { background: #3a56d4; } |
| .btn-sm { |
| padding: 4px 10px; |
| font-size: 0.75rem; |
| } |
| .btn-danger { color: #e63946; border-color: #e6394640; } |
| .btn-danger:hover { background: #e6394610; } |
| |
| |
| #panels-container { |
| display: grid; |
| grid-template-columns: 1fr; |
| gap: 20px; |
| } |
| |
| |
| .panel { |
| background: #fff; |
| border: 1px solid #dee2e6; |
| border-radius: 8px; |
| overflow: hidden; |
| position: relative; |
| width: 1400px; |
| max-width: 100%; |
| margin: 0 auto; |
| } |
| .panel-toolbar { |
| display: flex; |
| align-items: center; |
| justify-content: flex-end; |
| gap: 6px; |
| padding: 6px 10px; |
| border-bottom: 1px solid #dee2e6; |
| background: #f8f9fa; |
| } |
| .panel-controls { |
| padding: 16px; |
| border-bottom: 1px solid #dee2e6; |
| } |
| .panel-controls.collapsed { display: none; } |
| .controls-row { |
| display: flex; |
| flex-wrap: wrap; |
| gap: 12px; |
| align-items: flex-end; |
| } |
| .controls-row + .controls-row { margin-top: 12px; } |
| .control-group { |
| display: flex; |
| flex-direction: column; |
| gap: 4px; |
| } |
| .control-group label { |
| font-size: 0.7rem; |
| font-weight: 600; |
| text-transform: uppercase; |
| letter-spacing: 0.05em; |
| color: #6c757d; |
| } |
| select { |
| padding: 6px 10px; |
| border: 1px solid #dee2e6; |
| border-radius: 6px; |
| background: #fff; |
| font-size: 0.8rem; |
| color: #1a1a2e; |
| min-width: 160px; |
| cursor: pointer; |
| } |
| select:focus { |
| outline: none; |
| border-color: #4361ee; |
| box-shadow: 0 0 0 3px rgba(67, 97, 238, 0.15); |
| } |
| |
| |
| .models-section { |
| margin-top: 12px; |
| } |
| .models-header { |
| display: flex; |
| align-items: center; |
| gap: 8px; |
| margin-bottom: 8px; |
| } |
| .models-header span { |
| font-size: 0.7rem; |
| font-weight: 600; |
| text-transform: uppercase; |
| letter-spacing: 0.05em; |
| color: #6c757d; |
| } |
| .checkbox-grid { |
| display: flex; |
| flex-wrap: wrap; |
| gap: 6px 16px; |
| } |
| .checkbox-item { |
| display: flex; |
| align-items: center; |
| gap: 5px; |
| cursor: pointer; |
| font-size: 0.8rem; |
| } |
| .checkbox-item input[type="checkbox"] { |
| width: 14px; |
| height: 14px; |
| cursor: pointer; |
| accent-color: #4361ee; |
| } |
| .checkbox-item .color-dot { |
| cursor: pointer; |
| border: 1px solid transparent; |
| transition: border-color 0.15s; |
| } |
| .checkbox-item .color-dot:hover { |
| border-color: #888; |
| } |
| .checkbox-item .model-name.missing { |
| text-decoration: line-through; |
| opacity: 0.5; |
| cursor: help; |
| } |
| .checkbox-item .model-name.missing:hover { |
| opacity: 0.8; |
| } |
| .model-separator { |
| width: 100%; |
| border-top: 1px solid #eee; |
| margin: 4px 0; |
| } |
| |
| |
| .panel-chart-wrapper { |
| position: relative; |
| } |
| .panel-chart { |
| min-height: 100px; |
| overflow: hidden; |
| } |
| .title-hover-zone { |
| position: absolute; |
| top: 0; |
| left: 50px; |
| right: 50px; |
| height: 40px; |
| cursor: pointer; |
| z-index: 10; |
| display: flex; |
| align-items: center; |
| justify-content: center; |
| pointer-events: none; |
| } |
| .title-hover-zone > * { |
| pointer-events: auto; |
| } |
| .title-info-icon { |
| position: absolute; |
| top: 50%; |
| transform: translateY(-50%); |
| width: 18px; |
| height: 18px; |
| border-radius: 50%; |
| background: #e9ecef; |
| color: #495057; |
| font-size: 11px; |
| font-weight: 600; |
| display: flex; |
| align-items: center; |
| justify-content: center; |
| opacity: 0.6; |
| transition: opacity 0.15s; |
| } |
| .title-hover-zone:hover .title-info-icon { |
| opacity: 1; |
| } |
| |
| |
| .task-stats { |
| display: flex; |
| flex-direction: column; |
| padding: 6px 16px; |
| border-top: 1px solid #dee2e6; |
| background: #f8f9fa; |
| font-size: 0.72rem; |
| color: #495057; |
| gap: 3px; |
| } |
| .task-stats:empty { display: none; } |
| .stat-row { |
| display: flex; |
| flex-wrap: wrap; |
| align-items: center; |
| gap: 4px 12px; |
| } |
| .stat-stage { |
| font-weight: 700; |
| text-transform: uppercase; |
| letter-spacing: 0.04em; |
| color: #495057; |
| min-width: 52px; |
| font-size: 0.68rem; |
| } |
| .stat-item { |
| display: flex; |
| align-items: center; |
| gap: 3px; |
| } |
| .stat-label { |
| font-weight: 600; |
| text-transform: uppercase; |
| letter-spacing: 0.03em; |
| color: #6c757d; |
| font-size: 0.68rem; |
| } |
| .stat-value { |
| font-variant-numeric: tabular-nums; |
| } |
| .stat-value.good { color: #2a9d8f; } |
| .stat-value.ok { color: #e9c46a; } |
| .stat-value.bad { color: #e63946; } |
| .stat-help { |
| display: inline-block; |
| width: 14px; |
| height: 14px; |
| border-radius: 50%; |
| background: #e9ecef; |
| color: #6c757d; |
| font-size: 9px; |
| font-weight: 700; |
| text-align: center; |
| line-height: 14px; |
| cursor: help; |
| } |
| |
| |
| .panel-width-handle { |
| position: absolute; |
| top: 0; |
| right: -4px; |
| width: 8px; |
| height: 100%; |
| cursor: ew-resize; |
| z-index: 20; |
| transition: background-color 0.15s; |
| } |
| .panel-width-handle:hover, |
| .panel-width-handle.active { |
| background-color: #e9ecef; |
| } |
| .panel-resize-handle { |
| height: 6px; |
| cursor: ns-resize; |
| background: linear-gradient(to bottom, #dee2e6 1px, transparent 1px, transparent 3px, #dee2e6 3px); |
| background-size: 100% 4px; |
| background-position: center; |
| transition: background-color 0.15s; |
| } |
| .panel-resize-handle:hover, |
| .panel-resize-handle.active { |
| background-color: #e9ecef; |
| } |
| .loading { |
| display: flex; |
| align-items: center; |
| justify-content: center; |
| padding: 1rem 0; |
| color: #adb5bd; |
| font-size: 0.85rem; |
| } |
| |
| |
| .custom-tooltip { |
| position: fixed; |
| pointer-events: none; |
| background: rgba(0, 0, 0, 0.85); |
| color: #fff; |
| padding: 8px 12px 12px; |
| border-radius: 4px; |
| font-size: 11px; |
| line-height: 1.5; |
| z-index: 9999; |
| display: none; |
| white-space: nowrap; |
| } |
| .custom-tooltip.scrollable { |
| pointer-events: auto; |
| overflow-y: auto; |
| white-space: normal; |
| min-width: 200px; |
| max-width: 400px; |
| } |
| |
| |
| .merge-dataset-row { |
| display: flex; |
| align-items: center; |
| justify-content: center; |
| gap: 8px; |
| margin-bottom: 16px; |
| flex-wrap: wrap; |
| } |
| .merge-dataset-row input[type="text"] { |
| padding: 6px 10px; |
| border: 1px solid #dee2e6; |
| border-radius: 6px; |
| font-size: 0.8rem; |
| color: #1a1a2e; |
| width: 420px; |
| max-width: 60vw; |
| } |
| .merge-dataset-row input[type="text"]:focus, |
| .hf-auth-row input[type="password"]:focus { |
| outline: none; |
| border-color: #4361ee; |
| box-shadow: 0 0 0 3px rgba(67, 97, 238, 0.15); |
| } |
| .hf-auth-row input[type="password"] { |
| padding: 6px 10px; |
| border: 1px solid #dee2e6; |
| border-radius: 6px; |
| font-size: 0.8rem; |
| color: #1a1a2e; |
| width: 420px; |
| max-width: 50vw; |
| } |
| .hf-auth-row { |
| display: flex; |
| align-items: center; |
| justify-content: center; |
| gap: 8px; |
| margin-bottom: 8px; |
| } |
| .hf-auth-row .hf-user { |
| font-size: 0.8rem; |
| color: #495057; |
| } |
| .hf-signin-img { |
| cursor: pointer; |
| height: 28px; |
| } |
| .hf-auth-row label { |
| font-size: 0.75rem; |
| color: #6c757d; |
| cursor: pointer; |
| display: flex; |
| align-items: center; |
| gap: 4px; |
| } |
| .hf-auth-row label input { |
| accent-color: #4361ee; |
| } |
| .merge-dataset-row .merge-status { |
| font-size: 0.75rem; |
| color: #6c757d; |
| } |
| .merge-dataset-row .merge-status.error { |
| color: #e63946; |
| } |
| .merged-tags { |
| display: flex; |
| flex-wrap: wrap; |
| gap: 6px; |
| justify-content: center; |
| margin-bottom: 12px; |
| } |
| .merged-tag { |
| display: inline-flex; |
| align-items: center; |
| gap: 4px; |
| padding: 3px 10px; |
| background: #e9ecef; |
| border-radius: 12px; |
| font-size: 0.75rem; |
| color: #495057; |
| } |
| .merged-tag button { |
| background: none; |
| border: none; |
| cursor: pointer; |
| color: #e63946; |
| font-size: 0.85rem; |
| line-height: 1; |
| padding: 0 2px; |
| } |
| .merged-tag button:hover { color: #c5303c; } |
| |
| |
| .add-panel-row { |
| display: flex; |
| justify-content: center; |
| padding: 20px; |
| } |
| |
| |
| #init-loading { |
| display: flex; |
| align-items: center; |
| justify-content: center; |
| height: 300px; |
| color: #6c757d; |
| font-size: 1rem; |
| } |
| </style> |
| </head> |
| <body> |
| <div class="page-header"> |
| <h1>ellamind base-eval</h1> |
| <p style="margin:4px 0 0;font-size:13px;color:#6c757d;">Benchmarks: <a href="https://github.com/ellamind/base-eval" target="_blank" rel="noopener" style="color:#4361ee;">ellamind/base-eval</a> · Data: <a href="https://huggingface.co/datasets/ellamind/eval-scores-ref" target="_blank" rel="noopener" style="color:#4361ee;">ellamind/eval-scores-ref</a></p> |
| </div> |
|
|
| <div id="init-loading">Initializing DuckDB...</div> |
|
|
<!-- Hugging Face auth controls. The row and each child start with display:none;
     presumably the module script reveals the ones that apply. -->
<div class="hf-auth-row" id="hf-auth-row" style="display:none">
  <!-- NOTE(review): this image is styled as a clickable control (cursor:pointer).
       A real <button> wrapping the image would make it keyboard-operable; left as an
       <img> here because script outside this view may depend on the element type. -->
  <img id="hf-signin-btn" class="hf-signin-img" src="https://huggingface.co/datasets/huggingface/badges/resolve/main/sign-in-with-huggingface-sm-dark.svg" alt="Sign in with Hugging Face" style="display:none">
  <label id="hf-private-label" style="display:none"><input type="checkbox" id="hf-private-toggle"> Include private repos</label>
  <span id="hf-user" class="hf-user" style="display:none"></span>
  <button type="button" class="btn btn-sm" id="hf-signout-btn" style="display:none">Sign out</button>
  <!-- aria-label supplies the accessible name the placeholder alone does not provide;
       autocomplete/spellcheck keep the browser from storing or spell-checking a token. -->
  <input type="password" id="hf-token-input" placeholder="HF token (for private datasets)" aria-label="Hugging Face access token (for private datasets)" autocomplete="off" spellcheck="false" style="display:none">
</div>
<!-- Merge-dataset controls: dataset path input, trigger button and a status line. -->
<div class="merge-dataset-row" id="merge-dataset-row" style="display:none">
  <input type="text" id="merge-dataset-input" placeholder="HF dataset path, e.g. org/dataset-name or org/dataset-name/file.parquet" aria-label="Hugging Face dataset path to merge" autocomplete="off" spellcheck="false">
  <button type="button" class="btn btn-primary btn-sm" id="btn-merge-dataset">Merge Dataset</button>
  <!-- role="status" makes text written into this span an announced (polite) live region. -->
  <span class="merge-status" id="merge-status" role="status"></span>
</div>
| <div class="merged-tags" id="merged-tags"></div> |
|
|
| <div id="panels-container"></div> |
| <div class="custom-tooltip" id="custom-tooltip"></div> |
|
|
| <div class="add-panel-row" id="add-panel-row" style="display:none"> |
| <button class="btn btn-primary" id="btn-add-panel">+ Add Panel</button> |
| </div> |
|
|
| <script type="module"> |
| import * as duckdb from 'https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@1.29.0/+esm'; |
| import jsyaml from 'https://cdn.jsdelivr.net/npm/js-yaml@4.1.0/+esm'; |
| import { oauthLoginUrl, oauthHandleRedirectIfPresent } from '@huggingface/hub'; |
| |
| |
// ---- Shared module state ---------------------------------------------------

// Hugging Face access token; null until set (the code that sets it is outside this section).
let hfAccessToken = null;

// DuckDB-WASM database handle and its open connection; both are assigned by initDuckDB().
let db = null,
    conn = null;

// Registry of live panels keyed by panel id (Panel.remove() deletes its own entry),
// plus a running counter — presumably the source of new panel ids.
const panels = new Map();
let panelCounter = 0;

// Fallback colours; loadModels() cycles through these for models without a configured colour.
const COLOR_PALETTE = [
  '#4361ee', '#e63946', '#2a9d8f', '#e9c46a', '#f4a261',
  '#264653', '#7209b7', '#06d6a0', '#ef476f', '#ff6b6b',
  '#48bfe3', '#d4a017', '#b5838d', '#588157', '#9d4edd',
  '#f77f00', '#3a86a7', '#8338ec', '#ff006e', '#fb5607',
];

// Reference scores file that loadParquet() downloads and exposes as the `scores` view.
const PARQUET_URL =
  'https://huggingface.co/datasets/ellamind/eval-scores-ref/resolve/main/scores.parquet';

// Filled in by loadModels() (model rows, display-name -> colour) and loadConfig() (parsed YAML).
let ALL_MODELS = [];
let MODEL_COLORS = {};
let CONFIG = {};

// Bookkeeping for merged datasets; the code that uses these is outside this section.
let mergedDatasets = [];
let mergeCounter = 0;
| |
| |
/**
 * Start DuckDB-WASM and open a connection.
 *
 * Picks a bundle from the jsDelivr bundle list, starts a Worker from a blob URL
 * whose script `importScripts()` the bundle's main worker, instantiates
 * `AsyncDuckDB` on it, and stores the results in the module-level `db` and `conn`.
 *
 * @returns {Promise<void>}
 * @throws Propagates any error from bundle selection, worker creation,
 *         instantiation or connecting.
 */
async function initDuckDB() {
  const bundle = await duckdb.selectBundle(duckdb.getJsDelivrBundles());
  const workerUrl = URL.createObjectURL(
    new Blob([`importScripts("${bundle.mainWorker}");`], { type: 'text/javascript' })
  );
  try {
    const worker = new Worker(workerUrl);
    db = new duckdb.AsyncDuckDB(new duckdb.ConsoleLogger(), worker);
    await db.instantiate(bundle.mainModule, bundle.pthreadWorker);
  } finally {
    // Release the blob URL even when worker creation or instantiation fails;
    // previously it was only revoked on the success path.
    URL.revokeObjectURL(workerUrl);
  }
  conn = await db.connect();
}
| |
/**
 * Download the reference scores Parquet file and expose it as the `scores` view.
 *
 * The file is fetched from `PARQUET_URL`, registered with DuckDB under the name
 * `scores.parquet`, and wrapped in a `scores` view.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the download does not return a 2xx status; without this check
 *         an HTTP error body would be registered as if it were Parquet data.
 */
async function loadParquet() {
  const response = await fetch(PARQUET_URL);
  if (!response.ok) {
    throw new Error(
      `Failed to download ${PARQUET_URL}: HTTP ${response.status} ${response.statusText}`
    );
  }
  const buffer = new Uint8Array(await response.arrayBuffer());
  await db.registerFileBuffer('scores.parquet', buffer);
  await conn.query(`CREATE VIEW scores AS SELECT * FROM 'scores.parquet'`);
}
| |
| |
/**
 * Run a SQL statement on the shared connection and return plain row objects.
 *
 * @param {string} sql Statement passed unchanged to `conn.query`.
 * @returns {Promise<Array<Object>>} One `toJSON()` object per result row.
 */
async function query(sql) {
  const table = await conn.query(sql);
  const records = table.toArray();
  return records.map((record) => record.toJSON());
}
| |
/**
 * Escape a value for use inside a single-quoted SQL string literal by doubling
 * every single quote.
 *
 * @param {*} s Value to escape; non-strings are converted with `String()` first
 *              (previously a non-string argument threw a TypeError).
 * @returns {string} The escaped text, without surrounding quotes.
 */
function esc(s) { return String(s).replace(/'/g, "''"); }

/**
 * Build the body of a SQL `IN (...)` list from an array of values.
 *
 * @param {Array<*>} vals Values to quote and escape.
 * @returns {string} Comma-separated quoted literals; an empty string for an empty array.
 */
function sqlIn(vals) { return vals.map(v => `'${esc(v)}'`).join(', '); }
| |
| |
/**
 * Replace the options of a <select> element.
 *
 * @param {HTMLSelectElement} el Element whose options are rebuilt.
 * @param {Array<string|{value: string, label: string}>} options Either raw values
 *        (used as both value and text) or `{value, label}` objects.
 * @param {string} [selected] Value to select, applied only when it is truthy and
 *        present among `options`.
 */
function populateSelect(el, options, selected) {
  const valueOf = (opt) => (typeof opt === 'object' ? opt.value : opt);
  const labelOf = (opt) => (typeof opt === 'object' ? opt.label : opt);

  el.innerHTML = '';
  options.forEach((opt) => {
    const node = document.createElement('option');
    node.value = valueOf(opt);
    node.textContent = labelOf(opt);
    el.appendChild(node);
  });

  if (!selected) return;
  const known = options.some((opt) => valueOf(opt) === selected);
  if (known) el.value = selected;
}
| |
/**
 * Format a count with a T/B/M/K suffix at a fixed number of decimals.
 *
 * @param {number} value Count to format.
 * @param {number} precision Decimals passed to `toFixed` when a suffix applies.
 * @returns {string} Scaled value plus suffix, or `value.toString()` below 1000.
 */
function formatTokensSingle(value, precision) {
  // Largest unit first so the first match is the biggest one that fits.
  const units = [
    [1e12, 'T'],
    [1e9, 'B'],
    [1e6, 'M'],
    [1e3, 'K'],
  ];
  for (const [scale, suffix] of units) {
    if (value >= scale) return (value / scale).toFixed(precision) + suffix;
  }
  return value.toString();
}
| |
/**
 * Format a single count for display.
 *
 * @param {number|null|undefined} value Count to format.
 * @returns {string} 'N/A' for null/undefined/NaN; otherwise the suffixed value with
 *          one decimal at or above 1e12 and none below.
 */
function formatTokens(value) {
  const missing = value == null || isNaN(value);
  if (missing) return 'N/A';
  const precision = value >= 1e12 ? 1 : 0;
  return formatTokensSingle(value, precision);
}
| |
/**
 * Format a list of counts so that the resulting labels are distinct when possible.
 *
 * Starts at 1 decimal when the first value is at least 1e12 (otherwise 0) and adds
 * decimals, up to 3, until no two labels are equal.
 *
 * @param {number[]} values Counts to format.
 * @returns {string[]} One label per value; 3-decimal labels if duplicates remain.
 */
function formatTokensArray(values) {
  const render = (precision) => values.map((v) => formatTokensSingle(v, precision));

  let precision = values[0] >= 1e12 ? 1 : 0;
  while (precision <= 3) {
    const labels = render(precision);
    const allDistinct = new Set(labels).size === labels.length;
    if (allDistinct) return labels;
    precision += 1;
  }
  return render(3);
}
| |
/**
 * Pick rounded axis tick positions covering [min, max].
 *
 * The step is 1, 2, 5 or 10 times a power of ten, chosen from (max - min) / maxTicks.
 * The range ends are added as extra ticks when they lie more than 30% of a step away
 * from the nearest generated tick. All ticks are rounded to integers.
 *
 * @param {number} min Lower bound; negative values are clamped to 0.
 * @param {number} max Upper bound.
 * @param {number} [maxTicks=8] Target number of intervals.
 * @returns {number[]} Ascending tick values; `[min]` (clamped) when min >= max.
 */
function niceTicks(min, max, maxTicks = 8) {
  const lo = Math.max(0, min);
  if (lo >= max) return [lo];

  const rawStep = (max - lo) / maxTicks;
  const mag = Math.pow(10, Math.floor(Math.log10(rawStep)));
  const normalized = rawStep / mag;
  const multiplier =
    normalized <= 1.5 ? 1 :
    normalized <= 3.5 ? 2 :
    normalized <= 7.5 ? 5 : 10;
  const step = multiplier * mag;

  const ticks = [];
  let v = Math.ceil(lo / step) * step;
  while (v <= max) {
    ticks.push(Math.round(v));
    v += step;
  }

  const slack = step * 0.3;
  if (ticks.length === 0 || ticks[0] - lo > slack) ticks.unshift(Math.round(lo));
  if (max - ticks[ticks.length - 1] > slack) ticks.push(Math.round(max));
  return ticks;
}
| |
/**
 * Bias-corrected exponential moving average.
 *
 * Each output is the running average divided by the accumulated weight, so the
 * first outputs are not pulled towards the zero starting value.
 *
 * @param {number[]} values Input series.
 * @param {number} alpha Weight kept from the previous average; when <= 0 the input
 *        array itself is returned unchanged.
 * @returns {number[]} Smoothed series with the same length as `values`.
 */
function exponentialMovingAverage(values, alpha) {
  if (alpha <= 0) return values;

  const smoothed = [];
  let running = 0;
  let weight = 0;
  let idx = 0;
  while (idx < values.length) {
    running = alpha * running + (1 - alpha) * values[idx];
    weight = alpha * weight + (1 - alpha);
    smoothed.push(running / weight);
    idx += 1;
  }
  return smoothed;
}
| |
| |
| |
/**
 * Spearman rank correlation between two equally long series.
 *
 * Values are converted to 1-based average ranks (tied values share the mean of the
 * positions they occupy) and the Pearson correlation of the two rank vectors is
 * returned. The previous implementation used the shortcut 1 - 6*sum(d^2)/(n(n^2-1)),
 * which is only exact when there are no ties; e.g. a constant `ys` produced 0.5
 * instead of an undefined correlation.
 *
 * @param {number[]} xs First series.
 * @param {number[]} ys Second series, same length as `xs`.
 * @returns {number} Correlation in [-1, 1]; NaN when there are fewer than 3 points or
 *          when either series is constant (its ranks have zero variance).
 */
function spearmanCorrelation(xs, ys) {
  const n = xs.length;
  if (n < 3) return NaN;

  // Average ranks: sort by value, then give every run of equal values the mean rank.
  function rankArray(arr) {
    const indexed = arr.map((v, i) => ({ v, i }));
    indexed.sort((a, b) => a.v - b.v);
    const ranks = new Array(n);
    let i = 0;
    while (i < n) {
      let j = i;
      while (j < n - 1 && indexed[j + 1].v === indexed[j].v) j++;
      const avgRank = (i + j) / 2 + 1;
      for (let k = i; k <= j; k++) ranks[indexed[k].i] = avgRank;
      i = j + 1;
    }
    return ranks;
  }

  const rx = rankArray(xs);
  const ry = rankArray(ys);

  // Average ranks of n items always sum to n(n+1)/2, so both means are (n+1)/2.
  const meanRank = (n + 1) / 2;
  let cov = 0;
  let varX = 0;
  let varY = 0;
  for (let i = 0; i < n; i++) {
    const dx = rx[i] - meanRank;
    const dy = ry[i] - meanRank;
    cov += dx * dy;
    varX += dx * dx;
    varY += dy * dy;
  }
  if (varX === 0 || varY === 0) return NaN;

  // Clamp to absorb rounding that could push the ratio marginally outside [-1, 1].
  const rho = cov / Math.sqrt(varX * varY);
  return Math.max(-1, Math.min(1, rho));
}
| |
| |
/**
 * Kendall's tau-a between two paired series: (concordant - discordant) / total pairs.
 * Pairs tied in either series count as neither concordant nor discordant.
 *
 * Observations where either value is missing (null/undefined) or not a number are
 * dropped before counting. Previously such observations still contributed to the
 * pair denominator, which pulled tau towards 0 whenever a score was missing.
 *
 * @param {Array<number|undefined|null>} xs First series.
 * @param {Array<number|undefined|null>} ys Second series, paired with `xs` by index.
 * @returns {number} Tau in [-1, 1], or NaN when fewer than 2 usable observations remain.
 */
function kendallTau(xs, ys) {
  const usable = (v) => v != null && !Number.isNaN(Number(v));

  // Indices where both series have a usable value.
  const keep = [];
  for (let i = 0; i < xs.length; i++) {
    if (usable(xs[i]) && usable(ys[i])) keep.push(i);
  }

  const n = keep.length;
  if (n < 2) return NaN;

  let concordant = 0, discordant = 0;
  for (let a = 0; a < n; a++) {
    for (let b = a + 1; b < n; b++) {
      const dx = xs[keep[a]] - xs[keep[b]];
      const dy = ys[keep[a]] - ys[keep[b]];
      if (dx * dy > 0) concordant++;
      else if (dx * dy < 0) discordant++;
    }
  }
  const pairs = n * (n - 1) / 2;
  return (concordant - discordant) / pairs;
}
| |
| |
| |
| |
/**
 * Mean Spearman correlation between x and y over a set of per-model point slices.
 *
 * @param {Array<Array<{x: number, y: number}>>} slices One list of points per model.
 * @param {number} flip Multiplier applied to each correlation (the caller passes -1
 *        when lower scores are better, otherwise 1).
 * @returns {number} Average over slices that have at least 3 points and a non-NaN
 *          correlation; NaN when no slice qualifies.
 */
function _monotonicity(slices, flip) {
  let total = 0;
  let counted = 0;
  for (const pts of slices) {
    if (pts.length < 3) continue;
    const rho = flip * spearmanCorrelation(pts.map((p) => p.x), pts.map((p) => p.y));
    if (isNaN(rho)) continue;
    total += rho;
    counted += 1;
  }
  return counted > 0 ? total / counted : NaN;
}
| |
| |
/**
 * Mean improvement from the first point to the best point of each slice.
 *
 * The improvement is divided by |first| when |first| exceeds 0.01; otherwise the
 * raw difference is used.
 *
 * @param {Array<Array<{y: number}>>} slices One list of points per model.
 * @param {boolean} [higherIsBetter] Only an explicit `false` treats the minimum as
 *        the best value; anything else uses the maximum.
 * @returns {number} Average over slices with at least 2 points; NaN when none qualify.
 */
function _signalStrength(slices, higherIsBetter) {
  const lowerWins = higherIsBetter === false;

  const gains = [];
  for (const pts of slices) {
    if (pts.length < 2) continue;
    const ys = pts.map((p) => p.y);
    const first = ys[0];
    const best = lowerWins ? Math.min(...ys) : Math.max(...ys);
    const raw = lowerWins ? first - best : best - first;
    const scale = Math.abs(first);
    const gain = scale > 0.01 ? raw / scale : raw;
    if (!isNaN(gain)) gains.push(gain);
  }

  if (gains.length === 0) return NaN;
  let sum = 0;
  for (const g of gains) sum += g;
  return sum / gains.length;
}
| |
| |
/**
 * Mean, over slices, of the median absolute deviation (MAD) of consecutive
 * y-differences.
 *
 * Uses a true median: for an even number of values the two middle values are
 * averaged. The previous code always took the upper-middle element, which for a
 * 3-point slice (two differences) reported twice the actual MAD.
 *
 * @param {Array<Array<{y: number}>>} slices One list of points per model.
 * @returns {number} Average MAD over slices with at least 3 points; NaN when none qualify.
 */
function _noise(slices) {
  // Median of a non-empty numeric list; the input is not modified.
  const median = (nums) => {
    const sorted = [...nums].sort((a, b) => a - b);
    const mid = Math.floor(sorted.length / 2);
    return sorted.length % 2 === 1 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
  };

  const vals = slices.map(pts => {
    if (pts.length < 3) return NaN;
    const diffs = [];
    for (let i = 1; i < pts.length; i++) diffs.push(pts[i].y - pts[i - 1].y);
    const center = median(diffs);
    return median(diffs.map(d => Math.abs(d - center)));
  }).filter(v => !isNaN(v));
  return vals.length > 0 ? vals.reduce((a, b) => a + b, 0) / vals.length : NaN;
}
| |
| |
/**
 * Mean Kendall tau between the model score vectors at consecutive x values.
 *
 * @param {Array<[string, {points: Array<{x: number, y: number}>}]>} checkpointModels
 *        `[name, data]` pairs.
 * @param {number[]} xValues X positions to compare, taken pairwise in the given order.
 * @returns {number} Average of the non-NaN taus; NaN with fewer than 2 models, fewer
 *          than 2 x values, or no usable tau.
 */
function _orderingConsistency(checkpointModels, xValues) {
  if (checkpointModels.length < 2 || xValues.length < 2) return NaN;

  // name -> { x -> y }; a model without a point at some x yields undefined there.
  const scoresByModel = {};
  checkpointModels.forEach(([name, d]) => {
    const byX = {};
    d.points.forEach((p) => { byX[p.x] = p.y; });
    scoresByModel[name] = byX;
  });
  const scoresAt = (x) => checkpointModels.map(([name]) => scoresByModel[name][x]);

  let total = 0;
  let counted = 0;
  for (let idx = 1; idx < xValues.length; idx++) {
    const tau = kendallTau(scoresAt(xValues[idx - 1]), scoresAt(xValues[idx]));
    if (isNaN(tau)) continue;
    total += tau;
    counted += 1;
  }
  return counted > 0 ? total / counted : NaN;
}
| |
| |
/**
 * Population standard deviation of model scores at the last x value.
 *
 * @param {Array<[string, {points: Array<{x: number, y: number}>}]>} checkpointModels
 *        `[name, data]` pairs.
 * @param {number[]} xValues X positions; only the final entry is used.
 * @returns {number} Standard deviation across models that have a point at that x;
 *          NaN with fewer than 2 models, no x values, or fewer than 2 scores.
 */
function _discrimination(checkpointModels, xValues) {
  if (checkpointModels.length < 2 || xValues.length === 0) return NaN;

  const finalX = xValues[xValues.length - 1];
  const finalScores = [];
  for (const [, d] of checkpointModels) {
    const hit = d.points.find((p) => p.x === finalX);
    const y = hit ? hit.y : NaN;
    if (!isNaN(y)) finalScores.push(y);
  }

  const count = finalScores.length;
  if (count < 2) return NaN;

  let sum = 0;
  for (const s of finalScores) sum += s;
  const mean = sum / count;

  let squares = 0;
  for (const s of finalScores) squares += (s - mean) ** 2;
  return Math.sqrt(squares / count);
}
| |
| |
| |
| |
| |
| |
| |
/**
 * Compute task-quality metrics for the overall, early and late parts of training.
 *
 * Only entries flagged `isCheckpoint` with at least 3 points that have a non-null x
 * are used. Per-model metrics (monotonicity, signal strength, noise) run on each
 * model's own points, split in half by position for early/late. Cross-model metrics
 * (ordering consistency, discrimination) run on x values shared by every model, or,
 * when fewer than 2 such values exist, on x values present in at least 2 models.
 *
 * @param {Object<string, {isCheckpoint: boolean, points: Array<{x: number, y: number}>}>} byModel
 *        Model data keyed by name.
 * @param {boolean} [higherIsBetter] An explicit `false` flips the direction of the
 *        monotonicity and signal-strength metrics.
 * @returns {{overall: Object, early: Object, late: Object}|null} Metric objects per
 *          stage, or null when no model qualifies.
 */
function computeTaskQualityMetrics(byModel, higherIsBetter) {
  const flip = higherIsBetter === false ? -1 : 1;

  const checkpointModels = [];
  for (const [name, d] of Object.entries(byModel)) {
    const points = d.points.filter((p) => p.x != null);
    if (d.isCheckpoint && points.length >= 3) {
      checkpointModels.push([name, { ...d, points }]);
    }
  }
  if (checkpointModels.length === 0) return null;

  // Positional split of one model's points: first half, second half, or everything.
  const sliceFor = (stage, pts) => {
    const half = Math.floor(pts.length / 2);
    switch (stage) {
      case 'early': return pts.slice(0, Math.max(half, 1));
      case 'late': return pts.slice(half);
      default: return pts;
    }
  };

  const xSets = checkpointModels.map(([, d]) => new Set(d.points.map((p) => p.x)));
  const ascending = (a, b) => a - b;
  let commonXs = [...xSets[0]]
    .filter((x) => xSets.every((s) => s.has(x)))
    .sort(ascending);
  if (commonXs.length < 2) {
    // Fallback: accept any x that appears in at least two models.
    const xCount = {};
    for (const s of xSets) {
      for (const x of s) xCount[x] = (xCount[x] || 0) + 1;
    }
    commonXs = Object.entries(xCount)
      .filter(([, c]) => c >= 2)
      .map(([x]) => Number(x))
      .sort(ascending);
  }

  const splitAt = Math.floor(commonXs.length / 2);
  const crossXsByStage = {
    overall: commonXs,
    early: commonXs.slice(0, Math.max(splitAt, 1)),
    late: commonXs.slice(splitAt),
  };

  const result = {};
  for (const stage of ['overall', 'early', 'late']) {
    const slices = checkpointModels.map(([, d]) => sliceFor(stage, d.points));
    const crossXs = crossXsByStage[stage];
    result[stage] = {
      monotonicity: _monotonicity(slices, flip),
      signalStrength: _signalStrength(slices, higherIsBetter),
      noise: _noise(slices),
      orderingConsistency: _orderingConsistency(checkpointModels, crossXs),
      discrimination: _discrimination(checkpointModels, crossXs),
    };
  }
  return result;
}
| |
// Tooltip text for the "?" marker next to each metric, keyed by metric name.
const METRIC_HELP = {
  monotonicity:
    'Spearman rank correlation between training steps and score, averaged across models. Values near 1.0 mean scores consistently improve with training.',
  signalStrength:
    'Relative improvement: (best \u2212 first) / |first|, averaged across models. Shows how much the task is learned beyond its initial performance.',
  noise:
    'Median absolute deviation (MAD) of consecutive score differences, averaged across models. Lower = cleaner signal. Uses MAD to be robust to sudden jumps from data-mix changes.',
  orderingConsistency:
    "Average Kendall\u2019s Tau of model rankings between consecutive checkpoint steps. High values mean stable model ordering.",
  discrimination:
    'Std of scores across models at the last checkpoint in this stage. Higher = task better separates model quality.',
};

// Display order and short labels of the metrics shown in each stats row.
const METRIC_ITEMS = [
  ['monotonicity', 'Monotonicity'],
  ['signalStrength', 'Signal Str.'],
  ['noise', 'Noise'],
  ['orderingConsistency', 'Ordering'],
  ['discrimination', 'Discrim.'],
].map(([key, label]) => ({ key, label }));

// Row headings for the three training stages.
const STAGE_LABELS = {
  overall: 'Overall',
  early: 'Early',
  late: 'Late',
};
| |
/**
 * Map a metric value to the CSS modifier used to colour it.
 *
 * @param {string} metric Metric key.
 * @param {number|null|undefined} value Metric value.
 * @returns {string} 'good', 'ok' or 'bad' for monotonicity, signalStrength,
 *          orderingConsistency and discrimination; '' for noise, unknown metrics,
 *          and null/undefined/NaN values.
 */
function qualityClass(metric, value) {
  if (value == null || isNaN(value)) return '';

  // [threshold for 'good', threshold for 'ok'] per graded metric.
  let good;
  let ok;
  if (metric === 'monotonicity') {
    [good, ok] = [0.7, 0.4];
  } else if (metric === 'signalStrength') {
    [good, ok] = [0.10, 0.03];
  } else if (metric === 'orderingConsistency') {
    [good, ok] = [0.6, 0.3];
  } else if (metric === 'discrimination') {
    [good, ok] = [0.03, 0.01];
  } else {
    // 'noise' and anything unrecognised are shown without a colour.
    return '';
  }

  if (value >= good) return 'good';
  if (value >= ok) return 'ok';
  return 'bad';
}
| |
/**
 * Render the per-stage metric rows into a stats container.
 *
 * The container is always emptied first. For each stage present in `metrics`
 * (overall, early, late) one row is appended containing, per metric, a label, the
 * value to 3 decimals (or 'N/A') coloured via qualityClass(), and a "?" marker that
 * shows the metric's help text in the shared #custom-tooltip element on hover.
 *
 * @param {HTMLElement} statsEl Container to fill.
 * @param {Object|null|undefined} metrics Per-stage metric objects; a falsy value
 *        only clears the container.
 */
function renderTaskStats(statsEl, metrics) {
  statsEl.textContent = '';
  if (!metrics) return;

  const tooltip = document.getElementById('custom-tooltip');

  // Create a <span> with a class and, optionally, text.
  const span = (className, text) => {
    const node = document.createElement('span');
    node.className = className;
    if (text !== undefined) node.textContent = text;
    return node;
  };

  // Build the "?" marker for one metric, including its hover behaviour.
  const helpMarker = (key) => {
    const help = span('stat-help', '?');
    help.dataset.helpKey = key;
    help.addEventListener('mouseenter', () => {
      // Leave the tooltip alone while it is in its scrollable mode.
      if (tooltip.classList.contains('scrollable')) return;
      tooltip.textContent = METRIC_HELP[key];
      tooltip.style.display = 'block';
      tooltip._statTip = true;
      const rect = help.getBoundingClientRect();
      tooltip.style.left = rect.left + 'px';
      tooltip.style.top = (rect.top - tooltip.offsetHeight - 4) + 'px';
    });
    help.addEventListener('mouseleave', () => {
      // Only hide a tooltip that this marker opened.
      if (!tooltip._statTip) return;
      tooltip.style.display = 'none';
      tooltip._statTip = false;
    });
    return help;
  };

  ['overall', 'early', 'late'].forEach((stage) => {
    const data = metrics[stage];
    if (!data) return;

    const row = document.createElement('div');
    row.className = 'stat-row';
    row.appendChild(span('stat-stage', STAGE_LABELS[stage]));

    METRIC_ITEMS.forEach(({ key, label }) => {
      const val = data[key];
      const shown = val == null || isNaN(val) ? 'N/A' : val.toFixed(3);
      const cls = qualityClass(key, val);

      const item = span('stat-item');
      item.appendChild(span('stat-label', label + ':'));
      item.appendChild(span('stat-value' + (cls ? ' ' + cls : ''), shown));
      item.appendChild(helpMarker(key));
      row.appendChild(item);
    });

    statsEl.appendChild(row);
  });
}
| |
/**
 * Load the optional `config.yaml` next to the page into the module-level `CONFIG`.
 *
 * A non-OK response leaves `CONFIG` untouched; an empty document yields `{}`.
 * Fetch or parse errors are logged as a warning and otherwise ignored.
 *
 * @returns {Promise<void>}
 */
async function loadConfig() {
  try {
    const resp = await fetch('config.yaml');
    if (!resp.ok) return;
    const parsed = jsyaml.load(await resp.text());
    CONFIG = parsed || {};
  } catch (e) {
    console.warn('Could not load config.yaml, using defaults:', e);
  }
}
| |
/**
 * Load the model list from the `scores` view and assign a colour to each model.
 *
 * The query keeps every checkpoint row plus those non-checkpoint rows whose model
 * never appears as a checkpoint, ordered with checkpoints first and then by display
 * name. Colours come from `CONFIG.model_colors` (keyed by display name) when present;
 * all other models take the next entry of `COLOR_PALETTE`, wrapping around.
 *
 * @returns {Promise<void>} Resolves after `ALL_MODELS` and `MODEL_COLORS` are replaced.
 */
async function loadModels() {
  const sql = `
WITH raw AS (
SELECT DISTINCT model, model_display_name, is_checkpoint
FROM scores
),
ckpt_models AS (
SELECT model FROM raw WHERE is_checkpoint = true
)
SELECT r.model, r.model_display_name, r.is_checkpoint
FROM raw r
WHERE r.is_checkpoint = true
OR r.model NOT IN (SELECT model FROM ckpt_models)
ORDER BY r.is_checkpoint DESC, r.model_display_name
`;
  ALL_MODELS = await query(sql);

  const configColors = CONFIG.model_colors || {};
  MODEL_COLORS = {};
  let paletteIdx = 0;
  for (const { model_display_name: name } of ALL_MODELS) {
    const pinned = configColors[name];
    if (pinned) {
      MODEL_COLORS[name] = pinned;
      continue;
    }
    // The palette position advances only for models without a configured colour.
    MODEL_COLORS[name] = COLOR_PALETTE[paletteIdx % COLOR_PALETTE.length];
    paletteIdx += 1;
  }
}
| |
| |
| class Panel { |
| constructor(id) { |
| this.id = id; |
| this.el = {}; |
| this.collapsed = false; |
| this._zoomXRange = null; |
| this._zoomYRange = null; |
| this.build(); |
| } |
| |
| build() { |
| const container = document.getElementById('panels-container'); |
| const panel = document.createElement('div'); |
| panel.className = 'panel'; |
| panel.id = `panel-${this.id}`; |
| |
| panel.innerHTML = ` |
| <div class="panel-toolbar"> |
| <button class="btn btn-sm" id="ptoggle-${this.id}">Collapse</button> |
| <button class="btn btn-sm" id="pexport-png-${this.id}">PNG</button> |
| <button class="btn btn-sm" id="pexport-svg-${this.id}">SVG</button> |
| <button class="btn btn-sm btn-danger" id="premove-${this.id}">Remove</button> |
| </div> |
| <div class="panel-controls" id="pcontrols-${this.id}"> |
| <div class="controls-row"> |
| <div class="control-group"> |
| <label>Eval Suite</label> |
| <select id="psuite-${this.id}"></select> |
| </div> |
| <div class="control-group"> |
| <label>Task</label> |
| <select id="ptask-${this.id}"></select> |
| </div> |
| <div class="control-group"> |
| <label>Metric</label> |
| <select id="pmetric-${this.id}"></select> |
| </div> |
| <div class="control-group"> |
| <label>Smoothing: <span id="psmooth-val-${this.id}">0</span></label> |
| <input type="range" id="psmooth-${this.id}" min="0" max="0.99" step="0.01" value="0" style="width:120px;vertical-align:middle"> |
| </div> |
| <div class="control-group"> |
| <label>Chart Type</label> |
| <select id="pchart-type-${this.id}"> |
| <option value="auto">Auto</option> |
| <option value="line" selected>Line</option> |
| <option value="bar">Bar</option> |
| </select> |
| </div> |
| <div class="control-group"> |
| <label>X-Ticks</label> |
| <select id="pxticks-${this.id}"> |
| <option value="4">4</option> |
| <option value="6">6</option> |
| <option value="8" selected>8</option> |
| <option value="12">12</option> |
| <option value="16">16</option> |
| <option value="24">24</option> |
| </select> |
| </div> |
| </div> |
| <div class="models-section"> |
| <div class="models-header"> |
| <span>Models</span> |
| <button class="btn btn-sm" id="pmodels-all-${this.id}">All</button> |
| <button class="btn btn-sm" id="pmodels-none-${this.id}">None</button> |
| <button class="btn btn-sm" id="pmodels-ckpt-${this.id}">Checkpoints</button> |
| <button class="btn btn-sm" id="pmodels-base-${this.id}">Baselines</button> |
| </div> |
| <div class="checkbox-grid" id="pmodels-${this.id}"></div> |
| </div> |
| </div> |
| <div class="panel-chart-wrapper"> |
| <div class="title-hover-zone" id="ptitle-hover-${this.id}" style="display:none"></div> |
| <div class="panel-chart" id="pchart-${this.id}"></div> |
| </div> |
| <div class="task-stats" id="pstats-${this.id}"></div> |
| <div class="panel-resize-handle" id="presize-${this.id}"></div> |
| <div class="panel-width-handle" id="pwidth-${this.id}"></div> |
| `; |
| |
| container.appendChild(panel); |
| |
| |
| this.el.panel = panel; |
| this.el.controls = panel.querySelector(`#pcontrols-${this.id}`); |
| this.el.suite = panel.querySelector(`#psuite-${this.id}`); |
| this.el.task = panel.querySelector(`#ptask-${this.id}`); |
| this.el.metric = panel.querySelector(`#pmetric-${this.id}`); |
| this.el.smooth = panel.querySelector(`#psmooth-${this.id}`); |
| this.el.chartType = panel.querySelector(`#pchart-type-${this.id}`); |
| this.el.xTicks = panel.querySelector(`#pxticks-${this.id}`); |
| this.el.models = panel.querySelector(`#pmodels-${this.id}`); |
| this.el.chart = panel.querySelector(`#pchart-${this.id}`); |
| this.el.titleHover = panel.querySelector(`#ptitle-hover-${this.id}`); |
| this.el.stats = panel.querySelector(`#pstats-${this.id}`); |
| this.el.resize = panel.querySelector(`#presize-${this.id}`); |
| this.el.widthHandle = panel.querySelector(`#pwidth-${this.id}`); |
| this.chartHeight = null; |
| |
| |
| panel.querySelector(`#ptoggle-${this.id}`).addEventListener('click', () => this.toggleControls()); |
| panel.querySelector(`#premove-${this.id}`).addEventListener('click', () => this.remove()); |
| panel.querySelector(`#pexport-png-${this.id}`).addEventListener('click', () => this.export('png')); |
| panel.querySelector(`#pexport-svg-${this.id}`).addEventListener('click', () => this.export('svg')); |
| |
| this.el.suite.addEventListener('change', () => this.onSuiteChange()); |
| this.el.task.addEventListener('change', () => this.onTaskChange()); |
| this.el.metric.addEventListener('change', () => this.renderChart()); |
| this.el.smooth.addEventListener('input', () => { |
| panel.querySelector(`#psmooth-val-${this.id}`).textContent = this.el.smooth.value; |
| this.renderChart(); |
| }); |
| this.el.chartType.addEventListener('change', () => this.renderChart()); |
| this.el.xTicks.addEventListener('change', () => this.renderChart()); |
| |
| panel.querySelector(`#pmodels-all-${this.id}`).addEventListener('click', () => this.setModels(true)); |
| panel.querySelector(`#pmodels-none-${this.id}`).addEventListener('click', () => this.setModels(false)); |
| panel.querySelector(`#pmodels-ckpt-${this.id}`).addEventListener('click', () => this.setModelsByType(true)); |
| panel.querySelector(`#pmodels-base-${this.id}`).addEventListener('click', () => this.setModelsByType(false)); |
| |
| |
| this.el.resize.addEventListener('mousedown', (e) => this.startResize(e)); |
| this.el.widthHandle.addEventListener('mousedown', (e) => this.startWidthResize(e)); |
| |
| this.buildModelCheckboxes(); |
| } |
| |
| toggleControls() { |
| this.collapsed = !this.collapsed; |
| this.el.controls.classList.toggle('collapsed', this.collapsed); |
| this.el.panel.querySelector(`#ptoggle-${this.id}`).textContent = |
| this.collapsed ? 'Expand' : 'Collapse'; |
| } |
| |
| remove() { |
| this.el.panel.remove(); |
| panels.delete(this.id); |
| } |
| |
| buildModelCheckboxes() { |
| const container = this.el.models; |
| container.innerHTML = ''; |
| let lastCkpt = null; |
| |
| for (const m of ALL_MODELS) { |
| if (lastCkpt !== null && lastCkpt !== m.is_checkpoint) { |
| const sep = document.createElement('div'); |
| sep.className = 'model-separator'; |
| container.appendChild(sep); |
| } |
| lastCkpt = m.is_checkpoint; |
| |
| const lbl = document.createElement('div'); |
| lbl.className = 'checkbox-item'; |
| |
| const cb = document.createElement('input'); |
| cb.type = 'checkbox'; |
| cb.value = m.model_display_name; |
| const DEFAULT_MODELS = [ |
| 'SmolLM3 3B', 'Olmo 3 7B', 'Olmo 3 32B', |
| 'Apertus 8B', 'Apertus 70B', 'Kimi K2', |
| 'Nemotron 3 Nano 30B-A3B', 'Nemotron 3 Super 120B-A12B', |
| ]; |
| cb.checked = DEFAULT_MODELS.includes(m.model_display_name) |
| || /^Qwen3\.5\b/.test(m.model_display_name); |
| cb.dataset.isCheckpoint = m.is_checkpoint; |
| cb.addEventListener('change', () => this.renderChart()); |
| |
| const dot = document.createElement('span'); |
| dot.className = 'color-dot'; |
| dot.dataset.model = m.model_display_name; |
| dot.style.cssText = `display:inline-block;width:9px;height:9px;border-radius:50%;background:${MODEL_COLORS[m.model_display_name]};position:relative`; |
| |
| const colorInput = document.createElement('input'); |
| colorInput.type = 'color'; |
| colorInput.value = MODEL_COLORS[m.model_display_name]; |
| colorInput.style.cssText = 'position:absolute;top:0;left:0;width:100%;height:100%;opacity:0;cursor:pointer;border:none;padding:0'; |
| colorInput.addEventListener('click', (e) => e.stopPropagation()); |
| colorInput.addEventListener('input', (e) => { |
| const newColor = e.target.value; |
| MODEL_COLORS[m.model_display_name] = newColor; |
| document.querySelectorAll(`.color-dot[data-model="${CSS.escape(m.model_display_name)}"]`).forEach(d => { |
| d.style.background = newColor; |
| }); |
| panels.forEach((p) => p.renderChart()); |
| }); |
| dot.appendChild(colorInput); |
| |
| const name = document.createElement('span'); |
| name.className = 'model-name'; |
| name.dataset.modelName = m.model_display_name; |
| name.textContent = ' ' + m.model_display_name; |
| if (!m.is_checkpoint) { |
| name.style.fontStyle = 'italic'; |
| } |
| |
| name.addEventListener('mouseenter', (e) => { |
| const tip = name.dataset.missingTip; |
| if (!tip) return; |
| const tooltip = document.getElementById('custom-tooltip'); |
| if (tooltip.classList.contains('scrollable')) return; |
| tooltip.innerHTML = tip; |
| tooltip.style.display = 'block'; |
| tooltip._modelTip = true; |
| const rect = name.getBoundingClientRect(); |
| tooltip.style.left = (rect.left) + 'px'; |
| tooltip.style.top = (rect.bottom + 4) + 'px'; |
| }); |
| name.addEventListener('mouseleave', () => { |
| const tooltip = document.getElementById('custom-tooltip'); |
| if (tooltip._modelTip) { |
| tooltip.style.display = 'none'; |
| tooltip._modelTip = false; |
| } |
| }); |
| |
| |
| name.addEventListener('click', () => { cb.checked = !cb.checked; cb.dispatchEvent(new Event('change')); }); |
| lbl.addEventListener('click', (e) => { |
| if (e.target === lbl) { cb.checked = !cb.checked; cb.dispatchEvent(new Event('change')); } |
| }); |
| |
| lbl.append(cb, dot, name); |
| container.appendChild(lbl); |
| } |
| } |
| |
| setModels(checked) { |
| this.el.models.querySelectorAll('input').forEach(cb => cb.checked = checked); |
| this.renderChart(); |
| } |
| |
| setModelsByType(isCheckpoint) { |
| this.el.models.querySelectorAll('input').forEach(cb => { |
| cb.checked = (cb.dataset.isCheckpoint === String(isCheckpoint)); |
| }); |
| this.renderChart(); |
| } |
| |
| getSelectedModels() { |
| return Array.from(this.el.models.querySelectorAll('input:checked')).map(cb => cb.value); |
| } |
| |
| getSmoothing() { |
| return parseFloat(this.el.smooth.value) || 0; |
| } |
| |
| getChartType() { |
| return this.el.chartType.value; |
| } |
| |
| getMaxXTicks() { |
| return parseInt(this.el.xTicks.value, 10) || 8; |
| } |
| |
| getSelectedTask() { |
| return this.el.task.value; |
| } |
| |
| |
| async populateSuites(defaults) { |
| const rows = await query(` |
| SELECT DISTINCT task AS value, task_display_name AS label |
| FROM scores |
| WHERE task_type = 'eval_suite' AND task != 'test_fix' |
| ORDER BY task |
| `); |
| populateSelect(this.el.suite, rows, defaults?.suite); |
| await this.onSuiteChange(defaults); |
| } |
| |
| async onSuiteChange(defaults) { |
| const suite = this.el.suite.value; |
| if (!suite) return; |
| |
| |
| |
| const stRows = await query(` |
| SELECT DISTINCT subtask_tree FROM scores |
| WHERE task = '${esc(suite)}' AND subtask_tree IS NOT NULL |
| `); |
| |
| let groupTasks = []; |
| let leafTasks = []; |
| if (stRows.length > 0) { |
| const merged = {}; |
| for (const row of stRows) { |
| if (!row.subtask_tree) continue; |
| const tree = JSON.parse(row.subtask_tree); |
| for (const [key, children] of Object.entries(tree)) { |
| if (!merged[key]) merged[key] = new Set(); |
| for (const c of children) merged[key].add(c); |
| } |
| } |
| const allChildren = new Set(); |
| for (const children of Object.values(merged)) { |
| for (const c of children) allChildren.add(c); |
| } |
| |
| groupTasks = Object.keys(merged).sort(); |
| |
| leafTasks = [...allChildren].filter(t => !merged[t]).sort(); |
| } |
| |
| |
| const allTasks = [...groupTasks, ...leafTasks]; |
| let rows = []; |
| if (allTasks.length > 0) { |
| rows = await query(` |
| SELECT DISTINCT task AS value, task_display_name AS label, task_type |
| FROM scores |
| WHERE task IN (${sqlIn(allTasks)}) |
| ORDER BY task |
| `); |
| } |
| |
| const groupRows = rows.filter(r => r.task_type === 'task_group'); |
| const leafRows = rows.filter(r => r.task_type === 'benchmark'); |
| |
| const options = [ |
| { value: suite, label: `${suite} (eval suite)` }, |
| ...groupRows.map(r => ({ value: r.value, label: `${r.label} (group)` })), |
| ...leafRows, |
| ]; |
| populateSelect(this.el.task, options, defaults?.task); |
| await this.onTaskChange(defaults); |
| } |
| |
| async onTaskChange(defaults) { |
| const task = this.getSelectedTask(); |
| if (!task) return; |
| |
| const rows = await query(` |
| SELECT DISTINCT metric FROM scores WHERE task = '${esc(task)}' ORDER BY metric |
| `); |
| const prev = defaults?.metric || this.el.metric.value; |
| populateSelect(this.el.metric, rows.map(r => r.metric), prev); |
| if (defaults?.chartType) this.el.chartType.value = defaults.chartType; |
| await this.renderChart(); |
| } |
| |
| async updateMissingModels(task, metric) { |
| const nameEls = this.el.models.querySelectorAll('.model-name'); |
| if (!task || !metric) { |
| nameEls.forEach(el => { |
| el.classList.remove('missing'); |
| delete el.dataset.missingTip; |
| }); |
| return; |
| } |
| const available = await query(` |
| SELECT DISTINCT model_display_name FROM scores |
| WHERE task = '${esc(task)}' AND metric = '${esc(metric)}' |
| `); |
| const availableSet = new Set(available.map(r => r.model_display_name)); |
| nameEls.forEach(el => { |
| const modelName = el.dataset.modelName; |
| if (!availableSet.has(modelName)) { |
| el.classList.add('missing'); |
| el.dataset.missingTip = `No scores for "${modelName}" on this task / metric`; |
| } else { |
| el.classList.remove('missing'); |
| delete el.dataset.missingTip; |
| } |
| }); |
| } |
| |
| |
| async renderChart() { |
| const task = this.getSelectedTask(); |
| const metric = this.el.metric.value; |
| const models = this.getSelectedModels(); |
| |
| await this.updateMissingModels(task, metric); |
| |
| if (!task || !metric || models.length === 0) { |
| Plotly.react(this.el.chart, [], { |
| title: { text: '', font: { size: 14 } }, |
| xaxis: { visible: false }, |
| yaxis: { visible: false }, |
| height: this.getChartHeight(600), |
| plot_bgcolor: '#fff', paper_bgcolor: '#fff', |
| }); |
| this.el.stats.textContent = ''; |
| return; |
| } |
| |
| const rows = await query(` |
| SELECT model, model_display_name, tokens_trained, score, score_stderr, |
| is_checkpoint, higher_is_better, step |
| FROM scores |
| WHERE task = '${esc(task)}' |
| AND metric = '${esc(metric)}' |
| AND model_display_name IN (${sqlIn(models)}) |
| ORDER BY model_display_name, tokens_trained |
| `); |
| |
| if (rows.length === 0) { |
| this.el.chart.innerHTML = '<div class="loading">No data for this selection</div>'; |
| return; |
| } |
| |
| |
| const mergedRows = this.mergeFinalCheckpoints(rows); |
| |
| |
| const chartType = this.resolveChartType(mergedRows); |
| const higherIsBetter = mergedRows[0]?.higher_is_better; |
| |
| |
| let subtaskTree = null; |
| try { |
| const stRows = await query(` |
| SELECT DISTINCT subtask_tree FROM scores |
| WHERE task = '${esc(task)}' AND metric = '${esc(metric)}' |
| AND subtask_tree IS NOT NULL |
| `); |
| if (stRows.length > 0) { |
| const merged = {}; |
| for (const row of stRows) { |
| if (!row.subtask_tree) continue; |
| const tree = JSON.parse(row.subtask_tree); |
| for (const [key, children] of Object.entries(tree)) { |
| if (!merged[key]) merged[key] = new Set(); |
| for (const c of children) merged[key].add(c); |
| } |
| } |
| |
| for (const key of Object.keys(merged)) { |
| merged[key] = [...merged[key]]; |
| } |
| subtaskTree = merged; |
| } |
| } catch (e) { |
| |
| } |
| |
| if (chartType === 'bar') { |
| this.drawBarChart(mergedRows, task, metric, higherIsBetter, subtaskTree); |
| } else { |
| this.drawLineChart(mergedRows, task, metric, higherIsBetter, subtaskTree); |
| } |
| } |
| |
| mergeFinalCheckpoints(rows) { |
| |
| |
| const regular = []; |
| const finals = []; |
| for (const r of rows) { |
| if (r.step === null || r.step === undefined) { |
| finals.push(r); |
| } else { |
| regular.push(r); |
| } |
| } |
| if (finals.length === 0) return rows; |
| |
| |
| const modelToSeries = {}; |
| for (const r of regular) { |
| if (r.is_checkpoint) { |
| modelToSeries[r.model] = r.model_display_name; |
| } |
| } |
| |
| const result = [...regular]; |
| for (const fc of finals) { |
| const seriesName = modelToSeries[fc.model]; |
| if (seriesName) { |
| |
| result.push({ ...fc, model_display_name: seriesName, is_checkpoint: true }); |
| } else { |
| |
| result.push(fc); |
| } |
| } |
| return result; |
| } |
| |
| resolveChartType(rows) { |
| const pref = this.getChartType(); |
| if (pref !== 'auto') return pref; |
| |
| |
| const byModel = {}; |
| for (const r of rows) { |
| if (!byModel[r.model_display_name]) byModel[r.model_display_name] = new Set(); |
| if (r.tokens_trained != null) byModel[r.model_display_name].add(Number(r.tokens_trained)); |
| } |
| const allSingle = Object.values(byModel).every(s => s.size <= 1); |
| return allSingle ? 'bar' : 'line'; |
| } |
| |
| formatChartTitle(task, metric, higherIsBetter) { |
| const arrow = higherIsBetter === true ? ' \u2191' : higherIsBetter === false ? ' \u2193' : ''; |
| return `${task} \u2014 ${metric}${arrow}`; |
| } |
| |
| renderSubtaskTree(map, keys, depth = 0) { |
| if (!keys || keys.length === 0) return ''; |
| const indent = depth * 16; |
| return keys.map(key => { |
| const children = map[key]; |
| let html = `<div style="padding-left:${indent}px">${key}</div>`; |
| if (children) { |
| html += this.renderSubtaskTree(map, children, depth + 1); |
| } |
| return html; |
| }).join(''); |
| } |
| |
| setupTitleTooltip(subtaskTree) { |
| const hoverZone = this.el.titleHover; |
| hoverZone.innerHTML = ''; |
| if (!subtaskTree || typeof subtaskTree !== 'object' || Object.keys(subtaskTree).length === 0) { |
| hoverZone.style.display = 'none'; |
| return; |
| } |
| hoverZone.style.display = ''; |
| |
| |
| const icon = document.createElement('span'); |
| icon.className = 'title-info-icon'; |
| icon.textContent = 'i'; |
| hoverZone.appendChild(icon); |
| const titleEl = this.el.chart.querySelector('.gtitle'); |
| if (titleEl) { |
| const wrapperRect = this.el.chart.closest('.panel-chart-wrapper').getBoundingClientRect(); |
| const titleRect = titleEl.getBoundingClientRect(); |
| icon.style.left = (titleRect.right - wrapperRect.left - 50 + 6) + 'px'; |
| } else { |
| icon.style.right = '0px'; |
| } |
| const tooltip = document.getElementById('custom-tooltip'); |
| |
| const allChildren = new Set(Object.values(subtaskTree).flat()); |
| const rootKeys = Object.keys(subtaskTree).filter(k => !allChildren.has(k)); |
| const html = this.renderSubtaskTree(subtaskTree, rootKeys); |
| |
| const positionTooltip = () => { |
| const titleEl = this.el.chart.querySelector('.gtitle'); |
| const chartRect = this.el.chart.getBoundingClientRect(); |
| const tw = tooltip.offsetWidth; |
| let tipTop; |
| if (titleEl) { |
| const titleRect = titleEl.getBoundingClientRect(); |
| const titleCenter = (titleRect.left + titleRect.right) / 2; |
| tooltip.style.left = (titleCenter - tw / 2) + 'px'; |
| tipTop = titleRect.bottom + 4; |
| } else { |
| tooltip.style.left = (chartRect.left + chartRect.width / 2 - tw / 2) + 'px'; |
| tipTop = chartRect.top + 40; |
| } |
| tooltip.style.top = tipTop + 'px'; |
| tooltip.style.maxHeight = Math.max(0, chartRect.bottom - tipTop) + 'px'; |
| }; |
| |
| this._titleClick = (e) => { |
| |
| if (tooltip.style.display === 'block' && tooltip._panelId === this.id) { |
| tooltip.style.display = 'none'; |
| tooltip.classList.remove('scrollable'); |
| tooltip._panelId = null; |
| window.removeEventListener('scroll', this._titleScroll, true); |
| return; |
| } |
| tooltip.innerHTML = html; |
| tooltip.classList.add('scrollable'); |
| tooltip.style.display = 'block'; |
| tooltip._panelId = this.id; |
| positionTooltip(); |
| window.addEventListener('scroll', this._titleScroll, true); |
| }; |
| |
| this._titleScroll = () => { |
| if (tooltip.style.display === 'block' && tooltip._panelId === this.id) { |
| positionTooltip(); |
| } |
| }; |
| |
| this._titleOutsideClick = (e) => { |
| if (tooltip._panelId !== this.id) return; |
| if (tooltip.contains(e.target) || hoverZone.contains(e.target)) return; |
| tooltip.style.display = 'none'; |
| tooltip.classList.remove('scrollable'); |
| tooltip._panelId = null; |
| window.removeEventListener('scroll', this._titleScroll, true); |
| }; |
| |
| hoverZone.addEventListener('click', this._titleClick); |
| document.addEventListener('mousedown', this._titleOutsideClick); |
| } |
| |
| startResize(e) { |
| e.preventDefault(); |
| const startY = e.clientY; |
| const startH = this.el.chart.offsetHeight; |
| this.el.resize.classList.add('active'); |
| |
| const onMove = (ev) => { |
| const delta = ev.clientY - startY; |
| const newH = Math.max(200, startH + delta); |
| this.chartHeight = newH; |
| Plotly.relayout(this.el.chart, { height: newH }); |
| }; |
| |
| const onUp = () => { |
| this.el.resize.classList.remove('active'); |
| document.removeEventListener('mousemove', onMove); |
| document.removeEventListener('mouseup', onUp); |
| }; |
| |
| document.addEventListener('mousemove', onMove); |
| document.addEventListener('mouseup', onUp); |
| } |
| |
| startWidthResize(e) { |
| e.preventDefault(); |
| const startX = e.clientX; |
| const startW = this.el.panel.offsetWidth; |
| this.el.widthHandle.classList.add('active'); |
| |
| const chart = this.el.chart; |
| |
| |
| const lockedH = chart.offsetHeight; |
| chart.style.minHeight = lockedH + 'px'; |
| |
| const onMove = (ev) => { |
| const newW = Math.max(300, startW + ev.clientX - startX); |
| this.el.panel.style.width = newW + 'px'; |
| }; |
| |
| const onUp = () => { |
| this.el.widthHandle.classList.remove('active'); |
| document.removeEventListener('mousemove', onMove); |
| document.removeEventListener('mouseup', onUp); |
| chart.style.minHeight = ''; |
| Plotly.purge(chart); |
| this.renderChart(); |
| }; |
| |
| document.addEventListener('mousemove', onMove); |
| document.addEventListener('mouseup', onUp); |
| } |
| |
| getChartHeight(fallback) { |
| return this.chartHeight || fallback; |
| } |
| |
| cleanupTooltip() { |
| const tooltip = document.getElementById('custom-tooltip'); |
| tooltip.style.display = 'none'; |
| const chart = this.el.chart; |
| chart.removeAllListeners?.('plotly_hover'); |
| chart.removeAllListeners?.('plotly_unhover'); |
| if (this._tooltipMouseMove) { |
| chart.removeEventListener('mousemove', this._tooltipMouseMove); |
| this._tooltipMouseMove = null; |
| } |
| if (this._tooltipMouseLeave) { |
| chart.removeEventListener('mouseleave', this._tooltipMouseLeave); |
| this._tooltipMouseLeave = null; |
| } |
| |
| if (this._titleClick) { |
| const hz = this.el.titleHover; |
| hz.removeEventListener('click', this._titleClick); |
| hz.style.display = 'none'; |
| this._titleClick = null; |
| } |
| if (this._titleScroll) { |
| window.removeEventListener('scroll', this._titleScroll, true); |
| this._titleScroll = null; |
| } |
| if (this._titleOutsideClick) { |
| document.removeEventListener('mousedown', this._titleOutsideClick); |
| this._titleOutsideClick = null; |
| } |
| if (tooltip._panelId === this.id) { |
| tooltip.classList.remove('scrollable'); |
| tooltip._panelId = null; |
| } |
| } |
| |
| drawLineChart(rows, task, metric, higherIsBetter, subtasks) { |
| this.cleanupTooltip(); |
| const w = this.getSmoothing(); |
| |
| |
| const byModel = {}; |
| for (const r of rows) { |
| const name = r.model_display_name; |
| if (!byModel[name]) byModel[name] = { points: [], isCheckpoint: r.is_checkpoint }; |
| byModel[name].points.push({ x: r.tokens_trained != null ? Number(r.tokens_trained) : null, y: r.score }); |
| } |
| for (const d of Object.values(byModel)) d.points.sort((a, b) => (a.x ?? -Infinity) - (b.x ?? -Infinity)); |
| |
| |
| let xMin = Infinity, xMax = -Infinity; |
| for (const d of Object.values(byModel)) { |
| if (d.isCheckpoint) { |
| for (const p of d.points) { |
| if (p.x != null) { |
| xMin = Math.min(xMin, p.x); |
| xMax = Math.max(xMax, p.x); |
| } |
| } |
| } |
| } |
| if (!isFinite(xMin)) { xMin = 0; xMax = 1; } |
| |
| const traces = []; |
| for (const [name, d] of Object.entries(byModel)) { |
| const color = MODEL_COLORS[name] || '#999'; |
| const validPoints = d.points.filter(p => p.x != null); |
| if (d.isCheckpoint && validPoints.length > 1) { |
| traces.push({ |
| x: validPoints.map(p => p.x), |
| y: exponentialMovingAverage(validPoints.map(p => p.y), w), |
| name, mode: 'lines+markers', |
| line: { color, width: 2 }, marker: { size: 5 }, |
| }); |
| } else { |
| const score = d.points[0]?.y; |
| if (score != null) { |
| |
| const nPts = 50; |
| const xs = Array.from({ length: nPts }, (_, i) => xMin + (xMax - xMin) * i / (nPts - 1)); |
| const ys = xs.map(() => score); |
| traces.push({ |
| x: xs, y: ys, |
| name, mode: 'lines', |
| line: { color, width: 2, dash: 'dash' }, |
| hoverinfo: 'name+y', |
| }); |
| } |
| } |
| } |
| |
| |
| const maxXTicks = this.getMaxXTicks(); |
| const tickVals = niceTicks(xMin, xMax, maxXTicks); |
| |
| Plotly.react(this.el.chart, traces, { |
| title: { text: this.formatChartTitle(task, metric, higherIsBetter), font: { size: 14, color: '#1a1a2e' } }, |
| hoverlabel: { namelength: -1 }, |
| xaxis: { |
| title: { text: 'Tokens Trained', font: { size: 12 } }, |
| tickfont: { size: 10 }, tickvals: this._zoomXRange ? niceTicks(this._zoomXRange[0], this._zoomXRange[1], maxXTicks) : tickVals, |
| ticktext: formatTokensArray(this._zoomXRange ? niceTicks(this._zoomXRange[0], this._zoomXRange[1], maxXTicks) : tickVals), |
| gridcolor: '#e9ecef', zeroline: false, |
| ...(this._zoomXRange ? { range: [...this._zoomXRange], autorange: false } : {}), |
| }, |
| yaxis: { |
| title: { text: 'Score', font: { size: 12 } }, |
| tickfont: { size: 10 }, gridcolor: '#e9ecef', zeroline: false, |
| ...(this._zoomYRange ? { range: [...this._zoomYRange], autorange: false } : { autorange: true }), |
| }, |
| legend: { orientation: 'h', yanchor: 'top', y: -0.15, x: 0, font: { size: 11 } }, |
| margin: { t: 50, r: 20, b: 100, l: 50 }, |
| plot_bgcolor: '#fff', paper_bgcolor: '#fff', |
| font: { family: '-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif' }, |
| height: this.getChartHeight(600), |
| }, { responsive: true }); |
| |
| |
| this.el.chart.removeAllListeners?.('plotly_relayout'); |
| let updatingTicks = false; |
| this.el.chart.on('plotly_relayout', (evt) => { |
| if (updatingTicks) return; |
| let tv; |
| const r0 = evt['xaxis.range[0]']; |
| const r1 = evt['xaxis.range[1]']; |
| if (r0 != null && r1 != null) { |
| |
| this._zoomXRange = [r0, r1]; |
| const yr0 = evt['yaxis.range[0]']; |
| const yr1 = evt['yaxis.range[1]']; |
| if (yr0 != null && yr1 != null) this._zoomYRange = [yr0, yr1]; |
| tv = niceTicks(r0, r1, maxXTicks); |
| } else if (evt['xaxis.autorange']) { |
| |
| this._zoomXRange = null; |
| this._zoomYRange = null; |
| tv = niceTicks(xMin, xMax, maxXTicks); |
| } |
| if (tv) { |
| updatingTicks = true; |
| Plotly.relayout(this.el.chart, { |
| 'xaxis.tickvals': tv, |
| 'xaxis.ticktext': formatTokensArray(tv), |
| }).then(() => { updatingTicks = false; }); |
| } |
| }); |
| |
| |
| const metrics = computeTaskQualityMetrics(byModel, higherIsBetter); |
| renderTaskStats(this.el.stats, metrics); |
| |
| this.setupTitleTooltip(subtasks); |
| } |
| |
| drawBarChart(rows, task, metric, higherIsBetter, subtasks) { |
| this.cleanupTooltip(); |
| this.el.stats.innerHTML = ''; |
| |
| const byModel = {}; |
| for (const r of rows) { |
| const name = r.model_display_name; |
| const tokens = r.tokens_trained != null ? Number(r.tokens_trained) : null; |
| if (!byModel[name] || (tokens != null && (byModel[name].tokens == null || tokens > byModel[name].tokens))) { |
| byModel[name] = { score: r.score, tokens, isCheckpoint: r.is_checkpoint }; |
| } |
| } |
| |
| |
| const sorted = Object.entries(byModel) |
| .sort((a, b) => higherIsBetter !== false ? b[1].score - a[1].score : a[1].score - b[1].score); |
| |
| const names = sorted.map(([n]) => n); |
| const scores = sorted.map(([, d]) => d.score); |
| const colors = sorted.map(([n]) => MODEL_COLORS[n] || '#999'); |
| const tokens = sorted.map(([, d]) => formatTokens(d.tokens)); |
| const hovertext = sorted.map(([n, d]) => |
| `${n}<br>Score: ${d.score.toFixed(4)}<br>Tokens: ${formatTokens(d.tokens)}` |
| ); |
| |
| |
| const annotations = names.map((name, i) => ({ |
| x: 0, |
| y: name, |
| text: tokens[i], |
| hovertext: 'Tokens Trained', |
| xanchor: 'left', |
| yanchor: 'middle', |
| showarrow: false, |
| font: { size: 10, color: '#000' }, |
| xshift: 4, |
| })); |
| |
| Plotly.react(this.el.chart, [{ |
| type: 'bar', |
| orientation: 'h', |
| y: names, |
| x: scores, |
| marker: { color: colors }, |
| text: scores.map(s => s.toFixed(4)), |
| textposition: 'outside', |
| textfont: { size: 11 }, |
| hoverinfo: 'none', |
| customdata: hovertext, |
| }], { |
| title: { text: this.formatChartTitle(task, metric, higherIsBetter), font: { size: 14, color: '#1a1a2e' } }, |
| hovermode: 'closest', |
| annotations, |
| xaxis: { |
| title: { text: 'Score', font: { size: 12 } }, |
| tickfont: { size: 10 }, gridcolor: '#e9ecef', zeroline: false, |
| ...(this._zoomXRange ? { range: [...this._zoomXRange], autorange: false } : {}), |
| }, |
| yaxis: { |
| tickfont: { size: 11 }, automargin: true, |
| categoryorder: 'array', categoryarray: names.slice().reverse(), |
| }, |
| margin: { t: 60, r: 80, b: 60, l: 10 }, |
| plot_bgcolor: '#fff', paper_bgcolor: '#fff', |
| font: { family: '-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif' }, |
| height: this.getChartHeight(Math.max(400, names.length * 40 + 100)), |
| showlegend: false, |
| }, { responsive: true }); |
| |
| |
| const tooltip = document.getElementById('custom-tooltip'); |
| const chart = this.el.chart; |
| chart.on('plotly_hover', (data) => { |
| if (tooltip.classList.contains('scrollable')) return; |
| const pt = data.points[0]; |
| tooltip.innerHTML = pt.customdata; |
| tooltip.style.display = 'block'; |
| }); |
| chart.on('plotly_unhover', () => { |
| if (tooltip.classList.contains('scrollable')) return; |
| tooltip.style.display = 'none'; |
| }); |
| this._tooltipMouseMove = (e) => { |
| if (tooltip.classList.contains('scrollable')) return; |
| if (tooltip.style.display === 'block') { |
| tooltip.style.left = (e.clientX + 12) + 'px'; |
| tooltip.style.top = (e.clientY - 10) + 'px'; |
| } |
| }; |
| this._tooltipMouseLeave = () => { |
| if (tooltip.classList.contains('scrollable')) return; |
| tooltip.style.display = 'none'; |
| }; |
| chart.addEventListener('mousemove', this._tooltipMouseMove); |
| chart.addEventListener('mouseleave', this._tooltipMouseLeave); |
| |
| this.setupTitleTooltip(subtasks); |
| } |
| |
| export(format) { |
| const task = this.getSelectedTask(); |
| const metric = this.el.metric.value; |
| let filename = `${task}_${metric}`.replace(/[^a-zA-Z0-9_-]/g, '_'); |
| Plotly.downloadImage(this.el.chart, { format, scale: 3, filename }); |
| } |
| } |
| |
| |
/**
 * Create a new Panel with the next free id, register it in `panels` and
 * populate its selectors.
 * @param {object} [defaults] - Optional preselection forwarded to populateSuites.
 * @returns {Promise<Panel>} The panel, once its selectors are populated.
 */
async function addPanel(defaults) {
  const id = panelCounter;
  panelCounter += 1;

  const created = new Panel(id);
  panels.set(id, created);
  await created.populateSuites(defaults);
  return created;
}
| |
| |
/**
 * Turn a user-supplied dataset reference into a parquet URL.
 *
 *  - 'http://…' / 'https://…'  → returned as is (trimmed);
 *  - 'org/dataset'             → that dataset's `scores.parquet` on the
 *                                `main` revision of huggingface.co;
 *  - 'org/dataset/path/to/f'   → that file on the `main` revision;
 *  - anything else             → returned trimmed and otherwise unchanged.
 *
 * Leading and trailing slashes are ignored when splitting, so
 * 'org/dataset/' resolves like 'org/dataset' instead of producing a URL
 * with an empty file path.
 *
 * @param {string} input - URL or Hugging Face dataset path.
 * @returns {string} The resolved URL.
 */
function resolveParquetUrl(input) {
  const trimmed = input.trim();
  if (trimmed.startsWith('http://') || trimmed.startsWith('https://')) return trimmed;

  const parts = trimmed.replace(/^\/+|\/+$/g, '').split('/');
  if (parts.length < 2) return trimmed;

  const [org, dataset, ...rest] = parts;
  const filePath = rest.length > 0 ? rest.join('/') : 'scores.parquet';
  return `https://huggingface.co/datasets/${org}/${dataset}/resolve/main/${filePath}`;
}
| |
/**
 * Download an additional parquet file and append it to the `scores` view.
 *
 * The input is resolved with resolveParquetUrl, fetched, registered with
 * DuckDB as an in-memory file and UNION ALL-ed onto the base file plus all
 * previously merged datasets. On success the merged-tags UI, localStorage
 * and all panels are refreshed. Progress and errors are shown in
 * `#merge-status`; errors are also rethrown to the caller.
 *
 * The Hugging Face token (OAuth token, or the manual token input) is only
 * attached when the request goes to huggingface.co over HTTPS, because the
 * URL is user-supplied and the credential must not reach other hosts.
 *
 * @param {string} input - URL or Hugging Face dataset path.
 * @throws Rethrows any fetch, HTTP-status or DuckDB error.
 */
async function mergeDataset(input) {
  const SUCCESS_TEXT = 'Merged successfully.';
  const statusEl = document.getElementById('merge-status');
  statusEl.className = 'merge-status';
  statusEl.textContent = 'Loading...';

  // True only for https://huggingface.co and its subdomains.
  const isHuggingFaceUrl = (candidate) => {
    try {
      const { protocol, hostname } = new URL(candidate, window.location.href);
      return protocol === 'https:'
        && (hostname === 'huggingface.co' || hostname.endsWith('.huggingface.co'));
    } catch {
      return false;
    }
  };

  try {
    const url = resolveParquetUrl(input);
    const id = mergeCounter++;
    const bufferName = `merged_${id}.parquet`;

    const fetchOpts = {};
    const token = hfAccessToken || document.getElementById('hf-token-input').value.trim();
    if (token && isHuggingFaceUrl(url)) {
      fetchOpts.headers = { 'Authorization': `Bearer ${token}` };
    }
    const response = await fetch(url, fetchOpts);
    if (!response.ok) throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    const buffer = new Uint8Array(await response.arrayBuffer());
    await db.registerFileBuffer(bufferName, buffer);

    // Base file + everything merged so far + the new buffer.
    const unionParts = mergedDatasets.map((d) => `UNION ALL SELECT * FROM '${d.bufferName}'`).join('\n');
    await conn.query(`CREATE OR REPLACE VIEW scores AS
      SELECT * FROM 'scores.parquet'
      ${unionParts}
      UNION ALL SELECT * FROM '${bufferName}'
    `);

    const label = input.trim().replace(/^https:\/\/huggingface\.co\/datasets\//, '');
    mergedDatasets.push({ id, label, url, bufferName, input: input.trim() });

    renderMergedTags();
    saveMergedToStorage();
    await refreshAfterMerge();

    statusEl.textContent = SUCCESS_TEXT;
    setTimeout(() => {
      // Leave the status alone if a newer merge has written to it meanwhile.
      if (statusEl.textContent === SUCCESS_TEXT) statusEl.textContent = '';
    }, 3000);
    document.getElementById('merge-dataset-input').value = '';
  } catch (err) {
    statusEl.className = 'merge-status error';
    statusEl.textContent = `Error: ${err.message}`;
    console.error('Merge failed:', err);
    throw err;
  }
}
| |
/**
 * Remove a previously merged dataset and rebuild the `scores` view from
 * the base file plus the remaining merged datasets.
 *
 * The in-memory list is updated only after the view has been rebuilt, so a
 * failing query does not leave the UI out of sync with the database. The
 * dataset's registered buffer is then released on a best-effort basis.
 *
 * @param {number} id - Id of the merged dataset (as assigned by mergeDataset).
 */
async function removeMergedDataset(id) {
  const removed = mergedDatasets.find((d) => d.id === id);
  if (!removed) return;
  const remaining = mergedDatasets.filter((d) => d.id !== id);

  const unionParts = remaining.map((d) => `UNION ALL SELECT * FROM '${d.bufferName}'`).join('\n');
  await conn.query(`CREATE OR REPLACE VIEW scores AS
    SELECT * FROM 'scores.parquet'
    ${unionParts}
  `);

  // The view no longer references the removed buffer: commit the change.
  mergedDatasets = remaining;

  try {
    // Free the parquet bytes held by DuckDB; failure here is harmless.
    await db.dropFile?.(removed.bufferName);
  } catch (err) {
    console.warn('Could not release merged buffer', removed.bufferName, err);
  }

  renderMergedTags();
  saveMergedToStorage();
  await refreshAfterMerge();
}
| |
/**
 * Persist the original input strings of all merged datasets to
 * localStorage under 'mergedDatasets'. Any error is ignored.
 */
function saveMergedToStorage() {
  try {
    const payload = JSON.stringify(mergedDatasets.map(({ input }) => input));
    localStorage.setItem('mergedDatasets', payload);
  } catch (e) {
    // Best effort only: persistence errors are deliberately ignored.
  }
}
| |
/**
 * Redraw the list of merged-dataset tags in `#merged-tags`. Each tag shows
 * the dataset label and a "×" button that removes that dataset.
 */
function renderMergedTags() {
  const container = document.getElementById('merged-tags');
  container.textContent = '';

  const buildTag = (ds) => {
    const labelSpan = document.createElement('span');
    labelSpan.textContent = ds.label;

    const removeBtn = document.createElement('button');
    removeBtn.textContent = '\u00d7';
    removeBtn.title = 'Remove merged dataset';
    removeBtn.addEventListener('click', () => removeMergedDataset(ds.id));

    const tag = document.createElement('span');
    tag.className = 'merged-tag';
    tag.append(labelSpan, removeBtn);
    return tag;
  };

  mergedDatasets.forEach((ds) => {
    container.appendChild(buildTag(ds));
  });
}
| |
/**
 * Reload the model list and rebuild every panel after the `scores` view
 * changed, keeping each panel's suite / task / metric / chart type.
 *
 * Model selection is restored exactly for models that existed before the
 * rebuild: ticked ones stay ticked and unticked ones stay unticked, even
 * if they belong to the default selection. Models that are new to the list
 * keep the default state assigned by buildModelCheckboxes.
 */
async function refreshAfterMerge() {
  await loadModels();
  for (const [, panel] of panels) {
    const modelBoxes = () => panel.el.models.querySelectorAll('input[type="checkbox"]');

    // Snapshot the checked state of every model the user has seen so far.
    const previousState = new Map();
    modelBoxes().forEach((cb) => previousState.set(cb.value, cb.checked));

    const suite = panel.el.suite.value;
    const task = panel.el.task.value;
    const metric = panel.el.metric.value;
    const chartType = panel.el.chartType.value;

    panel.buildModelCheckboxes();
    modelBoxes().forEach((cb) => {
      if (previousState.has(cb.value)) {
        cb.checked = previousState.get(cb.value);
      }
    });
    await panel.populateSuites({ suite, task, metric, chartType });
  }
}
| |
| |
/**
 * Show or hide the Hugging Face auth controls.
 *
 * Without `window.huggingface.variables.OAUTH_CLIENT_ID` the whole auth row
 * is hidden. Otherwise the row is shown: with an OAuth result the user
 * label and sign-out button are visible, without one the sign-in button
 * and the private-repos label are. The token input is hidden in both cases.
 *
 * @param {object|null} oauthResult - OAuth result, or null when signed out.
 */
function updateAuthUI(oauthResult) {
  const byId = (id) => document.getElementById(id);
  const setVisible = (el, visible) => { el.style.display = visible ? '' : 'none'; };

  const authRow = byId('hf-auth-row');
  const signinBtn = byId('hf-signin-btn');
  const signoutBtn = byId('hf-signout-btn');
  const userEl = byId('hf-user');
  const tokenInput = byId('hf-token-input');

  const oauthAvailable = Boolean(window.huggingface?.variables?.OAUTH_CLIENT_ID);
  if (!oauthAvailable) {
    setVisible(authRow, false);
    return;
  }

  const privateLabel = byId('hf-private-label');
  setVisible(authRow, true);

  if (oauthResult) {
    setVisible(signinBtn, false);
    setVisible(privateLabel, false);
    setVisible(tokenInput, false);
    userEl.textContent = `Signed in as ${oauthResult.userInfo?.name || oauthResult.userInfo?.preferred_username || 'HF user'}`;
    setVisible(userEl, true);
    setVisible(signoutBtn, true);
    return;
  }

  setVisible(signinBtn, true);
  setVisible(privateLabel, true);
  setVisible(tokenInput, false);
  setVisible(userEl, false);
  setVisible(signoutBtn, false);
}
| |
// Handle of the pending auto-sign-out timer; null when none is scheduled.
let _hfExpiryTimer = null;

/**
 * Sign out of Hugging Face: drop the stored OAuth result and the in-memory
 * token, cancel a pending expiry timer and switch the UI to signed-out.
 */
function hfSignOut() {
  localStorage.removeItem('hf_oauth');
  hfAccessToken = null;

  const pendingTimer = _hfExpiryTimer;
  if (pendingTimer) {
    clearTimeout(pendingTimer);
    _hfExpiryTimer = null;
  }

  updateAuthUI(null);
}
| |
// Tokens count as expired this many milliseconds (5 minutes) before their
// actual expiry time.
const HF_EXPIRY_BUFFER_MS = 5 * 60_000;

/**
 * @param {object|null} oauthResult - OAuth result carrying `accessTokenExpiresAt`.
 * @returns {boolean} true when the expiry time minus the safety buffer lies
 *   in the past; false when no expiry is recorded.
 */
function isHfTokenExpired(oauthResult) {
  const expiresAt = oauthResult?.accessTokenExpiresAt;
  if (!expiresAt) return false;

  const cutoff = new Date(expiresAt).getTime() - HF_EXPIRY_BUFFER_MS;
  return cutoff < Date.now();
}
| |
/**
 * Schedule an automatic sign-out shortly before the OAuth token expires
 * (HF_EXPIRY_BUFFER_MS ahead of the recorded expiry). A previously
 * scheduled timer is always cancelled first.
 *
 *  - No expiry recorded, or an unparsable one: nothing is scheduled. This
 *    matches isHfTokenExpired, which treats such tokens as not expired.
 *  - Already past the cut-off: sign out immediately.
 *  - Delay longer than setTimeout can represent (2^31 - 1 ms, about 24.8
 *    days): wait the maximum and then re-evaluate, because a larger delay
 *    would overflow and fire at once.
 *
 * @param {object|null} oauthResult - OAuth result carrying `accessTokenExpiresAt`.
 */
function scheduleHfExpiry(oauthResult) {
  const MAX_TIMEOUT_MS = 2_147_483_647;

  if (_hfExpiryTimer) clearTimeout(_hfExpiryTimer);
  _hfExpiryTimer = null;
  if (!oauthResult?.accessTokenExpiresAt) return;

  const expiresAt = new Date(oauthResult.accessTokenExpiresAt).getTime();
  if (!Number.isFinite(expiresAt)) {
    // A NaN delay would make setTimeout fire immediately and sign the user out.
    console.warn('HF token has an unparsable expiry; auto-signout not scheduled.');
    return;
  }

  const ms = expiresAt - HF_EXPIRY_BUFFER_MS - Date.now();
  if (ms <= 0) {
    hfSignOut();
    return;
  }
  console.log(`HF token expires at ${new Date(expiresAt).toISOString()}, auto-signout in ${Math.round(ms / 60_000)}m`);

  if (ms > MAX_TIMEOUT_MS) {
    // Too far in the future for a single timer: check again later.
    _hfExpiryTimer = setTimeout(() => scheduleHfExpiry(oauthResult), MAX_TIMEOUT_MS);
    return;
  }

  _hfExpiryTimer = setTimeout(() => {
    console.warn('HF OAuth token expired, signing out automatically.');
    hfSignOut();
  }, ms);
}
| |
/**
 * Restores or completes the Hugging Face OAuth session and wires the sign-in
 * and sign-out buttons.
 *
 * Steps:
 *  1. Load the OAuth result persisted in localStorage under `hf_oauth`.
 *  2. Let a pending OAuth redirect override it. A failure while handling the
 *     redirect is logged and does not abort start-up.
 *  3. Discard results that are expired or carry no access token.
 *  4. Publish the token via `hfAccessToken`, persist the result, render the
 *     auth UI and arm the auto-sign-out timer.
 *
 * @returns {Promise<void>}
 */
async function initHfAuth() {
  // 1. Session left over from a previous visit, if any.
  let oauthResult = null;
  const stored = localStorage.getItem('hf_oauth');
  if (stored) {
    try { oauthResult = JSON.parse(stored); } catch { oauthResult = null; }
  }

  // 2. A fresh redirect result wins over the stored one. Errors here must not
  //    propagate: the caller would otherwise abort before the page is usable.
  try {
    oauthResult = (await oauthHandleRedirectIfPresent()) || oauthResult;
  } catch (e) {
    console.warn('HF OAuth redirect handling failed, continuing without it:', e);
  }

  // 3a. Expired sessions are dropped from memory and storage.
  if (oauthResult && isHfTokenExpired(oauthResult)) {
    console.warn('HF OAuth token expired, discarding.');
    oauthResult = null;
    localStorage.removeItem('hf_oauth');
  }

  // 3b. Without an access token there is no session; do not let the UI claim
  //     the user is signed in.
  if (!oauthResult?.accessToken) oauthResult = null;

  // 4. Publish and persist the usable session.
  if (oauthResult) {
    hfAccessToken = oauthResult.accessToken;
    localStorage.setItem('hf_oauth', JSON.stringify(oauthResult));
  }

  // Render first, then arm the timer: scheduleHfExpiry may sign out
  // immediately, and that signed-out render has to be the last one.
  updateAuthUI(oauthResult);
  if (oauthResult) scheduleHfExpiry(oauthResult);

  // Sign-in: request broader repo scopes only when the private toggle is on.
  document.getElementById('hf-signin-btn').addEventListener('click', async () => {
    const includePrivate = document.getElementById('hf-private-toggle').checked;
    const scopes = includePrivate
      ? 'openid profile read-repos'
      : 'openid profile gated-repos';
    try {
      window.location.href = (await oauthLoginUrl({ scopes })) + '&prompt=consent';
    } catch (e) {
      console.error('Failed to start HF sign-in:', e);
    }
  });

  // Sign-out: wrapped in an arrow so the click event is not passed along.
  document.getElementById('hf-signout-btn').addEventListener('click', () => hfSignOut());
}
| |
| |
// Elements with ids `init-loading` (start-up status line) and `add-panel-row`.
const elInitLoading = document.getElementById('init-loading');
const elAddPanelRow = document.getElementById('add-panel-row');

/**
 * Boots the page.
 *
 * Runs the loading steps in order (config, DuckDB, parquet data, models),
 * reveals the interactive rows, initialises Hugging Face auth, re-merges the
 * datasets remembered in localStorage under `mergedDatasets`, and finally adds
 * the default panel. Any failure is reported in the `init-loading` element and
 * logged to the console.
 *
 * @returns {Promise<void>} Never rejects; errors are handled internally.
 */
async function init() {
  try {
    elInitLoading.textContent = 'Loading config...';
    await loadConfig();

    elInitLoading.textContent = 'Initializing DuckDB...';
    await initDuckDB();

    elInitLoading.textContent = 'Loading data from HuggingFace...';
    await loadParquet();

    elInitLoading.textContent = 'Loading models...';
    await loadModels();

    // Core data is ready: hide the status line and reveal the controls.
    elInitLoading.style.display = 'none';
    elAddPanelRow.style.display = '';
    document.getElementById('merge-dataset-row').style.display = '';

    // Auth runs before the restore below, presumably so saved private
    // datasets can be fetched with the restored token.
    await initHfAuth();

    // Restore previously merged datasets; one failing dataset must not
    // prevent the others (or the default panel) from loading.
    try {
      const saved = JSON.parse(localStorage.getItem('mergedDatasets') || '[]');
      const failed = [];
      for (const input of saved) {
        try {
          await mergeDataset(input);
        } catch (e) {
          console.warn(`Skipping saved dataset "${input}":`, e.message);
          failed.push(input);
        }
      }
      if (failed.length > 0) {
        // Persist only what mergedDatasets holds now, so the skipped entries
        // are not retried on every load.
        const remaining = mergedDatasets.map(d => d.input);
        localStorage.setItem('mergedDatasets', JSON.stringify(remaining));
        const statusEl = document.getElementById('merge-status');
        statusEl.className = 'merge-status error';
        statusEl.textContent = `${failed.length} saved dataset(s) skipped (sign in to load private datasets)`;
      }
    } catch (e) { console.warn('Failed to restore merged datasets:', e); }

    await addPanel({ suite: 'eng_base_main', metric: 'acc_norm' });
  } catch (err) {
    // Non-Error throwables have no `.message`; fall back to their string form.
    const message = err?.message ?? String(err);

    // Build the error view from text nodes so the message is never parsed as
    // HTML.
    const errorEl = document.createElement('span');
    errorEl.style.color = '#e63946';
    const hintEl = document.createElement('small');
    hintEl.textContent = 'Check browser console for details.';
    errorEl.append(`Error: ${message}`, document.createElement('br'), hintEl);
    elInitLoading.replaceChildren(errorEl);

    // The status line is hidden once loading succeeds; un-hide it so failures
    // in the later steps (auth, default panel) are visible too.
    elInitLoading.style.display = '';
    console.error('Init failed:', err);
  }
}
| |
// Wire the page-level controls, then start the application.
{
  // Shared submit path for the merge button and the Enter key: whitespace-only
  // input is ignored, and rejections from mergeDataset are deliberately
  // swallowed here.
  const submitMergeInput = (rawValue) => {
    if (!rawValue.trim()) return;
    mergeDataset(rawValue).catch(() => {});
  };

  // Arrow wrapper keeps the click event from being passed to addPanel.
  document.getElementById('btn-add-panel').addEventListener('click', () => addPanel());

  document.getElementById('btn-merge-dataset').addEventListener('click', () => {
    submitMergeInput(document.getElementById('merge-dataset-input').value);
  });

  document.getElementById('merge-dataset-input').addEventListener('keydown', (e) => {
    if (e.key !== 'Enter') return;
    submitMergeInput(e.target.value);
  });
}

init();
| </script> |
| </body> |
| </html> |
|
|