// webgpu-bench/js/data.js
// Synced from abhijitramesh/webgpu-bench@d35922fe12 (commit e2ac5c3) by GitHub Actions.
import { fetchAllRuns } from './dataset.js';
import { HF_DATASET_REPO } from './run/config.js';
// In-memory cache for the current page session. Populated by loadData();
// nothing in this file clears it once set.
let cachedData = null;
// sessionStorage cache so a refresh-within-a-minute doesn't re-fetch the
// entire dataset. Short TTL — submissions land continuously and the
// dashboard is the surface where we actually want freshness.
const SESSION_CACHE_KEY = 'webgpu-bench:dashboard-data';
const SESSION_CACHE_TTL_MS = 60 * 1000; // 1 minute
/**
 * Load the dashboard dataset, cheapest source first: the in-memory cache,
 * then the short-TTL sessionStorage cache, then a live fetch of every run
 * file in the HF dataset repo.
 *
 * Single source of truth: the HF dataset repo. No static baseline. A new
 * dashboard with zero submissions shows an empty state until something is
 * submitted.
 *
 * @returns {Promise<object>} `{ meta, results }` payload (same shape the
 *   old combined.json had).
 */
export async function loadData() {
  if (cachedData) return cachedData;

  const fromSession = readSessionCache();
  if (fromSession) {
    cachedData = fromSession;
    return cachedData;
  }

  const empty = makeEmptyDataset();
  try {
    const { records, machines, fileCount } = await fetchAllRuns(HF_DATASET_REPO);
    if (fileCount > 0) {
      mergeRecords(empty, records, machines);
    }
    // Cache (memory + session) only on success.
    cachedData = empty;
    writeSessionCache(cachedData);
    return cachedData;
  } catch (err) {
    // Fix: previously a failed fetch was also stored in cachedData, so one
    // transient network error blanked the dashboard for the rest of the page
    // session with no retry. Now a failure caches nothing (matching the
    // session cache, which was already success-only) and the next loadData()
    // call re-fetches. `err?.message ?? err` copes with non-Error throwables.
    console.warn(`Live dataset load failed: ${err?.message ?? err}`);
    return empty;
  }
}
// Build a zero-submission payload: empty meta lookups plus an empty results
// array, stamped with the current time.
function makeEmptyDataset() {
  const meta = {
    machines: [],
    models: [],
    browsers: [],
    generatedAt: new Date().toISOString(),
  };
  return { meta, results: [] };
}
/* Append records into an empty payload and recompute the meta lookups. Same
shape the old combined.json had, so all downstream consumers (charts,
tables, machine cards) work unchanged. */
function mergeRecords(payload, records, machines) {
  if (records.length === 0) return;
  payload.results.push(...records);

  // Rebuild the sorted model / browser lookup lists from the union of the
  // existing meta and the incoming records.
  const models = new Set(payload.meta.models || []);
  const browsers = new Set(payload.meta.browsers || []);
  for (const rec of records) {
    if (rec.model) models.add(rec.model);
    if (rec.browser) browsers.add(rec.browser);
  }
  payload.meta.models = [...models].sort();
  payload.meta.browsers = [...browsers].sort();

  // Union existing machines with the new ones, keyed by slug; the first
  // sighting of a slug wins.
  const machinesBySlug = new Map((payload.meta.machines || []).map((m) => [m.slug, m]));
  for (const machine of machines) {
    if (!machinesBySlug.has(machine.slug)) machinesBySlug.set(machine.slug, { ...machine });
  }
  // Counters are recomputed from scratch over the full result set below.
  for (const machine of machinesBySlug.values()) {
    machine.resultCount = 0;
    machine.passCount = 0;
  }

  // Per-machine submitter aggregation — counts contributions and tracks the
  // most-recent submission so the machine card can render a stacked-avatar
  // row sorted by activity.
  const submittersByMachine = new Map(); // slug -> Map(key -> {profile, count, latestAt})
  for (const rec of payload.results) {
    const machine = machinesBySlug.get(rec.machineSlug);
    if (!machine) continue;
    machine.resultCount += 1;
    if (rec.status === 'done') machine.passCount += 1;

    const submitter = rec.submittedBy;
    if (!submitter?.name) continue;
    const submitterKey = submitter.hubId || submitter.name;
    let perMachine = submittersByMachine.get(rec.machineSlug);
    if (!perMachine) {
      perMachine = new Map();
      submittersByMachine.set(rec.machineSlug, perMachine);
    }
    const entry = perMachine.get(submitterKey);
    if (entry) {
      entry.count += 1;
      // Newest submission wins the profile snapshot.
      if (rec.timestamp && rec.timestamp > entry.latestAt) {
        entry.profile = submitter;
        entry.latestAt = rec.timestamp;
      }
    } else {
      perMachine.set(submitterKey, { profile: submitter, count: 1, latestAt: rec.timestamp || '' });
    }
  }

  for (const [slug, perMachine] of submittersByMachine) {
    const machine = machinesBySlug.get(slug);
    if (!machine) continue;
    // Sort by contribution count, then by recency.
    const byActivity = (a, b) =>
      b.count - a.count || (b.latestAt || '').localeCompare(a.latestAt || '');
    machine.submitters = [...perMachine.values()]
      .map(({ profile, count, latestAt }) => ({ ...profile, count, latestAt }))
      .sort(byActivity);
  }

  payload.meta.machines = [...machinesBySlug.values()];
  payload.meta.generatedAt = new Date().toISOString();
}
// Return the session-cached payload if it exists and is fresh, else null.
// Any failure (storage disabled, malformed JSON, missing/invalid timestamp,
// expired TTL) degrades to a null / cache-miss.
function readSessionCache() {
  try {
    const stored = sessionStorage.getItem(SESSION_CACHE_KEY);
    if (!stored) return null;
    const parsed = JSON.parse(stored);
    const fresh =
      typeof parsed.ts === 'number' &&
      Date.now() - parsed.ts <= SESSION_CACHE_TTL_MS;
    return fresh ? parsed.data : null;
  } catch {
    return null;
  }
}
// Persist the payload to sessionStorage with a freshness timestamp.
// Best-effort: quota errors, disabled storage, or unserializable data are
// swallowed — a failed cache write is never fatal.
function writeSessionCache(data) {
  try {
    const envelope = { ts: Date.now(), data };
    sessionStorage.setItem(SESSION_CACHE_KEY, JSON.stringify(envelope));
  } catch {
    /* quota or disabled */
  }
}
/* Fold the (d=0, d=N) GPU record pair that Run Study emits per variant
into a single dashboard row. The d=N record stays canonical
(`decode_tok_s` / `prefill_tok_s` keep the depth-loaded numbers) so
existing chart/table consumers keep working unchanged; a new pair of
`_d0` / `_dN` suffix fields lets depth-aware code pick a specific pass.
CPU records are pinned to d=0 by the runner, so they pass through
untouched. Cells with only one half of the pair (plain Run, pre-study
data, or a partial study) lift their values into the suffix field on
the side that exists, leaving the other side null — so consumers can
render `—` without having to know the record's history.
Within each cell we also tie-break duplicate records per depth bucket
(same iteration / latest timestamp wins, mirroring selectBestResults)
so multiple study runs of the same variant collapse cleanly.
Run AFTER attachCpuBaselineFromCpuRecords (which keys on the
depth-independent (machine, browser, model, variant) tuple) and
BEFORE selectBestResults (CPU rows still need cell-dedup; GPU rows
are already deduped here). */
export function mergeDepthPairs(records) {
  const gpuCells = new Map(); // cellKey -> { d0, dN }
  const passthrough = [];
  for (const rec of records) {
    // CPU records are pinned to d=0 by the runner; pass them through untouched.
    if (rec.nGpuLayers === 0) {
      passthrough.push(rec);
      continue;
    }
    const cellKey = `${rec.machineSlug}|${rec.browser}|${rec.model}|${rec.variant}`;
    const bucket = (rec.n_depth ?? 0) === 0 ? 'd0' : 'dN';
    let cell = gpuCells.get(cellKey);
    if (!cell) {
      cell = { d0: null, dN: null };
      gpuCells.set(cellKey, cell);
    }
    // Duplicate records in the same depth bucket collapse to the strongest one.
    if (!cell[bucket] || isStrongerRecord(rec, cell[bucket])) cell[bucket] = rec;
  }

  const merged = [...passthrough];
  for (const { d0, dN } of gpuCells.values()) {
    if (d0 && dN) {
      merged.push(joinDepthPair(d0, dN));
    } else {
      // A cell always has at least one side; lift the existing half.
      merged.push(liftSingleDepth(dN ?? d0, dN ? 'dN' : 'd0'));
    }
  }
  return merged;
}

// Tie-break between duplicate records in one depth bucket: more iterations
// wins; equal iteration counts fall back to the newest timestamp (mirrors
// selectBestResults).
function isStrongerRecord(challenger, incumbent) {
  const challengerIters = challenger.iterations ?? 0;
  const incumbentIters = incumbent.iterations ?? 0;
  if (challengerIters === incumbentIters) {
    return (challenger.timestamp || '') > (incumbent.timestamp || '');
  }
  return challengerIters > incumbentIters;
}

// Perf fields measured per depth pass; each gets `_d0` / `_dN` suffixed
// copies on the merged row.
const DEPTH_PERF_FIELDS = [
  'decode_tok_s', 'prefill_tok_s',
  'decode_stddev_ts', 'prefill_stddev_ts',
  'pp_test_name', 'tg_test_name',
];

// Fold a complete (d0, dN) pair into one row. The dN record stays canonical
// (its unsuffixed fields survive the spread); both passes are exposed via
// the suffix fields.
function joinDepthPair(d0, dN) {
  const merged = { ...dN, n_depth_dN: dN.n_depth ?? null };
  for (const field of DEPTH_PERF_FIELDS) {
    merged[`${field}_d0`] = d0[field] ?? null;
    merged[`${field}_dN`] = dN[field] ?? null;
  }
  return merged;
}

// Promote a lone record (only one half of the pair exists) into the merged
// shape: its values land on the suffix side it belongs to, the other side
// is null so consumers can render a dash without history awareness.
function liftSingleDepth(rec, bucket) {
  const fromD0 = bucket === 'd0';
  const lifted = { ...rec };
  for (const field of DEPTH_PERF_FIELDS) {
    lifted[`${field}_d0`] = fromD0 ? (rec[field] ?? null) : null;
    lifted[`${field}_dN`] = fromD0 ? null : (rec[field] ?? null);
  }
  lifted.n_depth_dN = fromD0 ? null : (rec.n_depth ?? null);
  return lifted;
}
/* Reduce a flat result set down to one canonical row per
(machineSlug, browser, model, variant, backend) cell. Picks the row with
the most iterations; ties break on latest timestamp. This is the
leaderboard view — "best representative number per cell" — and is what
the dashboard renders in the table, charts, and stat cards.
`backend` (CPU vs GPU, derived from nGpuLayers) is part of the key so
CLI CPU+GPU pairs and browser-flow synthetic CPU rows don't collapse
into the GPU row. */
export function selectBestResults(records) {
  // challenger beats incumbent when it has strictly more iterations, or the
  // same iterations with a later timestamp.
  const beats = (challenger, incumbent) => {
    const ci = challenger.iterations ?? 0;
    const ii = incumbent.iterations ?? 0;
    if (ci !== ii) return ci > ii;
    return (challenger.timestamp || '') > (incumbent.timestamp || '');
  };
  const winners = new Map();
  for (const row of records) {
    const backend = row.nGpuLayers === 0 ? 'cpu' : 'gpu';
    const key = `${row.machineSlug}|${row.browser}|${row.model}|${row.variant}|${backend}`;
    const incumbent = winners.get(key);
    if (!incumbent || beats(row, incumbent)) winners.set(key, row);
  }
  return [...winners.values()];
}
/* For CLI-flow records that ship CPU and GPU as separate dataset entries,
look up each GPU record's matching CPU companion (same machine, browser,
model, variant) and copy its perf into cpu_baseline_*. After this pass,
GPU records from both submission paths (browser, CLI) carry their CPU
baseline inline, so the main table can render a single row per cell with
both numbers side-by-side. No-op on records that already have
cpu_baseline_* (e.g. browser-flow records, where controller.makeRecord
embeds it at write time). */
export function attachCpuBaselineFromCpuRecords(results) {
  // Index the newest usable CPU record per (machine, browser, model, variant)
  // cell. "Usable" = finished and carrying at least one perf number.
  const latestCpuByCell = new Map();
  for (const row of results) {
    if (row.nGpuLayers !== 0 || row.status !== 'done') continue;
    if (row.decode_tok_s == null && row.prefill_tok_s == null) continue;
    const key = `${row.machineSlug}|${row.browser}|${row.model}|${row.variant}`;
    const seen = latestCpuByCell.get(key);
    // Most-recent wins on tiebreak — matches selectBestResults() semantics.
    if (!seen || (row.timestamp || '') > (seen.timestamp || '')) {
      latestCpuByCell.set(key, row);
    }
  }
  return results.map((row) => {
    if (row.nGpuLayers === 0) return row;
    // Browser-flow records already embed their baseline at write time.
    const alreadyBaselined =
      row.cpu_baseline_decode_tok_s != null || row.cpu_baseline_prefill_tok_s != null;
    if (alreadyBaselined) return row;
    const companion = latestCpuByCell.get(
      `${row.machineSlug}|${row.browser}|${row.model}|${row.variant}`,
    );
    if (!companion) return row;
    return {
      ...row,
      cpu_baseline_decode_tok_s: companion.decode_tok_s ?? null,
      cpu_baseline_prefill_tok_s: companion.prefill_tok_s ?? null,
    };
  });
}
/* Synthesize a CPU row for every browser-flow GPU record (the in-page
bench measures one CPU pass per variant alongside the GPU iterations
and stamps the result on the same record via cpu_baseline_*). Returns
only CPU rows — combines real (nGpuLayers === 0) and synthetic ones.
Used by the CPU-vs-GPU views which want the CPU subset only. */
export function expandCpuRows(results) {
  const genuine = results.filter((r) => r.nGpuLayers === 0);
  return [...genuine, ...synthesizeCpuRowsFromBaseline(results)];
}

/* Same synthesis as expandCpuRows but returns the originals plus the
   synthesized CPU rows — for the main results table where we want both
   GPU and CPU rows visible. */
export function withSyntheticCpuRows(results) {
  return results.concat(synthesizeCpuRowsFromBaseline(results));
}

// Turn every GPU record that embeds a cpu_baseline_* measurement into a
// standalone CPU row (nGpuLayers: 0) carrying the baseline numbers.
function synthesizeCpuRowsFromBaseline(results) {
  const hasBaseline = (r) =>
    r.cpu_baseline_decode_tok_s != null || r.cpu_baseline_prefill_tok_s != null;
  const synthesized = [];
  for (const row of results) {
    if (row.nGpuLayers === 0 || !hasBaseline(row)) continue;
    synthesized.push({
      ...row,
      nGpuLayers: 0,
      decode_tok_s: row.cpu_baseline_decode_tok_s,
      prefill_tok_s: row.cpu_baseline_prefill_tok_s,
      // The CPU baseline is a single-rep measurement (warmup + 1 timed), so
      // it has no stddev. Null out the stddev fields the spread inherited
      // from the GPU row — otherwise the table renders the CPU avg with the
      // GPU's stddev attached, which is nonsensical.
      decode_stddev_ts: null,
      prefill_stddev_ts: null,
      // CPU baseline runs have no t_eval / n_eval breakdowns — null those
      // out so the table doesn't show stale GPU numbers in CPU rows.
      n_eval: null,
      t_eval_ms: null,
      n_p_eval: null,
      t_p_eval_ms: null,
      // Strip the embedded baseline from synthetic CPU rows so the
      // "CPU decode tok/s" column doesn't duplicate the row's own metric.
      cpu_baseline_decode_tok_s: null,
      cpu_baseline_prefill_tok_s: null,
      cpu_baseline: null,
    });
  }
  return synthesized;
}
// Apply the dashboard's filter panel to a flat result set. A missing filter
// or the literal value 'all' means "no constraint" for that dimension;
// `quants` is a Set of variant names (an empty Set also means no constraint).
export function filterResults(results, filters) {
  const constrains = (value) => Boolean(value) && value !== 'all';
  return results.filter((row) => {
    if (constrains(filters.machine) && row.machineSlug !== filters.machine) return false;
    if (constrains(filters.browser) && row.browser !== filters.browser) return false;
    if (constrains(filters.model) && row.model !== filters.model) return false;
    if (constrains(filters.backend)) {
      const isCpu = row.nGpuLayers === 0;
      if (filters.backend === 'cpu' && !isCpu) return false;
      if (filters.backend === 'webgpu' && isCpu) return false;
    }
    if (constrains(filters.status)) {
      const passed = row.status === 'done';
      if (filters.status === 'pass' && !passed) return false;
      if (filters.status === 'fail' && passed) return false;
    }
    if (filters.quants?.size > 0 && !filters.quants.has(row.variant)) return false;
    return true;
  });
}