| | <div class="d3-evals-after-fix"></div> |
| | <style> |
| | .d3-evals-after-fix { position: relative; } |
| | .d3-evals-after-fix .controls { |
| | margin-top: 0; |
| | display: flex; |
| | gap: 16px; |
| | align-items: center; |
| | justify-content: flex-end; |
| | width: auto; |
| | flex-wrap: wrap; |
| | } |
| | .d3-evals-after-fix .controls label { |
| | font-size: 12px; |
| | color: var(--text-color); |
| | display: flex; |
| | align-items: center; |
| | gap: 6px; |
| | white-space: nowrap; |
| | font-weight: 700; |
| | } |
| | .d3-evals-after-fix .controls select { |
| | font-size: 12px; |
| | padding: 8px 28px 8px 10px; |
| | border: 1px solid var(--border-color); |
| | border-radius: 8px; |
| | background-color: var(--surface-bg); |
| | color: var(--text-color); |
| | background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 24 24' fill='none' stroke='%230f1115' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpolyline points='6 9 12 15 18 9'/%3E%3C/svg%3E"); |
| | background-repeat: no-repeat; background-position: right 8px center; background-size: 12px; |
| | -webkit-appearance: none; appearance: none; cursor: pointer; transition: border-color .15s ease, box-shadow .15s ease; |
| | } |
| | [data-theme="dark"] .d3-evals-after-fix .controls select { |
| | background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 24 24' fill='none' stroke='%23ffffff' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpolyline points='6 9 12 15 18 9'/%3E%3C/svg%3E"); |
| | } |
| | .d3-evals-after-fix .controls select:hover { border-color: var(--primary-color); } |
| | .d3-evals-after-fix .controls select:focus { border-color: var(--primary-color); box-shadow: 0 0 0 3px rgba(232,137,171,.25); outline: none; } |
| | .d3-evals-after-fix .axis-label { fill: var(--text-color); font-size: 12px; font-weight: 700; } |
| | .d3-evals-after-fix .axes path, .d3-evals-after-fix .axes line { stroke: var(--axis-color); } |
| | .d3-evals-after-fix .axes text { fill: var(--tick-color); } |
| | .d3-evals-after-fix .grid line { stroke: var(--grid-color); } |
| | .d3-evals-after-fix .legend { font-size: 12px; color: var(--text-color);padding-left: 6px; } |
| | .d3-evals-after-fix .legend .items { display:flex; flex-wrap:wrap; gap:8px 12px; align-items:center; } |
| | .d3-evals-after-fix .legend .item { display:flex; align-items:center; gap:6px; white-space:nowrap; } |
| | .d3-evals-after-fix .legend .swatch { width:14px; height:14px; border-radius:3px; border:1px solid var(--border-color); display:inline-block; } |
| | |
| | .d3-evals-after-fix.hovering .legend-bottom .item.ghost { opacity: .35; } |
| | .d3-evals-after-fix.hovering .lines path.ghost { opacity: .25; } |
| | .d3-evals-after-fix.hovering .points circle.ghost { opacity: .25; } |
| | .d3-evals-after-fix.hovering .areas path.ghost { opacity: .08; } |
| | .d3-evals-after-fix .chart-header { display:flex; align-items:center; justify-content:space-between; gap:12px; margin: 0 0 8px 0; flex-wrap: wrap; } |
| | .d3-evals-after-fix .legend-bottom { display:flex; align-items:center; justify-content:flex-start; font-size:12px; color: var(--text-color); } |
| | .d3-evals-after-fix .legend-bottom .items { display:flex; flex-wrap:wrap; gap:8px 14px; } |
| | .d3-evals-after-fix .legend-bottom .item { display:inline-flex; align-items:center; gap:6px; white-space:nowrap; } |
| | .d3-evals-after-fix .legend-bottom .swatch { width:14px; height:14px; border-radius:3px; border:1px solid var(--border-color); display:inline-block; } |
| | .d3-evals-after-fix .lines path.active { stroke-width: 3; } |
| | |
| | .d3-evals-after-fix .controls .control-group { |
| | display: flex; |
| | flex-direction: column; |
| | align-items: flex-start; |
| | gap: 6px; |
| | } |
| | .d3-evals-after-fix .legend-bottom { |
| | flex-direction: column; |
| | align-items: flex-start; |
| | gap: 6px; |
| | } |
| | .d3-evals-after-fix .legend-bottom .legend-title { |
| | font-size: 12px; |
| | font-weight: 700; |
| | color: var(--text-color); |
| | } |
| | |
| | .d3-evals-after-fix .d3-tooltip { z-index: var(--z-elevated); backdrop-filter: saturate(1.12) blur(8px); } |
| | .d3-evals-after-fix .d3-tooltip__inner { display:flex; flex-direction:column; gap:6px; min-width: 220px; } |
| | .d3-evals-after-fix .d3-tooltip__inner > div:first-child { font-weight: 800; letter-spacing: 0.1px; margin-bottom: 0; } |
| | .d3-evals-after-fix .d3-tooltip__inner > div:nth-child(2) { font-size: 11px; color: var(--muted-color); display: block; margin-top: -4px; margin-bottom: 2px; letter-spacing: 0.1px; } |
| | .d3-evals-after-fix .d3-tooltip__inner > div:nth-child(n+3) { padding-top: 6px; border-top: 1px solid var(--border-color); } |
| | .d3-evals-after-fix .d3-tooltip__color-dot { display:inline-block; width: 12px; height: 12px; border-radius: 3px; border: 1px solid var(--border-color); } |
| | |
| | .d3-evals-after-fix .chart-card { background: var(--surface-bg); border: 1px solid var(--border-color); border-radius: 10px; padding: 8px; } |
| | |
| | .d3-evals-after-fix .chart-header { display:flex; align-items:flex-start; justify-content:flex-start; gap:12px; margin: 8px 0 0 0; flex-wrap: wrap; } |
| | </style> |
| | <script> |
| | (() => { |
| | |
/**
 * Turn a raw metric key (e.g. `hellaswag`, `mmlu_stem`, `boolq-acc`) into a display label.
 *
 * Known benchmark names get their canonical casing wherever they appear in the key,
 * every other word is capitalised, and `_` / `-` separators become single spaces.
 *
 * @param {string} key - Metric identifier as found in the data file.
 * @returns {string} Human-readable label, or '' when `key` is empty/nullish.
 */
const prettyMetricLabel = (key) => {
  if (!key) return '';

  // A Map (rather than a plain object) so that keys such as "constructor" or
  // "toString" are never resolved through Object.prototype.
  const canonical = new Map([
    ['hellaswag', 'HellaSwag'],
    ['mmlu', 'MMLU'],
    ['arc', 'ARC'],
    ['truthfulqa', 'TruthfulQA'],
    ['gsm8k', 'GSM8K'],
    ['winogrande', 'WinoGrande'],
    ['openbookqa', 'OpenBookQA'],
    ['piqa', 'PIQA'],
    ['race', 'RACE'],
    ['boolq', 'BoolQ'],
    ['cb', 'CB'],
    ['copa', 'COPA'],
    ['multirc', 'MultiRC'],
    ['record', 'ReCoRD'],
    ['rte', 'RTE'],
    ['wic', 'WiC'],
    ['wsc', 'WSC'],
  ]);

  const raw = String(key);
  if (canonical.has(raw)) return canonical.get(raw);

  const cleaned = raw.replace(/[_-]+/g, ' ').trim();
  return cleaned
    .split(/\s+/)
    .map((word) => {
      // Per-word, case-insensitive lookup keeps compound keys consistent with
      // the canonical spelling (e.g. "boolq_acc" -> "BoolQ Acc").
      const known = canonical.get(word.toLowerCase());
      if (known) return known;
      return word.charAt(0).toUpperCase() + word.slice(1);
    })
    .join(' ');
};
| | |
/**
 * Run `cb` once D3 is available on `window`.
 *
 * If D3 is already loaded the callback runs synchronously. Otherwise a single
 * shared `<script id="d3-cdn-script">` tag is injected (or reused when another
 * embed already added it) and the callback runs after that tag's `load` event.
 *
 * @param {Function} cb - Invoked with no arguments when `window.d3.select` exists.
 * @returns {*} The callback's return value when D3 was already present, otherwise undefined.
 */
const ensureD3 = (cb) => {
  const isD3Ready = () => Boolean(window.d3) && typeof window.d3.select === 'function';
  if (isD3Ready()) return cb();

  const src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
  let s = document.getElementById('d3-cdn-script');
  if (!s) {
    s = document.createElement('script');
    s.id = 'd3-cdn-script';
    s.src = src;
    document.head.appendChild(s);
  }

  s.addEventListener('load', () => { if (isD3Ready()) cb(); }, { once: true });
  // Without this a blocked or failed CDN request leaves the chart blank with no diagnostic.
  s.addEventListener('error', () => {
    console.error(`d3-evals-after-fix: failed to load D3 from ${src}`);
  }, { once: true });
};
| | |
| | const bootstrap = () => { |
| | const scriptEl = document.currentScript; |
| | let container = scriptEl ? scriptEl.previousElementSibling : null; |
| | if (!(container && container.classList && container.classList.contains('d3-evals-after-fix'))){ |
| | const cs = Array.from(document.querySelectorAll('.d3-evals-after-fix')).filter(el => !(el.dataset && el.dataset.mounted === 'true')); |
| | container = cs[cs.length - 1] || null; |
| | } |
| | if (!container) return; |
| | if (container.dataset) { if (container.dataset.mounted === 'true') return; container.dataset.mounted = 'true'; } |
| | |
| | |
| | const controls = document.createElement('div'); |
| | controls.className = 'controls'; |
| | const controlGroup = document.createElement('div'); |
| | controlGroup.className = 'control-group'; |
| | const labelMetric = document.createElement('label'); |
| | labelMetric.textContent = 'Metric'; |
| | const selectMetric = document.createElement('select'); |
| | |
| | const uniqueId = Math.random().toString(36).slice(2, 9); |
| | selectMetric.id = `metric-select-${uniqueId}`; |
| | labelMetric.setAttribute('for', selectMetric.id); |
| | controlGroup.appendChild(labelMetric); |
| | controlGroup.appendChild(selectMetric); |
| | controls.appendChild(controlGroup); |
| | |
| | |
| | container.style.position = container.style.position || 'relative'; |
| | let tip = container.querySelector('.d3-tooltip'); let tipInner; |
| | if (!tip) { |
| | tip = document.createElement('div'); tip.className = 'd3-tooltip'; |
| | Object.assign(tip.style, { |
| | position:'absolute', top:'0px', left:'0px', transform:'translate(-9999px, -9999px)', pointerEvents:'none', |
| | padding:'8px 10px', borderRadius:'8px', fontSize:'12px', lineHeight:'1.35', border:'1px solid var(--border-color)', |
| | background:'var(--surface-bg)', color:'var(--text-color)', boxShadow:'0 4px 24px rgba(0,0,0,.18)', opacity:'0', transition:'opacity .12s ease' |
| | }); |
| | tipInner = document.createElement('div'); tipInner.className = 'd3-tooltip__inner'; tipInner.style.textAlign='left'; tip.appendChild(tipInner); container.appendChild(tip); |
| | } else { tipInner = tip.querySelector('.d3-tooltip__inner') || tip; } |
| | |
| | |
| | const header = document.createElement('div'); header.className = 'chart-header'; |
| | const legendBottom = document.createElement('div'); legendBottom.className = 'legend-bottom'; header.appendChild(legendBottom); |
| | header.appendChild(controls); |
| | |
| | |
| | const card = document.createElement('div'); card.className = 'chart-card'; container.appendChild(card); |
| | container.appendChild(header); |
| | |
| | const svg = d3.select(card).append('svg').attr('width','100%').style('display','block'); |
| | const gRoot = svg.append('g'); |
| | const gGrid = gRoot.append('g').attr('class','grid'); |
| | const gAxes = gRoot.append('g').attr('class','axes'); |
| | const gAreas = gRoot.append('g').attr('class','areas'); |
| | const gLines = gRoot.append('g').attr('class','lines'); |
| | const gPoints = gRoot.append('g').attr('class','points'); |
| | |
| | const overlay = gRoot.append('rect').attr('fill','transparent').style('cursor','crosshair'); |
| | const hoverLine = gRoot.append('line').attr('stroke-width',1).style('display','none'); |
| | |
| | |
| | let width = 800, height = 480; const margin = { top: 16, right: 32, bottom: 44, left: 56 }; |
| | const xScale = d3.scaleLinear(); |
| | const yScale = d3.scaleLinear(); |
| | const lineGen = d3.line().x(d => xScale(d.tokens)).y(d => yScale(d.value)); |
| | const dataByMetric = new Map(); |
| | let runOrder = []; |
| | |
| | |
/**
 * Pick `count` categorical colours for the run series.
 *
 * Prefers the page-level `window.ColorPalettes.getColors('categorical', count)`
 * helper when it exists and yields a usable palette; otherwise falls back to
 * D3's Tableau10 scheme (or a hard-coded copy of it).
 *
 * @param {number} count - Number of colours requested.
 * @returns {string[]} Colour strings; the fallback holds at most 10 entries, so callers wrap with modulo.
 */
function getRunColors(count){
  try {
    if (window.ColorPalettes && typeof window.ColorPalettes.getColors === 'function') {
      const palette = window.ColorPalettes.getColors('categorical', count);
      // An empty/missing palette would make the caller's `i % palette.length`
      // index NaN and leave every series without a colour, so fall through.
      if (palette && palette.length > 0) return palette;
    }
  } catch (_) {
    // Best effort: a broken palette helper must not prevent the chart from rendering.
  }
  const fallback = d3.schemeTableau10
    ? d3.schemeTableau10
    : ['#4e79a7','#f28e2b','#e15759','#76b7b2','#59a14f','#edc948','#b07aa1','#ff9da7','#9c755f','#bab0ab'];
  return fallback.slice(0, count);
}
| | |
| | |
/**
 * Format a raw token count as billions with a `B` suffix (e.g. 2e11 -> "200B").
 *
 * Up to three decimals are kept and trailing zeros trimmed, so sub-billion
 * steps stay distinguishable (1.5e9 -> "1.5B") instead of rounding to a whole
 * number, which produced duplicate tick labels and misreported tooltip values.
 *
 * @param {number} v - Token count.
 * @returns {string} Value in billions followed by "B".
 */
function formatTokens(v){
  const billions = v / 1e9;
  return d3.format('.3~f')(billions) + 'B';
}
| | |
/**
 * Resize the SVG to the container's current width and reposition the static
 * pieces (root group, mouse overlay, hover guide line).
 *
 * Height is a third of the width with a 280px floor. Updates the shared
 * `width` / `height` variables as a side effect.
 *
 * @returns {{innerWidth: number, innerHeight: number}} Plot area size inside the margins.
 */
function updateLayout(){
  // Resolve the themed axis colour, with a neutral fallback when the variable is unset.
  const themedAxis = getComputedStyle(container).getPropertyValue('--axis-color').trim();
  const guideStroke = themedAxis || 'rgba(0,0,0,0.25)';

  width = container.clientWidth || 800;
  height = Math.max(280, Math.round(width / 3));

  const plot = {
    innerWidth: width - margin.left - margin.right,
    innerHeight: height - margin.top - margin.bottom,
  };

  svg.attr('width', width).attr('height', height);
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
  overlay
    .attr('x', 0)
    .attr('y', 0)
    .attr('width', plot.innerWidth)
    .attr('height', plot.innerHeight);
  hoverLine
    .attr('y1', 0)
    .attr('y2', plot.innerHeight)
    .attr('stroke', guideStroke);

  return plot;
}
| | |
/**
 * Draw (or redraw) the chart for one metric: scales, grid, axes, one line and
 * point set per run, the legend, and the hover tooltip wiring.
 *
 * Reads the shared `dataByMetric` / `runOrder` state and returns early, leaving
 * the previous drawing in place, when the metric has no data points.
 *
 * @param {string} metricKey - Key into `dataByMetric` for the metric to display.
 * @returns {void}
 */
function render(metricKey){
  // Run names and metric keys come from the CSV; escape them before they reach innerHTML.
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => (
    { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' }[ch]
  ));

  const { innerWidth, innerHeight } = updateLayout();
  const map = dataByMetric.get(metricKey) || {};
  const runs = runOrder;

  // Data extent across every run of this metric.
  let minTokens = Infinity, maxTokens = -Infinity, minV = Infinity, maxV = -Infinity;
  runs.forEach(r => {
    (map[r] || []).forEach(pt => {
      minTokens = Math.min(minTokens, pt.tokens);
      maxTokens = Math.max(maxTokens, pt.tokens);
      minV = Math.min(minV, pt.value);
      maxV = Math.max(maxV, pt.value);
    });
  });
  if (!isFinite(minTokens) || !isFinite(maxTokens)) return;
  xScale.domain([minTokens, maxTokens]).range([0, innerWidth]);
  yScale.domain([minV, maxV]).nice().range([innerHeight, 0]);

  // Horizontal grid lines.
  gGrid.selectAll('*').remove();
  gGrid.selectAll('line').data(yScale.ticks(6)).join('line')
    .attr('x1', 0).attr('x2', innerWidth).attr('y1', d => yScale(d)).attr('y2', d => yScale(d))
    .attr('stroke', 'var(--grid-color)').attr('stroke-width', 1).attr('shape-rendering', 'crispEdges');

  // Axes and axis titles.
  const styleAxis = g => {
    g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
    g.selectAll('text').attr('fill', 'var(--tick-color)').style('font-size', '12px');
  };
  gAxes.selectAll('*').remove();
  gAxes.append('g').attr('transform', `translate(0,${innerHeight})`).call(d3.axisBottom(xScale).ticks(8).tickFormat(formatTokens)).call(styleAxis);
  gAxes.append('g').call(d3.axisLeft(yScale).ticks(6)).call(styleAxis);
  gAxes.append('text').attr('class', 'axis-label').attr('text-anchor', 'middle').attr('x', innerWidth / 2).attr('y', innerHeight + 38).text('Tokens (B)');
  gAxes.append('text').attr('class', 'axis-label').attr('text-anchor', 'middle').attr('transform', `translate(${-44}, ${innerHeight / 2}) rotate(-90)`).text('Score');

  // One series per run; the palette is resolved once rather than twice per run.
  const palette = getRunColors(runs.length);
  const series = runs.map((r, i) => ({
    run: r,
    color: palette[i % palette.length],
    values: (map[r] || []).slice().sort((a, b) => a.tokens - b.tokens)
  }));

  // Highlight helpers shared by line hover and legend hover.
  const legendItems = () => legendBottom.querySelectorAll('.item');
  const highlightRun = (run) => {
    container.classList.add('hovering');
    gLines.selectAll('path.run').classed('ghost', s => s.run !== run);
    gPoints.selectAll('circle.pt').classed('ghost', p => p.run !== run);
    legendItems().forEach(el => el.classList.toggle('ghost', el.getAttribute('data-run') !== run));
  };
  const clearHighlight = () => {
    container.classList.remove('hovering');
    gLines.selectAll('path.run').classed('ghost', false);
    gPoints.selectAll('circle.pt').classed('ghost', false);
    legendItems().forEach(el => el.classList.remove('ghost'));
  };

  // Visible lines.
  const paths = gLines.selectAll('path.run').data(series, d => d.run);
  const pathsEnter = paths.enter().append('path').attr('class', 'run').attr('fill', 'none').attr('stroke-width', 2).attr('stroke', d => d.color).attr('d', d => lineGen(d.values));
  pathsEnter.merge(paths).transition().duration(200).attr('stroke', d => d.color).attr('d', d => lineGen(d.values));
  paths.exit().remove();

  // Wide transparent strokes that make the thin lines easier to hover.
  // NOTE(review): the overlay rect is appended after this group in the setup
  // code, so it appears to sit above these paths and may receive the pointer
  // events first — confirm that line hover actually fires.
  const captures = gLines.selectAll('path.run-hover').data(series, d => `cap-${d.run}`);
  captures.enter().append('path').attr('class', 'run-hover').attr('fill', 'none').attr('stroke', 'transparent').attr('stroke-width', 12).style('pointer-events', 'stroke')
    .attr('d', d => lineGen(d.values))
    .merge(captures)
    .attr('d', d => lineGen(d.values))
    .on('mouseenter', (ev, d) => highlightRun(d.run))
    .on('mouseleave', clearHighlight);
  captures.exit().remove();

  // Data points.
  const allPts = series.flatMap(s => s.values.map(v => ({ run: s.run, color: s.color, tokens: v.tokens, value: v.value })));
  const ptsSel = gPoints.selectAll('circle.pt').data(allPts, d => `${d.run}-${d.tokens}`);
  ptsSel.enter().append('circle').attr('class', 'pt').attr('r', 2).attr('fill', d => d.color).attr('fill-opacity', 0.6)
    .attr('cx', d => xScale(d.tokens)).attr('cy', d => yScale(d.value))
    .merge(ptsSel).transition().duration(150).attr('cx', d => xScale(d.tokens)).attr('cy', d => yScale(d.value));
  ptsSel.exit().remove();

  // Legend (rebuilt on every render, so listeners are attached to fresh nodes).
  const legendMarkup = series
    .map(s => `<span class="item" data-run="${escapeHtml(s.run)}"><span class="swatch" style="background:${escapeHtml(s.color)}"></span><span>${escapeHtml(s.run)}</span></span>`)
    .join('');
  legendBottom.innerHTML = `<div class="legend-title">Legend</div><div class="items">${legendMarkup}</div>`;
  legendItems().forEach(el => {
    el.addEventListener('mouseenter', () => {
      const run = el.getAttribute('data-run');
      if (!run) return;
      highlightRun(run);
    });
    el.addEventListener('mouseleave', clearHighlight);
  });

  // Tooltip lookups are built once per render instead of on every mouse move.
  const tokenStops = Array.from(new Set(allPts.map(p => p.tokens))).sort((a, b) => a - b);
  const lookups = series.map(s => ({
    run: s.run,
    color: s.color,
    byTokens: new Map(s.values.map(v => [v.tokens, v.value]))
  }));
  const titleHtml = `<div style="font-weight:800;letter-spacing:.1px;">${escapeHtml(prettyMetricLabel(metricKey))}</div>`;

  function onMove(ev){
    const [mx, my] = d3.pointer(ev, overlay.node());
    const sx = xScale.invert(mx);

    // Snap to the token count closest to the pointer.
    const nearest = tokenStops.reduce((best, s) => Math.abs(s - sx) < Math.abs(best - sx) ? s : best, tokenStops[0]);
    const xpx = xScale(nearest);
    hoverLine.style('display', null).attr('x1', xpx).attr('x2', xpx);

    let html = `${titleHtml}<div style="font-size:11px;color:var(--muted-color);margin-top:-4px;margin-bottom:2px;">${formatTokens(nearest)} tokens</div>`;
    const entries = lookups
      .map(s => ({ run: s.run, color: s.color, val: s.byTokens.get(nearest) }))
      .filter(e => e.val != null);
    entries.sort((a, b) => a.val - b.val);
    entries.forEach(e => {
      html += `<div style="display:flex;align-items:center;gap:6px;white-space:nowrap;"><span class="d3-tooltip__color-dot" style="background:${escapeHtml(e.color)}"></span><strong>${escapeHtml(e.run)}</strong><span style="margin-left:auto;">${(+e.val).toFixed(4)}</span></div>`;
    });
    tipInner.innerHTML = html;
    tip.style.opacity = '1';
    tip.style.transform = `translate(${Math.round(mx + margin.left + 12)}px, ${Math.round(my + margin.top + 12)}px)`;
  }
  function onLeave(){
    tip.style.opacity = '0';
    tip.style.transform = 'translate(-9999px, -9999px)';
    hoverLine.style('display', 'none');
  }
  overlay.on('mousemove', onMove).on('mouseleave', onLeave);
}
| | |
| | |
// Load the CSV, index it by metric and run, populate the metric selector and
// draw the initial chart. Any failure is shown inside the container as text.
(async () => {
  try {
    // Candidate locations, tried in order until one responds with a non-empty body.
    const csvPaths = [
      '/data/evals_tp_bug_fix_200B.csv',
      './assets/data/evals_tp_bug_fix_200B.csv',
      '../assets/data/evals_tp_bug_fix_200B.csv',
      '../../assets/data/evals_tp_bug_fix_200B.csv'
    ];

    let csvText = null;
    for (const path of csvPaths) {
      try {
        const response = await fetch(path, { cache: 'no-cache' });
        if (response.ok) {
          csvText = await response.text();
          break;
        }
      } catch (_) {
        // This candidate is unreachable from the current page; try the next one.
      }
    }

    if (!csvText) {
      throw new Error('CSV file not found: evals_tp_bug_fix_200B.csv');
    }

    // Unary plus turns an empty cell into 0, which would be plotted as a real
    // data point; map blank cells to NaN so the row is dropped instead.
    const toNumber = (cell) => {
      const text = String(cell == null ? '' : cell).trim();
      return text === '' ? NaN : +text;
    };

    // Keep only complete rows: named run, named metric, numeric tokens and value.
    const rows = d3.csvParse(csvText, d => ({
      run: (d.run_name || '').trim(),
      tokens: toNumber(d.tokens),
      metric: (d.metric || '').trim(),
      value: toNumber(d.value)
    })).filter(r => r.run && r.metric && !isNaN(r.tokens) && !isNaN(r.value));

    const metrics = Array.from(new Set(rows.map(r => r.metric))).sort();
    runOrder = Array.from(new Set(rows.map(r => r.run))).sort();

    // Every metric gets an entry for every run (possibly empty), then rows are
    // distributed in a single pass.
    metrics.forEach(m => {
      const perRun = Object.create(null);
      runOrder.forEach(r => { perRun[r] = []; });
      dataByMetric.set(m, perRun);
    });
    rows.forEach(r => {
      dataByMetric.get(r.metric)[r.run].push({ tokens: r.tokens, value: r.value });
    });

    metrics.forEach(m => {
      const o = document.createElement('option');
      o.value = m;
      o.textContent = prettyMetricLabel(m);
      selectMetric.appendChild(o);
    });

    if (metrics.length) {
      // Default to HellaSwag when present, otherwise the first metric alphabetically.
      const initial = metrics.find(m => m === 'hellaswag') || metrics[0];
      selectMetric.value = initial;
    }
    // A selector with at most one choice is just noise.
    if (metrics.length <= 1) {
      controls.style.display = 'none';
    }

    const rerender = () => render(selectMetric.value);
    rerender();
    selectMetric.addEventListener('change', rerender);
    if (window.ResizeObserver) {
      const ro = new ResizeObserver(() => rerender());
      ro.observe(container);
    } else {
      window.addEventListener('resize', rerender);
    }
  } catch (e) {
    const pre = document.createElement('pre');
    pre.textContent = 'CSV load error: ' + (e && e.message ? e.message : e);
    pre.style.color = 'var(--danger, #b00020)';
    pre.style.fontSize = '12px';
    pre.style.whiteSpace = 'pre-wrap';
    container.appendChild(pre);
  }
})();
| | }; |
| | |
// Mount the chart as soon as the DOM is parsed: defer to DOMContentLoaded while
// the document is still loading, otherwise start right away.
const startChart = () => ensureD3(bootstrap);
if (document.readyState !== 'loading') {
  startChart();
} else {
  document.addEventListener('DOMContentLoaded', startChart, { once: true });
}
| | })(); |
| | </script> |
| |
|