import math

import streamlit as st

st.set_page_config(page_title="LLM Compute Estimator", page_icon="🧮", layout="centered")

st.title("🧮 LLM Compute-Optimal Estimator")
st.caption("Estimate total FLOPs, wall-clock time, steps, and cost for pretraining, with a Chinchilla-style token rule.")

with st.sidebar:
    st.logo("./static/logo_light.png")
    st.header("Assumptions & Notes")
    st.markdown(
        """
**Formulas**
- **Total FLOPs** ≈ `c * N_params * N_tokens`, with default **c = 6** (≈ 2 FLOPs forward + 4 FLOPs backward per parameter per token; optimizer overhead is comparatively small).
- **Compute-optimal tokens** (rule of thumb): `N_tokens ≈ k * N_params`, default **k = 20**.
- **Effective compute** = `GPU_count * (peak TFLOPs * 1e12) * efficiency`.

**Disclaimers**
- This is a *back-of-the-envelope* estimator. Real training efficiency depends on the data pipeline, parallelism strategy, sequence length, kernel fusion, optimizer, etc.
- Preset TFLOPs are **approximate** and depend on precision (FP8/BF16), sparsity, clocks, and vendor kernels.
"""
    )
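
# Worked example of the formulas above (back-of-the-envelope sketch using the app's
# defaults): a 4B-parameter model with k = 20 needs roughly 4e9 * 20 = 80e9 training
# tokens, so total compute is about C = 6 * 4e9 * 80e9 ≈ 1.9e21 FLOPs.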

st.subheader("1) Model & Token Budget")
col1, col2, col3 = st.columns([1.2, 1, 1])
with col1:
    model_params_b = st.number_input("Model size (Billions of parameters)", min_value=0.05, value=4.0, step=0.5, format="%.2f")
with col2:
    c_overhead = st.number_input("c (FLOPs constant)", min_value=4.0, value=6.0, step=0.5)
with col3:
    k_tokens_per_param = st.number_input("k (tokens per param for compute-optimal)", min_value=5.0, value=20.0, step=1.0)

use_compute_optimal = st.toggle("Use compute‑optimal tokens (k × params)", value=True)
if use_compute_optimal:
    tokens_b = model_params_b * k_tokens_per_param
    st.info(f"Compute‑optimal token budget ≈ **{tokens_b:,.2f} B** (k = {k_tokens_per_param:g})")
else:
    tokens_b = st.number_input("Token budget (Billions)", min_value=1.0, value=80.0, step=5.0, format="%.2f")
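
# The k ≈ 20 tokens-per-parameter default follows the Chinchilla result
# (Hoffmann et al., 2022, "Training Compute-Optimal Large Language Models");
# e.g. the 4B-parameter default maps to a ~80B-token budget.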

st.subheader("2) Hardware")
col6, col7 = st.columns(2)
with col6:
    gpu_preset = st.selectbox(
        "GPU preset (approx peak TFLOPs per GPU)",
        (
            "Custom",
            "A100 80GB BF16 ≈ 312",
            "H100 SXM BF16 ≈ 989",
            "B200 FP8 ≈ 4500",
        ),
        index=0,
        help="Values are back-of-the-envelope. Choose 'Custom' to enter your own.",
    )

preset_map = {
    "A100 80GB BF16 ≈ 312": 312.0,
    "H100 SXM BF16 ≈ 989": 989.0,
    "B200 FP8 ≈ 4500": 4500.0,
}

with col7:
    if gpu_preset == "Custom":
        peak_tflops = st.number_input("Peak TFLOPs per GPU (approx)", min_value=10.0, value=989.0, step=100.0)
    else:
        peak_tflops = preset_map[gpu_preset]
        st.number_input("Peak TFLOPs per GPU (approx)", value=peak_tflops, disabled=True)

col8, col9, col10 = st.columns(3)
with col8:
    gpu_count = st.number_input("GPU count", min_value=1, value=8, step=1)
with col9:
    efficiency = st.slider("Training efficiency (MFU, %)", min_value=10, max_value=95, value=50, step=1)
with col10:
    price_per_gpu_hour = st.number_input("Price per GPU·hour (USD)", min_value=0.0, value=25.0, step=1.0)
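
# Worked example (sketch): 8 GPUs at an H100-like 989 peak TFLOPs each, running at
# 50% MFU, deliver roughly 8 * 989e12 * 0.5 ≈ 3.96e15 FLOPs/s of effective compute.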

st.subheader("3) Batch & Sequence Settings")
col4, col5 = st.columns(2)
with col4:
    micro_batch = st.number_input("Micro batch size per GPU", min_value=1, value=4, step=1, help="Number of sequences per GPU per optimizer step.")
with col5:
    seq_len = st.number_input("Sequence length (tokens)", min_value=128, value=2048, step=128)

tokens_per_step = micro_batch * seq_len * gpu_count
st.info(f"Tokens per optimization step ≈ {tokens_per_step:,} (with {gpu_count} GPUs)")

# Convert UI inputs (billions) to base units.
N_params = model_params_b * 1e9
N_tokens = tokens_b * 1e9
c = c_overhead

# Total training compute: C ≈ c * N_params * N_tokens.
flops_total = c * N_params * N_tokens

# Sustained cluster throughput: GPUs * peak FLOPs/s * achieved MFU.
effective_flops_per_s = gpu_count * (peak_tflops * 1e12) * (efficiency / 100.0)

# Wall-clock time to work through the FLOP budget at that throughput.
seconds = flops_total / effective_flops_per_s if effective_flops_per_s > 0 else float('inf')
hours = seconds / 3600
days = hours / 24

# Optimizer steps needed to consume the token budget.
steps = N_tokens / tokens_per_step if tokens_per_step > 0 else float('inf')

# Implied end-to-end token throughput.
throughput_tokens_per_s = N_tokens / seconds if seconds > 0 else float('inf')

# Projected dollar cost at the quoted per-GPU-hour rate.
cost = price_per_gpu_hour * gpu_count * hours
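
# End-to-end sketch with the defaults assumed above (4B params, 80B tokens, 8 GPUs at
# 989 TFLOPs, 50% MFU, $25/GPU·h): 1.92e21 / 3.96e15 ≈ 4.85e5 s ≈ 135 h ≈ 5.6 days,
# about 80e9 / 65,536 ≈ 1.22M steps, and roughly 25 * 8 * 135 ≈ $27,000.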

st.divider()
st.subheader("Results")

colA, colB = st.columns(2)
with colA:
    st.metric("Total FLOPs", f"{flops_total:,.2e} FLOPs")
    st.metric("Effective compute", f"{effective_flops_per_s:,.2e} FLOPs/s")
    st.metric("Steps (est)", f"{0 if steps == float('inf') else steps:,.0f}")
with colB:
    st.metric("Wall‑clock time", f"{hours:,.1f} h (~{days:,.2f} d)")
    st.metric("Throughput", f"{0 if throughput_tokens_per_s == float('inf') else throughput_tokens_per_s:,.0f} tok/s")
    st.metric("Projected cost", f"${0 if cost == float('inf') else cost:,.0f}")

st.divider()

st.markdown(
    f"""
**Summary**
- Params: **{model_params_b:,.2f}B** · Tokens: **{tokens_b:,.2f}B** (compute‑optimal: {'yes' if use_compute_optimal else 'no'})
- Constant **c = {c:g}** → Total ≈ **{flops_total:,.2e} FLOPs**.
- Hardware: **{gpu_count}× GPU**, peak **{peak_tflops:g} TFLOPs/GPU**, MFU **{efficiency}%** → Effective ≈ **{effective_flops_per_s:,.2e} FLOPs/s**.
- Time ≈ **{hours:,.1f} hours** (≈ {days:,.2f} days). Steps ≈ **{0 if steps == float('inf') else steps:,.0f}** (@ {tokens_per_step:,} tok/step).
- Rough cost ≈ **${0 if cost == float('inf') else cost:,.0f}** (@ ${price_per_gpu_hour:g}/GPU·h).
"""
)

with st.expander("What is the Chinchilla rule? Is it 1 epoch?"):
    st.markdown(
        """
**Chinchilla scaling** is a *compute‑optimal* rule of thumb: for a fixed compute budget, scale
the **training tokens** roughly in proportion to the **model parameters** (commonly ~20× tokens per parameter).
It is **not** about training for exactly one epoch. In web‑scale pretraining, datasets are often sampled with
replacement or mixed, so the model may see some data multiple times and other data less than once. The rule speaks
to the *total number of tokens* the model should process for the best use of compute, not to dataset passes.
"""
    )
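
# Epoch intuition (illustrative numbers, not tied to the inputs above): an 80B-token
# budget over a 40B-token corpus is ~2 passes through the data, while the same budget
# over a 400B-token corpus touches only ~20% of it once.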

st.success("Ready. Tweak inputs on the left to explore different scenarios.")