import json
from datetime import datetime, date

import gradio as gr
import plotly.graph_objects as go
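
# Gradio Space that charts "best score to date" curves for AI benchmarks and the
# quarterly capital expenditures of AI cloud companies, all read from local JSONL files.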


def create_ai_cloud_companies_plot() -> go.Figure:
    # Read data from the JSON Lines file.
    with open("ai_cloud_companies_capex.jsonl", "r") as file:
        data = [json.loads(line) for line in file if line.strip()]
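
    # Each record is expected to carry a "Quarter" label plus one numeric capex value
    # per company listed below, e.g. (illustrative values only):
    #   {"Quarter": "2023 Q1", "Microsoft": 7800, "Google": 6300, ...}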
    quarters: list[str] = [entry["Quarter"] for entry in data]
    companies = ['Microsoft', 'Google', 'Meta', 'Amazon', 'Oracle', 'CoreWeave']
    colors = ['#80bb00', '#ee161f', '#0065e3', '#ff6200', '#f80000', '#4b8bbe']
    x_positions = list(range(len(quarters)))

    traces = []
    for company, color in zip(companies, colors):
        y_data = [entry[company] for entry in data]
        traces.append(go.Bar(
            name=company,
            x=x_positions,
            y=y_data,
            marker_color=color
        ))

    fig = go.Figure(data=traces)
    fig.update_layout(
        barmode="stack",
        title="Capital Expenditures (PP&E) of AI Cloud Companies",
        xaxis_title="Quarter",
        yaxis_title="Capital Expenditures (Millions USD)",
        xaxis=dict(
            tickmode='array',
            tickvals=x_positions,
            ticktext=quarters
        ),
        height=800
    )
    # Calculate the x position for the AI race vertical dotted line.
    # We want the line drawn between "2023 Q1" and "2023 Q2".
    try:
        idx_q1 = quarters.index("2023 Q1")
        idx_q2 = quarters.index("2023 Q2")
        ai_race_vline_x = (idx_q1 + idx_q2) / 2  # position midway between the two quarters
    except ValueError:
        # Fall back if quarters not found.
        ai_race_vline_x = 0

    # Calculate the x position for the COVID-19 pandemic vertical dotted line.
    # We want the line drawn between "2020 Q1" and "2020 Q2".
    try:
        covid_idx_q1 = quarters.index("2020 Q1")
        covid_idx_q2 = quarters.index("2020 Q2")
        covid_vline_x = (covid_idx_q1 + covid_idx_q2) / 2  # position midway between the two quarters
    except ValueError:
        # Fall back if quarters not found.
        covid_vline_x = 0
    # Add a vertical dotted line for the AI race spanning the full height.
    fig.add_shape(
        type="line",
        xref="x",
        yref="paper",
        x0=ai_race_vline_x,
        y0=0,
        x1=ai_race_vline_x,
        y1=1,
        line=dict(
            color="black",
            dash="dot",
            width=2
        )
    )

    # Add a vertical dotted line for the COVID-19 pandemic spanning the full height.
    fig.add_shape(
        type="line",
        xref="x",
        yref="paper",
        x0=covid_vline_x,
        y0=0,
        x1=covid_vline_x,
        y1=1,
        line=dict(
            color="black",
            dash="dot",
            width=2
        )
    )
    # Add an annotation label above the AI race vertical line.
    fig.add_annotation(
        x=ai_race_vline_x,
        y=1.05,  # place just above the top of the plotting area
        xref="x",
        yref="paper",
        text="AI race begins",
        showarrow=False,
        font=dict(
            color="black",
            size=12
        ),
        align="center"
    )

    # Add an annotation label above the COVID-19 pandemic vertical line.
    fig.add_annotation(
        x=covid_vline_x,
        y=1.05,
        xref="x",
        yref="paper",
        text="COVID-19 pandemic begins",
        showarrow=False,
        font=dict(
            color="black",
            size=12
        ),
        align="center"
    )

    return fig


def create_simple_plot(data_path: str,
                       name: str,
                       subtitle: str,
                       start_date: date, end_date: date,
                       min_value: float = 0, max_value: float = 100,
                       labeled_horizontal_lines: dict[str, float] | None = None) -> go.Figure:
    leaderboard = []
    with open(data_path, 'r') as file:
        for line in file:
            leaderboard.append(json.loads(line))

    models = []
    with open("models.jsonl", 'r') as file:
        for line in file:
            models.append(json.loads(line))
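
    # Leaderboard records are expected to look like {"model": ..., "score": ...};
    # models.jsonl records are matched on "Name" and must provide a "Release Date"
    # in YYYY-MM-DD format (shapes inferred from the lookups below).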
    data = []
    for entry in leaderboard:
        model_name = entry['model']
        score = entry['score']
        model_info = next((m for m in models if m['Name'] == model_name), None)
        if model_info:
            release_date = datetime.strptime(model_info['Release Date'], "%Y-%m-%d")
            data.append({'model': model_name, 'score': score, 'release_date': release_date})
        else:
            print(f"[WARNING] Model '{model_name}' not found in models.jsonl")

    data.sort(key=lambda x: x['release_date'])
    x_dates = [d['release_date'] for d in data]
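
    # Track the running best score so the step line only rises when a new model
    # sets a record.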
    y_scores = []
    max_score = 0
    for entry in data:
        if entry['score'] > max_score:
            max_score = entry['score']
        y_scores.append(max_score)
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=x_dates,
        y=y_scores,
        mode='lines',
        line=dict(shape='hv', width=2),
        name='Best Score to Date'
    ))

    for i, entry in enumerate(data):
        if i == 0 or y_scores[i] > y_scores[i - 1]:
            fig.add_trace(go.Scatter(
                x=[entry['release_date']],
                y=[entry['score']],
                mode='markers+text',
                marker=dict(size=10),
                text=[entry['model']],
                textposition="top center",
                name=entry['model']
            ))
    fig.update_layout(
        title=f'{name} Over Time<br><sup>{subtitle}</sup>',
        xaxis_title='Publication or Release Date',
        yaxis_title=name,
        hovermode='x unified',
        xaxis=dict(
            range=[start_date, end_date],
            type='date'
        ),
        yaxis=dict(
            range=[min_value, max_value]
        ),
        height=800
    )
    if labeled_horizontal_lines:
        for label, y_value in labeled_horizontal_lines.items():
            fig.add_hline(
                y=y_value,
                line_dash="dot",
                line_color="black",
                annotation_text=label,
                annotation_position="right",
                annotation=dict(
                    font_size=12,
                    font_color="black",
                    xanchor="left",
                    yanchor="middle",
                    xshift=10
                )
            )

    return fig


with gr.Blocks() as demo:
    with gr.Tab("System Performance Over Time"):
        with gr.Tab("Legend"):
            legend_markdown: gr.Markdown = gr.Markdown(
                value="""
## Benchmarks and Top Scores

| Benchmark | Top Score |
|-----------|-----------|
| Humanity's Last Exam | 🔴 7% |
| BigCodeBench | 🟠 36% |
| Simple Bench | 🟠 42% |
| EMMA-Mini | 🟠 48% |
| PlanBench | 🟠 53% |
| NYT Connections | 🟡 60% |
| GAIA | 🟡 65% |
| LiveBench Language | 🟡 65% |
| LiveBench Data Analysis | 🟡 71% |
| LiveCodeBench | 🟡 73% |
| ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
| LiveBench | 🟡 76% |
| GPQA | 🟡 76% |
| LiveBench Mathematics | 🟡 81% |
| ZebraLogic | 🟡 81% |
| LiveBench Coding | 🟡 83% |
| ARC-AGI-Pub (Public Eval) | 🟡 83% |
| LiveBench IF | 🟡 86% |
| ZeroEval | 🟡 86% |
| MATH-L5 | 🟡 89% |
| LiveBench Reasoning | 🟢 92% |
| MMLU-Redux | 🟢 93% |
| CRUX | 🟢 96% |

## Colors

| Color | Score Range |
|-------|------------|
| 🔴 Red | Below 30% |
| 🟠 Orange | 30% to 60% |
| 🟡 Yellow | 60% to 90% |
| 🟢 Green | Above 90% |"""
            )
| with gr.Tab("🔴 Humanity's Last Exam") as humanitys_last_exam_tab: | |
| humanitys_last_exam_plot: gr.Plot = gr.Plot() | |
| humanitys_last_exam_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [Humanity's Last Exam Quantitative Results](https://lastexam.ai/)""" | |
| ) | |
| with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab: | |
| bigcodebench_plot: gr.Plot = gr.Plot() | |
| bigcodebench_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [BigCodeBench Leaderboard](https://bigcode-bench.github.io/)""" | |
| ) | |
| with gr.Tab("🟠 Simple Bench") as simple_bench_tab: | |
| simple_bench_plot: gr.Plot = gr.Plot() | |
| simple_bench_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [SimpleBench Leaderboard](https://simple-bench.com/)""" | |
| ) | |
| with gr.Tab("🟠 EMMA-Mini") as emma_tab: | |
| emma_plot: gr.Plot = gr.Plot() | |
| emma_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)""" | |
| ) | |
| with gr.Tab("🟠 PlanBench") as planbench_tab: | |
| planbench_plot: gr.Plot = gr.Plot() | |
| planbench_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)""" | |
| ) | |
| with gr.Tab("🟡 NYT Connections") as nyt_connections_tab: | |
| nyt_connections_plot: gr.Plot = gr.Plot() | |
| nyt_connections_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)""" | |
| ) | |
| with gr.Tab("🟡 GAIA") as gaia_tab: | |
| gaia_plot: gr.Plot = gr.Plot() | |
| gaia_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)""" | |
| ) | |
| with gr.Tab("🟡 LiveBench Language") as livebench_language_tab: | |
| livebench_language_plot: gr.Plot = gr.Plot() | |
| livebench_language_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [LiveBench Leaderboard](https://livebench.ai/)""" | |
| ) | |
| with gr.Tab("🟡 LiveBench Data Analysis") as livebench_data_analysis_tab: | |
| livebench_data_analysis_plot: gr.Plot = gr.Plot() | |
| livebench_data_analysis_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [LiveBench Leaderboard](https://livebench.ai/)""" | |
| ) | |
| with gr.Tab("🟡 LiveCodeBench") as livecodebench_tab: | |
| livecodebench_plot: gr.Plot = gr.Plot() | |
| livecodebench_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [LiveCodeBench Leaderboard](https://livecodebench.github.io/leaderboard.html)""" | |
| ) | |
| with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab: | |
| with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab: | |
| arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot() | |
| with gr.Tab("🟡 Public Eval") as arc_agi_public_eval_tab: | |
| arc_agi_public_eval_plot: gr.Plot = gr.Plot() | |
| arc_agi_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)""" | |
| ) | |
| with gr.Tab("🟡 LiveBench") as livebench_tab: | |
| livebench_plot: gr.Plot = gr.Plot() | |
| livebench_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [LiveBench Leaderboard](https://livebench.ai/)""" | |
| ) | |
| with gr.Tab("🟡 GPQA") as gpqa_tab: | |
| gpqa_plot: gr.Plot = gr.Plot() | |
| gpqa_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)""" | |
| ) | |
| with gr.Tab("🟡 LiveBench Mathematics") as livebench_mathematics_tab: | |
| livebench_mathematics_plot: gr.Plot = gr.Plot() | |
| livebench_mathematics_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [LiveBench Leaderboard](https://livebench.ai/)""" | |
| ) | |
| with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab: | |
| zeroeval_zebralogic_plot: gr.Plot = gr.Plot() | |
| zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)""" | |
| ) | |
| with gr.Tab("🟡 LiveBench Coding") as livebench_coding_tab: | |
| livebench_coding_plot: gr.Plot = gr.Plot() | |
| livebench_coding_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [LiveBench Leaderboard](https://livebench.ai/)""" | |
| ) | |
| with gr.Tab("🟡 LiveBench IF") as livebench_if_tab: | |
| livebench_if_plot: gr.Plot = gr.Plot() | |
| livebench_if_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [LiveBench IF](https://livebench.ai/)""" | |
| ) | |
| with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab: | |
| zeroeval_average_plot: gr.Plot = gr.Plot() | |
| zeroeval_average_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)""" | |
| ) | |
| with gr.Tab("🟡 MATH-L5") as zeroeval_math_l5_tab: | |
| zeroeval_math_l5_plot: gr.Plot = gr.Plot() | |
| zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)""" | |
| ) | |
| with gr.Tab("🟢 LiveBench Reasoning") as livebench_reasoning_tab: | |
| livebench_reasoning_plot: gr.Plot = gr.Plot() | |
| livebench_reasoning_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [LiveBench Leaderboard](https://livebench.ai/)""" | |
| ) | |
| with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab: | |
| zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot() | |
| zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)""" | |
| ) | |
| with gr.Tab("🟢 CRUX") as zeroeval_crux_tab: | |
| zeroeval_crux_plot: gr.Plot = gr.Plot() | |
| zeroeval_crux_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)""" | |
| ) | |
| with gr.Tab("PhysBench", visible=False): | |
| physbench_plot: gr.Plot = gr.Plot() | |
| physbench_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [PhysBench Leaderboard](https://physbench.github.io/#leaderboard)""" | |
| ) | |
| with gr.Tab("MMVU", visible=False): | |
| mmvu_plot: gr.Plot = gr.Plot() | |
| mmvu_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [MMVU Leaderboard](https://mmvu-benchmark.github.io/#leaderboard)""" | |
| ) | |
| with gr.Tab("EvalPlus", visible=False): | |
| evalplus_plot: gr.Plot = gr.Plot() | |
| evalplus_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [EvalPlus Leaderboard](https://evalplus.github.io/leaderboard.html)""" | |
| ) | |
| with gr.Tab("MultiChallenge", visible=False): | |
| multichallenge_plot: gr.Plot = gr.Plot() | |
| multichallenge_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [SEAL Leaderboard: MultiChallenge](https://scale.com/leaderboard/multichallenge)""" | |
| ) | |
| with gr.Tab("VISTA", visible=False): | |
| vista_plot: gr.Plot = gr.Plot() | |
| vista_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [SEAL Leaderboard: Visual-Language Understanding](https://scale.com/leaderboard/visual_language_understanding)""" | |
| ) | |
| with gr.Tab("ToolComp", visible=False): | |
| with gr.Tab("Enterprise"): | |
| toolcomp_enterprise_plot: gr.Plot = gr.Plot() | |
| toolcomp_enterprise_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [SEAL Leaderboard: Agentic Tool Use (Enterprise)](https://scale.com/leaderboard/tool_use_enterprise)""" | |
| ) | |
| with gr.Tab("Chat"): | |
| toolcomp_chat_plot: gr.Plot = gr.Plot() | |
| toolcomp_chat_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [SEAL Leaderboard: Agentic Tool Use (Chat)](https://scale.com/leaderboard/tool_use_chat)""" | |
| ) | |
| with gr.Tab("BFCL", visible=False): | |
| bfcl_plot: gr.Plot = gr.Plot() | |
| bfcl_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [BFCL Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html)""" | |
| ) | |
| with gr.Tab("Aider Polyglot", visible=False): | |
| aider_plot: gr.Plot = gr.Plot() | |
| aider_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [Aider LLM Leaderboards](https://aider.chat/docs/leaderboards/)""" | |
| ) | |
| with gr.Tab("OpenCompass", visible=False): | |
| opencompass_plot: gr.Plot = gr.Plot() | |
| opencompass_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [OpenCompass LLM Leaderboard](https://huggingface.co/spaces/opencompass/opencompass-llm-leaderboard)""" | |
| ) | |
| with gr.Tab("SWE-bench", visible=False): | |
| swe_bench_plot: gr.Plot = gr.Plot() | |
| swe_bench_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/)""" | |
| ) | |
| with gr.Tab("SWE-bench Multimodal", visible=False): | |
| swe_bench_multimodal_plot: gr.Plot = gr.Plot() | |
| swe_bench_multimodal_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/#multimodal)""" | |
| ) | |
| with gr.Tab("WebArena", visible=False): | |
| webarena_plot: gr.Plot = gr.Plot() | |
| webarena_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)""" | |
| ) | |
| with gr.Tab("OSWorld", visible=False): | |
| osworld_plot: gr.Plot = gr.Plot() | |
| osworld_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [OSWorld Benchmark](https://os-world.github.io/)""" | |
| ) | |
| with gr.Tab("MathVista", visible=False): | |
| mathvista_plot: gr.Plot = gr.Plot() | |
| mathvista_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [Leaderboard on MathVista](https://mathvista.github.io/#leaderboard)""" | |
| ) | |
| with gr.Tab("DABStep", visible=False): | |
| dabstep_plot: gr.Plot = gr.Plot() | |
| dabstep_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [DABStep Leaderboard](https://huggingface.co/spaces/adyen/DABstep)""" | |
| ) | |
| with gr.Tab("lineage-bench", visible=False): | |
| lineage_bench_plot: gr.Plot = gr.Plot() | |
| lineage_bench_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [lineage-bench Results](https://github.com/fairydreaming/lineage-bench)""" | |
| ) | |
| with gr.Tab("Step-Game", visible=False): | |
| step_game_plot: gr.Plot = gr.Plot() | |
| step_game_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [Step-Game TrueSkill Leaderboard](https://github.com/lechmazur/step_game)""" | |
| ) | |
| with gr.Tab("HHEM", visible=False): | |
| hhem_plot: gr.Plot = gr.Plot() | |
| hhem_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)""" | |
| ) | |
| with gr.Tab("USACO", visible=False): | |
| usaco_plot: gr.Plot = gr.Plot() | |
| usaco_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [USACO Leaderboard](https://hal.cs.princeton.edu/usaco)""" | |
| ) | |
| with gr.Tab("AppWorld", visible=False): | |
| appworld_plot: gr.Plot = gr.Plot() | |
| appworld_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [AppWorld Agent Scores](https://appworld.dev/leaderboard)""" | |
| ) | |
| with gr.Tab("CORE-Bench", visible=False): | |
| core_bench_plot: gr.Plot = gr.Plot() | |
| core_bench_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [HAL Leaderboards](https://hal.cs.princeton.edu/#leaderboards)""" | |
| ) | |
| with gr.Tab("Cybench", visible=False): | |
| cybench_plot: gr.Plot = gr.Plot() | |
| cybench_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [Cybench Leaderboard](https://hal.cs.princeton.edu/cybench)""" | |
| ) | |
| with gr.Tab("QuALITY", visible=False): | |
| quality_plot: gr.Plot = gr.Plot() | |
| quality_markdown: gr.Markdown = gr.Markdown( | |
| value="""Source: [QuALITY Leaderboard](https://nyu-mll.github.io/quality/)""" | |
| ) | |
| with gr.Tab("Finance") as finance_tab: | |
| with gr.Tab("Big Tech Capex") as big_five_capex_tab: | |
| big_five_capex_plot: gr.Plot = gr.Plot() | |
| with gr.Tab("NVIDIA Revenue", visible=False) as nvidia_revenue: | |
| nvidia_revenue_plot: gr.Plot = gr.Plot() | |
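
    # Wire each tab's select event to its plotting function. Static arguments are
    # passed through gr.State components, so a plot is (re)built whenever its tab
    # is selected.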
    big_five_capex_tab.select(fn=create_ai_cloud_companies_plot, outputs=big_five_capex_plot)
    arc_agi_public_eval_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("arc_agi_leaderboard.jsonl"),
                                           gr.State(
                                               "ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                           gr.State(
                                               "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                                           gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
                                           gr.State(0), gr.State(100),
                                           gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
                                   outputs=arc_agi_public_eval_plot)
    arc_agi_tab.select(fn=create_simple_plot,
                       inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
                               gr.State(
                                   "ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                               gr.State(
                                   "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                               gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
                               gr.State(0), gr.State(100),
                               gr.State({"MTurkers": 77})],
                       outputs=arc_agi_semi_private_eval_plot)
    arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
                                         inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
                                                 gr.State(
                                                     "ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                                 gr.State(
                                                     "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                                                 gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                                                 gr.State(0), gr.State(100),
                                                 gr.State({"MTurkers": 77})],
                                         outputs=arc_agi_semi_private_eval_plot)
    finance_tab.select(fn=create_ai_cloud_companies_plot, outputs=big_five_capex_plot)
    simple_bench_tab.select(fn=create_simple_plot,
                            inputs=[gr.State("simple_bench_leaderboard.jsonl"),
                                    gr.State("Simple Bench Score"),
                                    gr.State(
                                        "\"multiple-choice text benchmark [...] [including] over 200 questions covering spatio-temporal reasoning, social intelligence, and what we call linguistic adversarial robustness\" (Philip & Hemang, 2024)"),
                                    gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1)),
                                    gr.State(0), gr.State(100),
                                    gr.State({"Humans": 83.7})],
                            outputs=simple_bench_plot)
    planbench_tab.select(fn=create_simple_plot,
                         inputs=[gr.State("planbench_leaderboard.jsonl"),
                                 gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
                                 gr.State(
                                     "\"benchmark suite based on the kinds of domains used in the automated planning community [...] to test the capabilities of LLMs in planning or reasoning about actions and change.\" (Valmeekam et al. 2022)"),
                                 gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
                         outputs=planbench_plot)
    bigcodebench_tab.select(fn=create_simple_plot,
                            inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
                                    gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
                                    gr.State(
                                        "\"benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks\" (Zhuo et al. 2024)"),
                                    gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
                            outputs=bigcodebench_plot)
    gaia_tab.select(fn=create_simple_plot,
                    inputs=[gr.State("gaia_leaderboard.jsonl"),
                            gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
                            gr.State(
                                "\"real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency\" (Mialon et al. 2023)"),
                            gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
                            gr.State(0), gr.State(100),
                            gr.State({"Humans": 92})],
                    outputs=gaia_plot)
    gpqa_tab.select(fn=create_simple_plot,
                    inputs=[gr.State("gpqa_leaderboard.jsonl"),
                            gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
                            gr.State(
                                "\"challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry [that] are high-quality and extremely difficult\" (Rein et al. 2023)"),
                            gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
                            gr.State(25), gr.State(100),
                            gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
                    outputs=gpqa_plot)
    zeroeval_average_tab.select(fn=create_simple_plot,
                                inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
                                        gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-L5) Score"),
                                        gr.State(
                                            "\"a simple unified framework for evaluating language models on various tasks\" (Ai2, 2024)"),
                                        gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                outputs=zeroeval_average_plot)
    zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
                                           gr.State(
                                               "ZeroEval MMLU-Redux (Massive Multitask Language Understanding) Score"),
                                           gr.State(
                                               "\"knowledge reasoning\" (Ai2, 2024); \"subset of 3,000 manually re-annotated questions across 30 MMLU subjects\" (Gema et al. 2024)"),
                                           gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                   outputs=zeroeval_mmlu_redux_plot)
    zeroeval_zebralogic_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
                                           gr.State("ZeroEval ZebraLogic Score"),
                                           gr.State(
                                               "\"logical reasoning\" (Ai2, 2024); \"Each example is a Logic Grid Puzzle [...] often used to test humans' logical reasoning abilities\" (Lin, 2024)"),
                                           gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                   outputs=zeroeval_zebralogic_plot)
    zeroeval_crux_tab.select(fn=create_simple_plot,
                             inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
                                     gr.State(
                                         "ZeroEval CRUX (Code Reasoning, Understanding, and eXecution Evaluation) Score"),
                                     gr.State(
                                         "\"code reasoning\" (Ai2, 2024); \"benchmark consisting of 800 Python functions (3-13 lines). Each function comes with [...] two natural tasks: input prediction and output prediction.\" (Gu et al. 2024)"),
                                     gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                             outputs=zeroeval_crux_plot)
    zeroeval_math_l5_tab.select(fn=create_simple_plot,
                                inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
                                        gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
                                        gr.State(
                                            "\"math reasoning\" (Ai2, 2024); \"dataset of 12,500 challenging competition mathematics problems. [...] a subject’s hardest problems are assigned a difficulty level of ‘5.’\" (Hendrycks et al. 2021)"),
                                        gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                outputs=zeroeval_math_l5_plot)
    livebench_tab.select(fn=create_simple_plot,
                         inputs=[gr.State("livebench.jsonl"),
                                 gr.State("LiveBench-2024-11-25: Global Average Score"),
                                 gr.State(
                                     "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
                                 gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
                         outputs=livebench_plot)
    livebench_reasoning_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("livebench_reasoning.jsonl"),
                                           gr.State("LiveBench-2024-11-25: Reasoning Average Score"),
                                           gr.State(
                                               "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
                                           gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
                                   outputs=livebench_reasoning_plot)
    livebench_coding_tab.select(fn=create_simple_plot,
                                inputs=[gr.State("livebench_coding.jsonl"),
                                        gr.State("LiveBench-2024-11-25: Coding Average Score"),
                                        gr.State(
                                            "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
                                        gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
                                outputs=livebench_coding_plot)
    livebench_mathematics_tab.select(fn=create_simple_plot,
                                     inputs=[gr.State("livebench_mathematics.jsonl"),
                                             gr.State("LiveBench-2024-11-25: Mathematics Average Score"),
                                             gr.State(
                                                 "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
                                             gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
                                     outputs=livebench_mathematics_plot)
    livebench_data_analysis_tab.select(fn=create_simple_plot,
                                       inputs=[gr.State("livebench_data_analysis.jsonl"),
                                               gr.State("LiveBench-2024-11-25: Data Analysis Average Score"),
                                               gr.State(
                                                   "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
                                               gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
                                       outputs=livebench_data_analysis_plot)
    livebench_language_tab.select(fn=create_simple_plot,
                                  inputs=[gr.State("livebench_language.jsonl"),
                                          gr.State("LiveBench-2024-11-25: Language Average Score"),
                                          gr.State(
                                              "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
                                          gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
                                  outputs=livebench_language_plot)
    livebench_if_tab.select(fn=create_simple_plot,
                            inputs=[gr.State("livebench_if.jsonl"),
                                    gr.State("LiveBench-2024-11-25: IF Average Score"),
                                    gr.State(
                                        "\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
                                    gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
                            outputs=livebench_if_plot)
    humanitys_last_exam_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("humanitys_last_exam.jsonl"),
                                           gr.State("Humanity's Last Exam (Multi-Modal Models Only) Score"),
                                           gr.State(
                                               "\"multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage\" (Phan et al. 2025)"),
                                           gr.State(date(2024, 5, 13)), gr.State(date(2025, 2, 11))],
                                   outputs=humanitys_last_exam_plot)
    livecodebench_tab.select(fn=create_simple_plot,
                             inputs=[gr.State("livecodebench.jsonl"),
                                     gr.State("LiveCodeBench (7/1/2024 to 2/1/2025) Score"),
                                     gr.State(
                                         "\"comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms\" (Jain et al. 2024)"),
                                     gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1))],
                             outputs=livecodebench_plot)
    emma_tab.select(fn=create_simple_plot,
                    inputs=[gr.State("emma_mini.jsonl"),
                            gr.State("EMMA-Mini (Enhanced MultiModal ReAsoning) Score"),
                            gr.State(
                                "\"benchmark targeting organic multimodal reasoning across mathematics, physics, chemistry, and coding\" (Hao et al. 2025)"),
                            gr.State(date(2024, 9, 17)), gr.State(date(2025, 2, 1)),
                            gr.State(22.75), gr.State(100),
                            gr.State({"Human experts": 77.75})],
                    outputs=emma_plot)
    nyt_connections_tab.select(fn=create_simple_plot,
                               inputs=[gr.State("nyt_connections.jsonl"),
                                       gr.State("NYT Connections (Extended Version, Newest 100 Puzzles) Score"),
                                       gr.State(
                                           "\"NYT Connections puzzles [...] To increase difficulty, Extended Connections adds up to four extra trick words to each puzzle.\" (Mazur, 2025)"),
                                       gr.State(date(2024, 7, 23)), gr.State(date(2025, 2, 1))],
                               outputs=nyt_connections_plot)

if __name__ == "__main__":
    demo.launch()