import json
import pickle
from datetime import datetime, date
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
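# Gradio dashboard tracking AI progress: big-tech capex, chip designers' data
# center revenue, model efficiency, and benchmark scores over time.
# Plots are built lazily by the event handlers wired up at the bottom of the file.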
def create_big_five_capex_plot():
    # Capex in millions of USD per quarter of Microsoft, Google, Meta, Apple and Amazon
    big_five_capex = []
    with open("big_five_capex.jsonl", 'r') as file:
        for line in file:
            big_five_capex.append(json.loads(line))
    df = pd.DataFrame(big_five_capex)
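    # df holds one row per quarter: a "Quarter" label plus one capex column per
    # company (values in millions of USD, per the axis label below)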
    fig = go.Figure()
    companies = ['Microsoft', 'Google', 'Meta', 'Apple', 'Amazon']
    colors = ['#80bb00', '#ee161f', '#0065e3', '#000000', '#ff6200']
    for company, color in zip(companies, colors):
        fig.add_trace(go.Bar(
            x=df['Quarter'],
            y=df[company],
            name=company,
            marker_color=color
        ))
    # Dashed line after 2023 Q1; +0.5 shifts it to the boundary between bars
    fig.add_vline(
        x=df.index[df['Quarter'] == "2023 Q1"].tolist()[0] + 0.5,
        line_width=1,
        line_dash="dash",
        line_color="black",
        annotation_text="AI arms race begins",
        annotation_position="top right",
        annotation_font_size=12,
        annotation_font_color="black"
    )
    fig.update_layout(
        title='Capital Expenditure of the Big Five Tech Companies in Millions of U.S. Dollars per Quarter',
        xaxis_title='Quarter',
        yaxis_title='Capex (Millions of U.S. Dollars)',
        barmode='stack',
        legend_title='Companies',
        height=800
    )
    return fig

def create_chip_designers_data_center_revenue_plot():
    # Data center revenue in millions of USD per quarter of NVIDIA, AMD and Intel
    data_center_revenue_by_company = []
    with open("chip_designers_data_center_revenue.jsonl", 'r') as file:
        for line in file:
            data_center_revenue_by_company.append(json.loads(line))
    df = pd.DataFrame(data_center_revenue_by_company)
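    # Same one-JSON-object-per-line format as big_five_capex.jsonl: a "Quarter"
    # label plus one revenue column per company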
    fig = go.Figure()
    companies = ['NVIDIA', 'AMD', 'Intel']
    colors = ['#80bb00', '#ee161f', '#0065e3']  # TODO: pick final colors for NVIDIA, AMD and Intel
    for company, color in zip(companies, colors):
        fig.add_trace(go.Bar(
            x=df['Quarter'],
            y=df[company],
            name=company,
            marker_color=color
        ))
    fig.update_layout(
        title='Data Center Revenue of NVIDIA, AMD and Intel in Millions of U.S. Dollars per Quarter',
        xaxis_title='Quarter',
        yaxis_title='Data Center Revenue (Millions of U.S. Dollars)',
        barmode='stack',
        legend_title='Companies',
        height=800
    )
    return fig

def create_size_for_performance_plot(category_to_display: str,
                                     parameter_type_to_display: str,
                                     model_to_compare: str) -> tuple[go.Figure, gr.Dropdown, gr.Dropdown]:
    # Smallest open-weights model (by parameter count) matching or exceeding the
    # Chatbot Arena Elo rating of model_to_compare, plotted over release date
    with open('elo_results_20240915.pkl', 'rb') as file:
        elo_results = pickle.load(file)
    categories: list[str] = list(elo_results["text"].keys())
    if category_to_display not in categories:
        raise gr.Error(message=f"Category '{category_to_display}' not found.")
    elo_ratings_for_category: dict = dict(elo_results["text"][category_to_display]["elo_rating_final"])
    models: list[dict] = []
    with open("models.jsonl", 'r') as file:
        for line in file:
            models.append(json.loads(line))
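    # models.jsonl entries are expected to carry at least "Name", "Release Date"
    # and the parameter-count fields offered in the UI ("Total Parameters",
    # "Active Parameters"); shape inferred from the lookups below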
    size_for_performance_data: list[dict] = []
    for model_name, model_elo_rating in elo_ratings_for_category.items():
        model_entries_found = [model for model in models if model["Name"] == model_name]
        if model_entries_found:
            size_for_performance_data.append({
                "Name": model_name,
                "Release Date": model_entries_found[0]["Release Date"],
                "ELO Rating": model_elo_rating,
                parameter_type_to_display: model_entries_found[0][parameter_type_to_display]
            })
        else:
            print(f"[WARNING] Model '{model_name}' not found in models.jsonl")
    if model_to_compare not in elo_ratings_for_category:
        raise gr.Error(message=f"Model '{model_to_compare}' not found in category '{category_to_display}'.")
    comparison_model_elo_score = elo_ratings_for_category[model_to_compare]
    # Keep models with a known parameter count that match or beat the comparison model
    filtered_models = [model for model in size_for_performance_data
                       if model[parameter_type_to_display] > 0 and
                       model['ELO Rating'] >= comparison_model_elo_score]
    filtered_models.sort(key=lambda x: datetime.strptime(x['Release Date'], "%Y-%m-%d"))
    x_dates = [datetime.strptime(model['Release Date'], "%Y-%m-%d") for model in filtered_models]
    # Running minimum: smallest qualifying model released up to each date
    y_params = []
    min_param = float('inf')
    for model in filtered_models:
        param = model[parameter_type_to_display]
        if param <= min_param:
            min_param = param
        y_params.append(min_param)
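    # Step plot of the running minimum on a log y-axis, with a labeled marker
    # at each model that set a new size record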
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=x_dates,
        y=y_params,
        mode='lines',
        line=dict(shape='hv', width=2),
        name='Model Parameters'
    ))
    fig.update_layout(
        title=f'Model Size Progression for Open-Weights Models Reaching Performance of "{model_to_compare}" in "{category_to_display}" Category',
        xaxis_title='Release Date',
        yaxis_title=parameter_type_to_display,
        yaxis_type='log',
        hovermode='x unified',
        xaxis=dict(
            range=[date(2023, 2, 27), date(2024, 9, 15)],
            type='date'
        ),
        height=800
    )
    for i, model in enumerate(filtered_models):
        if i == 0 or y_params[i] < y_params[i - 1]:
            fig.add_trace(go.Scatter(
                x=[x_dates[i]],
                y=[y_params[i]],
                mode='markers+text',
                marker=dict(size=10),
                text=[model['Name']],
                textposition="top center",
                name=model['Name']
            ))
    # Return refreshed dropdowns so the category and comparison-model choices
    # reflect what is actually present in the Elo results
    return (fig,
            gr.Dropdown(choices=categories, value=category_to_display, interactive=True),
            gr.Dropdown(choices=list(elo_ratings_for_category.keys()), value=model_to_compare, interactive=True))

def create_simple_plot(data_path: str,
                       name: str,
                       start_date: date, end_date: date,
                       min_value: int = 0, max_value: int = 100) -> go.Figure:
    # Best-score-to-date step plot for any benchmark leaderboard stored as JSONL
    leaderboard_entries = []
    with open(data_path, 'r') as file:
        for line in file:
            leaderboard_entries.append(json.loads(line))
    models = []
    with open("models.jsonl", 'r') as file:
        for line in file:
            models.append(json.loads(line))
    data = []
    for entry in leaderboard_entries:
        model_name = entry['model']
        score = entry['score']
        model_info = next((m for m in models if m['Name'] == model_name), None)
        if model_info:
            release_date = datetime.strptime(model_info['Release Date'], "%Y-%m-%d")
            data.append({'model': model_name, 'score': score, 'release_date': release_date})
        else:
            print(f"[WARNING] Model '{model_name}' not found in models.jsonl")
    data.sort(key=lambda x: x['release_date'])
    x_dates = [d['release_date'] for d in data]
    # Running maximum: best score achieved by any model released up to each date
    y_scores = []
    max_score = 0
    for entry in data:
        if entry['score'] > max_score:
            max_score = entry['score']
        y_scores.append(max_score)
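    # Step plot of the running best, with a labeled marker at each model that
    # set a new record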
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=x_dates,
        y=y_scores,
        mode='lines',
        line=dict(shape='hv', width=2),
        name='Best Score to Date'
    ))
    for i, entry in enumerate(data):
        if i == 0 or y_scores[i] > y_scores[i - 1]:
            fig.add_trace(go.Scatter(
                x=[entry['release_date']],
                y=[entry['score']],
                mode='markers+text',
                marker=dict(size=10),
                text=[entry['model']],
                textposition="top center",
                name=entry['model']
            ))
    fig.update_layout(
        title=f'{name} Over Time',
        xaxis_title='Release Date',
        yaxis_title=name,
        hovermode='x unified',
        xaxis=dict(
            range=[start_date, end_date],
            type='date'
        ),
        yaxis=dict(
            range=[min_value, max_value]
        ),
        height=800
    )
    return fig

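# UI layout: top-level tabs for finance, model efficiency, system performance
# and training runs; tabs marked interactive=False are placeholders with no
# event wiring yet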
with gr.Blocks() as demo:
    with gr.Tab("Finance"):
        with gr.Tab("Big Five Capex") as big_five_capex_tab:
            big_five_capex_plot: gr.Plot = gr.Plot()
        with gr.Tab("Chip Designers Data Center Revenue") as chip_designers_data_center_revenue_tab:
            chip_designers_data_center_revenue_plot: gr.Plot = gr.Plot()
    with gr.Tab("Model Efficiency Over Time"):
        with gr.Tab("Parameters Necessary for Specific Performance Level") as size_for_performance_tab:
            with gr.Row():
                size_for_performance_category_dropdown: gr.Dropdown = gr.Dropdown(label="Category",
                                                                                  value="full",
                                                                                  choices=["full"],
                                                                                  interactive=False)
                size_for_performance_parameter_number_dropdown: gr.Dropdown = gr.Dropdown(label="Parameter Number",
                                                                                          choices=["Total Parameters",
                                                                                                   "Active Parameters"],
                                                                                          value="Total Parameters",
                                                                                          interactive=True)
                size_for_performance_comparison_model_dropdown: gr.Dropdown = gr.Dropdown(label="Model for Comparison",
                                                                                          value="gpt-4-0314",
                                                                                          choices=["gpt-4-0314"],
                                                                                          interactive=False)
            size_for_performance_plot: gr.Plot = gr.Plot()
            size_for_performance_button: gr.Button = gr.Button("Show")
            size_for_performance_markdown: gr.Markdown = gr.Markdown(
                value="""Model performance as reported on [LMSYS Chatbot Arena Leaderboard](https://lmarena.ai/?leaderboard)."""
            )
        with gr.Tab("API Cost for Specific Performance Level", interactive=False):
            api_cost_for_performance_plot: gr.Plot = gr.Plot()
    with gr.Tab("System Performance Over Time"):
        with gr.Tab("ARC-AGI-Pub") as arc_agi_tab:
            arc_agi_plot: gr.Plot = gr.Plot()
        with gr.Tab("Simple Bench") as simple_bench_tab:
            simple_bench_plot: gr.Plot = gr.Plot()
        with gr.Tab("PlanBench") as planbench_tab:
            planbench_plot: gr.Plot = gr.Plot()
            planbench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
            )
        with gr.Tab("Codeforces") as codeforces_tab:
            with gr.Tab("General-Purpose Systems"):
                codeforces_plot: gr.Plot = gr.Plot()
        with gr.Tab("BigCodeBench", interactive=False):
            bigcodebench_plot: gr.Plot = gr.Plot()
        with gr.Tab("GAIA", interactive=False):
            gaia_plot: gr.Plot = gr.Plot()
        with gr.Tab("GPQA", interactive=False):
            gpqa_plot: gr.Plot = gr.Plot()
        with gr.Tab("HumanEval", interactive=False):
            humaneval_plot: gr.Plot = gr.Plot()
        with gr.Tab("Chatbot Arena", interactive=False):
            chatbot_arena_plot: gr.Plot = gr.Plot()
        with gr.Tab("MATH", interactive=False):
            math_plot: gr.Plot = gr.Plot()
        with gr.Tab("OpenCompass", interactive=False):
            opencompass_plot: gr.Plot = gr.Plot()
        with gr.Tab("SWE-bench", interactive=False):
            swe_bench_plot: gr.Plot = gr.Plot()
        with gr.Tab("WebArena", interactive=False):
            webarena_plot: gr.Plot = gr.Plot()
        with gr.Tab("ZeroEval", interactive=False):
            zeroeval_plot: gr.Plot = gr.Plot()
    with gr.Tab("Frontier Language Model Training Runs", interactive=False):
        with gr.Tab("Street Price of GPUs Used"):
            gpu_street_price_plot: gr.Plot = gr.Plot()
        with gr.Tab("TDP of GPUs Used"):
            tdp_gpus_plot: gr.Plot = gr.Plot()
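    # Event wiring: each plot is (re)built when its tab is selected or the
    # button is clicked; gr.State inputs pass per-benchmark constants
    # (data file, axis label, date range, score range) to create_simple_plot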
    big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
    chip_designers_data_center_revenue_tab.select(fn=create_chip_designers_data_center_revenue_plot,
                                                  outputs=chip_designers_data_center_revenue_plot)
    size_for_performance_button.click(fn=create_size_for_performance_plot,
                                      inputs=[size_for_performance_category_dropdown,
                                              size_for_performance_parameter_number_dropdown,
                                              size_for_performance_comparison_model_dropdown],
                                      outputs=[size_for_performance_plot,
                                               size_for_performance_category_dropdown,
                                               size_for_performance_comparison_model_dropdown])
    arc_agi_tab.select(fn=create_simple_plot,
                       inputs=[gr.State("arc_agi_leaderboard.jsonl"), gr.State("ARC-AGI-Pub (Public Eval) Score"),
                               gr.State(date(2024, 5, 13)), gr.State(date(2024, 12, 20))],
                       outputs=arc_agi_plot)
    simple_bench_tab.select(fn=create_simple_plot,
                            inputs=[gr.State("simple_bench_leaderboard.jsonl"), gr.State("Simple Bench Score"),
                                    gr.State(date(2023, 6, 13)), gr.State(date(2024, 8, 14))],
                            outputs=simple_bench_plot)
    codeforces_tab.select(fn=create_simple_plot,
                          inputs=[gr.State("codeforces_leaderboard.jsonl"), gr.State("Codeforces (Elo Rating)"),
                                  gr.State(date(2024, 5, 13)), gr.State(date(2024, 12, 20)),
                                  gr.State(800), gr.State(3000)],
                          outputs=codeforces_plot)
    planbench_tab.select(fn=create_simple_plot,
                         inputs=[gr.State("planbench_leaderboard.jsonl"), gr.State("PlanBench (Mystery Blocksworld, 0-shot) Score"),
                                 gr.State(date(2023, 3, 14)), gr.State(date(2024, 9, 23))],
                         outputs=planbench_plot)

if __name__ == "__main__":
    demo.launch()