kaizuberbuehler committed on
Commit
9ac5371
1 Parent(s): 5467082

Add data for ARC-AGI and Simple Bench

Browse files
app.py CHANGED
@@ -159,6 +159,150 @@ def create_size_for_performance_plot(category_to_display: str,
159
  gr.Dropdown(choices=list(elo_ratings_for_category.keys()), value=model_to_compare, interactive=True))
160
 
161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  with gr.Blocks() as demo:
163
  with gr.Tab("Finance"):
164
  with gr.Tab("Big Five Capex") as big_five_capex_tab:
@@ -188,30 +332,32 @@ with gr.Blocks() as demo:
188
  )
189
  with gr.Tab("API Cost for Specific Performance Level", interactive=False):
190
  api_cost_for_performance_plot: gr.Plot = gr.Plot()
191
- with gr.Tab("System Performance Over Time", interactive=False):
192
- with gr.Tab("ARC-AGI"):
193
  arc_agi_plot: gr.Plot = gr.Plot()
194
- with gr.Tab("BigCodeBench"):
 
 
195
  bigcodebench_plot: gr.Plot = gr.Plot()
196
- with gr.Tab("Codeforces"):
197
  codeforces_plot: gr.Plot = gr.Plot()
198
- with gr.Tab("GAIA"):
199
  gaia_plot: gr.Plot = gr.Plot()
200
- with gr.Tab("GPQA"):
201
  gpqa_plot: gr.Plot = gr.Plot()
202
- with gr.Tab("HumanEval"):
203
  humaneval_plot: gr.Plot = gr.Plot()
204
- with gr.Tab("LMSYS"):
205
  lmsys_plot: gr.Plot = gr.Plot()
206
- with gr.Tab("OpenCompass"):
 
 
207
  opencompass_plot: gr.Plot = gr.Plot()
208
- with gr.Tab("SWE-bench"):
209
  swe_bench_plot: gr.Plot = gr.Plot()
210
- with gr.Tab("Simple Bench"):
211
- simple_bench_plot: gr.Plot = gr.Plot()
212
- with gr.Tab("WebArena"):
213
  webarena_plot: gr.Plot = gr.Plot()
214
- with gr.Tab("ZeroEval"):
215
  zeroeval_plot: gr.Plot = gr.Plot()
216
  with gr.Tab("Frontier Language Model Training Runs", interactive=False):
217
  with gr.Tab("Street Price of GPUs Used"):
@@ -228,6 +374,8 @@ with gr.Blocks() as demo:
228
  outputs=[size_for_performance_plot,
229
  size_for_performance_category_dropdown,
230
  size_for_performance_comparison_model_dropdown])
 
 
231
 
232
 
233
  if __name__ == "__main__":
 
159
  gr.Dropdown(choices=list(elo_ratings_for_category.keys()), value=model_to_compare, interactive=True))
160
 
161
 
162
def _load_jsonl(path: str) -> list:
    """Read a JSON Lines file and return its records as a list.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of being passed to ``json.loads``, which would raise.
    The file is read as UTF-8 so the result does not depend on the platform's
    default encoding.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]


def _create_score_progression_plot(leaderboard_path: str, benchmark_name: str, x_range: list) -> go.Figure:
    """Build a "best score so far" figure for one benchmark leaderboard.

    Each leaderboard entry is joined with ``models.jsonl`` on the model name
    to obtain its release date. Entries are ordered by release date, and the
    running maximum of the score is drawn as a step line. Every entry that
    raises the running maximum (and the earliest entry) additionally gets a
    labelled marker.

    Args:
        leaderboard_path: JSONL file whose records have ``model`` and
            ``score`` keys.
        benchmark_name: Label used in the figure title, the y-axis title and
            the name of the line trace.
        x_range: Two-element ``[start, end]`` list of dates for the initial
            x-axis view.

    Returns:
        The assembled Plotly figure.

    Raises:
        KeyError: If a record lacks one of the keys read here.
        ValueError: If a ``Release Date`` is not in ``YYYY-MM-DD`` format.
    """
    leaderboard = _load_jsonl(leaderboard_path)

    # Index the models once instead of scanning the whole list per entry.
    # setdefault keeps the FIRST record for a duplicated name, which matches
    # what a first-match linear search would return.
    models_by_name = {}
    for model in _load_jsonl("models.jsonl"):
        models_by_name.setdefault(model['Name'], model)

    data = []
    for entry in leaderboard:
        model_name = entry['model']
        score = entry['score']
        model_info = models_by_name.get(model_name)
        if model_info is None:
            # Without a release date the entry cannot be placed on the x-axis.
            print(f"[WARNING] Model '{model_name}' not found in models.jsonl")
            continue
        release_date = datetime.strptime(model_info['Release Date'], "%Y-%m-%d")
        data.append({'model': model_name, 'score': score, 'release_date': release_date})

    # list.sort is stable, so models released on the same day keep their
    # leaderboard order.
    data.sort(key=lambda item: item['release_date'])

    x_dates = [item['release_date'] for item in data]
    y_scores = []
    max_score = 0
    for item in data:
        max_score = max(max_score, item['score'])
        y_scores.append(max_score)

    fig = go.Figure()

    # Running best score, drawn as a horizontal-then-vertical step line.
    fig.add_trace(go.Scatter(
        x=x_dates,
        y=y_scores,
        mode='lines',
        line=dict(shape='hv', width=2),
        name=f'{benchmark_name} Score'
    ))

    # One labelled marker per model that set a new best score; the earliest
    # model is always labelled.
    for i, item in enumerate(data):
        if i == 0 or y_scores[i] > y_scores[i - 1]:
            fig.add_trace(go.Scatter(
                x=[item['release_date']],
                y=[item['score']],
                mode='markers+text',
                marker=dict(size=10),
                text=[item['model']],
                textposition="top center",
                name=item['model']
            ))

    fig.update_layout(
        title=f'{benchmark_name} Score Progression Over Time',
        xaxis_title='Release Date',
        yaxis_title=f'{benchmark_name} Score',
        hovermode='x unified',
        xaxis=dict(
            range=x_range,
            type='date'
        ),
        yaxis=dict(
            range=[0, 100]
        ),
        height=800
    )

    return fig


def create_arc_agi_plot() -> go.Figure:
    """Return the ARC-AGI score progression figure.

    Reads ``arc_agi_leaderboard.jsonl`` and ``models.jsonl`` from the current
    working directory on every call.
    """
    # NOTE(review): the x-axis window is hard-coded; a model whose release
    # date is earlier than 2024-05-13 is still plotted but lies outside the
    # initial view - confirm this window is intended.
    return _create_score_progression_plot(
        "arc_agi_leaderboard.jsonl",
        "ARC-AGI",
        [date(2024, 5, 13), date(2024, 9, 17)],
    )


def create_simple_bench_plot() -> go.Figure:
    """Return the Simple Bench score progression figure.

    Reads ``simple_bench_leaderboard.jsonl`` and ``models.jsonl`` from the
    current working directory on every call.
    """
    return _create_score_progression_plot(
        "simple_bench_leaderboard.jsonl",
        "Simple Bench",
        [date(2023, 6, 13), date(2024, 8, 14)],
    )
304
+
305
+
306
  with gr.Blocks() as demo:
307
  with gr.Tab("Finance"):
308
  with gr.Tab("Big Five Capex") as big_five_capex_tab:
 
332
  )
333
  with gr.Tab("API Cost for Specific Performance Level", interactive=False):
334
  api_cost_for_performance_plot: gr.Plot = gr.Plot()
335
+ with gr.Tab("System Performance Over Time"):
336
+ with gr.Tab("ARC-AGI") as arc_agi_tab:
337
  arc_agi_plot: gr.Plot = gr.Plot()
338
+ with gr.Tab("Simple Bench") as simple_bench_tab:
339
+ simple_bench_plot: gr.Plot = gr.Plot()
340
+ with gr.Tab("BigCodeBench", interactive=False):
341
  bigcodebench_plot: gr.Plot = gr.Plot()
342
+ with gr.Tab("Codeforces", interactive=False):
343
  codeforces_plot: gr.Plot = gr.Plot()
344
+ with gr.Tab("GAIA", interactive=False):
345
  gaia_plot: gr.Plot = gr.Plot()
346
+ with gr.Tab("GPQA", interactive=False):
347
  gpqa_plot: gr.Plot = gr.Plot()
348
+ with gr.Tab("HumanEval", interactive=False):
349
  humaneval_plot: gr.Plot = gr.Plot()
350
+ with gr.Tab("LMSYS", interactive=False):
351
  lmsys_plot: gr.Plot = gr.Plot()
352
+ with gr.Tab("MATH", interactive=False):
353
+ math_plot: gr.Plot = gr.Plot()
354
+ with gr.Tab("OpenCompass", interactive=False):
355
  opencompass_plot: gr.Plot = gr.Plot()
356
+ with gr.Tab("SWE-bench", interactive=False):
357
  swe_bench_plot: gr.Plot = gr.Plot()
358
+ with gr.Tab("WebArena", interactive=False):
 
 
359
  webarena_plot: gr.Plot = gr.Plot()
360
+ with gr.Tab("ZeroEval", interactive=False):
361
  zeroeval_plot: gr.Plot = gr.Plot()
362
  with gr.Tab("Frontier Language Model Training Runs", interactive=False):
363
  with gr.Tab("Street Price of GPUs Used"):
 
374
  outputs=[size_for_performance_plot,
375
  size_for_performance_category_dropdown,
376
  size_for_performance_comparison_model_dropdown])
377
+ arc_agi_tab.select(fn=create_arc_agi_plot, outputs=arc_agi_plot)
378
+ simple_bench_tab.select(fn=create_simple_bench_plot, outputs=simple_bench_plot)
379
 
380
 
381
  if __name__ == "__main__":
arc_agi_leaderboard.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"model": "o1-preview-2024-09-12", "score": 21}
2
+ {"model": "claude-3-5-sonnet-20240620", "score": 21}
3
+ {"model": "o1-mini-2024-09-12", "score": 13}
4
+ {"model": "gpt-4o-2024-05-13", "score": 9}
5
+ {"model": "gemini-1.5-pro-001", "score": 8}
models.jsonl CHANGED
@@ -1,3 +1,5 @@
 
 
1
  {"Name": "deepseek-v2.5", "Release Date": "2024-09-05", "Total Parameters": 236, "Active Parameters": 236, "API Cost": 0}
2
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
3
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -37,6 +39,7 @@
37
  {"Name": "yi-large", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
38
  {"Name": "nemotron-4-340b-instruct", "Release Date": "2024-06-14", "Total Parameters": 340, "Active Parameters": 340, "API Cost": 0}
39
  {"Name": "bard-jan-24-gemini-pro", "Release Date": "2024-01-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
40
  {"Name": "glm-4-0520", "Release Date": "2024-05-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0.63}
41
  {"Name": "llama-3-70b-instruct", "Release Date": "2024-04-18", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
42
  {"Name": "claude-3-sonnet-20240229", "Release Date": "2024-02-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
1
+ {"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
2
+ {"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
3
  {"Name": "deepseek-v2.5", "Release Date": "2024-09-05", "Total Parameters": 236, "Active Parameters": 236, "API Cost": 0}
4
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
5
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
39
  {"Name": "yi-large", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
40
  {"Name": "nemotron-4-340b-instruct", "Release Date": "2024-06-14", "Total Parameters": 340, "Active Parameters": 340, "API Cost": 0}
41
  {"Name": "bard-jan-24-gemini-pro", "Release Date": "2024-01-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
42
+ {"Name": "gemini-1.5-pro-001", "Release Date": "2024-02-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
43
  {"Name": "glm-4-0520", "Release Date": "2024-05-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0.63}
44
  {"Name": "llama-3-70b-instruct", "Release Date": "2024-04-18", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
45
  {"Name": "claude-3-sonnet-20240229", "Release Date": "2024-02-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
simple_bench_leaderboard.jsonl ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "claude-3-5-sonnet-20240620", "score": 27}
2
+ {"model": "gpt-4-1106-preview", "score": 26}
3
+ {"model": "claude-3-opus-20240229", "score": 25}
4
+ {"model": "llama-3.1-405b-instruct-fp8", "score": 22}
5
+ {"model": "gemini-1.5-pro-001", "score": 21}
6
+ {"model": "gpt-4-0613", "score": 18}
7
+ {"model": "gpt-4o-2024-05-13", "score": 16}
8
+ {"model": "deepseek-v2-api-0628", "score": 15}
9
+ {"model": "mistral-large-2407", "score": 13}
10
+ {"model": "gpt-4o-mini-2024-07-18", "score": 5}