chriscanal committed on
Commit 1d6adda • 1 Parent(s): 319b0b7

Added graphs tab


Added graphs tab to show the progress of all models over time against human baselines

Files changed (1)
app.py +22 -2
app.py CHANGED
@@ -16,6 +16,13 @@ from src.assets.text_content import (
     LLM_BENCHMARKS_TEXT,
     TITLE,
 )
+from src.display_models.plot_results import (
+    create_metric_plot_obj,
+    create_scores_df,
+    create_plot_df,
+    join_model_info_with_results,
+    HUMAN_BASELINES,
+)
 from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
 from src.display_models.utils import (
     AutoEvalColumn,
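
The new import block pulls five names from src/display_models/plot_results. Judging from how it is passed to create_metric_plot_obj further down, HUMAN_BASELINES is presumably a dict mapping each plotted metric to a human-level reference score. A minimal sketch of its likely shape, with placeholder values only (the repository's real numbers are not shown in this diff):

# Assumed shape of HUMAN_BASELINES in src/display_models/plot_results.py.
# Every score below is a placeholder, not the repository's actual value.
HUMAN_BASELINES = {
    "Average ⬆️": 0.90,  # placeholder
    "ARC": 0.80,         # placeholder
    "HellaSwag": 0.95,   # placeholder
    "MMLU": 0.90,        # placeholder
    "TruthfulQA": 0.94,  # placeholder
}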
@@ -97,6 +104,7 @@ else:
 
 original_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
 models = original_df["model_name_for_query"].tolist()  # needed for model backlinks in their to the leaderboard
+plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
 
 to_be_dumped = f"models = {repr(models)}\n"
 
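The plot_df pipeline chains three of the new helpers. The names suggest that join_model_info_with_results attaches model metadata (such as submission dates) to the leaderboard results, create_scores_df tracks the best score reached on each benchmark over time, and create_plot_df reshapes that into a long-format frame for plotting. A sketch of the latter two under those assumptions (column names are assumed, not taken from the repo):

# A sketch of the scores pipeline, assuming the joined frame carries a
# "date" column (model submission date) plus one column per benchmark.
import pandas as pd

BENCHMARKS = ["Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA"]

def create_scores_df(joined_df: pd.DataFrame) -> pd.DataFrame:
    # Assumed behavior: running maximum per benchmark, so each curve
    # shows the best score reached up to a given date.
    scores = joined_df.sort_values("date")[["date"] + BENCHMARKS].copy()
    scores[BENCHMARKS] = scores[BENCHMARKS].cummax()
    return scores

def create_plot_df(scores_df: pd.DataFrame) -> pd.DataFrame:
    # Assumed behavior: melt to long format (date, metric, score),
    # the shape the chart builder below would expect.
    return scores_df.melt(id_vars="date", var_name="metric", value_name="score")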
 
@@ -349,7 +357,6 @@ with demo:
                         interactive=True,
                         elem_id="filter-columns-size",
                     )
-
                     leaderboard_table = gr.components.Dataframe(
                         value=leaderboard_df[
                             [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
@@ -466,6 +473,19 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
+        with gr.TabItem("📈 Benchmark Graphs", elem_id="llm-benchmark-tab-table", id=4):
+            with gr.Row():
+                with gr.Column():
+                    chart = create_metric_plot_obj(plot_df, ["Average ⬆️"], HUMAN_BASELINES).properties(
+                        title="Average of Top Scores and Human Baseline Over Time"
+                    )
+                    gr.Plot(value=chart, interactive=False, width=500, height=500)
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], HUMAN_BASELINES
+                    ).properties(title="Top Scores and Human Baseline Over Time")
+                    gr.Plot(value=chart, interactive=False, width=500, height=500)
+
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
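create_metric_plot_obj is called with .properties(title=...), which matches Altair's chart API, and Gradio's gr.Plot component can render Altair charts (alongside matplotlib and Plotly figures), so the helper plausibly layers score-over-time lines with horizontal rules at the human baselines. A sketch under that assumption; the function body is illustrative, not the repo's implementation:

# A sketch of create_metric_plot_obj, assuming Altair and the long-format
# plot_df (date, metric, score) sketched earlier.
import altair as alt
import pandas as pd

def create_metric_plot_obj(plot_df: pd.DataFrame, metrics: list,
                           human_baselines: dict) -> alt.LayerChart:
    data = plot_df[plot_df["metric"].isin(metrics)]
    # One best-score-over-time line per metric.
    lines = (
        alt.Chart(data)
        .mark_line(point=True)
        .encode(x="date:T", y="score:Q", color="metric:N")
    )
    # Dashed horizontal rules marking the human baseline for each metric.
    baselines = pd.DataFrame(
        [{"metric": m, "score": human_baselines[m]} for m in metrics]
    )
    rules = (
        alt.Chart(baselines)
        .mark_rule(strokeDash=[4, 4])
        .encode(y="score:Q", color="metric:N")
    )
    return lines + rules  # layered chart; .properties(title=...) applies on top

The layered chart object can then be passed straight to gr.Plot as its value, exactly as the added tab does.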
 
@@ -588,4 +608,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(concurrency_count=40).launch()
+demo.queue(concurrency_count=40).launch()
 
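
The final hunk re-adds the launch line unchanged in content (likely a whitespace or trailing-newline difference). For context, restart_space, which the scheduler above triggers every 1800 seconds, is not shown in this diff; a common pattern for a self-restarting leaderboard Space uses huggingface_hub's restart_space endpoint, roughly as below. LEADERBOARD_PATH and H4_TOKEN are assumed names, not taken from this commit:

# A sketch of the scheduled self-restart, assuming the Space restarts
# itself via the Hub API every 30 minutes.
import os

from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

LEADERBOARD_PATH = "<owner>/<space-name>"   # assumed placeholder repo id
H4_TOKEN = os.environ.get("H4_TOKEN")       # assumed env var for a write token

def restart_space():
    HfApi().restart_space(repo_id=LEADERBOARD_PATH, token=H4_TOKEN)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)  # every 30 minutes
scheduler.start()

In Gradio 3.x, queue(concurrency_count=40) sets the number of worker threads that process queued events, so up to 40 requests can be handled concurrently.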