chriscanal committed • Commit 1d6adda • Parent: 319b0b7

Added graphs tab

Added graphs tab to show the progress of all models over time against human baselines.
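The description above is the core idea of the new tab: for each benchmark, plot the best score any model has reached so far at each point in time, next to a fixed human-baseline value. Below is a minimal sketch of that "running top score" computation; the column names, dates, and baseline numbers are illustrative placeholders, not the ones used by this Space's create_scores_df or HUMAN_BASELINES.

# Illustrative sketch only: cumulative best score per benchmark over time,
# to be compared against placeholder human-baseline values. Column names and
# numbers are hypothetical, not taken from the Space's plot_results module.
import pandas as pd

PLACEHOLDER_HUMAN_BASELINES = {"ARC": 0.80, "HellaSwag": 0.95, "MMLU": 0.90, "TruthfulQA": 0.94}

def running_top_scores(results: pd.DataFrame) -> pd.DataFrame:
    """results: one row per evaluated model, with a 'date' column and one
    column per benchmark; returns the cumulative best score per benchmark."""
    ordered = results.sort_values("date")
    tops = ordered[list(PLACEHOLDER_HUMAN_BASELINES)].cummax()
    tops.insert(0, "date", ordered["date"].values)
    return tops

# Example: the running maximum never decreases, so the curves only move toward
# (or past) the baselines as newer models are evaluated.
df = pd.DataFrame(
    {
        "date": pd.to_datetime(["2023-05-01", "2023-06-01", "2023-07-01"]),
        "ARC": [0.51, 0.48, 0.62],
        "HellaSwag": [0.72, 0.80, 0.78],
        "MMLU": [0.40, 0.55, 0.52],
        "TruthfulQA": [0.38, 0.45, 0.60],
    }
)
print(running_top_scores(df))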
app.py
CHANGED
@@ -16,6 +16,13 @@ from src.assets.text_content import (
     LLM_BENCHMARKS_TEXT,
     TITLE,
 )
+from src.display_models.plot_results import (
+    create_metric_plot_obj,
+    create_scores_df,
+    create_plot_df,
+    join_model_info_with_results,
+    HUMAN_BASELINES,
+)
 from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
 from src.display_models.utils import (
     AutoEvalColumn,
@@ -97,6 +104,7 @@ else:
 
 original_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
 models = original_df["model_name_for_query"].tolist()  # needed for model backlinks in their to the leaderboard
+plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
 
 to_be_dumped = f"models = {repr(models)}\n"
 
@@ -349,7 +357,6 @@ with demo:
                         interactive=True,
                         elem_id="filter-columns-size",
                     )
-
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
                     [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
@@ -466,6 +473,19 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
+        with gr.TabItem("📈 Benchmark Graphs", elem_id="llm-benchmark-tab-table", id=4):
+            with gr.Row():
+                with gr.Column():
+                    chart = create_metric_plot_obj(plot_df, ["Average ⬆️"], HUMAN_BASELINES).properties(
+                        title="Average of Top Scores and Human Baseline Over Time"
+                    )
+                    gr.Plot(value=chart, interactive=False, width=500, height=500)
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], HUMAN_BASELINES
+                    ).properties(title="Top Scores and Human Baseline Over Time")
+                    gr.Plot(value=chart, interactive=False, width=500, height=500)
+
     with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
         gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
@@ -588,4 +608,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(concurrency_count=40).launch()
+demo.queue(concurrency_count=40).launch()
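For context on how the two chart objects above might be built: the .properties(title=...) calls are the Altair chart API, so a create_metric_plot_obj-style helper plausibly layers a line chart of the running top scores with dashed rules at the human baselines. The sketch below shows that layering under assumed long-format columns ("date", "metric", "score"); it is an illustration, not the Space's actual plot_results implementation.

# Hypothetical sketch of a create_metric_plot_obj-style helper, assuming
# Altair and a long-format plot_df with columns "date", "metric", "score".
import altair as alt
import pandas as pd

def metric_plot_sketch(plot_df: pd.DataFrame, metrics: list[str], baselines: dict[str, float]) -> alt.LayerChart:
    data = plot_df[plot_df["metric"].isin(metrics)]

    # Running top scores per benchmark over time.
    lines = (
        alt.Chart(data)
        .mark_line(point=True)
        .encode(
            x=alt.X("date:T", title="Date"),
            y=alt.Y("score:Q", title="Score"),
            color="metric:N",
        )
    )

    # Dashed horizontal rules marking the human baseline for each benchmark.
    baseline_df = pd.DataFrame(
        {"metric": list(metrics), "score": [baselines[m] for m in metrics]}
    )
    rules = alt.Chart(baseline_df).mark_rule(strokeDash=[4, 4]).encode(y="score:Q", color="metric:N")

    # Layer the two charts; callers can still chain .properties(title=...).
    return lines + rules

A call such as metric_plot_sketch(plot_df, ["ARC", "MMLU"], baselines).properties(title="Top Scores and Human Baseline Over Time") would then mirror the way the diff builds a chart and hands it to gr.Plot.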