- app.py +1 -1
- src/display/utils.py +1 -2
app.py
CHANGED
```diff
@@ -99,7 +99,7 @@ with demo:
         with gr.TabItem("π 1 Correct", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
-        with gr.TabItem("π 1 Correct
+        with gr.TabItem("π 1 Correct + Variations", elem_id="llm-benchmark-tab-table", id=4):
             leaderboard = init_leaderboard(LEADERBOARD_DF_1_CORRECT_VAR)
 
         with gr.TabItem("π N Correct", elem_id="llm-benchmark-tab-table", id=1):
```
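For context, a minimal sketch of how the new tab slots into the Gradio layout. The tab titles, `elem_id`/`id` values, and the `LEADERBOARD_DF` / `LEADERBOARD_DF_1_CORRECT_VAR` names come from the diff above; the `gr.Tabs` wrapper, the placeholder dataframes, and the stub `init_leaderboard` helper are assumptions for illustration only.

```python
import gradio as gr
import pandas as pd

# Placeholder frames standing in for the real LEADERBOARD_DF and
# LEADERBOARD_DF_1_CORRECT_VAR built elsewhere in the app.
LEADERBOARD_DF = pd.DataFrame({"Model": ["demo-model"], "Average": [0.0]})
LEADERBOARD_DF_1_CORRECT_VAR = LEADERBOARD_DF.copy()

def init_leaderboard(df: pd.DataFrame) -> gr.Dataframe:
    # Stub: the real helper builds a richer leaderboard component;
    # a plain gr.Dataframe is enough to show where each tab's table lives.
    return gr.Dataframe(value=df, interactive=False)

demo = gr.Blocks()
with demo:
    with gr.Tabs(elem_id="leaderboard-tabs"):
        with gr.TabItem("1 Correct", elem_id="llm-benchmark-tab-table", id=0):
            init_leaderboard(LEADERBOARD_DF)
        # Tab added by this commit: same helper, the "+ Variations" frame, a new tab id.
        with gr.TabItem("1 Correct + Variations", elem_id="llm-benchmark-tab-table", id=4):
            init_leaderboard(LEADERBOARD_DF_1_CORRECT_VAR)

if __name__ == "__main__":
    demo.launch()
```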
src/display/utils.py
CHANGED
```diff
@@ -24,6 +24,7 @@ class ColumnContent:
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["output_format", ColumnContent, ColumnContent("Output Format", "str", True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
@@ -35,8 +36,6 @@ for task in Detail_Tasks:
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
-
-auto_eval_column_dict.append(["output_format", ColumnContent, ColumnContent("Output Format", "str", True)])
 auto_eval_column_dict.append(["dataset_version", ColumnContent, ColumnContent("Task Version", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
```
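For context, a sketch of how these `auto_eval_column_dict` entries are consumed. Per the comment in the hunk, the list is handed to `dataclasses.make_dataclass`, so the relocated `output_format` entry becomes a field on the generated column class next to `model`. The `ColumnContent` field names and the `AutoEvalColumn` class name below follow the stock Hugging Face leaderboard template and are assumptions here.

```python
from dataclasses import dataclass, make_dataclass

# Stand-in for the ColumnContent dataclass defined earlier in
# src/display/utils.py; field names are assumed from the template.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Column moved by this commit, now registered right after the model column:
auto_eval_column_dict.append(["output_format", ColumnContent, ColumnContent("Output Format", "str", True)])
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])

# "We use make dataclass to dynamically fill the scores from Tasks":
# each [attr_name, type, default] entry becomes a field on the generated class.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.output_format.name)  # -> Output Format
```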