Refactor init_leaderboard function to handle multiple subsets, improve column selection and hiding, and include Dataset Version in filter_columns

- app.py +68 -28
- src/populate.py +4 -3
app.py CHANGED
@@ -65,40 +65,80 @@ except Exception:
     restart_space()
 
 
-LEADERBOARD_DF …
+LEADERBOARD_DF = get_leaderboard_df(RESULTS_REPO)
 
 
-def init_leaderboard(dataframes…
-    subsets = list(…
+def init_leaderboard(dataframes):
+    subsets = list(dataframes.keys())
 
     with gr.Row():
         selected_subset = gr.Dropdown(choices=subsets, label="Select Dataset Subset", value=subsets[-1])
+        research_textbox = gr.Textbox(placeholder="🔍 Search Models... [press enter]", label="Filter Models by Name")
+        selected_columns = gr.CheckboxGroup(choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden], label="Select Columns to Display", value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default])
+
 
-… (old lines 77–100 not shown on the page)
+    data = dataframes[subsets[-1]]
+
+    with gr.Row():
+        datatype = [c.type for c in fields(AutoEvalColumn)]
+        df = gr.Dataframe(data, datatype=datatype, type="pandas")
+
+    def refresh(subset):
+        global LEADERBOARD_DF
+        LEADERBOARD_DF = get_leaderboard_df(RESULTS_REPO)
+        research_textbox.value = ""
+        selected_subset.choices = subsets
+        update_data(subset, research_textbox, selected_columns)
+
+
+
+    def update_data(subset, search_term, selected_columns):
+        return dataframes[subset][dataframes[subset].model.str.contains(search_term, case=False)][selected_columns]
+
+    with gr.Row():
+        refresh_button = gr.Button("Refresh")
+        refresh_button.click(refresh, inputs=[
+            selected_subset,
+        ], outputs=data, concurrency_limit=20)
+
+
+
+    selected_subset.change(update_data, inputs=[
+        selected_subset, research_textbox, selected_columns
+    ], outputs=data)
+    research_textbox.submit(
+        update_data,
+        inputs=[selected_subset, research_textbox, selected_columns],
+        outputs=data
+    )
+    selected_columns.change(
+        update_data,
+        inputs=[selected_subset, research_textbox, selected_columns],
+        outputs=data
     )
+
+
+
+    # return Leaderboard(
+    #     value=dataframes,
+    #     datatype=[c.type for c in fields(AutoEvalColumn)],
+    #     select_columns=SelectColumns(
+    #         default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+    #         cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+    #         label="Select Columns to Display:",
+    #     ),
+    #     search_columns=[AutoEvalColumn.model.name],
+    #     hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+    #     filter_columns=[
+    #         ColumnFilter(
+    #             column=AutoEvalColumn.dataset_version.name,
+    #             choices=subsets,
+    #             default=subsets[-1],
+    #         )
+    #         # gr.Dropdown(choices=subsets, label="Select Dataset Subset", value=subsets[-1])
+    #     ],
+    #     interactive=False,
+    # )
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -107,7 +147,7 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LiveBench Results", elem_id="llm-benchmark-tab-table", id=0):
-            init_leaderboard(LEADERBOARD_DF…
+            init_leaderboard(LEADERBOARD_DF)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
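A note on the new event wiring: as committed, every handler sets outputs=data, but data is the pandas DataFrame pulled out of dataframes, not the gr.Dataframe component df; likewise, refresh passes the research_textbox and selected_columns components themselves into update_data and discards the return value, and assigning to .value or .choices after the Blocks is built does not update the UI. A minimal sketch of what the wiring presumably intends, assuming current Gradio semantics (handlers receive component values, and outputs must name components); this is a hypothetical correction, not the committed code:

# Sketch, not the committed code: handlers receive the *values* of the input
# components, and the output must be the gr.Dataframe component `df` created
# above, not the pandas object `data`.
def update_data(subset, search_term, selected_columns):
    subset_df = dataframes[subset]
    # An empty search box matches every row, so the full table still renders.
    mask = subset_df.model.str.contains(search_term, case=False)
    return subset_df[mask][selected_columns]

shared_inputs = [selected_subset, research_textbox, selected_columns]
selected_subset.change(update_data, inputs=shared_inputs, outputs=df)
research_textbox.submit(update_data, inputs=shared_inputs, outputs=df)
selected_columns.change(update_data, inputs=shared_inputs, outputs=df)
refresh_button.click(update_data, inputs=shared_inputs, outputs=df, concurrency_limit=20)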
src/populate.py CHANGED
@@ -17,8 +17,9 @@ def get_leaderboard_df(results_repo):
         subset_df = subset_data.to_pandas()
         subset_df = subset_df.sort_values(by="Total", ascending=False)
         subset_df = subset_df.round(2)  # Round all numeric columns to two decimal places
-        subset_df["Dataset Version"] = [subset] * len(subset_df)
+        # subset_df["Dataset Version"] = [subset] * len(subset_df)
         print(subset_df)
         subset_dfs[subset] = subset_df
-    df = pd.concat(subset_dfs.values())
-    return df, subset_dfs.keys()
+    # df = pd.concat(subset_dfs.values())
+    # return df, subset_dfs.keys()
+    return subset_dfs
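The refactor changes get_leaderboard_df to hand back a dict of per-subset DataFrames keyed by subset name, instead of one concatenated frame plus a view of its keys; that is what lets init_leaderboard build its dropdown from dataframes.keys() and drop the Dataset Version column. The loop head sits above the hunk and is not shown, so the following is only a sketch of the whole function, under the assumption that results_repo is a Hugging Face dataset repo whose configs are the subsets, loaded from a train split:

# Sketch under stated assumptions: get_dataset_config_names/load_dataset are
# standard `datasets` APIs, but the real loop head is not visible in the diff.
import pandas as pd
from datasets import get_dataset_config_names, load_dataset

def get_leaderboard_df(results_repo: str) -> dict[str, pd.DataFrame]:
    """Return {subset name: results DataFrame} for every subset in the repo."""
    subset_dfs: dict[str, pd.DataFrame] = {}
    for subset in get_dataset_config_names(results_repo):
        subset_data = load_dataset(results_repo, subset, split="train")
        subset_df = subset_data.to_pandas()
        subset_df = subset_df.sort_values(by="Total", ascending=False)
        subset_df = subset_df.round(2)  # Round all numeric columns to two decimal places
        subset_dfs[subset] = subset_df
    return subset_dfs

Returning a dict keeps each subset sorted independently, so pd.concat and the per-row Dataset Version tag are no longer needed.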