Update app.py
app.py CHANGED
@@ -12,7 +12,7 @@ import pandas as pd
 leader_component_values = [None]
 space = " "
 
-def make_default_md(arena_df, elo_results):
+def make_default_md():
     leaderboard_md = f"""
 # NeurIPS LLM Merging Competition Leaderboard
 [Website](https://llm-merging.github.io/index) | [Starter Kit (Github)](https://github.com/llm-merging/LLM-Merging) | [Discord](https://discord.com/invite/dPBHEVnV)
@@ -20,29 +20,20 @@ def make_default_md(arena_df, elo_results):
 """
     return leaderboard_md
 
-def make_arena_leaderboard_md(arena_df):
+def make_arena_leaderboard_md(model_table_df):
     total_models = len(arena_df)
     leaderboard_md = f"""
-
+Validation Benchmark Performance is averaged.
+Final performance will be assessed at the end of the competition on a hidden test set, which may or may not be correlated with Validation performance.
 
-Higher values are better
+Higher values are better.
 
-Total #models: **{total_models}**.{space}
+Total #models: **{total_models}**.{space}
 
 """
     return leaderboard_md
 
 
-
-def make_leaderboard_md_live(elo_results):
-    leaderboard_md = f"""
-# Leaderboard
-Last updated: {elo_results["last_updated_datetime"]}
-{elo_results["leaderboard_table"]}
-"""
-    return leaderboard_md
-
-
 def load_leaderboard_table_csv(filename, add_hyperlink=False):
     lines = open(filename).readlines()
     heads = [v.strip() for v in lines[0].split(",")]
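
Note for readers: the new make_arena_leaderboard_md(model_table_df) still computes total_models = len(arena_df), so it depends on a global arena_df and raises a NameError without one. A minimal sketch of the presumably intended body; the switch to len(model_table_df) is an assumption, not what this commit ships:

# Sketch only: assumes the count should come from the DataFrame that is
# actually passed in; the committed code still reads len(arena_df).
space = " "

def make_arena_leaderboard_md(model_table_df):
    total_models = len(model_table_df)  # assumed fix
    return f"""
Validation Benchmark Performance is averaged.

Higher values are better.

Total #models: **{total_models}**.{space}
"""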
@@ -52,47 +43,27 @@ def load_leaderboard_table_csv(filename, add_hyperlink=False):
     for j in range(len(heads)):
         item = {}
         for h, v in zip(heads, row):
-            if h == "Arena Elo rating":
+            if h == "Validation Score":
                 if v != "-":
                     v = int(ast.literal_eval(v))
                 else:
                     v = np.nan
-            elif h == "MMLU":
-                if v != "-":
-                    v = round(ast.literal_eval(v) * 100, 1)
-                else:
-                    v = np.nan
-            elif h == "MT-bench (win rate %)":
-                if v != "-":
-                    v = round(ast.literal_eval(v[:-1]), 1)
-                else:
-                    v = np.nan
-            elif h == "MT-bench (score)":
-                if v != "-":
-                    v = round(ast.literal_eval(v), 2)
-                else:
-                    v = np.nan
             item[h] = v
         if add_hyperlink:
-            item["Model"] = f'<a target="_blank"
+            item["Model"] = f'<a target="_blank" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{item["Model"]}</a>'
         rows.append(item)
     return rows
 
-def get_full_table(
+def get_full_table(model_table_df):
     values = []
     for i in range(len(model_table_df)):
         row = []
         ranking = i+1
         row.append(ranking)
-        model_key = model_table_df.iloc[i]["key"]
         model_name = model_table_df.iloc[i]["Model"]
         # model display name
         row.append(model_name)
         row.append(np.nan)
-        row.append(np.nan)
-        row.append(np.nan)
-        # Team
-        row.append(model_table_df.iloc[i]["Organization"])
 
         values.append(row)
     # values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
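
The rewritten load_leaderboard_table_csv keeps a single numeric column. A self-contained sketch of that parsing path on a hypothetical two-column table (header names match the diff; the data rows are made up):

import ast
import numpy as np

def parse_row(heads, row):
    item = {}
    for h, v in zip(heads, row):
        if h == "Validation Score":
            # "-" marks a missing score; anything else parses as a number
            v = int(ast.literal_eval(v)) if v != "-" else np.nan
        item[h] = v
    return item

heads = ["Model", "Validation Score"]
print(parse_row(heads, ["my-merge", "87"]))  # {'Model': 'my-merge', 'Validation Score': 87}
print(parse_row(heads, ["baseline", "-"]))   # {'Model': 'baseline', 'Validation Score': nan}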
@@ -105,24 +76,13 @@ cat_name_to_explanation = {
     "Overall": "Overall Questions",
 }
 
-def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False):
+def build_leaderboard_tab(leaderboard_table_file, show_plot=False):
     arena_dfs = {}
     category_elo_results = {}
-    if results_file is None:  # Do live update
+    if leaderboard_table_file is None:  # Do live update
         default_md = "Loading ..."
     else:
-        with open(results_file, "rb") as fin:
-            elo_results = pickle.load(fin)
-        if "full" in elo_results:
-            print("KEYS ", elo_results.keys())
-            for k in elo_results.keys():
-                if k not in key_to_category_name:
-                    continue
-                arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
-                category_elo_results[key_to_category_name[k]] = elo_results[k]
-
-        arena_df = arena_dfs["Overall"]
-        default_md = make_default_md(arena_df, category_elo_results["Overall"])
+        default_md = make_default_md()
 
     md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
     if leaderboard_table_file:
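
For context, the deleted live-update branch expected a pickle holding one dict per category, each with a ready-made leaderboard DataFrame. A sketch of reading that layout; the filename and schema are inferred from the removed lines, so treat both as assumptions:

import pickle

with open("elo_results_20240101.pkl", "rb") as fin:  # hypothetical filename
    elo_results = pickle.load(fin)

for k, v in elo_results.items():
    df = v["leaderboard_table_df"]  # per-category leaderboard DataFrame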
@@ -130,35 +90,29 @@ def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False)
         model_table_df = pd.DataFrame(data)
 
         with gr.Tabs() as tabs:
-            arena_table_vals = get_full_table(
+            arena_table_vals = get_full_table(model_table_df)
             with gr.Tab("Full leaderboard", id=0):
-                md = make_arena_leaderboard_md(arena_df)
+                md = make_arena_leaderboard_md(model_table_df)
                 leaderboard_markdown = gr.Markdown(md, elem_id="leaderboard_markdown")
                 with gr.Row():
                     with gr.Column(scale=2):
-                        category_dropdown = gr.Dropdown(choices=list(arena_dfs.keys()), label="Category", value="Overall")
+                        category_dropdown = gr.Dropdown(choices=["Overall"], label="Category", value="Overall")
 
                 display_df = gr.Dataframe(
                     headers=[
                         "Rank",
-                        "🤖 Model",
-                        "⭐ Task 1",
-                        "📈 Task 2",
-                        "📚 Task 3",
-                        "Team",
+                        "🤖 Model / Submission Name",
+                        "⭐ Validation Performance",
                     ],
                     datatype=[
                         "number",
                         "markdown",
                         "number",
-                        "number",
-                        "number",
-                        "str",
                     ],
                     value=arena_table_vals,
                     elem_id="arena_leaderboard_dataframe",
                     height=700,
-                    column_widths=[70, 190, 110, 110, 110, 150],
+                    column_widths=[70, 190, 110],
                     wrap=True,
                 )
 
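
The table is trimmed from six columns to three. A standalone sketch of the same gr.Dataframe configuration with one made-up row; it assumes a Gradio release recent enough to support column_widths:

import gradio as gr

with gr.Blocks() as demo:
    gr.Dataframe(
        headers=["Rank", "🤖 Model / Submission Name", "⭐ Validation Performance"],
        datatype=["number", "markdown", "number"],
        value=[[1, "example-merge", 87]],  # made-up row for illustration
        column_widths=[70, 190, 110],
        wrap=True,
    )

demo.launch()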
@@ -179,53 +133,6 @@ def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False)
             else:
                 pass
 
-            def update_leaderboard_df(arena_table_vals):
-                elo_datarame = pd.DataFrame(arena_table_vals, columns=["Rank", "🤖 Model", "⭐ Task 1", "📈 Task 2", "📚 Task 3", "Team"])
-
-                # goal: color the rows based on the rank with styler
-                def highlight_max(s):
-                    # all items in S which contain up arrow should be green, down arrow should be red, otherwise black
-                    return ["color: green; font-weight: bold" if "\u2191" in v else "color: red; font-weight: bold" if "\u2193" in v else "" for v in s]
-
-                def highlight_rank_max(s):
-                    return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
-
-                return elo_datarame.style.apply(highlight_max, subset=["Rank"])
-
-            def update_leaderboard_and_plots(category):
-                arena_subset_df = arena_dfs[category]
-                arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
-                elo_subset_results = category_elo_results[category]
-                arena_df = arena_dfs["Overall"]
-                arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df = arena_subset_df if category != "Overall" else None)
-                if category != "Overall":
-                    arena_values = update_leaderboard_df(arena_values)
-                arena_values = gr.Dataframe(
-                    headers=[
-                        "Rank",
-                        "🤖 Model",
-                        "⭐ Task 1",
-                        "📈 Task 2",
-                        "📚 Task 3",
-                        "Team",
-                    ],
-                    datatype=[
-                        "number",
-                        "markdown",
-                        "number",
-                        "number",
-                        "number",
-                        "str",
-                    ],
-                    value=arena_values,
-                    elem_id="arena_leaderboard_dataframe",
-                    height=700,
-                    column_widths=[70, 190, 110, 110, 110, 150],
-                    wrap=True,
-                )
-                return arena_values
-
-            category_dropdown.change(update_leaderboard_and_plots, inputs=[category_dropdown], outputs=[display_df])
 
     with gr.Accordion(
         "📝 Citation",
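
The removed update_leaderboard_df helper colored cells with pandas' Styler. The same pattern in isolation, on toy data:

import pandas as pd

df = pd.DataFrame({"Rank": ["1 \u2191", "2 \u2193", "3"]})  # toy data

def highlight_max(s):
    # up arrow -> green, down arrow -> red, otherwise unstyled
    return [
        "color: green; font-weight: bold" if "\u2191" in v
        else "color: red; font-weight: bold" if "\u2193" in v
        else ""
        for v in s
    ]

styled = df.style.apply(highlight_max, subset=["Rank"])  # render via styled.to_html()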
@@ -239,8 +146,6 @@ def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False)
         gr.Markdown(citation_md, elem_id="leaderboard_markdown")
     gr.Markdown(acknowledgment_md)
 
-    if show_plot:
-        return [md_1]
     return [md_1]
 
 
@@ -318,7 +223,7 @@ We thank []() for their generous [sponsorship]().
 </div>
 """
 
-def build_demo(elo_results_file, leaderboard_table_file):
+def build_demo(leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
     theme = gr.themes.Base(text_size=text_size)
     theme.set(button_secondary_background_fill_hover="*primary_300",
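
build_demo keeps the same theme setup; only its signature changes. A minimal standalone version of that setup (the hover token comes from the diff; the Blocks body is a placeholder):

import gradio as gr

text_size = gr.themes.sizes.text_lg
theme = gr.themes.Base(text_size=text_size)
theme.set(button_secondary_background_fill_hover="*primary_300")

with gr.Blocks(theme=theme) as demo:
    gr.Markdown("placeholder")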
@@ -330,7 +235,7 @@ def build_demo(elo_results_file, leaderboard_table_file):
         css=block_css,
     ) as demo:
         leader_components = build_leaderboard_tab(
-            elo_results_file, leaderboard_table_file, show_plot=True
+            leaderboard_table_file, show_plot=True
         )
         return demo
 
@@ -342,13 +247,10 @@ if __name__ == "__main__":
     parser.add_argument("--port", type=int, default=7860)
     args = parser.parse_args()
 
-    elo_result_files = glob.glob("elo_results_*.pkl")
-    elo_result_files.sort(key=lambda x: int(x[12:-4]))
-    elo_result_file = elo_result_files[-1]
 
     leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
     leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
    leaderboard_table_file = leaderboard_table_files[-1]
 
-    demo = build_demo(elo_result_file, leaderboard_table_file)
+    demo = build_demo(leaderboard_table_file)
     demo.launch(share=args.share, server_name=args.host, server_port=args.port)
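
The __main__ block picks the newest table by sorting on the timestamp embedded in the filename: len("leaderboard_table_") is 18 and [:-4] strips ".csv", so the slice isolates the digits. The same trick on hypothetical filenames:

files = ["leaderboard_table_20240601.csv", "leaderboard_table_20240715.csv"]  # hypothetical
files.sort(key=lambda x: int(x[18:-4]))
latest = files[-1]  # 'leaderboard_table_20240715.csv'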