margsli committed
Commit
b8c65a2
1 Parent(s): 619107d

Update app.py

Files changed (1)
  1. app.py +21 -119
app.py CHANGED
@@ -12,7 +12,7 @@ import pandas as pd
 leader_component_values = [None]
 space = "&nbsp;&nbsp;&nbsp;"
 
-def make_default_md(arena_df, elo_results):
+def make_default_md():
     leaderboard_md = f"""
 # NeurIPS LLM Merging Competition Leaderboard
 [Website](https://llm-merging.github.io/index) | [Starter Kit (Github)](https://github.com/llm-merging/LLM-Merging) | [Discord](https://discord.com/invite/dPBHEVnV)
@@ -20,29 +20,20 @@ def make_default_md(arena_df, elo_results):
 """
     return leaderboard_md
 
-def make_arena_leaderboard_md(arena_df):
-    total_models = len(arena_df)
+def make_arena_leaderboard_md(model_table_df):
+    total_models = len(model_table_df)
     leaderboard_md = f"""
-Three benchmarks are displayed: **Test Task 1**, **Test Task 2**, **Test Task 3**.
+Validation Benchmark Performance is averaged.
+Final performance will be assessed at the end of the competition on a hidden test set, which may or may not be correlated with Validation performance.
 
-Higher values are better for all benchmarks.
+Higher values are better.
 
-Total #models: **{total_models}**.{space} Last updated: June 1, 2024.
+Total #models: **{total_models}**.{space}
 
 """
     return leaderboard_md
 
 
-
-def make_leaderboard_md_live(elo_results):
-    leaderboard_md = f"""
-# Leaderboard
-Last updated: {elo_results["last_updated_datetime"]}
-{elo_results["leaderboard_table"]}
-"""
-    return leaderboard_md
-
-
 def load_leaderboard_table_csv(filename, add_hyperlink=False):
     lines = open(filename).readlines()
     heads = [v.strip() for v in lines[0].split(",")]
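
For reference, a minimal usage sketch of the reworked `make_arena_leaderboard_md` (not part of the commit; the three-row table is invented, and the helper only uses its length):

```python
import pandas as pd

# Hypothetical submissions table; the helper only calls len() on it.
model_table_df = pd.DataFrame({"Model": ["merge-a", "merge-b", "merge-c"]})

md = make_arena_leaderboard_md(model_table_df)
assert "Total #models: **3**" in md
```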
@@ -52,47 +43,27 @@ def load_leaderboard_table_csv(filename, add_hyperlink=False):
     for j in range(len(heads)):
         item = {}
         for h, v in zip(heads, row):
-            if h == "Arena Elo rating":
+            if h == "Validation Score":
                 if v != "-":
                     v = int(ast.literal_eval(v))
                 else:
                     v = np.nan
-            elif h == "MMLU":
-                if v != "-":
-                    v = round(ast.literal_eval(v) * 100, 1)
-                else:
-                    v = np.nan
-            elif h == "MT-bench (win rate %)":
-                if v != "-":
-                    v = round(ast.literal_eval(v[:-1]), 1)
-                else:
-                    v = np.nan
-            elif h == "MT-bench (score)":
-                if v != "-":
-                    v = round(ast.literal_eval(v), 2)
-                else:
-                    v = np.nan
             item[h] = v
         if add_hyperlink:
-            item["Model"] = f'<a target="_blank" href="{item["Link"]}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{item["Model"]}</a>'
+            item["Model"] = f'<a target="_blank" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{item["Model"]}</a>'
         rows.append(item)
     return rows
 
-def get_full_table(arena_df, model_table_df):
+def get_full_table(model_table_df):
     values = []
     for i in range(len(model_table_df)):
         row = []
         ranking = i+1
         row.append(ranking)
-        model_key = model_table_df.iloc[i]["key"]
         model_name = model_table_df.iloc[i]["Model"]
         # model display name
         row.append(model_name)
         row.append(np.nan)
-        row.append(np.nan)
-        row.append(np.nan)
-        # Team
-        row.append(model_table_df.iloc[i]["Organization"])
 
         values.append(row)
     # values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
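
For context, a hedged sketch of the value conversion the parser above applies to the new "Validation Score" column; the sample values are invented:

```python
import ast
import numpy as np

# "-" marks a missing score and becomes NaN; anything else parses to int.
for v in ["87", "-"]:
    score = int(ast.literal_eval(v)) if v != "-" else np.nan
    print(score)  # 87, then nan
```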
@@ -105,24 +76,13 @@ cat_name_to_explanation = {
     "Overall": "Overall Questions",
 }
 
-def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False):
+def build_leaderboard_tab(leaderboard_table_file, show_plot=False):
     arena_dfs = {}
     category_elo_results = {}
-    if results_file is None:  # Do live update
+    if leaderboard_table_file is None:  # Do live update
         default_md = "Loading ..."
     else:
-        with open(results_file, "rb") as fin:
-            elo_results = pickle.load(fin)
-        if "full" in elo_results:
-            print("KEYS ", elo_results.keys())
-        for k in elo_results.keys():
-            if k not in key_to_category_name:
-                continue
-            arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
-            category_elo_results[key_to_category_name[k]] = elo_results[k]
-
-        arena_df = arena_dfs["Overall"]
-        default_md = make_default_md(arena_df, category_elo_results["Overall"])
+        default_md = make_default_md()
 
     md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
     if leaderboard_table_file:
@@ -130,35 +90,29 @@ def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False)
         model_table_df = pd.DataFrame(data)
 
         with gr.Tabs() as tabs:
-            arena_table_vals = get_full_table(arena_df, model_table_df)
+            arena_table_vals = get_full_table(model_table_df)
             with gr.Tab("Full leaderboard", id=0):
-                md = make_arena_leaderboard_md(arena_df)
+                md = make_arena_leaderboard_md(model_table_df)
                 leaderboard_markdown = gr.Markdown(md, elem_id="leaderboard_markdown")
                 with gr.Row():
                     with gr.Column(scale=2):
-                        category_dropdown = gr.Dropdown(choices=list(arena_dfs.keys()), label="Category", value="Overall")
+                        category_dropdown = gr.Dropdown(choices=["Overall"], label="Category", value="Overall")
 
                 display_df = gr.Dataframe(
                     headers=[
                         "Rank",
-                        "🤖 Model",
-                        "⭐ Task 1",
-                        "📈 Task 2",
-                        "📚 Task 3",
-                        "Team",
+                        "🤖 Model / Submission Name",
+                        "⭐ Validation Performance",
                     ],
                     datatype=[
                         "number",
                         "markdown",
                         "number",
-                        "number",
-                        "number",
-                        "str",
                     ],
                     value=arena_table_vals,
                     elem_id="arena_leaderboard_dataframe",
                     height=700,
-                    column_widths=[70, 190, 110, 110, 110, 150],
+                    column_widths=[70, 190, 110],
                     wrap=True,
                 )
 
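As a standalone illustration of the new three-column table (a sketch, not the app itself; the rows are invented):

```python
import gradio as gr

# Invented example rows matching the Rank / Model / Validation layout.
rows = [[1, "merge-a", 0.87], [2, "merge-b", 0.74]]

with gr.Blocks() as demo:
    gr.Dataframe(
        headers=["Rank", "🤖 Model / Submission Name", "⭐ Validation Performance"],
        datatype=["number", "markdown", "number"],
        value=rows,
        column_widths=[70, 190, 110],
        wrap=True,
    )

# demo.launch()  # uncomment to serve locally
```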
@@ -179,53 +133,6 @@ def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False)
     else:
         pass
 
-    def update_leaderboard_df(arena_table_vals):
-        elo_datarame = pd.DataFrame(arena_table_vals, columns=["Rank", "🤖 Model", "⭐ Task 1", "📈 Task 2", "📚 Task 3", "Team"])
-
-        # goal: color the rows based on the rank with styler
-        def highlight_max(s):
-            # all items in s which contain an up arrow should be green, a down arrow red, otherwise default
-            return ["color: green; font-weight: bold" if "\u2191" in v else "color: red; font-weight: bold" if "\u2193" in v else "" for v in s]
-
-        def highlight_rank_max(s):
-            return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
-
-        return elo_datarame.style.apply(highlight_max, subset=["Rank"])
-
-    def update_leaderboard_and_plots(category):
-        arena_subset_df = arena_dfs[category]
-        arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
-        elo_subset_results = category_elo_results[category]
-        arena_df = arena_dfs["Overall"]
-        arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df=arena_subset_df if category != "Overall" else None)
-        if category != "Overall":
-            arena_values = update_leaderboard_df(arena_values)
-        arena_values = gr.Dataframe(
-            headers=[
-                "Rank",
-                "🤖 Model",
-                "⭐ Task 1",
-                "📈 Task 2",
-                "📚 Task 3",
-                "Team",
-            ],
-            datatype=[
-                "number",
-                "markdown",
-                "number",
-                "number",
-                "number",
-                "str",
-            ],
-            value=arena_values,
-            elem_id="arena_leaderboard_dataframe",
-            height=700,
-            column_widths=[70, 190, 110, 110, 110, 150],
-            wrap=True,
-        )
-        return arena_values
-
-    category_dropdown.change(update_leaderboard_and_plots, inputs=[category_dropdown], outputs=[display_df])
 
     with gr.Accordion(
         "📝 Citation",
@@ -239,8 +146,6 @@ def build_leaderboard_tab(results_file, leaderboard_table_file, show_plot=False)
         gr.Markdown(citation_md, elem_id="leaderboard_markdown")
     gr.Markdown(acknowledgment_md)
 
-    if show_plot:
-        return [md_1]
     return [md_1]
 
 
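The helpers deleted above colored cells through the pandas Styler API; here is a self-contained sketch of that arrow-highlighting technique, with invented data:

```python
import pandas as pd

df = pd.DataFrame({"Rank": ["1 \u2191", "2 \u2193", "3"]})

def highlight_arrows(s):
    # Up arrow -> green, down arrow -> red, otherwise unstyled.
    return [
        "color: green; font-weight: bold" if "\u2191" in v
        else "color: red; font-weight: bold" if "\u2193" in v
        else ""
        for v in s
    ]

styled = df.style.apply(highlight_arrows, subset=["Rank"])
# styled.to_html() renders the colored cells.
```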
@@ -318,7 +223,7 @@ We thank []() for their generous [sponsorship]().
 </div>
 """
 
-def build_demo(elo_results_file, leaderboard_table_file):
+def build_demo(leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
     theme = gr.themes.Base(text_size=text_size)
     theme.set(button_secondary_background_fill_hover="*primary_300",
@@ -330,7 +235,7 @@ def build_demo(elo_results_file, leaderboard_table_file):
         css=block_css,
     ) as demo:
         leader_components = build_leaderboard_tab(
-            elo_results_file, leaderboard_table_file, show_plot=True
+            leaderboard_table_file, show_plot=True
         )
     return demo
 
@@ -342,13 +247,10 @@ if __name__ == "__main__":
     parser.add_argument("--port", type=int, default=7860)
     args = parser.parse_args()
 
-    elo_result_files = glob.glob("elo_results_*.pkl")
-    elo_result_files.sort(key=lambda x: int(x[12:-4]))
-    elo_result_file = elo_result_files[-1]
 
     leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
     leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
     leaderboard_table_file = leaderboard_table_files[-1]
 
-    demo = build_demo(elo_result_file, leaderboard_table_file)
+    demo = build_demo(leaderboard_table_file)
     demo.launch(share=args.share, server_name=args.host, server_port=args.port)
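
A note on the retained glob/sort idiom: `len("leaderboard_table_")` is 18 and `".csv"` occupies the last four characters, so `x[18:-4]` isolates the numeric datestamp that the sort key parses (the file name below is hypothetical):

```python
fname = "leaderboard_table_20240601.csv"  # hypothetical file name
assert fname[18:-4] == "20240601"
assert int(fname[18:-4]) == 20240601
```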
 