sherzod-hakimov committed on
Commit 923aff9
1 Parent(s): 18d5ac3

multimodal leaderboard
app.py CHANGED
@@ -1,89 +1,181 @@
  import gradio as gr

- from src.assets.text_content import TITLE, INTRODUCTION_TEXT, CLEMSCORE_TEXT
- from src.leaderboard_utils import filter_search, get_github_data
- from src.plot_utils import split_models, compare_plots
-
- # For Leaderboards
- dataframe_height = 800 # Height of the table in pixels
- # Get CSV data
- global primary_leaderboard_df, version_dfs, version_names
- primary_leaderboard_df, version_dfs, version_names, date = get_github_data()
-
- global prev_df
- prev_df = version_dfs[0]
- def select_prev_df(name):
-     ind = version_names.index(name)
-     prev_df = version_dfs[ind]
-     return prev_df
-
- # For Plots
- global plot_df, OPEN_MODELS, CLOSED_MODELS
- plot_df = primary_leaderboard_df[0]
- MODELS = list(plot_df[list(plot_df.columns)[0]].unique())
- OPEN_MODELS, CLOSED_MODELS = split_models(MODELS)
-
-
- # MAIN APPLICATION s
- main_app = gr.Blocks()
- with main_app:
      gr.HTML(TITLE)
      gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🥇 CLEM Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
              with gr.Row():
                  search_bar = gr.Textbox(
                      placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
                      show_label=False,
                      elem_id="search-bar",
                  )
-
              leaderboard_table = gr.Dataframe(
-                 value=primary_leaderboard_df[0],
-                 elem_id="leaderboard-table",
                  interactive=False,
                  visible=True,
                  height=dataframe_height
              )

              gr.HTML(CLEMSCORE_TEXT)
-             gr.HTML(f"Last updated - {date}")

-             # Add a dummy leaderboard to handle search queries from the primary_leaderboard_df and not update primary_leaderboard_df
              dummy_leaderboard_table = gr.Dataframe(
-                 value=primary_leaderboard_df[0],
-                 elem_id="leaderboard-table",
                  interactive=False,
                  visible=False
              )
-
              search_bar.submit(
-                 filter_search,
                  [dummy_leaderboard_table, search_bar],
                  leaderboard_table,
                  queue=True
              )

-         with gr.TabItem("📈 Plot", id=3):
              with gr.Row():
-                 open_models_selection = gr.CheckboxGroup(
-                     OPEN_MODELS,
-                     label="Open-weight Models 🌐",
-                     value=[],
-                     elem_id="value-select",
-                     interactive=True,
                  )

-             with gr.Row():
-                 closed_models_selection = gr.CheckboxGroup(
-                     CLOSED_MODELS,
-                     label="Closed-weight Models 💼",
-                     value=[],
-                     elem_id="value-select-2",
-                     interactive=True,
-                 )
-
              with gr.Row():
                  with gr.Column():
                      show_all = gr.CheckboxGroup(
@@ -93,36 +185,41 @@ with main_app:
                          elem_id="value-select-3",
                          interactive=True,
                      )
-
                  with gr.Column():
                      show_names = gr.CheckboxGroup(
                          ["Show Names"],
-                         label ="Show names of models on the plot 🏷️",
                          value=[],
                          elem_id="value-select-4",
                          interactive=True,
-                     )

                  with gr.Column():
                      show_legend = gr.CheckboxGroup(
                          ["Show Legend"],
-                         label ="Show legend on the plot 💡",
                          value=[],
                          elem_id="value-select-5",
                          interactive=True,
-                     )
                  with gr.Column():
                      mobile_view = gr.CheckboxGroup(
                          ["Mobile View"],
-                         label ="View plot on smaller screens 📱",
                          value=[],
                          elem_id="value-select-6",
                          interactive=True,
-                     )

              with gr.Row():
                  dummy_plot_df = gr.DataFrame(
-                     value=plot_df,
                      visible=False
                  )

@@ -131,90 +228,184 @@ with main_app:
              # Output block for the plot
              plot_output = gr.Plot()

              open_models_selection.change(
-                 compare_plots,
-                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend, mobile_view],
-                 plot_output,
                  queue=True
              )

              closed_models_selection.change(
-                 compare_plots,
-                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend, mobile_view],
-                 plot_output,
                  queue=True
              )
-
              show_all.change(
-                 compare_plots,
-                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend, mobile_view],
-                 plot_output,
                  queue=True
              )

              show_names.change(
-                 compare_plots,
-                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend, mobile_view],
-                 plot_output,
                  queue=True
              )

              show_legend.change(
-                 compare_plots,
-                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend, mobile_view],
-                 plot_output,
                  queue=True
              )

              mobile_view.change(
-                 compare_plots,
-                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend, mobile_view],
-                 plot_output,
                  queue=True
              )

-         with gr.TabItem("🔄 Versions and Details", elem_id="details", id=2):
              with gr.Row():
                  version_select = gr.Dropdown(
-                     version_names, label="Select Version 🕹️", value=version_names[0]
                  )
              with gr.Row():
                  search_bar_prev = gr.Textbox(
                      placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
                      show_label=False,
-                     elem_id="search-bar-2",
                  )

              prev_table = gr.Dataframe(
-                 value=prev_df,
-                 elem_id="leaderboard-table",
                  interactive=False,
                  visible=True,
                  height=dataframe_height
              )

              dummy_prev_table = gr.Dataframe(
-                 value=prev_df,
-                 elem_id="leaderboard-table",
                  interactive=False,
                  visible=False
              )

              search_bar_prev.submit(
-                 filter_search,
                  [dummy_prev_table, search_bar_prev],
                  prev_table,
                  queue=True
              )

              version_select.change(
-                 select_prev_df,
                  [version_select],
                  prev_table,
                  queue=True
              )
- main_app.load()

- main_app.queue()

- main_app.launch()
  import gradio as gr
+ import os
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import HfApi
+ from datetime import datetime, timedelta
+
+ from src.assets.text_content import TITLE, INTRODUCTION_TEXT, CLEMSCORE_TEXT, MULTIMODAL_NAME, TEXT_NAME, HF_REPO
+ from src.leaderboard_utils import query_search, get_github_data
+ from src.plot_utils import split_models, plotly_plot, get_plot_df, update_open_models, update_closed_models
+ from src.plot_utils import reset_show_all, reset_show_names, reset_show_legend, reset_mobile_view
+ from src.version_utils import get_versions_data
+
+ """
+ CONSTANTS
+ """
+ # For restarting the gradio application every 12 hours (43200 seconds)
+ TIME = 43200  # in seconds # Reload will not work locally - requires an HF token # The app launches locally as expected - only without the reload utility
+ # For Leaderboard table
+ dataframe_height = 800  # Height of the table in pixels # Set on average considering all possible devices
+
+
+ """
+ AUTO RESTART HF SPACE
+ """
+ HF_TOKEN = os.environ.get("H4_TOKEN", None)
+ api = HfApi()
+
+ def restart_space():
+     api.restart_space(repo_id=HF_REPO, token=HF_TOKEN)
+
+
+ """
+ GITHUB UTILS
+ """
+ github_data = get_github_data()
+ text_leaderboard = github_data["text"][0]  # Latest available version of the text-only leaderboard
+ multimodal_leaderboard = github_data["multimodal"][0]  # Latest available version of the multimodal leaderboard
+
+ # Show only the first 4 columns for the leaderboards
+ text_leaderboard = text_leaderboard.iloc[:, :4]
+ print(f"Showing the following columns for the latest leaderboard: {text_leaderboard.columns}")
+ multimodal_leaderboard = multimodal_leaderboard.iloc[:, :4]
+ print(f"Showing the following columns for the multimodal leaderboard: {multimodal_leaderboard.columns}")
+
+
+ """
+ VERSIONS UTILS
+ """
+ versions_data = get_versions_data()
+ latest_version = versions_data['latest']  # Always show the latest version in the text-only benchmark
+ last_updated_date = versions_data['date']
+ version_names = list(versions_data.keys())
+ version_names = [v for v in version_names if v.startswith("v")]  # Remove the "latest" and "date" keys
+
+ global version_df
+ version_df = versions_data[latest_version]
+ def select_version_df(name):
+     return versions_data[name]
+
+ """
+ MAIN APPLICATION
+ """
+ hf_app = gr.Blocks()
+ with hf_app:

      gr.HTML(TITLE)
      gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         """
+         ####################### FIRST TAB - TEXT-LEADERBOARD #######################
+         """
+         with gr.TabItem(TEXT_NAME, elem_id="llm-benchmark-tab-table", id=0):
              with gr.Row():
                  search_bar = gr.Textbox(
                      placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
                      show_label=False,
                      elem_id="search-bar",
                  )
+
              leaderboard_table = gr.Dataframe(
+                 value=text_leaderboard,
+                 elem_id="text-leaderboard-table",
                  interactive=False,
                  visible=True,
                  height=dataframe_height
              )

+             # Show information about the clemscore and the last updated date below the table
              gr.HTML(CLEMSCORE_TEXT)
+             gr.HTML(f"Last updated - {github_data['date']}")

+             # Add a dummy leaderboard to handle search queries in leaderboard_table
+             # This will show a temporary leaderboard based on the searched value
              dummy_leaderboard_table = gr.Dataframe(
+                 value=text_leaderboard,
+                 elem_id="text-leaderboard-table-dummy",
                  interactive=False,
                  visible=False
              )
+
+             # Action after submitting a query to the search bar
              search_bar.submit(
+                 query_search,
                  [dummy_leaderboard_table, search_bar],
                  leaderboard_table,
                  queue=True
              )

+         """
+         ####################### SECOND TAB - MULTIMODAL LEADERBOARD #######################
+         """
+         with gr.TabItem(MULTIMODAL_NAME, elem_id="mm-llm-benchmark-tab-table", id=1):
              with gr.Row():
+                 mm_search_bar = gr.Textbox(
+                     placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
+                     show_label=False,
+                     elem_id="search-bar",
                  )

+             mm_leaderboard_table = gr.Dataframe(
+                 value=multimodal_leaderboard,
+                 elem_id="mm-leaderboard-table",
+                 interactive=False,
+                 visible=True,
+                 height=dataframe_height
+             )
+
+             # Show information about the clemscore and the last updated date below the table
+             gr.HTML(CLEMSCORE_TEXT)
+             gr.HTML(f"Last updated - {github_data['date']}")
+
+             # Add a dummy leaderboard to handle search queries in leaderboard_table
+             # This will show a temporary leaderboard based on the searched value
+             mm_dummy_leaderboard_table = gr.Dataframe(
+                 value=multimodal_leaderboard,
+                 elem_id="mm-leaderboard-table-dummy",
+                 interactive=False,
+                 visible=False
+             )
+
+             # Action after submitting a query to the search bar
+             mm_search_bar.submit(
+                 query_search,
+                 [mm_dummy_leaderboard_table, mm_search_bar],
+                 mm_leaderboard_table,
+                 queue=True
+             )
+
+         """
+         ####################### THIRD TAB - PLOTS - %PLAYED V/S QUALITY SCORE #######################
+         """
+         with gr.TabItem("📈 Plots", elem_id="plots", id=2):
+             """
+             DropDown Select for Text/Multimodal Leaderboard
+             """
+             leaderboard_selection = gr.Dropdown(
+                 choices=[TEXT_NAME, MULTIMODAL_NAME],
+                 value=TEXT_NAME,
+                 label="Select Leaderboard 🎖️🔽",
+                 elem_id="value-select-0",
+                 interactive=True
+             )
+
+             """
+             Accordion Groups to select individual models - Hidden by default
+             """
+             with gr.Accordion("Select Open-weight Models 🌐", open=False):
+                 open_models_selection = update_open_models()
+                 clear_button_1 = gr.ClearButton(open_models_selection)
+
+             with gr.Accordion("Select Commercial Models 💰", open=False):
+                 closed_models_selection = update_closed_models()
+                 clear_button_2 = gr.ClearButton(closed_models_selection)
+
+             """
+             Checkbox group to control the layout of the plot
+             """
              with gr.Row():
                  with gr.Column():
                      show_all = gr.CheckboxGroup(
@@ -93,36 +185,41 @@ with main_app:
                          elem_id="value-select-3",
                          interactive=True,
                      )
+
                  with gr.Column():
                      show_names = gr.CheckboxGroup(
                          ["Show Names"],
+                         label="Show names of models on the plot 🏷️",
                          value=[],
                          elem_id="value-select-4",
                          interactive=True,
+                     )

                  with gr.Column():
                      show_legend = gr.CheckboxGroup(
                          ["Show Legend"],
+                         label="Show legend on the plot 💡",
                          value=[],
                          elem_id="value-select-5",
                          interactive=True,
+                     )
                  with gr.Column():
                      mobile_view = gr.CheckboxGroup(
                          ["Mobile View"],
+                         label="View plot on smaller screens 📱",
                          value=[],
                          elem_id="value-select-6",
                          interactive=True,
+                     )

+             """
+             PLOT BLOCK
+             """
+             # Create a dummy DataFrame as an input to the plotly_plot function.
+             # Uses this data to plot the %played v/s quality score
              with gr.Row():
                  dummy_plot_df = gr.DataFrame(
+                     value=get_plot_df(),
                      visible=False
                  )

@@ -131,90 +228,184 @@ with main_app:
              # Output block for the plot
              plot_output = gr.Plot()

+             """
+             PLOT CHANGE ACTIONS
+             Toggle 'Select All Models' based on the values in Accordion checkbox groups
+             """
              open_models_selection.change(
+                 plotly_plot,
+                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
+                  mobile_view],
+                 [plot_output],
                  queue=True
              )

              closed_models_selection.change(
+                 plotly_plot,
+                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
+                  mobile_view],
+                 [plot_output],
                  queue=True
              )
+
              show_all.change(
+                 plotly_plot,
+                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
+                  mobile_view],
+                 [plot_output],
                  queue=True
              )

              show_names.change(
+                 plotly_plot,
+                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
+                  mobile_view],
+                 [plot_output],
                  queue=True
              )

              show_legend.change(
+                 plotly_plot,
+                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
+                  mobile_view],
+                 [plot_output],
                  queue=True
              )

              mobile_view.change(
+                 plotly_plot,
+                 [dummy_plot_df, open_models_selection, closed_models_selection, show_all, show_names, show_legend,
+                  mobile_view],
+                 [plot_output],
+                 queue=True
+             )
+             """
+             LEADERBOARD SELECT CHANGE ACTIONS
+             Update Checkbox Groups and Dummy DF based on the leaderboard selected
+             """
+             leaderboard_selection.change(
+                 update_open_models,
+                 [leaderboard_selection],
+                 [open_models_selection],
+                 queue=True
+             )
+
+             leaderboard_selection.change(
+                 update_closed_models,
+                 [leaderboard_selection],
+                 [closed_models_selection],
+                 queue=True
+             )
+
+             leaderboard_selection.change(
+                 get_plot_df,
+                 [leaderboard_selection],
+                 [dummy_plot_df],
                  queue=True
              )

+             ## Reset the plot toggles when the leaderboard selection changes
+             leaderboard_selection.change(
+                 reset_show_all,
+                 outputs=[show_all],
+                 queue=True
+             )
+
+             open_models_selection.change(
+                 reset_show_all,
+                 outputs=[show_all],
+                 queue=True
+             )
+
+             closed_models_selection.change(
+                 reset_show_all,
+                 outputs=[show_all],
+                 queue=True
+             )
+
+             leaderboard_selection.change(
+                 reset_show_names,
+                 outputs=[show_names],
+                 queue=True
+             )
+
+             leaderboard_selection.change(
+                 reset_show_legend,
+                 outputs=[show_legend],
+                 queue=True
+             )
+
+             leaderboard_selection.change(
+                 reset_mobile_view,
+                 outputs=[mobile_view],
+                 queue=True
+             )
+
+         """
+         ####################### FOURTH TAB - VERSIONS AND DETAILS #######################
+         """
+         with gr.TabItem("🔄 Versions and Details", elem_id="versions-details-tab", id=3):
              with gr.Row():
                  version_select = gr.Dropdown(
+                     version_names, label="Select Version 🕹️", value=latest_version
                  )
              with gr.Row():
                  search_bar_prev = gr.Textbox(
                      placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
                      show_label=False,
+                     elem_id="search-bar-3",
                  )

              prev_table = gr.Dataframe(
+                 value=version_df,
+                 elem_id="version-leaderboard-table",
                  interactive=False,
                  visible=True,
                  height=dataframe_height
              )

              dummy_prev_table = gr.Dataframe(
+                 value=version_df,
+                 elem_id="version-dummy-leaderboard-table",
                  interactive=False,
                  visible=False
              )

+             gr.HTML(CLEMSCORE_TEXT)
+             gr.HTML(f"Last updated - {last_updated_date}")
+
              search_bar_prev.submit(
+                 query_search,
                  [dummy_prev_table, search_bar_prev],
                  prev_table,
                  queue=True
              )

              version_select.change(
+                 select_version_df,
                  [version_select],
                  prev_table,
                  queue=True
              )

+             # Update the dummy leaderboard when changing versions
+             version_select.change(
+                 select_version_df,
+                 [version_select],
+                 dummy_prev_table,
+                 queue=True
+             )
+
+ hf_app.load()
+ hf_app.queue()
+
+ # Add a scheduler to auto-restart the HF space at every TIME interval and refresh every component
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, 'interval', seconds=TIME)
+ scheduler.start()
+
+ # Log current start time and scheduled restart time
+ print(datetime.now())
+ print(f"Scheduled restart at {datetime.now() + timedelta(seconds=TIME)}")

+ hf_app.launch()
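Both search bars above use the same hidden-copy pattern: every query overwrites the visible table, so a second, invisible Dataframe keeps the full data and is wired in as the search input. A minimal, self-contained sketch of the pattern (the toy data and names here are illustrative, not taken from the app):

import gradio as gr
import pandas as pd

full_df = pd.DataFrame({"Model": ["gpt-4", "claude-3-opus"], "Clemscore": [55.0, 48.0]})

def search(df: pd.DataFrame, query: str) -> pd.DataFrame:
    # Filter the hidden full copy, never the visible (already filtered) table
    return df[df["Model"].str.contains(query.strip(), case=False, regex=False)] if query.strip() else df

with gr.Blocks() as demo:
    bar = gr.Textbox(show_label=False)
    visible_table = gr.Dataframe(value=full_df, interactive=False)
    hidden_table = gr.Dataframe(value=full_df, visible=False)  # full data survives filtering
    bar.submit(search, [hidden_table, bar], visible_table)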
requirements.txt CHANGED
@@ -1,3 +1,4 @@
  gradio==4.36.1
  pandas==2.0.0
- plotly==5.18.0
+ plotly==5.18.0
+ apscheduler==3.10.4
src/assets/text_content.py CHANGED
@@ -1,11 +1,20 @@
  TITLE = """<h1 align="center" id="space-title"> 🏆 CLEM Leaderboard</h1>"""

  INTRODUCTION_TEXT = """
  <h6 align="center">
  The CLEM Leaderboard aims to track, rank and evaluate current cLLMs (chat-optimized Large Language Models) with the suggested pronunciation “clems”.

  The benchmarking approach is described in [Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents](https://aclanthology.org/2023.emnlp-main.689.pdf).

  Source code for benchmarking "clems" is available here: [Clembench](https://github.com/clembench/clembench)

  All generated files and results from the benchmark runs are available here: [clembench-runs](https://github.com/clembench/clembench-runs) </h6>
@@ -52,6 +61,6 @@ SHORT_NAMES = {
  "vicuna-7b-v1.5": "vic-7b-v1.5",
  "vicuna-13b-v1.5": "vic-13b-v1.5",
  "gpt4all-13b-snoozy": "g4a-13b-s",
- "zephyr-7b-alpha":"z-7b-a",
- "zephyr-7b-beta":"z-7b-b"
  }

  TITLE = """<h1 align="center" id="space-title"> 🏆 CLEM Leaderboard</h1>"""

+ REPO = "https://raw.githubusercontent.com/clembench/clembench-runs/main/"
+ HF_REPO = "colab-potsdam/clem-leaderboard"
+
+ TEXT_NAME = "🥇 CLEM Leaderboard"
+ MULTIMODAL_NAME = "🥇 Multimodal CLEM Leaderboard"
+
  INTRODUCTION_TEXT = """
  <h6 align="center">
+
  The CLEM Leaderboard aims to track, rank and evaluate current cLLMs (chat-optimized Large Language Models) with the suggested pronunciation “clems”.

  The benchmarking approach is described in [Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents](https://aclanthology.org/2023.emnlp-main.689.pdf).

+ The multimodal benchmark is described in [Two Giraffes in a Dirt Field: Using Game Play to Investigate Situation Modelling in Large Multimodal Models](https://arxiv.org/abs/2406.14035)
+
  Source code for benchmarking "clems" is available here: [Clembench](https://github.com/clembench/clembench)

  All generated files and results from the benchmark runs are available here: [clembench-runs](https://github.com/clembench/clembench-runs) </h6>
@@ -52,6 +61,6 @@ SHORT_NAMES = {
  "vicuna-7b-v1.5": "vic-7b-v1.5",
  "vicuna-13b-v1.5": "vic-13b-v1.5",
  "gpt4all-13b-snoozy": "g4a-13b-s",
+ "zephyr-7b-alpha": "z-7b-a",
+ "zephyr-7b-beta": "z-7b-b"
  }
src/leaderboard_utils.py CHANGED
@@ -1,148 +1,139 @@
  import os
  import pandas as pd
- import requests, json
  from io import StringIO
  from datetime import datetime


  def get_github_data():
      """
-     Get data from csv files on Github
-     Args:
-         None
      Returns:
-         latest_df: singular list containing dataframe of the latest version of the leaderboard with only 4 columns
-         all_dfs: list of dataframes for previous versions + latest version including columns for all games
-         all_vnames: list of the names for the previous versions + latest version (For Details and Versions Tab Dropdown)
      """
-     uname = "clembench"
-     repo = "clembench-runs"
-     json_url = f"https://raw.githubusercontent.com/{uname}/{repo}/main/benchmark_runs.json"
-     resp = requests.get(json_url)
-     if resp.status_code == 200:
-         json_data = json.loads(resp.text)
-         versions = json_data['versions']
-         version_names = []
-         csv_url = f"https://raw.githubusercontent.com/{uname}/{repo}/main/"
-         for ver in versions:
-             version_names.append(ver['version'])
-             csv_path = ver['result_file'].split('/')[1:]
-             csv_path = '/'.join(csv_path)
-
-         # Sort by latest version
-         float_content = [float(s[1:]) for s in version_names]
-         float_content.sort(reverse=True)
-         version_names = ['v'+str(s) for s in float_content]
-
-         # Get date of latest version
-         for data in versions:
-             if data['version'] == version_names[0]:
-                 date = data['date'] # Should be in YYYY/MM/DD format
-                 date_obj = datetime.strptime(date, "%Y/%m/%d")
-                 date = date_obj.strftime("%d %b %Y")
-
-         DFS = []
-         for version in version_names:
-             result_url = csv_url + version + '/' + csv_path
-             csv_response = requests.get(result_url)
-             if csv_response.status_code == 200:
-                 df = pd.read_csv(StringIO(csv_response.text))
                  df = process_df(df)
-                 df = df.sort_values(by=list(df.columns)[1], ascending=False) # Sort by clemscore
-                 DFS.append(df)
-             else:
-                 print(f"Failed to read CSV file for version : {version}. Status Code : {resp.status_code}")
-
-         # Only keep relevant columns for the main leaderboard
-         latest_df_dummy = DFS[0]
-         all_columns = list(latest_df_dummy.columns)
-         keep_columns = all_columns[0:4]
-         latest_df_dummy = latest_df_dummy.drop(columns=[c for c in all_columns if c not in keep_columns])
-
-         latest_df = [latest_df_dummy]
-         all_dfs = []
-         all_vnames = []
-         for df, name in zip(DFS, version_names):
-             all_dfs.append(df)
-             all_vnames.append(name)
-         return latest_df, all_dfs, all_vnames, date
-
-     else:
-         print(f"Failed to read JSON file: Status Code : {resp.status_code}")


  def process_df(df: pd.DataFrame) -> pd.DataFrame:
      """
-     Process dataframe
-     - Remove repition in model names
-     - Convert datatypes to sort by "float" instead of "str" for sorting
      - Update column names
      Args:
          df: Unprocessed Dataframe (after using update_cols)
      Returns:
          df: Processed Dataframe
      """

-     # Change column type to float from str
-     list_column_names = list(df.columns)
-     model_col_name = list_column_names[0]
-     for col in list_column_names:
-         if col != model_col_name:
-             df[col] = df[col].astype(float)
-
-     # Remove repetition in model names, if any
-     models_list = []
-     for i in range(len(df)):
-         model_name = df.iloc[i][model_col_name]
-         splits = model_name.split('--')
-         splits = [split.replace('-t0.0', '') for split in splits] # Comment to not remove -t0.0
-         if splits[0] == splits[1]:
-             models_list.append(splits[0])
-         else:
-             models_list.append(splits[0] + "--" + splits[1])
-     df[model_col_name] = models_list

      # Update column names
-     update = ['Model', 'Clemscore', '% Played', 'Quality Score']
-     game_metrics = list_column_names[4:]
-
-     for col in game_metrics:
-         splits = col.split(',')
-         update.append(splits[0].capitalize() + "" + splits[1])
-
-     map_cols = {}
-     for i in range(len(update)):
-         map_cols[list_column_names[i]] = str(update[i])
-
-     df = df.rename(columns=map_cols)
      return df


- def filter_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
      """
-     Filter the dataframe based on the search query
      Args:
-         df: Unfiltered dataframe
-         query: a string of queries separated by ";"
-     Return:
-         filtered_df: Dataframe containing searched queries in the 'Model' column
      """
-     queries = query.split(';')
-     list_cols = list(df.columns)
-     df_len = len(df)
-     filtered_models = []
-     models_list = list(df[list_cols[0]])
-     for q in queries:
-         q = q.lower()
-         q = q.strip()
-         for i in range(df_len):
-             model_name = models_list[i]
-             if q in model_name.lower():
-                 filtered_models.append(model_name) # Append model names containing query q
-
-     filtered_df = df[df[list_cols[0]].isin(filtered_models)]
-
-     if query == "":
          return df

      return filtered_df
  import os
  import pandas as pd
+ import requests
+ import json
  from io import StringIO
  from datetime import datetime

+ from src.assets.text_content import REPO

  def get_github_data():
      """
+     Read and process data from CSV files hosted on GitHub (https://github.com/clembench/clembench-runs).
+
      Returns:
+         github_data (dict): Dictionary containing:
+             - "text": List of DataFrames for each version's textual leaderboard data.
+             - "multimodal": List of DataFrames for each version's multimodal leaderboard data.
+             - "date": Formatted date of the latest version in "DD Month YYYY" format.
      """
+     base_repo = REPO
+     json_url = base_repo + "benchmark_runs.json"
+     response = requests.get(json_url)
+
+     # Check if the JSON file request was successful
+     if response.status_code != 200:
+         print(f"Failed to read JSON file: Status Code: {response.status_code}")
+         return None
+
+     json_data = response.json()
+     versions = json_data['versions']
+
+     # Sort version names - latest first
+     version_names = sorted(
+         [ver['version'] for ver in versions],
+         key=lambda v: float(v[1:]),
+         reverse=True
+     )
+     print(f"Found {len(version_names)} versions from get_github_data(): {version_names}.")
+
+     # Get the last updated date of the latest version
+     latest_version = version_names[0]
+     latest_date = next(
+         ver['date'] for ver in versions if ver['version'] == latest_version
+     )
+     formatted_date = datetime.strptime(latest_date, "%Y/%m/%d").strftime("%d %b %Y")
+
+     # Get Leaderboard data - for text-only + multimodal
+     github_data = {}
+
+     # Collect DataFrames
+     text_dfs = []
+     mm_dfs = []
+
+     for version in version_names:
+         # Collect CSV data in descending order of clembench-runs versions
+         # Collect Text-only data
+         text_url = f"{base_repo}{version}/results.csv"
+         csv_response = requests.get(text_url)
+         if csv_response.status_code == 200:
+             df = pd.read_csv(StringIO(csv_response.text))
+             df = process_df(df)
+             df = df.sort_values(by=df.columns[1], ascending=False)  # Sort by clemscore column
+             text_dfs.append(df)
+         else:
+             print(f"Failed to read Text-only leaderboard CSV file for version: {version}. Status Code: {csv_response.status_code}")
+
+         # Collect Multimodal data
+         if float(version[1:]) >= 1.6:
+             mm_url = f"{base_repo}{version}_multimodal/results.csv"
+             mm_response = requests.get(mm_url)
+             if mm_response.status_code == 200:
+                 df = pd.read_csv(StringIO(mm_response.text))
                  df = process_df(df)
+                 df = df.sort_values(by=df.columns[1], ascending=False)  # Sort by clemscore column
+                 mm_dfs.append(df)
+             else:
+                 print(f"Failed to read multimodal leaderboard CSV file for version: {version}: Status Code: {mm_response.status_code}. Please ignore this message if multimodal results are not available for this version")
+
+     github_data["text"] = text_dfs
+     github_data["multimodal"] = mm_dfs
+     github_data["date"] = formatted_date
+
+     return github_data
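For reference, the dictionary built above is consumed like this (a sketch; the date string is an example value):

data = get_github_data()
latest_text_df = data["text"][0]              # newest text-only leaderboard
latest_multimodal_df = data["multimodal"][0]  # newest multimodal leaderboard
print(data["date"])                           # e.g. "15 Jun 2024"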

  def process_df(df: pd.DataFrame) -> pd.DataFrame:
      """
+     Process dataframe:
+     - Convert datatypes to sort by "float" instead of "str"
+     - Remove repetition in model names
      - Update column names
+
      Args:
          df: Unprocessed Dataframe (after using update_cols)
+
      Returns:
          df: Processed Dataframe
      """

+     # Convert column values to float, apart from the model names column
+     for col in df.columns[1:]:
+         df[col] = pd.to_numeric(df[col], errors='coerce')
+
+     # Remove repetition in model names
+     df[df.columns[0]] = df[df.columns[0]].str.replace('-t0.0', '', regex=True)
+     df[df.columns[0]] = df[df.columns[0]].apply(lambda x: '--'.join(set(x.split('--'))))

      # Update column names
+     custom_column_names = ['Model', 'Clemscore', '% Played', 'Quality Score']
+     for i, col in enumerate(df.columns[4:]):  # Start capitalizing from the 5th column
+         parts = col.split(',')
+         custom_name = f"{parts[0].strip().capitalize()} {parts[1].strip()}"
+         custom_column_names.append(custom_name)
+
+     # Rename columns
+     df.columns = custom_column_names
+
      return df


+ def query_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
      """
+     Filter the dataframe based on the search query.
+
      Args:
+         df (pd.DataFrame): Unfiltered dataframe.
+         query (str): A string of queries separated by ";".
+     Returns:
+         pd.DataFrame: Filtered dataframe containing searched queries in the 'Model' column.
      """
+     if not query.strip():  # Reset Dataframe if an empty query is passed
          return df

+     queries = [q.strip().lower() for q in query.split(';') if q.strip()]  # Normalize and split queries
+
+     # Filter dataframe based on queries in 'Model' column
+     filtered_df = df[df['Model'].str.lower().str.contains('|'.join(queries))]
+
      return filtered_df
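A quick worked example of query_search on a toy frame (column names assume process_df has already run):

import pandas as pd

df = pd.DataFrame({"Model": ["gpt-4", "claude-3-opus", "gemma-7b-it"],
                   "Clemscore": [60.5, 58.2, 21.3]})
print(query_search(df, "gpt; gemma")["Model"].tolist())
# ['gpt-4', 'gemma-7b-it'] - the joined regex ORs the queries together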
src/plot_utils.py CHANGED
@@ -1,22 +1,31 @@
  import pandas as pd
  import plotly.express as px

- from src.assets.text_content import SHORT_NAMES

- def plotly_plot(df:pd.DataFrame, LIST:list, ALL:list, NAMES:list, LEGEND:list, MOBILE:list ):
-     '''
      Takes in a list of models for a plotly plot
      Args:
          df: A dummy dataframe of latest version
-         LIST: List of models to plot
-         ALL: Either [] or ["Show All Models"] - toggle view to plot all models
-         NAMES: Either [] or ["Show Names"] - toggle view to show model names on plot
-         LEGEND: Either [] or ["Show Legend"] - toggle view to show legend on plot
-         MOBILE: Either [] or ["Mobile View"] - toggle view for smaller screens
      Returns:
-         Fig: plotly figure
-     '''
-
      # Get list of all models and append short names column to df
      list_columns = list(df.columns)
      ALL_LIST = list(df[list_columns[0]].unique())
@@ -24,25 +33,24 @@ def plotly_plot(df:pd.DataFrame, LIST:list, ALL:list, NAMES:list, LEGEND:list, M
      list_short_names = list(short_names.values())
      df["Short"] = list_short_names

-     if ALL:
          LIST = ALL_LIST
      # Filter dataframe based on the provided list of models
      df = df[df[list_columns[0]].isin(LIST)]
-

-     if NAMES:
          fig = px.scatter(df, x=list_columns[2], y=list_columns[3], color=list_columns[0], symbol=list_columns[0],
-                          color_discrete_map={"category1": "blue", "category2": "red"},
-                          hover_name=list_columns[0], template="plotly_white", text="Short")
          fig.update_traces(textposition='top center')
      else:
          fig = px.scatter(df, x=list_columns[2], y=list_columns[3], color=list_columns[0], symbol=list_columns[0],
-                          color_discrete_map={"category1": "blue", "category2": "red"},
-                          hover_name=list_columns[0], template="plotly_white")
-
-     if not LEGEND:
          fig.update_layout(showlegend=False)
-
      fig.update_layout(
          xaxis_title='% Played',
          yaxis_title='Quality Score',
@@ -53,11 +61,10 @@ def plotly_plot(df:pd.DataFrame, LIST:list, ALL:list, NAMES:list, LEGEND:list, M
      fig.update_xaxes(range=[-5, 105])
      fig.update_yaxes(range=[-5, 105])

-     if MOBILE:
          fig.update_layout(height=300)

-
-     if MOBILE and LEGEND:
          fig.update_layout(height=450)
          fig.update_layout(legend=dict(
              yanchor="bottom",
@@ -75,28 +82,6 @@ def plotly_plot(df:pd.DataFrame, LIST:list, ALL:list, NAMES:list, LEGEND:list, M
      return fig


- # ['Model', 'Clemscore', 'All(Played)', 'All(Quality Score)']
- def compare_plots(df: pd.DataFrame, LIST1: list, LIST2: list, ALL:list, NAMES:list, LEGEND: list, MOBILE: list):
-     '''
-     Quality Score v/s % Played plot by selecting models
-     Args:
-         df: A dummy dataframe of latest version
-         LIST1: The list of open source models to show in the plot, updated from frontend
-         LIST2: The list of commercial models to show in the plot, updated from frontend
-         ALL: Either [] or ["Show All Models"] - toggle view to plot all models
-         NAMES: Either [] or ["Show Names"] - toggle view to show model names on plot
-         LEGEND: Either [] or ["Show Legend"] - toggle view to show legend on plot
-         MOBILE: Either [] or ["Mobile View"] - toggle view for smaller screens
-     Returns:
-         fig: The plot
-     '''
-
-     # Combine lists for Open source and commercial models
-     LIST = LIST1 + LIST2
-     fig = plotly_plot(df, LIST, ALL, NAMES, LEGEND, MOBILE)
-
-     return fig
-
  def shorten_model_name(full_name):
      # Split the name into parts
      parts = full_name.split('-')
@@ -111,19 +96,20 @@ def shorten_model_name(full_name):
      short_name = '-'.join(short_name_parts)

      # Remove any leading or trailing hyphens
-     short_name = full_name[0] + '-'+ short_name.strip('-')

      return short_name

  def label_map(model_list: list) -> dict:
-     '''
      Generate a map from long names to short names, to plot them in frontend graph
      Define the short names in src/assets/text_content.py
      Args:
          model_list: A list of long model names
      Returns:
          short_name: A dict from long to short name
-     '''
      short_names = {}
      for model_name in model_list:
          if model_name in SHORT_NAMES:
@@ -135,20 +121,167 @@ def label_map(model_list: list) -> dict:
          short_names[model_name] = short_name

      return short_names
-
- def split_models(MODEL_LIST: list):
-     '''
      Split the models into open source and commercial
-     '''
      open_models = []
-     comm_models = []

-     for model in MODEL_LIST:
-         if model.startswith(('gpt-', 'claude-', 'command')):
-             comm_models.append(model)
-         else:
-             open_models.append(model)

      open_models.sort(key=lambda o: o.upper())
-     comm_models.sort(key=lambda c: c.upper())
-     return open_models, comm_models
  import pandas as pd
  import plotly.express as px
+ import requests
+ import json
+ import gradio as gr

+ from src.assets.text_content import SHORT_NAMES, TEXT_NAME, MULTIMODAL_NAME
+ from src.leaderboard_utils import get_github_data

+
+ def plotly_plot(df: pd.DataFrame, list_op: list, list_co: list,
+                 show_all: list, show_names: list, show_legend: list,
+                 mobile_view: list):
+     """
      Takes in a list of models for a plotly plot
      Args:
          df: A dummy dataframe of latest version
+         list_op: The list of open source models to show in the plot, updated from frontend
+         list_co: The list of commercial models to show in the plot, updated from frontend
+         show_all: Either [] or ["Show All Models"] - toggle view to plot all models
+         show_names: Either [] or ["Show Names"] - toggle view to show model names on plot
+         show_legend: Either [] or ["Show Legend"] - toggle view to show legend on plot
+         mobile_view: Either [] or ["Mobile View"] - toggle view for smaller screens
      Returns:
+         Fig: plotly figure of % played v/s quality score
+     """
+
+     LIST = list_op + list_co
      # Get list of all models and append short names column to df
      list_columns = list(df.columns)
      ALL_LIST = list(df[list_columns[0]].unique())
@@ -24,25 +33,24 @@ def plotly_plot(df: pd.DataFrame, list_op: list, list_co: list,
      list_short_names = list(short_names.values())
      df["Short"] = list_short_names

+     if show_all:
          LIST = ALL_LIST
      # Filter dataframe based on the provided list of models
      df = df[df[list_columns[0]].isin(LIST)]

+     if show_names:
          fig = px.scatter(df, x=list_columns[2], y=list_columns[3], color=list_columns[0], symbol=list_columns[0],
+                          color_discrete_map={"category1": "blue", "category2": "red"},
+                          hover_name=list_columns[0], template="plotly_white", text="Short")
          fig.update_traces(textposition='top center')
      else:
          fig = px.scatter(df, x=list_columns[2], y=list_columns[3], color=list_columns[0], symbol=list_columns[0],
+                          color_discrete_map={"category1": "blue", "category2": "red"},
+                          hover_name=list_columns[0], template="plotly_white")
+
+     if not show_legend:
          fig.update_layout(showlegend=False)
+
      fig.update_layout(
          xaxis_title='% Played',
          yaxis_title='Quality Score',
@@ -53,11 +61,10 @@ def plotly_plot(df: pd.DataFrame, list_op: list, list_co: list,
      fig.update_xaxes(range=[-5, 105])
      fig.update_yaxes(range=[-5, 105])

+     if mobile_view:
          fig.update_layout(height=300)

+     if mobile_view and show_legend:
          fig.update_layout(height=450)
          fig.update_layout(legend=dict(
              yanchor="bottom",
@@ -75,28 +82,6 @@ def plotly_plot(df: pd.DataFrame, list_op: list, list_co: list,
      return fig


  def shorten_model_name(full_name):
      # Split the name into parts
      parts = full_name.split('-')
@@ -111,19 +96,20 @@ def shorten_model_name(full_name):
      short_name = '-'.join(short_name_parts)

      # Remove any leading or trailing hyphens
+     short_name = full_name[0] + '-' + short_name.strip('-')

      return short_name

+
  def label_map(model_list: list) -> dict:
+     """
      Generate a map from long names to short names, to plot them in frontend graph
      Define the short names in src/assets/text_content.py
      Args:
          model_list: A list of long model names
      Returns:
          short_name: A dict from long to short name
+     """
      short_names = {}
      for model_name in model_list:
          if model_name in SHORT_NAMES:
@@ -135,20 +121,167 @@ def label_map(model_list: list) -> dict:
          short_names[model_name] = short_name

      return short_names
+
+
+ def split_models(model_list: list):
+     """
      Split the models into open source and commercial
+     """
+
      open_models = []
+     commercial_models = []
+     open_backends = {"huggingface_local", "huggingface_multimodal", "openai_compatible"}  # Define backends considered as open

+     # Load model registry data from main repo
+     model_registry_url = "https://raw.githubusercontent.com/clp-research/clembench/main/backends/model_registry.json"
+     response = requests.get(model_registry_url)
+
+     if response.status_code == 200:
+         json_data = json.loads(response.text)
+         # Classify as Open or Commercial based on the defined backend in the model registry
+         backend_mapping = {}
+
+         for model_name in model_list:
+             model_prefix = model_name.split('-')[0]  # Get the prefix part of the model name
+             for entry in json_data:
+                 if entry["model_name"].startswith(model_prefix):
+                     backend = entry["backend"]
+                     # Classify based on backend
+                     if backend in open_backends:
+                         open_models.append(model_name)
+                     else:
+                         commercial_models.append(model_name)
+                     break
+
+     else:
+         print(f"Failed to read JSON file: Status Code : {response.status_code}")

      open_models.sort(key=lambda o: o.upper())
+     commercial_models.sort(key=lambda c: c.upper())
+
+     # Add missing model from the model_registry
+     if "dolphin-2.5-mixtral-8x7b" in model_list:
+         open_models.append("dolphin-2.5-mixtral-8x7b")
+
+     return open_models, commercial_models
+
+ """
+ Update Functions, for when the leaderboard selection changes
+ """
+ def update_open_models(leaderboard: str = TEXT_NAME):
+     """
+     Change the checkbox group of Open Models based on the leaderboard selected
+
+     Args:
+         leaderboard: Selected leaderboard from the frontend [Default - Text Leaderboard]
+     Return:
+         Updated checkbox group for Open Models, based on the leaderboard selected
+     """
+     github_data = get_github_data()
+     leaderboard_data = github_data["text" if leaderboard == TEXT_NAME else "multimodal"][0]
+     models = leaderboard_data.iloc[:, 0].unique().tolist()
+     open_models, commercial_models = split_models(models)
+     return gr.CheckboxGroup(
+         open_models,
+         value=[],
+         elem_id="value-select-1",
+         interactive=True,
+     )
+
+ def update_closed_models(leaderboard: str = TEXT_NAME):
+     """
+     Change the checkbox group of Closed Models based on the leaderboard selected
+
+     Args:
+         leaderboard: Selected leaderboard from the frontend [Default - Text Leaderboard]
+     Return:
+         Updated checkbox group for Closed Models, based on the leaderboard selected
+     """
+     github_data = get_github_data()
+     leaderboard_data = github_data["text" if leaderboard == TEXT_NAME else "multimodal"][0]
+     models = leaderboard_data.iloc[:, 0].unique().tolist()
+     open_models, commercial_models = split_models(models)
+     return gr.CheckboxGroup(
+         commercial_models,
+         value=[],
+         elem_id="value-select-2",
+         interactive=True,
+     )
+
+ def get_plot_df(leaderboard: str = TEXT_NAME) -> pd.DataFrame:
+     """
+     Get the DataFrame for plotting based on the selected leaderboard.
+     Args:
+         leaderboard: Selected leaderboard.
+     Returns:
+         DataFrame with model data.
+     """
+     github_data = get_github_data()
+     return github_data["text" if leaderboard == TEXT_NAME else "multimodal"][0]
+
+
+ """
+ Reset Functions for when the Leaderboard selection changes
+ """
+ def reset_show_all():
+     return gr.CheckboxGroup(
+         ["Select All Models"],
+         label="Show plot for all models 🤖",
+         value=[],
+         elem_id="value-select-3",
+         interactive=True,
+     )
+
+ def reset_show_names():
+     return gr.CheckboxGroup(
+         ["Show Names"],
+         label="Show names of models on the plot 🏷️",
+         value=[],
+         elem_id="value-select-4",
+         interactive=True,
+     )
+
+
+ def reset_show_legend():
+     return gr.CheckboxGroup(
+         ["Show Legend"],
+         label="Show legend on the plot 💡",
+         value=[],
+         elem_id="value-select-5",
+         interactive=True,
+     )
+
+
+ def reset_mobile_view():
+     return gr.CheckboxGroup(
+         ["Mobile View"],
+         label="View plot on smaller screens 📱",
+         value=[],
+         elem_id="value-select-6",
+         interactive=True,
+     )
+
+
+ if __name__ == '__main__':
+     mm_model_list = ['gpt-4o-2024-05-13', 'gpt-4-1106-vision-preview', 'claude-3-opus-20240229', 'gemini-1.5-pro-latest',
+                      'gemini-1.5-flash-latest', 'llava-v1.6-34b-hf', 'llava-v1.6-vicuna-13b-hf', 'idefics-80b-instruct',
+                      'llava-1.5-13b-hf', 'idefics-9b-instruct']
+
+     text_model_list = ['vicuna-33b-v1.3', 'gpt-4-0125-preview', 'gpt-4-turbo-2024-04-09', 'claude-3-5-sonnet-20240620', 'gpt-4-1106-preview',
+                        'gpt-4-0613', 'gpt-4o-2024-05-13', 'claude-3-opus-20240229', 'gemini-1.5-pro-latest',
+                        'Meta-Llama-3-70B-Instruct-hf', 'claude-2.1', 'gemini-1.5-flash-latest', 'claude-3-sonnet-20240229',
+                        'Qwen1.5-72B-Chat', 'mistral-large-2402', 'gpt-3.5-turbo-0125', 'gemini-1.0-pro', 'command-r-plus', 'openchat_3.5',
+                        'claude-3-haiku-20240307', 'sheep-duck-llama-2-70b-v1.1', 'Meta-Llama-3-8B-Instruct-hf', 'openchat-3.5-1210',
+                        'WizardLM-70b-v1.0', 'openchat-3.5-0106', 'Qwen1.5-14B-Chat', 'mistral-medium-2312', 'Qwen1.5-32B-Chat',
+                        'codegemma-7b-it', 'dolphin-2.5-mixtral-8x7b', 'CodeLlama-34b-Instruct-hf', 'command-r', 'gemma-1.1-7b-it',
+                        'SUS-Chat-34B', 'Mixtral-8x22B-Instruct-v0.1', 'tulu-2-dpo-70b', 'Nous-Hermes-2-Mixtral-8x7B-SFT',
+                        'WizardLM-13b-v1.2', 'Mistral-7B-Instruct-v0.2', 'Yi-34B-Chat', 'Mixtral-8x7B-Instruct-v0.1',
+                        'Mistral-7B-Instruct-v0.1', 'Yi-1.5-34B-Chat', 'vicuna-13b-v1.5', 'Yi-1.5-6B-Chat', 'Starling-LM-7B-beta',
+                        'sheep-duck-llama-2-13b', 'Yi-1.5-9B-Chat', 'gemma-1.1-2b-it', 'Qwen1.5-7B-Chat', 'gemma-7b-it',
+                        'llama-2-70b-chat-hf', 'Qwen1.5-0.5B-Chat', 'Qwen1.5-1.8B-Chat']
+
+     om, cm = split_models(mm_model_list)
+     print("Open")
+     print(om)
+     print("Closed")
+     print(cm)
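The update_* and reset_* helpers above lean on a Gradio 4.x idiom: an event handler may return a freshly constructed component, and Gradio applies its properties to whatever is wired as the output. A minimal sketch of that idiom (component names are illustrative, not taken from the app):

import gradio as gr

def reset_box():
    # Returning a component instance overwrites the output component's properties
    return gr.CheckboxGroup(["Show Names"], value=[], interactive=True)

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(["text", "multimodal"], value="text")
    box = gr.CheckboxGroup(["Show Names"], value=["Show Names"])
    dropdown.change(reset_box, outputs=[box], queue=True)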
src/version_utils.py ADDED
@@ -0,0 +1,95 @@
+ ## REQUIRED OUTPUT ###
+ # A list of version names -> v1.6, v1.6_multimodal, v1.6_quantized, v1.5, v0.9, etc.
+ # A corresponding DataFrame?
+
+ import requests
+ from datetime import datetime
+ import pandas as pd
+ import json
+ from io import StringIO
+
+ from src.leaderboard_utils import process_df
+ from src.assets.text_content import REPO
+
+ def get_versions_data():
+     """
+     Read and process data from CSV files of all available versions hosted on GitHub (https://github.com/clembench/clembench-runs).
+
+     Returns:
+         versions_data: Dictionary with a "latest" key (name of the newest version), a "date" key
+             (formatted date of the latest version), and one DataFrame per version, keyed by the
+             version name plus a "_multimodal"/"_quantized" suffix where those results exist.
+     """
+     base_repo = REPO
+     json_url = base_repo + "benchmark_runs.json"
+     response = requests.get(json_url)
+
+     # Check if the JSON file request was successful
+     if response.status_code != 200:
+         print(f"Failed to read JSON file: Status Code: {response.status_code}")
+         return None
+
+     json_data = response.json()
+     versions = json_data['versions']
+
+     # Sort version names - latest first
+     version_names = sorted(
+         [ver['version'] for ver in versions],
+         key=lambda v: float(v[1:]),
+         reverse=True
+     )
+     print(f"Found {len(version_names)} versions from get_versions_data(): {version_names}.")
+
+     # Get the last updated date of the latest version
+     latest_version = version_names[0]
+     latest_date = next(
+         ver['date'] for ver in versions if ver['version'] == latest_version
+     )
+     formatted_date = datetime.strptime(latest_date, "%Y/%m/%d").strftime("%d %b %Y")
+
+     # Get versions data
+     versions_data = {"latest": latest_version, "date": formatted_date}
+
+     for version in version_names:
+         text_url = f"{base_repo}{version}/results.csv"
+         mm_url = f"{base_repo}{version}_multimodal/results.csv"
+         quant_url = f"{base_repo}{version}_quantized/results.csv"
+
+         # Text data
+         response = requests.get(text_url)
+         if response.status_code == 200:
+             df = pd.read_csv(StringIO(response.text))
+             df = process_df(df)
+             df = df.sort_values(by=df.columns[1], ascending=False)  # Sort by clemscore column
+             versions_data[version] = df
+         else:
+             print(f"Failed to read Text-only leaderboard CSV file for version: {version}. Status Code: {response.status_code}")
+
+         # Multimodal data
+         mm_response = requests.get(mm_url)
+         if mm_response.status_code == 200:
+             mm_df = pd.read_csv(StringIO(mm_response.text))
+             mm_df = process_df(mm_df)
+             mm_df = mm_df.sort_values(by=mm_df.columns[1], ascending=False)  # Sort by clemscore column
+             versions_data[version + "_multimodal"] = mm_df
+         else:
+             print(f"Failed to read multimodal leaderboard CSV file for version: {version}: Status Code: {mm_response.status_code}. Please ignore this message if multimodal results are not available for this version")
+
+         # Quantized data
+         q_response = requests.get(quant_url)
+         if q_response.status_code == 200:
+             q_df = pd.read_csv(StringIO(q_response.text))
+             q_df = process_df(q_df)
+             q_df = q_df.sort_values(by=q_df.columns[1], ascending=False)  # Sort by clemscore column
+             versions_data[version + "_quantized"] = q_df
+         else:
+             print(f"Failed to read quantized leaderboard CSV file for version: {version}: Status Code: {q_response.status_code}. Please ignore this message if quantized results are not available for this version")
+
+     return versions_data
+
+
+ if __name__ == "__main__":
+     versions_data = get_versions_data()
+     print(versions_data.keys())
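For orientation, the returned dictionary ends up keyed roughly as follows (version names illustrative; the suffixed entries exist only where the corresponding results.csv was found):

# dict_keys(['latest', 'date', 'v1.6', 'v1.6_multimodal', 'v1.6_quantized',
#            'v1.5', 'v1.0', 'v0.9', ...])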