Koshti10 committed on
Commit
8d31234
·
verified ·
1 Parent(s): d9fe49a

Upload 8 files

Browse files
Files changed (4) hide show
  1. LICENSE +21 -0
  2. app.py +33 -3
  3. src/assets/text_content.py +3 -1
  4. src/trend_utils.py +357 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 clembench
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
app.py CHANGED
@@ -9,6 +9,7 @@ from src.leaderboard_utils import query_search, get_github_data
9
  from src.plot_utils import split_models, plotly_plot, get_plot_df, update_open_models, update_closed_models
10
  from src.plot_utils import reset_show_all, reset_show_names, reset_show_legend, reset_mobile_view
11
  from src.version_utils import get_versions_data
 
12
 
13
  """
14
  CONSTANTS
@@ -150,7 +151,7 @@ with hf_app:
150
  """
151
  ####################### THIRD TAB - PLOTS - %PLAYED V/S QUALITY SCORE #######################
152
  """
153
- with gr.TabItem("📈 Plots", elem_id="plots", id=2):
154
  """
155
  DropDown Select for Text/Multimodal Leaderboard
156
  """
@@ -342,9 +343,38 @@ with hf_app:
342
  )
343
 
344
  """
345
- ####################### FOURTH TAB - VERSIONS AND DETAILS #######################
346
  """
347
- with gr.TabItem("🔄 Versions and Details", elem_id="versions-details-tab", id=3):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  with gr.Row():
349
  version_select = gr.Dropdown(
350
  version_names, label="Select Version 🕹️", value=latest_version
 
9
  from src.plot_utils import split_models, plotly_plot, get_plot_df, update_open_models, update_closed_models
10
  from src.plot_utils import reset_show_all, reset_show_names, reset_show_legend, reset_mobile_view
11
  from src.version_utils import get_versions_data
12
+ from src.trend_utils import get_text_trend_plot, get_final_trend_plot
13
 
14
  """
15
  CONSTANTS
 
151
  """
152
  ####################### THIRD TAB - PLOTS - %PLAYED V/S QUALITY SCORE #######################
153
  """
154
+ with gr.TabItem("📊 Plots", elem_id="plots", id=2):
155
  """
156
  DropDown Select for Text/Multimodal Leaderboard
157
  """
 
343
  )
344
 
345
  """
346
+ ####################### FOURTH TAB - TRENDS #######################
347
  """
348
+ with gr.TabItem("📈Trends", elem_id="trends-tab", id=3):
349
+ with gr.Row():
350
+ mkd_text = gr.Markdown("### Commercial v/s Open-Weight models - clemscore over time. The size of the circles represents the scaled value of the parameters of the models. Larger circles indicate higher parameter values.")
351
+
352
+ with gr.Row():
353
+ trend_select = gr.Dropdown(
354
+ choices=["text", "multimodal"],
355
+ value="text",
356
+ label="Select Benchmark 🔍",
357
+ elem_id="value-select-7",
358
+ interactive=True,
359
+ )
360
+
361
+ with gr.Row():
362
+ trend_plot = gr.Plot(get_text_trend_plot(),
363
+ label="Trend over time")
364
+
365
+ trend_select.change(
366
+ get_final_trend_plot,
367
+ [trend_select],
368
+ [trend_plot],
369
+ queue=True
370
+ )
371
+
372
+
373
+
374
+ """
375
+ ####################### FIFTH TAB - VERSIONS AND DETAILS #######################
376
+ """
377
+ with gr.TabItem("🔄 Versions and Details", elem_id="versions-details-tab", id=4):
378
  with gr.Row():
379
  version_select = gr.Dropdown(
380
  version_names, label="Select Version 🕹️", value=latest_version
src/assets/text_content.py CHANGED
@@ -1,6 +1,8 @@
1
  TITLE = """<h1 align="center" id="space-title"> 🏆 CLEM Leaderboard</h1>"""
2
 
3
  REPO = "https://raw.githubusercontent.com/clembench/clembench-runs/main/"
 
 
4
  HF_REPO = "colab-potsdam/clem-leaderboard"
5
 
6
  TEXT_NAME = "🥇 CLEM Leaderboard"
@@ -13,7 +15,7 @@ The CLEM Leaderboard aims to track, rank and evaluate current cLLMs (chat-optimi
13
 
14
  The benchmarking approach is described in [Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents](https://aclanthology.org/2023.emnlp-main.689.pdf).
15
 
16
- The multimodal benchmark is described in [Two Giraffes in a Dirt Field: Using Game Play to Investigate Situation Modelling in Large Multimodal Models](https://arxiv.org/abs/2406.14035)
17
 
18
  Source code for benchmarking "clems" is available here: [Clembench](https://github.com/clembench/clembench)
19
 
 
1
  TITLE = """<h1 align="center" id="space-title"> 🏆 CLEM Leaderboard</h1>"""
2
 
3
  REPO = "https://raw.githubusercontent.com/clembench/clembench-runs/main/"
4
+ REGISTRY_URL = "https://raw.githubusercontent.com/kushal-10/clembench/feat/registry/backends/model_registry_updated.json"
5
+
6
  HF_REPO = "colab-potsdam/clem-leaderboard"
7
 
8
  TEXT_NAME = "🥇 CLEM Leaderboard"
 
15
 
16
  The benchmarking approach is described in [Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents](https://aclanthology.org/2023.emnlp-main.689.pdf).
17
 
18
+ The multimodal benchmark is described in [Using Game Play to Investigate Multimodal and Conversational Grounding in Large Multimodal Models](https://arxiv.org/abs/2406.14035)
19
 
20
  Source code for benchmarking "clems" is available here: [Clembench](https://github.com/clembench/clembench)
21
 
src/trend_utils.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Fetch Model Registry and clemscores
2
+ import requests
3
+ import pandas as pd
4
+ from datetime import datetime
5
+ import pandas as pd
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ import numpy as np
9
+
10
+ from src.assets.text_content import REGISTRY_URL, REPO
11
+ from src.leaderboard_utils import get_github_data
12
+
13
+
14
# Both downloads run at import time. A timeout keeps a stalled connection from
# hanging the app on startup, and raise_for_status() reports an HTTP failure
# directly instead of failing later while decoding a non-JSON error body.
REQUEST_TIMEOUT_SECONDS = 30

# Fetch Model Registry
response = requests.get(REGISTRY_URL, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()
model_registry_data = response.json()
model_registry_df = pd.DataFrame(model_registry_data)

# Fetch the benchmark versions; the trend plots use them as custom tick labels
base_repo = REPO
json_url = base_repo + "benchmark_runs.json"
response = requests.get(json_url, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()

json_data = response.json()
versions = json_data['versions']
29
+
30
+
31
def get_param_size(params: str) -> float:
    """Convert a parameter-count string into a number of billions.

    Args:
        params (str): The parameter size as a string with a unit suffix,
            'B' for billions or 'T' for trillions (e.g., '7B', '1T').
            An empty string or None means the size is unknown.

    Returns:
        float: The parameter count in billions, or 0.0 when the size is
            missing or cannot be parsed.
    """
    if not params:
        return 0.0

    # Multiplier that converts each supported suffix to billions.
    unit_scale = {"B": 1.0, "T": 1000.0}
    scale = unit_scale.get(params[-1])

    if scale is not None:
        try:
            return float(params[:-1]) * scale
        except ValueError:
            # Numeric part is not a number (e.g. '8x7B'); handled below.
            pass

    # Unsupported suffix or unparsable number: report it and fall back to
    # "unknown size" so one bad registry entry does not abort the caller.
    print("Not a valid parameter size")
    return 0.0
54
+
55
def date_difference(date_str1: str, date_str2: str) -> int:
    """Return how many days `date_str1` lies after `date_str2`.

    Args:
        date_str1 (str): The first date as a string in 'YYYY-MM-DD' format.
        date_str2 (str): The second date as a string in 'YYYY-MM-DD' format.

    Returns:
        int: `date_str1` minus `date_str2` in whole days; negative when
            `date_str1` is the earlier date.
    """
    first, second = (
        datetime.strptime(text, "%Y-%m-%d") for text in (date_str1, date_str2)
    )
    delta = first - second
    return delta.days
69
+
70
+
71
def populate_list(df: pd.DataFrame, abs_diff: float) -> list:
    """Pick the models that make up the trendline for one group of models.

    Rows are walked in the order given. A row is kept when its clemscore
    minus the clemscore of the most recently kept row is at least
    `abs_diff`, so a negative `abs_diff` tolerates a dip of that size.
    A kept row released on the same date as the previously kept row
    replaces it instead of being added.

    Args:
        df (pd.DataFrame): DataFrame with 'model', 'clemscore' and
            'release_date' ('YYYY-MM-DD' strings) columns.
        abs_diff (float): Minimum clemscore change, relative to the last kept
            model, for a model to be kept.

    Returns:
        list: Names of the kept models; empty when `df` has no rows.
    """
    # Nothing to select from, e.g. no model of this type passed the date filter.
    if df.empty:
        return []

    selected = [df.iloc[0]['model']]
    prev_clemscore = df.iloc[0]['clemscore']
    prev_date = df.iloc[0]['release_date']

    for i in range(1, len(df)):
        row = df.iloc[i]
        curr_clemscore = row['clemscore']
        curr_date = row['release_date']

        if curr_clemscore - prev_clemscore >= abs_diff:
            if date_difference(curr_date, prev_date) == 0:
                # Same release date as the last kept model: replace it.
                selected[-1] = row['model']
            else:
                selected.append(row['model'])

            # The comparison baseline only moves when a model is kept.
            prev_clemscore = curr_clemscore
            prev_date = curr_date

    # Add the last model if it was released more than 15 days after the last
    # kept model, so the trendline extends to the most recent release.
    last_date = df.iloc[-1]['release_date']
    if date_difference(last_date, prev_date) > 15:
        selected.append(df.iloc[-1]['model'])

    return selected
105
+
106
+
107
def get_models_to_display(result_df: pd.DataFrame, open_diff: float = -0.5, comm_diff: float = -10) -> tuple:
    """Choose the open-weight and commercial models that form the trendlines.

    Args:
        result_df (pd.DataFrame): DataFrame containing model data, including
            the 'open_weight' and 'release_date' columns.
        open_diff (float, optional): Threshold for open models. Defaults to -0.5.
        comm_diff (float, optional): Threshold for commercial models. Defaults to -10.

    Returns:
        tuple: Two lists of model names (open and commercial).
    """
    def trendline_models(is_open: bool, threshold: float) -> list:
        # Select one group, order it by release date, then pick its trendline models.
        group = result_df[result_df['open_weight'] == is_open]
        ordered = group.sort_values(by='release_date', ascending=True)
        return populate_list(ordered, threshold)

    open_models = trendline_models(True, open_diff)
    comm_models = trendline_models(False, comm_diff)
    return open_models, comm_models
126
+
127
+
128
def get_trend_data(text_dfs: list, model_registry_data: list) -> pd.DataFrame:
    """Join leaderboard rows with their model registry entries.

    Each model name is processed once, at its first occurrence across
    `text_dfs`; models without a registry entry are left out.

    Args:
        text_dfs (list): List of DataFrames with 'Model' and 'Clemscore' columns.
        model_registry_data (list): List of registry dictionaries with the keys
            'model_name', 'parameters', 'open_weight' and 'release_date'.

    Returns:
        pd.DataFrame: One row per matched model with the columns 'model',
            'clemscore', 'open_weight', 'release_date', 'parameters' and 'est_flag'.
    """
    trend_df = pd.DataFrame(
        columns=['model', 'clemscore', 'open_weight', 'release_date', 'parameters', 'est_flag']
    )
    seen_models = set()

    for leaderboard_df in text_dfs:
        for row_idx in range(len(leaderboard_df)):
            model_name = leaderboard_df['Model'].iloc[row_idx]
            if model_name in seen_models:
                continue
            seen_models.add(model_name)

            # First registry entry with this name, if any.
            entry = next(
                (candidate for candidate in model_registry_data
                 if candidate["model_name"] == model_name),
                None,
            )
            if entry is None:
                continue

            # An empty parameter string gets a placeholder size and is flagged as estimated.
            est_flag = entry["parameters"] == ""
            if est_flag:
                params = "1000B"
                print(f"EST PARAMS for {model_name}: {params}")
            else:
                params = entry['parameters']

            trend_df.loc[len(trend_df)] = {
                'model': model_name,
                'clemscore': leaderboard_df['Clemscore'].iloc[row_idx],
                'open_weight': entry['open_weight'],
                'release_date': entry['release_date'],
                'parameters': get_param_size(params),
                'est_flag': est_flag,
            }

    return trend_df
162
+
163
+
164
def get_plot(df: pd.DataFrame, start_date: str = '2023-06-01', end_date: str = '2024-12-30',
             open_diff: float = -0.5, comm_diff: float = -10, benchmark_ticks: dict = None, data: str = "text") -> go.Figure:
    """Generate the clemscore-over-time scatter plot with trendlines.

    Args:
        df (pd.DataFrame): DataFrame containing model data with the columns 'model',
            'clemscore', 'open_weight', 'release_date' and 'parameters'. It is not modified.
        start_date (str, optional): Models released before this date are dropped. Defaults to '2023-06-01'.
        end_date (str, optional): End of the generated two-month x-axis ticks; it does not filter models. Defaults to '2024-12-30'.
        open_diff (float, optional): Threshold for open models. Defaults to -0.5. The threshold is the allowed dip in clemscore for the trendline to be plotted.
        comm_diff (float, optional): Threshold for commercial models. Defaults to -10. The threshold is the allowed dip in clemscore for the trendline to be plotted.
        benchmark_ticks (dict, optional): Maps benchmark version dates to tick labels; each date also gets a vertical dashed line. Defaults to None (no benchmark ticks).
        data (str, optional): Name of the benchmark. Currently not used by the plot. Defaults to "text".

    Returns:
        go.Figure: The generated plot.
    """
    # A None default avoids sharing one dict object between calls.
    if benchmark_ticks is None:
        benchmark_ticks = {}

    # The y-axis range is based on all models, including those filtered out below.
    max_clemscore = df['clemscore'].max()

    # assign() returns a new frame, so the caller's DataFrame does not gain a column.
    df = df.assign(**{'Release date': pd.to_datetime(df['release_date'], format='ISO8601')})
    # Filter out models released before start_date; copy so the column
    # assignments below write to an independent frame rather than a slice.
    df = df[df['Release date'] >= pd.to_datetime(start_date)].copy()

    open_model_list, comm_model_list = get_models_to_display(df, open_diff, comm_diff)
    models_to_display = open_model_list + comm_model_list

    # Only the trendline models get a text label next to their marker
    df['label_model'] = df['model'].apply(lambda x: x if x in models_to_display else "")
    # Human-readable group name used for colouring
    df['Model Type'] = df['open_weight'].map({True: 'Open-Weight', False: 'Commercial'})

    # Marker area grows with sqrt(parameters); unknown sizes (0) get an arbitrary
    # fallback size and an open circle.
    marker_size = df['parameters'].apply(lambda x: np.sqrt(x) if x > 0 else np.sqrt(200)).astype(float)
    marker_symbol = df['parameters'].apply(lambda x: 'circle' if x > 0 else 'circle-open')

    open_color = 'red'
    comm_color = 'blue'

    # Create the scatter plot
    fig = px.scatter(df,
                     x="Release date",
                     y="clemscore",
                     color="Model Type",  # Differentiates the datasets by color
                     text="label_model",  # Adds text labels from the 'label_model' column
                     hover_name="model",
                     size=marker_size,
                     size_max=40,  # Max size of the circles
                     symbol=marker_symbol,
                     symbol_sequence=['circle'],
                     template="plotly_white")

    # Sort dataframes for line plotting
    df_open = df[df['model'].isin(open_model_list)].sort_values(by='Release date')
    df_commercial = df[df['model'].isin(comm_model_list)].sort_values(by='Release date')

    # Custom ticks for the x axis
    benchmark_tickvals = list(pd.to_datetime(list(benchmark_ticks.keys())))
    # Generate ticks every two months ('2MS' = 2 Months Start frequency)
    tick_start = pd.to_datetime(start_date)
    tick_end = pd.to_datetime(end_date)
    date_range = pd.date_range(start=tick_start, end=tick_end, freq='2MS')
    custom_ticks = {date: date.strftime('%b %Y') for date in date_range}

    # Benchmark ticks come first and win over a month tick on the same date
    combined_ticks = dict(benchmark_ticks)
    for tick_date, tick_label in custom_ticks.items():
        combined_ticks.setdefault(tick_date, tick_label)
    combined_tickvals = list(combined_ticks.keys())
    combined_ticktext = list(combined_ticks.values())

    # Plot Benchmark Ticks with Vertical Dotted Lines
    for date in benchmark_tickvals:
        fig.add_shape(
            go.layout.Shape(
                type='line',
                x0=date,
                x1=date,
                y0=0,
                y1=1,
                yref='paper',  # span the full plot height regardless of the y range
                line=dict(color='#A9A9A9', dash='dash')
            )
        )

    # Update x-axis with combined ticks
    fig.update_xaxes(
        tickvals=combined_tickvals,
        ticktext=combined_ticktext,
        tickangle=0
    )

    # Remove old legend entries
    for trace in fig.data:
        trace.showlegend = False

    # Add lines connecting the points for open models
    fig.add_trace(go.Scatter(x=df_open['Release date'], y=df_open['clemscore'],
                             mode='lines+markers', name='Open Models Trendline',
                             line=dict(color=open_color), showlegend=False))

    # Add lines connecting the points for commercial models
    fig.add_trace(go.Scatter(x=df_commercial['Release date'], y=df_commercial['clemscore'],
                             mode='lines+markers', name='Commercial Models Trendline',
                             line=dict(color=comm_color), showlegend=False))

    # Update layout to ensure text labels are visible
    fig.update_traces(textposition='top center')
    fig.update_yaxes(range=[0, max_clemscore + 10])

    # Update the figure size and x-axis title
    fig.update_layout(width=1400, height=1000,
                      xaxis_title='Release dates of models and clembench versions'
                      )

    # Add custom legend: empty traces that only contribute a legend entry
    fig.add_trace(go.Scatter(
        x=[None],
        y=[None],
        mode='markers',
        marker=dict(symbol='circle', color=open_color),
        legendgroup='marker',
        showlegend=True,
        name='Open-Weight Models'
    ))

    fig.add_trace(go.Scatter(
        x=[None],
        y=[None],
        mode='markers',
        marker=dict(symbol='circle', color=comm_color),
        legendgroup='marker',
        showlegend=True,
        name='Commercial Models'
    ))

    return fig
304
+
305
+
306
+
307
def get_text_trend_plot() -> go.Figure:
    """Build the trend plot for the text benchmark.

    Returns:
        go.Figure: The generated trend plot for text models.
    """
    trend_df = get_trend_data(get_github_data()['text'], model_registry_data)

    # One x-axis tick per benchmark version, labelled with the version name
    benchmark_ticks = {pd.to_datetime(ver['date']): ver['version'] for ver in versions}

    today = datetime.now().strftime('%Y-%m-%d')
    return get_plot(
        trend_df,
        start_date='2023-06-01',
        end_date=today,
        open_diff=-0.5,
        comm_diff=-5,
        benchmark_ticks=benchmark_ticks,
    )
322
+
323
def get_mm_trend_plot() -> go.Figure:
    """Get the trend plot for multimodal models.

    Returns:
        go.Figure: The generated trend plot for multimodal models.
    """
    mm_dfs = get_github_data()['multimodal']
    df = get_trend_data(mm_dfs, model_registry_data)

    # Tick the multimodal benchmark versions only, labelled without the
    # '_multimodal' suffix. The label is built locally: `versions` is shared
    # module state, and rewriting its entries would change what every later
    # call (text or multimodal) sees.
    benchmark_ticks = {}
    for ver in versions:
        version_name = ver['version']
        if 'multimodal' in version_name:
            benchmark_ticks[pd.to_datetime(ver['date'])] = version_name.replace('_multimodal', '')

    return get_plot(df, start_date='2023-06-01', end_date=datetime.now().strftime('%Y-%m-%d'),
                    open_diff=-0.5, comm_diff=-5, benchmark_ticks=benchmark_ticks, data="multimodal")
340
+
341
def get_final_trend_plot(benchmark: str = "text") -> go.Figure:
    """Return the trend plot for the selected benchmark.

    Args:
        benchmark (str, optional): Either "text" or "multimodal". Defaults to "text".

    Returns:
        go.Figure: The generated trend plot for selected benchmark, or None
            when `benchmark` is neither of the two supported values.
    """
    if benchmark == "text":
        return get_text_trend_plot()
    if benchmark == "multimodal":
        return get_mm_trend_plot()
    return None
351
+
352
+
353
if __name__ == "__main__":
    # Manual check: render the text plot, then the multimodal plot.
    for build_plot in (get_text_trend_plot, get_mm_trend_plot):
        fig = build_plot()
        fig.show()