Corey committed
Commit 59c6dd2
1 Parent(s): cb2c32e

Added clickable links (#1)

* Added clickable links
* Combined organization and model name into a single column
* Removed redundant filtering
* Removed the model comparison feature because it was not useful as implemented
* Updated to Streamlit 1.25.0 for clickable link support
* Removed TruthfulQA data temporarily
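
The clickable links come from Streamlit's st.column_config.LinkColumn, which the updated app.py passes to st.dataframe (see the diff below). A minimal standalone sketch of that mechanism, assuming Streamlit >= 1.25.0; the one-row DataFrame and its values are invented for illustration:

import pandas as pd
import streamlit as st

# Illustrative data only; app.py loads these columns from the processed CSV.
df = pd.DataFrame({
    "URL": ["https://huggingface.co/togethercomputer/GPT-JT-6B-v0"],
    "MMLU_average": [0.41],
})

# LinkColumn renders the URL column as clickable links while the table
# stays sortable and interactive.
st.dataframe(
    df,
    column_config={"URL": st.column_config.LinkColumn(width="small")},
    hide_index=True,
)

Saved as, say, demo_links.py and launched with streamlit run demo_links.py, the URL cells open the model pages directly while sorting and column resizing keep working.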

app.py CHANGED
@@ -95,7 +95,7 @@ def create_line_chart(df, model_names, metrics):
     fig.update_layout(showlegend=True)
     return fig
 
-def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters', 'organization']):
+def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters']):
     # Calculate the absolute differences for each task between the target model and the closest models
     new_df = df.drop(columns=exclude_columns)
     differences = new_df.loc[closest_models].sub(new_df.loc[target_model]).abs()
@@ -124,35 +124,12 @@ st.markdown("""
 """)
 
 # Load the data into memory
-data_path = "processed_data_2023-10-06.csv"
+data_path = "processed_data_2023-10-08.csv"
 data_df = load_csv_data(data_path)
-data_df.rename(columns={"Unnamed: 0": "Model Name"}, inplace=True)
+# drop the column Unnamed: 0
+data_df.rename(columns={'Unnamed: 0': "Model Name"}, inplace=True)
 data_df.set_index("Model Name", inplace=True)
 
-filters = st.checkbox('Select Models and/or Evaluations')
-
-# Initialize selected columns with "Parameters" and "MMLU_average" if filters are checked
-selected_columns = ['Parameters', 'MMLU_average'] if filters else data_df.columns.tolist()
-
-# Initialize selected models as empty if filters are checked
-selected_models = [] if filters else data_df.index.tolist()
-
-if filters:
-    # Create multi-select for columns with default selection
-    selected_columns = st.multiselect(
-        'Select Columns',
-        data_df.columns.tolist(),
-        default=selected_columns
-    )
-
-    # Create multi-select for models without default selection
-    selected_models = st.multiselect(
-        'Select Models',
-        data_df.index.tolist()
-    )
-
-# Get the filtered data
-# filtered_data = data_provider.get_data(selected_models)
 filtered_data = data_df
 
 # sort the table by the MMLU_average column
@@ -165,32 +142,33 @@ parameter_threshold = st.selectbox(
     index=4, # Set the default selected option to 'No threshold'
     format_func=lambda x: f"{x}" if isinstance(x, int) else x
 )
-
-# Filter the DataFrame based on the selected parameter threshold if not 'No threshold'
 if isinstance(parameter_threshold, int):
     filtered_data = filtered_data[filtered_data['Parameters'] <= parameter_threshold]
 
-# Search box
-search_query = st.text_input("Filter by Model Name:", "")
-
-# Filter the DataFrame based on the search query in the index (model name)
-if search_query:
-    filtered_data = filtered_data[filtered_data.index.str.contains(search_query, case=False)]
-
-
-# Search box for columns
+# model name filtering
+search_queries = st.text_input("Filter by Model Name:", "").replace(" ", "").split(',')
+if search_queries:
+    filtered_data = filtered_data[filtered_data.index.str.contains('|'.join(search_queries), case=False)]
 
+# column name filtering
 column_search_query = st.text_input("Filter by Column/Task Name:", "").replace(" ", "").split(',')
-
-# Get the columns that contain the search query
 matching_columns = [col for col in filtered_data.columns if any(query.lower() in col.lower() for query in column_search_query)]
+filtered_data = filtered_data[matching_columns]
+
 
 # Display the DataFrame with only the matching columns
 st.markdown("## Sortable Results")
-st.dataframe(filtered_data[matching_columns])
+st.dataframe(
+    filtered_data[matching_columns],
+    column_config={
+        "URL": st.column_config.LinkColumn(  # currently the only way to make a URL clickable in Streamlit without losing the interactivity of the table
+            width="small"
+        )
+    },
+    hide_index=True,
+)
 
 # CSV download
-
 filtered_data.index.name = "Model Name"
 
 csv = filtered_data.to_csv(index=True)
@@ -209,6 +187,9 @@ def create_plot(df, x_values, y_values, models=None, title=None):
     # remove rows with NaN values
     df = df.dropna(subset=[x_values, y_values])
 
+    # remove label columns URL, full_model_name
+    df = df.drop(columns=['URL', 'full_model_name'])
+
     plot_data = pd.DataFrame({
        'Model': df.index,
        x_values: df[x_values],
@@ -279,8 +260,11 @@ st.markdown("***The dashed red line indicates random chance accuracy of 0.25 as
 st.markdown("***")
 st.write("As expected, there is a strong positive relationship between the number of parameters and average performance on the MMLU evaluation.")
 
-selected_x_column = st.selectbox('Select x-axis', filtered_data.columns.tolist(), index=1)
-selected_y_column = st.selectbox('Select y-axis', filtered_data.columns.tolist(), index=4)
+column_list_for_plotting = filtered_data.columns.tolist()
+column_list_for_plotting.remove('URL')
+column_list_for_plotting.remove('full_model_name')
+selected_x_column = st.selectbox('Select x-axis', column_list_for_plotting, index=0)
+selected_y_column = st.selectbox('Select y-axis', column_list_for_plotting, index=1)
 
 if selected_x_column != selected_y_column: # Avoid creating a plot with the same column on both axes
     fig = create_plot(filtered_data, selected_x_column, selected_y_column)
@@ -289,44 +273,44 @@ else:
     st.write("Please select different columns for the x and y axes.")
 
 
 # end of custom scatter plots
 
-# Section to select a model and display radar and line charts
-st.header("Compare a Selected Model to the 5 Models Closest in MMLU Average Performance")
-st.write("""
-This comparison highlights the nuances in model performance across different tasks.
-While the overall MMLU average score provides a general understanding of a model's capabilities,
-examining the closest models reveals variations in performance on individual tasks.
-Such an analysis can uncover specific strengths and weaknesses and guide further exploration and improvement.
-""")
+# # Section to select a model and display radar and line charts
+# st.header("Compare a Selected Model to the 5 Models Closest in MMLU Average Performance")
+# st.write("""
+# This comparison highlights the nuances in model performance across different tasks.
+# While the overall MMLU average score provides a general understanding of a model's capabilities,
+# examining the closest models reveals variations in performance on individual tasks.
+# Such an analysis can uncover specific strengths and weaknesses and guide further exploration and improvement.
+# """)
 
-default_model_name = "GPT-JT-6B-v0"
+# default_model_name = "GPT-JT-6B-v0"
 
-default_model_index = filtered_data.index.tolist().index(default_model_name) if default_model_name in filtered_data.index else 0
-selected_model_name = st.selectbox("Select a Model:", filtered_data.index.tolist(), index=default_model_index)
+# default_model_index = filtered_data.index.tolist().index(default_model_name) if default_model_name in filtered_data.index else 0
+# selected_model_name = st.selectbox("Select a Model:", filtered_data.index.tolist(), index=default_model_index)
 
-# Get the closest 5 models with unique indices
-closest_models_diffs = filtered_data['MMLU_average'].sub(filtered_data.loc[selected_model_name, 'MMLU_average']).abs()
-closest_models = closest_models_diffs.nsmallest(5, keep='first').index.drop_duplicates().tolist()
+# # Get the closest 5 models with unique indices
+# closest_models_diffs = filtered_data['MMLU_average'].sub(filtered_data.loc[selected_model_name, 'MMLU_average']).abs()
+# closest_models = closest_models_diffs.nsmallest(5, keep='first').index.drop_duplicates().tolist()
 
 # Find the top 10 tasks with the largest differences and convert to a DataFrame
-top_differences_table, top_differences_tasks = find_top_differences_table(filtered_data, selected_model_name, closest_models)
+# top_differences_table, top_differences_tasks = find_top_differences_table(filtered_data, selected_model_name, closest_models)
 
 # Display the DataFrame for the closest models and the top differences tasks
-st.dataframe(filtered_data.loc[closest_models, top_differences_tasks])
+# st.dataframe(filtered_data.loc[closest_models, top_differences_tasks])
 
 # # Display the table in the Streamlit app
 # st.markdown("## Top Differences")
 # st.dataframe(top_differences_table)
 
 # Create a radar chart for the tasks with the largest differences
-fig_radar_top_differences = create_radar_chart_unfilled(filtered_data, closest_models, top_differences_tasks)
+# fig_radar_top_differences = create_radar_chart_unfilled(filtered_data, closest_models, top_differences_tasks)
 
 # Display the radar chart
-st.plotly_chart(fig_radar_top_differences)
+# st.plotly_chart(fig_radar_top_differences)
 
 
 st.markdown("## Notable findings and plots")
generate_csv.ipynb ADDED
@@ -0,0 +1,63 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "100\n",
+      "200\n",
+      "300\n",
+      "400\n",
+      "500\n",
+      "600\n",
+      "700\n",
+      "800\n",
+      "900\n",
+      "1000\n",
+      "1100\n",
+      "1200\n",
+      "1300\n",
+      "1400\n"
+     ]
+    }
+   ],
+   "source": [
+    "from result_data_processor import ResultDataProcessor\n",
+    "result = ResultDataProcessor()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mmlu",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
processed_data_2023-10-06.csv CHANGED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 # replicating the current hugging face streamlit and pandas versions
-streamlit==1.21.0
+streamlit==1.25.0
 pandas==1.5.3
 
 # Other
result_data_processor.py CHANGED
@@ -96,32 +96,47 @@ class ResultDataProcessor:
 
 
     def process_data(self):
-
+        full_model_name_count = 0
+        full_model_names = []
         dataframes = []
        organization_names = []
         for filename in self._find_files(self.directory, self.pattern):
-            try:
-                raw_data = self._read_and_transform_data(filename)
-                split_path = filename.split('/')
-                model_name = split_path[2]
-                organization_name = split_path[1]
-                cleaned_data = self._cleanup_dataframe(raw_data, model_name)
-                mc1 = self._extract_mc1(raw_data, model_name)
-                mc2 = self._extract_mc2(raw_data, model_name)
-                cleaned_data = pd.concat([cleaned_data, mc1])
-                cleaned_data = pd.concat([cleaned_data, mc2])
-                organization_names.append(organization_name)
-                dataframes.append(cleaned_data)
-            except Exception as e:
-                logging.error(f'Error processing {filename}')
-                logging.error(f'The error is: {e}')
-                continue
+            # try:
+            raw_data = self._read_and_transform_data(filename)
+            split_path = filename.split('/')
+            model_name = split_path[2]
+            organization_name = split_path[1]
+            full_model_name = f'{organization_name}/{model_name}'
+            full_model_name_count += 1
+            # print count every 100 models
+            if full_model_name_count % 100 == 0:
+                print(full_model_name_count)
+
+            cleaned_data = self._cleanup_dataframe(raw_data, model_name)
+            # mc1 = self._extract_mc1(raw_data, full_model_name)
+            # mc2 = self._extract_mc2(raw_data, full_model_name)
+            # cleaned_data = pd.concat([cleaned_data, mc1])
+            # cleaned_data = pd.concat([cleaned_data, mc2])
+            organization_names.append(organization_name)
+            full_model_names.append(full_model_name)
+            dataframes.append(cleaned_data)
+            # except Exception as e:
+            #     # logging.error(f'Error processing {filename}')
+            #     # logging.error(f'The error is: {e}')
+            #     print(f'Error processing {filename}')
+            #     print(f'The error is: {e}')
+            #     continue
 
 
         data = pd.concat(dataframes, axis=1).transpose()
 
         # Add organization column
-        data['organization'] = organization_names
+        # data['organization'] = organization_names
+        print("full_model_names")
+        print(len(full_model_names))
+        print("organization_names")
+        print(len(organization_names))
+        data['full_model_name'] = full_model_names
 
         # Add Model Name and rearrange columns
         data['Model Name'] = data.index
@@ -143,8 +158,7 @@ class ResultDataProcessor:
 
 
 
-        # Drop specific columns
-        data = data.drop(columns=['all', 'truthfulqa:mc|0'])
+
 
         # Add parameter count column using extract_parameters function
         data['Parameters'] = data.index.to_series().apply(self._extract_parameters)
@@ -155,18 +169,36 @@ class ResultDataProcessor:
         print(cols)
         data = data[cols]
 
-        # Reorder columns to move 'organization' to the second position
-        cols = data.columns.tolist()
-        cols = cols[-1:] + cols[:-1]
-        data = data[cols]
+        new_columns = ['full_model_name'] + [col for col in data.columns if col != 'full_model_name']
+        data = data.reindex(columns=new_columns)
+
+        # # Reorder columns to move 'organization' to the second position
+        # cols = data.columns.tolist()
+        # cols = cols[-1:] + cols[:-1]
+        # data = data[cols]
 
         # remove extreme outliers from column harness|truthfulqa:mc1
-        data = self._remove_mc1_outliers(data)
+        # data = self._remove_mc1_outliers(data)
 
         data = self.manual_removal_of_models(data)
 
-        # save to csv with the current date as part of the filename
+        # drop rows if MMLU_abstract_algebra is NaN
+        data = data.dropna(subset=['MMLU_abstract_algebra'])
+
+        # add a URL column that takes https://huggingface.co/ + full_model_name
+        data['URL'] = 'https://huggingface.co/' + data['full_model_name']
+
+        new_columns = ['URL'] + [col for col in data.columns if col != 'URL']
+        data = data.reindex(columns=new_columns)
+
+        # drop columns drop|3, gsm8k, and winogrande
+        data = data.drop(columns=['drop|3', 'gsm8k', 'winogrande'])
+        # Drop specific columns
+        data = data.drop(columns=['all', 'truthfulqa:mc|0'])
+
+        # save to csv with the current date as part of the filename
         data.to_csv(f'processed_data_{pd.Timestamp.now().strftime("%Y-%m-%d")}.csv')
 
         return data
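
The URL column that LinkColumn displays is assembled at the end of process_data above by prefixing full_model_name with the Hub base URL and reindexing it to the front. A standalone sketch of that pattern, with an invented one-row frame (the repo path is illustrative):

import pandas as pd

# Invented row; process_data builds this frame from the raw results.
data = pd.DataFrame({
    "full_model_name": ["togethercomputer/GPT-JT-6B-v0"],
    "MMLU_average": [0.41],
})

# pandas broadcasts the string prefix across the whole column.
data['URL'] = 'https://huggingface.co/' + data['full_model_name']

# reindex reorders columns without touching any values.
new_columns = ['URL'] + [col for col in data.columns if col != 'URL']
data = data.reindex(columns=new_columns)
print(data.columns.tolist())  # ['URL', 'full_model_name', 'MMLU_average']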