Corey committed
Commit 59c6dd2
1 Parent(s): cb2c32e

Added clickable links (#1)

* Added clickable links
* Combined organization and model name into a single column
* Removed redundant filtering
* Removed the model comparison feature because it was not useful as implemented
* Updated to Streamlit 1.25.0 for clickable link support
* Removed TruthfulQA data temporarily
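
The clickable links come from Streamlit's st.column_config.LinkColumn, which the updated app.py passes to st.dataframe (see the diff below). A minimal standalone sketch of that mechanism, assuming Streamlit >= 1.25.0; the one-row DataFrame and its values are invented for illustration:

import pandas as pd
import streamlit as st

# Illustrative data only; app.py loads these columns from the processed CSV.
df = pd.DataFrame({
    "URL": ["https://huggingface.co/togethercomputer/GPT-JT-6B-v0"],
    "MMLU_average": [0.41],
})

# LinkColumn renders the URL column as clickable links while the table
# stays sortable and interactive.
st.dataframe(
    df,
    column_config={"URL": st.column_config.LinkColumn(width="small")},
    hide_index=True,
)

Saved as, say, demo_links.py and launched with streamlit run demo_links.py, the URL cells open the model pages directly while sorting and column resizing keep working.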

app.py CHANGED
@@ -95,7 +95,7 @@ def create_line_chart(df, model_names, metrics):
     fig.update_layout(showlegend=True)
     return fig
 
-def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters', 'organization']):
+def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters']):
     # Calculate the absolute differences for each task between the target model and the closest models
     new_df = df.drop(columns=exclude_columns)
     differences = new_df.loc[closest_models].sub(new_df.loc[target_model]).abs()
@@ -124,35 +124,12 @@ st.markdown("""
 """)
 
 # Load the data into memory
-data_path = "processed_data_2023-10-06.csv"
+data_path = "processed_data_2023-10-08.csv"
 data_df = load_csv_data(data_path)
-data_df.rename(columns={"Unnamed: 0": "Model Name"}, inplace=True)
+# drop the column Unnamed: 0
+data_df.rename(columns={'Unnamed: 0': "Model Name"}, inplace=True)
 data_df.set_index("Model Name", inplace=True)
 
-filters = st.checkbox('Select Models and/or Evaluations')
-
-# Initialize selected columns with "Parameters" and "MMLU_average" if filters are checked
-selected_columns = ['Parameters', 'MMLU_average'] if filters else data_df.columns.tolist()
-
-# Initialize selected models as empty if filters are checked
-selected_models = [] if filters else data_df.index.tolist()
-
-if filters:
-    # Create multi-select for columns with default selection
-    selected_columns = st.multiselect(
-        'Select Columns',
-        data_df.columns.tolist(),
-        default=selected_columns
-    )
-
-    # Create multi-select for models without default selection
-    selected_models = st.multiselect(
-        'Select Models',
-        data_df.index.tolist()
-    )
-
-# Get the filtered data
-# filtered_data = data_provider.get_data(selected_models)
 filtered_data = data_df
 
 # sort the table by the MMLU_average column
@@ -165,32 +142,33 @@ parameter_threshold = st.selectbox(
     index=4, # Set the default selected option to 'No threshold'
     format_func=lambda x: f"{x}" if isinstance(x, int) else x
 )
-
-# Filter the DataFrame based on the selected parameter threshold if not 'No threshold'
 if isinstance(parameter_threshold, int):
     filtered_data = filtered_data[filtered_data['Parameters'] <= parameter_threshold]
 
-# Search box
-search_query = st.text_input("Filter by Model Name:", "")
-
-# Filter the DataFrame based on the search query in the index (model name)
-if search_query:
-    filtered_data = filtered_data[filtered_data.index.str.contains(search_query, case=False)]
-
-
-# Search box for columns
+# model name filtering
+search_queries = st.text_input("Filter by Model Name:", "").replace(" ", "").split(',')
+if search_queries:
+    filtered_data = filtered_data[filtered_data.index.str.contains('|'.join(search_queries), case=False)]
 
+# column name filtering
 column_search_query = st.text_input("Filter by Column/Task Name:", "").replace(" ", "").split(',')
-
-# Get the columns that contain the search query
 matching_columns = [col for col in filtered_data.columns if any(query.lower() in col.lower() for query in column_search_query)]
+filtered_data = filtered_data[matching_columns]
+
 
 # Display the DataFrame with only the matching columns
 st.markdown("## Sortable Results")
-st.dataframe(filtered_data[matching_columns])
+st.dataframe(
+    filtered_data[matching_columns],
+    column_config={
+        "URL": st.column_config.LinkColumn(  # currently the only way to make a URL clickable in Streamlit without losing the interactivity of the table
+            width="small"
+        )
+    },
+    hide_index=True,
+)
 
 # CSV download
-
 filtered_data.index.name = "Model Name"
 
 csv = filtered_data.to_csv(index=True)
@@ -209,6 +187,9 @@ def create_plot(df, x_values, y_values, models=None, title=None):
     # remove rows with NaN values
     df = df.dropna(subset=[x_values, y_values])
 
+    # remove label columns URL, full_model_name
+    df = df.drop(columns=['URL', 'full_model_name'])
+
     plot_data = pd.DataFrame({
        'Model': df.index,
        x_values: df[x_values],
@@ -279,8 +260,11 @@ st.markdown("***The dashed red line indicates random chance accuracy of 0.25 as
 st.markdown("***")
 st.write("As expected, there is a strong positive relationship between the number of parameters and average performance on the MMLU evaluation.")
 
-selected_x_column = st.selectbox('Select x-axis', filtered_data.columns.tolist(), index=1)
-selected_y_column = st.selectbox('Select y-axis', filtered_data.columns.tolist(), index=4)
+column_list_for_plotting = filtered_data.columns.tolist()
+column_list_for_plotting.remove('URL')
+column_list_for_plotting.remove('full_model_name')
+selected_x_column = st.selectbox('Select x-axis', column_list_for_plotting, index=0)
+selected_y_column = st.selectbox('Select y-axis', column_list_for_plotting, index=1)
 
 if selected_x_column != selected_y_column: # Avoid creating a plot with the same column on both axes
     fig = create_plot(filtered_data, selected_x_column, selected_y_column)
@@ -289,44 +273,44 @@ else:
     st.write("Please select different columns for the x and y axes.")
 
 
 # end of custom scatter plots
 
-# Section to select a model and display radar and line charts
-st.header("Compare a Selected Model to the 5 Models Closest in MMLU Average Performance")
-st.write("""
-This comparison highlights the nuances in model performance across different tasks.
-While the overall MMLU average score provides a general understanding of a model's capabilities,
-examining the closest models reveals variations in performance on individual tasks.
-Such an analysis can uncover specific strengths and weaknesses and guide further exploration and improvement.
-""")
+# # Section to select a model and display radar and line charts
+# st.header("Compare a Selected Model to the 5 Models Closest in MMLU Average Performance")
+# st.write("""
+# This comparison highlights the nuances in model performance across different tasks.
+# While the overall MMLU average score provides a general understanding of a model's capabilities,
+# examining the closest models reveals variations in performance on individual tasks.
+# Such an analysis can uncover specific strengths and weaknesses and guide further exploration and improvement.
+# """)
 
-default_model_name = "GPT-JT-6B-v0"
+# default_model_name = "GPT-JT-6B-v0"
 
-default_model_index = filtered_data.index.tolist().index(default_model_name) if default_model_name in filtered_data.index else 0
-selected_model_name = st.selectbox("Select a Model:", filtered_data.index.tolist(), index=default_model_index)
+# default_model_index = filtered_data.index.tolist().index(default_model_name) if default_model_name in filtered_data.index else 0
+# selected_model_name = st.selectbox("Select a Model:", filtered_data.index.tolist(), index=default_model_index)
 
-# Get the closest 5 models with unique indices
-closest_models_diffs = filtered_data['MMLU_average'].sub(filtered_data.loc[selected_model_name, 'MMLU_average']).abs()
-closest_models = closest_models_diffs.nsmallest(5, keep='first').index.drop_duplicates().tolist()
+# # Get the closest 5 models with unique indices
+# closest_models_diffs = filtered_data['MMLU_average'].sub(filtered_data.loc[selected_model_name, 'MMLU_average']).abs()
+# closest_models = closest_models_diffs.nsmallest(5, keep='first').index.drop_duplicates().tolist()
 
 # Find the top 10 tasks with the largest differences and convert to a DataFrame
-top_differences_table, top_differences_tasks = find_top_differences_table(filtered_data, selected_model_name, closest_models)
+# top_differences_table, top_differences_tasks = find_top_differences_table(filtered_data, selected_model_name, closest_models)
 
 # Display the DataFrame for the closest models and the top differences tasks
-st.dataframe(filtered_data.loc[closest_models, top_differences_tasks])
+# st.dataframe(filtered_data.loc[closest_models, top_differences_tasks])
 
 # # Display the table in the Streamlit app
 # st.markdown("## Top Differences")
 # st.dataframe(top_differences_table)
 
 # Create a radar chart for the tasks with the largest differences
-fig_radar_top_differences = create_radar_chart_unfilled(filtered_data, closest_models, top_differences_tasks)
+# fig_radar_top_differences = create_radar_chart_unfilled(filtered_data, closest_models, top_differences_tasks)
 
 # Display the radar chart
-st.plotly_chart(fig_radar_top_differences)
+# st.plotly_chart(fig_radar_top_differences)
 
 
 st.markdown("## Notable findings and plots")
generate_csv.ipynb ADDED
@@ -0,0 +1,63 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "100\n",
+      "200\n",
+      "300\n",
+      "400\n",
+      "500\n",
+      "600\n",
+      "700\n",
+      "800\n",
+      "900\n",
+      "1000\n",
+      "1100\n",
+      "1200\n",
+      "1300\n",
+      "1400\n"
+     ]
+    }
+   ],
+   "source": [
+    "from result_data_processor import ResultDataProcessor\n",
+    "result = ResultDataProcessor()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mmlu",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
processed_data_2023-10-06.csv CHANGED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 # replicating the current hugging face streamlit and pandas versions
-streamlit==1.21.0
+streamlit==1.25.0
 pandas==1.5.3
 
 # Other
result_data_processor.py CHANGED
@@ -96,32 +96,47 @@ class ResultDataProcessor:
 
 
     def process_data(self):
-
+        full_model_name_count = 0
+        full_model_names = []
         dataframes = []
        organization_names = []
         for filename in self._find_files(self.directory, self.pattern):
-            try:
-                raw_data = self._read_and_transform_data(filename)
-                split_path = filename.split('/')
-                model_name = split_path[2]
-                organization_name = split_path[1]
-                cleaned_data = self._cleanup_dataframe(raw_data, model_name)
-                mc1 = self._extract_mc1(raw_data, model_name)
-                mc2 = self._extract_mc2(raw_data, model_name)
-                cleaned_data = pd.concat([cleaned_data, mc1])
-                cleaned_data = pd.concat([cleaned_data, mc2])
-                organization_names.append(organization_name)
-                dataframes.append(cleaned_data)
-            except Exception as e:
-                logging.error(f'Error processing {filename}')
-                logging.error(f'The error is: {e}')
-                continue
+            # try:
+            raw_data = self._read_and_transform_data(filename)
+            split_path = filename.split('/')
+            model_name = split_path[2]
+            organization_name = split_path[1]
+            full_model_name = f'{organization_name}/{model_name}'
+            full_model_name_count += 1
+            # print count every 100 models
+            if full_model_name_count % 100 == 0:
+                print(full_model_name_count)
+
+            cleaned_data = self._cleanup_dataframe(raw_data, model_name)
+            # mc1 = self._extract_mc1(raw_data, full_model_name)
+            # mc2 = self._extract_mc2(raw_data, full_model_name)
+            # cleaned_data = pd.concat([cleaned_data, mc1])
+            # cleaned_data = pd.concat([cleaned_data, mc2])
+            organization_names.append(organization_name)
+            full_model_names.append(full_model_name)
+            dataframes.append(cleaned_data)
+            # except Exception as e:
+            #     # logging.error(f'Error processing {filename}')
+            #     # logging.error(f'The error is: {e}')
+            #     print(f'Error processing {filename}')
+            #     print(f'The error is: {e}')
+            #     continue
 
 
         data = pd.concat(dataframes, axis=1).transpose()
 
         # Add organization column
-        data['organization'] = organization_names
+        # data['organization'] = organization_names
+        print("full_model_names")
+        print(len(full_model_names))
+        print("organization_names")
+        print(len(organization_names))
+        data['full_model_name'] = full_model_names
 
         # Add Model Name and rearrange columns
         data['Model Name'] = data.index
@@ -143,8 +158,7 @@ class ResultDataProcessor:
 
 
 
-        # Drop specific columns
-        data = data.drop(columns=['all', 'truthfulqa:mc|0'])
+
 
         # Add parameter count column using extract_parameters function
         data['Parameters'] = data.index.to_series().apply(self._extract_parameters)
@@ -155,18 +169,36 @@ class ResultDataProcessor:
         print(cols)
         data = data[cols]
 
-        # Reorder columns to move 'organization' to the second position
-        cols = data.columns.tolist()
-        cols = cols[-1:] + cols[:-1]
-        data = data[cols]
+        new_columns = ['full_model_name'] + [col for col in data.columns if col != 'full_model_name']
+        data = data.reindex(columns=new_columns)
+
+        # # Reorder columns to move 'organization' to the second position
+        # cols = data.columns.tolist()
+        # cols = cols[-1:] + cols[:-1]
+        # data = data[cols]
 
         # remove extreme outliers from column harness|truthfulqa:mc1
-        data = self._remove_mc1_outliers(data)
+        # data = self._remove_mc1_outliers(data)
 
         data = self.manual_removal_of_models(data)
 
-        # save to csv with the current date as part of the filename
+        # drop rows if MMLU_abstract_algebra is NaN
+        data = data.dropna(subset=['MMLU_abstract_algebra'])
+
+        # add a URL column that takes https://huggingface.co/ + full_model_name
+        data['URL'] = 'https://huggingface.co/' + data['full_model_name']
+
+        new_columns = ['URL'] + [col for col in data.columns if col != 'URL']
+        data = data.reindex(columns=new_columns)
+
+        # drop columns drop|3, gsm8k, and winogrande
+        data = data.drop(columns=['drop|3', 'gsm8k', 'winogrande'])
+        # Drop specific columns
+        data = data.drop(columns=['all', 'truthfulqa:mc|0'])
+
+        # save to csv with the current date as part of the filename
         data.to_csv(f'processed_data_{pd.Timestamp.now().strftime("%Y-%m-%d")}.csv')
 
         return data
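
The URL column that LinkColumn displays is assembled at the end of process_data above by prefixing full_model_name with the Hub base URL and reindexing it to the front. A standalone sketch of that pattern, with an invented one-row frame (the repo path is illustrative):

import pandas as pd

# Invented row; process_data builds this frame from the raw results.
data = pd.DataFrame({
    "full_model_name": ["togethercomputer/GPT-JT-6B-v0"],
    "MMLU_average": [0.41],
})

# pandas broadcasts the string prefix across the whole column.
data['URL'] = 'https://huggingface.co/' + data['full_model_name']

# reindex reorders columns without touching any values.
new_columns = ['URL'] + [col for col in data.columns if col != 'URL']
data = data.reindex(columns=new_columns)
print(data.columns.tolist())  # ['URL', 'full_model_name', 'MMLU_average']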