Corey Morris committed on
Commit
2b55a03
1 Parent(s): 298ba1f

Extracted plotting functions from moral_app to plotting_utils to improve organization and testability

Browse files
Files changed (2) hide show
  1. moral_app.py +42 -168
  2. plotting_utils.py +152 -0
moral_app.py CHANGED
@@ -5,90 +5,10 @@ from result_data_processor import ResultDataProcessor
5
  import matplotlib.pyplot as plt
6
  import numpy as np
7
  import plotly.graph_objects as go
 
8
 
9
  st.set_page_config(layout="wide")
10
 
11
- def plot_top_n(df, target_column, n=10):
12
- top_n = df.nlargest(n, target_column)
13
-
14
- # Initialize the bar plot
15
- fig, ax1 = plt.subplots(figsize=(10, 5))
16
-
17
- # Set width for each bar and their positions
18
- width = 0.28
19
- ind = np.arange(len(top_n))
20
-
21
- # Plot target_column and MMLU_average on the primary y-axis with adjusted positions
22
- ax1.bar(ind - width, top_n[target_column], width=width, color='blue', label=target_column)
23
- ax1.bar(ind, top_n['MMLU_average'], width=width, color='orange', label='MMLU_average')
24
-
25
- # Set the primary y-axis labels and title
26
- ax1.set_title(f'Top {n} performing models on {target_column}')
27
- ax1.set_xlabel('Model')
28
- ax1.set_ylabel('Score')
29
-
30
- # Create a secondary y-axis for Parameters
31
- ax2 = ax1.twinx()
32
-
33
- # Plot Parameters as bars on the secondary y-axis with adjusted position
34
- ax2.bar(ind + width, top_n['Parameters'], width=width, color='red', label='Parameters')
35
-
36
- # Set the secondary y-axis labels
37
- ax2.set_ylabel('Parameters', color='red')
38
- ax2.tick_params(axis='y', labelcolor='red')
39
-
40
- # Set the x-ticks and their labels
41
- ax1.set_xticks(ind)
42
- ax1.set_xticklabels(top_n.index, rotation=45, ha="right")
43
-
44
- # Adjust the legend
45
- fig.tight_layout()
46
- fig.legend(loc='center left', bbox_to_anchor=(1, 0.5))
47
-
48
- # Show the plot
49
- st.pyplot(fig)
50
-
51
- # Function to create an unfilled radar chart
52
- def create_radar_chart_unfilled(df, model_names, metrics):
53
- fig = go.Figure()
54
- min_value = df.loc[model_names, metrics].min().min()
55
- max_value = df.loc[model_names, metrics].max().max()
56
- for model_name in model_names:
57
- values_model = df.loc[model_name, metrics]
58
- fig.add_trace(go.Scatterpolar(
59
- r=values_model,
60
- theta=metrics,
61
- name=model_name
62
- ))
63
-
64
- fig.update_layout(
65
- polar=dict(
66
- radialaxis=dict(
67
- visible=True,
68
- range=[min_value, max_value]
69
- )),
70
- showlegend=True,
71
- width=800, # Change the width as needed
72
- height=600 # Change the height as needed
73
- )
74
- return fig
75
-
76
-
77
-
78
- # Function to create a line chart
79
- def create_line_chart(df, model_names, metrics):
80
- line_data = []
81
- for model_name in model_names:
82
- values_model = df.loc[model_name, metrics]
83
- for metric, value in zip(metrics, values_model):
84
- line_data.append({'Model': model_name, 'Metric': metric, 'Value': value})
85
-
86
- line_df = pd.DataFrame(line_data)
87
-
88
- fig = px.line(line_df, x='Metric', y='Value', color='Model', title='Comparison of Models', line_dash_sequence=['solid'])
89
- fig.update_layout(showlegend=True)
90
- return fig
91
-
92
  def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters', 'organization']):
93
  # Calculate the absolute differences for each task between the target model and the closest models
94
  new_df = df.drop(columns=exclude_columns)
@@ -104,6 +24,10 @@ def find_top_differences_table(df, target_model, closest_models, num_differences
104
  unique_top_differences_tasks = list(set(top_differences_table['Task'].tolist()))
105
  return top_differences_table, unique_top_differences_tasks
106
 
 
 
 
 
107
  data_provider = ResultDataProcessor()
108
 
109
  st.title('Why are large language models so bad at the moral scenarios task?')
@@ -171,9 +95,9 @@ column_search_query = st.text_input("Filter by Column/Task Name:", "")
171
  # Get the columns that contain the search query
172
  matching_columns = [col for col in filtered_data.columns if column_search_query.lower() in col.lower()]
173
 
174
- # Display the DataFrame with only the matching columns
175
- st.markdown("## Sortable Results")
176
- st.dataframe(filtered_data[matching_columns])
177
 
178
 
179
  # CSV download
@@ -189,70 +113,43 @@ st.download_button(
189
  )
190
 
191
 
192
- def create_plot(df, x_values, y_values, models=None, title=None):
193
- if models is not None:
194
- df = df[df.index.isin(models)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
- # remove rows with NaN values
197
- df = df.dropna(subset=[x_values, y_values])
198
 
199
- plot_data = pd.DataFrame({
200
- 'Model': df.index,
201
- x_values: df[x_values],
202
- y_values: df[y_values],
203
- })
204
 
205
- plot_data['color'] = 'purple'
206
- fig = px.scatter(plot_data, x=x_values, y=y_values, color='color', hover_data=['Model'], trendline="ols")
207
-
208
- # If title is not provided, use x_values vs. y_values as the default title
209
- if title is None:
210
- title = x_values + " vs. " + y_values
211
-
212
- layout_args = dict(
213
- showlegend=False,
214
- xaxis_title=x_values,
215
- yaxis_title=y_values,
216
- xaxis=dict(),
217
- yaxis=dict(),
218
- title=title,
219
- height=500,
220
- width=1000,
221
- )
222
- fig.update_layout(**layout_args)
223
-
224
- # Add a dashed line at 0.25 for the y_values
225
- x_min = df[x_values].min()
226
- x_max = df[x_values].max()
227
-
228
- y_min = df[y_values].min()
229
- y_max = df[y_values].max()
230
-
231
- if x_values.startswith('MMLU'):
232
- fig.add_shape(
233
- type='line',
234
- x0=0.25, x1=0.25,
235
- y0=y_min, y1=y_max,
236
- line=dict(
237
- color='red',
238
- width=2,
239
- dash='dash'
240
- )
241
- )
242
-
243
- if y_values.startswith('MMLU'):
244
- fig.add_shape(
245
- type='line',
246
- x0=x_min, x1=x_max,
247
- y0=0.25, y1=0.25,
248
- line=dict(
249
- color='red',
250
- width=2,
251
- dash='dash'
252
- )
253
- )
254
-
255
- return fig
256
 
257
 
258
  # Custom scatter plots
@@ -325,31 +222,8 @@ plot_top_n(filtered_data, 'MMLU_abstract_algebra', 10)
325
  fig = create_plot(filtered_data, 'Parameters', 'MMLU_abstract_algebra')
326
  st.plotly_chart(fig)
327
 
328
- # Moral scenarios plots
329
- st.markdown("### Moral Scenarios Performance")
330
- def show_random_moral_scenarios_question():
331
- moral_scenarios_data = pd.read_csv('moral_scenarios_questions.csv')
332
- random_question = moral_scenarios_data.sample()
333
- expander = st.expander("Show a random moral scenarios question")
334
- expander.write(random_question['query'].values[0])
335
-
336
- show_random_moral_scenarios_question()
337
-
338
- st.write("""
339
- While smaller models can perform well at many tasks, the model size threshold for decent performance on moral scenarios is much higher.
340
- There are no models with less than 13 billion parameters with performance much better than random chance. Further investigation into other capabilities that emerge at 13 billion parameters could help
341
- identify capabilities that are important for moral reasoning.
342
- """)
343
-
344
- fig = create_plot(filtered_data, 'Parameters', 'MMLU_moral_scenarios', title="Impact of Parameter Count on Accuracy for Moral Scenarios")
345
- st.plotly_chart(fig)
346
- st.write()
347
-
348
 
349
 
350
- fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')
351
- st.plotly_chart(fig)
352
-
353
 
354
 
355
 
 
5
  import matplotlib.pyplot as plt
6
  import numpy as np
7
  import plotly.graph_objects as go
8
+ from plotting_utils import plot_top_n, create_radar_chart_unfilled, create_line_chart, create_plot
9
 
10
  st.set_page_config(layout="wide")
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters', 'organization']):
13
  # Calculate the absolute differences for each task between the target model and the closest models
14
  new_df = df.drop(columns=exclude_columns)
 
24
  unique_top_differences_tasks = list(set(top_differences_table['Task'].tolist()))
25
  return top_differences_table, unique_top_differences_tasks
26
 
27
+
28
+
29
+ # Main Application
30
+
31
  data_provider = ResultDataProcessor()
32
 
33
  st.title('Why are large language models so bad at the moral scenarios task?')
 
95
  # Get the columns that contain the search query
96
  matching_columns = [col for col in filtered_data.columns if column_search_query.lower() in col.lower()]
97
 
98
+ # # Display the DataFrame with only the matching columns
99
+ # st.markdown("## Sortable Results")
100
+ # st.dataframe(filtered_data[matching_columns])
101
 
102
 
103
  # CSV download
 
113
  )
114
 
115
 
116
+ # Moral Scenarios section
117
+ st.markdown("## Why are large language models so bad at the moral scenarios task?")
118
+ st.markdown("### The structure of the task is odd")
119
+
120
+ # - Are the models actually bad at moral reasoning ?
121
+ # - Is it the structure of the task that is causing the poor performance?
122
+ # - Are there other tasks with questions in a similar structure ?
123
+ # - How do models perform when the structure of the task is changed ?
124
+ st.markdown("### Moral Scenarios Performance")
125
+ def show_random_moral_scenarios_question():
126
+ moral_scenarios_data = pd.read_csv('moral_scenarios_questions.csv')
127
+ random_question = moral_scenarios_data.sample()
128
+ expander = st.expander("Show a random moral scenarios question")
129
+ expander.write(random_question['query'].values[0])
130
+
131
+ show_random_moral_scenarios_question()
132
+
133
+ st.write("""
134
+ While smaller models can perform well at many tasks, the model size threshold for decent performance on moral scenarios is much higher.
135
+ There are no models with less than 13 billion parameters with performance much better than random chance. Further investigation into other capabilities that emerge at 13 billion parameters could help
136
+ identify capabilities that are important for moral reasoning.
137
+ """)
138
+
139
+ fig = create_plot(filtered_data, 'Parameters', 'MMLU_moral_scenarios', title="Impact of Parameter Count on Accuracy for Moral Scenarios")
140
+ st.plotly_chart(fig)
141
+ st.write()
142
+
143
+
144
+
145
+ fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')
146
+ st.plotly_chart(fig)
147
+
148
+
149
+
150
 
 
 
151
 
 
 
 
 
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
 
155
  # Custom scatter plots
 
222
  fig = create_plot(filtered_data, 'Parameters', 'MMLU_abstract_algebra')
223
  st.plotly_chart(fig)
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
 
 
 
 
227
 
228
 
229
 
plotting_utils.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import plotly.graph_objects as go
7
+
8
+ def plot_top_n(df, target_column, n=10):
9
+ top_n = df.nlargest(n, target_column)
10
+
11
+ # Initialize the bar plot
12
+ fig, ax1 = plt.subplots(figsize=(10, 5))
13
+
14
+ # Set width for each bar and their positions
15
+ width = 0.28
16
+ ind = np.arange(len(top_n))
17
+
18
+ # Plot target_column and MMLU_average on the primary y-axis with adjusted positions
19
+ ax1.bar(ind - width, top_n[target_column], width=width, color='blue', label=target_column)
20
+ ax1.bar(ind, top_n['MMLU_average'], width=width, color='orange', label='MMLU_average')
21
+
22
+ # Set the primary y-axis labels and title
23
+ ax1.set_title(f'Top {n} performing models on {target_column}')
24
+ ax1.set_xlabel('Model')
25
+ ax1.set_ylabel('Score')
26
+
27
+ # Create a secondary y-axis for Parameters
28
+ ax2 = ax1.twinx()
29
+
30
+ # Plot Parameters as bars on the secondary y-axis with adjusted position
31
+ ax2.bar(ind + width, top_n['Parameters'], width=width, color='red', label='Parameters')
32
+
33
+ # Set the secondary y-axis labels
34
+ ax2.set_ylabel('Parameters', color='red')
35
+ ax2.tick_params(axis='y', labelcolor='red')
36
+
37
+ # Set the x-ticks and their labels
38
+ ax1.set_xticks(ind)
39
+ ax1.set_xticklabels(top_n.index, rotation=45, ha="right")
40
+
41
+ # Adjust the legend
42
+ fig.tight_layout()
43
+ fig.legend(loc='center left', bbox_to_anchor=(1, 0.5))
44
+
45
+ # Show the plot
46
+ st.pyplot(fig)
47
+
48
+ # Function to create an unfilled radar chart
49
+ def create_radar_chart_unfilled(df, model_names, metrics):
50
+ fig = go.Figure()
51
+ min_value = df.loc[model_names, metrics].min().min()
52
+ max_value = df.loc[model_names, metrics].max().max()
53
+ for model_name in model_names:
54
+ values_model = df.loc[model_name, metrics]
55
+ fig.add_trace(go.Scatterpolar(
56
+ r=values_model,
57
+ theta=metrics,
58
+ name=model_name
59
+ ))
60
+
61
+ fig.update_layout(
62
+ polar=dict(
63
+ radialaxis=dict(
64
+ visible=True,
65
+ range=[min_value, max_value]
66
+ )),
67
+ showlegend=True,
68
+ width=800, # Change the width as needed
69
+ height=600 # Change the height as needed
70
+ )
71
+ return fig
72
+
73
+
74
+
75
+ # Function to create a line chart
76
+ def create_line_chart(df, model_names, metrics):
77
+ line_data = []
78
+ for model_name in model_names:
79
+ values_model = df.loc[model_name, metrics]
80
+ for metric, value in zip(metrics, values_model):
81
+ line_data.append({'Model': model_name, 'Metric': metric, 'Value': value})
82
+
83
+ line_df = pd.DataFrame(line_data)
84
+
85
+ fig = px.line(line_df, x='Metric', y='Value', color='Model', title='Comparison of Models', line_dash_sequence=['solid'])
86
+ fig.update_layout(showlegend=True)
87
+ return fig
88
+
89
+ def create_plot(df, x_values, y_values, models=None, title=None):
90
+ if models is not None:
91
+ df = df[df.index.isin(models)]
92
+
93
+ # remove rows with NaN values
94
+ df = df.dropna(subset=[x_values, y_values])
95
+
96
+ plot_data = pd.DataFrame({
97
+ 'Model': df.index,
98
+ x_values: df[x_values],
99
+ y_values: df[y_values],
100
+ })
101
+
102
+ plot_data['color'] = 'purple'
103
+ fig = px.scatter(plot_data, x=x_values, y=y_values, color='color', hover_data=['Model'], trendline="ols")
104
+
105
+ # If title is not provided, use x_values vs. y_values as the default title
106
+ if title is None:
107
+ title = x_values + " vs. " + y_values
108
+
109
+ layout_args = dict(
110
+ showlegend=False,
111
+ xaxis_title=x_values,
112
+ yaxis_title=y_values,
113
+ xaxis=dict(),
114
+ yaxis=dict(),
115
+ title=title,
116
+ height=500,
117
+ width=1000,
118
+ )
119
+ fig.update_layout(**layout_args)
120
+
121
+ # Add a dashed line at 0.25 for the y_values
122
+ x_min = df[x_values].min()
123
+ x_max = df[x_values].max()
124
+
125
+ y_min = df[y_values].min()
126
+ y_max = df[y_values].max()
127
+
128
+ if x_values.startswith('MMLU'):
129
+ fig.add_shape(
130
+ type='line',
131
+ x0=0.25, x1=0.25,
132
+ y0=y_min, y1=y_max,
133
+ line=dict(
134
+ color='red',
135
+ width=2,
136
+ dash='dash'
137
+ )
138
+ )
139
+
140
+ if y_values.startswith('MMLU'):
141
+ fig.add_shape(
142
+ type='line',
143
+ x0=x_min, x1=x_max,
144
+ y0=0.25, y1=0.25,
145
+ line=dict(
146
+ color='red',
147
+ width=2,
148
+ dash='dash'
149
+ )
150
+ )
151
+
152
+ return fig