Corey Morris committed
Commit a5fb364
1 Parent(s): c10db67

Added introduction, links, and reduced the number of plots displayed

Files changed (1)
  1. app.py +19 -28
app.py CHANGED
@@ -5,7 +5,15 @@ from result_data_processor import ResultDataProcessor
 
 data_provider = ResultDataProcessor()
 
-st.title('Model Evaluation Results including MMLU by task')
+# st.title('Model Evaluation Results including MMLU by task')
+st.title('MMLU-by-Task Evaluation Results for 500+ Open Source Models')
+st.markdown("""***Last updated August 7th***""")
+st.markdown("""
+Hugging Face has run evaluations on over 500 open source models and provides results on a
+[publicly available leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [dataset](https://huggingface.co/datasets/open-llm-leaderboard/results).
+The leaderboard currently displays the overall result for MMLU. This page shows individual accuracy scores for all 57 tasks of the MMLU evaluation.
+[Preliminary analysis of MMLU-by-Task data](https://coreymorrisdata.medium.com/preliminary-analysis-of-mmlu-evaluation-data-insights-from-500-open-source-models-e67885aa364b)
+""")
 
 filters = st.checkbox('Select Models and Evaluations')
 
@@ -36,8 +44,6 @@ filtered_data = filtered_data.sort_values(by=['MMLU_average'], ascending=False)
 st.dataframe(filtered_data[selected_columns])
 
 # CSV download
-# name the index to include in the csv download
-
 
 filtered_data.index.name = "Model Name"
 
@@ -108,7 +114,7 @@ def create_plot(df, arc_column, moral_column, models=None):
 # Custom scatter plots
 st.header('Custom scatter plots')
 selected_x_column = st.selectbox('Select x-axis', filtered_data.columns.tolist(), index=0)
-selected_y_column = st.selectbox('Select y-axis', filtered_data.columns.tolist(), index=1)
+selected_y_column = st.selectbox('Select y-axis', filtered_data.columns.tolist(), index=3)
 
 if selected_x_column != selected_y_column: # Avoid creating a plot with the same column on both axes
     fig = create_plot(filtered_data, selected_x_column, selected_y_column)
@@ -118,42 +124,27 @@ else:
 
 # end of custom scatter plots
 
-st.header('Overall evaluation comparisons')
-
-fig = create_plot(filtered_data, 'arc:challenge|25', 'hellaswag|10')
-st.plotly_chart(fig)
-
-fig = create_plot(filtered_data, 'arc:challenge|25', 'MMLU_average')
-st.plotly_chart(fig)
-
-fig = create_plot(filtered_data, 'hellaswag|10', 'MMLU_average')
-st.plotly_chart(fig)
-
-st.header('Top 50 models on MMLU_average')
-top_50 = filtered_data.nlargest(50, 'MMLU_average')
-fig = create_plot(top_50, 'arc:challenge|25', 'MMLU_average')
-st.plotly_chart(fig)
-
-st.header('Moral Reasoning')
-
-fig = create_plot(filtered_data, 'arc:challenge|25', 'MMLU_moral_scenarios')
-st.plotly_chart(fig)
+st.header('Moral Scenarios Performance')
+st.write("The dashed red line represents the random chance performance of 0.25")
 
-fig = create_plot(filtered_data, 'MMLU_moral_disputes', 'MMLU_moral_scenarios')
+fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')
 st.plotly_chart(fig)
 
-fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')
+fig = create_plot(filtered_data, 'Parameters', 'MMLU_moral_scenarios')
 st.plotly_chart(fig)
 
 fig = px.histogram(filtered_data, x="MMLU_moral_scenarios", marginal="rug", hover_data=filtered_data.columns)
 st.plotly_chart(fig)
 
-fig = px.histogram(filtered_data, x="MMLU_moral_disputes", marginal="rug", hover_data=filtered_data.columns)
+st.header('Abstract Algebra Performance')
+fig = create_plot(filtered_data, 'Parameters', 'MMLU_abstract_algebra')
 st.plotly_chart(fig)
 
+fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_abstract_algebra')
+st.plotly_chart(fig)
 
-st.markdown("**Thank you to hugging face for running the evaluations and supplying the data as well as the original authors of the evaluations**")
 
+st.markdown("***Thank you to hugging face for running the evaluations and supplying the data as well as the original authors of the evaluations.***")
 
 st.markdown("""
 # References
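
The diff calls a `create_plot` helper defined earlier in app.py; only its signature, `def create_plot(df, arc_column, moral_column, models=None):`, is visible in the hunk context. The sketch below is a hypothetical reconstruction of what such a helper could look like, assuming a Plotly Express scatter plot with an optional model filter and the dashed red random-chance line at 0.25 described in the new "Moral Scenarios Performance" section. It is not the repository's actual implementation.

```python
import plotly.express as px


def create_plot(df, arc_column, moral_column, models=None):
    """Hypothetical sketch of the helper the diff calls; the real body lives elsewhere in app.py."""
    # Optionally restrict the plot to a subset of models (assumes the index holds model names).
    if models is not None:
        df = df[df.index.isin(models)]

    # Scatter one evaluation column against another, with the model name shown on hover.
    fig = px.scatter(
        df,
        x=arc_column,
        y=moral_column,
        hover_name=df.index,
        title=f"{moral_column} vs. {arc_column}",
    )

    # Dashed red line marking random-chance accuracy (0.25) on four-choice MMLU tasks,
    # matching the note added in the 'Moral Scenarios Performance' section.
    fig.add_hline(y=0.25, line_dash="dash", line_color="red")

    return fig
```

In the app this would be used exactly as the diff shows, e.g. `fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')` followed by `st.plotly_chart(fig)`.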