CoreyMorris committed
Commit 36799a9
1 Parent(s): 28fcccf

Updated data and added notes about the site.

Files changed (3):
  1. app.py +4 -3
  2. processed_data_2023-09-29.csv +0 -0
  3. results +1 -1
app.py CHANGED
@@ -112,8 +112,9 @@ def find_top_differences_table(df, target_model, closest_models, num_differences
 
 # st.title('Model Evaluation Results including MMLU by task')
 st.title('Interactive Portal for Analyzing Open Source Large Language Models')
-st.markdown("""***Last updated November 21th***""")
-st.markdown("""**Models that are suspected to have training data contaminated with evaluation data have been removed.**""")
+st.markdown("""***Last updated March 17th 2024***""")
+st.markdown("""**It has not been updated to correctly extract the parameter number from mixture of experts models.**""")
+st.markdown("""**As of 04-17-2024, this data was not generated using the chat templates. Smaller models are especially sensative to this and other aspects related to the format of the inputs.**""")
 st.markdown("""
 This page provides a way to explore the results for individual tasks and compare models across tasks. Data for the benchmarks hellaswag, arc_challenge, and truthfulQA have also been included for comparison.
 There are 57 tasks in the MMLU evaluation that cover a wide variety of subjects including Science, Math, Humanities, Social Science, Applied Science, Logic, and Security.
@@ -121,7 +122,7 @@ st.markdown("""
 """)
 
 # Load the data into memory
-data_path = "processed_data_2023-11-21.csv"
+data_path = "processed_data_2024-04-16.csv"
 data_df = load_csv_data(data_path)
 # drop the column Unnamed: 0
 data_df.rename(columns={'Unnamed: 0': "Model Name"}, inplace=True)
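For context, `load_csv_data` itself is not part of this hunk; only its call site appears in the diff. A minimal sketch of the loading step, assuming the helper simply wraps `pandas.read_csv` behind Streamlit's `st.cache_data` (the body and the caching choice are assumptions, not confirmed by this commit):

```python
import pandas as pd
import streamlit as st

@st.cache_data  # cache the parsed frame across Streamlit reruns for the same path
def load_csv_data(path: str) -> pd.DataFrame:
    # Hypothetical body: the diff shows only the call site, not the helper.
    return pd.read_csv(path)

data_df = load_csv_data("processed_data_2024-04-16.csv")
# A saved pandas index round-trips as a column named 'Unnamed: 0';
# the app renames it to 'Model Name' rather than dropping it.
data_df.rename(columns={'Unnamed: 0': "Model Name"}, inplace=True)
```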
processed_data_2023-09-29.csv ADDED
The diff for this file is too large to render.
 
results CHANGED
@@ -1 +1 @@
-Subproject commit ed5d75491130aac5d145db63d458f2eaf410745a
+Subproject commit ae58c7715592b2f354a89b5b64f3d2d12335dc89
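The `results` entry is a git submodule pointer, so this hunk only advances the pinned commit of the results repository. A minimal sketch, assuming a standard submodule checkout, of syncing the working tree to the new pin and verifying it from Python:

```python
import subprocess

# Commit the superproject now pins for the 'results' submodule (from this diff).
PINNED_SHA = "ae58c7715592b2f354a89b5b64f3d2d12335dc89"

def sync_results_submodule() -> None:
    # Check out the submodule at the commit recorded in the superproject.
    subprocess.run(["git", "submodule", "update", "--init", "results"], check=True)
    # Confirm the submodule HEAD matches the pinned commit.
    head = subprocess.run(
        ["git", "-C", "results", "rev-parse", "HEAD"],
        check=True, capture_output=True, text=True,
    ).stdout.strip()
    assert head == PINNED_SHA, f"results at {head}, expected {PINNED_SHA}"

if __name__ == "__main__":
    sync_results_submodule()
```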