djstrong committed on
Commit
672a5d6
β€’
1 Parent(s): 53db359

update eq-bench

Browse files
Files changed (3) hide show
  1. app.py +5 -0
  2. benchmark_results.csv +11 -0
  3. metadata.json +7 -1
app.py CHANGED
@@ -64,6 +64,8 @@ with demo:
64
  def get_params(model_name):
65
  if model_name in metadata:
66
  return metadata[model_name]
 
 
67
  return numpy.nan
68
 
69
 
@@ -77,6 +79,9 @@ with demo:
77
  # change value of column to nan
78
  leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)
79
 
 
 
 
80
  # set datatype of column
81
  leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
82
  leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)
 
64
  def get_params(model_name):
65
  if model_name in metadata:
66
  return metadata[model_name]
67
+ else:
68
+ print(model_name)
69
  return numpy.nan
70
 
71
 
 
79
  # change value of column to nan
80
  leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)
81
 
82
+ # scale Benchmark Score by the fraction of parseable questions (Num Questions Parseable / 171)
83
+ leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * ((leaderboard_df["Num Questions Parseable"].astype(float) / 171))
84
+
85
  # set datatype of column
86
  leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
87
  leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)
benchmark_results.csv CHANGED
@@ -34,6 +34,7 @@ openchat-gemma,2024-06-19 10:19:44,,openchat/openchat-3.5-0106-gemma,,,59.93,eq-
34
  Nous-Hermes-2-SOLAR-10.7B,2024-06-19 10:27:36,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,48.22,eq-bench_v2_pl,169.0,1,transformers, ,,
35
  SOLAR-10.7B-Instruct-v1.0,2024-06-19 10:43:47,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.57,eq-bench_v2_pl,164.0,1,transformers, ,,
36
  Qwen2-7B-Instruct,2024-06-19 10:46:52,,Qwen/Qwen2-7B-Instruct,,,53.08,eq-bench_v2_pl,171.0,1,transformers, ,,
 
37
  Azurro/APT3-275M-Base,2024-06-19 11:36:43,,Azurro/APT3-275M-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
38
  Qwen/Qwen2-0.5B,2024-06-19 11:47:44,,Qwen/Qwen2-0.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,18.0 questions were parseable (min is 83%)
39
  Qwen/Qwen2-0.5B-Instruct,2024-06-19 11:51:21,,Qwen/Qwen2-0.5B-Instruct,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,125.0 questions were parseable (min is 83%)
@@ -141,3 +142,13 @@ mistralai/Mixtral-8x22B-v0.1,2024-06-21 20:20:37,,mistralai/Mixtral-8x22B-v0.1,,
141
  mistralai/Mixtral-8x22B-Instruct-v0.1,2024-06-26 23:40:01,,mistralai/Mixtral-8x22B-Instruct-v0.1,,,67.63,eq-bench_v2_pl,171.0,1,transformers, ,,
142
  mistralai/Mixtral-8x22B-v0.1,2024-06-27 01:17:13,,mistralai/Mixtral-8x22B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,50.0 questions were parseable (min is 83%)
143
  alpindale/WizardLM-2-8x22B,2024-06-27 01:50:42,,alpindale/WizardLM-2-8x22B,,,69.56,eq-bench_v2_pl,171.0,1,transformers, ,,
 
 
 
 
 
 
 
 
 
 
 
34
  Nous-Hermes-2-SOLAR-10.7B,2024-06-19 10:27:36,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,48.22,eq-bench_v2_pl,169.0,1,transformers, ,,
35
  SOLAR-10.7B-Instruct-v1.0,2024-06-19 10:43:47,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.57,eq-bench_v2_pl,164.0,1,transformers, ,,
36
  Qwen2-7B-Instruct,2024-06-19 10:46:52,,Qwen/Qwen2-7B-Instruct,,,53.08,eq-bench_v2_pl,171.0,1,transformers, ,,
37
+ models/gwint2,2024-06-19 11:21:15,,speakleash/Bielik-11B-v2.0-Instruct,,,68.24,eq-bench_v2_pl,171.0,1,transformers, ,,
38
  Azurro/APT3-275M-Base,2024-06-19 11:36:43,,Azurro/APT3-275M-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
39
  Qwen/Qwen2-0.5B,2024-06-19 11:47:44,,Qwen/Qwen2-0.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,18.0 questions were parseable (min is 83%)
40
  Qwen/Qwen2-0.5B-Instruct,2024-06-19 11:51:21,,Qwen/Qwen2-0.5B-Instruct,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,125.0 questions were parseable (min is 83%)
 
142
  mistralai/Mixtral-8x22B-Instruct-v0.1,2024-06-26 23:40:01,,mistralai/Mixtral-8x22B-Instruct-v0.1,,,67.63,eq-bench_v2_pl,171.0,1,transformers, ,,
143
  mistralai/Mixtral-8x22B-v0.1,2024-06-27 01:17:13,,mistralai/Mixtral-8x22B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,50.0 questions were parseable (min is 83%)
144
  alpindale/WizardLM-2-8x22B,2024-06-27 01:50:42,,alpindale/WizardLM-2-8x22B,,,69.56,eq-bench_v2_pl,171.0,1,transformers, ,,
145
+ Bielik_v2.2b,2024-08-24 09:54:33,,speakleash/Bielik-11B-v2.2-Instruct,,,69.05,eq-bench_v2_pl,171.0,1,transformers, ,,
146
+ Bielik_v2.1,2024-08-24 10:07:46,,speakleash/Bielik-11B-v2.1-Instruct,,,66.27,eq-bench_v2_pl,155.0,1,transformers, ,,
147
+ meta-llama/Meta-Llama-3.1-70B-Instruct,2024-08-24 21:24:39,,meta-llama/Meta-Llama-3.1-70B-Instruct,,,FAILED,eq-bench,FAILED,1,transformers, ,,`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
148
+ mistralai/Mistral-Large-Instruct-2407,2024-08-24 21:51:53,,mistralai/Mistral-Large-Instruct-2407,,,78.07,eq-bench_v2_pl,171.0,1,transformers, ,,
149
+ meta-llama/Meta-Llama-3.1-70B-Instruct,2024-08-24 22:23:40,,meta-llama/Meta-Llama-3.1-70B-Instruct,,,72.53,eq-bench_v2_pl,171.0,1,transformers, ,,
150
+ meta-llama/Meta-Llama-3.1-405B-Instruct-FP8,2024-08-25 20:59:04,openai_api,meta-llama/Meta-Llama-3.1-405B-Instruct-FP8,,,77.23,eq-bench_v2_pl,171.0,1,openai,,,
151
+ gpt-3.5-turbo,2024-08-25 21:14:25,openai_api,gpt-3.5-turbo,,,57.7,eq-bench_v2_pl,171.0,1,openai,,,
152
+ gpt-4o-mini-2024-07-18,2024-08-25 21:17:34,openai_api,gpt-4o-mini-2024-07-18,,,71.15,eq-bench_v2_pl,171.0,1,openai,,,
153
+ gpt-4o-2024-08-06,2024-08-25 21:24:35,openai_api,gpt-4o-2024-08-06,,,75.15,eq-bench_v2_pl,171.0,1,openai,,,
154
+ gpt-4-turbo-2024-04-09,2024-08-25 21:31:42,openai_api,gpt-4-turbo-2024-04-09,,,77.77,eq-bench_v2_pl,164.0,1,openai,,,
metadata.json CHANGED
@@ -312,5 +312,11 @@
312
  "microsoft/Phi-3-small-8k-instruct": 7.4,
313
  "ssmits/Falcon2-5.5B-Polish": 5.5,
314
  "alpindale/WizardLM-2-8x22B,max_length=4096": 141,
315
- "dreamgen/WizardLM-2-7B": 7
 
 
 
 
 
 
316
  }
 
312
  "microsoft/Phi-3-small-8k-instruct": 7.4,
313
  "ssmits/Falcon2-5.5B-Polish": 5.5,
314
  "alpindale/WizardLM-2-8x22B,max_length=4096": 141,
315
+ "dreamgen/WizardLM-2-7B": 7,
316
+ "mistralai/Mistral-Large-Instruct-2407": 123,
317
+ "meta-llama/Meta-Llama-3.1-70B-Instruct": 70,
318
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8": 405,
319
+ "speakleash/Bielik-11B-v2.0-Instruct": 11,
320
+ "speakleash/Bielik-11B-v2.2-Instruct": 11,
321
+ "speakleash/Bielik-11B-v2.1-Instruct": 11
322
  }