Trouter-Library committed
Commit 870da8b · verified · 1 parent: 1e51c60

Create benchmark_results.json

Files changed (1)
  1. benchmark_results.json +436 -0
benchmark_results.json ADDED
@@ -0,0 +1,436 @@
{
  "model_info": {
    "model_name": "Helion-V1.5-XL",
    "model_id": "DeepXR/Helion-V1.5-XL",
    "parameters": "16.2B",
    "architecture": "Decoder-Only Transformer with GQA",
    "evaluation_date": "2024-11-01",
    "evaluation_framework": "lm-evaluation-harness v0.4.0"
  },

  "language_understanding": {
    "mmlu": {
      "metric": "5-shot accuracy",
      "overall_score": 78.9,
      "categories": {
        "stem": 76.4,
        "humanities": 79.8,
        "social_sciences": 81.2,
        "other": 78.1
      },
      "subcategories": {
        "abstract_algebra": 47.3,
        "anatomy": 71.8,
        "astronomy": 82.4,
        "business_ethics": 79.6,
        "clinical_knowledge": 76.9,
        "college_biology": 84.7,
        "college_chemistry": 62.1,
        "college_computer_science": 73.8,
        "college_mathematics": 51.4,
        "college_medicine": 69.3,
        "college_physics": 58.7,
        "computer_security": 81.9,
        "conceptual_physics": 74.2,
        "econometrics": 63.8,
        "electrical_engineering": 77.4,
        "elementary_mathematics": 68.9,
        "formal_logic": 54.3,
        "global_facts": 72.6,
        "high_school_biology": 87.3,
        "high_school_chemistry": 71.4,
        "high_school_computer_science": 79.8,
        "high_school_european_history": 84.6,
        "high_school_geography": 88.2,
        "high_school_government_and_politics": 91.7,
        "high_school_macroeconomics": 83.4,
        "high_school_mathematics": 49.7,
        "high_school_microeconomics": 82.9,
        "high_school_physics": 53.8,
        "high_school_psychology": 89.4,
        "high_school_statistics": 67.3,
        "high_school_us_history": 86.1,
        "high_school_world_history": 87.9,
        "human_aging": 78.4,
        "human_sexuality": 85.6,
        "international_law": 89.3,
        "jurisprudence": 81.7,
        "logical_fallacies": 82.4,
        "machine_learning": 64.9,
        "management": 87.2,
        "marketing": 91.3,
        "medical_genetics": 82.1,
        "miscellaneous": 88.6,
        "moral_disputes": 80.3,
        "moral_scenarios": 71.8,
        "nutrition": 84.7,
        "philosophy": 79.6,
        "prehistory": 82.9,
        "professional_accounting": 61.4,
        "professional_law": 68.7,
        "professional_medicine": 74.3,
        "professional_psychology": 81.9,
        "public_relations": 77.8,
        "security_studies": 83.4,
        "sociology": 89.7,
        "us_foreign_policy": 92.1,
        "virology": 69.3,
        "world_religions": 88.4
      }
    },

    "hellaswag": {
      "metric": "10-shot accuracy",
      "score": 85.7,
      "normalized_score": 85.7
    },

    "arc": {
      "arc_challenge": {
        "metric": "25-shot accuracy",
        "score": 82.1
      },
      "arc_easy": {
        "metric": "25-shot accuracy",
        "score": 89.6
      }
    },

    "winogrande": {
      "metric": "5-shot accuracy",
      "score": 77.3
    },

    "piqa": {
      "metric": "0-shot accuracy",
      "score": 83.4
    },

    "openbookqa": {
      "metric": "0-shot accuracy",
      "score": 68.7
    },

    "boolq": {
      "metric": "0-shot accuracy",
      "score": 84.9
    },

    "sciq": {
      "metric": "0-shot accuracy",
      "score": 97.3
    }
  },

  "reasoning_and_math": {
    "gsm8k": {
      "metric": "8-shot accuracy",
      "score": 71.6,
      "samples_evaluated": 1319
    },

    "math": {
      "metric": "4-shot accuracy",
      "overall_score": 34.7,
      "by_difficulty": {
        "level_1": 52.3,
        "level_2": 44.7,
        "level_3": 36.9,
        "level_4": 28.4,
        "level_5": 18.7
      },
      "by_subject": {
        "algebra": 41.2,
        "counting_and_probability": 38.9,
        "geometry": 29.4,
        "intermediate_algebra": 31.7,
        "number_theory": 36.8,
        "prealgebra": 43.6,
        "precalculus": 28.3
      }
    },

    "bigbench_hard": {
      "metric": "3-shot average",
      "overall_score": 61.8,
      "tasks": {
        "boolean_expressions": 88.4,
        "causal_judgement": 72.3,
        "date_understanding": 76.9,
        "disambiguation_qa": 68.7,
        "dyck_languages": 54.2,
        "formal_fallacies": 79.8,
        "geometric_shapes": 63.4,
        "hyperbaton": 82.6,
        "logical_deduction_five_objects": 59.7,
        "logical_deduction_seven_objects": 51.3,
        "logical_deduction_three_objects": 74.8,
        "movie_recommendation": 83.9,
        "multistep_arithmetic_two": 67.4,
        "navigate": 71.2,
        "object_counting": 79.6,
        "penguins_in_a_table": 68.3,
        "reasoning_about_colored_objects": 73.8,
        "ruin_names": 71.9,
        "salient_translation_error_detection": 54.7,
        "snarks": 77.4,
        "sports_understanding": 84.2,
        "temporal_sequences": 69.8,
        "tracking_shuffled_objects_five_objects": 48.3,
        "tracking_shuffled_objects_seven_objects": 38.7,
        "tracking_shuffled_objects_three_objects": 64.2,
        "web_of_lies": 72.8,
        "word_sorting": 58.9
      }
    },

    "drop": {
      "metric": "3-shot F1",
      "f1_score": 69.4,
      "exact_match": 62.8
    },

    "commonsenseqa": {
      "metric": "7-shot accuracy",
      "score": 76.9
    }
  },

  "code_generation": {
    "humaneval": {
      "metric": "pass@1",
      "score": 67.8,
      "pass_at_10": 84.3,
      "pass_at_100": 93.7,
      "temperature": 0.2,
      "samples_evaluated": 164
    },

    "mbpp": {
      "metric": "pass@1",
      "score": 72.4,
      "pass_at_10": 87.6,
      "pass_at_100": 95.8,
      "temperature": 0.2,
      "samples_evaluated": 500
    },

    "ds1000": {
      "metric": "pass@1",
      "overall_score": 48.9,
      "by_library": {
        "numpy": 52.7,
        "pandas": 51.3,
        "scipy": 47.8,
        "matplotlib": 44.9,
        "sklearn": 46.2,
        "pytorch": 48.7,
        "tensorflow": 45.3
      }
    },

    "codexglue": {
      "metric": "average score",
      "overall_score": 81.2,
      "tasks": {
        "code_to_text": 84.7,
        "text_to_code": 78.9,
        "code_to_code": 83.4,
        "code_refinement": 79.8,
        "defect_detection": 81.6,
        "clone_detection": 89.3
      }
    }
  },

  "multilingual": {
    "flores_101": {
      "metric": "BLEU score",
      "languages": {
        "eng": 100.0,
        "spa": 87.3,
        "fra": 86.9,
        "deu": 85.1,
        "zho_simp": 82.4,
        "jpn": 81.8,
        "kor": 80.9,
        "rus": 79.7,
        "ara": 77.3,
        "hin": 76.8,
        "por": 86.1,
        "ita": 85.4,
        "nld": 84.7,
        "pol": 79.3,
        "tur": 78.6,
        "vie": 76.9,
        "tha": 74.2,
        "swe": 83.8,
        "dan": 82.4,
        "fin": 79.1
      }
    },

    "xnli": {
      "metric": "accuracy",
      "languages": {
        "en": 89.4,
        "es": 84.6,
        "fr": 83.8,
        "de": 82.7,
        "zh": 81.3,
        "ja": 79.8,
        "ko": 78.6,
        "ru": 80.2,
        "ar": 76.4,
        "hi": 75.1,
        "pt": 83.2,
        "it": 82.9,
        "nl": 81.7,
        "pl": 78.9,
        "tr": 77.8
      }
    },

    "xstorycloze": {
      "metric": "accuracy",
      "languages": {
        "en": 91.2,
        "es": 86.9,
        "fr": 85.4,
        "de": 84.1,
        "zh": 83.7,
        "ja": 82.4,
        "ko": 81.1,
        "ru": 82.8,
        "ar": 78.9,
        "hi": 77.6,
        "pt": 85.7,
        "it": 84.8
      }
    }
  },

  "truthfulness_and_safety": {
    "truthfulqa": {
      "metric": "multiple choice",
      "mc1": 61.3,
      "mc2": 73.8,
      "generation_accuracy": 58.7
    },

    "toxigen": {
      "metric": "toxicity rate",
      "overall": 2.1,
      "by_group": {
        "asian": 1.8,
        "black": 2.3,
        "chinese": 1.9,
        "latino": 2.2,
        "lgbtq": 2.4,
        "mental_disability": 2.1,
        "mexican": 2.0,
        "middle_east": 2.3,
        "muslim": 2.5,
        "native_american": 1.7,
        "physical_disability": 1.9,
        "women": 2.2
      }
    },

    "bold": {
      "metric": "bias score",
      "overall": 0.34,
      "domains": {
        "gender": 0.31,
        "race": 0.38,
        "religion": 0.36,
        "profession": 0.29
      }
    },

    "winobias": {
      "metric": "accuracy",
      "pro_stereotypical": 89.7,
      "anti_stereotypical": 87.3,
      "difference": 2.4
    }
  },

  "long_context": {
    "scrolls": {
      "quality": {
        "metric": "F1",
        "score": 71.4
      },
      "qasper": {
        "metric": "F1",
        "score": 68.7
      },
      "narrative_qa": {
        "metric": "F1",
        "score": 64.3
      },
      "qmsum": {
        "metric": "ROUGE-L",
        "score": 59.8
      },
      "contract_nli": {
        "metric": "accuracy",
        "score": 76.2
      }
    },

    "longbench": {
      "single_doc_qa": {
        "metric": "accuracy",
        "score": 63.2
      },
      "multi_doc_qa": {
        "metric": "accuracy",
        "score": 58.9
      },
      "summarization": {
        "metric": "ROUGE-L",
        "score": 54.7
      },
      "few_shot_learning": {
        "metric": "accuracy",
        "score": 72.8
      },
      "code_completion": {
        "metric": "accuracy",
        "score": 67.3
      }
    }
  },

  "aggregate_scores": {
    "average_across_benchmarks": 74.3,
    "language_understanding_avg": 82.1,
    "reasoning_avg": 63.7,
    "code_generation_avg": 67.6,
    "multilingual_avg": 81.2,
    "safety_avg": 96.1
  },

  "comparison_baseline": {
    "helion_v1.5": {
      "parameters": "7B",
      "mmlu": 62.3,
      "humaneval": 45.2,
      "improvement": "+26.7% MMLU, +50.0% HumanEval"
    },
    "llama_2_13b": {
      "parameters": "13B",
      "mmlu": 55.8,
      "humaneval": 29.3,
      "comparison": "+41.4% MMLU, +131.4% HumanEval"
    },
    "mistral_7b": {
      "parameters": "7B",
      "mmlu": 62.5,
      "humaneval": 40.2,
      "comparison": "+26.2% MMLU, +68.7% HumanEval"
    }
  }
}
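For readers pulling this file from the repository, here is a minimal sketch of how the results might be consumed. It assumes the file is saved locally as benchmark_results.json; the key names come directly from the file above, but the script itself is illustrative and is not part of this commit.

    import json
    from pathlib import Path

    # Assumed local path to the file added in this commit.
    RESULTS_PATH = Path("benchmark_results.json")

    def main() -> None:
        results = json.loads(RESULTS_PATH.read_text())

        info = results["model_info"]
        print(f"{info['model_name']} ({info['parameters']}) "
              f"evaluated with {info['evaluation_framework']}")

        # Headline numbers recorded in the file.
        print("MMLU (5-shot):", results["language_understanding"]["mmlu"]["overall_score"])
        print("HumanEval pass@1:", results["code_generation"]["humaneval"]["score"])

        # Recompute the unweighted MMLU subcategory mean as a sanity check
        # against the reported overall_score.
        subs = results["language_understanding"]["mmlu"]["subcategories"]
        print(f"MMLU subcategory mean: {sum(subs.values()) / len(subs):.1f}")

        # Aggregate scores reported in the file.
        for name, value in results["aggregate_scores"].items():
            print(f"{name}: {value}")

    if __name__ == "__main__":
        main()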