pminervini commited on
Commit
4d9b54f
1 Parent(s): 355a6d5

Upload folder using huggingface_hub

Browse files
Files changed (22) hide show
  1. .gitattributes +25 -0
  2. meta-llama/Llama-2-13b-chat-hf/results.json +871 -0
  3. meta-llama/Llama-2-13b-chat-hf/results_2023-10-14T19-39-26.636545.json +107 -0
  4. meta-llama/Llama-2-13b-hf/results_2023-08-20T22-26-02.660247.json +871 -0
  5. meta-llama/Llama-2-13b-hf/results_2023-08-29T22-26-02.660247.json +1366 -0
  6. meta-llama/Llama-2-13b-hf/results_2023-09-07T13-43-41.802129.json +61 -0
  7. meta-llama/Llama-2-13b-hf/results_2023-09-07T15-27-15.010124.json +107 -0
  8. meta-llama/Llama-2-13b-hf/results_2023-09-08T14-32-14.957248.json +107 -0
  9. meta-llama/Llama-2-13b-hf/results_2023-10-14T23-00-26.644553.json +107 -0
  10. meta-llama/Llama-2-70b-chat-hf/results.json +868 -0
  11. meta-llama/Llama-2-70b-chat-hf/results_2023-10-17T05-07-42.486452.json +107 -0
  12. meta-llama/Llama-2-70b-hf/results.json +447 -0
  13. meta-llama/Llama-2-70b-hf/results_2023-09-08T23-38-08.931556.json +107 -0
  14. meta-llama/Llama-2-70b-hf/results_2023-09-18T06-46-44.905361.json +107 -0
  15. meta-llama/Llama-2-7b-chat-hf/results.json +871 -0
  16. meta-llama/Llama-2-7b-chat-hf/results_2023-10-15T02-34-15.484281.json +107 -0
  17. meta-llama/Llama-2-7b-hf/results_2023-08-20T17-54-59.197645.json +871 -0
  18. meta-llama/Llama-2-7b-hf/results_2023-08-29T17-54-59.197645.json +1366 -0
  19. meta-llama/Llama-2-7b-hf/results_2023-09-07T13-40-06.600532.json +61 -0
  20. meta-llama/Llama-2-7b-hf/results_2023-09-08T17-00-44.389859.json +107 -0
  21. meta-llama/Llama-2-7b-hf/results_2023-09-09T12-32-30.613622.json +107 -0
  22. meta-llama/Llama-2-7b-hf/results_2023-09-20T14-39-46.791628.json +107 -0
.gitattributes CHANGED
@@ -8,6 +8,7 @@
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
 
11
  *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +34,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
  *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
  *.model filter=lfs diff=lfs merge=lfs -text
14
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
34
  *.zip filter=lfs diff=lfs merge=lfs -text
35
  *.zst filter=lfs diff=lfs merge=lfs -text
36
  *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
56
+ tests/eval_results/public/tiiuae/falcon-7b/main/16bit/original|arc:c:letters_queries.jsonl filter=lfs diff=lfs merge=lfs -text
57
+ tests/eval_results/public/tiiuae/falcon-7b/main/16bit/original|arc:c:options_queries.jsonl filter=lfs diff=lfs merge=lfs -text
58
+ tests/eval_results/public/huggingface/llama-7b/main/16bit/original|mmlu:professional_law_queries.jsonl filter=lfs diff=lfs merge=lfs -text
59
+ fsx/nathan_habib/llm-leaderboard-backend/tests/eval_results/huggingface/llama-7b/main/16bit/tgi/original|arc:c:letters_queries.jsonl filter=lfs diff=lfs merge=lfs -text
60
+ fsx/nathan_habib/llm-leaderboard-backend/tests/eval_results/huggingface/llama-7b/main/16bit/tgi/original|arc:c:options_queries.jsonl filter=lfs diff=lfs merge=lfs -text
meta-llama/Llama-2-13b-chat-hf/results.json ADDED
@@ -0,0 +1,871 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "harness|arc:challenge|25": {
4
+ "acc": 0.5563139931740614,
5
+ "acc_stderr": 0.014518421825670449,
6
+ "acc_norm": 0.590443686006826,
7
+ "acc_norm_stderr": 0.01437035863247244
8
+ },
9
+ "harness|hellaswag|10": {
10
+ "acc": 0.6293567018522207,
11
+ "acc_stderr": 0.004819899945342489,
12
+ "acc_norm": 0.8193586934873531,
13
+ "acc_norm_stderr": 0.0038393444971919545
14
+ },
15
+ "harness|hendrycksTest-abstract_algebra|5": {
16
+ "acc": 0.31,
17
+ "acc_stderr": 0.046482319871173156,
18
+ "acc_norm": 0.31,
19
+ "acc_norm_stderr": 0.046482319871173156
20
+ },
21
+ "harness|hendrycksTest-anatomy|5": {
22
+ "acc": 0.4740740740740741,
23
+ "acc_stderr": 0.04313531696750574,
24
+ "acc_norm": 0.4740740740740741,
25
+ "acc_norm_stderr": 0.04313531696750574
26
+ },
27
+ "harness|hendrycksTest-astronomy|5": {
28
+ "acc": 0.5460526315789473,
29
+ "acc_stderr": 0.04051646342874142,
30
+ "acc_norm": 0.5460526315789473,
31
+ "acc_norm_stderr": 0.04051646342874142
32
+ },
33
+ "harness|hendrycksTest-business_ethics|5": {
34
+ "acc": 0.53,
35
+ "acc_stderr": 0.05016135580465919,
36
+ "acc_norm": 0.53,
37
+ "acc_norm_stderr": 0.05016135580465919
38
+ },
39
+ "harness|hendrycksTest-clinical_knowledge|5": {
40
+ "acc": 0.5849056603773585,
41
+ "acc_stderr": 0.03032594578928611,
42
+ "acc_norm": 0.5849056603773585,
43
+ "acc_norm_stderr": 0.03032594578928611
44
+ },
45
+ "harness|hendrycksTest-college_biology|5": {
46
+ "acc": 0.5833333333333334,
47
+ "acc_stderr": 0.04122728707651282,
48
+ "acc_norm": 0.5833333333333334,
49
+ "acc_norm_stderr": 0.04122728707651282
50
+ },
51
+ "harness|hendrycksTest-college_chemistry|5": {
52
+ "acc": 0.38,
53
+ "acc_stderr": 0.048783173121456316,
54
+ "acc_norm": 0.38,
55
+ "acc_norm_stderr": 0.048783173121456316
56
+ },
57
+ "harness|hendrycksTest-college_computer_science|5": {
58
+ "acc": 0.47,
59
+ "acc_stderr": 0.05016135580465919,
60
+ "acc_norm": 0.47,
61
+ "acc_norm_stderr": 0.05016135580465919
62
+ },
63
+ "harness|hendrycksTest-college_mathematics|5": {
64
+ "acc": 0.32,
65
+ "acc_stderr": 0.046882617226215034,
66
+ "acc_norm": 0.32,
67
+ "acc_norm_stderr": 0.046882617226215034
68
+ },
69
+ "harness|hendrycksTest-college_medicine|5": {
70
+ "acc": 0.4624277456647399,
71
+ "acc_stderr": 0.0380168510452446,
72
+ "acc_norm": 0.4624277456647399,
73
+ "acc_norm_stderr": 0.0380168510452446
74
+ },
75
+ "harness|hendrycksTest-college_physics|5": {
76
+ "acc": 0.3137254901960784,
77
+ "acc_stderr": 0.04617034827006717,
78
+ "acc_norm": 0.3137254901960784,
79
+ "acc_norm_stderr": 0.04617034827006717
80
+ },
81
+ "harness|hendrycksTest-computer_security|5": {
82
+ "acc": 0.68,
83
+ "acc_stderr": 0.04688261722621505,
84
+ "acc_norm": 0.68,
85
+ "acc_norm_stderr": 0.04688261722621505
86
+ },
87
+ "harness|hendrycksTest-conceptual_physics|5": {
88
+ "acc": 0.4,
89
+ "acc_stderr": 0.03202563076101735,
90
+ "acc_norm": 0.4,
91
+ "acc_norm_stderr": 0.03202563076101735
92
+ },
93
+ "harness|hendrycksTest-econometrics|5": {
94
+ "acc": 0.3157894736842105,
95
+ "acc_stderr": 0.043727482902780064,
96
+ "acc_norm": 0.3157894736842105,
97
+ "acc_norm_stderr": 0.043727482902780064
98
+ },
99
+ "harness|hendrycksTest-electrical_engineering|5": {
100
+ "acc": 0.503448275862069,
101
+ "acc_stderr": 0.0416656757710158,
102
+ "acc_norm": 0.503448275862069,
103
+ "acc_norm_stderr": 0.0416656757710158
104
+ },
105
+ "harness|hendrycksTest-elementary_mathematics|5": {
106
+ "acc": 0.3412698412698413,
107
+ "acc_stderr": 0.024419234966819064,
108
+ "acc_norm": 0.3412698412698413,
109
+ "acc_norm_stderr": 0.024419234966819064
110
+ },
111
+ "harness|hendrycksTest-formal_logic|5": {
112
+ "acc": 0.30952380952380953,
113
+ "acc_stderr": 0.04134913018303316,
114
+ "acc_norm": 0.30952380952380953,
115
+ "acc_norm_stderr": 0.04134913018303316
116
+ },
117
+ "harness|hendrycksTest-global_facts|5": {
118
+ "acc": 0.3,
119
+ "acc_stderr": 0.046056618647183814,
120
+ "acc_norm": 0.3,
121
+ "acc_norm_stderr": 0.046056618647183814
122
+ },
123
+ "harness|hendrycksTest-high_school_biology|5": {
124
+ "acc": 0.6419354838709678,
125
+ "acc_stderr": 0.02727389059430064,
126
+ "acc_norm": 0.6419354838709678,
127
+ "acc_norm_stderr": 0.02727389059430064
128
+ },
129
+ "harness|hendrycksTest-high_school_chemistry|5": {
130
+ "acc": 0.4630541871921182,
131
+ "acc_stderr": 0.035083705204426656,
132
+ "acc_norm": 0.4630541871921182,
133
+ "acc_norm_stderr": 0.035083705204426656
134
+ },
135
+ "harness|hendrycksTest-high_school_computer_science|5": {
136
+ "acc": 0.59,
137
+ "acc_stderr": 0.04943110704237102,
138
+ "acc_norm": 0.59,
139
+ "acc_norm_stderr": 0.04943110704237102
140
+ },
141
+ "harness|hendrycksTest-high_school_european_history|5": {
142
+ "acc": 0.6727272727272727,
143
+ "acc_stderr": 0.03663974994391244,
144
+ "acc_norm": 0.6727272727272727,
145
+ "acc_norm_stderr": 0.03663974994391244
146
+ },
147
+ "harness|hendrycksTest-high_school_geography|5": {
148
+ "acc": 0.702020202020202,
149
+ "acc_stderr": 0.03258630383836556,
150
+ "acc_norm": 0.702020202020202,
151
+ "acc_norm_stderr": 0.03258630383836556
152
+ },
153
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
154
+ "acc": 0.7875647668393783,
155
+ "acc_stderr": 0.029519282616817234,
156
+ "acc_norm": 0.7875647668393783,
157
+ "acc_norm_stderr": 0.029519282616817234
158
+ },
159
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
160
+ "acc": 0.49230769230769234,
161
+ "acc_stderr": 0.025348006031534788,
162
+ "acc_norm": 0.49230769230769234,
163
+ "acc_norm_stderr": 0.025348006031534788
164
+ },
165
+ "harness|hendrycksTest-high_school_mathematics|5": {
166
+ "acc": 0.3111111111111111,
167
+ "acc_stderr": 0.028226446749683522,
168
+ "acc_norm": 0.3111111111111111,
169
+ "acc_norm_stderr": 0.028226446749683522
170
+ },
171
+ "harness|hendrycksTest-high_school_microeconomics|5": {
172
+ "acc": 0.5294117647058824,
173
+ "acc_stderr": 0.03242225027115007,
174
+ "acc_norm": 0.5294117647058824,
175
+ "acc_norm_stderr": 0.03242225027115007
176
+ },
177
+ "harness|hendrycksTest-high_school_physics|5": {
178
+ "acc": 0.33774834437086093,
179
+ "acc_stderr": 0.038615575462551684,
180
+ "acc_norm": 0.33774834437086093,
181
+ "acc_norm_stderr": 0.038615575462551684
182
+ },
183
+ "harness|hendrycksTest-high_school_psychology|5": {
184
+ "acc": 0.7321100917431193,
185
+ "acc_stderr": 0.018987462257978652,
186
+ "acc_norm": 0.7321100917431193,
187
+ "acc_norm_stderr": 0.018987462257978652
188
+ },
189
+ "harness|hendrycksTest-high_school_statistics|5": {
190
+ "acc": 0.3888888888888889,
191
+ "acc_stderr": 0.03324708911809117,
192
+ "acc_norm": 0.3888888888888889,
193
+ "acc_norm_stderr": 0.03324708911809117
194
+ },
195
+ "harness|hendrycksTest-high_school_us_history|5": {
196
+ "acc": 0.75,
197
+ "acc_stderr": 0.03039153369274154,
198
+ "acc_norm": 0.75,
199
+ "acc_norm_stderr": 0.03039153369274154
200
+ },
201
+ "harness|hendrycksTest-high_school_world_history|5": {
202
+ "acc": 0.7172995780590717,
203
+ "acc_stderr": 0.02931281415395592,
204
+ "acc_norm": 0.7172995780590717,
205
+ "acc_norm_stderr": 0.02931281415395592
206
+ },
207
+ "harness|hendrycksTest-human_aging|5": {
208
+ "acc": 0.6457399103139013,
209
+ "acc_stderr": 0.032100621541349864,
210
+ "acc_norm": 0.6457399103139013,
211
+ "acc_norm_stderr": 0.032100621541349864
212
+ },
213
+ "harness|hendrycksTest-human_sexuality|5": {
214
+ "acc": 0.6335877862595419,
215
+ "acc_stderr": 0.04225875451969637,
216
+ "acc_norm": 0.6335877862595419,
217
+ "acc_norm_stderr": 0.04225875451969637
218
+ },
219
+ "harness|hendrycksTest-international_law|5": {
220
+ "acc": 0.768595041322314,
221
+ "acc_stderr": 0.03849856098794089,
222
+ "acc_norm": 0.768595041322314,
223
+ "acc_norm_stderr": 0.03849856098794089
224
+ },
225
+ "harness|hendrycksTest-jurisprudence|5": {
226
+ "acc": 0.6944444444444444,
227
+ "acc_stderr": 0.044531975073749834,
228
+ "acc_norm": 0.6944444444444444,
229
+ "acc_norm_stderr": 0.044531975073749834
230
+ },
231
+ "harness|hendrycksTest-logical_fallacies|5": {
232
+ "acc": 0.6503067484662577,
233
+ "acc_stderr": 0.037466683254700206,
234
+ "acc_norm": 0.6503067484662577,
235
+ "acc_norm_stderr": 0.037466683254700206
236
+ },
237
+ "harness|hendrycksTest-machine_learning|5": {
238
+ "acc": 0.35714285714285715,
239
+ "acc_stderr": 0.04547960999764376,
240
+ "acc_norm": 0.35714285714285715,
241
+ "acc_norm_stderr": 0.04547960999764376
242
+ },
243
+ "harness|hendrycksTest-management|5": {
244
+ "acc": 0.7378640776699029,
245
+ "acc_stderr": 0.04354631077260595,
246
+ "acc_norm": 0.7378640776699029,
247
+ "acc_norm_stderr": 0.04354631077260595
248
+ },
249
+ "harness|hendrycksTest-marketing|5": {
250
+ "acc": 0.7863247863247863,
251
+ "acc_stderr": 0.026853450377009175,
252
+ "acc_norm": 0.7863247863247863,
253
+ "acc_norm_stderr": 0.026853450377009175
254
+ },
255
+ "harness|hendrycksTest-medical_genetics|5": {
256
+ "acc": 0.57,
257
+ "acc_stderr": 0.049756985195624284,
258
+ "acc_norm": 0.57,
259
+ "acc_norm_stderr": 0.049756985195624284
260
+ },
261
+ "harness|hendrycksTest-miscellaneous|5": {
262
+ "acc": 0.7471264367816092,
263
+ "acc_stderr": 0.015543377313719681,
264
+ "acc_norm": 0.7471264367816092,
265
+ "acc_norm_stderr": 0.015543377313719681
266
+ },
267
+ "harness|hendrycksTest-moral_disputes|5": {
268
+ "acc": 0.6127167630057804,
269
+ "acc_stderr": 0.026226158605124655,
270
+ "acc_norm": 0.6127167630057804,
271
+ "acc_norm_stderr": 0.026226158605124655
272
+ },
273
+ "harness|hendrycksTest-moral_scenarios|5": {
274
+ "acc": 0.30502793296089387,
275
+ "acc_stderr": 0.015398723510916716,
276
+ "acc_norm": 0.30502793296089387,
277
+ "acc_norm_stderr": 0.015398723510916716
278
+ },
279
+ "harness|hendrycksTest-nutrition|5": {
280
+ "acc": 0.5947712418300654,
281
+ "acc_stderr": 0.028110928492809068,
282
+ "acc_norm": 0.5947712418300654,
283
+ "acc_norm_stderr": 0.028110928492809068
284
+ },
285
+ "harness|hendrycksTest-philosophy|5": {
286
+ "acc": 0.5884244372990354,
287
+ "acc_stderr": 0.02795048149440127,
288
+ "acc_norm": 0.5884244372990354,
289
+ "acc_norm_stderr": 0.02795048149440127
290
+ },
291
+ "harness|hendrycksTest-prehistory|5": {
292
+ "acc": 0.6111111111111112,
293
+ "acc_stderr": 0.02712511551316687,
294
+ "acc_norm": 0.6111111111111112,
295
+ "acc_norm_stderr": 0.02712511551316687
296
+ },
297
+ "harness|hendrycksTest-professional_accounting|5": {
298
+ "acc": 0.38652482269503546,
299
+ "acc_stderr": 0.029049190342543454,
300
+ "acc_norm": 0.38652482269503546,
301
+ "acc_norm_stderr": 0.029049190342543454
302
+ },
303
+ "harness|hendrycksTest-professional_law|5": {
304
+ "acc": 0.39113428943937417,
305
+ "acc_stderr": 0.012463861839982058,
306
+ "acc_norm": 0.39113428943937417,
307
+ "acc_norm_stderr": 0.012463861839982058
308
+ },
309
+ "harness|hendrycksTest-professional_medicine|5": {
310
+ "acc": 0.5,
311
+ "acc_stderr": 0.030372836961539352,
312
+ "acc_norm": 0.5,
313
+ "acc_norm_stderr": 0.030372836961539352
314
+ },
315
+ "harness|hendrycksTest-professional_psychology|5": {
316
+ "acc": 0.5424836601307189,
317
+ "acc_stderr": 0.020154685712590888,
318
+ "acc_norm": 0.5424836601307189,
319
+ "acc_norm_stderr": 0.020154685712590888
320
+ },
321
+ "harness|hendrycksTest-public_relations|5": {
322
+ "acc": 0.6636363636363637,
323
+ "acc_stderr": 0.04525393596302505,
324
+ "acc_norm": 0.6636363636363637,
325
+ "acc_norm_stderr": 0.04525393596302505
326
+ },
327
+ "harness|hendrycksTest-security_studies|5": {
328
+ "acc": 0.6408163265306123,
329
+ "acc_stderr": 0.030713560455108493,
330
+ "acc_norm": 0.6408163265306123,
331
+ "acc_norm_stderr": 0.030713560455108493
332
+ },
333
+ "harness|hendrycksTest-sociology|5": {
334
+ "acc": 0.7512437810945274,
335
+ "acc_stderr": 0.030567675938916714,
336
+ "acc_norm": 0.7512437810945274,
337
+ "acc_norm_stderr": 0.030567675938916714
338
+ },
339
+ "harness|hendrycksTest-us_foreign_policy|5": {
340
+ "acc": 0.81,
341
+ "acc_stderr": 0.03942772444036625,
342
+ "acc_norm": 0.81,
343
+ "acc_norm_stderr": 0.03942772444036625
344
+ },
345
+ "harness|hendrycksTest-virology|5": {
346
+ "acc": 0.4819277108433735,
347
+ "acc_stderr": 0.038899512528272166,
348
+ "acc_norm": 0.4819277108433735,
349
+ "acc_norm_stderr": 0.038899512528272166
350
+ },
351
+ "harness|hendrycksTest-world_religions|5": {
352
+ "acc": 0.7309941520467836,
353
+ "acc_stderr": 0.03401052620104089,
354
+ "acc_norm": 0.7309941520467836,
355
+ "acc_norm_stderr": 0.03401052620104089
356
+ },
357
+ "harness|truthfulqa:mc|0": {
358
+ "mc1": 0.28518971848225216,
359
+ "mc1_stderr": 0.015805827874454895,
360
+ "mc2": 0.4411794590119937,
361
+ "mc2_stderr": 0.015755921757439843
362
+ },
363
+ "all": {
364
+ "acc": 0.5479380524707899,
365
+ "acc_stderr": 0.03451142729909022,
366
+ "acc_norm": 0.5517368945804153,
367
+ "acc_norm_stderr": 0.03449229816957583,
368
+ "mc1": 0.28518971848225216,
369
+ "mc1_stderr": 0.015805827874454895,
370
+ "mc2": 0.4411794590119937,
371
+ "mc2_stderr": 0.015755921757439843
372
+ }
373
+ },
374
+ "versions": {
375
+ "harness|arc:challenge|25": 0,
376
+ "harness|hellaswag|10": 0,
377
+ "harness|hendrycksTest-abstract_algebra|5": 1,
378
+ "harness|hendrycksTest-anatomy|5": 1,
379
+ "harness|hendrycksTest-astronomy|5": 1,
380
+ "harness|hendrycksTest-business_ethics|5": 1,
381
+ "harness|hendrycksTest-clinical_knowledge|5": 1,
382
+ "harness|hendrycksTest-college_biology|5": 1,
383
+ "harness|hendrycksTest-college_chemistry|5": 1,
384
+ "harness|hendrycksTest-college_computer_science|5": 1,
385
+ "harness|hendrycksTest-college_mathematics|5": 1,
386
+ "harness|hendrycksTest-college_medicine|5": 1,
387
+ "harness|hendrycksTest-college_physics|5": 1,
388
+ "harness|hendrycksTest-computer_security|5": 1,
389
+ "harness|hendrycksTest-conceptual_physics|5": 1,
390
+ "harness|hendrycksTest-econometrics|5": 1,
391
+ "harness|hendrycksTest-electrical_engineering|5": 1,
392
+ "harness|hendrycksTest-elementary_mathematics|5": 1,
393
+ "harness|hendrycksTest-formal_logic|5": 1,
394
+ "harness|hendrycksTest-global_facts|5": 1,
395
+ "harness|hendrycksTest-high_school_biology|5": 1,
396
+ "harness|hendrycksTest-high_school_chemistry|5": 1,
397
+ "harness|hendrycksTest-high_school_computer_science|5": 1,
398
+ "harness|hendrycksTest-high_school_european_history|5": 1,
399
+ "harness|hendrycksTest-high_school_geography|5": 1,
400
+ "harness|hendrycksTest-high_school_government_and_politics|5": 1,
401
+ "harness|hendrycksTest-high_school_macroeconomics|5": 1,
402
+ "harness|hendrycksTest-high_school_mathematics|5": 1,
403
+ "harness|hendrycksTest-high_school_microeconomics|5": 1,
404
+ "harness|hendrycksTest-high_school_physics|5": 1,
405
+ "harness|hendrycksTest-high_school_psychology|5": 1,
406
+ "harness|hendrycksTest-high_school_statistics|5": 1,
407
+ "harness|hendrycksTest-high_school_us_history|5": 1,
408
+ "harness|hendrycksTest-high_school_world_history|5": 1,
409
+ "harness|hendrycksTest-human_aging|5": 1,
410
+ "harness|hendrycksTest-human_sexuality|5": 1,
411
+ "harness|hendrycksTest-international_law|5": 1,
412
+ "harness|hendrycksTest-jurisprudence|5": 1,
413
+ "harness|hendrycksTest-logical_fallacies|5": 1,
414
+ "harness|hendrycksTest-machine_learning|5": 1,
415
+ "harness|hendrycksTest-management|5": 1,
416
+ "harness|hendrycksTest-marketing|5": 1,
417
+ "harness|hendrycksTest-medical_genetics|5": 1,
418
+ "harness|hendrycksTest-miscellaneous|5": 1,
419
+ "harness|hendrycksTest-moral_disputes|5": 1,
420
+ "harness|hendrycksTest-moral_scenarios|5": 1,
421
+ "harness|hendrycksTest-nutrition|5": 1,
422
+ "harness|hendrycksTest-philosophy|5": 1,
423
+ "harness|hendrycksTest-prehistory|5": 1,
424
+ "harness|hendrycksTest-professional_accounting|5": 1,
425
+ "harness|hendrycksTest-professional_law|5": 1,
426
+ "harness|hendrycksTest-professional_medicine|5": 1,
427
+ "harness|hendrycksTest-professional_psychology|5": 1,
428
+ "harness|hendrycksTest-public_relations|5": 1,
429
+ "harness|hendrycksTest-security_studies|5": 1,
430
+ "harness|hendrycksTest-sociology|5": 1,
431
+ "harness|hendrycksTest-us_foreign_policy|5": 1,
432
+ "harness|hendrycksTest-virology|5": 1,
433
+ "harness|hendrycksTest-world_religions|5": 1,
434
+ "harness|truthfulqa:mc|0": 1,
435
+ "all": 0
436
+ },
437
+ "config": {
438
+ "model_name": "meta-llama/Llama-2-13b-chat-hf",
439
+ "model_sha": "f848cf15ab9a51ae5735ab28120a9a0773eeb541",
440
+ "model_dtype": "torch.float16",
441
+ "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937",
442
+ "num_few_shot_default": 0,
443
+ "num_fewshot_seeds": 1,
444
+ "override_batch_size": 1,
445
+ "max_samples": null
446
+ },
447
+ "task_config": {
448
+ "harness|arc:challenge": "LM Harness task",
449
+ "harness|hellaswag": "LM Harness task",
450
+ "harness|hendrycksTest-abstract_algebra": "LM Harness task",
451
+ "harness|hendrycksTest-anatomy": "LM Harness task",
452
+ "harness|hendrycksTest-astronomy": "LM Harness task",
453
+ "harness|hendrycksTest-business_ethics": "LM Harness task",
454
+ "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
455
+ "harness|hendrycksTest-college_biology": "LM Harness task",
456
+ "harness|hendrycksTest-college_chemistry": "LM Harness task",
457
+ "harness|hendrycksTest-college_computer_science": "LM Harness task",
458
+ "harness|hendrycksTest-college_mathematics": "LM Harness task",
459
+ "harness|hendrycksTest-college_medicine": "LM Harness task",
460
+ "harness|hendrycksTest-college_physics": "LM Harness task",
461
+ "harness|hendrycksTest-computer_security": "LM Harness task",
462
+ "harness|hendrycksTest-conceptual_physics": "LM Harness task",
463
+ "harness|hendrycksTest-econometrics": "LM Harness task",
464
+ "harness|hendrycksTest-electrical_engineering": "LM Harness task",
465
+ "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
466
+ "harness|hendrycksTest-formal_logic": "LM Harness task",
467
+ "harness|hendrycksTest-global_facts": "LM Harness task",
468
+ "harness|hendrycksTest-high_school_biology": "LM Harness task",
469
+ "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
470
+ "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
471
+ "harness|hendrycksTest-high_school_european_history": "LM Harness task",
472
+ "harness|hendrycksTest-high_school_geography": "LM Harness task",
473
+ "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
474
+ "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
475
+ "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
476
+ "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
477
+ "harness|hendrycksTest-high_school_physics": "LM Harness task",
478
+ "harness|hendrycksTest-high_school_psychology": "LM Harness task",
479
+ "harness|hendrycksTest-high_school_statistics": "LM Harness task",
480
+ "harness|hendrycksTest-high_school_us_history": "LM Harness task",
481
+ "harness|hendrycksTest-high_school_world_history": "LM Harness task",
482
+ "harness|hendrycksTest-human_aging": "LM Harness task",
483
+ "harness|hendrycksTest-human_sexuality": "LM Harness task",
484
+ "harness|hendrycksTest-international_law": "LM Harness task",
485
+ "harness|hendrycksTest-jurisprudence": "LM Harness task",
486
+ "harness|hendrycksTest-logical_fallacies": "LM Harness task",
487
+ "harness|hendrycksTest-machine_learning": "LM Harness task",
488
+ "harness|hendrycksTest-management": "LM Harness task",
489
+ "harness|hendrycksTest-marketing": "LM Harness task",
490
+ "harness|hendrycksTest-medical_genetics": "LM Harness task",
491
+ "harness|hendrycksTest-miscellaneous": "LM Harness task",
492
+ "harness|hendrycksTest-moral_disputes": "LM Harness task",
493
+ "harness|hendrycksTest-moral_scenarios": "LM Harness task",
494
+ "harness|hendrycksTest-nutrition": "LM Harness task",
495
+ "harness|hendrycksTest-philosophy": "LM Harness task",
496
+ "harness|hendrycksTest-prehistory": "LM Harness task",
497
+ "harness|hendrycksTest-professional_accounting": "LM Harness task",
498
+ "harness|hendrycksTest-professional_law": "LM Harness task",
499
+ "harness|hendrycksTest-professional_medicine": "LM Harness task",
500
+ "harness|hendrycksTest-professional_psychology": "LM Harness task",
501
+ "harness|hendrycksTest-public_relations": "LM Harness task",
502
+ "harness|hendrycksTest-security_studies": "LM Harness task",
503
+ "harness|hendrycksTest-sociology": "LM Harness task",
504
+ "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
505
+ "harness|hendrycksTest-virology": "LM Harness task",
506
+ "harness|hendrycksTest-world_religions": "LM Harness task",
507
+ "harness|truthfulqa:mc": "LM Harness task"
508
+ },
509
+ "hashes": {
510
+ "harness|arc:challenge|25": {
511
+ "hash_examples": "fb8c51b1872daeda",
512
+ "hash_full_prompts": "045cbb916e5145c6",
513
+ "hash_input_tokens": "61571bf68d6d89aa",
514
+ "hash_cont_tokens": "8210decc6ff6f7df"
515
+ },
516
+ "harness|hellaswag|10": {
517
+ "hash_examples": "e1768ecb99d7ecf0",
518
+ "hash_full_prompts": "0b4c16983130f84f",
519
+ "hash_input_tokens": "29906669b1c7054a",
520
+ "hash_cont_tokens": "b3b9e9017afa63af"
521
+ },
522
+ "harness|hendrycksTest-abstract_algebra|5": {
523
+ "hash_examples": "280f9f325b40559a",
524
+ "hash_full_prompts": "2f776a367d23aea2",
525
+ "hash_input_tokens": "c54ff61ad0273dd7",
526
+ "hash_cont_tokens": "50421e30bef398f9"
527
+ },
528
+ "harness|hendrycksTest-anatomy|5": {
529
+ "hash_examples": "2f83a4f1cab4ba18",
530
+ "hash_full_prompts": "516f74bef25df620",
531
+ "hash_input_tokens": "be31a1e22aef5f90",
532
+ "hash_cont_tokens": "f11971a765cb609f"
533
+ },
534
+ "harness|hendrycksTest-astronomy|5": {
535
+ "hash_examples": "7d587b908da4d762",
536
+ "hash_full_prompts": "faf4e80f65de93ca",
537
+ "hash_input_tokens": "277a7b1fad566940",
538
+ "hash_cont_tokens": "bf30e5d3f48250cb"
539
+ },
540
+ "harness|hendrycksTest-business_ethics|5": {
541
+ "hash_examples": "33e51740670de686",
542
+ "hash_full_prompts": "db01c3ef8e1479d4",
543
+ "hash_input_tokens": "ba552605bc116de5",
544
+ "hash_cont_tokens": "bc1dd9b2d995eb61"
545
+ },
546
+ "harness|hendrycksTest-clinical_knowledge|5": {
547
+ "hash_examples": "f3366dbe7eefffa4",
548
+ "hash_full_prompts": "49654f71d94b65c3",
549
+ "hash_input_tokens": "428c7563d0b98ab9",
550
+ "hash_cont_tokens": "890a119624b3b935"
551
+ },
552
+ "harness|hendrycksTest-college_biology|5": {
553
+ "hash_examples": "ca2b6753a0193e7f",
554
+ "hash_full_prompts": "2b460b75f1fdfefd",
555
+ "hash_input_tokens": "da036601573942e2",
556
+ "hash_cont_tokens": "875cde3af7a0ee14"
557
+ },
558
+ "harness|hendrycksTest-college_chemistry|5": {
559
+ "hash_examples": "22ff85f1d34f42d1",
560
+ "hash_full_prompts": "242c9be6da583e95",
561
+ "hash_input_tokens": "94e0196d6aded13d",
562
+ "hash_cont_tokens": "50421e30bef398f9"
563
+ },
564
+ "harness|hendrycksTest-college_computer_science|5": {
565
+ "hash_examples": "30318289d717a5cf",
566
+ "hash_full_prompts": "ed2bdb4e87c4b371",
567
+ "hash_input_tokens": "6e4d0f4a8d36690b",
568
+ "hash_cont_tokens": "ffc0fe414cdc4a83"
569
+ },
570
+ "harness|hendrycksTest-college_mathematics|5": {
571
+ "hash_examples": "4944d1f0b6b5d911",
572
+ "hash_full_prompts": "770bc4281c973190",
573
+ "hash_input_tokens": "614054d17109a25d",
574
+ "hash_cont_tokens": "50421e30bef398f9"
575
+ },
576
+ "harness|hendrycksTest-college_medicine|5": {
577
+ "hash_examples": "dd69cc33381275af",
578
+ "hash_full_prompts": "ad2a53e5250ab46e",
579
+ "hash_input_tokens": "1d633b3cc0524ba8",
580
+ "hash_cont_tokens": "1f88b00d41957d82"
581
+ },
582
+ "harness|hendrycksTest-college_physics|5": {
583
+ "hash_examples": "875dd26d22655b0d",
584
+ "hash_full_prompts": "833a0d7b55aed500",
585
+ "hash_input_tokens": "5421d9a1af86cbd4",
586
+ "hash_cont_tokens": "f7b8097afc16a47c"
587
+ },
588
+ "harness|hendrycksTest-computer_security|5": {
589
+ "hash_examples": "006451eedc0ededb",
590
+ "hash_full_prompts": "94034c97e85d8f46",
591
+ "hash_input_tokens": "5e6b70ecb333cf18",
592
+ "hash_cont_tokens": "50421e30bef398f9"
593
+ },
594
+ "harness|hendrycksTest-conceptual_physics|5": {
595
+ "hash_examples": "8874ece872d2ca4c",
596
+ "hash_full_prompts": "e40d15a34640d6fa",
597
+ "hash_input_tokens": "c2ef11a87264ceed",
598
+ "hash_cont_tokens": "aa0e8bc655f2f641"
599
+ },
600
+ "harness|hendrycksTest-econometrics|5": {
601
+ "hash_examples": "64d3623b0bfaa43f",
602
+ "hash_full_prompts": "612f340fae41338d",
603
+ "hash_input_tokens": "ecaccd912a4c3978",
604
+ "hash_cont_tokens": "bfb7e3c3c88313f1"
605
+ },
606
+ "harness|hendrycksTest-electrical_engineering|5": {
607
+ "hash_examples": "e98f51780c674d7e",
608
+ "hash_full_prompts": "10275b312d812ae6",
609
+ "hash_input_tokens": "1590c84291399be8",
610
+ "hash_cont_tokens": "2425a3f084a591ef"
611
+ },
612
+ "harness|hendrycksTest-elementary_mathematics|5": {
613
+ "hash_examples": "fc48208a5ac1c0ce",
614
+ "hash_full_prompts": "5ec274c6c82aca23",
615
+ "hash_input_tokens": "3269597f715b0da1",
616
+ "hash_cont_tokens": "f52691aef15a407b"
617
+ },
618
+ "harness|hendrycksTest-formal_logic|5": {
619
+ "hash_examples": "5a6525665f63ea72",
620
+ "hash_full_prompts": "07b92638c4a6b500",
621
+ "hash_input_tokens": "a2800d20f3ab8d7c",
622
+ "hash_cont_tokens": "f515d598d9c21263"
623
+ },
624
+ "harness|hendrycksTest-global_facts|5": {
625
+ "hash_examples": "371d70d743b2b89b",
626
+ "hash_full_prompts": "332fdee50a1921b4",
627
+ "hash_input_tokens": "94ed44b3772505ad",
628
+ "hash_cont_tokens": "50421e30bef398f9"
629
+ },
630
+ "harness|hendrycksTest-high_school_biology|5": {
631
+ "hash_examples": "a79e1018b1674052",
632
+ "hash_full_prompts": "e624e26ede922561",
633
+ "hash_input_tokens": "24423acb928db768",
634
+ "hash_cont_tokens": "bd85a4156a3613ee"
635
+ },
636
+ "harness|hendrycksTest-high_school_chemistry|5": {
637
+ "hash_examples": "44bfc25c389f0e03",
638
+ "hash_full_prompts": "0e3e5f5d9246482a",
639
+ "hash_input_tokens": "831ff35c474e5cef",
640
+ "hash_cont_tokens": "a95c97af1c14e068"
641
+ },
642
+ "harness|hendrycksTest-high_school_computer_science|5": {
643
+ "hash_examples": "8b8cdb1084f24169",
644
+ "hash_full_prompts": "c00487e67c1813cc",
645
+ "hash_input_tokens": "8c34e0f2bda77358",
646
+ "hash_cont_tokens": "8abfedef914e33c9"
647
+ },
648
+ "harness|hendrycksTest-high_school_european_history|5": {
649
+ "hash_examples": "11cd32d0ef440171",
650
+ "hash_full_prompts": "318f4513c537c6bf",
651
+ "hash_input_tokens": "f1f73dd687da18d7",
652
+ "hash_cont_tokens": "674fc454bdc5ac93"
653
+ },
654
+ "harness|hendrycksTest-high_school_geography|5": {
655
+ "hash_examples": "b60019b9e80b642f",
656
+ "hash_full_prompts": "ee5789fcc1a81b1e",
657
+ "hash_input_tokens": "7c5547c7da5bc793",
658
+ "hash_cont_tokens": "03a5012b916274ea"
659
+ },
660
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
661
+ "hash_examples": "d221ec983d143dc3",
662
+ "hash_full_prompts": "ac42d888e1ce1155",
663
+ "hash_input_tokens": "f62991cb6a496b05",
664
+ "hash_cont_tokens": "a83effb8f76b7d7c"
665
+ },
666
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
667
+ "hash_examples": "59c2915cacfd3fbb",
668
+ "hash_full_prompts": "c6bd9d25158abd0e",
669
+ "hash_input_tokens": "4cef2aff6e3d59ed",
670
+ "hash_cont_tokens": "c583432ad27fcfe0"
671
+ },
672
+ "harness|hendrycksTest-high_school_mathematics|5": {
673
+ "hash_examples": "1f8ac897608de342",
674
+ "hash_full_prompts": "5d88f41fc2d643a8",
675
+ "hash_input_tokens": "6e2577ea4082ed2b",
676
+ "hash_cont_tokens": "24f5dc613660300b"
677
+ },
678
+ "harness|hendrycksTest-high_school_microeconomics|5": {
679
+ "hash_examples": "ead6a0f2f6c83370",
680
+ "hash_full_prompts": "bfc393381298609e",
681
+ "hash_input_tokens": "c5fc9aeb1079c8e4",
682
+ "hash_cont_tokens": "f47f041de50333b9"
683
+ },
684
+ "harness|hendrycksTest-high_school_physics|5": {
685
+ "hash_examples": "c3f2025990afec64",
686
+ "hash_full_prompts": "fc78b4997e436734",
687
+ "hash_input_tokens": "555fc385cffa84ca",
688
+ "hash_cont_tokens": "ba2efcd283e938cc"
689
+ },
690
+ "harness|hendrycksTest-high_school_psychology|5": {
691
+ "hash_examples": "21f8aab618f6d636",
692
+ "hash_full_prompts": "d5c76aa40b9dbc43",
693
+ "hash_input_tokens": "febd23cbf9973b7f",
694
+ "hash_cont_tokens": "942069cd363844d9"
695
+ },
696
+ "harness|hendrycksTest-high_school_statistics|5": {
697
+ "hash_examples": "2386a60a11fc5de3",
698
+ "hash_full_prompts": "4c5c8be5aafac432",
699
+ "hash_input_tokens": "424b02981230ee83",
700
+ "hash_cont_tokens": "955ed42b6f7fa019"
701
+ },
702
+ "harness|hendrycksTest-high_school_us_history|5": {
703
+ "hash_examples": "74961543be40f04f",
704
+ "hash_full_prompts": "5d5ca4840131ba21",
705
+ "hash_input_tokens": "50c9ff438c85a69e",
706
+ "hash_cont_tokens": "cdd0b3dc06d933e5"
707
+ },
708
+ "harness|hendrycksTest-high_school_world_history|5": {
709
+ "hash_examples": "2ad2f6b7198b2234",
710
+ "hash_full_prompts": "11845057459afd72",
711
+ "hash_input_tokens": "054824cc474caef5",
712
+ "hash_cont_tokens": "9a864184946033ac"
713
+ },
714
+ "harness|hendrycksTest-human_aging|5": {
715
+ "hash_examples": "1a7199dc733e779b",
716
+ "hash_full_prompts": "756b9096b8eaf892",
717
+ "hash_input_tokens": "541a75f071dcf579",
718
+ "hash_cont_tokens": "142a4a8a1138a214"
719
+ },
720
+ "harness|hendrycksTest-human_sexuality|5": {
721
+ "hash_examples": "7acb8fdad97f88a6",
722
+ "hash_full_prompts": "731a52ff15b8cfdb",
723
+ "hash_input_tokens": "04269e5c5a257dd9",
724
+ "hash_cont_tokens": "bc54813e809b796d"
725
+ },
726
+ "harness|hendrycksTest-international_law|5": {
727
+ "hash_examples": "1300bfd0dfc59114",
728
+ "hash_full_prompts": "db2aefbff5eec996",
729
+ "hash_input_tokens": "d93ba9d9d38e4397",
730
+ "hash_cont_tokens": "dc45b45fcda18e5d"
731
+ },
732
+ "harness|hendrycksTest-jurisprudence|5": {
733
+ "hash_examples": "083b1e4904c48dc2",
734
+ "hash_full_prompts": "0f89ee3fe03d6a21",
735
+ "hash_input_tokens": "9eeaccd2698b4f5a",
736
+ "hash_cont_tokens": "e3a8cd951b6e3469"
737
+ },
738
+ "harness|hendrycksTest-logical_fallacies|5": {
739
+ "hash_examples": "709128f9926a634c",
740
+ "hash_full_prompts": "98a04b1f8f841069",
741
+ "hash_input_tokens": "b4f08f544f2b7576",
742
+ "hash_cont_tokens": "1e80dbd30f6453d5"
743
+ },
744
+ "harness|hendrycksTest-machine_learning|5": {
745
+ "hash_examples": "88f22a636029ae47",
746
+ "hash_full_prompts": "2e1c8d4b1e0cc921",
747
+ "hash_input_tokens": "900c2a51f1174b9f",
748
+ "hash_cont_tokens": "9b37da7777378ca9"
749
+ },
750
+ "harness|hendrycksTest-management|5": {
751
+ "hash_examples": "8c8a1e07a2151dca",
752
+ "hash_full_prompts": "f51611f514b265b0",
753
+ "hash_input_tokens": "6b36efb4689c6eca",
754
+ "hash_cont_tokens": "a01d6d39a83c4597"
755
+ },
756
+ "harness|hendrycksTest-marketing|5": {
757
+ "hash_examples": "2668953431f91e96",
758
+ "hash_full_prompts": "77562bef997c7650",
759
+ "hash_input_tokens": "2aaac78a0cfed47a",
760
+ "hash_cont_tokens": "6aeaed4d823c98aa"
761
+ },
762
+ "harness|hendrycksTest-medical_genetics|5": {
763
+ "hash_examples": "9c2dda34a2ea4fd2",
764
+ "hash_full_prompts": "202139046daa118f",
765
+ "hash_input_tokens": "886ca823b41c094a",
766
+ "hash_cont_tokens": "50421e30bef398f9"
767
+ },
768
+ "harness|hendrycksTest-miscellaneous|5": {
769
+ "hash_examples": "41adb694024809c2",
770
+ "hash_full_prompts": "bffec9fc237bcf93",
771
+ "hash_input_tokens": "72fd71de7675e7d0",
772
+ "hash_cont_tokens": "9b0ab02a64603081"
773
+ },
774
+ "harness|hendrycksTest-moral_disputes|5": {
775
+ "hash_examples": "3171c13ba3c594c4",
776
+ "hash_full_prompts": "170831fc36f1d59e",
777
+ "hash_input_tokens": "f3ca0dd8e7a1eb09",
778
+ "hash_cont_tokens": "8badf768f7b0467a"
779
+ },
780
+ "harness|hendrycksTest-moral_scenarios|5": {
781
+ "hash_examples": "9873e077e83e0546",
782
+ "hash_full_prompts": "08f4ceba3131a068",
783
+ "hash_input_tokens": "3e793631e951f23c",
784
+ "hash_cont_tokens": "32ae620376b2bbba"
785
+ },
786
+ "harness|hendrycksTest-nutrition|5": {
787
+ "hash_examples": "7db1d8142ec14323",
788
+ "hash_full_prompts": "4c0e68e3586cb453",
789
+ "hash_input_tokens": "59753c2144ea93af",
790
+ "hash_cont_tokens": "3071def75bacc404"
791
+ },
792
+ "harness|hendrycksTest-philosophy|5": {
793
+ "hash_examples": "9b455b7d72811cc8",
794
+ "hash_full_prompts": "e467f822d8a0d3ff",
795
+ "hash_input_tokens": "bd8d3dbed15a8c34",
796
+ "hash_cont_tokens": "9f6ff69d23a48783"
797
+ },
798
+ "harness|hendrycksTest-prehistory|5": {
799
+ "hash_examples": "8be90d0f538f1560",
800
+ "hash_full_prompts": "152187949bcd0921",
801
+ "hash_input_tokens": "3573cd87facbb7c5",
802
+ "hash_cont_tokens": "de469d2b981e32a3"
803
+ },
804
+ "harness|hendrycksTest-professional_accounting|5": {
805
+ "hash_examples": "8d377597916cd07e",
806
+ "hash_full_prompts": "0eb7345d6144ee0d",
807
+ "hash_input_tokens": "17e721bc1a7cbb47",
808
+ "hash_cont_tokens": "c46f74d2dfc7b13b"
809
+ },
810
+ "harness|hendrycksTest-professional_law|5": {
811
+ "hash_examples": "cd9dbc52b3c932d6",
812
+ "hash_full_prompts": "36ac764272bfb182",
813
+ "hash_input_tokens": "9178e10bd0763ec4",
814
+ "hash_cont_tokens": "2e590029ef41fbcd"
815
+ },
816
+ "harness|hendrycksTest-professional_medicine|5": {
817
+ "hash_examples": "b20e4e816c1e383e",
818
+ "hash_full_prompts": "7b8d69ea2acaf2f7",
819
+ "hash_input_tokens": "f5a22012a54f70ea",
820
+ "hash_cont_tokens": "fe35cfa9c6ca802e"
821
+ },
822
+ "harness|hendrycksTest-professional_psychology|5": {
823
+ "hash_examples": "d45b73b22f9cc039",
824
+ "hash_full_prompts": "fe8937e9ffc99771",
825
+ "hash_input_tokens": "0dfb73a8eb3f692c",
826
+ "hash_cont_tokens": "f020fbddf72c8652"
827
+ },
828
+ "harness|hendrycksTest-public_relations|5": {
829
+ "hash_examples": "0d25072e1761652a",
830
+ "hash_full_prompts": "f9adc39cfa9f42ba",
831
+ "hash_input_tokens": "1710c6ba4c9f3cbd",
832
+ "hash_cont_tokens": "568f585a259965c1"
833
+ },
834
+ "harness|hendrycksTest-security_studies|5": {
835
+ "hash_examples": "62bb8197e63d60d4",
836
+ "hash_full_prompts": "869c9c3ae196b7c3",
837
+ "hash_input_tokens": "d49711415961ced7",
838
+ "hash_cont_tokens": "cc6fd7cccd64cd5d"
839
+ },
840
+ "harness|hendrycksTest-sociology|5": {
841
+ "hash_examples": "e7959df87dea8672",
842
+ "hash_full_prompts": "1a1fc00e17b3a52a",
843
+ "hash_input_tokens": "828999f7624cbe7e",
844
+ "hash_cont_tokens": "c3a3bdfd177eed5b"
845
+ },
846
+ "harness|hendrycksTest-us_foreign_policy|5": {
847
+ "hash_examples": "4a56a01ddca44dca",
848
+ "hash_full_prompts": "0c7a7081c71c07b6",
849
+ "hash_input_tokens": "42054621e718dbee",
850
+ "hash_cont_tokens": "2568d0e8e36fa959"
851
+ },
852
+ "harness|hendrycksTest-virology|5": {
853
+ "hash_examples": "451cc86a8c4f4fe9",
854
+ "hash_full_prompts": "01e95325d8b738e4",
855
+ "hash_input_tokens": "6c4f0aa4dc859c04",
856
+ "hash_cont_tokens": "926cf60b0891f374"
857
+ },
858
+ "harness|hendrycksTest-world_religions|5": {
859
+ "hash_examples": "3b29cfaf1a81c379",
860
+ "hash_full_prompts": "e0d79a15083dfdff",
861
+ "hash_input_tokens": "6c75d44e092ff24f",
862
+ "hash_cont_tokens": "c525a5de974c1ea3"
863
+ },
864
+ "harness|truthfulqa:mc|0": {
865
+ "hash_examples": "23176c0531c7b867",
866
+ "hash_full_prompts": "36a6d90e75d92d4a",
867
+ "hash_input_tokens": "2738d7ed7075faa7",
868
+ "hash_cont_tokens": "c014154380b74b9e"
869
+ }
870
+ }
871
+ }
meta-llama/Llama-2-13b-chat-hf/results_2023-10-14T19-39-26.636545.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "model_name": "meta-llama/Llama-2-13b-chat-hf",
4
+ "model_sha": "13f8d72c0456c17e41b3d8b4327259125cd0defa",
5
+ "model_size": "24.32 GB",
6
+ "model_dtype": "torch.float16",
7
+ "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
8
+ "num_few_shot_default": 0,
9
+ "num_fewshot_seeds": 1,
10
+ "override_batch_size": 1,
11
+ "max_samples": null,
12
+ "job_id": ""
13
+ },
14
+ "results": {
15
+ "harness|drop|3": {
16
+ "em": 0.1782718120805369,
17
+ "em_stderr": 0.003919630092588375,
18
+ "f1": 0.2387195889261742,
19
+ "f1_stderr": 0.003944947017182046
20
+ },
21
+ "harness|gsm8k|5": {
22
+ "acc": 0.15238817285822592,
23
+ "acc_stderr": 0.009899572254794204
24
+ },
25
+ "harness|winogrande|5": {
26
+ "acc": 0.745067087608524,
27
+ "acc_stderr": 0.012248806969376422
28
+ },
29
+ "all": {
30
+ "em": 0.1782718120805369,
31
+ "em_stderr": 0.003919630092588375,
32
+ "f1": 0.2387195889261742,
33
+ "f1_stderr": 0.003944947017182046,
34
+ "acc": 0.448727630233375,
35
+ "acc_stderr": 0.011074189612085313
36
+ }
37
+ },
38
+ "versions": {
39
+ "harness|drop|3": 1,
40
+ "harness|gsm8k|5": 0,
41
+ "harness|winogrande|5": 0,
42
+ "all": 0
43
+ },
44
+ "config_tasks": {
45
+ "harness|drop": "LM Harness task",
46
+ "harness|gsm8k": "LM Harness task",
47
+ "harness|winogrande": "LM Harness task"
48
+ },
49
+ "summary_tasks": {
50
+ "harness|drop|3": {
51
+ "hashes": {
52
+ "hash_examples": "1d27416e8324e9a3",
53
+ "hash_full_prompts": "a5513ff9a741b385",
54
+ "hash_input_tokens": "42076f0efbb50aa6",
55
+ "hash_cont_tokens": "eaa0f770b728538e"
56
+ },
57
+ "truncated": 3,
58
+ "non-truncated": 9533,
59
+ "padded": 0,
60
+ "non-padded": 9536,
61
+ "effective_few_shots": 3.0,
62
+ "num_truncated_few_shots": 0
63
+ },
64
+ "harness|gsm8k|5": {
65
+ "hashes": {
66
+ "hash_examples": "4c0843a5d99bcfdc",
67
+ "hash_full_prompts": "41d55e83abc0e02d",
68
+ "hash_input_tokens": "bda342e47b5099b2",
69
+ "hash_cont_tokens": "9956899ac09638ce"
70
+ },
71
+ "truncated": 0,
72
+ "non-truncated": 1319,
73
+ "padded": 0,
74
+ "non-padded": 1319,
75
+ "effective_few_shots": 5.0,
76
+ "num_truncated_few_shots": 0
77
+ },
78
+ "harness|winogrande|5": {
79
+ "hashes": {
80
+ "hash_examples": "aada0a176fd81218",
81
+ "hash_full_prompts": "c8655cbd12de8409",
82
+ "hash_input_tokens": "c0bedf98cb040854",
83
+ "hash_cont_tokens": "f08975ad6f2d5864"
84
+ },
85
+ "truncated": 0,
86
+ "non-truncated": 2534,
87
+ "padded": 2432,
88
+ "non-padded": 102,
89
+ "effective_few_shots": 5.0,
90
+ "num_truncated_few_shots": 0
91
+ }
92
+ },
93
+ "summary_general": {
94
+ "hashes": {
95
+ "hash_examples": "9b4d8993161e637d",
96
+ "hash_full_prompts": "08215e527b7e60a5",
97
+ "hash_input_tokens": "a12f3e3c934bd78b",
98
+ "hash_cont_tokens": "8197d42f9e3e7f68"
99
+ },
100
+ "total_evaluation_time_secondes": "10933.773993730545",
101
+ "truncated": 3,
102
+ "non-truncated": 13386,
103
+ "padded": 2432,
104
+ "non-padded": 10957,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ }
meta-llama/Llama-2-13b-hf/results_2023-08-20T22-26-02.660247.json ADDED
@@ -0,0 +1,871 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "harness|arc:challenge|25": {
4
+ "acc": 0.5486348122866894,
5
+ "acc_stderr": 0.014542104569955265,
6
+ "acc_norm": 0.5938566552901023,
7
+ "acc_norm_stderr": 0.014351656690097862
8
+ },
9
+ "harness|hellaswag|10": {
10
+ "acc": 0.614618601872137,
11
+ "acc_stderr": 0.004856906473719381,
12
+ "acc_norm": 0.8212507468631747,
13
+ "acc_norm_stderr": 0.003823591814133036
14
+ },
15
+ "harness|hendrycksTest-abstract_algebra|5": {
16
+ "acc": 0.35,
17
+ "acc_stderr": 0.04793724854411022,
18
+ "acc_norm": 0.35,
19
+ "acc_norm_stderr": 0.04793724854411022
20
+ },
21
+ "harness|hendrycksTest-anatomy|5": {
22
+ "acc": 0.4666666666666667,
23
+ "acc_stderr": 0.043097329010363554,
24
+ "acc_norm": 0.4666666666666667,
25
+ "acc_norm_stderr": 0.043097329010363554
26
+ },
27
+ "harness|hendrycksTest-astronomy|5": {
28
+ "acc": 0.5263157894736842,
29
+ "acc_stderr": 0.04063302731486671,
30
+ "acc_norm": 0.5263157894736842,
31
+ "acc_norm_stderr": 0.04063302731486671
32
+ },
33
+ "harness|hendrycksTest-business_ethics|5": {
34
+ "acc": 0.55,
35
+ "acc_stderr": 0.049999999999999996,
36
+ "acc_norm": 0.55,
37
+ "acc_norm_stderr": 0.049999999999999996
38
+ },
39
+ "harness|hendrycksTest-clinical_knowledge|5": {
40
+ "acc": 0.6037735849056604,
41
+ "acc_stderr": 0.030102793781791197,
42
+ "acc_norm": 0.6037735849056604,
43
+ "acc_norm_stderr": 0.030102793781791197
44
+ },
45
+ "harness|hendrycksTest-college_biology|5": {
46
+ "acc": 0.6180555555555556,
47
+ "acc_stderr": 0.040629907841466674,
48
+ "acc_norm": 0.6180555555555556,
49
+ "acc_norm_stderr": 0.040629907841466674
50
+ },
51
+ "harness|hendrycksTest-college_chemistry|5": {
52
+ "acc": 0.44,
53
+ "acc_stderr": 0.04988876515698589,
54
+ "acc_norm": 0.44,
55
+ "acc_norm_stderr": 0.04988876515698589
56
+ },
57
+ "harness|hendrycksTest-college_computer_science|5": {
58
+ "acc": 0.47,
59
+ "acc_stderr": 0.05016135580465919,
60
+ "acc_norm": 0.47,
61
+ "acc_norm_stderr": 0.05016135580465919
62
+ },
63
+ "harness|hendrycksTest-college_mathematics|5": {
64
+ "acc": 0.31,
65
+ "acc_stderr": 0.04648231987117316,
66
+ "acc_norm": 0.31,
67
+ "acc_norm_stderr": 0.04648231987117316
68
+ },
69
+ "harness|hendrycksTest-college_medicine|5": {
70
+ "acc": 0.5317919075144508,
71
+ "acc_stderr": 0.03804749744364764,
72
+ "acc_norm": 0.5317919075144508,
73
+ "acc_norm_stderr": 0.03804749744364764
74
+ },
75
+ "harness|hendrycksTest-college_physics|5": {
76
+ "acc": 0.24509803921568626,
77
+ "acc_stderr": 0.04280105837364395,
78
+ "acc_norm": 0.24509803921568626,
79
+ "acc_norm_stderr": 0.04280105837364395
80
+ },
81
+ "harness|hendrycksTest-computer_security|5": {
82
+ "acc": 0.7,
83
+ "acc_stderr": 0.046056618647183814,
84
+ "acc_norm": 0.7,
85
+ "acc_norm_stderr": 0.046056618647183814
86
+ },
87
+ "harness|hendrycksTest-conceptual_physics|5": {
88
+ "acc": 0.425531914893617,
89
+ "acc_stderr": 0.032321469162244675,
90
+ "acc_norm": 0.425531914893617,
91
+ "acc_norm_stderr": 0.032321469162244675
92
+ },
93
+ "harness|hendrycksTest-econometrics|5": {
94
+ "acc": 0.32456140350877194,
95
+ "acc_stderr": 0.04404556157374768,
96
+ "acc_norm": 0.32456140350877194,
97
+ "acc_norm_stderr": 0.04404556157374768
98
+ },
99
+ "harness|hendrycksTest-electrical_engineering|5": {
100
+ "acc": 0.503448275862069,
101
+ "acc_stderr": 0.04166567577101579,
102
+ "acc_norm": 0.503448275862069,
103
+ "acc_norm_stderr": 0.04166567577101579
104
+ },
105
+ "harness|hendrycksTest-elementary_mathematics|5": {
106
+ "acc": 0.3386243386243386,
107
+ "acc_stderr": 0.02437319786798306,
108
+ "acc_norm": 0.3386243386243386,
109
+ "acc_norm_stderr": 0.02437319786798306
110
+ },
111
+ "harness|hendrycksTest-formal_logic|5": {
112
+ "acc": 0.3888888888888889,
113
+ "acc_stderr": 0.04360314860077459,
114
+ "acc_norm": 0.3888888888888889,
115
+ "acc_norm_stderr": 0.04360314860077459
116
+ },
117
+ "harness|hendrycksTest-global_facts|5": {
118
+ "acc": 0.33,
119
+ "acc_stderr": 0.04725815626252604,
120
+ "acc_norm": 0.33,
121
+ "acc_norm_stderr": 0.04725815626252604
122
+ },
123
+ "harness|hendrycksTest-high_school_biology|5": {
124
+ "acc": 0.6741935483870968,
125
+ "acc_stderr": 0.026662010578567107,
126
+ "acc_norm": 0.6741935483870968,
127
+ "acc_norm_stderr": 0.026662010578567107
128
+ },
129
+ "harness|hendrycksTest-high_school_chemistry|5": {
130
+ "acc": 0.4482758620689655,
131
+ "acc_stderr": 0.034991131376767445,
132
+ "acc_norm": 0.4482758620689655,
133
+ "acc_norm_stderr": 0.034991131376767445
134
+ },
135
+ "harness|hendrycksTest-high_school_computer_science|5": {
136
+ "acc": 0.57,
137
+ "acc_stderr": 0.04975698519562427,
138
+ "acc_norm": 0.57,
139
+ "acc_norm_stderr": 0.04975698519562427
140
+ },
141
+ "harness|hendrycksTest-high_school_european_history|5": {
142
+ "acc": 0.6484848484848484,
143
+ "acc_stderr": 0.037282069986826503,
144
+ "acc_norm": 0.6484848484848484,
145
+ "acc_norm_stderr": 0.037282069986826503
146
+ },
147
+ "harness|hendrycksTest-high_school_geography|5": {
148
+ "acc": 0.6919191919191919,
149
+ "acc_stderr": 0.032894773300986155,
150
+ "acc_norm": 0.6919191919191919,
151
+ "acc_norm_stderr": 0.032894773300986155
152
+ },
153
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
154
+ "acc": 0.8186528497409327,
155
+ "acc_stderr": 0.02780703236068609,
156
+ "acc_norm": 0.8186528497409327,
157
+ "acc_norm_stderr": 0.02780703236068609
158
+ },
159
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
160
+ "acc": 0.5102564102564102,
161
+ "acc_stderr": 0.025345672221942374,
162
+ "acc_norm": 0.5102564102564102,
163
+ "acc_norm_stderr": 0.025345672221942374
164
+ },
165
+ "harness|hendrycksTest-high_school_mathematics|5": {
166
+ "acc": 0.2777777777777778,
167
+ "acc_stderr": 0.02730914058823018,
168
+ "acc_norm": 0.2777777777777778,
169
+ "acc_norm_stderr": 0.02730914058823018
170
+ },
171
+ "harness|hendrycksTest-high_school_microeconomics|5": {
172
+ "acc": 0.5840336134453782,
173
+ "acc_stderr": 0.032016501007396114,
174
+ "acc_norm": 0.5840336134453782,
175
+ "acc_norm_stderr": 0.032016501007396114
176
+ },
177
+ "harness|hendrycksTest-high_school_physics|5": {
178
+ "acc": 0.36423841059602646,
179
+ "acc_stderr": 0.03929111781242742,
180
+ "acc_norm": 0.36423841059602646,
181
+ "acc_norm_stderr": 0.03929111781242742
182
+ },
183
+ "harness|hendrycksTest-high_school_psychology|5": {
184
+ "acc": 0.7614678899082569,
185
+ "acc_stderr": 0.01827257581023187,
186
+ "acc_norm": 0.7614678899082569,
187
+ "acc_norm_stderr": 0.01827257581023187
188
+ },
189
+ "harness|hendrycksTest-high_school_statistics|5": {
190
+ "acc": 0.4398148148148148,
191
+ "acc_stderr": 0.03385177976044812,
192
+ "acc_norm": 0.4398148148148148,
193
+ "acc_norm_stderr": 0.03385177976044812
194
+ },
195
+ "harness|hendrycksTest-high_school_us_history|5": {
196
+ "acc": 0.7450980392156863,
197
+ "acc_stderr": 0.030587591351604246,
198
+ "acc_norm": 0.7450980392156863,
199
+ "acc_norm_stderr": 0.030587591351604246
200
+ },
201
+ "harness|hendrycksTest-high_school_world_history|5": {
202
+ "acc": 0.7215189873417721,
203
+ "acc_stderr": 0.029178682304842538,
204
+ "acc_norm": 0.7215189873417721,
205
+ "acc_norm_stderr": 0.029178682304842538
206
+ },
207
+ "harness|hendrycksTest-human_aging|5": {
208
+ "acc": 0.6367713004484304,
209
+ "acc_stderr": 0.03227790442850499,
210
+ "acc_norm": 0.6367713004484304,
211
+ "acc_norm_stderr": 0.03227790442850499
212
+ },
213
+ "harness|hendrycksTest-human_sexuality|5": {
214
+ "acc": 0.6183206106870229,
215
+ "acc_stderr": 0.04260735157644559,
216
+ "acc_norm": 0.6183206106870229,
217
+ "acc_norm_stderr": 0.04260735157644559
218
+ },
219
+ "harness|hendrycksTest-international_law|5": {
220
+ "acc": 0.743801652892562,
221
+ "acc_stderr": 0.03984979653302873,
222
+ "acc_norm": 0.743801652892562,
223
+ "acc_norm_stderr": 0.03984979653302873
224
+ },
225
+ "harness|hendrycksTest-jurisprudence|5": {
226
+ "acc": 0.7407407407407407,
227
+ "acc_stderr": 0.04236511258094633,
228
+ "acc_norm": 0.7407407407407407,
229
+ "acc_norm_stderr": 0.04236511258094633
230
+ },
231
+ "harness|hendrycksTest-logical_fallacies|5": {
232
+ "acc": 0.6687116564417178,
233
+ "acc_stderr": 0.03697983910025588,
234
+ "acc_norm": 0.6687116564417178,
235
+ "acc_norm_stderr": 0.03697983910025588
236
+ },
237
+ "harness|hendrycksTest-machine_learning|5": {
238
+ "acc": 0.2857142857142857,
239
+ "acc_stderr": 0.042878587513404565,
240
+ "acc_norm": 0.2857142857142857,
241
+ "acc_norm_stderr": 0.042878587513404565
242
+ },
243
+ "harness|hendrycksTest-management|5": {
244
+ "acc": 0.7378640776699029,
245
+ "acc_stderr": 0.04354631077260595,
246
+ "acc_norm": 0.7378640776699029,
247
+ "acc_norm_stderr": 0.04354631077260595
248
+ },
249
+ "harness|hendrycksTest-marketing|5": {
250
+ "acc": 0.7948717948717948,
251
+ "acc_stderr": 0.02645350805404032,
252
+ "acc_norm": 0.7948717948717948,
253
+ "acc_norm_stderr": 0.02645350805404032
254
+ },
255
+ "harness|hendrycksTest-medical_genetics|5": {
256
+ "acc": 0.55,
257
+ "acc_stderr": 0.04999999999999999,
258
+ "acc_norm": 0.55,
259
+ "acc_norm_stderr": 0.04999999999999999
260
+ },
261
+ "harness|hendrycksTest-miscellaneous|5": {
262
+ "acc": 0.7471264367816092,
263
+ "acc_stderr": 0.015543377313719681,
264
+ "acc_norm": 0.7471264367816092,
265
+ "acc_norm_stderr": 0.015543377313719681
266
+ },
267
+ "harness|hendrycksTest-moral_disputes|5": {
268
+ "acc": 0.6473988439306358,
269
+ "acc_stderr": 0.025722802200895803,
270
+ "acc_norm": 0.6473988439306358,
271
+ "acc_norm_stderr": 0.025722802200895803
272
+ },
273
+ "harness|hendrycksTest-moral_scenarios|5": {
274
+ "acc": 0.39776536312849164,
275
+ "acc_stderr": 0.01636920497126298,
276
+ "acc_norm": 0.39776536312849164,
277
+ "acc_norm_stderr": 0.01636920497126298
278
+ },
279
+ "harness|hendrycksTest-nutrition|5": {
280
+ "acc": 0.6241830065359477,
281
+ "acc_stderr": 0.027732834353363947,
282
+ "acc_norm": 0.6241830065359477,
283
+ "acc_norm_stderr": 0.027732834353363947
284
+ },
285
+ "harness|hendrycksTest-philosophy|5": {
286
+ "acc": 0.6463022508038585,
287
+ "acc_stderr": 0.02715520810320086,
288
+ "acc_norm": 0.6463022508038585,
289
+ "acc_norm_stderr": 0.02715520810320086
290
+ },
291
+ "harness|hendrycksTest-prehistory|5": {
292
+ "acc": 0.6512345679012346,
293
+ "acc_stderr": 0.026517597724465013,
294
+ "acc_norm": 0.6512345679012346,
295
+ "acc_norm_stderr": 0.026517597724465013
296
+ },
297
+ "harness|hendrycksTest-professional_accounting|5": {
298
+ "acc": 0.3900709219858156,
299
+ "acc_stderr": 0.029097675599463926,
300
+ "acc_norm": 0.3900709219858156,
301
+ "acc_norm_stderr": 0.029097675599463926
302
+ },
303
+ "harness|hendrycksTest-professional_law|5": {
304
+ "acc": 0.424380704041721,
305
+ "acc_stderr": 0.012623343757430018,
306
+ "acc_norm": 0.424380704041721,
307
+ "acc_norm_stderr": 0.012623343757430018
308
+ },
309
+ "harness|hendrycksTest-professional_medicine|5": {
310
+ "acc": 0.5404411764705882,
311
+ "acc_stderr": 0.030273325077345755,
312
+ "acc_norm": 0.5404411764705882,
313
+ "acc_norm_stderr": 0.030273325077345755
314
+ },
315
+ "harness|hendrycksTest-professional_psychology|5": {
316
+ "acc": 0.5490196078431373,
317
+ "acc_stderr": 0.020130388312904528,
318
+ "acc_norm": 0.5490196078431373,
319
+ "acc_norm_stderr": 0.020130388312904528
320
+ },
321
+ "harness|hendrycksTest-public_relations|5": {
322
+ "acc": 0.6090909090909091,
323
+ "acc_stderr": 0.04673752333670239,
324
+ "acc_norm": 0.6090909090909091,
325
+ "acc_norm_stderr": 0.04673752333670239
326
+ },
327
+ "harness|hendrycksTest-security_studies|5": {
328
+ "acc": 0.636734693877551,
329
+ "acc_stderr": 0.030789051139030806,
330
+ "acc_norm": 0.636734693877551,
331
+ "acc_norm_stderr": 0.030789051139030806
332
+ },
333
+ "harness|hendrycksTest-sociology|5": {
334
+ "acc": 0.7263681592039801,
335
+ "acc_stderr": 0.031524391865554016,
336
+ "acc_norm": 0.7263681592039801,
337
+ "acc_norm_stderr": 0.031524391865554016
338
+ },
339
+ "harness|hendrycksTest-us_foreign_policy|5": {
340
+ "acc": 0.82,
341
+ "acc_stderr": 0.038612291966536934,
342
+ "acc_norm": 0.82,
343
+ "acc_norm_stderr": 0.038612291966536934
344
+ },
345
+ "harness|hendrycksTest-virology|5": {
346
+ "acc": 0.463855421686747,
347
+ "acc_stderr": 0.03882310850890593,
348
+ "acc_norm": 0.463855421686747,
349
+ "acc_norm_stderr": 0.03882310850890593
350
+ },
351
+ "harness|hendrycksTest-world_religions|5": {
352
+ "acc": 0.7602339181286549,
353
+ "acc_stderr": 0.03274485211946956,
354
+ "acc_norm": 0.7602339181286549,
355
+ "acc_norm_stderr": 0.03274485211946956
356
+ },
357
+ "harness|truthfulqa:mc|0": {
358
+ "mc1": 0.26805385556915545,
359
+ "mc1_stderr": 0.01550620472283456,
360
+ "mc2": 0.37375264473944586,
361
+ "mc2_stderr": 0.01368799302217441
362
+ },
363
+ "all": {
364
+ "acc": 0.5585210868491985,
365
+ "acc_stderr": 0.03442553546843938,
366
+ "acc_norm": 0.5627897985101215,
367
+ "acc_norm_stderr": 0.034404793730482705,
368
+ "mc1": 0.26805385556915545,
369
+ "mc1_stderr": 0.01550620472283456,
370
+ "mc2": 0.37375264473944586,
371
+ "mc2_stderr": 0.01368799302217441
372
+ }
373
+ },
374
+ "versions": {
375
+ "harness|arc:challenge|25": 0,
376
+ "harness|hellaswag|10": 0,
377
+ "harness|hendrycksTest-abstract_algebra|5": 1,
378
+ "harness|hendrycksTest-anatomy|5": 1,
379
+ "harness|hendrycksTest-astronomy|5": 1,
380
+ "harness|hendrycksTest-business_ethics|5": 1,
381
+ "harness|hendrycksTest-clinical_knowledge|5": 1,
382
+ "harness|hendrycksTest-college_biology|5": 1,
383
+ "harness|hendrycksTest-college_chemistry|5": 1,
384
+ "harness|hendrycksTest-college_computer_science|5": 1,
385
+ "harness|hendrycksTest-college_mathematics|5": 1,
386
+ "harness|hendrycksTest-college_medicine|5": 1,
387
+ "harness|hendrycksTest-college_physics|5": 1,
388
+ "harness|hendrycksTest-computer_security|5": 1,
389
+ "harness|hendrycksTest-conceptual_physics|5": 1,
390
+ "harness|hendrycksTest-econometrics|5": 1,
391
+ "harness|hendrycksTest-electrical_engineering|5": 1,
392
+ "harness|hendrycksTest-elementary_mathematics|5": 1,
393
+ "harness|hendrycksTest-formal_logic|5": 1,
394
+ "harness|hendrycksTest-global_facts|5": 1,
395
+ "harness|hendrycksTest-high_school_biology|5": 1,
396
+ "harness|hendrycksTest-high_school_chemistry|5": 1,
397
+ "harness|hendrycksTest-high_school_computer_science|5": 1,
398
+ "harness|hendrycksTest-high_school_european_history|5": 1,
399
+ "harness|hendrycksTest-high_school_geography|5": 1,
400
+ "harness|hendrycksTest-high_school_government_and_politics|5": 1,
401
+ "harness|hendrycksTest-high_school_macroeconomics|5": 1,
402
+ "harness|hendrycksTest-high_school_mathematics|5": 1,
403
+ "harness|hendrycksTest-high_school_microeconomics|5": 1,
404
+ "harness|hendrycksTest-high_school_physics|5": 1,
405
+ "harness|hendrycksTest-high_school_psychology|5": 1,
406
+ "harness|hendrycksTest-high_school_statistics|5": 1,
407
+ "harness|hendrycksTest-high_school_us_history|5": 1,
408
+ "harness|hendrycksTest-high_school_world_history|5": 1,
409
+ "harness|hendrycksTest-human_aging|5": 1,
410
+ "harness|hendrycksTest-human_sexuality|5": 1,
411
+ "harness|hendrycksTest-international_law|5": 1,
412
+ "harness|hendrycksTest-jurisprudence|5": 1,
413
+ "harness|hendrycksTest-logical_fallacies|5": 1,
414
+ "harness|hendrycksTest-machine_learning|5": 1,
415
+ "harness|hendrycksTest-management|5": 1,
416
+ "harness|hendrycksTest-marketing|5": 1,
417
+ "harness|hendrycksTest-medical_genetics|5": 1,
418
+ "harness|hendrycksTest-miscellaneous|5": 1,
419
+ "harness|hendrycksTest-moral_disputes|5": 1,
420
+ "harness|hendrycksTest-moral_scenarios|5": 1,
421
+ "harness|hendrycksTest-nutrition|5": 1,
422
+ "harness|hendrycksTest-philosophy|5": 1,
423
+ "harness|hendrycksTest-prehistory|5": 1,
424
+ "harness|hendrycksTest-professional_accounting|5": 1,
425
+ "harness|hendrycksTest-professional_law|5": 1,
426
+ "harness|hendrycksTest-professional_medicine|5": 1,
427
+ "harness|hendrycksTest-professional_psychology|5": 1,
428
+ "harness|hendrycksTest-public_relations|5": 1,
429
+ "harness|hendrycksTest-security_studies|5": 1,
430
+ "harness|hendrycksTest-sociology|5": 1,
431
+ "harness|hendrycksTest-us_foreign_policy|5": 1,
432
+ "harness|hendrycksTest-virology|5": 1,
433
+ "harness|hendrycksTest-world_religions|5": 1,
434
+ "harness|truthfulqa:mc|0": 1,
435
+ "all": 0
436
+ },
437
+ "config": {
438
+ "model_name": "meta-llama/Llama-2-13b-hf",
439
+ "model_sha": "7da18fb10421c3ae2a1eb92815bad75e84816e35",
440
+ "model_dtype": "torch.float16",
441
+ "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937",
442
+ "num_few_shot_default": 0,
443
+ "num_fewshot_seeds": 1,
444
+ "override_batch_size": 1,
445
+ "max_samples": null
446
+ },
447
+ "task_config": {
448
+ "harness|arc:challenge": "LM Harness task",
449
+ "harness|hellaswag": "LM Harness task",
450
+ "harness|hendrycksTest-abstract_algebra": "LM Harness task",
451
+ "harness|hendrycksTest-anatomy": "LM Harness task",
452
+ "harness|hendrycksTest-astronomy": "LM Harness task",
453
+ "harness|hendrycksTest-business_ethics": "LM Harness task",
454
+ "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
455
+ "harness|hendrycksTest-college_biology": "LM Harness task",
456
+ "harness|hendrycksTest-college_chemistry": "LM Harness task",
457
+ "harness|hendrycksTest-college_computer_science": "LM Harness task",
458
+ "harness|hendrycksTest-college_mathematics": "LM Harness task",
459
+ "harness|hendrycksTest-college_medicine": "LM Harness task",
460
+ "harness|hendrycksTest-college_physics": "LM Harness task",
461
+ "harness|hendrycksTest-computer_security": "LM Harness task",
462
+ "harness|hendrycksTest-conceptual_physics": "LM Harness task",
463
+ "harness|hendrycksTest-econometrics": "LM Harness task",
464
+ "harness|hendrycksTest-electrical_engineering": "LM Harness task",
465
+ "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
466
+ "harness|hendrycksTest-formal_logic": "LM Harness task",
467
+ "harness|hendrycksTest-global_facts": "LM Harness task",
468
+ "harness|hendrycksTest-high_school_biology": "LM Harness task",
469
+ "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
470
+ "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
471
+ "harness|hendrycksTest-high_school_european_history": "LM Harness task",
472
+ "harness|hendrycksTest-high_school_geography": "LM Harness task",
473
+ "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
474
+ "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
475
+ "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
476
+ "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
477
+ "harness|hendrycksTest-high_school_physics": "LM Harness task",
478
+ "harness|hendrycksTest-high_school_psychology": "LM Harness task",
479
+ "harness|hendrycksTest-high_school_statistics": "LM Harness task",
480
+ "harness|hendrycksTest-high_school_us_history": "LM Harness task",
481
+ "harness|hendrycksTest-high_school_world_history": "LM Harness task",
482
+ "harness|hendrycksTest-human_aging": "LM Harness task",
483
+ "harness|hendrycksTest-human_sexuality": "LM Harness task",
484
+ "harness|hendrycksTest-international_law": "LM Harness task",
485
+ "harness|hendrycksTest-jurisprudence": "LM Harness task",
486
+ "harness|hendrycksTest-logical_fallacies": "LM Harness task",
487
+ "harness|hendrycksTest-machine_learning": "LM Harness task",
488
+ "harness|hendrycksTest-management": "LM Harness task",
489
+ "harness|hendrycksTest-marketing": "LM Harness task",
490
+ "harness|hendrycksTest-medical_genetics": "LM Harness task",
491
+ "harness|hendrycksTest-miscellaneous": "LM Harness task",
492
+ "harness|hendrycksTest-moral_disputes": "LM Harness task",
493
+ "harness|hendrycksTest-moral_scenarios": "LM Harness task",
494
+ "harness|hendrycksTest-nutrition": "LM Harness task",
495
+ "harness|hendrycksTest-philosophy": "LM Harness task",
496
+ "harness|hendrycksTest-prehistory": "LM Harness task",
497
+ "harness|hendrycksTest-professional_accounting": "LM Harness task",
498
+ "harness|hendrycksTest-professional_law": "LM Harness task",
499
+ "harness|hendrycksTest-professional_medicine": "LM Harness task",
500
+ "harness|hendrycksTest-professional_psychology": "LM Harness task",
501
+ "harness|hendrycksTest-public_relations": "LM Harness task",
502
+ "harness|hendrycksTest-security_studies": "LM Harness task",
503
+ "harness|hendrycksTest-sociology": "LM Harness task",
504
+ "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
505
+ "harness|hendrycksTest-virology": "LM Harness task",
506
+ "harness|hendrycksTest-world_religions": "LM Harness task",
507
+ "harness|truthfulqa:mc": "LM Harness task"
508
+ },
509
+ "hashes": {
510
+ "harness|arc:challenge|25": {
511
+ "hash_examples": "fb8c51b1872daeda",
512
+ "hash_full_prompts": "045cbb916e5145c6",
513
+ "hash_input_tokens": "61571bf68d6d89aa",
514
+ "hash_cont_tokens": "8210decc6ff6f7df"
515
+ },
516
+ "harness|hellaswag|10": {
517
+ "hash_examples": "e1768ecb99d7ecf0",
518
+ "hash_full_prompts": "0b4c16983130f84f",
519
+ "hash_input_tokens": "29906669b1c7054a",
520
+ "hash_cont_tokens": "b3b9e9017afa63af"
521
+ },
522
+ "harness|hendrycksTest-abstract_algebra|5": {
523
+ "hash_examples": "280f9f325b40559a",
524
+ "hash_full_prompts": "2f776a367d23aea2",
525
+ "hash_input_tokens": "c54ff61ad0273dd7",
526
+ "hash_cont_tokens": "50421e30bef398f9"
527
+ },
528
+ "harness|hendrycksTest-anatomy|5": {
529
+ "hash_examples": "2f83a4f1cab4ba18",
530
+ "hash_full_prompts": "516f74bef25df620",
531
+ "hash_input_tokens": "be31a1e22aef5f90",
532
+ "hash_cont_tokens": "f11971a765cb609f"
533
+ },
534
+ "harness|hendrycksTest-astronomy|5": {
535
+ "hash_examples": "7d587b908da4d762",
536
+ "hash_full_prompts": "faf4e80f65de93ca",
537
+ "hash_input_tokens": "277a7b1fad566940",
538
+ "hash_cont_tokens": "bf30e5d3f48250cb"
539
+ },
540
+ "harness|hendrycksTest-business_ethics|5": {
541
+ "hash_examples": "33e51740670de686",
542
+ "hash_full_prompts": "db01c3ef8e1479d4",
543
+ "hash_input_tokens": "ba552605bc116de5",
544
+ "hash_cont_tokens": "bc1dd9b2d995eb61"
545
+ },
546
+ "harness|hendrycksTest-clinical_knowledge|5": {
547
+ "hash_examples": "f3366dbe7eefffa4",
548
+ "hash_full_prompts": "49654f71d94b65c3",
549
+ "hash_input_tokens": "428c7563d0b98ab9",
550
+ "hash_cont_tokens": "890a119624b3b935"
551
+ },
552
+ "harness|hendrycksTest-college_biology|5": {
553
+ "hash_examples": "ca2b6753a0193e7f",
554
+ "hash_full_prompts": "2b460b75f1fdfefd",
555
+ "hash_input_tokens": "da036601573942e2",
556
+ "hash_cont_tokens": "875cde3af7a0ee14"
557
+ },
558
+ "harness|hendrycksTest-college_chemistry|5": {
559
+ "hash_examples": "22ff85f1d34f42d1",
560
+ "hash_full_prompts": "242c9be6da583e95",
561
+ "hash_input_tokens": "94e0196d6aded13d",
562
+ "hash_cont_tokens": "50421e30bef398f9"
563
+ },
564
+ "harness|hendrycksTest-college_computer_science|5": {
565
+ "hash_examples": "30318289d717a5cf",
566
+ "hash_full_prompts": "ed2bdb4e87c4b371",
567
+ "hash_input_tokens": "6e4d0f4a8d36690b",
568
+ "hash_cont_tokens": "ffc0fe414cdc4a83"
569
+ },
570
+ "harness|hendrycksTest-college_mathematics|5": {
571
+ "hash_examples": "4944d1f0b6b5d911",
572
+ "hash_full_prompts": "770bc4281c973190",
573
+ "hash_input_tokens": "614054d17109a25d",
574
+ "hash_cont_tokens": "50421e30bef398f9"
575
+ },
576
+ "harness|hendrycksTest-college_medicine|5": {
577
+ "hash_examples": "dd69cc33381275af",
578
+ "hash_full_prompts": "ad2a53e5250ab46e",
579
+ "hash_input_tokens": "1d633b3cc0524ba8",
580
+ "hash_cont_tokens": "1f88b00d41957d82"
581
+ },
582
+ "harness|hendrycksTest-college_physics|5": {
583
+ "hash_examples": "875dd26d22655b0d",
584
+ "hash_full_prompts": "833a0d7b55aed500",
585
+ "hash_input_tokens": "5421d9a1af86cbd4",
586
+ "hash_cont_tokens": "f7b8097afc16a47c"
587
+ },
588
+ "harness|hendrycksTest-computer_security|5": {
589
+ "hash_examples": "006451eedc0ededb",
590
+ "hash_full_prompts": "94034c97e85d8f46",
591
+ "hash_input_tokens": "5e6b70ecb333cf18",
592
+ "hash_cont_tokens": "50421e30bef398f9"
593
+ },
594
+ "harness|hendrycksTest-conceptual_physics|5": {
595
+ "hash_examples": "8874ece872d2ca4c",
596
+ "hash_full_prompts": "e40d15a34640d6fa",
597
+ "hash_input_tokens": "c2ef11a87264ceed",
598
+ "hash_cont_tokens": "aa0e8bc655f2f641"
599
+ },
600
+ "harness|hendrycksTest-econometrics|5": {
601
+ "hash_examples": "64d3623b0bfaa43f",
602
+ "hash_full_prompts": "612f340fae41338d",
603
+ "hash_input_tokens": "ecaccd912a4c3978",
604
+ "hash_cont_tokens": "bfb7e3c3c88313f1"
605
+ },
606
+ "harness|hendrycksTest-electrical_engineering|5": {
607
+ "hash_examples": "e98f51780c674d7e",
608
+ "hash_full_prompts": "10275b312d812ae6",
609
+ "hash_input_tokens": "1590c84291399be8",
610
+ "hash_cont_tokens": "2425a3f084a591ef"
611
+ },
612
+ "harness|hendrycksTest-elementary_mathematics|5": {
613
+ "hash_examples": "fc48208a5ac1c0ce",
614
+ "hash_full_prompts": "5ec274c6c82aca23",
615
+ "hash_input_tokens": "3269597f715b0da1",
616
+ "hash_cont_tokens": "f52691aef15a407b"
617
+ },
618
+ "harness|hendrycksTest-formal_logic|5": {
619
+ "hash_examples": "5a6525665f63ea72",
620
+ "hash_full_prompts": "07b92638c4a6b500",
621
+ "hash_input_tokens": "a2800d20f3ab8d7c",
622
+ "hash_cont_tokens": "f515d598d9c21263"
623
+ },
624
+ "harness|hendrycksTest-global_facts|5": {
625
+ "hash_examples": "371d70d743b2b89b",
626
+ "hash_full_prompts": "332fdee50a1921b4",
627
+ "hash_input_tokens": "94ed44b3772505ad",
628
+ "hash_cont_tokens": "50421e30bef398f9"
629
+ },
630
+ "harness|hendrycksTest-high_school_biology|5": {
631
+ "hash_examples": "a79e1018b1674052",
632
+ "hash_full_prompts": "e624e26ede922561",
633
+ "hash_input_tokens": "24423acb928db768",
634
+ "hash_cont_tokens": "bd85a4156a3613ee"
635
+ },
636
+ "harness|hendrycksTest-high_school_chemistry|5": {
637
+ "hash_examples": "44bfc25c389f0e03",
638
+ "hash_full_prompts": "0e3e5f5d9246482a",
639
+ "hash_input_tokens": "831ff35c474e5cef",
640
+ "hash_cont_tokens": "a95c97af1c14e068"
641
+ },
642
+ "harness|hendrycksTest-high_school_computer_science|5": {
643
+ "hash_examples": "8b8cdb1084f24169",
644
+ "hash_full_prompts": "c00487e67c1813cc",
645
+ "hash_input_tokens": "8c34e0f2bda77358",
646
+ "hash_cont_tokens": "8abfedef914e33c9"
647
+ },
648
+ "harness|hendrycksTest-high_school_european_history|5": {
649
+ "hash_examples": "11cd32d0ef440171",
650
+ "hash_full_prompts": "318f4513c537c6bf",
651
+ "hash_input_tokens": "f1f73dd687da18d7",
652
+ "hash_cont_tokens": "674fc454bdc5ac93"
653
+ },
654
+ "harness|hendrycksTest-high_school_geography|5": {
655
+ "hash_examples": "b60019b9e80b642f",
656
+ "hash_full_prompts": "ee5789fcc1a81b1e",
657
+ "hash_input_tokens": "7c5547c7da5bc793",
658
+ "hash_cont_tokens": "03a5012b916274ea"
659
+ },
660
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
661
+ "hash_examples": "d221ec983d143dc3",
662
+ "hash_full_prompts": "ac42d888e1ce1155",
663
+ "hash_input_tokens": "f62991cb6a496b05",
664
+ "hash_cont_tokens": "a83effb8f76b7d7c"
665
+ },
666
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
667
+ "hash_examples": "59c2915cacfd3fbb",
668
+ "hash_full_prompts": "c6bd9d25158abd0e",
669
+ "hash_input_tokens": "4cef2aff6e3d59ed",
670
+ "hash_cont_tokens": "c583432ad27fcfe0"
671
+ },
672
+ "harness|hendrycksTest-high_school_mathematics|5": {
673
+ "hash_examples": "1f8ac897608de342",
674
+ "hash_full_prompts": "5d88f41fc2d643a8",
675
+ "hash_input_tokens": "6e2577ea4082ed2b",
676
+ "hash_cont_tokens": "24f5dc613660300b"
677
+ },
678
+ "harness|hendrycksTest-high_school_microeconomics|5": {
679
+ "hash_examples": "ead6a0f2f6c83370",
680
+ "hash_full_prompts": "bfc393381298609e",
681
+ "hash_input_tokens": "c5fc9aeb1079c8e4",
682
+ "hash_cont_tokens": "f47f041de50333b9"
683
+ },
684
+ "harness|hendrycksTest-high_school_physics|5": {
685
+ "hash_examples": "c3f2025990afec64",
686
+ "hash_full_prompts": "fc78b4997e436734",
687
+ "hash_input_tokens": "555fc385cffa84ca",
688
+ "hash_cont_tokens": "ba2efcd283e938cc"
689
+ },
690
+ "harness|hendrycksTest-high_school_psychology|5": {
691
+ "hash_examples": "21f8aab618f6d636",
692
+ "hash_full_prompts": "d5c76aa40b9dbc43",
693
+ "hash_input_tokens": "febd23cbf9973b7f",
694
+ "hash_cont_tokens": "942069cd363844d9"
695
+ },
696
+ "harness|hendrycksTest-high_school_statistics|5": {
697
+ "hash_examples": "2386a60a11fc5de3",
698
+ "hash_full_prompts": "4c5c8be5aafac432",
699
+ "hash_input_tokens": "424b02981230ee83",
700
+ "hash_cont_tokens": "955ed42b6f7fa019"
701
+ },
702
+ "harness|hendrycksTest-high_school_us_history|5": {
703
+ "hash_examples": "74961543be40f04f",
704
+ "hash_full_prompts": "5d5ca4840131ba21",
705
+ "hash_input_tokens": "50c9ff438c85a69e",
706
+ "hash_cont_tokens": "cdd0b3dc06d933e5"
707
+ },
708
+ "harness|hendrycksTest-high_school_world_history|5": {
709
+ "hash_examples": "2ad2f6b7198b2234",
710
+ "hash_full_prompts": "11845057459afd72",
711
+ "hash_input_tokens": "054824cc474caef5",
712
+ "hash_cont_tokens": "9a864184946033ac"
713
+ },
714
+ "harness|hendrycksTest-human_aging|5": {
715
+ "hash_examples": "1a7199dc733e779b",
716
+ "hash_full_prompts": "756b9096b8eaf892",
717
+ "hash_input_tokens": "541a75f071dcf579",
718
+ "hash_cont_tokens": "142a4a8a1138a214"
719
+ },
720
+ "harness|hendrycksTest-human_sexuality|5": {
721
+ "hash_examples": "7acb8fdad97f88a6",
722
+ "hash_full_prompts": "731a52ff15b8cfdb",
723
+ "hash_input_tokens": "04269e5c5a257dd9",
724
+ "hash_cont_tokens": "bc54813e809b796d"
725
+ },
726
+ "harness|hendrycksTest-international_law|5": {
727
+ "hash_examples": "1300bfd0dfc59114",
728
+ "hash_full_prompts": "db2aefbff5eec996",
729
+ "hash_input_tokens": "d93ba9d9d38e4397",
730
+ "hash_cont_tokens": "dc45b45fcda18e5d"
731
+ },
732
+ "harness|hendrycksTest-jurisprudence|5": {
733
+ "hash_examples": "083b1e4904c48dc2",
734
+ "hash_full_prompts": "0f89ee3fe03d6a21",
735
+ "hash_input_tokens": "9eeaccd2698b4f5a",
736
+ "hash_cont_tokens": "e3a8cd951b6e3469"
737
+ },
738
+ "harness|hendrycksTest-logical_fallacies|5": {
739
+ "hash_examples": "709128f9926a634c",
740
+ "hash_full_prompts": "98a04b1f8f841069",
741
+ "hash_input_tokens": "b4f08f544f2b7576",
742
+ "hash_cont_tokens": "1e80dbd30f6453d5"
743
+ },
744
+ "harness|hendrycksTest-machine_learning|5": {
745
+ "hash_examples": "88f22a636029ae47",
746
+ "hash_full_prompts": "2e1c8d4b1e0cc921",
747
+ "hash_input_tokens": "900c2a51f1174b9f",
748
+ "hash_cont_tokens": "9b37da7777378ca9"
749
+ },
750
+ "harness|hendrycksTest-management|5": {
751
+ "hash_examples": "8c8a1e07a2151dca",
752
+ "hash_full_prompts": "f51611f514b265b0",
753
+ "hash_input_tokens": "6b36efb4689c6eca",
754
+ "hash_cont_tokens": "a01d6d39a83c4597"
755
+ },
756
+ "harness|hendrycksTest-marketing|5": {
757
+ "hash_examples": "2668953431f91e96",
758
+ "hash_full_prompts": "77562bef997c7650",
759
+ "hash_input_tokens": "2aaac78a0cfed47a",
760
+ "hash_cont_tokens": "6aeaed4d823c98aa"
761
+ },
762
+ "harness|hendrycksTest-medical_genetics|5": {
763
+ "hash_examples": "9c2dda34a2ea4fd2",
764
+ "hash_full_prompts": "202139046daa118f",
765
+ "hash_input_tokens": "886ca823b41c094a",
766
+ "hash_cont_tokens": "50421e30bef398f9"
767
+ },
768
+ "harness|hendrycksTest-miscellaneous|5": {
769
+ "hash_examples": "41adb694024809c2",
770
+ "hash_full_prompts": "bffec9fc237bcf93",
771
+ "hash_input_tokens": "72fd71de7675e7d0",
772
+ "hash_cont_tokens": "9b0ab02a64603081"
773
+ },
774
+ "harness|hendrycksTest-moral_disputes|5": {
775
+ "hash_examples": "3171c13ba3c594c4",
776
+ "hash_full_prompts": "170831fc36f1d59e",
777
+ "hash_input_tokens": "f3ca0dd8e7a1eb09",
778
+ "hash_cont_tokens": "8badf768f7b0467a"
779
+ },
780
+ "harness|hendrycksTest-moral_scenarios|5": {
781
+ "hash_examples": "9873e077e83e0546",
782
+ "hash_full_prompts": "08f4ceba3131a068",
783
+ "hash_input_tokens": "3e793631e951f23c",
784
+ "hash_cont_tokens": "32ae620376b2bbba"
785
+ },
786
+ "harness|hendrycksTest-nutrition|5": {
787
+ "hash_examples": "7db1d8142ec14323",
788
+ "hash_full_prompts": "4c0e68e3586cb453",
789
+ "hash_input_tokens": "59753c2144ea93af",
790
+ "hash_cont_tokens": "3071def75bacc404"
791
+ },
792
+ "harness|hendrycksTest-philosophy|5": {
793
+ "hash_examples": "9b455b7d72811cc8",
794
+ "hash_full_prompts": "e467f822d8a0d3ff",
795
+ "hash_input_tokens": "bd8d3dbed15a8c34",
796
+ "hash_cont_tokens": "9f6ff69d23a48783"
797
+ },
798
+ "harness|hendrycksTest-prehistory|5": {
799
+ "hash_examples": "8be90d0f538f1560",
800
+ "hash_full_prompts": "152187949bcd0921",
801
+ "hash_input_tokens": "3573cd87facbb7c5",
802
+ "hash_cont_tokens": "de469d2b981e32a3"
803
+ },
804
+ "harness|hendrycksTest-professional_accounting|5": {
805
+ "hash_examples": "8d377597916cd07e",
806
+ "hash_full_prompts": "0eb7345d6144ee0d",
807
+ "hash_input_tokens": "17e721bc1a7cbb47",
808
+ "hash_cont_tokens": "c46f74d2dfc7b13b"
809
+ },
810
+ "harness|hendrycksTest-professional_law|5": {
811
+ "hash_examples": "cd9dbc52b3c932d6",
812
+ "hash_full_prompts": "36ac764272bfb182",
813
+ "hash_input_tokens": "9178e10bd0763ec4",
814
+ "hash_cont_tokens": "2e590029ef41fbcd"
815
+ },
816
+ "harness|hendrycksTest-professional_medicine|5": {
817
+ "hash_examples": "b20e4e816c1e383e",
818
+ "hash_full_prompts": "7b8d69ea2acaf2f7",
819
+ "hash_input_tokens": "f5a22012a54f70ea",
820
+ "hash_cont_tokens": "fe35cfa9c6ca802e"
821
+ },
822
+ "harness|hendrycksTest-professional_psychology|5": {
823
+ "hash_examples": "d45b73b22f9cc039",
824
+ "hash_full_prompts": "fe8937e9ffc99771",
825
+ "hash_input_tokens": "0dfb73a8eb3f692c",
826
+ "hash_cont_tokens": "f020fbddf72c8652"
827
+ },
828
+ "harness|hendrycksTest-public_relations|5": {
829
+ "hash_examples": "0d25072e1761652a",
830
+ "hash_full_prompts": "f9adc39cfa9f42ba",
831
+ "hash_input_tokens": "1710c6ba4c9f3cbd",
832
+ "hash_cont_tokens": "568f585a259965c1"
833
+ },
834
+ "harness|hendrycksTest-security_studies|5": {
835
+ "hash_examples": "62bb8197e63d60d4",
836
+ "hash_full_prompts": "869c9c3ae196b7c3",
837
+ "hash_input_tokens": "d49711415961ced7",
838
+ "hash_cont_tokens": "cc6fd7cccd64cd5d"
839
+ },
840
+ "harness|hendrycksTest-sociology|5": {
841
+ "hash_examples": "e7959df87dea8672",
842
+ "hash_full_prompts": "1a1fc00e17b3a52a",
843
+ "hash_input_tokens": "828999f7624cbe7e",
844
+ "hash_cont_tokens": "c3a3bdfd177eed5b"
845
+ },
846
+ "harness|hendrycksTest-us_foreign_policy|5": {
847
+ "hash_examples": "4a56a01ddca44dca",
848
+ "hash_full_prompts": "0c7a7081c71c07b6",
849
+ "hash_input_tokens": "42054621e718dbee",
850
+ "hash_cont_tokens": "2568d0e8e36fa959"
851
+ },
852
+ "harness|hendrycksTest-virology|5": {
853
+ "hash_examples": "451cc86a8c4f4fe9",
854
+ "hash_full_prompts": "01e95325d8b738e4",
855
+ "hash_input_tokens": "6c4f0aa4dc859c04",
856
+ "hash_cont_tokens": "926cf60b0891f374"
857
+ },
858
+ "harness|hendrycksTest-world_religions|5": {
859
+ "hash_examples": "3b29cfaf1a81c379",
860
+ "hash_full_prompts": "e0d79a15083dfdff",
861
+ "hash_input_tokens": "6c75d44e092ff24f",
862
+ "hash_cont_tokens": "c525a5de974c1ea3"
863
+ },
864
+ "harness|truthfulqa:mc|0": {
865
+ "hash_examples": "23176c0531c7b867",
866
+ "hash_full_prompts": "36a6d90e75d92d4a",
867
+ "hash_input_tokens": "2738d7ed7075faa7",
868
+ "hash_cont_tokens": "c014154380b74b9e"
869
+ }
870
+ }
871
+ }
meta-llama/Llama-2-13b-hf/results_2023-08-29T22-26-02.660247.json ADDED
@@ -0,0 +1,1366 @@
1
+ {
2
+ "config_general": {
3
+ "model_name": "meta-llama/Llama-2-13b-hf",
4
+ "model_sha": "db6b8eb1feabb38985fdf785a89895959e944936",
5
+ "model_dtype": "4bit",
6
+ "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63",
7
+ "num_few_shot_default": 0,
8
+ "num_fewshot_seeds": 1,
9
+ "override_batch_size": 1,
10
+ "max_samples": null,
11
+ "job_id": ""
12
+ },
13
+ "results": {
14
+ "harness|arc:challenge|25": {
15
+ "acc": 0.5366894197952219,
16
+ "acc_stderr": 0.01457200052775699,
17
+ "acc_norm": 0.5810580204778157,
18
+ "acc_norm_stderr": 0.014418106953639013
19
+ },
20
+ "harness|hellaswag|10": {
21
+ "acc": 0.6059549890460068,
22
+ "acc_stderr": 0.004876459434619801,
23
+ "acc_norm": 0.809699263095001,
24
+ "acc_norm_stderr": 0.003917361254101999
25
+ },
26
+ "harness|hendrycksTest-abstract_algebra|5": {
27
+ "acc": 0.33,
28
+ "acc_stderr": 0.04725815626252605,
29
+ "acc_norm": 0.33,
30
+ "acc_norm_stderr": 0.04725815626252605
31
+ },
32
+ "harness|hendrycksTest-anatomy|5": {
33
+ "acc": 0.5111111111111111,
34
+ "acc_stderr": 0.04318275491977976,
35
+ "acc_norm": 0.5111111111111111,
36
+ "acc_norm_stderr": 0.04318275491977976
37
+ },
38
+ "harness|hendrycksTest-astronomy|5": {
39
+ "acc": 0.5131578947368421,
40
+ "acc_stderr": 0.04067533136309174,
41
+ "acc_norm": 0.5131578947368421,
42
+ "acc_norm_stderr": 0.04067533136309174
43
+ },
44
+ "harness|hendrycksTest-business_ethics|5": {
45
+ "acc": 0.54,
46
+ "acc_stderr": 0.05009082659620332,
47
+ "acc_norm": 0.54,
48
+ "acc_norm_stderr": 0.05009082659620332
49
+ },
50
+ "harness|hendrycksTest-clinical_knowledge|5": {
51
+ "acc": 0.6075471698113207,
52
+ "acc_stderr": 0.03005258057955785,
53
+ "acc_norm": 0.6075471698113207,
54
+ "acc_norm_stderr": 0.03005258057955785
55
+ },
56
+ "harness|hendrycksTest-college_biology|5": {
57
+ "acc": 0.5555555555555556,
58
+ "acc_stderr": 0.041553199555931467,
59
+ "acc_norm": 0.5555555555555556,
60
+ "acc_norm_stderr": 0.041553199555931467
61
+ },
62
+ "harness|hendrycksTest-college_chemistry|5": {
63
+ "acc": 0.43,
64
+ "acc_stderr": 0.04975698519562428,
65
+ "acc_norm": 0.43,
66
+ "acc_norm_stderr": 0.04975698519562428
67
+ },
68
+ "harness|hendrycksTest-college_computer_science|5": {
69
+ "acc": 0.44,
70
+ "acc_stderr": 0.04988876515698589,
71
+ "acc_norm": 0.44,
72
+ "acc_norm_stderr": 0.04988876515698589
73
+ },
74
+ "harness|hendrycksTest-college_mathematics|5": {
75
+ "acc": 0.34,
76
+ "acc_stderr": 0.04760952285695235,
77
+ "acc_norm": 0.34,
78
+ "acc_norm_stderr": 0.04760952285695235
79
+ },
80
+ "harness|hendrycksTest-college_medicine|5": {
81
+ "acc": 0.5144508670520231,
82
+ "acc_stderr": 0.03810871630454764,
83
+ "acc_norm": 0.5144508670520231,
84
+ "acc_norm_stderr": 0.03810871630454764
85
+ },
86
+ "harness|hendrycksTest-college_physics|5": {
87
+ "acc": 0.21568627450980393,
88
+ "acc_stderr": 0.040925639582376536,
89
+ "acc_norm": 0.21568627450980393,
90
+ "acc_norm_stderr": 0.040925639582376536
91
+ },
92
+ "harness|hendrycksTest-computer_security|5": {
93
+ "acc": 0.68,
94
+ "acc_stderr": 0.04688261722621504,
95
+ "acc_norm": 0.68,
96
+ "acc_norm_stderr": 0.04688261722621504
97
+ },
98
+ "harness|hendrycksTest-conceptual_physics|5": {
99
+ "acc": 0.4553191489361702,
100
+ "acc_stderr": 0.03255525359340354,
101
+ "acc_norm": 0.4553191489361702,
102
+ "acc_norm_stderr": 0.03255525359340354
103
+ },
104
+ "harness|hendrycksTest-econometrics|5": {
105
+ "acc": 0.2894736842105263,
106
+ "acc_stderr": 0.04266339443159394,
107
+ "acc_norm": 0.2894736842105263,
108
+ "acc_norm_stderr": 0.04266339443159394
109
+ },
110
+ "harness|hendrycksTest-electrical_engineering|5": {
111
+ "acc": 0.496551724137931,
112
+ "acc_stderr": 0.041665675771015785,
113
+ "acc_norm": 0.496551724137931,
114
+ "acc_norm_stderr": 0.041665675771015785
115
+ },
116
+ "harness|hendrycksTest-elementary_mathematics|5": {
117
+ "acc": 0.3439153439153439,
118
+ "acc_stderr": 0.024464426625596433,
119
+ "acc_norm": 0.3439153439153439,
120
+ "acc_norm_stderr": 0.024464426625596433
121
+ },
122
+ "harness|hendrycksTest-formal_logic|5": {
123
+ "acc": 0.3333333333333333,
124
+ "acc_stderr": 0.04216370213557835,
125
+ "acc_norm": 0.3333333333333333,
126
+ "acc_norm_stderr": 0.04216370213557835
127
+ },
128
+ "harness|hendrycksTest-global_facts|5": {
129
+ "acc": 0.32,
130
+ "acc_stderr": 0.046882617226215034,
131
+ "acc_norm": 0.32,
132
+ "acc_norm_stderr": 0.046882617226215034
133
+ },
134
+ "harness|hendrycksTest-high_school_biology|5": {
135
+ "acc": 0.6580645161290323,
136
+ "acc_stderr": 0.02698528957655274,
137
+ "acc_norm": 0.6580645161290323,
138
+ "acc_norm_stderr": 0.02698528957655274
139
+ },
140
+ "harness|hendrycksTest-high_school_chemistry|5": {
141
+ "acc": 0.458128078817734,
142
+ "acc_stderr": 0.03505630140785741,
143
+ "acc_norm": 0.458128078817734,
144
+ "acc_norm_stderr": 0.03505630140785741
145
+ },
146
+ "harness|hendrycksTest-high_school_computer_science|5": {
147
+ "acc": 0.55,
148
+ "acc_stderr": 0.05,
149
+ "acc_norm": 0.55,
150
+ "acc_norm_stderr": 0.05
151
+ },
152
+ "harness|hendrycksTest-high_school_european_history|5": {
153
+ "acc": 0.6484848484848484,
154
+ "acc_stderr": 0.037282069986826503,
155
+ "acc_norm": 0.6484848484848484,
156
+ "acc_norm_stderr": 0.037282069986826503
157
+ },
158
+ "harness|hendrycksTest-high_school_geography|5": {
159
+ "acc": 0.6868686868686869,
160
+ "acc_stderr": 0.033042050878136525,
161
+ "acc_norm": 0.6868686868686869,
162
+ "acc_norm_stderr": 0.033042050878136525
163
+ },
164
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
165
+ "acc": 0.7668393782383419,
166
+ "acc_stderr": 0.03051611137147601,
167
+ "acc_norm": 0.7668393782383419,
168
+ "acc_norm_stderr": 0.03051611137147601
169
+ },
170
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
171
+ "acc": 0.47692307692307695,
172
+ "acc_stderr": 0.025323990861736118,
173
+ "acc_norm": 0.47692307692307695,
174
+ "acc_norm_stderr": 0.025323990861736118
175
+ },
176
+ "harness|hendrycksTest-high_school_mathematics|5": {
177
+ "acc": 0.2814814814814815,
178
+ "acc_stderr": 0.027420019350945277,
179
+ "acc_norm": 0.2814814814814815,
180
+ "acc_norm_stderr": 0.027420019350945277
181
+ },
182
+ "harness|hendrycksTest-high_school_microeconomics|5": {
183
+ "acc": 0.5546218487394958,
184
+ "acc_stderr": 0.032284106267163895,
185
+ "acc_norm": 0.5546218487394958,
186
+ "acc_norm_stderr": 0.032284106267163895
187
+ },
188
+ "harness|hendrycksTest-high_school_physics|5": {
189
+ "acc": 0.39072847682119205,
190
+ "acc_stderr": 0.039837983066598075,
191
+ "acc_norm": 0.39072847682119205,
192
+ "acc_norm_stderr": 0.039837983066598075
193
+ },
194
+ "harness|hendrycksTest-high_school_psychology|5": {
195
+ "acc": 0.7339449541284404,
196
+ "acc_stderr": 0.018946022322225597,
197
+ "acc_norm": 0.7339449541284404,
198
+ "acc_norm_stderr": 0.018946022322225597
199
+ },
200
+ "harness|hendrycksTest-high_school_statistics|5": {
201
+ "acc": 0.4675925925925926,
202
+ "acc_stderr": 0.034028015813589656,
203
+ "acc_norm": 0.4675925925925926,
204
+ "acc_norm_stderr": 0.034028015813589656
205
+ },
206
+ "harness|hendrycksTest-high_school_us_history|5": {
207
+ "acc": 0.6911764705882353,
208
+ "acc_stderr": 0.03242661719827218,
209
+ "acc_norm": 0.6911764705882353,
210
+ "acc_norm_stderr": 0.03242661719827218
211
+ },
212
+ "harness|hendrycksTest-high_school_world_history|5": {
213
+ "acc": 0.7172995780590717,
214
+ "acc_stderr": 0.029312814153955924,
215
+ "acc_norm": 0.7172995780590717,
216
+ "acc_norm_stderr": 0.029312814153955924
217
+ },
218
+ "harness|hendrycksTest-human_aging|5": {
219
+ "acc": 0.6636771300448431,
220
+ "acc_stderr": 0.031708824268455,
221
+ "acc_norm": 0.6636771300448431,
222
+ "acc_norm_stderr": 0.031708824268455
223
+ },
224
+ "harness|hendrycksTest-human_sexuality|5": {
225
+ "acc": 0.6030534351145038,
226
+ "acc_stderr": 0.04291135671009224,
227
+ "acc_norm": 0.6030534351145038,
228
+ "acc_norm_stderr": 0.04291135671009224
229
+ },
230
+ "harness|hendrycksTest-international_law|5": {
231
+ "acc": 0.7024793388429752,
232
+ "acc_stderr": 0.04173349148083499,
233
+ "acc_norm": 0.7024793388429752,
234
+ "acc_norm_stderr": 0.04173349148083499
235
+ },
236
+ "harness|hendrycksTest-jurisprudence|5": {
237
+ "acc": 0.7037037037037037,
238
+ "acc_stderr": 0.044143436668549335,
239
+ "acc_norm": 0.7037037037037037,
240
+ "acc_norm_stderr": 0.044143436668549335
241
+ },
242
+ "harness|hendrycksTest-logical_fallacies|5": {
243
+ "acc": 0.6871165644171779,
244
+ "acc_stderr": 0.03642914578292406,
245
+ "acc_norm": 0.6871165644171779,
246
+ "acc_norm_stderr": 0.03642914578292406
247
+ },
248
+ "harness|hendrycksTest-machine_learning|5": {
249
+ "acc": 0.26785714285714285,
250
+ "acc_stderr": 0.04203277291467762,
251
+ "acc_norm": 0.26785714285714285,
252
+ "acc_norm_stderr": 0.04203277291467762
253
+ },
254
+ "harness|hendrycksTest-management|5": {
255
+ "acc": 0.7572815533980582,
256
+ "acc_stderr": 0.04245022486384495,
257
+ "acc_norm": 0.7572815533980582,
258
+ "acc_norm_stderr": 0.04245022486384495
259
+ },
260
+ "harness|hendrycksTest-marketing|5": {
261
+ "acc": 0.7991452991452992,
262
+ "acc_stderr": 0.026246772946890477,
263
+ "acc_norm": 0.7991452991452992,
264
+ "acc_norm_stderr": 0.026246772946890477
265
+ },
266
+ "harness|hendrycksTest-medical_genetics|5": {
267
+ "acc": 0.55,
268
+ "acc_stderr": 0.049999999999999996,
269
+ "acc_norm": 0.55,
270
+ "acc_norm_stderr": 0.049999999999999996
271
+ },
272
+ "harness|hendrycksTest-miscellaneous|5": {
273
+ "acc": 0.7266922094508301,
274
+ "acc_stderr": 0.015936681062628556,
275
+ "acc_norm": 0.7266922094508301,
276
+ "acc_norm_stderr": 0.015936681062628556
277
+ },
278
+ "harness|hendrycksTest-moral_disputes|5": {
279
+ "acc": 0.6358381502890174,
280
+ "acc_stderr": 0.025906632631016124,
281
+ "acc_norm": 0.6358381502890174,
282
+ "acc_norm_stderr": 0.025906632631016124
283
+ },
284
+ "harness|hendrycksTest-moral_scenarios|5": {
285
+ "acc": 0.2759776536312849,
286
+ "acc_stderr": 0.014950103002475356,
287
+ "acc_norm": 0.2759776536312849,
288
+ "acc_norm_stderr": 0.014950103002475356
289
+ },
290
+ "harness|hendrycksTest-nutrition|5": {
291
+ "acc": 0.5947712418300654,
292
+ "acc_stderr": 0.028110928492809075,
293
+ "acc_norm": 0.5947712418300654,
294
+ "acc_norm_stderr": 0.028110928492809075
295
+ },
296
+ "harness|hendrycksTest-philosophy|5": {
297
+ "acc": 0.6495176848874598,
298
+ "acc_stderr": 0.027098652621301754,
299
+ "acc_norm": 0.6495176848874598,
300
+ "acc_norm_stderr": 0.027098652621301754
301
+ },
302
+ "harness|hendrycksTest-prehistory|5": {
303
+ "acc": 0.6080246913580247,
304
+ "acc_stderr": 0.027163686038271146,
305
+ "acc_norm": 0.6080246913580247,
306
+ "acc_norm_stderr": 0.027163686038271146
307
+ },
308
+ "harness|hendrycksTest-professional_accounting|5": {
309
+ "acc": 0.3971631205673759,
310
+ "acc_stderr": 0.02918980567358711,
311
+ "acc_norm": 0.3971631205673759,
312
+ "acc_norm_stderr": 0.02918980567358711
313
+ },
314
+ "harness|hendrycksTest-professional_law|5": {
315
+ "acc": 0.41590612777053454,
316
+ "acc_stderr": 0.012588323850313611,
317
+ "acc_norm": 0.41590612777053454,
318
+ "acc_norm_stderr": 0.012588323850313611
319
+ },
320
+ "harness|hendrycksTest-professional_medicine|5": {
321
+ "acc": 0.5147058823529411,
322
+ "acc_stderr": 0.03035969707904611,
323
+ "acc_norm": 0.5147058823529411,
324
+ "acc_norm_stderr": 0.03035969707904611
325
+ },
326
+ "harness|hendrycksTest-professional_psychology|5": {
327
+ "acc": 0.5424836601307189,
328
+ "acc_stderr": 0.020154685712590888,
329
+ "acc_norm": 0.5424836601307189,
330
+ "acc_norm_stderr": 0.020154685712590888
331
+ },
332
+ "harness|hendrycksTest-public_relations|5": {
333
+ "acc": 0.5727272727272728,
334
+ "acc_stderr": 0.04738198703545483,
335
+ "acc_norm": 0.5727272727272728,
336
+ "acc_norm_stderr": 0.04738198703545483
337
+ },
338
+ "harness|hendrycksTest-security_studies|5": {
339
+ "acc": 0.6122448979591837,
340
+ "acc_stderr": 0.031192230726795656,
341
+ "acc_norm": 0.6122448979591837,
342
+ "acc_norm_stderr": 0.031192230726795656
343
+ },
344
+ "harness|hendrycksTest-sociology|5": {
345
+ "acc": 0.7114427860696517,
346
+ "acc_stderr": 0.03203841040213322,
347
+ "acc_norm": 0.7114427860696517,
348
+ "acc_norm_stderr": 0.03203841040213322
349
+ },
350
+ "harness|hendrycksTest-us_foreign_policy|5": {
351
+ "acc": 0.83,
352
+ "acc_stderr": 0.0377525168068637,
353
+ "acc_norm": 0.83,
354
+ "acc_norm_stderr": 0.0377525168068637
355
+ },
356
+ "harness|hendrycksTest-virology|5": {
357
+ "acc": 0.41566265060240964,
358
+ "acc_stderr": 0.038367221765980515,
359
+ "acc_norm": 0.41566265060240964,
360
+ "acc_norm_stderr": 0.038367221765980515
361
+ },
362
+ "harness|hendrycksTest-world_religions|5": {
363
+ "acc": 0.7368421052631579,
364
+ "acc_stderr": 0.03377310252209204,
365
+ "acc_norm": 0.7368421052631579,
366
+ "acc_norm_stderr": 0.03377310252209204
367
+ },
368
+ "harness|truthfulqa:mc|0": {
369
+ "mc1": 0.2386780905752754,
370
+ "mc1_stderr": 0.014922629695456421,
371
+ "mc2": 0.34172402963708387,
372
+ "mc2_stderr": 0.01332205356000871
373
+ },
374
+ "all": {
375
+ "acc": 0.5443256746853737,
376
+ "acc_stderr": 0.034540859468822654,
377
+ "acc_norm": 0.5485309776469262,
378
+ "acc_norm_stderr": 0.03452199520179493,
379
+ "mc1": 0.2386780905752754,
380
+ "mc1_stderr": 0.014922629695456421,
381
+ "mc2": 0.34172402963708387,
382
+ "mc2_stderr": 0.01332205356000871
383
+ }
384
+ },
385
+ "versions": {
386
+ "harness|arc:challenge|25": 0,
387
+ "harness|hellaswag|10": 0,
388
+ "harness|hendrycksTest-abstract_algebra|5": 1,
389
+ "harness|hendrycksTest-anatomy|5": 1,
390
+ "harness|hendrycksTest-astronomy|5": 1,
391
+ "harness|hendrycksTest-business_ethics|5": 1,
392
+ "harness|hendrycksTest-clinical_knowledge|5": 1,
393
+ "harness|hendrycksTest-college_biology|5": 1,
394
+ "harness|hendrycksTest-college_chemistry|5": 1,
395
+ "harness|hendrycksTest-college_computer_science|5": 1,
396
+ "harness|hendrycksTest-college_mathematics|5": 1,
397
+ "harness|hendrycksTest-college_medicine|5": 1,
398
+ "harness|hendrycksTest-college_physics|5": 1,
399
+ "harness|hendrycksTest-computer_security|5": 1,
400
+ "harness|hendrycksTest-conceptual_physics|5": 1,
401
+ "harness|hendrycksTest-econometrics|5": 1,
402
+ "harness|hendrycksTest-electrical_engineering|5": 1,
403
+ "harness|hendrycksTest-elementary_mathematics|5": 1,
404
+ "harness|hendrycksTest-formal_logic|5": 1,
405
+ "harness|hendrycksTest-global_facts|5": 1,
406
+ "harness|hendrycksTest-high_school_biology|5": 1,
407
+ "harness|hendrycksTest-high_school_chemistry|5": 1,
408
+ "harness|hendrycksTest-high_school_computer_science|5": 1,
409
+ "harness|hendrycksTest-high_school_european_history|5": 1,
410
+ "harness|hendrycksTest-high_school_geography|5": 1,
411
+ "harness|hendrycksTest-high_school_government_and_politics|5": 1,
412
+ "harness|hendrycksTest-high_school_macroeconomics|5": 1,
413
+ "harness|hendrycksTest-high_school_mathematics|5": 1,
414
+ "harness|hendrycksTest-high_school_microeconomics|5": 1,
415
+ "harness|hendrycksTest-high_school_physics|5": 1,
416
+ "harness|hendrycksTest-high_school_psychology|5": 1,
417
+ "harness|hendrycksTest-high_school_statistics|5": 1,
418
+ "harness|hendrycksTest-high_school_us_history|5": 1,
419
+ "harness|hendrycksTest-high_school_world_history|5": 1,
420
+ "harness|hendrycksTest-human_aging|5": 1,
421
+ "harness|hendrycksTest-human_sexuality|5": 1,
422
+ "harness|hendrycksTest-international_law|5": 1,
423
+ "harness|hendrycksTest-jurisprudence|5": 1,
424
+ "harness|hendrycksTest-logical_fallacies|5": 1,
425
+ "harness|hendrycksTest-machine_learning|5": 1,
426
+ "harness|hendrycksTest-management|5": 1,
427
+ "harness|hendrycksTest-marketing|5": 1,
428
+ "harness|hendrycksTest-medical_genetics|5": 1,
429
+ "harness|hendrycksTest-miscellaneous|5": 1,
430
+ "harness|hendrycksTest-moral_disputes|5": 1,
431
+ "harness|hendrycksTest-moral_scenarios|5": 1,
432
+ "harness|hendrycksTest-nutrition|5": 1,
433
+ "harness|hendrycksTest-philosophy|5": 1,
434
+ "harness|hendrycksTest-prehistory|5": 1,
435
+ "harness|hendrycksTest-professional_accounting|5": 1,
436
+ "harness|hendrycksTest-professional_law|5": 1,
437
+ "harness|hendrycksTest-professional_medicine|5": 1,
438
+ "harness|hendrycksTest-professional_psychology|5": 1,
439
+ "harness|hendrycksTest-public_relations|5": 1,
440
+ "harness|hendrycksTest-security_studies|5": 1,
441
+ "harness|hendrycksTest-sociology|5": 1,
442
+ "harness|hendrycksTest-us_foreign_policy|5": 1,
443
+ "harness|hendrycksTest-virology|5": 1,
444
+ "harness|hendrycksTest-world_religions|5": 1,
445
+ "harness|truthfulqa:mc|0": 1,
446
+ "all": 0
447
+ },
448
+ "config_tasks": {
449
+ "harness|arc:challenge": "LM Harness task",
450
+ "harness|hellaswag": "LM Harness task",
451
+ "harness|hendrycksTest-abstract_algebra": "LM Harness task",
452
+ "harness|hendrycksTest-anatomy": "LM Harness task",
453
+ "harness|hendrycksTest-astronomy": "LM Harness task",
454
+ "harness|hendrycksTest-business_ethics": "LM Harness task",
455
+ "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
456
+ "harness|hendrycksTest-college_biology": "LM Harness task",
457
+ "harness|hendrycksTest-college_chemistry": "LM Harness task",
458
+ "harness|hendrycksTest-college_computer_science": "LM Harness task",
459
+ "harness|hendrycksTest-college_mathematics": "LM Harness task",
460
+ "harness|hendrycksTest-college_medicine": "LM Harness task",
461
+ "harness|hendrycksTest-college_physics": "LM Harness task",
462
+ "harness|hendrycksTest-computer_security": "LM Harness task",
463
+ "harness|hendrycksTest-conceptual_physics": "LM Harness task",
464
+ "harness|hendrycksTest-econometrics": "LM Harness task",
465
+ "harness|hendrycksTest-electrical_engineering": "LM Harness task",
466
+ "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
467
+ "harness|hendrycksTest-formal_logic": "LM Harness task",
468
+ "harness|hendrycksTest-global_facts": "LM Harness task",
469
+ "harness|hendrycksTest-high_school_biology": "LM Harness task",
470
+ "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
471
+ "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
472
+ "harness|hendrycksTest-high_school_european_history": "LM Harness task",
473
+ "harness|hendrycksTest-high_school_geography": "LM Harness task",
474
+ "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
475
+ "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
476
+ "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
477
+ "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
478
+ "harness|hendrycksTest-high_school_physics": "LM Harness task",
479
+ "harness|hendrycksTest-high_school_psychology": "LM Harness task",
480
+ "harness|hendrycksTest-high_school_statistics": "LM Harness task",
481
+ "harness|hendrycksTest-high_school_us_history": "LM Harness task",
482
+ "harness|hendrycksTest-high_school_world_history": "LM Harness task",
483
+ "harness|hendrycksTest-human_aging": "LM Harness task",
484
+ "harness|hendrycksTest-human_sexuality": "LM Harness task",
485
+ "harness|hendrycksTest-international_law": "LM Harness task",
486
+ "harness|hendrycksTest-jurisprudence": "LM Harness task",
487
+ "harness|hendrycksTest-logical_fallacies": "LM Harness task",
488
+ "harness|hendrycksTest-machine_learning": "LM Harness task",
489
+ "harness|hendrycksTest-management": "LM Harness task",
490
+ "harness|hendrycksTest-marketing": "LM Harness task",
491
+ "harness|hendrycksTest-medical_genetics": "LM Harness task",
492
+ "harness|hendrycksTest-miscellaneous": "LM Harness task",
493
+ "harness|hendrycksTest-moral_disputes": "LM Harness task",
494
+ "harness|hendrycksTest-moral_scenarios": "LM Harness task",
495
+ "harness|hendrycksTest-nutrition": "LM Harness task",
496
+ "harness|hendrycksTest-philosophy": "LM Harness task",
497
+ "harness|hendrycksTest-prehistory": "LM Harness task",
498
+ "harness|hendrycksTest-professional_accounting": "LM Harness task",
499
+ "harness|hendrycksTest-professional_law": "LM Harness task",
500
+ "harness|hendrycksTest-professional_medicine": "LM Harness task",
501
+ "harness|hendrycksTest-professional_psychology": "LM Harness task",
502
+ "harness|hendrycksTest-public_relations": "LM Harness task",
503
+ "harness|hendrycksTest-security_studies": "LM Harness task",
504
+ "harness|hendrycksTest-sociology": "LM Harness task",
505
+ "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
506
+ "harness|hendrycksTest-virology": "LM Harness task",
507
+ "harness|hendrycksTest-world_religions": "LM Harness task",
508
+ "harness|truthfulqa:mc": "LM Harness task"
509
+ },
510
+ "summary_tasks": {
511
+ "harness|arc:challenge|25": {
512
+ "hashes": {
513
+ "hash_examples": "17b0cae357c0259e",
514
+ "hash_full_prompts": "045cbb916e5145c6",
515
+ "hash_input_tokens": "3722289b79076c44",
516
+ "hash_cont_tokens": "8210decc6ff6f7df"
517
+ },
518
+ "truncated": 0,
519
+ "non-truncated": 4687,
520
+ "padded": 4687,
521
+ "non-padded": 0,
522
+ "effective_few_shots": 25.0,
523
+ "num_truncated_few_shots": 0
524
+ },
525
+ "harness|hellaswag|10": {
526
+ "hashes": {
527
+ "hash_examples": "e1768ecb99d7ecf0",
528
+ "hash_full_prompts": "0b4c16983130f84f",
529
+ "hash_input_tokens": "ececd684171f1ef2",
530
+ "hash_cont_tokens": "b3b9e9017afa63af"
531
+ },
532
+ "truncated": 0,
533
+ "non-truncated": 40168,
534
+ "padded": 40113,
535
+ "non-padded": 55,
536
+ "effective_few_shots": 10.0,
537
+ "num_truncated_few_shots": 0
538
+ },
539
+ "harness|hendrycksTest-abstract_algebra|5": {
540
+ "hashes": {
541
+ "hash_examples": "280f9f325b40559a",
542
+ "hash_full_prompts": "2f776a367d23aea2",
543
+ "hash_input_tokens": "c54ff61ad0273dd7",
544
+ "hash_cont_tokens": "50421e30bef398f9"
545
+ },
546
+ "truncated": 0,
547
+ "non-truncated": 400,
548
+ "padded": 400,
549
+ "non-padded": 0,
550
+ "effective_few_shots": 5.0,
551
+ "num_truncated_few_shots": 0
552
+ },
553
+ "harness|hendrycksTest-anatomy|5": {
554
+ "hashes": {
555
+ "hash_examples": "2f83a4f1cab4ba18",
556
+ "hash_full_prompts": "516f74bef25df620",
557
+ "hash_input_tokens": "be31a1e22aef5f90",
558
+ "hash_cont_tokens": "f11971a765cb609f"
559
+ },
560
+ "truncated": 0,
561
+ "non-truncated": 540,
562
+ "padded": 540,
563
+ "non-padded": 0,
564
+ "effective_few_shots": 5.0,
565
+ "num_truncated_few_shots": 0
566
+ },
567
+ "harness|hendrycksTest-astronomy|5": {
568
+ "hashes": {
569
+ "hash_examples": "7d587b908da4d762",
570
+ "hash_full_prompts": "faf4e80f65de93ca",
571
+ "hash_input_tokens": "277a7b1fad566940",
572
+ "hash_cont_tokens": "bf30e5d3f48250cb"
573
+ },
574
+ "truncated": 0,
575
+ "non-truncated": 608,
576
+ "padded": 608,
577
+ "non-padded": 0,
578
+ "effective_few_shots": 5.0,
579
+ "num_truncated_few_shots": 0
580
+ },
581
+ "harness|hendrycksTest-business_ethics|5": {
582
+ "hashes": {
583
+ "hash_examples": "33e51740670de686",
584
+ "hash_full_prompts": "db01c3ef8e1479d4",
585
+ "hash_input_tokens": "ba552605bc116de5",
586
+ "hash_cont_tokens": "bc1dd9b2d995eb61"
587
+ },
588
+ "truncated": 0,
589
+ "non-truncated": 400,
590
+ "padded": 400,
591
+ "non-padded": 0,
592
+ "effective_few_shots": 5.0,
593
+ "num_truncated_few_shots": 0
594
+ },
595
+ "harness|hendrycksTest-clinical_knowledge|5": {
596
+ "hashes": {
597
+ "hash_examples": "f3366dbe7eefffa4",
598
+ "hash_full_prompts": "49654f71d94b65c3",
599
+ "hash_input_tokens": "428c7563d0b98ab9",
600
+ "hash_cont_tokens": "890a119624b3b935"
601
+ },
602
+ "truncated": 0,
603
+ "non-truncated": 1060,
604
+ "padded": 1060,
605
+ "non-padded": 0,
606
+ "effective_few_shots": 5.0,
607
+ "num_truncated_few_shots": 0
608
+ },
609
+ "harness|hendrycksTest-college_biology|5": {
610
+ "hashes": {
611
+ "hash_examples": "ca2b6753a0193e7f",
612
+ "hash_full_prompts": "2b460b75f1fdfefd",
613
+ "hash_input_tokens": "da036601573942e2",
614
+ "hash_cont_tokens": "875cde3af7a0ee14"
615
+ },
616
+ "truncated": 0,
617
+ "non-truncated": 576,
618
+ "padded": 576,
619
+ "non-padded": 0,
620
+ "effective_few_shots": 5.0,
621
+ "num_truncated_few_shots": 0
622
+ },
623
+ "harness|hendrycksTest-college_chemistry|5": {
624
+ "hashes": {
625
+ "hash_examples": "22ff85f1d34f42d1",
626
+ "hash_full_prompts": "242c9be6da583e95",
627
+ "hash_input_tokens": "94e0196d6aded13d",
628
+ "hash_cont_tokens": "50421e30bef398f9"
629
+ },
630
+ "truncated": 0,
631
+ "non-truncated": 400,
632
+ "padded": 400,
633
+ "non-padded": 0,
634
+ "effective_few_shots": 5.0,
635
+ "num_truncated_few_shots": 0
636
+ },
637
+ "harness|hendrycksTest-college_computer_science|5": {
638
+ "hashes": {
639
+ "hash_examples": "30318289d717a5cf",
640
+ "hash_full_prompts": "ed2bdb4e87c4b371",
641
+ "hash_input_tokens": "6e4d0f4a8d36690b",
642
+ "hash_cont_tokens": "ffc0fe414cdc4a83"
643
+ },
644
+ "truncated": 0,
645
+ "non-truncated": 400,
646
+ "padded": 400,
647
+ "non-padded": 0,
648
+ "effective_few_shots": 5.0,
649
+ "num_truncated_few_shots": 0
650
+ },
651
+ "harness|hendrycksTest-college_mathematics|5": {
652
+ "hashes": {
653
+ "hash_examples": "4944d1f0b6b5d911",
654
+ "hash_full_prompts": "770bc4281c973190",
655
+ "hash_input_tokens": "614054d17109a25d",
656
+ "hash_cont_tokens": "50421e30bef398f9"
657
+ },
658
+ "truncated": 0,
659
+ "non-truncated": 400,
660
+ "padded": 400,
661
+ "non-padded": 0,
662
+ "effective_few_shots": 5.0,
663
+ "num_truncated_few_shots": 0
664
+ },
665
+ "harness|hendrycksTest-college_medicine|5": {
666
+ "hashes": {
667
+ "hash_examples": "dd69cc33381275af",
668
+ "hash_full_prompts": "ad2a53e5250ab46e",
669
+ "hash_input_tokens": "081bb2b524defd1c",
670
+ "hash_cont_tokens": "1f88b00d41957d82"
671
+ },
672
+ "truncated": 0,
673
+ "non-truncated": 692,
674
+ "padded": 692,
675
+ "non-padded": 0,
676
+ "effective_few_shots": 5.0,
677
+ "num_truncated_few_shots": 0
678
+ },
679
+ "harness|hendrycksTest-college_physics|5": {
680
+ "hashes": {
681
+ "hash_examples": "875dd26d22655b0d",
682
+ "hash_full_prompts": "833a0d7b55aed500",
683
+ "hash_input_tokens": "5421d9a1af86cbd4",
684
+ "hash_cont_tokens": "f7b8097afc16a47c"
685
+ },
686
+ "truncated": 0,
687
+ "non-truncated": 408,
688
+ "padded": 408,
689
+ "non-padded": 0,
690
+ "effective_few_shots": 5.0,
691
+ "num_truncated_few_shots": 0
692
+ },
693
+ "harness|hendrycksTest-computer_security|5": {
694
+ "hashes": {
695
+ "hash_examples": "006451eedc0ededb",
696
+ "hash_full_prompts": "94034c97e85d8f46",
697
+ "hash_input_tokens": "5e6b70ecb333cf18",
698
+ "hash_cont_tokens": "50421e30bef398f9"
699
+ },
700
+ "truncated": 0,
701
+ "non-truncated": 400,
702
+ "padded": 400,
703
+ "non-padded": 0,
704
+ "effective_few_shots": 5.0,
705
+ "num_truncated_few_shots": 0
706
+ },
707
+ "harness|hendrycksTest-conceptual_physics|5": {
708
+ "hashes": {
709
+ "hash_examples": "8874ece872d2ca4c",
710
+ "hash_full_prompts": "e40d15a34640d6fa",
711
+ "hash_input_tokens": "c2ef11a87264ceed",
712
+ "hash_cont_tokens": "aa0e8bc655f2f641"
713
+ },
714
+ "truncated": 0,
715
+ "non-truncated": 940,
716
+ "padded": 940,
717
+ "non-padded": 0,
718
+ "effective_few_shots": 5.0,
719
+ "num_truncated_few_shots": 0
720
+ },
721
+ "harness|hendrycksTest-econometrics|5": {
722
+ "hashes": {
723
+ "hash_examples": "64d3623b0bfaa43f",
724
+ "hash_full_prompts": "612f340fae41338d",
725
+ "hash_input_tokens": "ecaccd912a4c3978",
726
+ "hash_cont_tokens": "bfb7e3c3c88313f1"
727
+ },
728
+ "truncated": 0,
729
+ "non-truncated": 456,
730
+ "padded": 456,
731
+ "non-padded": 0,
732
+ "effective_few_shots": 5.0,
733
+ "num_truncated_few_shots": 0
734
+ },
735
+ "harness|hendrycksTest-electrical_engineering|5": {
736
+ "hashes": {
737
+ "hash_examples": "e98f51780c674d7e",
738
+ "hash_full_prompts": "10275b312d812ae6",
739
+ "hash_input_tokens": "1590c84291399be8",
740
+ "hash_cont_tokens": "2425a3f084a591ef"
741
+ },
742
+ "truncated": 0,
743
+ "non-truncated": 580,
744
+ "padded": 580,
745
+ "non-padded": 0,
746
+ "effective_few_shots": 5.0,
747
+ "num_truncated_few_shots": 0
748
+ },
749
+ "harness|hendrycksTest-elementary_mathematics|5": {
750
+ "hashes": {
751
+ "hash_examples": "fc48208a5ac1c0ce",
752
+ "hash_full_prompts": "5ec274c6c82aca23",
753
+ "hash_input_tokens": "3269597f715b0da1",
754
+ "hash_cont_tokens": "f52691aef15a407b"
755
+ },
756
+ "truncated": 0,
757
+ "non-truncated": 1512,
758
+ "padded": 1512,
759
+ "non-padded": 0,
760
+ "effective_few_shots": 5.0,
761
+ "num_truncated_few_shots": 0
762
+ },
763
+ "harness|hendrycksTest-formal_logic|5": {
764
+ "hashes": {
765
+ "hash_examples": "5a6525665f63ea72",
766
+ "hash_full_prompts": "07b92638c4a6b500",
767
+ "hash_input_tokens": "a2800d20f3ab8d7c",
768
+ "hash_cont_tokens": "f515d598d9c21263"
769
+ },
770
+ "truncated": 0,
771
+ "non-truncated": 504,
772
+ "padded": 504,
773
+ "non-padded": 0,
774
+ "effective_few_shots": 5.0,
775
+ "num_truncated_few_shots": 0
776
+ },
777
+ "harness|hendrycksTest-global_facts|5": {
778
+ "hashes": {
779
+ "hash_examples": "371d70d743b2b89b",
780
+ "hash_full_prompts": "332fdee50a1921b4",
781
+ "hash_input_tokens": "94ed44b3772505ad",
782
+ "hash_cont_tokens": "50421e30bef398f9"
783
+ },
784
+ "truncated": 0,
785
+ "non-truncated": 400,
786
+ "padded": 400,
787
+ "non-padded": 0,
788
+ "effective_few_shots": 5.0,
789
+ "num_truncated_few_shots": 0
790
+ },
791
+ "harness|hendrycksTest-high_school_biology|5": {
792
+ "hashes": {
793
+ "hash_examples": "a79e1018b1674052",
794
+ "hash_full_prompts": "e624e26ede922561",
795
+ "hash_input_tokens": "24423acb928db768",
796
+ "hash_cont_tokens": "bd85a4156a3613ee"
797
+ },
798
+ "truncated": 0,
799
+ "non-truncated": 1240,
800
+ "padded": 1240,
801
+ "non-padded": 0,
802
+ "effective_few_shots": 5.0,
803
+ "num_truncated_few_shots": 0
804
+ },
805
+ "harness|hendrycksTest-high_school_chemistry|5": {
806
+ "hashes": {
807
+ "hash_examples": "44bfc25c389f0e03",
808
+ "hash_full_prompts": "0e3e5f5d9246482a",
809
+ "hash_input_tokens": "831ff35c474e5cef",
810
+ "hash_cont_tokens": "a95c97af1c14e068"
811
+ },
812
+ "truncated": 0,
813
+ "non-truncated": 812,
814
+ "padded": 812,
815
+ "non-padded": 0,
816
+ "effective_few_shots": 5.0,
817
+ "num_truncated_few_shots": 0
818
+ },
819
+ "harness|hendrycksTest-high_school_computer_science|5": {
820
+ "hashes": {
821
+ "hash_examples": "8b8cdb1084f24169",
822
+ "hash_full_prompts": "c00487e67c1813cc",
823
+ "hash_input_tokens": "a20a96b44dcc5b30",
824
+ "hash_cont_tokens": "8abfedef914e33c9"
825
+ },
826
+ "truncated": 0,
827
+ "non-truncated": 400,
828
+ "padded": 400,
829
+ "non-padded": 0,
830
+ "effective_few_shots": 5.0,
831
+ "num_truncated_few_shots": 0
832
+ },
833
+ "harness|hendrycksTest-high_school_european_history|5": {
834
+ "hashes": {
835
+ "hash_examples": "11cd32d0ef440171",
836
+ "hash_full_prompts": "318f4513c537c6bf",
837
+ "hash_input_tokens": "5002f4ac8b1562ca",
838
+ "hash_cont_tokens": "674fc454bdc5ac93"
839
+ },
840
+ "truncated": 0,
841
+ "non-truncated": 660,
842
+ "padded": 656,
843
+ "non-padded": 4,
844
+ "effective_few_shots": 5.0,
845
+ "num_truncated_few_shots": 0
846
+ },
847
+ "harness|hendrycksTest-high_school_geography|5": {
848
+ "hashes": {
849
+ "hash_examples": "b60019b9e80b642f",
850
+ "hash_full_prompts": "ee5789fcc1a81b1e",
851
+ "hash_input_tokens": "7c5547c7da5bc793",
852
+ "hash_cont_tokens": "03a5012b916274ea"
853
+ },
854
+ "truncated": 0,
855
+ "non-truncated": 792,
856
+ "padded": 792,
857
+ "non-padded": 0,
858
+ "effective_few_shots": 5.0,
859
+ "num_truncated_few_shots": 0
860
+ },
861
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
862
+ "hashes": {
863
+ "hash_examples": "d221ec983d143dc3",
864
+ "hash_full_prompts": "ac42d888e1ce1155",
865
+ "hash_input_tokens": "f62991cb6a496b05",
866
+ "hash_cont_tokens": "a83effb8f76b7d7c"
867
+ },
868
+ "truncated": 0,
869
+ "non-truncated": 772,
870
+ "padded": 772,
871
+ "non-padded": 0,
872
+ "effective_few_shots": 5.0,
873
+ "num_truncated_few_shots": 0
874
+ },
875
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
876
+ "hashes": {
877
+ "hash_examples": "59c2915cacfd3fbb",
878
+ "hash_full_prompts": "c6bd9d25158abd0e",
879
+ "hash_input_tokens": "4cef2aff6e3d59ed",
880
+ "hash_cont_tokens": "c583432ad27fcfe0"
881
+ },
882
+ "truncated": 0,
883
+ "non-truncated": 1560,
884
+ "padded": 1560,
885
+ "non-padded": 0,
886
+ "effective_few_shots": 5.0,
887
+ "num_truncated_few_shots": 0
888
+ },
889
+ "harness|hendrycksTest-high_school_mathematics|5": {
890
+ "hashes": {
891
+ "hash_examples": "1f8ac897608de342",
892
+ "hash_full_prompts": "5d88f41fc2d643a8",
893
+ "hash_input_tokens": "6e2577ea4082ed2b",
894
+ "hash_cont_tokens": "24f5dc613660300b"
895
+ },
896
+ "truncated": 0,
897
+ "non-truncated": 1080,
898
+ "padded": 1080,
899
+ "non-padded": 0,
900
+ "effective_few_shots": 5.0,
901
+ "num_truncated_few_shots": 0
902
+ },
903
+ "harness|hendrycksTest-high_school_microeconomics|5": {
904
+ "hashes": {
905
+ "hash_examples": "ead6a0f2f6c83370",
906
+ "hash_full_prompts": "bfc393381298609e",
907
+ "hash_input_tokens": "c5fc9aeb1079c8e4",
908
+ "hash_cont_tokens": "f47f041de50333b9"
909
+ },
910
+ "truncated": 0,
911
+ "non-truncated": 952,
912
+ "padded": 952,
913
+ "non-padded": 0,
914
+ "effective_few_shots": 5.0,
915
+ "num_truncated_few_shots": 0
916
+ },
917
+ "harness|hendrycksTest-high_school_physics|5": {
918
+ "hashes": {
919
+ "hash_examples": "c3f2025990afec64",
920
+ "hash_full_prompts": "fc78b4997e436734",
921
+ "hash_input_tokens": "555fc385cffa84ca",
922
+ "hash_cont_tokens": "ba2efcd283e938cc"
923
+ },
924
+ "truncated": 0,
925
+ "non-truncated": 604,
926
+ "padded": 604,
927
+ "non-padded": 0,
928
+ "effective_few_shots": 5.0,
929
+ "num_truncated_few_shots": 0
930
+ },
931
+ "harness|hendrycksTest-high_school_psychology|5": {
932
+ "hashes": {
933
+ "hash_examples": "21f8aab618f6d636",
934
+ "hash_full_prompts": "d5c76aa40b9dbc43",
935
+ "hash_input_tokens": "febd23cbf9973b7f",
936
+ "hash_cont_tokens": "942069cd363844d9"
937
+ },
938
+ "truncated": 0,
939
+ "non-truncated": 2180,
940
+ "padded": 2180,
941
+ "non-padded": 0,
942
+ "effective_few_shots": 5.0,
943
+ "num_truncated_few_shots": 0
944
+ },
945
+ "harness|hendrycksTest-high_school_statistics|5": {
946
+ "hashes": {
947
+ "hash_examples": "2386a60a11fc5de3",
948
+ "hash_full_prompts": "4c5c8be5aafac432",
949
+ "hash_input_tokens": "400e55b56ee6fbd7",
950
+ "hash_cont_tokens": "955ed42b6f7fa019"
951
+ },
952
+ "truncated": 0,
953
+ "non-truncated": 864,
954
+ "padded": 864,
955
+ "non-padded": 0,
956
+ "effective_few_shots": 5.0,
957
+ "num_truncated_few_shots": 0
958
+ },
959
+ "harness|hendrycksTest-high_school_us_history|5": {
960
+ "hashes": {
961
+ "hash_examples": "74961543be40f04f",
962
+ "hash_full_prompts": "5d5ca4840131ba21",
963
+ "hash_input_tokens": "c639cce12a46ebad",
964
+ "hash_cont_tokens": "cdd0b3dc06d933e5"
965
+ },
966
+ "truncated": 0,
967
+ "non-truncated": 816,
968
+ "padded": 816,
969
+ "non-padded": 0,
970
+ "effective_few_shots": 5.0,
971
+ "num_truncated_few_shots": 0
972
+ },
973
+ "harness|hendrycksTest-high_school_world_history|5": {
974
+ "hashes": {
975
+ "hash_examples": "2ad2f6b7198b2234",
976
+ "hash_full_prompts": "11845057459afd72",
977
+ "hash_input_tokens": "b9762065cce6f3a6",
978
+ "hash_cont_tokens": "9a864184946033ac"
979
+ },
980
+ "truncated": 0,
981
+ "non-truncated": 948,
982
+ "padded": 948,
983
+ "non-padded": 0,
984
+ "effective_few_shots": 5.0,
985
+ "num_truncated_few_shots": 0
986
+ },
987
+ "harness|hendrycksTest-human_aging|5": {
988
+ "hashes": {
989
+ "hash_examples": "1a7199dc733e779b",
990
+ "hash_full_prompts": "756b9096b8eaf892",
991
+ "hash_input_tokens": "541a75f071dcf579",
992
+ "hash_cont_tokens": "142a4a8a1138a214"
993
+ },
994
+ "truncated": 0,
995
+ "non-truncated": 892,
996
+ "padded": 892,
997
+ "non-padded": 0,
998
+ "effective_few_shots": 5.0,
999
+ "num_truncated_few_shots": 0
1000
+ },
1001
+ "harness|hendrycksTest-human_sexuality|5": {
1002
+ "hashes": {
1003
+ "hash_examples": "7acb8fdad97f88a6",
1004
+ "hash_full_prompts": "731a52ff15b8cfdb",
1005
+ "hash_input_tokens": "04269e5c5a257dd9",
1006
+ "hash_cont_tokens": "bc54813e809b796d"
1007
+ },
1008
+ "truncated": 0,
1009
+ "non-truncated": 524,
1010
+ "padded": 524,
1011
+ "non-padded": 0,
1012
+ "effective_few_shots": 5.0,
1013
+ "num_truncated_few_shots": 0
1014
+ },
1015
+ "harness|hendrycksTest-international_law|5": {
1016
+ "hashes": {
1017
+ "hash_examples": "1300bfd0dfc59114",
1018
+ "hash_full_prompts": "db2aefbff5eec996",
1019
+ "hash_input_tokens": "d93ba9d9d38e4397",
1020
+ "hash_cont_tokens": "dc45b45fcda18e5d"
1021
+ },
1022
+ "truncated": 0,
1023
+ "non-truncated": 484,
1024
+ "padded": 484,
1025
+ "non-padded": 0,
1026
+ "effective_few_shots": 5.0,
1027
+ "num_truncated_few_shots": 0
1028
+ },
1029
+ "harness|hendrycksTest-jurisprudence|5": {
1030
+ "hashes": {
1031
+ "hash_examples": "083b1e4904c48dc2",
1032
+ "hash_full_prompts": "0f89ee3fe03d6a21",
1033
+ "hash_input_tokens": "9eeaccd2698b4f5a",
1034
+ "hash_cont_tokens": "e3a8cd951b6e3469"
1035
+ },
1036
+ "truncated": 0,
1037
+ "non-truncated": 432,
1038
+ "padded": 432,
1039
+ "non-padded": 0,
1040
+ "effective_few_shots": 5.0,
1041
+ "num_truncated_few_shots": 0
1042
+ },
1043
+ "harness|hendrycksTest-logical_fallacies|5": {
1044
+ "hashes": {
1045
+ "hash_examples": "709128f9926a634c",
1046
+ "hash_full_prompts": "98a04b1f8f841069",
1047
+ "hash_input_tokens": "b4f08f544f2b7576",
1048
+ "hash_cont_tokens": "1e80dbd30f6453d5"
1049
+ },
1050
+ "truncated": 0,
1051
+ "non-truncated": 652,
1052
+ "padded": 648,
1053
+ "non-padded": 4,
1054
+ "effective_few_shots": 5.0,
1055
+ "num_truncated_few_shots": 0
1056
+ },
1057
+ "harness|hendrycksTest-machine_learning|5": {
1058
+ "hashes": {
1059
+ "hash_examples": "88f22a636029ae47",
1060
+ "hash_full_prompts": "2e1c8d4b1e0cc921",
1061
+ "hash_input_tokens": "900c2a51f1174b9f",
1062
+ "hash_cont_tokens": "9b37da7777378ca9"
1063
+ },
1064
+ "truncated": 0,
1065
+ "non-truncated": 448,
1066
+ "padded": 448,
1067
+ "non-padded": 0,
1068
+ "effective_few_shots": 5.0,
1069
+ "num_truncated_few_shots": 0
1070
+ },
1071
+ "harness|hendrycksTest-management|5": {
1072
+ "hashes": {
1073
+ "hash_examples": "8c8a1e07a2151dca",
1074
+ "hash_full_prompts": "f51611f514b265b0",
1075
+ "hash_input_tokens": "6b36efb4689c6eca",
1076
+ "hash_cont_tokens": "a01d6d39a83c4597"
1077
+ },
1078
+ "truncated": 0,
1079
+ "non-truncated": 412,
1080
+ "padded": 412,
1081
+ "non-padded": 0,
1082
+ "effective_few_shots": 5.0,
1083
+ "num_truncated_few_shots": 0
1084
+ },
1085
+ "harness|hendrycksTest-marketing|5": {
1086
+ "hashes": {
1087
+ "hash_examples": "2668953431f91e96",
1088
+ "hash_full_prompts": "77562bef997c7650",
1089
+ "hash_input_tokens": "2aaac78a0cfed47a",
1090
+ "hash_cont_tokens": "6aeaed4d823c98aa"
1091
+ },
1092
+ "truncated": 0,
1093
+ "non-truncated": 936,
1094
+ "padded": 936,
1095
+ "non-padded": 0,
1096
+ "effective_few_shots": 5.0,
1097
+ "num_truncated_few_shots": 0
1098
+ },
1099
+ "harness|hendrycksTest-medical_genetics|5": {
1100
+ "hashes": {
1101
+ "hash_examples": "9c2dda34a2ea4fd2",
1102
+ "hash_full_prompts": "202139046daa118f",
1103
+ "hash_input_tokens": "886ca823b41c094a",
1104
+ "hash_cont_tokens": "50421e30bef398f9"
1105
+ },
1106
+ "truncated": 0,
1107
+ "non-truncated": 400,
1108
+ "padded": 400,
1109
+ "non-padded": 0,
1110
+ "effective_few_shots": 5.0,
1111
+ "num_truncated_few_shots": 0
1112
+ },
1113
+ "harness|hendrycksTest-miscellaneous|5": {
1114
+ "hashes": {
1115
+ "hash_examples": "41adb694024809c2",
1116
+ "hash_full_prompts": "bffec9fc237bcf93",
1117
+ "hash_input_tokens": "72fd71de7675e7d0",
1118
+ "hash_cont_tokens": "9b0ab02a64603081"
1119
+ },
1120
+ "truncated": 0,
1121
+ "non-truncated": 3132,
1122
+ "padded": 3132,
1123
+ "non-padded": 0,
1124
+ "effective_few_shots": 5.0,
1125
+ "num_truncated_few_shots": 0
1126
+ },
1127
+ "harness|hendrycksTest-moral_disputes|5": {
1128
+ "hashes": {
1129
+ "hash_examples": "3171c13ba3c594c4",
1130
+ "hash_full_prompts": "170831fc36f1d59e",
1131
+ "hash_input_tokens": "f3ca0dd8e7a1eb09",
1132
+ "hash_cont_tokens": "8badf768f7b0467a"
1133
+ },
1134
+ "truncated": 0,
1135
+ "non-truncated": 1384,
1136
+ "padded": 1354,
1137
+ "non-padded": 30,
1138
+ "effective_few_shots": 5.0,
1139
+ "num_truncated_few_shots": 0
1140
+ },
1141
+ "harness|hendrycksTest-moral_scenarios|5": {
1142
+ "hashes": {
1143
+ "hash_examples": "9873e077e83e0546",
1144
+ "hash_full_prompts": "08f4ceba3131a068",
1145
+ "hash_input_tokens": "3e793631e951f23c",
1146
+ "hash_cont_tokens": "32ae620376b2bbba"
1147
+ },
1148
+ "truncated": 0,
1149
+ "non-truncated": 3580,
1150
+ "padded": 3580,
1151
+ "non-padded": 0,
1152
+ "effective_few_shots": 5.0,
1153
+ "num_truncated_few_shots": 0
1154
+ },
1155
+ "harness|hendrycksTest-nutrition|5": {
1156
+ "hashes": {
1157
+ "hash_examples": "7db1d8142ec14323",
1158
+ "hash_full_prompts": "4c0e68e3586cb453",
1159
+ "hash_input_tokens": "59753c2144ea93af",
1160
+ "hash_cont_tokens": "3071def75bacc404"
1161
+ },
1162
+ "truncated": 0,
1163
+ "non-truncated": 1224,
1164
+ "padded": 1224,
1165
+ "non-padded": 0,
1166
+ "effective_few_shots": 5.0,
1167
+ "num_truncated_few_shots": 0
1168
+ },
1169
+ "harness|hendrycksTest-philosophy|5": {
1170
+ "hashes": {
1171
+ "hash_examples": "9b455b7d72811cc8",
1172
+ "hash_full_prompts": "e467f822d8a0d3ff",
1173
+ "hash_input_tokens": "bd8d3dbed15a8c34",
1174
+ "hash_cont_tokens": "9f6ff69d23a48783"
1175
+ },
1176
+ "truncated": 0,
1177
+ "non-truncated": 1244,
1178
+ "padded": 1244,
1179
+ "non-padded": 0,
1180
+ "effective_few_shots": 5.0,
1181
+ "num_truncated_few_shots": 0
1182
+ },
1183
+ "harness|hendrycksTest-prehistory|5": {
1184
+ "hashes": {
1185
+ "hash_examples": "8be90d0f538f1560",
1186
+ "hash_full_prompts": "152187949bcd0921",
1187
+ "hash_input_tokens": "3573cd87facbb7c5",
1188
+ "hash_cont_tokens": "de469d2b981e32a3"
1189
+ },
1190
+ "truncated": 0,
1191
+ "non-truncated": 1296,
1192
+ "padded": 1296,
1193
+ "non-padded": 0,
1194
+ "effective_few_shots": 5.0,
1195
+ "num_truncated_few_shots": 0
1196
+ },
1197
+ "harness|hendrycksTest-professional_accounting|5": {
1198
+ "hashes": {
1199
+ "hash_examples": "8d377597916cd07e",
1200
+ "hash_full_prompts": "0eb7345d6144ee0d",
1201
+ "hash_input_tokens": "17e721bc1a7cbb47",
1202
+ "hash_cont_tokens": "c46f74d2dfc7b13b"
1203
+ },
1204
+ "truncated": 0,
1205
+ "non-truncated": 1128,
1206
+ "padded": 1128,
1207
+ "non-padded": 0,
1208
+ "effective_few_shots": 5.0,
1209
+ "num_truncated_few_shots": 0
1210
+ },
1211
+ "harness|hendrycksTest-professional_law|5": {
1212
+ "hashes": {
1213
+ "hash_examples": "cd9dbc52b3c932d6",
1214
+ "hash_full_prompts": "36ac764272bfb182",
1215
+ "hash_input_tokens": "c9f7583fff66d361",
1216
+ "hash_cont_tokens": "2e590029ef41fbcd"
1217
+ },
1218
+ "truncated": 0,
1219
+ "non-truncated": 6136,
1220
+ "padded": 6136,
1221
+ "non-padded": 0,
1222
+ "effective_few_shots": 5.0,
1223
+ "num_truncated_few_shots": 0
1224
+ },
1225
+ "harness|hendrycksTest-professional_medicine|5": {
1226
+ "hashes": {
1227
+ "hash_examples": "b20e4e816c1e383e",
1228
+ "hash_full_prompts": "7b8d69ea2acaf2f7",
1229
+ "hash_input_tokens": "40a933f829116f8d",
1230
+ "hash_cont_tokens": "fe35cfa9c6ca802e"
1231
+ },
1232
+ "truncated": 0,
1233
+ "non-truncated": 1088,
1234
+ "padded": 1088,
1235
+ "non-padded": 0,
1236
+ "effective_few_shots": 5.0,
1237
+ "num_truncated_few_shots": 0
1238
+ },
1239
+ "harness|hendrycksTest-professional_psychology|5": {
1240
+ "hashes": {
1241
+ "hash_examples": "d45b73b22f9cc039",
1242
+ "hash_full_prompts": "fe8937e9ffc99771",
1243
+ "hash_input_tokens": "0dfb73a8eb3f692c",
1244
+ "hash_cont_tokens": "f020fbddf72c8652"
1245
+ },
1246
+ "truncated": 0,
1247
+ "non-truncated": 2448,
1248
+ "padded": 2448,
1249
+ "non-padded": 0,
1250
+ "effective_few_shots": 5.0,
1251
+ "num_truncated_few_shots": 0
1252
+ },
1253
+ "harness|hendrycksTest-public_relations|5": {
1254
+ "hashes": {
1255
+ "hash_examples": "0d25072e1761652a",
1256
+ "hash_full_prompts": "f9adc39cfa9f42ba",
1257
+ "hash_input_tokens": "1710c6ba4c9f3cbd",
1258
+ "hash_cont_tokens": "568f585a259965c1"
1259
+ },
1260
+ "truncated": 0,
1261
+ "non-truncated": 440,
1262
+ "padded": 440,
1263
+ "non-padded": 0,
1264
+ "effective_few_shots": 5.0,
1265
+ "num_truncated_few_shots": 0
1266
+ },
1267
+ "harness|hendrycksTest-security_studies|5": {
1268
+ "hashes": {
1269
+ "hash_examples": "62bb8197e63d60d4",
1270
+ "hash_full_prompts": "869c9c3ae196b7c3",
1271
+ "hash_input_tokens": "32a03f1f22a6e103",
1272
+ "hash_cont_tokens": "cc6fd7cccd64cd5d"
1273
+ },
1274
+ "truncated": 0,
1275
+ "non-truncated": 980,
1276
+ "padded": 980,
1277
+ "non-padded": 0,
1278
+ "effective_few_shots": 5.0,
1279
+ "num_truncated_few_shots": 0
1280
+ },
1281
+ "harness|hendrycksTest-sociology|5": {
1282
+ "hashes": {
1283
+ "hash_examples": "e7959df87dea8672",
1284
+ "hash_full_prompts": "1a1fc00e17b3a52a",
1285
+ "hash_input_tokens": "828999f7624cbe7e",
1286
+ "hash_cont_tokens": "c3a3bdfd177eed5b"
1287
+ },
1288
+ "truncated": 0,
1289
+ "non-truncated": 804,
1290
+ "padded": 804,
1291
+ "non-padded": 0,
1292
+ "effective_few_shots": 5.0,
1293
+ "num_truncated_few_shots": 0
1294
+ },
1295
+ "harness|hendrycksTest-us_foreign_policy|5": {
1296
+ "hashes": {
1297
+ "hash_examples": "4a56a01ddca44dca",
1298
+ "hash_full_prompts": "0c7a7081c71c07b6",
1299
+ "hash_input_tokens": "42054621e718dbee",
1300
+ "hash_cont_tokens": "2568d0e8e36fa959"
1301
+ },
1302
+ "truncated": 0,
1303
+ "non-truncated": 400,
1304
+ "padded": 400,
1305
+ "non-padded": 0,
1306
+ "effective_few_shots": 5.0,
1307
+ "num_truncated_few_shots": 0
1308
+ },
1309
+ "harness|hendrycksTest-virology|5": {
1310
+ "hashes": {
1311
+ "hash_examples": "451cc86a8c4f4fe9",
1312
+ "hash_full_prompts": "01e95325d8b738e4",
1313
+ "hash_input_tokens": "6c4f0aa4dc859c04",
1314
+ "hash_cont_tokens": "926cf60b0891f374"
1315
+ },
1316
+ "truncated": 0,
1317
+ "non-truncated": 664,
1318
+ "padded": 664,
1319
+ "non-padded": 0,
1320
+ "effective_few_shots": 5.0,
1321
+ "num_truncated_few_shots": 0
1322
+ },
1323
+ "harness|hendrycksTest-world_religions|5": {
1324
+ "hashes": {
1325
+ "hash_examples": "3b29cfaf1a81c379",
1326
+ "hash_full_prompts": "e0d79a15083dfdff",
1327
+ "hash_input_tokens": "6c75d44e092ff24f",
1328
+ "hash_cont_tokens": "c525a5de974c1ea3"
1329
+ },
1330
+ "truncated": 0,
1331
+ "non-truncated": 684,
1332
+ "padded": 684,
1333
+ "non-padded": 0,
1334
+ "effective_few_shots": 5.0,
1335
+ "num_truncated_few_shots": 0
1336
+ },
1337
+ "harness|truthfulqa:mc|0": {
1338
+ "hashes": {
1339
+ "hash_examples": "23176c0531c7b867",
1340
+ "hash_full_prompts": "36a6d90e75d92d4a",
1341
+ "hash_input_tokens": "2738d7ed7075faa7",
1342
+ "hash_cont_tokens": "c014154380b74b9e"
1343
+ },
1344
+ "truncated": 0,
1345
+ "non-truncated": 9996,
1346
+ "padded": 9996,
1347
+ "non-padded": 0,
1348
+ "effective_few_shots": 0.0,
1349
+ "num_truncated_few_shots": 0
1350
+ }
1351
+ },
1352
+ "summary_general": {
1353
+ "hashes": {
1354
+ "hash_examples": "d84d18e9a963753d",
1355
+ "hash_full_prompts": "12b540783521a8e6",
1356
+ "hash_input_tokens": "5c73a7dce6ccf737",
1357
+ "hash_cont_tokens": "fb1646e2bdd5fc38"
1358
+ },
1359
+ "total_evaluation_time_secondes": "36167.03701233864",
1360
+ "truncated": 0,
1361
+ "non-truncated": 111019,
1362
+ "padded": 110926,
1363
+ "non-padded": 93,
1364
+ "num_truncated_few_shots": 0
1365
+ }
1366
+ }
meta-llama/Llama-2-13b-hf/results_2023-09-07T13-43-41.802129.json ADDED
@@ -0,0 +1,61 @@
1
+ {
2
+ "config_general": {
3
+ "model_name": "meta-llama/Llama-2-13b-hf",
4
+ "model_sha": "db6b8eb1feabb38985fdf785a89895959e944936",
5
+ "model_size": "24.32 GB",
6
+ "model_dtype": "torch.float16",
7
+ "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
8
+ "num_few_shot_default": 0,
9
+ "num_fewshot_seeds": 1,
10
+ "override_batch_size": 1,
11
+ "max_samples": null,
12
+ "job_id": ""
13
+ },
14
+ "results": {
15
+ "harness|winogrande|5": {
16
+ "acc": 0.7663772691397001,
17
+ "acc_stderr": 0.011892194477183524
18
+ },
19
+ "all": {
20
+ "acc": 0.7663772691397001,
21
+ "acc_stderr": 0.011892194477183524
22
+ }
23
+ },
24
+ "versions": {
25
+ "harness|winogrande|5": 0,
26
+ "all": 0
27
+ },
28
+ "config_tasks": {
29
+ "harness|winogrande": "LM Harness task"
30
+ },
31
+ "summary_tasks": {
32
+ "harness|winogrande|5": {
33
+ "hashes": {
34
+ "hash_examples": "aada0a176fd81218",
35
+ "hash_full_prompts": "c8655cbd12de8409",
36
+ "hash_input_tokens": "c0bedf98cb040854",
37
+ "hash_cont_tokens": "f08975ad6f2d5864"
38
+ },
39
+ "truncated": 0,
40
+ "non-truncated": 2534,
41
+ "padded": 2432,
42
+ "non-padded": 102,
43
+ "effective_few_shots": 5.0,
44
+ "num_truncated_few_shots": 0
45
+ }
46
+ },
47
+ "summary_general": {
48
+ "hashes": {
49
+ "hash_examples": "42f54c7ae3f28ef3",
50
+ "hash_full_prompts": "897c968b27a8c59a",
51
+ "hash_input_tokens": "ee5c3cb253d643d1",
52
+ "hash_cont_tokens": "273a70958f734c00"
53
+ },
54
+ "total_evaluation_time_secondes": "172.65713024139404",
55
+ "truncated": 0,
56
+ "non-truncated": 2534,
57
+ "padded": 2432,
58
+ "non-padded": 102,
59
+ "num_truncated_few_shots": 0
60
+ }
61
+ }
meta-llama/Llama-2-13b-hf/results_2023-09-07T15-27-15.010124.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "config_general": {
3
+ "model_name": "meta-llama/Llama-2-13b-hf",
4
+ "model_sha": "db6b8eb1feabb38985fdf785a89895959e944936",
5
+ "model_size": "24.32 GB",
6
+ "model_dtype": "torch.float16",
7
+ "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
8
+ "num_few_shot_default": 0,
9
+ "num_fewshot_seeds": 1,
10
+ "override_batch_size": 1,
11
+ "max_samples": null,
12
+ "job_id": ""
13
+ },
14
+ "results": {
15
+ "harness|drop|3": {
16
+ "em": 0.0014681208053691276,
17
+ "em_stderr": 0.00039210421902982666,
18
+ "f1": 0.0607822986577181,
19
+ "f1_stderr": 0.0013583957676382913
20
+ },
21
+ "harness|gsm8k|5": {
22
+ "acc": 0.10841546626231995,
23
+ "acc_stderr": 0.008563852506627487
24
+ },
25
+ "harness|winogrande|5": {
26
+ "acc": 0.7663772691397001,
27
+ "acc_stderr": 0.011892194477183524
28
+ },
29
+ "all": {
30
+ "em": 0.0014681208053691276,
31
+ "em_stderr": 0.00039210421902982666,
32
+ "f1": 0.0607822986577181,
33
+ "f1_stderr": 0.0013583957676382913,
34
+ "acc": 0.43739636770101,
35
+ "acc_stderr": 0.010228023491905505
36
+ }
37
+ },
38
+ "versions": {
39
+ "harness|drop|3": 1,
40
+ "harness|gsm8k|5": 0,
41
+ "harness|winogrande|5": 0,
42
+ "all": 0
43
+ },
44
+ "config_tasks": {
45
+ "harness|drop": "LM Harness task",
46
+ "harness|gsm8k": "LM Harness task",
47
+ "harness|winogrande": "LM Harness task"
48
+ },
49
+ "summary_tasks": {
50
+ "harness|drop|3": {
51
+ "hashes": {
52
+ "hash_examples": "1d27416e8324e9a3",
53
+ "hash_full_prompts": "a5513ff9a741b385",
54
+ "hash_input_tokens": "42076f0efbb50aa6",
55
+ "hash_cont_tokens": "c9346ec21b7560de"
56
+ },
57
+ "truncated": 3,
58
+ "non-truncated": 9533,
59
+ "padded": 0,
60
+ "non-padded": 9536,
61
+ "effective_few_shots": 3.0,
62
+ "num_truncated_few_shots": 0
63
+ },
64
+ "harness|gsm8k|5": {
65
+ "hashes": {
66
+ "hash_examples": "4c0843a5d99bcfdc",
67
+ "hash_full_prompts": "41d55e83abc0e02d",
68
+ "hash_input_tokens": "bda342e47b5099b2",
69
+ "hash_cont_tokens": "32cafa77d8a3f04e"
70
+ },
71
+ "truncated": 0,
72
+ "non-truncated": 1319,
73
+ "padded": 0,
74
+ "non-padded": 1319,
75
+ "effective_few_shots": 5.0,
76
+ "num_truncated_few_shots": 0
77
+ },
78
+ "harness|winogrande|5": {
79
+ "hashes": {
80
+ "hash_examples": "aada0a176fd81218",
81
+ "hash_full_prompts": "c8655cbd12de8409",
82
+ "hash_input_tokens": "c0bedf98cb040854",
83
+ "hash_cont_tokens": "f08975ad6f2d5864"
84
+ },
85
+ "truncated": 0,
86
+ "non-truncated": 2534,
87
+ "padded": 2432,
88
+ "non-padded": 102,
89
+ "effective_few_shots": 5.0,
90
+ "num_truncated_few_shots": 0
91
+ }
92
+ },
93
+ "summary_general": {
94
+ "hashes": {
95
+ "hash_examples": "9b4d8993161e637d",
96
+ "hash_full_prompts": "08215e527b7e60a5",
97
+ "hash_input_tokens": "a12f3e3c934bd78b",
98
+ "hash_cont_tokens": "4d8f1e04b1d56e40"
99
+ },
100
+ "total_evaluation_time_secondes": "6066.877633810043",
101
+ "truncated": 3,
102
+ "non-truncated": 13386,
103
+ "padded": 2432,
104
+ "non-padded": 10957,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ }
meta-llama/Llama-2-13b-hf/results_2023-09-08T14-32-14.957248.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "config_general": {
3
+ "model_name": "meta-llama/Llama-2-13b-hf",
4
+ "model_sha": "db6b8eb1feabb38985fdf785a89895959e944936",
5
+ "model_size": "24.32 GB",
6
+ "model_dtype": "torch.float16",
7
+ "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
8
+ "num_few_shot_default": 0,
9
+ "num_fewshot_seeds": 1,
10
+ "override_batch_size": 1,
11
+ "max_samples": null,
12
+ "job_id": ""
13
+ },
14
+ "results": {
15
+ "harness|drop|3": {
16
+ "em": 0.0014681208053691276,
17
+ "em_stderr": 0.00039210421902982666,
18
+ "f1": 0.0607822986577181,
19
+ "f1_stderr": 0.0013583957676382913
20
+ },
21
+ "harness|gsm8k|5": {
22
+ "acc": 0.10841546626231995,
23
+ "acc_stderr": 0.008563852506627487
24
+ },
25
+ "harness|winogrande|5": {
26
+ "acc": 0.7663772691397001,
27
+ "acc_stderr": 0.011892194477183524
28
+ },
29
+ "all": {
30
+ "em": 0.0014681208053691276,
31
+ "em_stderr": 0.00039210421902982666,
32
+ "f1": 0.0607822986577181,
33
+ "f1_stderr": 0.0013583957676382913,
34
+ "acc": 0.43739636770101,
35
+ "acc_stderr": 0.010228023491905505
36
+ }
37
+ },
38
+ "versions": {
39
+ "harness|drop|3": 1,
40
+ "harness|gsm8k|5": 0,
41
+ "harness|winogrande|5": 0,
42
+ "all": 0
43
+ },
44
+ "config_tasks": {
45
+ "harness|drop": "LM Harness task",
46
+ "harness|gsm8k": "LM Harness task",
47
+ "harness|winogrande": "LM Harness task"
48
+ },
49
+ "summary_tasks": {
50
+ "harness|drop|3": {
51
+ "hashes": {
52
+ "hash_examples": "1d27416e8324e9a3",
53
+ "hash_full_prompts": "a5513ff9a741b385",
54
+ "hash_input_tokens": "42076f0efbb50aa6",
55
+ "hash_cont_tokens": "c9346ec21b7560de"
56
+ },
57
+ "truncated": 3,
58
+ "non-truncated": 9533,
59
+ "padded": 0,
60
+ "non-padded": 9536,
61
+ "effective_few_shots": 3.0,
62
+ "num_truncated_few_shots": 0
63
+ },
64
+ "harness|gsm8k|5": {
65
+ "hashes": {
66
+ "hash_examples": "4c0843a5d99bcfdc",
67
+ "hash_full_prompts": "41d55e83abc0e02d",
68
+ "hash_input_tokens": "bda342e47b5099b2",
69
+ "hash_cont_tokens": "32cafa77d8a3f04e"
70
+ },
71
+ "truncated": 0,
72
+ "non-truncated": 1319,
73
+ "padded": 0,
74
+ "non-padded": 1319,
75
+ "effective_few_shots": 5.0,
76
+ "num_truncated_few_shots": 0
77
+ },
78
+ "harness|winogrande|5": {
79
+ "hashes": {
80
+ "hash_examples": "aada0a176fd81218",
81
+ "hash_full_prompts": "c8655cbd12de8409",
82
+ "hash_input_tokens": "c0bedf98cb040854",
83
+ "hash_cont_tokens": "f08975ad6f2d5864"
84
+ },
85
+ "truncated": 0,
86
+ "non-truncated": 2534,
87
+ "padded": 2432,
88
+ "non-padded": 102,
89
+ "effective_few_shots": 5.0,
90
+ "num_truncated_few_shots": 0
91
+ }
92
+ },
93
+ "summary_general": {
94
+ "hashes": {
95
+ "hash_examples": "9b4d8993161e637d",
96
+ "hash_full_prompts": "08215e527b7e60a5",
97
+ "hash_input_tokens": "a12f3e3c934bd78b",
98
+ "hash_cont_tokens": "4d8f1e04b1d56e40"
99
+ },
100
+ "total_evaluation_time_secondes": "6159.0038006305695",
101
+ "truncated": 3,
102
+ "non-truncated": 13386,
103
+ "padded": 2432,
104
+ "non-padded": 10957,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ }
meta-llama/Llama-2-13b-hf/results_2023-10-14T23-00-26.644553.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "config_general": {
3
+ "model_name": "meta-llama/Llama-2-13b-hf",
4
+ "model_sha": "99afe33d7eaa87c7fc6ea2594a0e4e7e588ee0a4",
5
+ "model_size": "24.32 GB",
6
+ "model_dtype": "torch.float16",
7
+ "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
8
+ "num_few_shot_default": 0,
9
+ "num_fewshot_seeds": 1,
10
+ "override_batch_size": 1,
11
+ "max_samples": null,
12
+ "job_id": ""
13
+ },
14
+ "results": {
15
+ "harness|drop|3": {
16
+ "em": 0.0014681208053691276,
17
+ "em_stderr": 0.00039210421902982666,
18
+ "f1": 0.0607822986577181,
19
+ "f1_stderr": 0.0013583957676382913
20
+ },
21
+ "harness|gsm8k|5": {
22
+ "acc": 0.10841546626231995,
23
+ "acc_stderr": 0.008563852506627487
24
+ },
25
+ "harness|winogrande|5": {
26
+ "acc": 0.7663772691397001,
27
+ "acc_stderr": 0.011892194477183524
28
+ },
29
+ "all": {
30
+ "em": 0.0014681208053691276,
31
+ "em_stderr": 0.00039210421902982666,
32
+ "f1": 0.0607822986577181,
33
+ "f1_stderr": 0.0013583957676382913,
34
+ "acc": 0.43739636770101,
35
+ "acc_stderr": 0.010228023491905505
36
+ }
37
+ },
38
+ "versions": {
39
+ "harness|drop|3": 1,
40
+ "harness|gsm8k|5": 0,
41
+ "harness|winogrande|5": 0,
42
+ "all": 0
43
+ },
44
+ "config_tasks": {
45
+ "harness|drop": "LM Harness task",
46
+ "harness|gsm8k": "LM Harness task",
47
+ "harness|winogrande": "LM Harness task"
48
+ },
49
+ "summary_tasks": {
50
+ "harness|drop|3": {
51
+ "hashes": {
52
+ "hash_examples": "1d27416e8324e9a3",
53
+ "hash_full_prompts": "a5513ff9a741b385",
54
+ "hash_input_tokens": "42076f0efbb50aa6",
55
+ "hash_cont_tokens": "c9346ec21b7560de"
56
+ },
57
+ "truncated": 3,
58
+ "non-truncated": 9533,
59
+ "padded": 0,
60
+ "non-padded": 9536,
61
+ "effective_few_shots": 3.0,
62
+ "num_truncated_few_shots": 0
63
+ },
64
+ "harness|gsm8k|5": {
65
+ "hashes": {
66
+ "hash_examples": "4c0843a5d99bcfdc",
67
+ "hash_full_prompts": "41d55e83abc0e02d",
68
+ "hash_input_tokens": "bda342e47b5099b2",
69
+ "hash_cont_tokens": "32cafa77d8a3f04e"
70
+ },
71
+ "truncated": 0,
72
+ "non-truncated": 1319,
73
+ "padded": 0,
74
+ "non-padded": 1319,
75
+ "effective_few_shots": 5.0,
76
+ "num_truncated_few_shots": 0
77
+ },
78
+ "harness|winogrande|5": {
79
+ "hashes": {
80
+ "hash_examples": "aada0a176fd81218",
81
+ "hash_full_prompts": "c8655cbd12de8409",
82
+ "hash_input_tokens": "c0bedf98cb040854",
83
+ "hash_cont_tokens": "f08975ad6f2d5864"
84
+ },
85
+ "truncated": 0,
86
+ "non-truncated": 2534,
87
+ "padded": 2432,
88
+ "non-padded": 102,
89
+ "effective_few_shots": 5.0,
90
+ "num_truncated_few_shots": 0
91
+ }
92
+ },
93
+ "summary_general": {
94
+ "hashes": {
95
+ "hash_examples": "9b4d8993161e637d",
96
+ "hash_full_prompts": "08215e527b7e60a5",
97
+ "hash_input_tokens": "a12f3e3c934bd78b",
98
+ "hash_cont_tokens": "4d8f1e04b1d56e40"
99
+ },
100
+ "total_evaluation_time_secondes": "11938.282367944717",
101
+ "truncated": 3,
102
+ "non-truncated": 13386,
103
+ "padded": 2432,
104
+ "non-padded": 10957,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ }
meta-llama/Llama-2-70b-chat-hf/results.json ADDED
@@ -0,0 +1,868 @@
1
+ {
2
+ "results": {
3
+ "harness|arc:challenge|25": {
4
+ "acc": 0.6049488054607508,
5
+ "acc_stderr": 0.01428589829293817,
6
+ "acc_norm": 0.6459044368600683,
7
+ "acc_norm_stderr": 0.013975454122756564
8
+ },
9
+ "harness|hellaswag|10": {
10
+ "acc": 0.6693885680143398,
11
+ "acc_stderr": 0.004694718918225751,
12
+ "acc_norm": 0.8587930691097391,
13
+ "acc_norm_stderr": 0.003475231889452833
14
+ },
15
+ "harness|truthfulqa:mc|0": {
16
+ "mc1": 0.3561811505507956,
17
+ "mc1_stderr": 0.016763790728446335,
18
+ "mc2": 0.5280473232260097,
19
+ "mc2_stderr": 0.01553022126123046
20
+ },
21
+ "harness|hendrycksTest-abstract_algebra|5": {
22
+ "acc": 0.35,
23
+ "acc_stderr": 0.04793724854411021,
24
+ "acc_norm": 0.35,
25
+ "acc_norm_stderr": 0.04793724854411021
26
+ },
27
+ "harness|hendrycksTest-anatomy|5": {
28
+ "acc": 0.5185185185185185,
29
+ "acc_stderr": 0.043163785995113245,
30
+ "acc_norm": 0.5185185185185185,
31
+ "acc_norm_stderr": 0.043163785995113245
32
+ },
33
+ "harness|hendrycksTest-astronomy|5": {
34
+ "acc": 0.7302631578947368,
35
+ "acc_stderr": 0.03611780560284898,
36
+ "acc_norm": 0.7302631578947368,
37
+ "acc_norm_stderr": 0.03611780560284898
38
+ },
39
+ "harness|hendrycksTest-business_ethics|5": {
40
+ "acc": 0.65,
41
+ "acc_stderr": 0.0479372485441102,
42
+ "acc_norm": 0.65,
43
+ "acc_norm_stderr": 0.0479372485441102
44
+ },
45
+ "harness|hendrycksTest-clinical_knowledge|5": {
46
+ "acc": 0.6377358490566037,
47
+ "acc_stderr": 0.029582245128384303,
48
+ "acc_norm": 0.6377358490566037,
49
+ "acc_norm_stderr": 0.029582245128384303
50
+ },
51
+ "harness|hendrycksTest-college_biology|5": {
52
+ "acc": 0.75,
53
+ "acc_stderr": 0.03621034121889507,
54
+ "acc_norm": 0.75,
55
+ "acc_norm_stderr": 0.03621034121889507
56
+ },
57
+ "harness|hendrycksTest-college_chemistry|5": {
58
+ "acc": 0.48,
59
+ "acc_stderr": 0.050211673156867795,
60
+ "acc_norm": 0.48,
61
+ "acc_norm_stderr": 0.050211673156867795
62
+ },
63
+ "harness|hendrycksTest-college_computer_science|5": {
64
+ "acc": 0.59,
65
+ "acc_stderr": 0.04943110704237101,
66
+ "acc_norm": 0.59,
67
+ "acc_norm_stderr": 0.04943110704237101
68
+ },
69
+ "harness|hendrycksTest-college_mathematics|5": {
70
+ "acc": 0.34,
71
+ "acc_stderr": 0.04760952285695235,
72
+ "acc_norm": 0.34,
73
+ "acc_norm_stderr": 0.04760952285695235
74
+ },
75
+ "harness|hendrycksTest-college_medicine|5": {
76
+ "acc": 0.6011560693641619,
77
+ "acc_stderr": 0.0373362665538351,
78
+ "acc_norm": 0.6011560693641619,
79
+ "acc_norm_stderr": 0.0373362665538351
80
+ },
81
+ "harness|hendrycksTest-college_physics|5": {
82
+ "acc": 0.3333333333333333,
83
+ "acc_stderr": 0.04690650298201943,
84
+ "acc_norm": 0.3333333333333333,
85
+ "acc_norm_stderr": 0.04690650298201943
86
+ },
87
+ "harness|hendrycksTest-computer_security|5": {
88
+ "acc": 0.71,
89
+ "acc_stderr": 0.045604802157206845,
90
+ "acc_norm": 0.71,
91
+ "acc_norm_stderr": 0.045604802157206845
92
+ },
93
+ "harness|hendrycksTest-conceptual_physics|5": {
94
+ "acc": 0.5829787234042553,
95
+ "acc_stderr": 0.032232762667117124,
96
+ "acc_norm": 0.5829787234042553,
97
+ "acc_norm_stderr": 0.032232762667117124
98
+ },
99
+ "harness|hendrycksTest-econometrics|5": {
100
+ "acc": 0.41228070175438597,
101
+ "acc_stderr": 0.04630653203366595,
102
+ "acc_norm": 0.41228070175438597,
103
+ "acc_norm_stderr": 0.04630653203366595
104
+ },
105
+ "harness|hendrycksTest-electrical_engineering|5": {
106
+ "acc": 0.5793103448275863,
107
+ "acc_stderr": 0.0411391498118926,
108
+ "acc_norm": 0.5793103448275863,
109
+ "acc_norm_stderr": 0.0411391498118926
110
+ },
111
+ "harness|hendrycksTest-elementary_mathematics|5": {
112
+ "acc": 0.41005291005291006,
113
+ "acc_stderr": 0.02533120243894442,
114
+ "acc_norm": 0.41005291005291006,
115
+ "acc_norm_stderr": 0.02533120243894442
116
+ },
117
+ "harness|hendrycksTest-formal_logic|5": {
118
+ "acc": 0.4126984126984127,
119
+ "acc_stderr": 0.04403438954768176,
120
+ "acc_norm": 0.4126984126984127,
121
+ "acc_norm_stderr": 0.04403438954768176
122
+ },
123
+ "harness|hendrycksTest-global_facts|5": {
124
+ "acc": 0.43,
125
+ "acc_stderr": 0.049756985195624284,
126
+ "acc_norm": 0.43,
127
+ "acc_norm_stderr": 0.049756985195624284
128
+ },
129
+ "harness|hendrycksTest-high_school_biology|5": {
130
+ "acc": 0.7645161290322581,
131
+ "acc_stderr": 0.02413763242933771,
132
+ "acc_norm": 0.7645161290322581,
133
+ "acc_norm_stderr": 0.02413763242933771
134
+ },
135
+ "harness|hendrycksTest-high_school_chemistry|5": {
136
+ "acc": 0.4630541871921182,
137
+ "acc_stderr": 0.035083705204426656,
138
+ "acc_norm": 0.4630541871921182,
139
+ "acc_norm_stderr": 0.035083705204426656
140
+ },
141
+ "harness|hendrycksTest-high_school_computer_science|5": {
142
+ "acc": 0.65,
143
+ "acc_stderr": 0.047937248544110196,
144
+ "acc_norm": 0.65,
145
+ "acc_norm_stderr": 0.047937248544110196
146
+ },
147
+ "harness|hendrycksTest-high_school_european_history|5": {
148
+ "acc": 0.8181818181818182,
149
+ "acc_stderr": 0.03011768892950359,
150
+ "acc_norm": 0.8181818181818182,
151
+ "acc_norm_stderr": 0.03011768892950359
152
+ },
153
+ "harness|hendrycksTest-high_school_geography|5": {
154
+ "acc": 0.8080808080808081,
155
+ "acc_stderr": 0.02805779167298902,
156
+ "acc_norm": 0.8080808080808081,
157
+ "acc_norm_stderr": 0.02805779167298902
158
+ },
159
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
160
+ "acc": 0.8911917098445595,
161
+ "acc_stderr": 0.022473253332768783,
162
+ "acc_norm": 0.8911917098445595,
163
+ "acc_norm_stderr": 0.022473253332768783
164
+ },
165
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
166
+ "acc": 0.6410256410256411,
167
+ "acc_stderr": 0.02432173848460235,
168
+ "acc_norm": 0.6410256410256411,
169
+ "acc_norm_stderr": 0.02432173848460235
170
+ },
171
+ "harness|hendrycksTest-high_school_mathematics|5": {
172
+ "acc": 0.3,
173
+ "acc_stderr": 0.027940457136228416,
174
+ "acc_norm": 0.3,
175
+ "acc_norm_stderr": 0.027940457136228416
176
+ },
177
+ "harness|hendrycksTest-high_school_microeconomics|5": {
178
+ "acc": 0.6596638655462185,
179
+ "acc_stderr": 0.030778057422931673,
180
+ "acc_norm": 0.6596638655462185,
181
+ "acc_norm_stderr": 0.030778057422931673
182
+ },
183
+ "harness|hendrycksTest-high_school_physics|5": {
184
+ "acc": 0.423841059602649,
185
+ "acc_stderr": 0.04034846678603397,
186
+ "acc_norm": 0.423841059602649,
187
+ "acc_norm_stderr": 0.04034846678603397
188
+ },
189
+ "harness|hendrycksTest-high_school_psychology|5": {
190
+ "acc": 0.8385321100917431,
191
+ "acc_stderr": 0.015776239256163255,
192
+ "acc_norm": 0.8385321100917431,
193
+ "acc_norm_stderr": 0.015776239256163255
194
+ },
195
+ "harness|hendrycksTest-high_school_statistics|5": {
196
+ "acc": 0.48148148148148145,
197
+ "acc_stderr": 0.03407632093854052,
198
+ "acc_norm": 0.48148148148148145,
199
+ "acc_norm_stderr": 0.03407632093854052
200
+ },
201
+ "harness|hendrycksTest-high_school_us_history|5": {
202
+ "acc": 0.8578431372549019,
203
+ "acc_stderr": 0.024509803921568606,
204
+ "acc_norm": 0.8578431372549019,
205
+ "acc_norm_stderr": 0.024509803921568606
206
+ },
207
+ "harness|hendrycksTest-high_school_world_history|5": {
208
+ "acc": 0.8438818565400844,
209
+ "acc_stderr": 0.02362715946031867,
210
+ "acc_norm": 0.8438818565400844,
211
+ "acc_norm_stderr": 0.02362715946031867
212
+ },
213
+ "harness|hendrycksTest-human_aging|5": {
214
+ "acc": 0.726457399103139,
215
+ "acc_stderr": 0.02991858670779883,
216
+ "acc_norm": 0.726457399103139,
217
+ "acc_norm_stderr": 0.02991858670779883
218
+ },
219
+ "harness|hendrycksTest-human_sexuality|5": {
220
+ "acc": 0.7099236641221374,
221
+ "acc_stderr": 0.039800662464677665,
222
+ "acc_norm": 0.7099236641221374,
223
+ "acc_norm_stderr": 0.039800662464677665
224
+ },
225
+ "harness|hendrycksTest-international_law|5": {
226
+ "acc": 0.8016528925619835,
227
+ "acc_stderr": 0.03640118271990946,
228
+ "acc_norm": 0.8016528925619835,
229
+ "acc_norm_stderr": 0.03640118271990946
230
+ },
231
+ "harness|hendrycksTest-jurisprudence|5": {
232
+ "acc": 0.8240740740740741,
233
+ "acc_stderr": 0.036809181416738807,
234
+ "acc_norm": 0.8240740740740741,
235
+ "acc_norm_stderr": 0.036809181416738807
236
+ },
237
+ "harness|hendrycksTest-logical_fallacies|5": {
238
+ "acc": 0.7607361963190185,
239
+ "acc_stderr": 0.033519538795212696,
240
+ "acc_norm": 0.7607361963190185,
241
+ "acc_norm_stderr": 0.033519538795212696
242
+ },
243
+ "harness|hendrycksTest-machine_learning|5": {
244
+ "acc": 0.48214285714285715,
245
+ "acc_stderr": 0.047427623612430116,
246
+ "acc_norm": 0.48214285714285715,
247
+ "acc_norm_stderr": 0.047427623612430116
248
+ },
249
+ "harness|hendrycksTest-management|5": {
250
+ "acc": 0.8058252427184466,
251
+ "acc_stderr": 0.03916667762822584,
252
+ "acc_norm": 0.8058252427184466,
253
+ "acc_norm_stderr": 0.03916667762822584
254
+ },
255
+ "harness|hendrycksTest-marketing|5": {
256
+ "acc": 0.8717948717948718,
257
+ "acc_stderr": 0.02190190511507332,
258
+ "acc_norm": 0.8717948717948718,
259
+ "acc_norm_stderr": 0.02190190511507332
260
+ },
261
+ "harness|hendrycksTest-medical_genetics|5": {
262
+ "acc": 0.65,
263
+ "acc_stderr": 0.047937248544110196,
264
+ "acc_norm": 0.65,
265
+ "acc_norm_stderr": 0.047937248544110196
266
+ },
267
+ "harness|hendrycksTest-miscellaneous|5": {
268
+ "acc": 0.8275862068965517,
269
+ "acc_stderr": 0.013507943909371798,
270
+ "acc_norm": 0.8275862068965517,
271
+ "acc_norm_stderr": 0.013507943909371798
272
+ },
273
+ "harness|hendrycksTest-moral_disputes|5": {
274
+ "acc": 0.7167630057803468,
275
+ "acc_stderr": 0.02425790170532338,
276
+ "acc_norm": 0.7167630057803468,
277
+ "acc_norm_stderr": 0.02425790170532338
278
+ },
279
+ "harness|hendrycksTest-moral_scenarios|5": {
280
+ "acc": 0.39553072625698327,
281
+ "acc_stderr": 0.01635341541007577,
282
+ "acc_norm": 0.39553072625698327,
283
+ "acc_norm_stderr": 0.01635341541007577
284
+ },
285
+ "harness|hendrycksTest-nutrition|5": {
286
+ "acc": 0.6993464052287581,
287
+ "acc_stderr": 0.026256053835718968,
288
+ "acc_norm": 0.6993464052287581,
289
+ "acc_norm_stderr": 0.026256053835718968
290
+ },
291
+ "harness|hendrycksTest-philosophy|5": {
292
+ "acc": 0.7041800643086816,
293
+ "acc_stderr": 0.02592237178881877,
294
+ "acc_norm": 0.7041800643086816,
295
+ "acc_norm_stderr": 0.02592237178881877
296
+ },
297
+ "harness|hendrycksTest-prehistory|5": {
298
+ "acc": 0.7098765432098766,
299
+ "acc_stderr": 0.025251173936495036,
300
+ "acc_norm": 0.7098765432098766,
301
+ "acc_norm_stderr": 0.025251173936495036
302
+ },
303
+ "harness|hendrycksTest-professional_accounting|5": {
304
+ "acc": 0.5070921985815603,
305
+ "acc_stderr": 0.02982449855912901,
306
+ "acc_norm": 0.5070921985815603,
307
+ "acc_norm_stderr": 0.02982449855912901
308
+ },
309
+ "harness|hendrycksTest-professional_law|5": {
310
+ "acc": 0.4771838331160365,
311
+ "acc_stderr": 0.012756933382823694,
312
+ "acc_norm": 0.4771838331160365,
313
+ "acc_norm_stderr": 0.012756933382823694
314
+ },
315
+ "harness|hendrycksTest-professional_medicine|5": {
316
+ "acc": 0.5772058823529411,
317
+ "acc_stderr": 0.030008562845003476,
318
+ "acc_norm": 0.5772058823529411,
319
+ "acc_norm_stderr": 0.030008562845003476
320
+ },
321
+ "harness|hendrycksTest-professional_psychology|5": {
322
+ "acc": 0.6699346405228758,
323
+ "acc_stderr": 0.019023726160724556,
324
+ "acc_norm": 0.6699346405228758,
325
+ "acc_norm_stderr": 0.019023726160724556
326
+ },
327
+ "harness|hendrycksTest-public_relations|5": {
328
+ "acc": 0.6909090909090909,
329
+ "acc_stderr": 0.044262946482000985,
330
+ "acc_norm": 0.6909090909090909,
331
+ "acc_norm_stderr": 0.044262946482000985
332
+ },
333
+ "harness|hendrycksTest-security_studies|5": {
334
+ "acc": 0.7877551020408163,
335
+ "acc_stderr": 0.026176967197866767,
336
+ "acc_norm": 0.7877551020408163,
337
+ "acc_norm_stderr": 0.026176967197866767
338
+ },
339
+ "harness|hendrycksTest-sociology|5": {
340
+ "acc": 0.8706467661691543,
341
+ "acc_stderr": 0.023729830881018526,
342
+ "acc_norm": 0.8706467661691543,
343
+ "acc_norm_stderr": 0.023729830881018526
344
+ },
345
+ "harness|hendrycksTest-us_foreign_policy|5": {
346
+ "acc": 0.87,
347
+ "acc_stderr": 0.03379976689896309,
348
+ "acc_norm": 0.87,
349
+ "acc_norm_stderr": 0.03379976689896309
350
+ },
351
+ "harness|hendrycksTest-virology|5": {
352
+ "acc": 0.5120481927710844,
353
+ "acc_stderr": 0.03891364495835817,
354
+ "acc_norm": 0.5120481927710844,
355
+ "acc_norm_stderr": 0.03891364495835817
356
+ },
357
+ "harness|hendrycksTest-world_religions|5": {
358
+ "acc": 0.8187134502923976,
359
+ "acc_stderr": 0.029547741687640038,
360
+ "acc_norm": 0.8187134502923976,
361
+ "acc_norm_stderr": 0.029547741687640038
362
+ },
363
+ "all": {
364
+ "acc": 0.6390701952816291,
365
+ "acc_stderr": 0.03365809160773111,
366
+ "acc_norm": 0.6390701952816291,
367
+ "acc_norm_stderr": 0.03365809160773111
368
+ }
369
+ },
370
+ "versions": {
371
+ "harness|arc:challenge|25": 0,
372
+ "harness|hellaswag|10": 0,
373
+ "harness|truthfulqa:mc|0": 1,
374
+ "harness|hendrycksTest-abstract_algebra|5": 1,
375
+ "harness|hendrycksTest-anatomy|5": 1,
376
+ "harness|hendrycksTest-astronomy|5": 1,
377
+ "harness|hendrycksTest-business_ethics|5": 1,
378
+ "harness|hendrycksTest-clinical_knowledge|5": 1,
379
+ "harness|hendrycksTest-college_biology|5": 1,
380
+ "harness|hendrycksTest-college_chemistry|5": 1,
381
+ "harness|hendrycksTest-college_computer_science|5": 1,
382
+ "harness|hendrycksTest-college_mathematics|5": 1,
383
+ "harness|hendrycksTest-college_medicine|5": 1,
384
+ "harness|hendrycksTest-college_physics|5": 1,
385
+ "harness|hendrycksTest-computer_security|5": 1,
386
+ "harness|hendrycksTest-conceptual_physics|5": 1,
387
+ "harness|hendrycksTest-econometrics|5": 1,
388
+ "harness|hendrycksTest-electrical_engineering|5": 1,
389
+ "harness|hendrycksTest-elementary_mathematics|5": 1,
390
+ "harness|hendrycksTest-formal_logic|5": 1,
391
+ "harness|hendrycksTest-global_facts|5": 1,
392
+ "harness|hendrycksTest-high_school_biology|5": 1,
393
+ "harness|hendrycksTest-high_school_chemistry|5": 1,
394
+ "harness|hendrycksTest-high_school_computer_science|5": 1,
395
+ "harness|hendrycksTest-high_school_european_history|5": 1,
396
+ "harness|hendrycksTest-high_school_geography|5": 1,
397
+ "harness|hendrycksTest-high_school_government_and_politics|5": 1,
398
+ "harness|hendrycksTest-high_school_macroeconomics|5": 1,
399
+ "harness|hendrycksTest-high_school_mathematics|5": 1,
400
+ "harness|hendrycksTest-high_school_microeconomics|5": 1,
401
+ "harness|hendrycksTest-high_school_physics|5": 1,
402
+ "harness|hendrycksTest-high_school_psychology|5": 1,
403
+ "harness|hendrycksTest-high_school_statistics|5": 1,
404
+ "harness|hendrycksTest-high_school_us_history|5": 1,
405
+ "harness|hendrycksTest-high_school_world_history|5": 1,
406
+ "harness|hendrycksTest-human_aging|5": 1,
407
+ "harness|hendrycksTest-human_sexuality|5": 1,
408
+ "harness|hendrycksTest-international_law|5": 1,
409
+ "harness|hendrycksTest-jurisprudence|5": 1,
410
+ "harness|hendrycksTest-logical_fallacies|5": 1,
411
+ "harness|hendrycksTest-machine_learning|5": 1,
412
+ "harness|hendrycksTest-management|5": 1,
413
+ "harness|hendrycksTest-marketing|5": 1,
414
+ "harness|hendrycksTest-medical_genetics|5": 1,
415
+ "harness|hendrycksTest-miscellaneous|5": 1,
416
+ "harness|hendrycksTest-moral_disputes|5": 1,
417
+ "harness|hendrycksTest-moral_scenarios|5": 1,
418
+ "harness|hendrycksTest-nutrition|5": 1,
419
+ "harness|hendrycksTest-philosophy|5": 1,
420
+ "harness|hendrycksTest-prehistory|5": 1,
421
+ "harness|hendrycksTest-professional_accounting|5": 1,
422
+ "harness|hendrycksTest-professional_law|5": 1,
423
+ "harness|hendrycksTest-professional_medicine|5": 1,
424
+ "harness|hendrycksTest-professional_psychology|5": 1,
425
+ "harness|hendrycksTest-public_relations|5": 1,
426
+ "harness|hendrycksTest-security_studies|5": 1,
427
+ "harness|hendrycksTest-sociology|5": 1,
428
+ "harness|hendrycksTest-us_foreign_policy|5": 1,
429
+ "harness|hendrycksTest-virology|5": 1,
430
+ "harness|hendrycksTest-world_religions|5": 1,
431
+ "all": 0
432
+ },
433
+ "config": {
434
+ "model_name": "meta-llama/Llama-2-70b-chat-hf",
435
+ "model_sha": "7f54101c0fbb67a8143ca23eb8bd09b71f269c74",
436
+ "model_dtype": "torch.float16",
437
+ "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937",
438
+ "num_few_shot_default": 0,
439
+ "num_fewshot_seeds": 1,
440
+ "override_batch_size": 1,
441
+ "max_samples": null
442
+ },
443
+ "task_config": {
444
+ "harness|arc:challenge": "LM Harness task",
445
+ "harness|hellaswag": "LM Harness task",
446
+ "harness|truthfulqa:mc": "LM Harness task",
447
+ "harness|hendrycksTest-abstract_algebra": "LM Harness task",
448
+ "harness|hendrycksTest-anatomy": "LM Harness task",
449
+ "harness|hendrycksTest-astronomy": "LM Harness task",
450
+ "harness|hendrycksTest-business_ethics": "LM Harness task",
451
+ "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
452
+ "harness|hendrycksTest-college_biology": "LM Harness task",
453
+ "harness|hendrycksTest-college_chemistry": "LM Harness task",
454
+ "harness|hendrycksTest-college_computer_science": "LM Harness task",
455
+ "harness|hendrycksTest-college_mathematics": "LM Harness task",
456
+ "harness|hendrycksTest-college_medicine": "LM Harness task",
457
+ "harness|hendrycksTest-college_physics": "LM Harness task",
458
+ "harness|hendrycksTest-computer_security": "LM Harness task",
459
+ "harness|hendrycksTest-conceptual_physics": "LM Harness task",
460
+ "harness|hendrycksTest-econometrics": "LM Harness task",
461
+ "harness|hendrycksTest-electrical_engineering": "LM Harness task",
462
+ "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
463
+ "harness|hendrycksTest-formal_logic": "LM Harness task",
464
+ "harness|hendrycksTest-global_facts": "LM Harness task",
465
+ "harness|hendrycksTest-high_school_biology": "LM Harness task",
466
+ "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
467
+ "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
468
+ "harness|hendrycksTest-high_school_european_history": "LM Harness task",
469
+ "harness|hendrycksTest-high_school_geography": "LM Harness task",
470
+ "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
471
+ "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
472
+ "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
473
+ "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
474
+ "harness|hendrycksTest-high_school_physics": "LM Harness task",
475
+ "harness|hendrycksTest-high_school_psychology": "LM Harness task",
476
+ "harness|hendrycksTest-high_school_statistics": "LM Harness task",
477
+ "harness|hendrycksTest-high_school_us_history": "LM Harness task",
478
+ "harness|hendrycksTest-high_school_world_history": "LM Harness task",
479
+ "harness|hendrycksTest-human_aging": "LM Harness task",
480
+ "harness|hendrycksTest-human_sexuality": "LM Harness task",
481
+ "harness|hendrycksTest-international_law": "LM Harness task",
482
+ "harness|hendrycksTest-jurisprudence": "LM Harness task",
483
+ "harness|hendrycksTest-logical_fallacies": "LM Harness task",
484
+ "harness|hendrycksTest-machine_learning": "LM Harness task",
485
+ "harness|hendrycksTest-management": "LM Harness task",
486
+ "harness|hendrycksTest-marketing": "LM Harness task",
487
+ "harness|hendrycksTest-medical_genetics": "LM Harness task",
488
+ "harness|hendrycksTest-miscellaneous": "LM Harness task",
489
+ "harness|hendrycksTest-moral_disputes": "LM Harness task",
490
+ "harness|hendrycksTest-moral_scenarios": "LM Harness task",
491
+ "harness|hendrycksTest-nutrition": "LM Harness task",
492
+ "harness|hendrycksTest-philosophy": "LM Harness task",
493
+ "harness|hendrycksTest-prehistory": "LM Harness task",
494
+ "harness|hendrycksTest-professional_accounting": "LM Harness task",
495
+ "harness|hendrycksTest-professional_law": "LM Harness task",
496
+ "harness|hendrycksTest-professional_medicine": "LM Harness task",
497
+ "harness|hendrycksTest-professional_psychology": "LM Harness task",
498
+ "harness|hendrycksTest-public_relations": "LM Harness task",
499
+ "harness|hendrycksTest-security_studies": "LM Harness task",
500
+ "harness|hendrycksTest-sociology": "LM Harness task",
501
+ "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
502
+ "harness|hendrycksTest-virology": "LM Harness task",
503
+ "harness|hendrycksTest-world_religions": "LM Harness task"
504
+
505
+ },
506
+ "hashes": {
507
+ "harness|arc:challenge|25": {
508
+ "hash_examples": "fb8c51b1872daeda",
509
+ "hash_full_prompts": "045cbb916e5145c6",
510
+ "hash_input_tokens": "fab18a8dbccd885e",
511
+ "hash_cont_tokens": "e8abf848493b50f7"
512
+ },
513
+ "harness|hellaswag|10": {
514
+ "hash_examples": "e1768ecb99d7ecf0",
515
+ "hash_full_prompts": "0b4c16983130f84f",
516
+ "hash_input_tokens": "fd3d11be48664a7e",
517
+ "hash_cont_tokens": "9fe0a5c42e1532db"
518
+ },
519
+ "harness|truthfulqa:mc|0": {
520
+ "hash_examples": "23176c0531c7b867",
521
+ "hash_full_prompts": "36a6d90e75d92d4a",
522
+ "hash_input_tokens": "e3c2231820d87234",
523
+ "hash_cont_tokens": "f5da56a132aab151"
524
+ },
525
+ "harness|hendrycksTest-abstract_algebra|5": {
526
+ "hash_examples": "280f9f325b40559a",
527
+ "hash_full_prompts": "2f776a367d23aea2",
528
+ "hash_input_tokens": "c3792fce2534965f",
529
+ "hash_cont_tokens": "50421e30bef398f9"
530
+ },
531
+ "harness|hendrycksTest-anatomy|5": {
532
+ "hash_examples": "2f83a4f1cab4ba18",
533
+ "hash_full_prompts": "516f74bef25df620",
534
+ "hash_input_tokens": "1bfeea5736b995ee",
535
+ "hash_cont_tokens": "f11971a765cb609f"
536
+ },
537
+ "harness|hendrycksTest-astronomy|5": {
538
+ "hash_examples": "7d587b908da4d762",
539
+ "hash_full_prompts": "faf4e80f65de93ca",
540
+ "hash_input_tokens": "c4b2f1160f746871",
541
+ "hash_cont_tokens": "440a970fadecdc7b"
542
+ },
543
+ "harness|hendrycksTest-business_ethics|5": {
544
+ "hash_examples": "33e51740670de686",
545
+ "hash_full_prompts": "db01c3ef8e1479d4",
546
+ "hash_input_tokens": "b98d6ef1d1e2e17b",
547
+ "hash_cont_tokens": "50421e30bef398f9"
548
+ },
549
+ "harness|hendrycksTest-clinical_knowledge|5": {
550
+ "hash_examples": "f3366dbe7eefffa4",
551
+ "hash_full_prompts": "49654f71d94b65c3",
552
+ "hash_input_tokens": "9851119dacda883c",
553
+ "hash_cont_tokens": "7ecd60c25b9bfe5b"
554
+ },
555
+ "harness|hendrycksTest-college_biology|5": {
556
+ "hash_examples": "ca2b6753a0193e7f",
557
+ "hash_full_prompts": "2b460b75f1fdfefd",
558
+ "hash_input_tokens": "81a92a54cddefc2f",
559
+ "hash_cont_tokens": "875cde3af7a0ee14"
560
+ },
561
+ "harness|hendrycksTest-college_chemistry|5": {
562
+ "hash_examples": "22ff85f1d34f42d1",
563
+ "hash_full_prompts": "242c9be6da583e95",
564
+ "hash_input_tokens": "fd4c0cebdc2c1c3d",
565
+ "hash_cont_tokens": "50421e30bef398f9"
566
+ },
567
+ "harness|hendrycksTest-college_computer_science|5": {
568
+ "hash_examples": "30318289d717a5cf",
569
+ "hash_full_prompts": "ed2bdb4e87c4b371",
570
+ "hash_input_tokens": "49f6021f4c075e0d",
571
+ "hash_cont_tokens": "50421e30bef398f9"
572
+ },
573
+ "harness|hendrycksTest-college_mathematics|5": {
574
+ "hash_examples": "4944d1f0b6b5d911",
575
+ "hash_full_prompts": "770bc4281c973190",
576
+ "hash_input_tokens": "db61bad69399bfe8",
577
+ "hash_cont_tokens": "50421e30bef398f9"
578
+ },
579
+ "harness|hendrycksTest-college_medicine|5": {
580
+ "hash_examples": "dd69cc33381275af",
581
+ "hash_full_prompts": "ad2a53e5250ab46e",
582
+ "hash_input_tokens": "c458392f38424d77",
583
+ "hash_cont_tokens": "702fb6d82ff0d6ac"
584
+ },
585
+ "harness|hendrycksTest-college_physics|5": {
586
+ "hash_examples": "875dd26d22655b0d",
587
+ "hash_full_prompts": "833a0d7b55aed500",
588
+ "hash_input_tokens": "49cf4d8d8696b588",
589
+ "hash_cont_tokens": "f7b8097afc16a47c"
590
+ },
591
+ "harness|hendrycksTest-computer_security|5": {
592
+ "hash_examples": "006451eedc0ededb",
593
+ "hash_full_prompts": "94034c97e85d8f46",
594
+ "hash_input_tokens": "e81d46ca85fa2b7c",
595
+ "hash_cont_tokens": "50421e30bef398f9"
596
+ },
597
+ "harness|hendrycksTest-conceptual_physics|5": {
598
+ "hash_examples": "8874ece872d2ca4c",
599
+ "hash_full_prompts": "e40d15a34640d6fa",
600
+ "hash_input_tokens": "d5e231a26622e7d5",
601
+ "hash_cont_tokens": "aa0e8bc655f2f641"
602
+ },
603
+ "harness|hendrycksTest-econometrics|5": {
604
+ "hash_examples": "64d3623b0bfaa43f",
605
+ "hash_full_prompts": "612f340fae41338d",
606
+ "hash_input_tokens": "afa3603fd1622706",
607
+ "hash_cont_tokens": "b1cc6e7e9fcd3827"
608
+ },
609
+ "harness|hendrycksTest-electrical_engineering|5": {
610
+ "hash_examples": "e98f51780c674d7e",
611
+ "hash_full_prompts": "10275b312d812ae6",
612
+ "hash_input_tokens": "e0c62cf84ed22e7e",
613
+ "hash_cont_tokens": "2425a3f084a591ef"
614
+ },
615
+ "harness|hendrycksTest-elementary_mathematics|5": {
616
+ "hash_examples": "fc48208a5ac1c0ce",
617
+ "hash_full_prompts": "5ec274c6c82aca23",
618
+ "hash_input_tokens": "303123d2b857f30b",
619
+ "hash_cont_tokens": "bd87bf0c060fd925"
620
+ },
621
+ "harness|hendrycksTest-formal_logic|5": {
622
+ "hash_examples": "5a6525665f63ea72",
623
+ "hash_full_prompts": "07b92638c4a6b500",
624
+ "hash_input_tokens": "3fd8073b90b9736d",
625
+ "hash_cont_tokens": "eb8932890e0605db"
626
+ },
627
+ "harness|hendrycksTest-global_facts|5": {
628
+ "hash_examples": "371d70d743b2b89b",
629
+ "hash_full_prompts": "332fdee50a1921b4",
630
+ "hash_input_tokens": "f65051acd3210902",
631
+ "hash_cont_tokens": "50421e30bef398f9"
632
+ },
633
+ "harness|hendrycksTest-high_school_biology|5": {
634
+ "hash_examples": "a79e1018b1674052",
635
+ "hash_full_prompts": "e624e26ede922561",
636
+ "hash_input_tokens": "264263fc8c2123bc",
637
+ "hash_cont_tokens": "1ddcb86d28cde266"
638
+ },
639
+ "harness|hendrycksTest-high_school_chemistry|5": {
640
+ "hash_examples": "44bfc25c389f0e03",
641
+ "hash_full_prompts": "0e3e5f5d9246482a",
642
+ "hash_input_tokens": "42e1a18523b075e7",
643
+ "hash_cont_tokens": "176c8dcff38c5f8f"
644
+ },
645
+ "harness|hendrycksTest-high_school_computer_science|5": {
646
+ "hash_examples": "8b8cdb1084f24169",
647
+ "hash_full_prompts": "c00487e67c1813cc",
648
+ "hash_input_tokens": "6f109fbd505d364b",
649
+ "hash_cont_tokens": "50421e30bef398f9"
650
+ },
651
+ "harness|hendrycksTest-high_school_european_history|5": {
652
+ "hash_examples": "11cd32d0ef440171",
653
+ "hash_full_prompts": "318f4513c537c6bf",
654
+ "hash_input_tokens": "f1f73dd687da18d7",
655
+ "hash_cont_tokens": "674fc454bdc5ac93"
656
+ },
657
+ "harness|hendrycksTest-high_school_geography|5": {
658
+ "hash_examples": "b60019b9e80b642f",
659
+ "hash_full_prompts": "ee5789fcc1a81b1e",
660
+ "hash_input_tokens": "575ea4d290807e79",
661
+ "hash_cont_tokens": "03a5012b916274ea"
662
+ },
663
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
664
+ "hash_examples": "d221ec983d143dc3",
665
+ "hash_full_prompts": "ac42d888e1ce1155",
666
+ "hash_input_tokens": "5954aff17f30959c",
667
+ "hash_cont_tokens": "873d2aab226ba1d8"
668
+ },
669
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
670
+ "hash_examples": "59c2915cacfd3fbb",
671
+ "hash_full_prompts": "c6bd9d25158abd0e",
672
+ "hash_input_tokens": "cc4bb974def176ee",
673
+ "hash_cont_tokens": "c583432ad27fcfe0"
674
+ },
675
+ "harness|hendrycksTest-high_school_mathematics|5": {
676
+ "hash_examples": "1f8ac897608de342",
677
+ "hash_full_prompts": "5d88f41fc2d643a8",
678
+ "hash_input_tokens": "94100bcb23e1a13e",
679
+ "hash_cont_tokens": "d7907b61bcb8c123"
680
+ },
681
+ "harness|hendrycksTest-high_school_microeconomics|5": {
682
+ "hash_examples": "ead6a0f2f6c83370",
683
+ "hash_full_prompts": "bfc393381298609e",
684
+ "hash_input_tokens": "129c79724487131d",
685
+ "hash_cont_tokens": "f47f041de50333b9"
686
+ },
687
+ "harness|hendrycksTest-high_school_physics|5": {
688
+ "hash_examples": "c3f2025990afec64",
689
+ "hash_full_prompts": "fc78b4997e436734",
690
+ "hash_input_tokens": "82c2ac81ad5b141c",
691
+ "hash_cont_tokens": "0d56317b3e5eedb5"
692
+ },
693
+ "harness|hendrycksTest-high_school_psychology|5": {
694
+ "hash_examples": "21f8aab618f6d636",
695
+ "hash_full_prompts": "d5c76aa40b9dbc43",
696
+ "hash_input_tokens": "422b8bb7add88cc5",
697
+ "hash_cont_tokens": "09ba1243e7390c0f"
698
+ },
699
+ "harness|hendrycksTest-high_school_statistics|5": {
700
+ "hash_examples": "2386a60a11fc5de3",
701
+ "hash_full_prompts": "4c5c8be5aafac432",
702
+ "hash_input_tokens": "d3e6f7198120fbdc",
703
+ "hash_cont_tokens": "9cc29889c3d3f77d"
704
+ },
705
+ "harness|hendrycksTest-high_school_us_history|5": {
706
+ "hash_examples": "74961543be40f04f",
707
+ "hash_full_prompts": "5d5ca4840131ba21",
708
+ "hash_input_tokens": "50c9ff438c85a69e",
709
+ "hash_cont_tokens": "cdd0b3dc06d933e5"
710
+ },
711
+ "harness|hendrycksTest-high_school_world_history|5": {
712
+ "hash_examples": "2ad2f6b7198b2234",
713
+ "hash_full_prompts": "11845057459afd72",
714
+ "hash_input_tokens": "054824cc474caef5",
715
+ "hash_cont_tokens": "e02816433ff28daf"
716
+ },
717
+ "harness|hendrycksTest-human_aging|5": {
718
+ "hash_examples": "1a7199dc733e779b",
719
+ "hash_full_prompts": "756b9096b8eaf892",
720
+ "hash_input_tokens": "151f31a573d81257",
721
+ "hash_cont_tokens": "142a4a8a1138a214"
722
+ },
723
+ "harness|hendrycksTest-human_sexuality|5": {
724
+ "hash_examples": "7acb8fdad97f88a6",
725
+ "hash_full_prompts": "731a52ff15b8cfdb",
726
+ "hash_input_tokens": "b77763767fb18cc4",
727
+ "hash_cont_tokens": "bc54813e809b796d"
728
+ },
729
+ "harness|hendrycksTest-international_law|5": {
730
+ "hash_examples": "1300bfd0dfc59114",
731
+ "hash_full_prompts": "db2aefbff5eec996",
732
+ "hash_input_tokens": "a4e52c47400b8bca",
733
+ "hash_cont_tokens": "8ea8c5ff76a15bca"
734
+ },
735
+ "harness|hendrycksTest-jurisprudence|5": {
736
+ "hash_examples": "083b1e4904c48dc2",
737
+ "hash_full_prompts": "0f89ee3fe03d6a21",
738
+ "hash_input_tokens": "69644001a800b0f7",
739
+ "hash_cont_tokens": "e3a8cd951b6e3469"
740
+ },
741
+ "harness|hendrycksTest-logical_fallacies|5": {
742
+ "hash_examples": "709128f9926a634c",
743
+ "hash_full_prompts": "98a04b1f8f841069",
744
+ "hash_input_tokens": "332ca144a888ad7f",
745
+ "hash_cont_tokens": "3e9e0bdc248fd88a"
746
+ },
747
+ "harness|hendrycksTest-machine_learning|5": {
748
+ "hash_examples": "88f22a636029ae47",
749
+ "hash_full_prompts": "2e1c8d4b1e0cc921",
750
+ "hash_input_tokens": "a27f6dd3c2837ded",
751
+ "hash_cont_tokens": "55b12fb138c6a064"
752
+ },
753
+ "harness|hendrycksTest-management|5": {
754
+ "hash_examples": "8c8a1e07a2151dca",
755
+ "hash_full_prompts": "f51611f514b265b0",
756
+ "hash_input_tokens": "9f72696f5f9c4c80",
757
+ "hash_cont_tokens": "a01d6d39a83c4597"
758
+ },
759
+ "harness|hendrycksTest-marketing|5": {
760
+ "hash_examples": "2668953431f91e96",
761
+ "hash_full_prompts": "77562bef997c7650",
762
+ "hash_input_tokens": "0d9707022133f086",
763
+ "hash_cont_tokens": "6aeaed4d823c98aa"
764
+ },
765
+ "harness|hendrycksTest-medical_genetics|5": {
766
+ "hash_examples": "9c2dda34a2ea4fd2",
767
+ "hash_full_prompts": "202139046daa118f",
768
+ "hash_input_tokens": "e957962a583e58a2",
769
+ "hash_cont_tokens": "50421e30bef398f9"
770
+ },
771
+ "harness|hendrycksTest-miscellaneous|5": {
772
+ "hash_examples": "41adb694024809c2",
773
+ "hash_full_prompts": "bffec9fc237bcf93",
774
+ "hash_input_tokens": "46fe4585062aa36a",
775
+ "hash_cont_tokens": "9b0ab02a64603081"
776
+ },
777
+ "harness|hendrycksTest-moral_disputes|5": {
778
+ "hash_examples": "3171c13ba3c594c4",
779
+ "hash_full_prompts": "170831fc36f1d59e",
780
+ "hash_input_tokens": "cf9834b2c07721dc",
781
+ "hash_cont_tokens": "3b8bbe9108e55ce9"
782
+ },
783
+ "harness|hendrycksTest-moral_scenarios|5": {
784
+ "hash_examples": "9873e077e83e0546",
785
+ "hash_full_prompts": "08f4ceba3131a068",
786
+ "hash_input_tokens": "f257b7cce9ddb541",
787
+ "hash_cont_tokens": "3e9bfc0362e97330"
788
+ },
789
+ "harness|hendrycksTest-nutrition|5": {
790
+ "hash_examples": "7db1d8142ec14323",
791
+ "hash_full_prompts": "4c0e68e3586cb453",
792
+ "hash_input_tokens": "8650a7e901b42458",
793
+ "hash_cont_tokens": "23b2dc6ee2da4cfc"
794
+ },
795
+ "harness|hendrycksTest-philosophy|5": {
796
+ "hash_examples": "9b455b7d72811cc8",
797
+ "hash_full_prompts": "e467f822d8a0d3ff",
798
+ "hash_input_tokens": "4ba4c1d13e1040ec",
799
+ "hash_cont_tokens": "9f6ff69d23a48783"
800
+ },
801
+ "harness|hendrycksTest-prehistory|5": {
802
+ "hash_examples": "8be90d0f538f1560",
803
+ "hash_full_prompts": "152187949bcd0921",
804
+ "hash_input_tokens": "7431d7b2d5c13409",
805
+ "hash_cont_tokens": "d6458d743d875837"
806
+ },
807
+ "harness|hendrycksTest-professional_accounting|5": {
808
+ "hash_examples": "8d377597916cd07e",
809
+ "hash_full_prompts": "0eb7345d6144ee0d",
810
+ "hash_input_tokens": "e7bbb4a15e991424",
811
+ "hash_cont_tokens": "922a195f53a35662"
812
+ },
813
+ "harness|hendrycksTest-professional_law|5": {
814
+ "hash_examples": "cd9dbc52b3c932d6",
815
+ "hash_full_prompts": "36ac764272bfb182",
816
+ "hash_input_tokens": "9178e10bd0763ec4",
817
+ "hash_cont_tokens": "2e590029ef41fbcd"
818
+ },
819
+ "harness|hendrycksTest-professional_medicine|5": {
820
+ "hash_examples": "b20e4e816c1e383e",
821
+ "hash_full_prompts": "7b8d69ea2acaf2f7",
822
+ "hash_input_tokens": "f5a22012a54f70ea",
823
+ "hash_cont_tokens": "7cfee54dbddd5a98"
824
+ },
825
+ "harness|hendrycksTest-professional_psychology|5": {
826
+ "hash_examples": "d45b73b22f9cc039",
827
+ "hash_full_prompts": "fe8937e9ffc99771",
828
+ "hash_input_tokens": "8eeb91b3a7cbea0a",
829
+ "hash_cont_tokens": "a86677b2a45c20e1"
830
+ },
831
+ "harness|hendrycksTest-public_relations|5": {
832
+ "hash_examples": "0d25072e1761652a",
833
+ "hash_full_prompts": "f9adc39cfa9f42ba",
834
+ "hash_input_tokens": "bdfc559a40a1e8ec",
835
+ "hash_cont_tokens": "0d756ccaae031757"
836
+ },
837
+ "harness|hendrycksTest-security_studies|5": {
838
+ "hash_examples": "62bb8197e63d60d4",
839
+ "hash_full_prompts": "869c9c3ae196b7c3",
840
+ "hash_input_tokens": "d49711415961ced7",
841
+ "hash_cont_tokens": "b2229bc2cfbf594b"
842
+ },
843
+ "harness|hendrycksTest-sociology|5": {
844
+ "hash_examples": "e7959df87dea8672",
845
+ "hash_full_prompts": "1a1fc00e17b3a52a",
846
+ "hash_input_tokens": "f9a00c6fc5e9cea7",
847
+ "hash_cont_tokens": "c3a3bdfd177eed5b"
848
+ },
849
+ "harness|hendrycksTest-us_foreign_policy|5": {
850
+ "hash_examples": "4a56a01ddca44dca",
851
+ "hash_full_prompts": "0c7a7081c71c07b6",
852
+ "hash_input_tokens": "647f2d7d9075afaa",
853
+ "hash_cont_tokens": "50421e30bef398f9"
854
+ },
855
+ "harness|hendrycksTest-virology|5": {
856
+ "hash_examples": "451cc86a8c4f4fe9",
857
+ "hash_full_prompts": "01e95325d8b738e4",
858
+ "hash_input_tokens": "784f75f0ad6e0698",
859
+ "hash_cont_tokens": "af8b3658088cb37f"
860
+ },
861
+ "harness|hendrycksTest-world_religions|5": {
862
+ "hash_examples": "3b29cfaf1a81c379",
863
+ "hash_full_prompts": "e0d79a15083dfdff",
864
+ "hash_input_tokens": "17766ebe38853371",
865
+ "hash_cont_tokens": "060118bef6de4e0a"
866
+ }
867
+ }
868
+ }
meta-llama/Llama-2-70b-chat-hf/results_2023-10-17T05-07-42.486452.json ADDED
@@ -0,0 +1,107 @@
+ {
+ "config_general": {
+ "model_name": "meta-llama/Llama-2-70b-chat-hf",
+ "model_sha": "cfe96d938c52db7c6d936f99370c0801b24233c4",
+ "model_size": "128.64 GB",
+ "model_dtype": "torch.float16",
+ "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
+ "num_few_shot_default": 0,
+ "num_fewshot_seeds": 1,
+ "override_batch_size": 1,
+ "max_samples": null,
+ "job_id": ""
+ },
+ "results": {
+ "harness|drop|3": {
+ "em": 0.040373322147651006,
+ "em_stderr": 0.0020157564185176837,
+ "f1": 0.1050272651006715,
+ "f1_stderr": 0.0023756238577676155
+ },
+ "harness|gsm8k|5": {
+ "acc": 0.266868840030326,
+ "acc_stderr": 0.012183780551887957
+ },
+ "harness|winogrande|5": {
+ "acc": 0.8050513022888713,
+ "acc_stderr": 0.011134099415938268
+ },
+ "all": {
+ "em": 0.040373322147651006,
+ "em_stderr": 0.0020157564185176837,
+ "f1": 0.1050272651006715,
+ "f1_stderr": 0.0023756238577676155,
+ "acc": 0.5359600711595986,
+ "acc_stderr": 0.011658939983913113
+ }
+ },
+ "versions": {
+ "harness|drop|3": 1,
+ "harness|gsm8k|5": 0,
+ "harness|winogrande|5": 0,
+ "all": 0
+ },
+ "config_tasks": {
+ "harness|drop": "LM Harness task",
+ "harness|gsm8k": "LM Harness task",
+ "harness|winogrande": "LM Harness task"
+ },
+ "summary_tasks": {
+ "harness|drop|3": {
+ "hashes": {
+ "hash_examples": "1d27416e8324e9a3",
+ "hash_full_prompts": "a5513ff9a741b385",
+ "hash_input_tokens": "42076f0efbb50aa6",
+ "hash_cont_tokens": "b7f7e4a7d842e431"
+ },
+ "truncated": 3,
+ "non-truncated": 9533,
+ "padded": 0,
+ "non-padded": 9536,
+ "effective_few_shots": 3.0,
+ "num_truncated_few_shots": 0
+ },
+ "harness|gsm8k|5": {
+ "hashes": {
+ "hash_examples": "4c0843a5d99bcfdc",
+ "hash_full_prompts": "41d55e83abc0e02d",
+ "hash_input_tokens": "bda342e47b5099b2",
+ "hash_cont_tokens": "13bcb12a5f7991f1"
+ },
+ "truncated": 0,
+ "non-truncated": 1319,
+ "padded": 0,
+ "non-padded": 1319,
+ "effective_few_shots": 5.0,
+ "num_truncated_few_shots": 0
+ },
+ "harness|winogrande|5": {
+ "hashes": {
+ "hash_examples": "aada0a176fd81218",
+ "hash_full_prompts": "c8655cbd12de8409",
+ "hash_input_tokens": "c0bedf98cb040854",
+ "hash_cont_tokens": "f08975ad6f2d5864"
+ },
+ "truncated": 0,
+ "non-truncated": 2534,
+ "padded": 2432,
+ "non-padded": 102,
+ "effective_few_shots": 5.0,
+ "num_truncated_few_shots": 0
+ }
+ },
+ "summary_general": {
+ "hashes": {
+ "hash_examples": "9b4d8993161e637d",
+ "hash_full_prompts": "08215e527b7e60a5",
+ "hash_input_tokens": "a12f3e3c934bd78b",
+ "hash_cont_tokens": "5fa49b6513c85264"
+ },
+ "total_evaluation_time_secondes": "42063.55081868172",
+ "truncated": 3,
+ "non-truncated": 13386,
+ "padded": 2432,
+ "non-padded": 10957,
+ "num_truncated_few_shots": 0
+ }
+ }
meta-llama/Llama-2-70b-hf/results.json ADDED
@@ -0,0 +1,447 @@
1
+ {
2
+ "results": {
3
+ "harness|arc:challenge|25": {
4
+ "acc": 0.6262798634812287,
5
+ "acc_stderr": 0.014137708601759091,
6
+ "acc_norm": 0.6732081911262798,
7
+ "acc_norm_stderr": 0.013706665975587333
8
+ },
9
+ "harness|hellaswag|10": {
10
+ "acc": 0.6760605457080263,
11
+ "acc_stderr": 0.00467020812857923,
12
+ "acc_norm": 0.8733320055765784,
13
+ "acc_norm_stderr": 0.0033192094001351187
14
+ },
15
+ "harness|hendrycksTest-abstract_algebra|5": {
16
+ "acc": 0.33,
17
+ "acc_stderr": 0.04725815626252605,
18
+ "acc_norm": 0.33,
19
+ "acc_norm_stderr": 0.04725815626252605
20
+ },
21
+ "harness|hendrycksTest-anatomy|5": {
22
+ "acc": 0.6296296296296297,
23
+ "acc_stderr": 0.04171654161354544,
24
+ "acc_norm": 0.6296296296296297,
25
+ "acc_norm_stderr": 0.04171654161354544
26
+ },
27
+ "harness|hendrycksTest-astronomy|5": {
28
+ "acc": 0.8092105263157895,
29
+ "acc_stderr": 0.031975658210325,
30
+ "acc_norm": 0.8092105263157895,
31
+ "acc_norm_stderr": 0.031975658210325
32
+ },
33
+ "harness|hendrycksTest-business_ethics|5": {
34
+ "acc": 0.72,
35
+ "acc_stderr": 0.04512608598542127,
36
+ "acc_norm": 0.72,
37
+ "acc_norm_stderr": 0.04512608598542127
38
+ },
39
+ "harness|hendrycksTest-clinical_knowledge|5": {
40
+ "acc": 0.7169811320754716,
41
+ "acc_stderr": 0.027724236492700918,
42
+ "acc_norm": 0.7169811320754716,
43
+ "acc_norm_stderr": 0.027724236492700918
44
+ },
45
+ "harness|hendrycksTest-college_biology|5": {
46
+ "acc": 0.8472222222222222,
47
+ "acc_stderr": 0.030085743248565666,
48
+ "acc_norm": 0.8472222222222222,
49
+ "acc_norm_stderr": 0.030085743248565666
50
+ },
51
+ "harness|hendrycksTest-college_chemistry|5": {
52
+ "acc": 0.51,
53
+ "acc_stderr": 0.05024183937956912,
54
+ "acc_norm": 0.51,
55
+ "acc_norm_stderr": 0.05024183937956912
56
+ },
57
+ "harness|hendrycksTest-college_computer_science|5": {
58
+ "acc": 0.6,
59
+ "acc_stderr": 0.049236596391733084,
60
+ "acc_norm": 0.6,
61
+ "acc_norm_stderr": 0.049236596391733084
62
+ },
63
+ "harness|hendrycksTest-college_mathematics|5": {
64
+ "acc": 0.37,
65
+ "acc_stderr": 0.048523658709391,
66
+ "acc_norm": 0.37,
67
+ "acc_norm_stderr": 0.048523658709391
68
+ },
69
+ "harness|hendrycksTest-college_medicine|5": {
70
+ "acc": 0.6416184971098265,
71
+ "acc_stderr": 0.03656343653353159,
72
+ "acc_norm": 0.6416184971098265,
73
+ "acc_norm_stderr": 0.03656343653353159
74
+ },
75
+ "harness|hendrycksTest-college_physics|5": {
76
+ "acc": 0.37254901960784315,
77
+ "acc_stderr": 0.04810840148082635,
78
+ "acc_norm": 0.37254901960784315,
79
+ "acc_norm_stderr": 0.04810840148082635
80
+ },
81
+ "harness|hendrycksTest-computer_security|5": {
82
+ "acc": 0.77,
83
+ "acc_stderr": 0.04229525846816506,
84
+ "acc_norm": 0.77,
85
+ "acc_norm_stderr": 0.04229525846816506
86
+ },
87
+ "harness|hendrycksTest-conceptual_physics|5": {
88
+ "acc": 0.6638297872340425,
89
+ "acc_stderr": 0.030881618520676942,
90
+ "acc_norm": 0.6638297872340425,
91
+ "acc_norm_stderr": 0.030881618520676942
92
+ },
93
+ "harness|hendrycksTest-econometrics|5": {
94
+ "acc": 0.4473684210526316,
95
+ "acc_stderr": 0.04677473004491199,
96
+ "acc_norm": 0.4473684210526316,
97
+ "acc_norm_stderr": 0.04677473004491199
98
+ },
99
+ "harness|hendrycksTest-electrical_engineering|5": {
100
+ "acc": 0.6551724137931034,
101
+ "acc_stderr": 0.03960933549451207,
102
+ "acc_norm": 0.6551724137931034,
103
+ "acc_norm_stderr": 0.03960933549451207
104
+ },
105
+ "harness|hendrycksTest-elementary_mathematics|5": {
106
+ "acc": 0.43386243386243384,
107
+ "acc_stderr": 0.025525034382474894,
108
+ "acc_norm": 0.43386243386243384,
109
+ "acc_norm_stderr": 0.025525034382474894
110
+ },
111
+ "harness|hendrycksTest-formal_logic|5": {
112
+ "acc": 0.47619047619047616,
113
+ "acc_stderr": 0.04467062628403273,
114
+ "acc_norm": 0.47619047619047616,
115
+ "acc_norm_stderr": 0.04467062628403273
116
+ },
117
+ "harness|hendrycksTest-global_facts|5": {
118
+ "acc": 0.46,
119
+ "acc_stderr": 0.05009082659620332,
120
+ "acc_norm": 0.46,
121
+ "acc_norm_stderr": 0.05009082659620332
122
+ },
123
+ "harness|hendrycksTest-high_school_biology|5": {
124
+ "acc": 0.8193548387096774,
125
+ "acc_stderr": 0.02188617856717253,
126
+ "acc_norm": 0.8193548387096774,
127
+ "acc_norm_stderr": 0.02188617856717253
128
+ },
129
+ "harness|hendrycksTest-high_school_chemistry|5": {
130
+ "acc": 0.5123152709359606,
131
+ "acc_stderr": 0.035169204442208966,
132
+ "acc_norm": 0.5123152709359606,
133
+ "acc_norm_stderr": 0.035169204442208966
134
+ },
135
+ "harness|hendrycksTest-high_school_computer_science|5": {
136
+ "acc": 0.79,
137
+ "acc_stderr": 0.040936018074033256,
138
+ "acc_norm": 0.79,
139
+ "acc_norm_stderr": 0.040936018074033256
140
+ },
141
+ "harness|hendrycksTest-high_school_european_history|5": {
142
+ "acc": 0.8303030303030303,
143
+ "acc_stderr": 0.029311188674983134,
144
+ "acc_norm": 0.8303030303030303,
145
+ "acc_norm_stderr": 0.029311188674983134
146
+ },
147
+ "harness|hendrycksTest-high_school_geography|5": {
148
+ "acc": 0.8787878787878788,
149
+ "acc_stderr": 0.023253157951942084,
150
+ "acc_norm": 0.8787878787878788,
151
+ "acc_norm_stderr": 0.023253157951942084
152
+ },
153
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
154
+ "acc": 0.9430051813471503,
155
+ "acc_stderr": 0.016731085293607555,
156
+ "acc_norm": 0.9430051813471503,
157
+ "acc_norm_stderr": 0.016731085293607555
158
+ },
159
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
160
+ "acc": 0.7410256410256411,
161
+ "acc_stderr": 0.02221110681006167,
162
+ "acc_norm": 0.7410256410256411,
163
+ "acc_norm_stderr": 0.02221110681006167
164
+ },
165
+ "harness|hendrycksTest-high_school_mathematics|5": {
166
+ "acc": 0.35555555555555557,
167
+ "acc_stderr": 0.029185714949857403,
168
+ "acc_norm": 0.35555555555555557,
169
+ "acc_norm_stderr": 0.029185714949857403
170
+ },
171
+ "harness|hendrycksTest-high_school_microeconomics|5": {
172
+ "acc": 0.7647058823529411,
173
+ "acc_stderr": 0.02755361446786381,
174
+ "acc_norm": 0.7647058823529411,
175
+ "acc_norm_stderr": 0.02755361446786381
176
+ },
177
+ "harness|hendrycksTest-high_school_physics|5": {
178
+ "acc": 0.4304635761589404,
179
+ "acc_stderr": 0.04042809961395634,
180
+ "acc_norm": 0.4304635761589404,
181
+ "acc_norm_stderr": 0.04042809961395634
182
+ },
183
+ "harness|hendrycksTest-high_school_psychology|5": {
184
+ "acc": 0.8733944954128441,
185
+ "acc_stderr": 0.014257128686165169,
186
+ "acc_norm": 0.8733944954128441,
187
+ "acc_norm_stderr": 0.014257128686165169
188
+ },
189
+ "harness|hendrycksTest-high_school_statistics|5": {
190
+ "acc": 0.6342592592592593,
191
+ "acc_stderr": 0.032847388576472056,
192
+ "acc_norm": 0.6342592592592593,
193
+ "acc_norm_stderr": 0.032847388576472056
194
+ },
195
+ "harness|hendrycksTest-high_school_us_history|5": {
196
+ "acc": 0.8970588235294118,
197
+ "acc_stderr": 0.02132833757080437,
198
+ "acc_norm": 0.8970588235294118,
199
+ "acc_norm_stderr": 0.02132833757080437
200
+ },
201
+ "harness|hendrycksTest-high_school_world_history|5": {
202
+ "acc": 0.8776371308016878,
203
+ "acc_stderr": 0.021331741829746786,
204
+ "acc_norm": 0.8776371308016878,
205
+ "acc_norm_stderr": 0.021331741829746786
206
+ },
207
+ "harness|hendrycksTest-human_aging|5": {
208
+ "acc": 0.8026905829596412,
209
+ "acc_stderr": 0.02670985334496796,
210
+ "acc_norm": 0.8026905829596412,
211
+ "acc_norm_stderr": 0.02670985334496796
212
+ },
213
+ "harness|hendrycksTest-human_sexuality|5": {
214
+ "acc": 0.8778625954198473,
215
+ "acc_stderr": 0.028718776889342344,
216
+ "acc_norm": 0.8778625954198473,
217
+ "acc_norm_stderr": 0.028718776889342344
218
+ },
219
+ "harness|hendrycksTest-international_law|5": {
220
+ "acc": 0.8760330578512396,
221
+ "acc_stderr": 0.03008309871603521,
222
+ "acc_norm": 0.8760330578512396,
223
+ "acc_norm_stderr": 0.03008309871603521
224
+ },
225
+ "harness|hendrycksTest-jurisprudence|5": {
226
+ "acc": 0.8333333333333334,
227
+ "acc_stderr": 0.03602814176392645,
228
+ "acc_norm": 0.8333333333333334,
229
+ "acc_norm_stderr": 0.03602814176392645
230
+ },
231
+ "harness|hendrycksTest-logical_fallacies|5": {
232
+ "acc": 0.803680981595092,
233
+ "acc_stderr": 0.031207970394709218,
234
+ "acc_norm": 0.803680981595092,
235
+ "acc_norm_stderr": 0.031207970394709218
236
+ },
237
+ "harness|hendrycksTest-machine_learning|5": {
238
+ "acc": 0.5357142857142857,
239
+ "acc_stderr": 0.04733667890053756,
240
+ "acc_norm": 0.5357142857142857,
241
+ "acc_norm_stderr": 0.04733667890053756
242
+ },
243
+ "harness|hendrycksTest-management|5": {
244
+ "acc": 0.8349514563106796,
245
+ "acc_stderr": 0.03675668832233188,
246
+ "acc_norm": 0.8349514563106796,
247
+ "acc_norm_stderr": 0.03675668832233188
248
+ },
249
+ "harness|hendrycksTest-marketing|5": {
250
+ "acc": 0.905982905982906,
251
+ "acc_stderr": 0.01911989279892498,
252
+ "acc_norm": 0.905982905982906,
253
+ "acc_norm_stderr": 0.01911989279892498
254
+ },
255
+ "harness|hendrycksTest-medical_genetics|5": {
256
+ "acc": 0.74,
257
+ "acc_stderr": 0.04408440022768077,
258
+ "acc_norm": 0.74,
259
+ "acc_norm_stderr": 0.04408440022768077
260
+ },
261
+ "harness|hendrycksTest-miscellaneous|5": {
262
+ "acc": 0.8620689655172413,
263
+ "acc_stderr": 0.012331009307795656,
264
+ "acc_norm": 0.8620689655172413,
265
+ "acc_norm_stderr": 0.012331009307795656
266
+ },
267
+ "harness|hendrycksTest-moral_disputes|5": {
268
+ "acc": 0.7774566473988439,
269
+ "acc_stderr": 0.02239421566194282,
270
+ "acc_norm": 0.7774566473988439,
271
+ "acc_norm_stderr": 0.02239421566194282
272
+ },
273
+ "harness|hendrycksTest-moral_scenarios|5": {
274
+ "acc": 0.4547486033519553,
275
+ "acc_stderr": 0.016653875777524012,
276
+ "acc_norm": 0.4547486033519553,
277
+ "acc_norm_stderr": 0.016653875777524012
278
+ },
279
+ "harness|hendrycksTest-nutrition|5": {
280
+ "acc": 0.7810457516339869,
281
+ "acc_stderr": 0.02367908986180772,
282
+ "acc_norm": 0.7810457516339869,
283
+ "acc_norm_stderr": 0.02367908986180772
284
+ },
285
+ "harness|hendrycksTest-philosophy|5": {
286
+ "acc": 0.7877813504823151,
287
+ "acc_stderr": 0.023222756797435115,
288
+ "acc_norm": 0.7877813504823151,
289
+ "acc_norm_stderr": 0.023222756797435115
290
+ },
291
+ "harness|hendrycksTest-prehistory|5": {
292
+ "acc": 0.8364197530864198,
293
+ "acc_stderr": 0.020581466138257114,
294
+ "acc_norm": 0.8364197530864198,
295
+ "acc_norm_stderr": 0.020581466138257114
296
+ },
297
+ "harness|hendrycksTest-professional_accounting|5": {
298
+ "acc": 0.5673758865248227,
299
+ "acc_stderr": 0.02955545423677884,
300
+ "acc_norm": 0.5673758865248227,
301
+ "acc_norm_stderr": 0.02955545423677884
302
+ },
303
+ "harness|hendrycksTest-professional_law|5": {
304
+ "acc": 0.5319426336375489,
305
+ "acc_stderr": 0.012744149704869645,
306
+ "acc_norm": 0.5319426336375489,
307
+ "acc_norm_stderr": 0.012744149704869645
308
+ },
309
+ "harness|hendrycksTest-professional_medicine|5": {
310
+ "acc": 0.75,
311
+ "acc_stderr": 0.026303648393696036,
312
+ "acc_norm": 0.75,
313
+ "acc_norm_stderr": 0.026303648393696036
314
+ },
315
+ "harness|hendrycksTest-professional_psychology|5": {
316
+ "acc": 0.7565359477124183,
317
+ "acc_stderr": 0.01736247376214662,
318
+ "acc_norm": 0.7565359477124183,
319
+ "acc_norm_stderr": 0.01736247376214662
320
+ },
321
+ "harness|hendrycksTest-public_relations|5": {
322
+ "acc": 0.6909090909090909,
323
+ "acc_stderr": 0.044262946482000985,
324
+ "acc_norm": 0.6909090909090909,
325
+ "acc_norm_stderr": 0.044262946482000985
326
+ },
327
+ "harness|hendrycksTest-security_studies|5": {
328
+ "acc": 0.7918367346938775,
329
+ "acc_stderr": 0.0259911176728133,
330
+ "acc_norm": 0.7918367346938775,
331
+ "acc_norm_stderr": 0.0259911176728133
332
+ },
333
+ "harness|hendrycksTest-sociology|5": {
334
+ "acc": 0.900497512437811,
335
+ "acc_stderr": 0.021166216304659393,
336
+ "acc_norm": 0.900497512437811,
337
+ "acc_norm_stderr": 0.021166216304659393
338
+ },
339
+ "harness|hendrycksTest-us_foreign_policy|5": {
340
+ "acc": 0.92,
341
+ "acc_stderr": 0.0272659924344291,
342
+ "acc_norm": 0.92,
343
+ "acc_norm_stderr": 0.0272659924344291
344
+ },
345
+ "harness|hendrycksTest-virology|5": {
346
+ "acc": 0.5301204819277109,
347
+ "acc_stderr": 0.03885425420866767,
348
+ "acc_norm": 0.5301204819277109,
349
+ "acc_norm_stderr": 0.03885425420866767
350
+ },
351
+ "harness|hendrycksTest-world_religions|5": {
352
+ "acc": 0.8538011695906432,
353
+ "acc_stderr": 0.027097290118070806,
354
+ "acc_norm": 0.8538011695906432,
355
+ "acc_norm_stderr": 0.027097290118070806
356
+ },
357
+ "harness|truthfulqa:mc|0": {
358
+ "mc1": 0.3108935128518972,
359
+ "mc1_stderr": 0.016203316673559696,
360
+ "mc2": 0.44923493721887353,
361
+ "mc2_stderr": 0.01390226410719232
362
+ },
363
+ "all": {
364
+ "acc": 0.6967225637378714,
365
+ "acc_stderr": 0.030867069907791145,
366
+ "acc_norm": 0.7008615431872544,
367
+ "acc_norm_stderr": 0.030836865817034945,
368
+ "mc1": 0.3108935128518972,
369
+ "mc1_stderr": 0.016203316673559696,
370
+ "mc2": 0.44923493721887353,
371
+ "mc2_stderr": 0.01390226410719232
372
+ }
373
+ },
374
+ "versions": {
375
+ "harness|arc:challenge|25": 0,
376
+ "harness|hellaswag|10": 0,
377
+ "harness|hendrycksTest-abstract_algebra|5": 1,
378
+ "harness|hendrycksTest-anatomy|5": 1,
379
+ "harness|hendrycksTest-astronomy|5": 1,
380
+ "harness|hendrycksTest-business_ethics|5": 1,
381
+ "harness|hendrycksTest-clinical_knowledge|5": 1,
382
+ "harness|hendrycksTest-college_biology|5": 1,
383
+ "harness|hendrycksTest-college_chemistry|5": 1,
384
+ "harness|hendrycksTest-college_computer_science|5": 1,
385
+ "harness|hendrycksTest-college_mathematics|5": 1,
386
+ "harness|hendrycksTest-college_medicine|5": 1,
387
+ "harness|hendrycksTest-college_physics|5": 1,
388
+ "harness|hendrycksTest-computer_security|5": 1,
389
+ "harness|hendrycksTest-conceptual_physics|5": 1,
390
+ "harness|hendrycksTest-econometrics|5": 1,
391
+ "harness|hendrycksTest-electrical_engineering|5": 1,
392
+ "harness|hendrycksTest-elementary_mathematics|5": 1,
393
+ "harness|hendrycksTest-formal_logic|5": 1,
394
+ "harness|hendrycksTest-global_facts|5": 1,
395
+ "harness|hendrycksTest-high_school_biology|5": 1,
396
+ "harness|hendrycksTest-high_school_chemistry|5": 1,
397
+ "harness|hendrycksTest-high_school_computer_science|5": 1,
398
+ "harness|hendrycksTest-high_school_european_history|5": 1,
399
+ "harness|hendrycksTest-high_school_geography|5": 1,
400
+ "harness|hendrycksTest-high_school_government_and_politics|5": 1,
401
+ "harness|hendrycksTest-high_school_macroeconomics|5": 1,
402
+ "harness|hendrycksTest-high_school_mathematics|5": 1,
403
+ "harness|hendrycksTest-high_school_microeconomics|5": 1,
404
+ "harness|hendrycksTest-high_school_physics|5": 1,
405
+ "harness|hendrycksTest-high_school_psychology|5": 1,
406
+ "harness|hendrycksTest-high_school_statistics|5": 1,
407
+ "harness|hendrycksTest-high_school_us_history|5": 1,
408
+ "harness|hendrycksTest-high_school_world_history|5": 1,
409
+ "harness|hendrycksTest-human_aging|5": 1,
410
+ "harness|hendrycksTest-human_sexuality|5": 1,
411
+ "harness|hendrycksTest-international_law|5": 1,
412
+ "harness|hendrycksTest-jurisprudence|5": 1,
413
+ "harness|hendrycksTest-logical_fallacies|5": 1,
414
+ "harness|hendrycksTest-machine_learning|5": 1,
415
+ "harness|hendrycksTest-management|5": 1,
416
+ "harness|hendrycksTest-marketing|5": 1,
417
+ "harness|hendrycksTest-medical_genetics|5": 1,
418
+ "harness|hendrycksTest-miscellaneous|5": 1,
419
+ "harness|hendrycksTest-moral_disputes|5": 1,
420
+ "harness|hendrycksTest-moral_scenarios|5": 1,
421
+ "harness|hendrycksTest-nutrition|5": 1,
422
+ "harness|hendrycksTest-philosophy|5": 1,
423
+ "harness|hendrycksTest-prehistory|5": 1,
424
+ "harness|hendrycksTest-professional_accounting|5": 1,
425
+ "harness|hendrycksTest-professional_law|5": 1,
426
+ "harness|hendrycksTest-professional_medicine|5": 1,
427
+ "harness|hendrycksTest-professional_psychology|5": 1,
428
+ "harness|hendrycksTest-public_relations|5": 1,
429
+ "harness|hendrycksTest-security_studies|5": 1,
430
+ "harness|hendrycksTest-sociology|5": 1,
431
+ "harness|hendrycksTest-us_foreign_policy|5": 1,
432
+ "harness|hendrycksTest-virology|5": 1,
433
+ "harness|hendrycksTest-world_religions|5": 1,
434
+ "harness|truthfulqa:mc|0": 1,
435
+ "all": 0
436
+ },
437
+ "config": {
438
+ "model_name": "meta-llama/Llama-2-70b-hf",
439
+ "model_sha": "ed7b07231238f836b99bf45701b9a0063576b194",
440
+ "model_dtype": "torch.float16",
441
+ "lighteval_sha": "d2e819bc028044e701a13b954d3326ceddb71b98",
442
+ "num_few_shot_default": 0,
443
+ "num_fewshot_seeds": 1,
444
+ "override_batch_size": 1,
445
+ "max_samples": null
446
+ }
447
+ }
meta-llama/Llama-2-70b-hf/results_2023-09-08T23-38-08.931556.json ADDED
@@ -0,0 +1,107 @@
+ {
+ "config_general": {
+ "model_name": "meta-llama/Llama-2-70b-hf",
+ "model_sha": "cc8aa03a000ff08b4d5c5b39673321a2a396c396",
+ "model_size": "128.64 GB",
+ "model_dtype": "torch.float16",
+ "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
+ "num_few_shot_default": 0,
+ "num_fewshot_seeds": 1,
+ "override_batch_size": 1,
+ "max_samples": null,
+ "job_id": ""
+ },
+ "results": {
+ "harness|drop|3": {
+ "em": 0.0017827181208053692,
+ "em_stderr": 0.00043200973460388544,
+ "f1": 0.06615562080536916,
+ "f1_stderr": 0.0013739852117668813
+ },
+ "harness|gsm8k|5": {
+ "acc": 0.33965125094768767,
+ "acc_stderr": 0.01304504506766526
+ },
+ "harness|winogrande|5": {
+ "acc": 0.8374112075769534,
+ "acc_stderr": 0.010370455551343326
+ },
+ "all": {
+ "em": 0.0017827181208053692,
+ "em_stderr": 0.00043200973460388544,
+ "f1": 0.06615562080536916,
+ "f1_stderr": 0.0013739852117668813,
+ "acc": 0.5885312292623206,
+ "acc_stderr": 0.011707750309504293
+ }
+ },
+ "versions": {
+ "harness|drop|3": 1,
+ "harness|gsm8k|5": 0,
+ "harness|winogrande|5": 0,
+ "all": 0
+ },
+ "config_tasks": {
+ "harness|drop": "LM Harness task",
+ "harness|gsm8k": "LM Harness task",
+ "harness|winogrande": "LM Harness task"
+ },
+ "summary_tasks": {
+ "harness|drop|3": {
+ "hashes": {
+ "hash_examples": "1d27416e8324e9a3",
+ "hash_full_prompts": "a5513ff9a741b385",
+ "hash_input_tokens": "42076f0efbb50aa6",
+ "hash_cont_tokens": "32bc149506251e60"
+ },
+ "truncated": 3,
+ "non-truncated": 9533,
+ "padded": 0,
+ "non-padded": 9536,
+ "effective_few_shots": 3.0,
+ "num_truncated_few_shots": 0
+ },
+ "harness|gsm8k|5": {
+ "hashes": {
+ "hash_examples": "4c0843a5d99bcfdc",
+ "hash_full_prompts": "41d55e83abc0e02d",
+ "hash_input_tokens": "bda342e47b5099b2",
+ "hash_cont_tokens": "a95ce63226eb9a2d"
+ },
+ "truncated": 0,
+ "non-truncated": 1319,
+ "padded": 0,
+ "non-padded": 1319,
+ "effective_few_shots": 5.0,
+ "num_truncated_few_shots": 0
+ },
+ "harness|winogrande|5": {
+ "hashes": {
+ "hash_examples": "aada0a176fd81218",
+ "hash_full_prompts": "c8655cbd12de8409",
+ "hash_input_tokens": "c0bedf98cb040854",
+ "hash_cont_tokens": "f08975ad6f2d5864"
+ },
+ "truncated": 0,
+ "non-truncated": 2534,
+ "padded": 2432,
+ "non-padded": 102,
+ "effective_few_shots": 5.0,
+ "num_truncated_few_shots": 0
+ }
+ },
+ "summary_general": {
+ "hashes": {
+ "hash_examples": "9b4d8993161e637d",
+ "hash_full_prompts": "08215e527b7e60a5",
+ "hash_input_tokens": "a12f3e3c934bd78b",
+ "hash_cont_tokens": "dff37de5e6c9aeb7"
+ },
+ "total_evaluation_time_secondes": "28373.291680336",
+ "truncated": 3,
+ "non-truncated": 13386,
+ "padded": 2432,
+ "non-padded": 10957,
+ "num_truncated_few_shots": 0
+ }
+ }
meta-llama/Llama-2-70b-hf/results_2023-09-18T06-46-44.905361.json ADDED
@@ -0,0 +1,107 @@
+ {
+ "config_general": {
+ "model_name": "meta-llama/Llama-2-70b-hf",
+ "model_sha": "cc8aa03a000ff08b4d5c5b39673321a2a396c396",
+ "model_size": "128.64 GB",
+ "model_dtype": "torch.float16",
+ "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
+ "num_few_shot_default": 0,
+ "num_fewshot_seeds": 1,
+ "override_batch_size": 1,
+ "max_samples": null,
+ "job_id": ""
+ },
+ "results": {
+ "harness|drop|3": {
+ "em": 0.0017827181208053692,
+ "em_stderr": 0.00043200973460388544,
+ "f1": 0.06615562080536916,
+ "f1_stderr": 0.0013739852117668813
+ },
+ "harness|gsm8k|5": {
+ "acc": 0.33965125094768767,
+ "acc_stderr": 0.01304504506766526
+ },
+ "harness|winogrande|5": {
+ "acc": 0.8374112075769534,
+ "acc_stderr": 0.010370455551343326
+ },
+ "all": {
+ "em": 0.0017827181208053692,
+ "em_stderr": 0.00043200973460388544,
+ "f1": 0.06615562080536916,
+ "f1_stderr": 0.0013739852117668813,
+ "acc": 0.5885312292623206,
+ "acc_stderr": 0.011707750309504293
+ }
+ },
+ "versions": {
+ "harness|drop|3": 1,
+ "harness|gsm8k|5": 0,
+ "harness|winogrande|5": 0,
+ "all": 0
+ },
+ "config_tasks": {
+ "harness|drop": "LM Harness task",
+ "harness|gsm8k": "LM Harness task",
+ "harness|winogrande": "LM Harness task"
+ },
+ "summary_tasks": {
+ "harness|drop|3": {
+ "hashes": {
+ "hash_examples": "1d27416e8324e9a3",
+ "hash_full_prompts": "a5513ff9a741b385",
+ "hash_input_tokens": "42076f0efbb50aa6",
+ "hash_cont_tokens": "32bc149506251e60"
+ },
+ "truncated": 3,
+ "non-truncated": 9533,
+ "padded": 0,
+ "non-padded": 9536,
+ "effective_few_shots": 3.0,
+ "num_truncated_few_shots": 0
+ },
+ "harness|gsm8k|5": {
+ "hashes": {
+ "hash_examples": "4c0843a5d99bcfdc",
+ "hash_full_prompts": "41d55e83abc0e02d",
+ "hash_input_tokens": "bda342e47b5099b2",
+ "hash_cont_tokens": "a95ce63226eb9a2d"
+ },
+ "truncated": 0,
+ "non-truncated": 1319,
+ "padded": 0,
+ "non-padded": 1319,
+ "effective_few_shots": 5.0,
+ "num_truncated_few_shots": 0
+ },
+ "harness|winogrande|5": {
+ "hashes": {
+ "hash_examples": "aada0a176fd81218",
+ "hash_full_prompts": "c8655cbd12de8409",
+ "hash_input_tokens": "c0bedf98cb040854",
+ "hash_cont_tokens": "f08975ad6f2d5864"
+ },
+ "truncated": 0,
+ "non-truncated": 2534,
+ "padded": 2432,
+ "non-padded": 102,
+ "effective_few_shots": 5.0,
+ "num_truncated_few_shots": 0
+ }
+ },
+ "summary_general": {
+ "hashes": {
+ "hash_examples": "9b4d8993161e637d",
+ "hash_full_prompts": "08215e527b7e60a5",
+ "hash_input_tokens": "a12f3e3c934bd78b",
+ "hash_cont_tokens": "dff37de5e6c9aeb7"
+ },
+ "total_evaluation_time_secondes": "45388.11919736862",
+ "truncated": 3,
+ "non-truncated": 13386,
+ "padded": 2432,
+ "non-padded": 10957,
+ "num_truncated_few_shots": 0
+ }
+ }
meta-llama/Llama-2-7b-chat-hf/results.json ADDED
@@ -0,0 +1,871 @@
1
+ {
2
+ "results": {
3
+ "harness|arc:challenge|25": {
4
+ "acc": 0.4948805460750853,
5
+ "acc_stderr": 0.01461062489030916,
6
+ "acc_norm": 0.5290102389078498,
7
+ "acc_norm_stderr": 0.014586776355294323
8
+ },
9
+ "harness|hellaswag|10": {
10
+ "acc": 0.5978888667596096,
11
+ "acc_stderr": 0.004893220635011792,
12
+ "acc_norm": 0.7855008962358097,
13
+ "acc_norm_stderr": 0.004096355125117511
14
+ },
15
+ "harness|hendrycksTest-abstract_algebra|5": {
16
+ "acc": 0.28,
17
+ "acc_stderr": 0.04512608598542129,
18
+ "acc_norm": 0.28,
19
+ "acc_norm_stderr": 0.04512608598542129
20
+ },
21
+ "harness|hendrycksTest-anatomy|5": {
22
+ "acc": 0.42962962962962964,
23
+ "acc_stderr": 0.04276349494376599,
24
+ "acc_norm": 0.42962962962962964,
25
+ "acc_norm_stderr": 0.04276349494376599
26
+ },
27
+ "harness|hendrycksTest-astronomy|5": {
28
+ "acc": 0.4868421052631579,
29
+ "acc_stderr": 0.04067533136309173,
30
+ "acc_norm": 0.4868421052631579,
31
+ "acc_norm_stderr": 0.04067533136309173
32
+ },
33
+ "harness|hendrycksTest-business_ethics|5": {
34
+ "acc": 0.53,
35
+ "acc_stderr": 0.050161355804659205,
36
+ "acc_norm": 0.53,
37
+ "acc_norm_stderr": 0.050161355804659205
38
+ },
39
+ "harness|hendrycksTest-clinical_knowledge|5": {
40
+ "acc": 0.5358490566037736,
41
+ "acc_stderr": 0.030693675018458003,
42
+ "acc_norm": 0.5358490566037736,
43
+ "acc_norm_stderr": 0.030693675018458003
44
+ },
45
+ "harness|hendrycksTest-college_biology|5": {
46
+ "acc": 0.5208333333333334,
47
+ "acc_stderr": 0.041775789507399935,
48
+ "acc_norm": 0.5208333333333334,
49
+ "acc_norm_stderr": 0.041775789507399935
50
+ },
51
+ "harness|hendrycksTest-college_chemistry|5": {
52
+ "acc": 0.29,
53
+ "acc_stderr": 0.04560480215720684,
54
+ "acc_norm": 0.29,
55
+ "acc_norm_stderr": 0.04560480215720684
56
+ },
57
+ "harness|hendrycksTest-college_computer_science|5": {
58
+ "acc": 0.38,
59
+ "acc_stderr": 0.048783173121456316,
60
+ "acc_norm": 0.38,
61
+ "acc_norm_stderr": 0.048783173121456316
62
+ },
63
+ "harness|hendrycksTest-college_mathematics|5": {
64
+ "acc": 0.36,
65
+ "acc_stderr": 0.04824181513244218,
66
+ "acc_norm": 0.36,
67
+ "acc_norm_stderr": 0.04824181513244218
68
+ },
69
+ "harness|hendrycksTest-college_medicine|5": {
70
+ "acc": 0.3988439306358382,
71
+ "acc_stderr": 0.037336266553835096,
72
+ "acc_norm": 0.3988439306358382,
73
+ "acc_norm_stderr": 0.037336266553835096
74
+ },
75
+ "harness|hendrycksTest-college_physics|5": {
76
+ "acc": 0.22549019607843138,
77
+ "acc_stderr": 0.041583075330832865,
78
+ "acc_norm": 0.22549019607843138,
79
+ "acc_norm_stderr": 0.041583075330832865
80
+ },
81
+ "harness|hendrycksTest-computer_security|5": {
82
+ "acc": 0.58,
83
+ "acc_stderr": 0.049604496374885836,
84
+ "acc_norm": 0.58,
85
+ "acc_norm_stderr": 0.049604496374885836
86
+ },
87
+ "harness|hendrycksTest-conceptual_physics|5": {
88
+ "acc": 0.4085106382978723,
89
+ "acc_stderr": 0.03213418026701576,
90
+ "acc_norm": 0.4085106382978723,
91
+ "acc_norm_stderr": 0.03213418026701576
92
+ },
93
+ "harness|hendrycksTest-econometrics|5": {
94
+ "acc": 0.37719298245614036,
95
+ "acc_stderr": 0.045595221419582166,
96
+ "acc_norm": 0.37719298245614036,
97
+ "acc_norm_stderr": 0.045595221419582166
98
+ },
99
+ "harness|hendrycksTest-electrical_engineering|5": {
100
+ "acc": 0.4896551724137931,
101
+ "acc_stderr": 0.04165774775728762,
102
+ "acc_norm": 0.4896551724137931,
103
+ "acc_norm_stderr": 0.04165774775728762
104
+ },
105
+ "harness|hendrycksTest-elementary_mathematics|5": {
106
+ "acc": 0.29894179894179895,
107
+ "acc_stderr": 0.023577604791655805,
108
+ "acc_norm": 0.29894179894179895,
109
+ "acc_norm_stderr": 0.023577604791655805
110
+ },
111
+ "harness|hendrycksTest-formal_logic|5": {
112
+ "acc": 0.25396825396825395,
113
+ "acc_stderr": 0.03893259610604675,
114
+ "acc_norm": 0.25396825396825395,
115
+ "acc_norm_stderr": 0.03893259610604675
116
+ },
117
+ "harness|hendrycksTest-global_facts|5": {
118
+ "acc": 0.36,
119
+ "acc_stderr": 0.048241815132442176,
120
+ "acc_norm": 0.36,
121
+ "acc_norm_stderr": 0.048241815132442176
122
+ },
123
+ "harness|hendrycksTest-high_school_biology|5": {
124
+ "acc": 0.5225806451612903,
125
+ "acc_stderr": 0.02841498501970786,
126
+ "acc_norm": 0.5225806451612903,
127
+ "acc_norm_stderr": 0.02841498501970786
128
+ },
129
+ "harness|hendrycksTest-high_school_chemistry|5": {
130
+ "acc": 0.3645320197044335,
131
+ "acc_stderr": 0.033864057460620905,
132
+ "acc_norm": 0.3645320197044335,
133
+ "acc_norm_stderr": 0.033864057460620905
134
+ },
135
+ "harness|hendrycksTest-high_school_computer_science|5": {
136
+ "acc": 0.41,
137
+ "acc_stderr": 0.04943110704237102,
138
+ "acc_norm": 0.41,
139
+ "acc_norm_stderr": 0.04943110704237102
140
+ },
141
+ "harness|hendrycksTest-high_school_european_history|5": {
142
+ "acc": 0.5878787878787879,
143
+ "acc_stderr": 0.03843566993588718,
144
+ "acc_norm": 0.5878787878787879,
145
+ "acc_norm_stderr": 0.03843566993588718
146
+ },
147
+ "harness|hendrycksTest-high_school_geography|5": {
148
+ "acc": 0.6060606060606061,
149
+ "acc_stderr": 0.034812853382329624,
150
+ "acc_norm": 0.6060606060606061,
151
+ "acc_norm_stderr": 0.034812853382329624
152
+ },
153
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
154
+ "acc": 0.7150259067357513,
155
+ "acc_stderr": 0.032577140777096614,
156
+ "acc_norm": 0.7150259067357513,
157
+ "acc_norm_stderr": 0.032577140777096614
158
+ },
159
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
160
+ "acc": 0.4256410256410256,
161
+ "acc_stderr": 0.02506909438729654,
162
+ "acc_norm": 0.4256410256410256,
163
+ "acc_norm_stderr": 0.02506909438729654
164
+ },
165
+ "harness|hendrycksTest-high_school_mathematics|5": {
166
+ "acc": 0.25555555555555554,
167
+ "acc_stderr": 0.02659393910184408,
168
+ "acc_norm": 0.25555555555555554,
169
+ "acc_norm_stderr": 0.02659393910184408
170
+ },
171
+ "harness|hendrycksTest-high_school_microeconomics|5": {
172
+ "acc": 0.42436974789915966,
173
+ "acc_stderr": 0.03210479051015776,
174
+ "acc_norm": 0.42436974789915966,
175
+ "acc_norm_stderr": 0.03210479051015776
176
+ },
177
+ "harness|hendrycksTest-high_school_physics|5": {
178
+ "acc": 0.2913907284768212,
179
+ "acc_stderr": 0.03710185726119995,
180
+ "acc_norm": 0.2913907284768212,
181
+ "acc_norm_stderr": 0.03710185726119995
182
+ },
183
+ "harness|hendrycksTest-high_school_psychology|5": {
184
+ "acc": 0.6752293577981652,
185
+ "acc_stderr": 0.020077729109310327,
186
+ "acc_norm": 0.6752293577981652,
187
+ "acc_norm_stderr": 0.020077729109310327
188
+ },
189
+ "harness|hendrycksTest-high_school_statistics|5": {
190
+ "acc": 0.3333333333333333,
191
+ "acc_stderr": 0.0321495214780275,
192
+ "acc_norm": 0.3333333333333333,
193
+ "acc_norm_stderr": 0.0321495214780275
194
+ },
195
+ "harness|hendrycksTest-high_school_us_history|5": {
196
+ "acc": 0.6764705882352942,
197
+ "acc_stderr": 0.0328347205610856,
198
+ "acc_norm": 0.6764705882352942,
199
+ "acc_norm_stderr": 0.0328347205610856
200
+ },
201
+ "harness|hendrycksTest-high_school_world_history|5": {
202
+ "acc": 0.6666666666666666,
203
+ "acc_stderr": 0.03068582059661079,
204
+ "acc_norm": 0.6666666666666666,
205
+ "acc_norm_stderr": 0.03068582059661079
206
+ },
207
+ "harness|hendrycksTest-human_aging|5": {
208
+ "acc": 0.5605381165919282,
209
+ "acc_stderr": 0.03331092511038179,
210
+ "acc_norm": 0.5605381165919282,
211
+ "acc_norm_stderr": 0.03331092511038179
212
+ },
213
+ "harness|hendrycksTest-human_sexuality|5": {
214
+ "acc": 0.5725190839694656,
215
+ "acc_stderr": 0.04338920305792401,
216
+ "acc_norm": 0.5725190839694656,
217
+ "acc_norm_stderr": 0.04338920305792401
218
+ },
219
+ "harness|hendrycksTest-international_law|5": {
220
+ "acc": 0.628099173553719,
221
+ "acc_stderr": 0.04412015806624504,
222
+ "acc_norm": 0.628099173553719,
223
+ "acc_norm_stderr": 0.04412015806624504
224
+ },
225
+ "harness|hendrycksTest-jurisprudence|5": {
226
+ "acc": 0.5833333333333334,
227
+ "acc_stderr": 0.04766075165356461,
228
+ "acc_norm": 0.5833333333333334,
229
+ "acc_norm_stderr": 0.04766075165356461
230
+ },
231
+ "harness|hendrycksTest-logical_fallacies|5": {
232
+ "acc": 0.5521472392638037,
233
+ "acc_stderr": 0.03906947479456606,
234
+ "acc_norm": 0.5521472392638037,
235
+ "acc_norm_stderr": 0.03906947479456606
236
+ },
237
+ "harness|hendrycksTest-machine_learning|5": {
238
+ "acc": 0.30357142857142855,
239
+ "acc_stderr": 0.04364226155841044,
240
+ "acc_norm": 0.30357142857142855,
241
+ "acc_norm_stderr": 0.04364226155841044
242
+ },
243
+ "harness|hendrycksTest-management|5": {
244
+ "acc": 0.6699029126213593,
245
+ "acc_stderr": 0.04656147110012351,
246
+ "acc_norm": 0.6699029126213593,
247
+ "acc_norm_stderr": 0.04656147110012351
248
+ },
249
+ "harness|hendrycksTest-marketing|5": {
250
+ "acc": 0.7094017094017094,
251
+ "acc_stderr": 0.029745048572674074,
252
+ "acc_norm": 0.7094017094017094,
253
+ "acc_norm_stderr": 0.029745048572674074
254
+ },
255
+ "harness|hendrycksTest-medical_genetics|5": {
256
+ "acc": 0.5,
257
+ "acc_stderr": 0.050251890762960605,
258
+ "acc_norm": 0.5,
259
+ "acc_norm_stderr": 0.050251890762960605
260
+ },
261
+ "harness|hendrycksTest-miscellaneous|5": {
262
+ "acc": 0.6756066411238825,
263
+ "acc_stderr": 0.0167409290471627,
264
+ "acc_norm": 0.6756066411238825,
265
+ "acc_norm_stderr": 0.0167409290471627
266
+ },
267
+ "harness|hendrycksTest-moral_disputes|5": {
268
+ "acc": 0.5144508670520231,
269
+ "acc_stderr": 0.026907849856282542,
270
+ "acc_norm": 0.5144508670520231,
271
+ "acc_norm_stderr": 0.026907849856282542
272
+ },
273
+ "harness|hendrycksTest-moral_scenarios|5": {
274
+ "acc": 0.2201117318435754,
275
+ "acc_stderr": 0.013856994024227175,
276
+ "acc_norm": 0.2201117318435754,
277
+ "acc_norm_stderr": 0.013856994024227175
278
+ },
279
+ "harness|hendrycksTest-nutrition|5": {
280
+ "acc": 0.5196078431372549,
281
+ "acc_stderr": 0.028607893699576066,
282
+ "acc_norm": 0.5196078431372549,
283
+ "acc_norm_stderr": 0.028607893699576066
284
+ },
285
+ "harness|hendrycksTest-philosophy|5": {
286
+ "acc": 0.5659163987138264,
287
+ "acc_stderr": 0.02815023224453559,
288
+ "acc_norm": 0.5659163987138264,
289
+ "acc_norm_stderr": 0.02815023224453559
290
+ },
291
+ "harness|hendrycksTest-prehistory|5": {
292
+ "acc": 0.5679012345679012,
293
+ "acc_stderr": 0.027563010971606676,
294
+ "acc_norm": 0.5679012345679012,
295
+ "acc_norm_stderr": 0.027563010971606676
296
+ },
297
+ "harness|hendrycksTest-professional_accounting|5": {
298
+ "acc": 0.3723404255319149,
299
+ "acc_stderr": 0.028838921471251458,
300
+ "acc_norm": 0.3723404255319149,
301
+ "acc_norm_stderr": 0.028838921471251458
302
+ },
303
+ "harness|hendrycksTest-professional_law|5": {
304
+ "acc": 0.3500651890482399,
305
+ "acc_stderr": 0.012182552313215175,
306
+ "acc_norm": 0.3500651890482399,
307
+ "acc_norm_stderr": 0.012182552313215175
308
+ },
309
+ "harness|hendrycksTest-professional_medicine|5": {
310
+ "acc": 0.45588235294117646,
311
+ "acc_stderr": 0.030254372573976684,
312
+ "acc_norm": 0.45588235294117646,
313
+ "acc_norm_stderr": 0.030254372573976684
314
+ },
315
+ "harness|hendrycksTest-professional_psychology|5": {
316
+ "acc": 0.4803921568627451,
317
+ "acc_stderr": 0.020212274976302957,
318
+ "acc_norm": 0.4803921568627451,
319
+ "acc_norm_stderr": 0.020212274976302957
320
+ },
321
+ "harness|hendrycksTest-public_relations|5": {
322
+ "acc": 0.5272727272727272,
323
+ "acc_stderr": 0.04782001791380061,
324
+ "acc_norm": 0.5272727272727272,
325
+ "acc_norm_stderr": 0.04782001791380061
326
+ },
327
+ "harness|hendrycksTest-security_studies|5": {
328
+ "acc": 0.5265306122448979,
329
+ "acc_stderr": 0.03196412734523272,
330
+ "acc_norm": 0.5265306122448979,
331
+ "acc_norm_stderr": 0.03196412734523272
332
+ },
333
+ "harness|hendrycksTest-sociology|5": {
334
+ "acc": 0.6467661691542289,
335
+ "acc_stderr": 0.03379790611796777,
336
+ "acc_norm": 0.6467661691542289,
337
+ "acc_norm_stderr": 0.03379790611796777
338
+ },
339
+ "harness|hendrycksTest-us_foreign_policy|5": {
340
+ "acc": 0.72,
341
+ "acc_stderr": 0.045126085985421276,
342
+ "acc_norm": 0.72,
343
+ "acc_norm_stderr": 0.045126085985421276
344
+ },
345
+ "harness|hendrycksTest-virology|5": {
346
+ "acc": 0.43373493975903615,
347
+ "acc_stderr": 0.03858158940685517,
348
+ "acc_norm": 0.43373493975903615,
349
+ "acc_norm_stderr": 0.03858158940685517
350
+ },
351
+ "harness|hendrycksTest-world_religions|5": {
352
+ "acc": 0.7251461988304093,
353
+ "acc_stderr": 0.034240429246915824,
354
+ "acc_norm": 0.7251461988304093,
355
+ "acc_norm_stderr": 0.034240429246915824
356
+ },
357
+ "harness|truthfulqa:mc|0": {
358
+ "mc1": 0.3011015911872705,
359
+ "mc1_stderr": 0.016058999026100616,
360
+ "mc2": 0.45570370195101134,
361
+ "mc2_stderr": 0.015691038880908878
362
+ },
363
+ "all": {
364
+ "acc": 0.4853305078812575,
365
+ "acc_stderr": 0.03506342425063614,
366
+ "acc_norm": 0.4890888421576806,
367
+ "acc_norm_stderr": 0.03504951384309531,
368
+ "mc1": 0.3011015911872705,
369
+ "mc1_stderr": 0.016058999026100616,
370
+ "mc2": 0.45570370195101134,
371
+ "mc2_stderr": 0.015691038880908878
372
+ }
373
+ },
374
+ "versions": {
375
+ "harness|arc:challenge|25": 0,
376
+ "harness|hellaswag|10": 0,
377
+ "harness|hendrycksTest-abstract_algebra|5": 1,
378
+ "harness|hendrycksTest-anatomy|5": 1,
379
+ "harness|hendrycksTest-astronomy|5": 1,
380
+ "harness|hendrycksTest-business_ethics|5": 1,
381
+ "harness|hendrycksTest-clinical_knowledge|5": 1,
382
+ "harness|hendrycksTest-college_biology|5": 1,
383
+ "harness|hendrycksTest-college_chemistry|5": 1,
384
+ "harness|hendrycksTest-college_computer_science|5": 1,
385
+ "harness|hendrycksTest-college_mathematics|5": 1,
386
+ "harness|hendrycksTest-college_medicine|5": 1,
387
+ "harness|hendrycksTest-college_physics|5": 1,
388
+ "harness|hendrycksTest-computer_security|5": 1,
389
+ "harness|hendrycksTest-conceptual_physics|5": 1,
390
+ "harness|hendrycksTest-econometrics|5": 1,
391
+ "harness|hendrycksTest-electrical_engineering|5": 1,
392
+ "harness|hendrycksTest-elementary_mathematics|5": 1,
393
+ "harness|hendrycksTest-formal_logic|5": 1,
394
+ "harness|hendrycksTest-global_facts|5": 1,
395
+ "harness|hendrycksTest-high_school_biology|5": 1,
396
+ "harness|hendrycksTest-high_school_chemistry|5": 1,
397
+ "harness|hendrycksTest-high_school_computer_science|5": 1,
398
+ "harness|hendrycksTest-high_school_european_history|5": 1,
399
+ "harness|hendrycksTest-high_school_geography|5": 1,
400
+ "harness|hendrycksTest-high_school_government_and_politics|5": 1,
401
+ "harness|hendrycksTest-high_school_macroeconomics|5": 1,
402
+ "harness|hendrycksTest-high_school_mathematics|5": 1,
403
+ "harness|hendrycksTest-high_school_microeconomics|5": 1,
404
+ "harness|hendrycksTest-high_school_physics|5": 1,
405
+ "harness|hendrycksTest-high_school_psychology|5": 1,
406
+ "harness|hendrycksTest-high_school_statistics|5": 1,
407
+ "harness|hendrycksTest-high_school_us_history|5": 1,
408
+ "harness|hendrycksTest-high_school_world_history|5": 1,
409
+ "harness|hendrycksTest-human_aging|5": 1,
410
+ "harness|hendrycksTest-human_sexuality|5": 1,
411
+ "harness|hendrycksTest-international_law|5": 1,
412
+ "harness|hendrycksTest-jurisprudence|5": 1,
413
+ "harness|hendrycksTest-logical_fallacies|5": 1,
414
+ "harness|hendrycksTest-machine_learning|5": 1,
415
+ "harness|hendrycksTest-management|5": 1,
416
+ "harness|hendrycksTest-marketing|5": 1,
417
+ "harness|hendrycksTest-medical_genetics|5": 1,
418
+ "harness|hendrycksTest-miscellaneous|5": 1,
419
+ "harness|hendrycksTest-moral_disputes|5": 1,
420
+ "harness|hendrycksTest-moral_scenarios|5": 1,
421
+ "harness|hendrycksTest-nutrition|5": 1,
422
+ "harness|hendrycksTest-philosophy|5": 1,
423
+ "harness|hendrycksTest-prehistory|5": 1,
424
+ "harness|hendrycksTest-professional_accounting|5": 1,
425
+ "harness|hendrycksTest-professional_law|5": 1,
426
+ "harness|hendrycksTest-professional_medicine|5": 1,
427
+ "harness|hendrycksTest-professional_psychology|5": 1,
428
+ "harness|hendrycksTest-public_relations|5": 1,
429
+ "harness|hendrycksTest-security_studies|5": 1,
430
+ "harness|hendrycksTest-sociology|5": 1,
431
+ "harness|hendrycksTest-us_foreign_policy|5": 1,
432
+ "harness|hendrycksTest-virology|5": 1,
433
+ "harness|hendrycksTest-world_religions|5": 1,
434
+ "harness|truthfulqa:mc|0": 1,
435
+ "all": 0
436
+ },
437
+ "config": {
438
+ "model_name": "meta-llama/Llama-2-7b-chat-hf",
439
+ "model_sha": "b7701a9e825e79a5ab18b5801be113c2160cc627",
440
+ "model_dtype": "torch.float16",
441
+ "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937",
442
+ "num_few_shot_default": 0,
443
+ "num_fewshot_seeds": 1,
444
+ "override_batch_size": 1,
445
+ "max_samples": null
446
+ },
447
+ "task_config": {
448
+ "harness|arc:challenge": "LM Harness task",
449
+ "harness|hellaswag": "LM Harness task",
450
+ "harness|hendrycksTest-abstract_algebra": "LM Harness task",
451
+ "harness|hendrycksTest-anatomy": "LM Harness task",
452
+ "harness|hendrycksTest-astronomy": "LM Harness task",
453
+ "harness|hendrycksTest-business_ethics": "LM Harness task",
454
+ "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
455
+ "harness|hendrycksTest-college_biology": "LM Harness task",
456
+ "harness|hendrycksTest-college_chemistry": "LM Harness task",
457
+ "harness|hendrycksTest-college_computer_science": "LM Harness task",
458
+ "harness|hendrycksTest-college_mathematics": "LM Harness task",
459
+ "harness|hendrycksTest-college_medicine": "LM Harness task",
460
+ "harness|hendrycksTest-college_physics": "LM Harness task",
461
+ "harness|hendrycksTest-computer_security": "LM Harness task",
462
+ "harness|hendrycksTest-conceptual_physics": "LM Harness task",
463
+ "harness|hendrycksTest-econometrics": "LM Harness task",
464
+ "harness|hendrycksTest-electrical_engineering": "LM Harness task",
465
+ "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
466
+ "harness|hendrycksTest-formal_logic": "LM Harness task",
467
+ "harness|hendrycksTest-global_facts": "LM Harness task",
468
+ "harness|hendrycksTest-high_school_biology": "LM Harness task",
469
+ "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
470
+ "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
471
+ "harness|hendrycksTest-high_school_european_history": "LM Harness task",
472
+ "harness|hendrycksTest-high_school_geography": "LM Harness task",
473
+ "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
474
+ "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
475
+ "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
476
+ "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
477
+ "harness|hendrycksTest-high_school_physics": "LM Harness task",
478
+ "harness|hendrycksTest-high_school_psychology": "LM Harness task",
479
+ "harness|hendrycksTest-high_school_statistics": "LM Harness task",
480
+ "harness|hendrycksTest-high_school_us_history": "LM Harness task",
481
+ "harness|hendrycksTest-high_school_world_history": "LM Harness task",
482
+ "harness|hendrycksTest-human_aging": "LM Harness task",
483
+ "harness|hendrycksTest-human_sexuality": "LM Harness task",
484
+ "harness|hendrycksTest-international_law": "LM Harness task",
485
+ "harness|hendrycksTest-jurisprudence": "LM Harness task",
486
+ "harness|hendrycksTest-logical_fallacies": "LM Harness task",
487
+ "harness|hendrycksTest-machine_learning": "LM Harness task",
488
+ "harness|hendrycksTest-management": "LM Harness task",
489
+ "harness|hendrycksTest-marketing": "LM Harness task",
490
+ "harness|hendrycksTest-medical_genetics": "LM Harness task",
491
+ "harness|hendrycksTest-miscellaneous": "LM Harness task",
492
+ "harness|hendrycksTest-moral_disputes": "LM Harness task",
493
+ "harness|hendrycksTest-moral_scenarios": "LM Harness task",
494
+ "harness|hendrycksTest-nutrition": "LM Harness task",
495
+ "harness|hendrycksTest-philosophy": "LM Harness task",
496
+ "harness|hendrycksTest-prehistory": "LM Harness task",
497
+ "harness|hendrycksTest-professional_accounting": "LM Harness task",
498
+ "harness|hendrycksTest-professional_law": "LM Harness task",
499
+ "harness|hendrycksTest-professional_medicine": "LM Harness task",
500
+ "harness|hendrycksTest-professional_psychology": "LM Harness task",
501
+ "harness|hendrycksTest-public_relations": "LM Harness task",
502
+ "harness|hendrycksTest-security_studies": "LM Harness task",
503
+ "harness|hendrycksTest-sociology": "LM Harness task",
504
+ "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
505
+ "harness|hendrycksTest-virology": "LM Harness task",
506
+ "harness|hendrycksTest-world_religions": "LM Harness task",
507
+ "harness|truthfulqa:mc": "LM Harness task"
508
+ },
509
+ "hashes": {
510
+ "harness|arc:challenge|25": {
511
+ "hash_examples": "fb8c51b1872daeda",
512
+ "hash_full_prompts": "045cbb916e5145c6",
513
+ "hash_input_tokens": "61571bf68d6d89aa",
514
+ "hash_cont_tokens": "8210decc6ff6f7df"
515
+ },
516
+ "harness|hellaswag|10": {
517
+ "hash_examples": "e1768ecb99d7ecf0",
518
+ "hash_full_prompts": "0b4c16983130f84f",
519
+ "hash_input_tokens": "29906669b1c7054a",
520
+ "hash_cont_tokens": "b3b9e9017afa63af"
521
+ },
522
+ "harness|hendrycksTest-abstract_algebra|5": {
523
+ "hash_examples": "280f9f325b40559a",
524
+ "hash_full_prompts": "2f776a367d23aea2",
525
+ "hash_input_tokens": "c54ff61ad0273dd7",
526
+ "hash_cont_tokens": "50421e30bef398f9"
527
+ },
528
+ "harness|hendrycksTest-anatomy|5": {
529
+ "hash_examples": "2f83a4f1cab4ba18",
530
+ "hash_full_prompts": "516f74bef25df620",
531
+ "hash_input_tokens": "be31a1e22aef5f90",
532
+ "hash_cont_tokens": "f11971a765cb609f"
533
+ },
534
+ "harness|hendrycksTest-astronomy|5": {
535
+ "hash_examples": "7d587b908da4d762",
536
+ "hash_full_prompts": "faf4e80f65de93ca",
537
+ "hash_input_tokens": "277a7b1fad566940",
538
+ "hash_cont_tokens": "bf30e5d3f48250cb"
539
+ },
540
+ "harness|hendrycksTest-business_ethics|5": {
541
+ "hash_examples": "33e51740670de686",
542
+ "hash_full_prompts": "db01c3ef8e1479d4",
543
+ "hash_input_tokens": "ba552605bc116de5",
544
+ "hash_cont_tokens": "bc1dd9b2d995eb61"
545
+ },
546
+ "harness|hendrycksTest-clinical_knowledge|5": {
547
+ "hash_examples": "f3366dbe7eefffa4",
548
+ "hash_full_prompts": "49654f71d94b65c3",
549
+ "hash_input_tokens": "428c7563d0b98ab9",
550
+ "hash_cont_tokens": "890a119624b3b935"
551
+ },
552
+ "harness|hendrycksTest-college_biology|5": {
553
+ "hash_examples": "ca2b6753a0193e7f",
554
+ "hash_full_prompts": "2b460b75f1fdfefd",
555
+ "hash_input_tokens": "da036601573942e2",
556
+ "hash_cont_tokens": "875cde3af7a0ee14"
557
+ },
558
+ "harness|hendrycksTest-college_chemistry|5": {
559
+ "hash_examples": "22ff85f1d34f42d1",
560
+ "hash_full_prompts": "242c9be6da583e95",
561
+ "hash_input_tokens": "94e0196d6aded13d",
562
+ "hash_cont_tokens": "50421e30bef398f9"
563
+ },
564
+ "harness|hendrycksTest-college_computer_science|5": {
565
+ "hash_examples": "30318289d717a5cf",
566
+ "hash_full_prompts": "ed2bdb4e87c4b371",
567
+ "hash_input_tokens": "6e4d0f4a8d36690b",
568
+ "hash_cont_tokens": "ffc0fe414cdc4a83"
569
+ },
570
+ "harness|hendrycksTest-college_mathematics|5": {
571
+ "hash_examples": "4944d1f0b6b5d911",
572
+ "hash_full_prompts": "770bc4281c973190",
573
+ "hash_input_tokens": "614054d17109a25d",
574
+ "hash_cont_tokens": "50421e30bef398f9"
575
+ },
576
+ "harness|hendrycksTest-college_medicine|5": {
577
+ "hash_examples": "dd69cc33381275af",
578
+ "hash_full_prompts": "ad2a53e5250ab46e",
579
+ "hash_input_tokens": "1d633b3cc0524ba8",
580
+ "hash_cont_tokens": "1f88b00d41957d82"
581
+ },
582
+ "harness|hendrycksTest-college_physics|5": {
583
+ "hash_examples": "875dd26d22655b0d",
584
+ "hash_full_prompts": "833a0d7b55aed500",
585
+ "hash_input_tokens": "5421d9a1af86cbd4",
586
+ "hash_cont_tokens": "f7b8097afc16a47c"
587
+ },
588
+ "harness|hendrycksTest-computer_security|5": {
589
+ "hash_examples": "006451eedc0ededb",
590
+ "hash_full_prompts": "94034c97e85d8f46",
591
+ "hash_input_tokens": "5e6b70ecb333cf18",
592
+ "hash_cont_tokens": "50421e30bef398f9"
593
+ },
594
+ "harness|hendrycksTest-conceptual_physics|5": {
595
+ "hash_examples": "8874ece872d2ca4c",
596
+ "hash_full_prompts": "e40d15a34640d6fa",
597
+ "hash_input_tokens": "c2ef11a87264ceed",
598
+ "hash_cont_tokens": "aa0e8bc655f2f641"
599
+ },
600
+ "harness|hendrycksTest-econometrics|5": {
601
+ "hash_examples": "64d3623b0bfaa43f",
602
+ "hash_full_prompts": "612f340fae41338d",
603
+ "hash_input_tokens": "ecaccd912a4c3978",
604
+ "hash_cont_tokens": "bfb7e3c3c88313f1"
605
+ },
606
+ "harness|hendrycksTest-electrical_engineering|5": {
607
+ "hash_examples": "e98f51780c674d7e",
608
+ "hash_full_prompts": "10275b312d812ae6",
609
+ "hash_input_tokens": "1590c84291399be8",
610
+ "hash_cont_tokens": "2425a3f084a591ef"
611
+ },
612
+ "harness|hendrycksTest-elementary_mathematics|5": {
613
+ "hash_examples": "fc48208a5ac1c0ce",
614
+ "hash_full_prompts": "5ec274c6c82aca23",
615
+ "hash_input_tokens": "3269597f715b0da1",
616
+ "hash_cont_tokens": "f52691aef15a407b"
617
+ },
618
+ "harness|hendrycksTest-formal_logic|5": {
619
+ "hash_examples": "5a6525665f63ea72",
620
+ "hash_full_prompts": "07b92638c4a6b500",
621
+ "hash_input_tokens": "a2800d20f3ab8d7c",
622
+ "hash_cont_tokens": "f515d598d9c21263"
623
+ },
624
+ "harness|hendrycksTest-global_facts|5": {
625
+ "hash_examples": "371d70d743b2b89b",
626
+ "hash_full_prompts": "332fdee50a1921b4",
627
+ "hash_input_tokens": "94ed44b3772505ad",
628
+ "hash_cont_tokens": "50421e30bef398f9"
629
+ },
630
+ "harness|hendrycksTest-high_school_biology|5": {
631
+ "hash_examples": "a79e1018b1674052",
632
+ "hash_full_prompts": "e624e26ede922561",
633
+ "hash_input_tokens": "24423acb928db768",
634
+ "hash_cont_tokens": "bd85a4156a3613ee"
635
+ },
636
+ "harness|hendrycksTest-high_school_chemistry|5": {
637
+ "hash_examples": "44bfc25c389f0e03",
638
+ "hash_full_prompts": "0e3e5f5d9246482a",
639
+ "hash_input_tokens": "831ff35c474e5cef",
640
+ "hash_cont_tokens": "a95c97af1c14e068"
641
+ },
642
+ "harness|hendrycksTest-high_school_computer_science|5": {
643
+ "hash_examples": "8b8cdb1084f24169",
644
+ "hash_full_prompts": "c00487e67c1813cc",
645
+ "hash_input_tokens": "8c34e0f2bda77358",
646
+ "hash_cont_tokens": "8abfedef914e33c9"
647
+ },
648
+ "harness|hendrycksTest-high_school_european_history|5": {
649
+ "hash_examples": "11cd32d0ef440171",
650
+ "hash_full_prompts": "318f4513c537c6bf",
651
+ "hash_input_tokens": "f1f73dd687da18d7",
652
+ "hash_cont_tokens": "674fc454bdc5ac93"
653
+ },
654
+ "harness|hendrycksTest-high_school_geography|5": {
655
+ "hash_examples": "b60019b9e80b642f",
656
+ "hash_full_prompts": "ee5789fcc1a81b1e",
657
+ "hash_input_tokens": "7c5547c7da5bc793",
658
+ "hash_cont_tokens": "03a5012b916274ea"
659
+ },
660
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
661
+ "hash_examples": "d221ec983d143dc3",
662
+ "hash_full_prompts": "ac42d888e1ce1155",
663
+ "hash_input_tokens": "f62991cb6a496b05",
664
+ "hash_cont_tokens": "a83effb8f76b7d7c"
665
+ },
666
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
667
+ "hash_examples": "59c2915cacfd3fbb",
668
+ "hash_full_prompts": "c6bd9d25158abd0e",
669
+ "hash_input_tokens": "4cef2aff6e3d59ed",
670
+ "hash_cont_tokens": "c583432ad27fcfe0"
671
+ },
672
+ "harness|hendrycksTest-high_school_mathematics|5": {
673
+ "hash_examples": "1f8ac897608de342",
674
+ "hash_full_prompts": "5d88f41fc2d643a8",
675
+ "hash_input_tokens": "6e2577ea4082ed2b",
676
+ "hash_cont_tokens": "24f5dc613660300b"
677
+ },
678
+ "harness|hendrycksTest-high_school_microeconomics|5": {
679
+ "hash_examples": "ead6a0f2f6c83370",
680
+ "hash_full_prompts": "bfc393381298609e",
681
+ "hash_input_tokens": "c5fc9aeb1079c8e4",
682
+ "hash_cont_tokens": "f47f041de50333b9"
683
+ },
684
+ "harness|hendrycksTest-high_school_physics|5": {
685
+ "hash_examples": "c3f2025990afec64",
686
+ "hash_full_prompts": "fc78b4997e436734",
687
+ "hash_input_tokens": "555fc385cffa84ca",
688
+ "hash_cont_tokens": "ba2efcd283e938cc"
689
+ },
690
+ "harness|hendrycksTest-high_school_psychology|5": {
691
+ "hash_examples": "21f8aab618f6d636",
692
+ "hash_full_prompts": "d5c76aa40b9dbc43",
693
+ "hash_input_tokens": "febd23cbf9973b7f",
694
+ "hash_cont_tokens": "942069cd363844d9"
695
+ },
696
+ "harness|hendrycksTest-high_school_statistics|5": {
697
+ "hash_examples": "2386a60a11fc5de3",
698
+ "hash_full_prompts": "4c5c8be5aafac432",
699
+ "hash_input_tokens": "424b02981230ee83",
700
+ "hash_cont_tokens": "955ed42b6f7fa019"
701
+ },
702
+ "harness|hendrycksTest-high_school_us_history|5": {
703
+ "hash_examples": "74961543be40f04f",
704
+ "hash_full_prompts": "5d5ca4840131ba21",
705
+ "hash_input_tokens": "50c9ff438c85a69e",
706
+ "hash_cont_tokens": "cdd0b3dc06d933e5"
707
+ },
708
+ "harness|hendrycksTest-high_school_world_history|5": {
709
+ "hash_examples": "2ad2f6b7198b2234",
710
+ "hash_full_prompts": "11845057459afd72",
711
+ "hash_input_tokens": "054824cc474caef5",
712
+ "hash_cont_tokens": "9a864184946033ac"
713
+ },
714
+ "harness|hendrycksTest-human_aging|5": {
715
+ "hash_examples": "1a7199dc733e779b",
716
+ "hash_full_prompts": "756b9096b8eaf892",
717
+ "hash_input_tokens": "541a75f071dcf579",
718
+ "hash_cont_tokens": "142a4a8a1138a214"
719
+ },
720
+ "harness|hendrycksTest-human_sexuality|5": {
721
+ "hash_examples": "7acb8fdad97f88a6",
722
+ "hash_full_prompts": "731a52ff15b8cfdb",
723
+ "hash_input_tokens": "04269e5c5a257dd9",
724
+ "hash_cont_tokens": "bc54813e809b796d"
725
+ },
726
+ "harness|hendrycksTest-international_law|5": {
727
+ "hash_examples": "1300bfd0dfc59114",
728
+ "hash_full_prompts": "db2aefbff5eec996",
729
+ "hash_input_tokens": "d93ba9d9d38e4397",
730
+ "hash_cont_tokens": "dc45b45fcda18e5d"
731
+ },
732
+ "harness|hendrycksTest-jurisprudence|5": {
733
+ "hash_examples": "083b1e4904c48dc2",
734
+ "hash_full_prompts": "0f89ee3fe03d6a21",
735
+ "hash_input_tokens": "9eeaccd2698b4f5a",
736
+ "hash_cont_tokens": "e3a8cd951b6e3469"
737
+ },
738
+ "harness|hendrycksTest-logical_fallacies|5": {
739
+ "hash_examples": "709128f9926a634c",
740
+ "hash_full_prompts": "98a04b1f8f841069",
741
+ "hash_input_tokens": "b4f08f544f2b7576",
742
+ "hash_cont_tokens": "1e80dbd30f6453d5"
743
+ },
744
+ "harness|hendrycksTest-machine_learning|5": {
745
+ "hash_examples": "88f22a636029ae47",
746
+ "hash_full_prompts": "2e1c8d4b1e0cc921",
747
+ "hash_input_tokens": "900c2a51f1174b9f",
748
+ "hash_cont_tokens": "9b37da7777378ca9"
749
+ },
750
+ "harness|hendrycksTest-management|5": {
751
+ "hash_examples": "8c8a1e07a2151dca",
752
+ "hash_full_prompts": "f51611f514b265b0",
753
+ "hash_input_tokens": "6b36efb4689c6eca",
754
+ "hash_cont_tokens": "a01d6d39a83c4597"
755
+ },
756
+ "harness|hendrycksTest-marketing|5": {
757
+ "hash_examples": "2668953431f91e96",
758
+ "hash_full_prompts": "77562bef997c7650",
759
+ "hash_input_tokens": "2aaac78a0cfed47a",
760
+ "hash_cont_tokens": "6aeaed4d823c98aa"
761
+ },
762
+ "harness|hendrycksTest-medical_genetics|5": {
763
+ "hash_examples": "9c2dda34a2ea4fd2",
764
+ "hash_full_prompts": "202139046daa118f",
765
+ "hash_input_tokens": "886ca823b41c094a",
766
+ "hash_cont_tokens": "50421e30bef398f9"
767
+ },
768
+ "harness|hendrycksTest-miscellaneous|5": {
769
+ "hash_examples": "41adb694024809c2",
770
+ "hash_full_prompts": "bffec9fc237bcf93",
771
+ "hash_input_tokens": "72fd71de7675e7d0",
772
+ "hash_cont_tokens": "9b0ab02a64603081"
773
+ },
774
+ "harness|hendrycksTest-moral_disputes|5": {
775
+ "hash_examples": "3171c13ba3c594c4",
776
+ "hash_full_prompts": "170831fc36f1d59e",
777
+ "hash_input_tokens": "f3ca0dd8e7a1eb09",
778
+ "hash_cont_tokens": "8badf768f7b0467a"
779
+ },
780
+ "harness|hendrycksTest-moral_scenarios|5": {
781
+ "hash_examples": "9873e077e83e0546",
782
+ "hash_full_prompts": "08f4ceba3131a068",
783
+ "hash_input_tokens": "3e793631e951f23c",
784
+ "hash_cont_tokens": "32ae620376b2bbba"
785
+ },
786
+ "harness|hendrycksTest-nutrition|5": {
787
+ "hash_examples": "7db1d8142ec14323",
788
+ "hash_full_prompts": "4c0e68e3586cb453",
789
+ "hash_input_tokens": "59753c2144ea93af",
790
+ "hash_cont_tokens": "3071def75bacc404"
791
+ },
792
+ "harness|hendrycksTest-philosophy|5": {
793
+ "hash_examples": "9b455b7d72811cc8",
794
+ "hash_full_prompts": "e467f822d8a0d3ff",
795
+ "hash_input_tokens": "bd8d3dbed15a8c34",
796
+ "hash_cont_tokens": "9f6ff69d23a48783"
797
+ },
798
+ "harness|hendrycksTest-prehistory|5": {
799
+ "hash_examples": "8be90d0f538f1560",
800
+ "hash_full_prompts": "152187949bcd0921",
801
+ "hash_input_tokens": "3573cd87facbb7c5",
802
+ "hash_cont_tokens": "de469d2b981e32a3"
803
+ },
804
+ "harness|hendrycksTest-professional_accounting|5": {
805
+ "hash_examples": "8d377597916cd07e",
806
+ "hash_full_prompts": "0eb7345d6144ee0d",
807
+ "hash_input_tokens": "17e721bc1a7cbb47",
808
+ "hash_cont_tokens": "c46f74d2dfc7b13b"
809
+ },
810
+ "harness|hendrycksTest-professional_law|5": {
811
+ "hash_examples": "cd9dbc52b3c932d6",
812
+ "hash_full_prompts": "36ac764272bfb182",
813
+ "hash_input_tokens": "9178e10bd0763ec4",
814
+ "hash_cont_tokens": "2e590029ef41fbcd"
815
+ },
816
+ "harness|hendrycksTest-professional_medicine|5": {
817
+ "hash_examples": "b20e4e816c1e383e",
818
+ "hash_full_prompts": "7b8d69ea2acaf2f7",
819
+ "hash_input_tokens": "f5a22012a54f70ea",
820
+ "hash_cont_tokens": "fe35cfa9c6ca802e"
821
+ },
822
+ "harness|hendrycksTest-professional_psychology|5": {
823
+ "hash_examples": "d45b73b22f9cc039",
824
+ "hash_full_prompts": "fe8937e9ffc99771",
825
+ "hash_input_tokens": "0dfb73a8eb3f692c",
826
+ "hash_cont_tokens": "f020fbddf72c8652"
827
+ },
828
+ "harness|hendrycksTest-public_relations|5": {
829
+ "hash_examples": "0d25072e1761652a",
830
+ "hash_full_prompts": "f9adc39cfa9f42ba",
831
+ "hash_input_tokens": "1710c6ba4c9f3cbd",
832
+ "hash_cont_tokens": "568f585a259965c1"
833
+ },
834
+ "harness|hendrycksTest-security_studies|5": {
835
+ "hash_examples": "62bb8197e63d60d4",
836
+ "hash_full_prompts": "869c9c3ae196b7c3",
837
+ "hash_input_tokens": "d49711415961ced7",
838
+ "hash_cont_tokens": "cc6fd7cccd64cd5d"
839
+ },
840
+ "harness|hendrycksTest-sociology|5": {
841
+ "hash_examples": "e7959df87dea8672",
842
+ "hash_full_prompts": "1a1fc00e17b3a52a",
843
+ "hash_input_tokens": "828999f7624cbe7e",
844
+ "hash_cont_tokens": "c3a3bdfd177eed5b"
845
+ },
846
+ "harness|hendrycksTest-us_foreign_policy|5": {
847
+ "hash_examples": "4a56a01ddca44dca",
848
+ "hash_full_prompts": "0c7a7081c71c07b6",
849
+ "hash_input_tokens": "42054621e718dbee",
850
+ "hash_cont_tokens": "2568d0e8e36fa959"
851
+ },
852
+ "harness|hendrycksTest-virology|5": {
853
+ "hash_examples": "451cc86a8c4f4fe9",
854
+ "hash_full_prompts": "01e95325d8b738e4",
855
+ "hash_input_tokens": "6c4f0aa4dc859c04",
856
+ "hash_cont_tokens": "926cf60b0891f374"
857
+ },
858
+ "harness|hendrycksTest-world_religions|5": {
859
+ "hash_examples": "3b29cfaf1a81c379",
860
+ "hash_full_prompts": "e0d79a15083dfdff",
861
+ "hash_input_tokens": "6c75d44e092ff24f",
862
+ "hash_cont_tokens": "c525a5de974c1ea3"
863
+ },
864
+ "harness|truthfulqa:mc|0": {
865
+ "hash_examples": "23176c0531c7b867",
866
+ "hash_full_prompts": "36a6d90e75d92d4a",
867
+ "hash_input_tokens": "2738d7ed7075faa7",
868
+ "hash_cont_tokens": "c014154380b74b9e"
869
+ }
870
+ }
871
+ }
meta-llama/Llama-2-7b-chat-hf/results_2023-10-15T02-34-15.484281.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "config_general": {
3
+ "model_name": "meta-llama/Llama-2-7b-chat-hf",
4
+ "model_sha": "af6df14e494ef16d69ec55e9a016e900a2dde1c8",
5
+ "model_size": "12.61 GB",
6
+ "model_dtype": "torch.float16",
7
+ "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
8
+ "num_few_shot_default": 0,
9
+ "num_fewshot_seeds": 1,
10
+ "override_batch_size": 1,
11
+ "max_samples": null,
12
+ "job_id": ""
13
+ },
14
+ "results": {
15
+ "harness|drop|3": {
16
+ "em": 0.06763842281879194,
17
+ "em_stderr": 0.0025717489509556085,
18
+ "f1": 0.13085570469798627,
19
+ "f1_stderr": 0.0028825856446422905
20
+ },
21
+ "harness|gsm8k|5": {
22
+ "acc": 0.07354056103108415,
23
+ "acc_stderr": 0.0071898357543652685
24
+ },
25
+ "harness|winogrande|5": {
26
+ "acc": 0.7174427782162589,
27
+ "acc_stderr": 0.012654062850971384
28
+ },
29
+ "all": {
30
+ "em": 0.06763842281879194,
31
+ "em_stderr": 0.0025717489509556085,
32
+ "f1": 0.13085570469798627,
33
+ "f1_stderr": 0.0028825856446422905,
34
+ "acc": 0.39549166962367155,
35
+ "acc_stderr": 0.009921949302668327
36
+ }
37
+ },
38
+ "versions": {
39
+ "harness|drop|3": 1,
40
+ "harness|gsm8k|5": 0,
41
+ "harness|winogrande|5": 0,
42
+ "all": 0
43
+ },
44
+ "config_tasks": {
45
+ "harness|drop": "LM Harness task",
46
+ "harness|gsm8k": "LM Harness task",
47
+ "harness|winogrande": "LM Harness task"
48
+ },
49
+ "summary_tasks": {
50
+ "harness|drop|3": {
51
+ "hashes": {
52
+ "hash_examples": "1d27416e8324e9a3",
53
+ "hash_full_prompts": "a5513ff9a741b385",
54
+ "hash_input_tokens": "42076f0efbb50aa6",
55
+ "hash_cont_tokens": "c4b3a30639a21038"
56
+ },
57
+ "truncated": 3,
58
+ "non-truncated": 9533,
59
+ "padded": 0,
60
+ "non-padded": 9536,
61
+ "effective_few_shots": 3.0,
62
+ "num_truncated_few_shots": 0
63
+ },
64
+ "harness|gsm8k|5": {
65
+ "hashes": {
66
+ "hash_examples": "4c0843a5d99bcfdc",
67
+ "hash_full_prompts": "41d55e83abc0e02d",
68
+ "hash_input_tokens": "bda342e47b5099b2",
69
+ "hash_cont_tokens": "602d6f5b58c4afb3"
70
+ },
71
+ "truncated": 0,
72
+ "non-truncated": 1319,
73
+ "padded": 0,
74
+ "non-padded": 1319,
75
+ "effective_few_shots": 5.0,
76
+ "num_truncated_few_shots": 0
77
+ },
78
+ "harness|winogrande|5": {
79
+ "hashes": {
80
+ "hash_examples": "aada0a176fd81218",
81
+ "hash_full_prompts": "c8655cbd12de8409",
82
+ "hash_input_tokens": "c0bedf98cb040854",
83
+ "hash_cont_tokens": "f08975ad6f2d5864"
84
+ },
85
+ "truncated": 0,
86
+ "non-truncated": 2534,
87
+ "padded": 2432,
88
+ "non-padded": 102,
89
+ "effective_few_shots": 5.0,
90
+ "num_truncated_few_shots": 0
91
+ }
92
+ },
93
+ "summary_general": {
94
+ "hashes": {
95
+ "hash_examples": "9b4d8993161e637d",
96
+ "hash_full_prompts": "08215e527b7e60a5",
97
+ "hash_input_tokens": "a12f3e3c934bd78b",
98
+ "hash_cont_tokens": "40836b0405f6a16d"
99
+ },
100
+ "total_evaluation_time_secondes": "9024.860627889633",
101
+ "truncated": 3,
102
+ "non-truncated": 13386,
103
+ "padded": 2432,
104
+ "non-padded": 10957,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ }
meta-llama/Llama-2-7b-hf/results_2023-08-20T17-54-59.197645.json ADDED
@@ -0,0 +1,871 @@
1
+ {
2
+ "results": {
3
+ "harness|arc:challenge|25": {
4
+ "acc": 0.492320819112628,
5
+ "acc_stderr": 0.01460966744089257,
6
+ "acc_norm": 0.5307167235494881,
7
+ "acc_norm_stderr": 0.014583792546304037
8
+ },
9
+ "harness|hellaswag|10": {
10
+ "acc": 0.5883290181238797,
11
+ "acc_stderr": 0.0049113035697697935,
12
+ "acc_norm": 0.7858992232622983,
13
+ "acc_norm_stderr": 0.0040935874043036904
14
+ },
15
+ "harness|hendrycksTest-abstract_algebra|5": {
16
+ "acc": 0.3,
17
+ "acc_stderr": 0.046056618647183814,
18
+ "acc_norm": 0.3,
19
+ "acc_norm_stderr": 0.046056618647183814
20
+ },
21
+ "harness|hendrycksTest-anatomy|5": {
22
+ "acc": 0.48148148148148145,
23
+ "acc_stderr": 0.043163785995113245,
24
+ "acc_norm": 0.48148148148148145,
25
+ "acc_norm_stderr": 0.043163785995113245
26
+ },
27
+ "harness|hendrycksTest-astronomy|5": {
28
+ "acc": 0.40789473684210525,
29
+ "acc_stderr": 0.03999309712777471,
30
+ "acc_norm": 0.40789473684210525,
31
+ "acc_norm_stderr": 0.03999309712777471
32
+ },
33
+ "harness|hendrycksTest-business_ethics|5": {
34
+ "acc": 0.53,
35
+ "acc_stderr": 0.05016135580465919,
36
+ "acc_norm": 0.53,
37
+ "acc_norm_stderr": 0.05016135580465919
38
+ },
39
+ "harness|hendrycksTest-clinical_knowledge|5": {
40
+ "acc": 0.4641509433962264,
41
+ "acc_stderr": 0.030693675018458003,
42
+ "acc_norm": 0.4641509433962264,
43
+ "acc_norm_stderr": 0.030693675018458003
44
+ },
45
+ "harness|hendrycksTest-college_biology|5": {
46
+ "acc": 0.4652777777777778,
47
+ "acc_stderr": 0.04171115858181618,
48
+ "acc_norm": 0.4652777777777778,
49
+ "acc_norm_stderr": 0.04171115858181618
50
+ },
51
+ "harness|hendrycksTest-college_chemistry|5": {
52
+ "acc": 0.35,
53
+ "acc_stderr": 0.047937248544110196,
54
+ "acc_norm": 0.35,
55
+ "acc_norm_stderr": 0.047937248544110196
56
+ },
57
+ "harness|hendrycksTest-college_computer_science|5": {
58
+ "acc": 0.33,
59
+ "acc_stderr": 0.047258156262526045,
60
+ "acc_norm": 0.33,
61
+ "acc_norm_stderr": 0.047258156262526045
62
+ },
63
+ "harness|hendrycksTest-college_mathematics|5": {
64
+ "acc": 0.35,
65
+ "acc_stderr": 0.047937248544110196,
66
+ "acc_norm": 0.35,
67
+ "acc_norm_stderr": 0.047937248544110196
68
+ },
69
+ "harness|hendrycksTest-college_medicine|5": {
70
+ "acc": 0.4277456647398844,
71
+ "acc_stderr": 0.037724468575180255,
72
+ "acc_norm": 0.4277456647398844,
73
+ "acc_norm_stderr": 0.037724468575180255
74
+ },
75
+ "harness|hendrycksTest-college_physics|5": {
76
+ "acc": 0.23529411764705882,
77
+ "acc_stderr": 0.04220773659171453,
78
+ "acc_norm": 0.23529411764705882,
79
+ "acc_norm_stderr": 0.04220773659171453
80
+ },
81
+ "harness|hendrycksTest-computer_security|5": {
82
+ "acc": 0.6,
83
+ "acc_stderr": 0.04923659639173309,
84
+ "acc_norm": 0.6,
85
+ "acc_norm_stderr": 0.04923659639173309
86
+ },
87
+ "harness|hendrycksTest-conceptual_physics|5": {
88
+ "acc": 0.42127659574468085,
89
+ "acc_stderr": 0.03227834510146267,
90
+ "acc_norm": 0.42127659574468085,
91
+ "acc_norm_stderr": 0.03227834510146267
92
+ },
93
+ "harness|hendrycksTest-econometrics|5": {
94
+ "acc": 0.2719298245614035,
95
+ "acc_stderr": 0.04185774424022056,
96
+ "acc_norm": 0.2719298245614035,
97
+ "acc_norm_stderr": 0.04185774424022056
98
+ },
99
+ "harness|hendrycksTest-electrical_engineering|5": {
100
+ "acc": 0.47586206896551725,
101
+ "acc_stderr": 0.041618085035015295,
102
+ "acc_norm": 0.47586206896551725,
103
+ "acc_norm_stderr": 0.041618085035015295
104
+ },
105
+ "harness|hendrycksTest-elementary_mathematics|5": {
106
+ "acc": 0.2724867724867725,
107
+ "acc_stderr": 0.02293097307163336,
108
+ "acc_norm": 0.2724867724867725,
109
+ "acc_norm_stderr": 0.02293097307163336
110
+ },
111
+ "harness|hendrycksTest-formal_logic|5": {
112
+ "acc": 0.2857142857142857,
113
+ "acc_stderr": 0.0404061017820884,
114
+ "acc_norm": 0.2857142857142857,
115
+ "acc_norm_stderr": 0.0404061017820884
116
+ },
117
+ "harness|hendrycksTest-global_facts|5": {
118
+ "acc": 0.32,
119
+ "acc_stderr": 0.04688261722621503,
120
+ "acc_norm": 0.32,
121
+ "acc_norm_stderr": 0.04688261722621503
122
+ },
123
+ "harness|hendrycksTest-high_school_biology|5": {
124
+ "acc": 0.5,
125
+ "acc_stderr": 0.028444006199428714,
126
+ "acc_norm": 0.5,
127
+ "acc_norm_stderr": 0.028444006199428714
128
+ },
129
+ "harness|hendrycksTest-high_school_chemistry|5": {
130
+ "acc": 0.3694581280788177,
131
+ "acc_stderr": 0.033959703819985726,
132
+ "acc_norm": 0.3694581280788177,
133
+ "acc_norm_stderr": 0.033959703819985726
134
+ },
135
+ "harness|hendrycksTest-high_school_computer_science|5": {
136
+ "acc": 0.4,
137
+ "acc_stderr": 0.049236596391733084,
138
+ "acc_norm": 0.4,
139
+ "acc_norm_stderr": 0.049236596391733084
140
+ },
141
+ "harness|hendrycksTest-high_school_european_history|5": {
142
+ "acc": 0.6424242424242425,
143
+ "acc_stderr": 0.03742597043806585,
144
+ "acc_norm": 0.6424242424242425,
145
+ "acc_norm_stderr": 0.03742597043806585
146
+ },
147
+ "harness|hendrycksTest-high_school_geography|5": {
148
+ "acc": 0.48484848484848486,
149
+ "acc_stderr": 0.03560716516531061,
150
+ "acc_norm": 0.48484848484848486,
151
+ "acc_norm_stderr": 0.03560716516531061
152
+ },
153
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
154
+ "acc": 0.6735751295336787,
155
+ "acc_stderr": 0.033840286211432945,
156
+ "acc_norm": 0.6735751295336787,
157
+ "acc_norm_stderr": 0.033840286211432945
158
+ },
159
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
160
+ "acc": 0.45897435897435895,
161
+ "acc_stderr": 0.025265525491284295,
162
+ "acc_norm": 0.45897435897435895,
163
+ "acc_norm_stderr": 0.025265525491284295
164
+ },
165
+ "harness|hendrycksTest-high_school_mathematics|5": {
166
+ "acc": 0.3037037037037037,
167
+ "acc_stderr": 0.02803792996911499,
168
+ "acc_norm": 0.3037037037037037,
169
+ "acc_norm_stderr": 0.02803792996911499
170
+ },
171
+ "harness|hendrycksTest-high_school_microeconomics|5": {
172
+ "acc": 0.4411764705882353,
173
+ "acc_stderr": 0.0322529423239964,
174
+ "acc_norm": 0.4411764705882353,
175
+ "acc_norm_stderr": 0.0322529423239964
176
+ },
177
+ "harness|hendrycksTest-high_school_physics|5": {
178
+ "acc": 0.31125827814569534,
179
+ "acc_stderr": 0.037804458505267334,
180
+ "acc_norm": 0.31125827814569534,
181
+ "acc_norm_stderr": 0.037804458505267334
182
+ },
183
+ "harness|hendrycksTest-high_school_psychology|5": {
184
+ "acc": 0.6293577981651376,
185
+ "acc_stderr": 0.02070745816435298,
186
+ "acc_norm": 0.6293577981651376,
187
+ "acc_norm_stderr": 0.02070745816435298
188
+ },
189
+ "harness|hendrycksTest-high_school_statistics|5": {
190
+ "acc": 0.27314814814814814,
191
+ "acc_stderr": 0.03038805130167812,
192
+ "acc_norm": 0.27314814814814814,
193
+ "acc_norm_stderr": 0.03038805130167812
194
+ },
195
+ "harness|hendrycksTest-high_school_us_history|5": {
196
+ "acc": 0.5343137254901961,
197
+ "acc_stderr": 0.03501038327635897,
198
+ "acc_norm": 0.5343137254901961,
199
+ "acc_norm_stderr": 0.03501038327635897
200
+ },
201
+ "harness|hendrycksTest-high_school_world_history|5": {
202
+ "acc": 0.6286919831223629,
203
+ "acc_stderr": 0.03145068600744859,
204
+ "acc_norm": 0.6286919831223629,
205
+ "acc_norm_stderr": 0.03145068600744859
206
+ },
207
+ "harness|hendrycksTest-human_aging|5": {
208
+ "acc": 0.5695067264573991,
209
+ "acc_stderr": 0.033231973029429394,
210
+ "acc_norm": 0.5695067264573991,
211
+ "acc_norm_stderr": 0.033231973029429394
212
+ },
213
+ "harness|hendrycksTest-human_sexuality|5": {
214
+ "acc": 0.5648854961832062,
215
+ "acc_stderr": 0.04348208051644858,
216
+ "acc_norm": 0.5648854961832062,
217
+ "acc_norm_stderr": 0.04348208051644858
218
+ },
219
+ "harness|hendrycksTest-international_law|5": {
220
+ "acc": 0.6528925619834711,
221
+ "acc_stderr": 0.043457245702925335,
222
+ "acc_norm": 0.6528925619834711,
223
+ "acc_norm_stderr": 0.043457245702925335
224
+ },
225
+ "harness|hendrycksTest-jurisprudence|5": {
226
+ "acc": 0.5370370370370371,
227
+ "acc_stderr": 0.04820403072760628,
228
+ "acc_norm": 0.5370370370370371,
229
+ "acc_norm_stderr": 0.04820403072760628
230
+ },
231
+ "harness|hendrycksTest-logical_fallacies|5": {
232
+ "acc": 0.50920245398773,
233
+ "acc_stderr": 0.03927705600787443,
234
+ "acc_norm": 0.50920245398773,
235
+ "acc_norm_stderr": 0.03927705600787443
236
+ },
237
+ "harness|hendrycksTest-machine_learning|5": {
238
+ "acc": 0.38392857142857145,
239
+ "acc_stderr": 0.04616143075028547,
240
+ "acc_norm": 0.38392857142857145,
241
+ "acc_norm_stderr": 0.04616143075028547
242
+ },
243
+ "harness|hendrycksTest-management|5": {
244
+ "acc": 0.5533980582524272,
245
+ "acc_stderr": 0.04922424153458933,
246
+ "acc_norm": 0.5533980582524272,
247
+ "acc_norm_stderr": 0.04922424153458933
248
+ },
249
+ "harness|hendrycksTest-marketing|5": {
250
+ "acc": 0.6923076923076923,
251
+ "acc_stderr": 0.030236389942173085,
252
+ "acc_norm": 0.6923076923076923,
253
+ "acc_norm_stderr": 0.030236389942173085
254
+ },
255
+ "harness|hendrycksTest-medical_genetics|5": {
256
+ "acc": 0.55,
257
+ "acc_stderr": 0.04999999999999999,
258
+ "acc_norm": 0.55,
259
+ "acc_norm_stderr": 0.04999999999999999
260
+ },
261
+ "harness|hendrycksTest-miscellaneous|5": {
262
+ "acc": 0.6411238825031929,
263
+ "acc_stderr": 0.017152991797501342,
264
+ "acc_norm": 0.6411238825031929,
265
+ "acc_norm_stderr": 0.017152991797501342
266
+ },
267
+ "harness|hendrycksTest-moral_disputes|5": {
268
+ "acc": 0.49421965317919075,
269
+ "acc_stderr": 0.026917296179149116,
270
+ "acc_norm": 0.49421965317919075,
271
+ "acc_norm_stderr": 0.026917296179149116
272
+ },
273
+ "harness|hendrycksTest-moral_scenarios|5": {
274
+ "acc": 0.23910614525139665,
275
+ "acc_stderr": 0.014265554192331144,
276
+ "acc_norm": 0.23910614525139665,
277
+ "acc_norm_stderr": 0.014265554192331144
278
+ },
279
+ "harness|hendrycksTest-nutrition|5": {
280
+ "acc": 0.4934640522875817,
281
+ "acc_stderr": 0.028627470550556047,
282
+ "acc_norm": 0.4934640522875817,
283
+ "acc_norm_stderr": 0.028627470550556047
284
+ },
285
+ "harness|hendrycksTest-philosophy|5": {
286
+ "acc": 0.6012861736334405,
287
+ "acc_stderr": 0.0278093225857745,
288
+ "acc_norm": 0.6012861736334405,
289
+ "acc_norm_stderr": 0.0278093225857745
290
+ },
291
+ "harness|hendrycksTest-prehistory|5": {
292
+ "acc": 0.49074074074074076,
293
+ "acc_stderr": 0.027815973433878014,
294
+ "acc_norm": 0.49074074074074076,
295
+ "acc_norm_stderr": 0.027815973433878014
296
+ },
297
+ "harness|hendrycksTest-professional_accounting|5": {
298
+ "acc": 0.3617021276595745,
299
+ "acc_stderr": 0.028663820147199492,
300
+ "acc_norm": 0.3617021276595745,
301
+ "acc_norm_stderr": 0.028663820147199492
302
+ },
303
+ "harness|hendrycksTest-professional_law|5": {
304
+ "acc": 0.3617992177314211,
305
+ "acc_stderr": 0.01227273623326293,
306
+ "acc_norm": 0.3617992177314211,
307
+ "acc_norm_stderr": 0.01227273623326293
308
+ },
309
+ "harness|hendrycksTest-professional_medicine|5": {
310
+ "acc": 0.5257352941176471,
311
+ "acc_stderr": 0.03033257809455504,
312
+ "acc_norm": 0.5257352941176471,
313
+ "acc_norm_stderr": 0.03033257809455504
314
+ },
315
+ "harness|hendrycksTest-professional_psychology|5": {
316
+ "acc": 0.4411764705882353,
317
+ "acc_stderr": 0.020087362076702857,
318
+ "acc_norm": 0.4411764705882353,
319
+ "acc_norm_stderr": 0.020087362076702857
320
+ },
321
+ "harness|hendrycksTest-public_relations|5": {
322
+ "acc": 0.5272727272727272,
323
+ "acc_stderr": 0.04782001791380061,
324
+ "acc_norm": 0.5272727272727272,
325
+ "acc_norm_stderr": 0.04782001791380061
326
+ },
327
+ "harness|hendrycksTest-security_studies|5": {
328
+ "acc": 0.4775510204081633,
329
+ "acc_stderr": 0.031976941187136725,
330
+ "acc_norm": 0.4775510204081633,
331
+ "acc_norm_stderr": 0.031976941187136725
332
+ },
333
+ "harness|hendrycksTest-sociology|5": {
334
+ "acc": 0.6318407960199005,
335
+ "acc_stderr": 0.03410410565495301,
336
+ "acc_norm": 0.6318407960199005,
337
+ "acc_norm_stderr": 0.03410410565495301
338
+ },
339
+ "harness|hendrycksTest-us_foreign_policy|5": {
340
+ "acc": 0.65,
341
+ "acc_stderr": 0.047937248544110196,
342
+ "acc_norm": 0.65,
343
+ "acc_norm_stderr": 0.047937248544110196
344
+ },
345
+ "harness|hendrycksTest-virology|5": {
346
+ "acc": 0.42168674698795183,
347
+ "acc_stderr": 0.03844453181770917,
348
+ "acc_norm": 0.42168674698795183,
349
+ "acc_norm_stderr": 0.03844453181770917
350
+ },
351
+ "harness|hendrycksTest-world_religions|5": {
352
+ "acc": 0.7017543859649122,
353
+ "acc_stderr": 0.03508771929824563,
354
+ "acc_norm": 0.7017543859649122,
355
+ "acc_norm_stderr": 0.03508771929824563
356
+ },
357
+ "harness|truthfulqa:mc|0": {
358
+ "mc1": 0.24724602203182375,
359
+ "mc1_stderr": 0.01510240479735965,
360
+ "mc2": 0.3875703155565465,
361
+ "mc2_stderr": 0.013511615953021569
362
+ },
363
+ "all": {
364
+ "acc": 0.4710900438949217,
365
+ "acc_stderr": 0.03528130957178532,
366
+ "acc_norm": 0.4750894694809433,
367
+ "acc_norm_stderr": 0.03526701141822507,
368
+ "mc1": 0.24724602203182375,
369
+ "mc1_stderr": 0.01510240479735965,
370
+ "mc2": 0.3875703155565465,
371
+ "mc2_stderr": 0.013511615953021569
372
+ }
373
+ },
374
+ "versions": {
375
+ "harness|arc:challenge|25": 0,
376
+ "harness|hellaswag|10": 0,
377
+ "harness|hendrycksTest-abstract_algebra|5": 1,
378
+ "harness|hendrycksTest-anatomy|5": 1,
379
+ "harness|hendrycksTest-astronomy|5": 1,
380
+ "harness|hendrycksTest-business_ethics|5": 1,
381
+ "harness|hendrycksTest-clinical_knowledge|5": 1,
382
+ "harness|hendrycksTest-college_biology|5": 1,
383
+ "harness|hendrycksTest-college_chemistry|5": 1,
384
+ "harness|hendrycksTest-college_computer_science|5": 1,
385
+ "harness|hendrycksTest-college_mathematics|5": 1,
386
+ "harness|hendrycksTest-college_medicine|5": 1,
387
+ "harness|hendrycksTest-college_physics|5": 1,
388
+ "harness|hendrycksTest-computer_security|5": 1,
389
+ "harness|hendrycksTest-conceptual_physics|5": 1,
390
+ "harness|hendrycksTest-econometrics|5": 1,
391
+ "harness|hendrycksTest-electrical_engineering|5": 1,
392
+ "harness|hendrycksTest-elementary_mathematics|5": 1,
393
+ "harness|hendrycksTest-formal_logic|5": 1,
394
+ "harness|hendrycksTest-global_facts|5": 1,
395
+ "harness|hendrycksTest-high_school_biology|5": 1,
396
+ "harness|hendrycksTest-high_school_chemistry|5": 1,
397
+ "harness|hendrycksTest-high_school_computer_science|5": 1,
398
+ "harness|hendrycksTest-high_school_european_history|5": 1,
399
+ "harness|hendrycksTest-high_school_geography|5": 1,
400
+ "harness|hendrycksTest-high_school_government_and_politics|5": 1,
401
+ "harness|hendrycksTest-high_school_macroeconomics|5": 1,
402
+ "harness|hendrycksTest-high_school_mathematics|5": 1,
403
+ "harness|hendrycksTest-high_school_microeconomics|5": 1,
404
+ "harness|hendrycksTest-high_school_physics|5": 1,
405
+ "harness|hendrycksTest-high_school_psychology|5": 1,
406
+ "harness|hendrycksTest-high_school_statistics|5": 1,
407
+ "harness|hendrycksTest-high_school_us_history|5": 1,
408
+ "harness|hendrycksTest-high_school_world_history|5": 1,
409
+ "harness|hendrycksTest-human_aging|5": 1,
410
+ "harness|hendrycksTest-human_sexuality|5": 1,
411
+ "harness|hendrycksTest-international_law|5": 1,
412
+ "harness|hendrycksTest-jurisprudence|5": 1,
413
+ "harness|hendrycksTest-logical_fallacies|5": 1,
414
+ "harness|hendrycksTest-machine_learning|5": 1,
415
+ "harness|hendrycksTest-management|5": 1,
416
+ "harness|hendrycksTest-marketing|5": 1,
417
+ "harness|hendrycksTest-medical_genetics|5": 1,
418
+ "harness|hendrycksTest-miscellaneous|5": 1,
419
+ "harness|hendrycksTest-moral_disputes|5": 1,
420
+ "harness|hendrycksTest-moral_scenarios|5": 1,
421
+ "harness|hendrycksTest-nutrition|5": 1,
422
+ "harness|hendrycksTest-philosophy|5": 1,
423
+ "harness|hendrycksTest-prehistory|5": 1,
424
+ "harness|hendrycksTest-professional_accounting|5": 1,
425
+ "harness|hendrycksTest-professional_law|5": 1,
426
+ "harness|hendrycksTest-professional_medicine|5": 1,
427
+ "harness|hendrycksTest-professional_psychology|5": 1,
428
+ "harness|hendrycksTest-public_relations|5": 1,
429
+ "harness|hendrycksTest-security_studies|5": 1,
430
+ "harness|hendrycksTest-sociology|5": 1,
431
+ "harness|hendrycksTest-us_foreign_policy|5": 1,
432
+ "harness|hendrycksTest-virology|5": 1,
433
+ "harness|hendrycksTest-world_religions|5": 1,
434
+ "harness|truthfulqa:mc|0": 1,
435
+ "all": 0
436
+ },
437
+ "config": {
438
+ "model_name": "meta-llama/Llama-2-7b-hf",
439
+ "model_sha": "e8f058fa738b6b308540024e9aa12e274e291f75",
440
+ "model_dtype": "torch.float16",
441
+ "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937",
442
+ "num_few_shot_default": 0,
443
+ "num_fewshot_seeds": 1,
444
+ "override_batch_size": 1,
445
+ "max_samples": null
446
+ },
447
+ "task_config": {
448
+ "harness|arc:challenge": "LM Harness task",
449
+ "harness|hellaswag": "LM Harness task",
450
+ "harness|hendrycksTest-abstract_algebra": "LM Harness task",
451
+ "harness|hendrycksTest-anatomy": "LM Harness task",
452
+ "harness|hendrycksTest-astronomy": "LM Harness task",
453
+ "harness|hendrycksTest-business_ethics": "LM Harness task",
454
+ "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
455
+ "harness|hendrycksTest-college_biology": "LM Harness task",
456
+ "harness|hendrycksTest-college_chemistry": "LM Harness task",
457
+ "harness|hendrycksTest-college_computer_science": "LM Harness task",
458
+ "harness|hendrycksTest-college_mathematics": "LM Harness task",
459
+ "harness|hendrycksTest-college_medicine": "LM Harness task",
460
+ "harness|hendrycksTest-college_physics": "LM Harness task",
461
+ "harness|hendrycksTest-computer_security": "LM Harness task",
462
+ "harness|hendrycksTest-conceptual_physics": "LM Harness task",
463
+ "harness|hendrycksTest-econometrics": "LM Harness task",
464
+ "harness|hendrycksTest-electrical_engineering": "LM Harness task",
465
+ "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
466
+ "harness|hendrycksTest-formal_logic": "LM Harness task",
467
+ "harness|hendrycksTest-global_facts": "LM Harness task",
468
+ "harness|hendrycksTest-high_school_biology": "LM Harness task",
469
+ "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
470
+ "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
471
+ "harness|hendrycksTest-high_school_european_history": "LM Harness task",
472
+ "harness|hendrycksTest-high_school_geography": "LM Harness task",
473
+ "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
474
+ "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
475
+ "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
476
+ "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
477
+ "harness|hendrycksTest-high_school_physics": "LM Harness task",
478
+ "harness|hendrycksTest-high_school_psychology": "LM Harness task",
479
+ "harness|hendrycksTest-high_school_statistics": "LM Harness task",
480
+ "harness|hendrycksTest-high_school_us_history": "LM Harness task",
481
+ "harness|hendrycksTest-high_school_world_history": "LM Harness task",
482
+ "harness|hendrycksTest-human_aging": "LM Harness task",
483
+ "harness|hendrycksTest-human_sexuality": "LM Harness task",
484
+ "harness|hendrycksTest-international_law": "LM Harness task",
485
+ "harness|hendrycksTest-jurisprudence": "LM Harness task",
486
+ "harness|hendrycksTest-logical_fallacies": "LM Harness task",
487
+ "harness|hendrycksTest-machine_learning": "LM Harness task",
488
+ "harness|hendrycksTest-management": "LM Harness task",
489
+ "harness|hendrycksTest-marketing": "LM Harness task",
490
+ "harness|hendrycksTest-medical_genetics": "LM Harness task",
491
+ "harness|hendrycksTest-miscellaneous": "LM Harness task",
492
+ "harness|hendrycksTest-moral_disputes": "LM Harness task",
493
+ "harness|hendrycksTest-moral_scenarios": "LM Harness task",
494
+ "harness|hendrycksTest-nutrition": "LM Harness task",
495
+ "harness|hendrycksTest-philosophy": "LM Harness task",
496
+ "harness|hendrycksTest-prehistory": "LM Harness task",
497
+ "harness|hendrycksTest-professional_accounting": "LM Harness task",
498
+ "harness|hendrycksTest-professional_law": "LM Harness task",
499
+ "harness|hendrycksTest-professional_medicine": "LM Harness task",
500
+ "harness|hendrycksTest-professional_psychology": "LM Harness task",
501
+ "harness|hendrycksTest-public_relations": "LM Harness task",
502
+ "harness|hendrycksTest-security_studies": "LM Harness task",
503
+ "harness|hendrycksTest-sociology": "LM Harness task",
504
+ "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
505
+ "harness|hendrycksTest-virology": "LM Harness task",
506
+ "harness|hendrycksTest-world_religions": "LM Harness task",
507
+ "harness|truthfulqa:mc": "LM Harness task"
508
+ },
509
+ "hashes": {
510
+ "harness|arc:challenge|25": {
511
+ "hash_examples": "fb8c51b1872daeda",
512
+ "hash_full_prompts": "045cbb916e5145c6",
513
+ "hash_input_tokens": "61571bf68d6d89aa",
514
+ "hash_cont_tokens": "8210decc6ff6f7df"
515
+ },
516
+ "harness|hellaswag|10": {
517
+ "hash_examples": "e1768ecb99d7ecf0",
518
+ "hash_full_prompts": "0b4c16983130f84f",
519
+ "hash_input_tokens": "29906669b1c7054a",
520
+ "hash_cont_tokens": "b3b9e9017afa63af"
521
+ },
522
+ "harness|hendrycksTest-abstract_algebra|5": {
523
+ "hash_examples": "280f9f325b40559a",
524
+ "hash_full_prompts": "2f776a367d23aea2",
525
+ "hash_input_tokens": "c54ff61ad0273dd7",
526
+ "hash_cont_tokens": "50421e30bef398f9"
527
+ },
528
+ "harness|hendrycksTest-anatomy|5": {
529
+ "hash_examples": "2f83a4f1cab4ba18",
530
+ "hash_full_prompts": "516f74bef25df620",
531
+ "hash_input_tokens": "be31a1e22aef5f90",
532
+ "hash_cont_tokens": "f11971a765cb609f"
533
+ },
534
+ "harness|hendrycksTest-astronomy|5": {
535
+ "hash_examples": "7d587b908da4d762",
536
+ "hash_full_prompts": "faf4e80f65de93ca",
537
+ "hash_input_tokens": "277a7b1fad566940",
538
+ "hash_cont_tokens": "bf30e5d3f48250cb"
539
+ },
540
+ "harness|hendrycksTest-business_ethics|5": {
541
+ "hash_examples": "33e51740670de686",
542
+ "hash_full_prompts": "db01c3ef8e1479d4",
543
+ "hash_input_tokens": "ba552605bc116de5",
544
+ "hash_cont_tokens": "bc1dd9b2d995eb61"
545
+ },
546
+ "harness|hendrycksTest-clinical_knowledge|5": {
547
+ "hash_examples": "f3366dbe7eefffa4",
548
+ "hash_full_prompts": "49654f71d94b65c3",
549
+ "hash_input_tokens": "428c7563d0b98ab9",
550
+ "hash_cont_tokens": "890a119624b3b935"
551
+ },
552
+ "harness|hendrycksTest-college_biology|5": {
553
+ "hash_examples": "ca2b6753a0193e7f",
554
+ "hash_full_prompts": "2b460b75f1fdfefd",
555
+ "hash_input_tokens": "da036601573942e2",
556
+ "hash_cont_tokens": "875cde3af7a0ee14"
557
+ },
558
+ "harness|hendrycksTest-college_chemistry|5": {
559
+ "hash_examples": "22ff85f1d34f42d1",
560
+ "hash_full_prompts": "242c9be6da583e95",
561
+ "hash_input_tokens": "94e0196d6aded13d",
562
+ "hash_cont_tokens": "50421e30bef398f9"
563
+ },
564
+ "harness|hendrycksTest-college_computer_science|5": {
565
+ "hash_examples": "30318289d717a5cf",
566
+ "hash_full_prompts": "ed2bdb4e87c4b371",
567
+ "hash_input_tokens": "6e4d0f4a8d36690b",
568
+ "hash_cont_tokens": "ffc0fe414cdc4a83"
569
+ },
570
+ "harness|hendrycksTest-college_mathematics|5": {
571
+ "hash_examples": "4944d1f0b6b5d911",
572
+ "hash_full_prompts": "770bc4281c973190",
573
+ "hash_input_tokens": "614054d17109a25d",
574
+ "hash_cont_tokens": "50421e30bef398f9"
575
+ },
576
+ "harness|hendrycksTest-college_medicine|5": {
577
+ "hash_examples": "dd69cc33381275af",
578
+ "hash_full_prompts": "ad2a53e5250ab46e",
579
+ "hash_input_tokens": "1d633b3cc0524ba8",
580
+ "hash_cont_tokens": "1f88b00d41957d82"
581
+ },
582
+ "harness|hendrycksTest-college_physics|5": {
583
+ "hash_examples": "875dd26d22655b0d",
584
+ "hash_full_prompts": "833a0d7b55aed500",
585
+ "hash_input_tokens": "5421d9a1af86cbd4",
586
+ "hash_cont_tokens": "f7b8097afc16a47c"
587
+ },
588
+ "harness|hendrycksTest-computer_security|5": {
589
+ "hash_examples": "006451eedc0ededb",
590
+ "hash_full_prompts": "94034c97e85d8f46",
591
+ "hash_input_tokens": "5e6b70ecb333cf18",
592
+ "hash_cont_tokens": "50421e30bef398f9"
593
+ },
594
+ "harness|hendrycksTest-conceptual_physics|5": {
595
+ "hash_examples": "8874ece872d2ca4c",
596
+ "hash_full_prompts": "e40d15a34640d6fa",
597
+ "hash_input_tokens": "c2ef11a87264ceed",
598
+ "hash_cont_tokens": "aa0e8bc655f2f641"
599
+ },
600
+ "harness|hendrycksTest-econometrics|5": {
601
+ "hash_examples": "64d3623b0bfaa43f",
602
+ "hash_full_prompts": "612f340fae41338d",
603
+ "hash_input_tokens": "ecaccd912a4c3978",
604
+ "hash_cont_tokens": "bfb7e3c3c88313f1"
605
+ },
606
+ "harness|hendrycksTest-electrical_engineering|5": {
607
+ "hash_examples": "e98f51780c674d7e",
608
+ "hash_full_prompts": "10275b312d812ae6",
609
+ "hash_input_tokens": "1590c84291399be8",
610
+ "hash_cont_tokens": "2425a3f084a591ef"
611
+ },
612
+ "harness|hendrycksTest-elementary_mathematics|5": {
613
+ "hash_examples": "fc48208a5ac1c0ce",
614
+ "hash_full_prompts": "5ec274c6c82aca23",
615
+ "hash_input_tokens": "3269597f715b0da1",
616
+ "hash_cont_tokens": "f52691aef15a407b"
617
+ },
618
+ "harness|hendrycksTest-formal_logic|5": {
619
+ "hash_examples": "5a6525665f63ea72",
620
+ "hash_full_prompts": "07b92638c4a6b500",
621
+ "hash_input_tokens": "a2800d20f3ab8d7c",
622
+ "hash_cont_tokens": "f515d598d9c21263"
623
+ },
624
+ "harness|hendrycksTest-global_facts|5": {
625
+ "hash_examples": "371d70d743b2b89b",
626
+ "hash_full_prompts": "332fdee50a1921b4",
627
+ "hash_input_tokens": "94ed44b3772505ad",
628
+ "hash_cont_tokens": "50421e30bef398f9"
629
+ },
630
+ "harness|hendrycksTest-high_school_biology|5": {
631
+ "hash_examples": "a79e1018b1674052",
632
+ "hash_full_prompts": "e624e26ede922561",
633
+ "hash_input_tokens": "24423acb928db768",
634
+ "hash_cont_tokens": "bd85a4156a3613ee"
635
+ },
636
+ "harness|hendrycksTest-high_school_chemistry|5": {
637
+ "hash_examples": "44bfc25c389f0e03",
638
+ "hash_full_prompts": "0e3e5f5d9246482a",
639
+ "hash_input_tokens": "831ff35c474e5cef",
640
+ "hash_cont_tokens": "a95c97af1c14e068"
641
+ },
642
+ "harness|hendrycksTest-high_school_computer_science|5": {
643
+ "hash_examples": "8b8cdb1084f24169",
644
+ "hash_full_prompts": "c00487e67c1813cc",
645
+ "hash_input_tokens": "8c34e0f2bda77358",
646
+ "hash_cont_tokens": "8abfedef914e33c9"
647
+ },
648
+ "harness|hendrycksTest-high_school_european_history|5": {
649
+ "hash_examples": "11cd32d0ef440171",
650
+ "hash_full_prompts": "318f4513c537c6bf",
651
+ "hash_input_tokens": "f1f73dd687da18d7",
652
+ "hash_cont_tokens": "674fc454bdc5ac93"
653
+ },
654
+ "harness|hendrycksTest-high_school_geography|5": {
655
+ "hash_examples": "b60019b9e80b642f",
656
+ "hash_full_prompts": "ee5789fcc1a81b1e",
657
+ "hash_input_tokens": "7c5547c7da5bc793",
658
+ "hash_cont_tokens": "03a5012b916274ea"
659
+ },
660
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
661
+ "hash_examples": "d221ec983d143dc3",
662
+ "hash_full_prompts": "ac42d888e1ce1155",
663
+ "hash_input_tokens": "f62991cb6a496b05",
664
+ "hash_cont_tokens": "a83effb8f76b7d7c"
665
+ },
666
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
667
+ "hash_examples": "59c2915cacfd3fbb",
668
+ "hash_full_prompts": "c6bd9d25158abd0e",
669
+ "hash_input_tokens": "4cef2aff6e3d59ed",
670
+ "hash_cont_tokens": "c583432ad27fcfe0"
671
+ },
672
+ "harness|hendrycksTest-high_school_mathematics|5": {
673
+ "hash_examples": "1f8ac897608de342",
674
+ "hash_full_prompts": "5d88f41fc2d643a8",
675
+ "hash_input_tokens": "6e2577ea4082ed2b",
676
+ "hash_cont_tokens": "24f5dc613660300b"
677
+ },
678
+ "harness|hendrycksTest-high_school_microeconomics|5": {
679
+ "hash_examples": "ead6a0f2f6c83370",
680
+ "hash_full_prompts": "bfc393381298609e",
681
+ "hash_input_tokens": "c5fc9aeb1079c8e4",
682
+ "hash_cont_tokens": "f47f041de50333b9"
683
+ },
684
+ "harness|hendrycksTest-high_school_physics|5": {
685
+ "hash_examples": "c3f2025990afec64",
686
+ "hash_full_prompts": "fc78b4997e436734",
687
+ "hash_input_tokens": "555fc385cffa84ca",
688
+ "hash_cont_tokens": "ba2efcd283e938cc"
689
+ },
690
+ "harness|hendrycksTest-high_school_psychology|5": {
691
+ "hash_examples": "21f8aab618f6d636",
692
+ "hash_full_prompts": "d5c76aa40b9dbc43",
693
+ "hash_input_tokens": "febd23cbf9973b7f",
694
+ "hash_cont_tokens": "942069cd363844d9"
695
+ },
696
+ "harness|hendrycksTest-high_school_statistics|5": {
697
+ "hash_examples": "2386a60a11fc5de3",
698
+ "hash_full_prompts": "4c5c8be5aafac432",
699
+ "hash_input_tokens": "424b02981230ee83",
700
+ "hash_cont_tokens": "955ed42b6f7fa019"
701
+ },
702
+ "harness|hendrycksTest-high_school_us_history|5": {
703
+ "hash_examples": "74961543be40f04f",
704
+ "hash_full_prompts": "5d5ca4840131ba21",
705
+ "hash_input_tokens": "50c9ff438c85a69e",
706
+ "hash_cont_tokens": "cdd0b3dc06d933e5"
707
+ },
708
+ "harness|hendrycksTest-high_school_world_history|5": {
709
+ "hash_examples": "2ad2f6b7198b2234",
710
+ "hash_full_prompts": "11845057459afd72",
711
+ "hash_input_tokens": "054824cc474caef5",
712
+ "hash_cont_tokens": "9a864184946033ac"
713
+ },
714
+ "harness|hendrycksTest-human_aging|5": {
715
+ "hash_examples": "1a7199dc733e779b",
716
+ "hash_full_prompts": "756b9096b8eaf892",
717
+ "hash_input_tokens": "541a75f071dcf579",
718
+ "hash_cont_tokens": "142a4a8a1138a214"
719
+ },
720
+ "harness|hendrycksTest-human_sexuality|5": {
721
+ "hash_examples": "7acb8fdad97f88a6",
722
+ "hash_full_prompts": "731a52ff15b8cfdb",
723
+ "hash_input_tokens": "04269e5c5a257dd9",
724
+ "hash_cont_tokens": "bc54813e809b796d"
725
+ },
726
+ "harness|hendrycksTest-international_law|5": {
727
+ "hash_examples": "1300bfd0dfc59114",
728
+ "hash_full_prompts": "db2aefbff5eec996",
729
+ "hash_input_tokens": "d93ba9d9d38e4397",
730
+ "hash_cont_tokens": "dc45b45fcda18e5d"
731
+ },
732
+ "harness|hendrycksTest-jurisprudence|5": {
733
+ "hash_examples": "083b1e4904c48dc2",
734
+ "hash_full_prompts": "0f89ee3fe03d6a21",
735
+ "hash_input_tokens": "9eeaccd2698b4f5a",
736
+ "hash_cont_tokens": "e3a8cd951b6e3469"
737
+ },
738
+ "harness|hendrycksTest-logical_fallacies|5": {
739
+ "hash_examples": "709128f9926a634c",
740
+ "hash_full_prompts": "98a04b1f8f841069",
741
+ "hash_input_tokens": "b4f08f544f2b7576",
742
+ "hash_cont_tokens": "1e80dbd30f6453d5"
743
+ },
744
+ "harness|hendrycksTest-machine_learning|5": {
745
+ "hash_examples": "88f22a636029ae47",
746
+ "hash_full_prompts": "2e1c8d4b1e0cc921",
747
+ "hash_input_tokens": "900c2a51f1174b9f",
748
+ "hash_cont_tokens": "9b37da7777378ca9"
749
+ },
750
+ "harness|hendrycksTest-management|5": {
751
+ "hash_examples": "8c8a1e07a2151dca",
752
+ "hash_full_prompts": "f51611f514b265b0",
753
+ "hash_input_tokens": "6b36efb4689c6eca",
754
+ "hash_cont_tokens": "a01d6d39a83c4597"
755
+ },
756
+ "harness|hendrycksTest-marketing|5": {
757
+ "hash_examples": "2668953431f91e96",
758
+ "hash_full_prompts": "77562bef997c7650",
759
+ "hash_input_tokens": "2aaac78a0cfed47a",
760
+ "hash_cont_tokens": "6aeaed4d823c98aa"
761
+ },
762
+ "harness|hendrycksTest-medical_genetics|5": {
763
+ "hash_examples": "9c2dda34a2ea4fd2",
764
+ "hash_full_prompts": "202139046daa118f",
765
+ "hash_input_tokens": "886ca823b41c094a",
766
+ "hash_cont_tokens": "50421e30bef398f9"
767
+ },
768
+ "harness|hendrycksTest-miscellaneous|5": {
769
+ "hash_examples": "41adb694024809c2",
770
+ "hash_full_prompts": "bffec9fc237bcf93",
771
+ "hash_input_tokens": "72fd71de7675e7d0",
772
+ "hash_cont_tokens": "9b0ab02a64603081"
773
+ },
774
+ "harness|hendrycksTest-moral_disputes|5": {
775
+ "hash_examples": "3171c13ba3c594c4",
776
+ "hash_full_prompts": "170831fc36f1d59e",
777
+ "hash_input_tokens": "f3ca0dd8e7a1eb09",
778
+ "hash_cont_tokens": "8badf768f7b0467a"
779
+ },
780
+ "harness|hendrycksTest-moral_scenarios|5": {
781
+ "hash_examples": "9873e077e83e0546",
782
+ "hash_full_prompts": "08f4ceba3131a068",
783
+ "hash_input_tokens": "3e793631e951f23c",
784
+ "hash_cont_tokens": "32ae620376b2bbba"
785
+ },
786
+ "harness|hendrycksTest-nutrition|5": {
787
+ "hash_examples": "7db1d8142ec14323",
788
+ "hash_full_prompts": "4c0e68e3586cb453",
789
+ "hash_input_tokens": "59753c2144ea93af",
790
+ "hash_cont_tokens": "3071def75bacc404"
791
+ },
792
+ "harness|hendrycksTest-philosophy|5": {
793
+ "hash_examples": "9b455b7d72811cc8",
794
+ "hash_full_prompts": "e467f822d8a0d3ff",
795
+ "hash_input_tokens": "bd8d3dbed15a8c34",
796
+ "hash_cont_tokens": "9f6ff69d23a48783"
797
+ },
798
+ "harness|hendrycksTest-prehistory|5": {
799
+ "hash_examples": "8be90d0f538f1560",
800
+ "hash_full_prompts": "152187949bcd0921",
801
+ "hash_input_tokens": "3573cd87facbb7c5",
802
+ "hash_cont_tokens": "de469d2b981e32a3"
803
+ },
804
+ "harness|hendrycksTest-professional_accounting|5": {
805
+ "hash_examples": "8d377597916cd07e",
806
+ "hash_full_prompts": "0eb7345d6144ee0d",
807
+ "hash_input_tokens": "17e721bc1a7cbb47",
808
+ "hash_cont_tokens": "c46f74d2dfc7b13b"
809
+ },
810
+ "harness|hendrycksTest-professional_law|5": {
811
+ "hash_examples": "cd9dbc52b3c932d6",
812
+ "hash_full_prompts": "36ac764272bfb182",
813
+ "hash_input_tokens": "9178e10bd0763ec4",
814
+ "hash_cont_tokens": "2e590029ef41fbcd"
815
+ },
816
+ "harness|hendrycksTest-professional_medicine|5": {
817
+ "hash_examples": "b20e4e816c1e383e",
818
+ "hash_full_prompts": "7b8d69ea2acaf2f7",
819
+ "hash_input_tokens": "f5a22012a54f70ea",
820
+ "hash_cont_tokens": "fe35cfa9c6ca802e"
821
+ },
822
+ "harness|hendrycksTest-professional_psychology|5": {
823
+ "hash_examples": "d45b73b22f9cc039",
824
+ "hash_full_prompts": "fe8937e9ffc99771",
825
+ "hash_input_tokens": "0dfb73a8eb3f692c",
826
+ "hash_cont_tokens": "f020fbddf72c8652"
827
+ },
828
+ "harness|hendrycksTest-public_relations|5": {
829
+ "hash_examples": "0d25072e1761652a",
830
+ "hash_full_prompts": "f9adc39cfa9f42ba",
831
+ "hash_input_tokens": "1710c6ba4c9f3cbd",
832
+ "hash_cont_tokens": "568f585a259965c1"
833
+ },
834
+ "harness|hendrycksTest-security_studies|5": {
835
+ "hash_examples": "62bb8197e63d60d4",
836
+ "hash_full_prompts": "869c9c3ae196b7c3",
837
+ "hash_input_tokens": "d49711415961ced7",
838
+ "hash_cont_tokens": "cc6fd7cccd64cd5d"
839
+ },
840
+ "harness|hendrycksTest-sociology|5": {
841
+ "hash_examples": "e7959df87dea8672",
842
+ "hash_full_prompts": "1a1fc00e17b3a52a",
843
+ "hash_input_tokens": "828999f7624cbe7e",
844
+ "hash_cont_tokens": "c3a3bdfd177eed5b"
845
+ },
846
+ "harness|hendrycksTest-us_foreign_policy|5": {
847
+ "hash_examples": "4a56a01ddca44dca",
848
+ "hash_full_prompts": "0c7a7081c71c07b6",
849
+ "hash_input_tokens": "42054621e718dbee",
850
+ "hash_cont_tokens": "2568d0e8e36fa959"
851
+ },
852
+ "harness|hendrycksTest-virology|5": {
853
+ "hash_examples": "451cc86a8c4f4fe9",
854
+ "hash_full_prompts": "01e95325d8b738e4",
855
+ "hash_input_tokens": "6c4f0aa4dc859c04",
856
+ "hash_cont_tokens": "926cf60b0891f374"
857
+ },
858
+ "harness|hendrycksTest-world_religions|5": {
859
+ "hash_examples": "3b29cfaf1a81c379",
860
+ "hash_full_prompts": "e0d79a15083dfdff",
861
+ "hash_input_tokens": "6c75d44e092ff24f",
862
+ "hash_cont_tokens": "c525a5de974c1ea3"
863
+ },
864
+ "harness|truthfulqa:mc|0": {
865
+ "hash_examples": "23176c0531c7b867",
866
+ "hash_full_prompts": "36a6d90e75d92d4a",
867
+ "hash_input_tokens": "2738d7ed7075faa7",
868
+ "hash_cont_tokens": "c014154380b74b9e"
869
+ }
870
+ }
871
+ }
meta-llama/Llama-2-7b-hf/results_2023-08-29T17-54-59.197645.json ADDED
@@ -0,0 +1,1366 @@
1
+ {
2
+ "config_general": {
3
+ "model_name": "meta-llama/Llama-2-7b-hf",
4
+ "model_sha": "6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
5
+ "model_dtype": "4bit",
6
+ "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63",
7
+ "num_few_shot_default": 0,
8
+ "num_fewshot_seeds": 1,
9
+ "override_batch_size": 1,
10
+ "max_samples": null,
11
+ "job_id": ""
12
+ },
13
+ "results": {
14
+ "harness|arc:challenge|25": {
15
+ "acc": 0.4854948805460751,
16
+ "acc_stderr": 0.014605241081370056,
17
+ "acc_norm": 0.5307167235494881,
18
+ "acc_norm_stderr": 0.014583792546304037
19
+ },
20
+ "harness|hellaswag|10": {
21
+ "acc": 0.5789683330013942,
22
+ "acc_stderr": 0.0049271558825981845,
23
+ "acc_norm": 0.7774347739494125,
24
+ "acc_norm_stderr": 0.004151185615952062
25
+ },
26
+ "harness|hendrycksTest-abstract_algebra|5": {
27
+ "acc": 0.28,
28
+ "acc_stderr": 0.04512608598542129,
29
+ "acc_norm": 0.28,
30
+ "acc_norm_stderr": 0.04512608598542129
31
+ },
32
+ "harness|hendrycksTest-anatomy|5": {
33
+ "acc": 0.42962962962962964,
34
+ "acc_stderr": 0.04276349494376599,
35
+ "acc_norm": 0.42962962962962964,
36
+ "acc_norm_stderr": 0.04276349494376599
37
+ },
38
+ "harness|hendrycksTest-astronomy|5": {
39
+ "acc": 0.40789473684210525,
40
+ "acc_stderr": 0.03999309712777471,
41
+ "acc_norm": 0.40789473684210525,
42
+ "acc_norm_stderr": 0.03999309712777471
43
+ },
44
+ "harness|hendrycksTest-business_ethics|5": {
45
+ "acc": 0.49,
46
+ "acc_stderr": 0.05024183937956911,
47
+ "acc_norm": 0.49,
48
+ "acc_norm_stderr": 0.05024183937956911
49
+ },
50
+ "harness|hendrycksTest-clinical_knowledge|5": {
51
+ "acc": 0.4377358490566038,
52
+ "acc_stderr": 0.030533338430467516,
53
+ "acc_norm": 0.4377358490566038,
54
+ "acc_norm_stderr": 0.030533338430467516
55
+ },
56
+ "harness|hendrycksTest-college_biology|5": {
57
+ "acc": 0.4375,
58
+ "acc_stderr": 0.04148415739394154,
59
+ "acc_norm": 0.4375,
60
+ "acc_norm_stderr": 0.04148415739394154
61
+ },
62
+ "harness|hendrycksTest-college_chemistry|5": {
63
+ "acc": 0.31,
64
+ "acc_stderr": 0.04648231987117316,
65
+ "acc_norm": 0.31,
66
+ "acc_norm_stderr": 0.04648231987117316
67
+ },
68
+ "harness|hendrycksTest-college_computer_science|5": {
69
+ "acc": 0.39,
70
+ "acc_stderr": 0.04902071300001975,
71
+ "acc_norm": 0.39,
72
+ "acc_norm_stderr": 0.04902071300001975
73
+ },
74
+ "harness|hendrycksTest-college_mathematics|5": {
75
+ "acc": 0.32,
76
+ "acc_stderr": 0.04688261722621505,
77
+ "acc_norm": 0.32,
78
+ "acc_norm_stderr": 0.04688261722621505
79
+ },
80
+ "harness|hendrycksTest-college_medicine|5": {
81
+ "acc": 0.37572254335260113,
82
+ "acc_stderr": 0.036928207672648664,
83
+ "acc_norm": 0.37572254335260113,
84
+ "acc_norm_stderr": 0.036928207672648664
85
+ },
86
+ "harness|hendrycksTest-college_physics|5": {
87
+ "acc": 0.18627450980392157,
88
+ "acc_stderr": 0.038739587141493524,
89
+ "acc_norm": 0.18627450980392157,
90
+ "acc_norm_stderr": 0.038739587141493524
91
+ },
92
+ "harness|hendrycksTest-computer_security|5": {
93
+ "acc": 0.58,
94
+ "acc_stderr": 0.049604496374885836,
95
+ "acc_norm": 0.58,
96
+ "acc_norm_stderr": 0.049604496374885836
97
+ },
98
+ "harness|hendrycksTest-conceptual_physics|5": {
99
+ "acc": 0.4425531914893617,
100
+ "acc_stderr": 0.03246956919789958,
101
+ "acc_norm": 0.4425531914893617,
102
+ "acc_norm_stderr": 0.03246956919789958
103
+ },
104
+ "harness|hendrycksTest-econometrics|5": {
105
+ "acc": 0.30701754385964913,
106
+ "acc_stderr": 0.04339138322579861,
107
+ "acc_norm": 0.30701754385964913,
108
+ "acc_norm_stderr": 0.04339138322579861
109
+ },
110
+ "harness|hendrycksTest-electrical_engineering|5": {
111
+ "acc": 0.46206896551724136,
112
+ "acc_stderr": 0.041546596717075474,
113
+ "acc_norm": 0.46206896551724136,
114
+ "acc_norm_stderr": 0.041546596717075474
115
+ },
116
+ "harness|hendrycksTest-elementary_mathematics|5": {
117
+ "acc": 0.24603174603174602,
118
+ "acc_stderr": 0.02218203720294836,
119
+ "acc_norm": 0.24603174603174602,
120
+ "acc_norm_stderr": 0.02218203720294836
121
+ },
122
+ "harness|hendrycksTest-formal_logic|5": {
123
+ "acc": 0.3412698412698413,
124
+ "acc_stderr": 0.04240799327574924,
125
+ "acc_norm": 0.3412698412698413,
126
+ "acc_norm_stderr": 0.04240799327574924
127
+ },
128
+ "harness|hendrycksTest-global_facts|5": {
129
+ "acc": 0.36,
130
+ "acc_stderr": 0.04824181513244218,
131
+ "acc_norm": 0.36,
132
+ "acc_norm_stderr": 0.04824181513244218
133
+ },
134
+ "harness|hendrycksTest-high_school_biology|5": {
135
+ "acc": 0.4290322580645161,
136
+ "acc_stderr": 0.02815603653823321,
137
+ "acc_norm": 0.4290322580645161,
138
+ "acc_norm_stderr": 0.02815603653823321
139
+ },
140
+ "harness|hendrycksTest-high_school_chemistry|5": {
141
+ "acc": 0.35467980295566504,
142
+ "acc_stderr": 0.0336612448905145,
143
+ "acc_norm": 0.35467980295566504,
144
+ "acc_norm_stderr": 0.0336612448905145
145
+ },
146
+ "harness|hendrycksTest-high_school_computer_science|5": {
147
+ "acc": 0.42,
148
+ "acc_stderr": 0.049604496374885836,
149
+ "acc_norm": 0.42,
150
+ "acc_norm_stderr": 0.049604496374885836
151
+ },
152
+ "harness|hendrycksTest-high_school_european_history|5": {
153
+ "acc": 0.5696969696969697,
154
+ "acc_stderr": 0.03866225962879077,
155
+ "acc_norm": 0.5696969696969697,
156
+ "acc_norm_stderr": 0.03866225962879077
157
+ },
158
+ "harness|hendrycksTest-high_school_geography|5": {
159
+ "acc": 0.4797979797979798,
160
+ "acc_stderr": 0.0355944356556392,
161
+ "acc_norm": 0.4797979797979798,
162
+ "acc_norm_stderr": 0.0355944356556392
163
+ },
164
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
165
+ "acc": 0.6321243523316062,
166
+ "acc_stderr": 0.034801756684660366,
167
+ "acc_norm": 0.6321243523316062,
168
+ "acc_norm_stderr": 0.034801756684660366
169
+ },
170
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
171
+ "acc": 0.4,
172
+ "acc_stderr": 0.024838811988033158,
173
+ "acc_norm": 0.4,
174
+ "acc_norm_stderr": 0.024838811988033158
175
+ },
176
+ "harness|hendrycksTest-high_school_mathematics|5": {
177
+ "acc": 0.24444444444444444,
178
+ "acc_stderr": 0.026202766534652148,
179
+ "acc_norm": 0.24444444444444444,
180
+ "acc_norm_stderr": 0.026202766534652148
181
+ },
182
+ "harness|hendrycksTest-high_school_microeconomics|5": {
183
+ "acc": 0.3907563025210084,
184
+ "acc_stderr": 0.031693802357129965,
185
+ "acc_norm": 0.3907563025210084,
186
+ "acc_norm_stderr": 0.031693802357129965
187
+ },
188
+ "harness|hendrycksTest-high_school_physics|5": {
189
+ "acc": 0.304635761589404,
190
+ "acc_stderr": 0.03757949922943342,
191
+ "acc_norm": 0.304635761589404,
192
+ "acc_norm_stderr": 0.03757949922943342
193
+ },
194
+ "harness|hendrycksTest-high_school_psychology|5": {
195
+ "acc": 0.5798165137614679,
196
+ "acc_stderr": 0.021162420048273508,
197
+ "acc_norm": 0.5798165137614679,
198
+ "acc_norm_stderr": 0.021162420048273508
199
+ },
200
+ "harness|hendrycksTest-high_school_statistics|5": {
201
+ "acc": 0.19444444444444445,
202
+ "acc_stderr": 0.02699145450203673,
203
+ "acc_norm": 0.19444444444444445,
204
+ "acc_norm_stderr": 0.02699145450203673
205
+ },
206
+ "harness|hendrycksTest-high_school_us_history|5": {
207
+ "acc": 0.4803921568627451,
208
+ "acc_stderr": 0.03506612560524867,
209
+ "acc_norm": 0.4803921568627451,
210
+ "acc_norm_stderr": 0.03506612560524867
211
+ },
212
+ "harness|hendrycksTest-high_school_world_history|5": {
213
+ "acc": 0.5485232067510548,
214
+ "acc_stderr": 0.0323936001739747,
215
+ "acc_norm": 0.5485232067510548,
216
+ "acc_norm_stderr": 0.0323936001739747
217
+ },
218
+ "harness|hendrycksTest-human_aging|5": {
219
+ "acc": 0.5246636771300448,
220
+ "acc_stderr": 0.03351695167652628,
221
+ "acc_norm": 0.5246636771300448,
222
+ "acc_norm_stderr": 0.03351695167652628
223
+ },
224
+ "harness|hendrycksTest-human_sexuality|5": {
225
+ "acc": 0.45038167938931295,
226
+ "acc_stderr": 0.04363643698524779,
227
+ "acc_norm": 0.45038167938931295,
228
+ "acc_norm_stderr": 0.04363643698524779
229
+ },
230
+ "harness|hendrycksTest-international_law|5": {
231
+ "acc": 0.6198347107438017,
232
+ "acc_stderr": 0.04431324501968432,
233
+ "acc_norm": 0.6198347107438017,
234
+ "acc_norm_stderr": 0.04431324501968432
235
+ },
236
+ "harness|hendrycksTest-jurisprudence|5": {
237
+ "acc": 0.48148148148148145,
238
+ "acc_stderr": 0.04830366024635331,
239
+ "acc_norm": 0.48148148148148145,
240
+ "acc_norm_stderr": 0.04830366024635331
241
+ },
242
+ "harness|hendrycksTest-logical_fallacies|5": {
243
+ "acc": 0.4601226993865031,
244
+ "acc_stderr": 0.03915857291436972,
245
+ "acc_norm": 0.4601226993865031,
246
+ "acc_norm_stderr": 0.03915857291436972
247
+ },
248
+ "harness|hendrycksTest-machine_learning|5": {
249
+ "acc": 0.36607142857142855,
250
+ "acc_stderr": 0.0457237235873743,
251
+ "acc_norm": 0.36607142857142855,
252
+ "acc_norm_stderr": 0.0457237235873743
253
+ },
254
+ "harness|hendrycksTest-management|5": {
255
+ "acc": 0.49514563106796117,
256
+ "acc_stderr": 0.049505043821289195,
257
+ "acc_norm": 0.49514563106796117,
258
+ "acc_norm_stderr": 0.049505043821289195
259
+ },
260
+ "harness|hendrycksTest-marketing|5": {
261
+ "acc": 0.6837606837606838,
262
+ "acc_stderr": 0.030463656747340275,
263
+ "acc_norm": 0.6837606837606838,
264
+ "acc_norm_stderr": 0.030463656747340275
265
+ },
266
+ "harness|hendrycksTest-medical_genetics|5": {
267
+ "acc": 0.52,
268
+ "acc_stderr": 0.050211673156867795,
269
+ "acc_norm": 0.52,
270
+ "acc_norm_stderr": 0.050211673156867795
271
+ },
272
+ "harness|hendrycksTest-miscellaneous|5": {
273
+ "acc": 0.6002554278416348,
274
+ "acc_stderr": 0.017516847907053282,
275
+ "acc_norm": 0.6002554278416348,
276
+ "acc_norm_stderr": 0.017516847907053282
277
+ },
278
+ "harness|hendrycksTest-moral_disputes|5": {
279
+ "acc": 0.48554913294797686,
280
+ "acc_stderr": 0.02690784985628254,
281
+ "acc_norm": 0.48554913294797686,
282
+ "acc_norm_stderr": 0.02690784985628254
283
+ },
284
+ "harness|hendrycksTest-moral_scenarios|5": {
285
+ "acc": 0.23798882681564246,
286
+ "acc_stderr": 0.014242630070574915,
287
+ "acc_norm": 0.23798882681564246,
288
+ "acc_norm_stderr": 0.014242630070574915
289
+ },
290
+ "harness|hendrycksTest-nutrition|5": {
291
+ "acc": 0.49673202614379086,
292
+ "acc_stderr": 0.02862930519400354,
293
+ "acc_norm": 0.49673202614379086,
294
+ "acc_norm_stderr": 0.02862930519400354
295
+ },
296
+ "harness|hendrycksTest-philosophy|5": {
297
+ "acc": 0.5498392282958199,
298
+ "acc_stderr": 0.028256660723360177,
299
+ "acc_norm": 0.5498392282958199,
300
+ "acc_norm_stderr": 0.028256660723360177
301
+ },
302
+ "harness|hendrycksTest-prehistory|5": {
303
+ "acc": 0.5,
304
+ "acc_stderr": 0.02782074420373286,
305
+ "acc_norm": 0.5,
306
+ "acc_norm_stderr": 0.02782074420373286
307
+ },
308
+ "harness|hendrycksTest-professional_accounting|5": {
309
+ "acc": 0.3262411347517731,
310
+ "acc_stderr": 0.027968453043563168,
311
+ "acc_norm": 0.3262411347517731,
312
+ "acc_norm_stderr": 0.027968453043563168
313
+ },
314
+ "harness|hendrycksTest-professional_law|5": {
315
+ "acc": 0.3318122555410691,
316
+ "acc_stderr": 0.012026088259897637,
317
+ "acc_norm": 0.3318122555410691,
318
+ "acc_norm_stderr": 0.012026088259897637
319
+ },
320
+ "harness|hendrycksTest-professional_medicine|5": {
321
+ "acc": 0.4485294117647059,
322
+ "acc_stderr": 0.030211479609121593,
323
+ "acc_norm": 0.4485294117647059,
324
+ "acc_norm_stderr": 0.030211479609121593
325
+ },
326
+ "harness|hendrycksTest-professional_psychology|5": {
327
+ "acc": 0.4215686274509804,
328
+ "acc_stderr": 0.019977422600227467,
329
+ "acc_norm": 0.4215686274509804,
330
+ "acc_norm_stderr": 0.019977422600227467
331
+ },
332
+ "harness|hendrycksTest-public_relations|5": {
333
+ "acc": 0.4727272727272727,
334
+ "acc_stderr": 0.04782001791380063,
335
+ "acc_norm": 0.4727272727272727,
336
+ "acc_norm_stderr": 0.04782001791380063
337
+ },
338
+ "harness|hendrycksTest-security_studies|5": {
339
+ "acc": 0.3673469387755102,
340
+ "acc_stderr": 0.030862144921087558,
341
+ "acc_norm": 0.3673469387755102,
342
+ "acc_norm_stderr": 0.030862144921087558
343
+ },
344
+ "harness|hendrycksTest-sociology|5": {
345
+ "acc": 0.5970149253731343,
346
+ "acc_stderr": 0.034683432951111266,
347
+ "acc_norm": 0.5970149253731343,
348
+ "acc_norm_stderr": 0.034683432951111266
349
+ },
350
+ "harness|hendrycksTest-us_foreign_policy|5": {
351
+ "acc": 0.66,
352
+ "acc_stderr": 0.04760952285695237,
353
+ "acc_norm": 0.66,
354
+ "acc_norm_stderr": 0.04760952285695237
355
+ },
356
+ "harness|hendrycksTest-virology|5": {
357
+ "acc": 0.3855421686746988,
358
+ "acc_stderr": 0.037891344246115496,
359
+ "acc_norm": 0.3855421686746988,
360
+ "acc_norm_stderr": 0.037891344246115496
361
+ },
362
+ "harness|hendrycksTest-world_religions|5": {
363
+ "acc": 0.6491228070175439,
364
+ "acc_stderr": 0.03660298834049163,
365
+ "acc_norm": 0.6491228070175439,
366
+ "acc_norm_stderr": 0.03660298834049163
367
+ },
368
+ "harness|truthfulqa:mc|0": {
369
+ "mc1": 0.2484700122399021,
370
+ "mc1_stderr": 0.01512742709652068,
371
+ "mc2": 0.38980202801580316,
372
+ "mc2_stderr": 0.013645286936347097
373
+ },
374
+ "all": {
375
+ "acc": 0.4411565786317669,
376
+ "acc_stderr": 0.03521763310724054,
377
+ "acc_norm": 0.44528688852924886,
378
+ "acc_norm_stderr": 0.03520411753433017,
379
+ "mc1": 0.2484700122399021,
380
+ "mc1_stderr": 0.01512742709652068,
381
+ "mc2": 0.38980202801580316,
382
+ "mc2_stderr": 0.013645286936347097
383
+ }
384
+ },
385
+ "versions": {
386
+ "harness|arc:challenge|25": 0,
387
+ "harness|hellaswag|10": 0,
388
+ "harness|hendrycksTest-abstract_algebra|5": 1,
389
+ "harness|hendrycksTest-anatomy|5": 1,
390
+ "harness|hendrycksTest-astronomy|5": 1,
391
+ "harness|hendrycksTest-business_ethics|5": 1,
392
+ "harness|hendrycksTest-clinical_knowledge|5": 1,
393
+ "harness|hendrycksTest-college_biology|5": 1,
394
+ "harness|hendrycksTest-college_chemistry|5": 1,
395
+ "harness|hendrycksTest-college_computer_science|5": 1,
396
+ "harness|hendrycksTest-college_mathematics|5": 1,
397
+ "harness|hendrycksTest-college_medicine|5": 1,
398
+ "harness|hendrycksTest-college_physics|5": 1,
399
+ "harness|hendrycksTest-computer_security|5": 1,
400
+ "harness|hendrycksTest-conceptual_physics|5": 1,
401
+ "harness|hendrycksTest-econometrics|5": 1,
402
+ "harness|hendrycksTest-electrical_engineering|5": 1,
403
+ "harness|hendrycksTest-elementary_mathematics|5": 1,
404
+ "harness|hendrycksTest-formal_logic|5": 1,
405
+ "harness|hendrycksTest-global_facts|5": 1,
406
+ "harness|hendrycksTest-high_school_biology|5": 1,
407
+ "harness|hendrycksTest-high_school_chemistry|5": 1,
408
+ "harness|hendrycksTest-high_school_computer_science|5": 1,
409
+ "harness|hendrycksTest-high_school_european_history|5": 1,
410
+ "harness|hendrycksTest-high_school_geography|5": 1,
411
+ "harness|hendrycksTest-high_school_government_and_politics|5": 1,
412
+ "harness|hendrycksTest-high_school_macroeconomics|5": 1,
413
+ "harness|hendrycksTest-high_school_mathematics|5": 1,
414
+ "harness|hendrycksTest-high_school_microeconomics|5": 1,
415
+ "harness|hendrycksTest-high_school_physics|5": 1,
416
+ "harness|hendrycksTest-high_school_psychology|5": 1,
417
+ "harness|hendrycksTest-high_school_statistics|5": 1,
418
+ "harness|hendrycksTest-high_school_us_history|5": 1,
419
+ "harness|hendrycksTest-high_school_world_history|5": 1,
420
+ "harness|hendrycksTest-human_aging|5": 1,
421
+ "harness|hendrycksTest-human_sexuality|5": 1,
422
+ "harness|hendrycksTest-international_law|5": 1,
423
+ "harness|hendrycksTest-jurisprudence|5": 1,
424
+ "harness|hendrycksTest-logical_fallacies|5": 1,
425
+ "harness|hendrycksTest-machine_learning|5": 1,
426
+ "harness|hendrycksTest-management|5": 1,
427
+ "harness|hendrycksTest-marketing|5": 1,
428
+ "harness|hendrycksTest-medical_genetics|5": 1,
429
+ "harness|hendrycksTest-miscellaneous|5": 1,
430
+ "harness|hendrycksTest-moral_disputes|5": 1,
431
+ "harness|hendrycksTest-moral_scenarios|5": 1,
432
+ "harness|hendrycksTest-nutrition|5": 1,
433
+ "harness|hendrycksTest-philosophy|5": 1,
434
+ "harness|hendrycksTest-prehistory|5": 1,
435
+ "harness|hendrycksTest-professional_accounting|5": 1,
436
+ "harness|hendrycksTest-professional_law|5": 1,
437
+ "harness|hendrycksTest-professional_medicine|5": 1,
438
+ "harness|hendrycksTest-professional_psychology|5": 1,
439
+ "harness|hendrycksTest-public_relations|5": 1,
440
+ "harness|hendrycksTest-security_studies|5": 1,
441
+ "harness|hendrycksTest-sociology|5": 1,
442
+ "harness|hendrycksTest-us_foreign_policy|5": 1,
443
+ "harness|hendrycksTest-virology|5": 1,
444
+ "harness|hendrycksTest-world_religions|5": 1,
445
+ "harness|truthfulqa:mc|0": 1,
446
+ "all": 0
447
+ },
448
+ "config_tasks": {
449
+ "harness|arc:challenge": "LM Harness task",
450
+ "harness|hellaswag": "LM Harness task",
451
+ "harness|hendrycksTest-abstract_algebra": "LM Harness task",
452
+ "harness|hendrycksTest-anatomy": "LM Harness task",
453
+ "harness|hendrycksTest-astronomy": "LM Harness task",
454
+ "harness|hendrycksTest-business_ethics": "LM Harness task",
455
+ "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
456
+ "harness|hendrycksTest-college_biology": "LM Harness task",
457
+ "harness|hendrycksTest-college_chemistry": "LM Harness task",
458
+ "harness|hendrycksTest-college_computer_science": "LM Harness task",
459
+ "harness|hendrycksTest-college_mathematics": "LM Harness task",
460
+ "harness|hendrycksTest-college_medicine": "LM Harness task",
461
+ "harness|hendrycksTest-college_physics": "LM Harness task",
462
+ "harness|hendrycksTest-computer_security": "LM Harness task",
463
+ "harness|hendrycksTest-conceptual_physics": "LM Harness task",
464
+ "harness|hendrycksTest-econometrics": "LM Harness task",
465
+ "harness|hendrycksTest-electrical_engineering": "LM Harness task",
466
+ "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
467
+ "harness|hendrycksTest-formal_logic": "LM Harness task",
468
+ "harness|hendrycksTest-global_facts": "LM Harness task",
469
+ "harness|hendrycksTest-high_school_biology": "LM Harness task",
470
+ "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
471
+ "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
472
+ "harness|hendrycksTest-high_school_european_history": "LM Harness task",
473
+ "harness|hendrycksTest-high_school_geography": "LM Harness task",
474
+ "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
475
+ "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
476
+ "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
477
+ "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
478
+ "harness|hendrycksTest-high_school_physics": "LM Harness task",
479
+ "harness|hendrycksTest-high_school_psychology": "LM Harness task",
480
+ "harness|hendrycksTest-high_school_statistics": "LM Harness task",
481
+ "harness|hendrycksTest-high_school_us_history": "LM Harness task",
482
+ "harness|hendrycksTest-high_school_world_history": "LM Harness task",
483
+ "harness|hendrycksTest-human_aging": "LM Harness task",
484
+ "harness|hendrycksTest-human_sexuality": "LM Harness task",
485
+ "harness|hendrycksTest-international_law": "LM Harness task",
486
+ "harness|hendrycksTest-jurisprudence": "LM Harness task",
487
+ "harness|hendrycksTest-logical_fallacies": "LM Harness task",
488
+ "harness|hendrycksTest-machine_learning": "LM Harness task",
489
+ "harness|hendrycksTest-management": "LM Harness task",
490
+ "harness|hendrycksTest-marketing": "LM Harness task",
491
+ "harness|hendrycksTest-medical_genetics": "LM Harness task",
492
+ "harness|hendrycksTest-miscellaneous": "LM Harness task",
493
+ "harness|hendrycksTest-moral_disputes": "LM Harness task",
494
+ "harness|hendrycksTest-moral_scenarios": "LM Harness task",
495
+ "harness|hendrycksTest-nutrition": "LM Harness task",
496
+ "harness|hendrycksTest-philosophy": "LM Harness task",
497
+ "harness|hendrycksTest-prehistory": "LM Harness task",
498
+ "harness|hendrycksTest-professional_accounting": "LM Harness task",
499
+ "harness|hendrycksTest-professional_law": "LM Harness task",
500
+ "harness|hendrycksTest-professional_medicine": "LM Harness task",
501
+ "harness|hendrycksTest-professional_psychology": "LM Harness task",
502
+ "harness|hendrycksTest-public_relations": "LM Harness task",
503
+ "harness|hendrycksTest-security_studies": "LM Harness task",
504
+ "harness|hendrycksTest-sociology": "LM Harness task",
505
+ "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
506
+ "harness|hendrycksTest-virology": "LM Harness task",
507
+ "harness|hendrycksTest-world_religions": "LM Harness task",
508
+ "harness|truthfulqa:mc": "LM Harness task"
509
+ },
510
+ "summary_tasks": {
511
+ "harness|arc:challenge|25": {
512
+ "hashes": {
513
+ "hash_examples": "17b0cae357c0259e",
514
+ "hash_full_prompts": "045cbb916e5145c6",
515
+ "hash_input_tokens": "3722289b79076c44",
516
+ "hash_cont_tokens": "8210decc6ff6f7df"
517
+ },
518
+ "truncated": 0,
519
+ "non-truncated": 4687,
520
+ "padded": 4687,
521
+ "non-padded": 0,
522
+ "effective_few_shots": 25.0,
523
+ "num_truncated_few_shots": 0
524
+ },
525
+ "harness|hellaswag|10": {
526
+ "hashes": {
527
+ "hash_examples": "e1768ecb99d7ecf0",
528
+ "hash_full_prompts": "0b4c16983130f84f",
529
+ "hash_input_tokens": "ececd684171f1ef2",
530
+ "hash_cont_tokens": "b3b9e9017afa63af"
531
+ },
532
+ "truncated": 0,
533
+ "non-truncated": 40168,
534
+ "padded": 40113,
535
+ "non-padded": 55,
536
+ "effective_few_shots": 10.0,
537
+ "num_truncated_few_shots": 0
538
+ },
539
+ "harness|hendrycksTest-abstract_algebra|5": {
540
+ "hashes": {
541
+ "hash_examples": "280f9f325b40559a",
542
+ "hash_full_prompts": "2f776a367d23aea2",
543
+ "hash_input_tokens": "c54ff61ad0273dd7",
544
+ "hash_cont_tokens": "50421e30bef398f9"
545
+ },
546
+ "truncated": 0,
547
+ "non-truncated": 400,
548
+ "padded": 400,
549
+ "non-padded": 0,
550
+ "effective_few_shots": 5.0,
551
+ "num_truncated_few_shots": 0
552
+ },
553
+ "harness|hendrycksTest-anatomy|5": {
554
+ "hashes": {
555
+ "hash_examples": "2f83a4f1cab4ba18",
556
+ "hash_full_prompts": "516f74bef25df620",
557
+ "hash_input_tokens": "be31a1e22aef5f90",
558
+ "hash_cont_tokens": "f11971a765cb609f"
559
+ },
560
+ "truncated": 0,
561
+ "non-truncated": 540,
562
+ "padded": 540,
563
+ "non-padded": 0,
564
+ "effective_few_shots": 5.0,
565
+ "num_truncated_few_shots": 0
566
+ },
567
+ "harness|hendrycksTest-astronomy|5": {
568
+ "hashes": {
569
+ "hash_examples": "7d587b908da4d762",
570
+ "hash_full_prompts": "faf4e80f65de93ca",
571
+ "hash_input_tokens": "277a7b1fad566940",
572
+ "hash_cont_tokens": "bf30e5d3f48250cb"
573
+ },
574
+ "truncated": 0,
575
+ "non-truncated": 608,
576
+ "padded": 608,
577
+ "non-padded": 0,
578
+ "effective_few_shots": 5.0,
579
+ "num_truncated_few_shots": 0
580
+ },
581
+ "harness|hendrycksTest-business_ethics|5": {
582
+ "hashes": {
583
+ "hash_examples": "33e51740670de686",
584
+ "hash_full_prompts": "db01c3ef8e1479d4",
585
+ "hash_input_tokens": "ba552605bc116de5",
586
+ "hash_cont_tokens": "bc1dd9b2d995eb61"
587
+ },
588
+ "truncated": 0,
589
+ "non-truncated": 400,
590
+ "padded": 400,
591
+ "non-padded": 0,
592
+ "effective_few_shots": 5.0,
593
+ "num_truncated_few_shots": 0
594
+ },
595
+ "harness|hendrycksTest-clinical_knowledge|5": {
596
+ "hashes": {
597
+ "hash_examples": "f3366dbe7eefffa4",
598
+ "hash_full_prompts": "49654f71d94b65c3",
599
+ "hash_input_tokens": "428c7563d0b98ab9",
600
+ "hash_cont_tokens": "890a119624b3b935"
601
+ },
602
+ "truncated": 0,
603
+ "non-truncated": 1060,
604
+ "padded": 1060,
605
+ "non-padded": 0,
606
+ "effective_few_shots": 5.0,
607
+ "num_truncated_few_shots": 0
608
+ },
609
+ "harness|hendrycksTest-college_biology|5": {
610
+ "hashes": {
611
+ "hash_examples": "ca2b6753a0193e7f",
612
+ "hash_full_prompts": "2b460b75f1fdfefd",
613
+ "hash_input_tokens": "da036601573942e2",
614
+ "hash_cont_tokens": "875cde3af7a0ee14"
615
+ },
616
+ "truncated": 0,
617
+ "non-truncated": 576,
618
+ "padded": 576,
619
+ "non-padded": 0,
620
+ "effective_few_shots": 5.0,
621
+ "num_truncated_few_shots": 0
622
+ },
623
+ "harness|hendrycksTest-college_chemistry|5": {
624
+ "hashes": {
625
+ "hash_examples": "22ff85f1d34f42d1",
626
+ "hash_full_prompts": "242c9be6da583e95",
627
+ "hash_input_tokens": "94e0196d6aded13d",
628
+ "hash_cont_tokens": "50421e30bef398f9"
629
+ },
630
+ "truncated": 0,
631
+ "non-truncated": 400,
632
+ "padded": 400,
633
+ "non-padded": 0,
634
+ "effective_few_shots": 5.0,
635
+ "num_truncated_few_shots": 0
636
+ },
637
+ "harness|hendrycksTest-college_computer_science|5": {
638
+ "hashes": {
639
+ "hash_examples": "30318289d717a5cf",
640
+ "hash_full_prompts": "ed2bdb4e87c4b371",
641
+ "hash_input_tokens": "6e4d0f4a8d36690b",
642
+ "hash_cont_tokens": "ffc0fe414cdc4a83"
643
+ },
644
+ "truncated": 0,
645
+ "non-truncated": 400,
646
+ "padded": 400,
647
+ "non-padded": 0,
648
+ "effective_few_shots": 5.0,
649
+ "num_truncated_few_shots": 0
650
+ },
651
+ "harness|hendrycksTest-college_mathematics|5": {
652
+ "hashes": {
653
+ "hash_examples": "4944d1f0b6b5d911",
654
+ "hash_full_prompts": "770bc4281c973190",
655
+ "hash_input_tokens": "614054d17109a25d",
656
+ "hash_cont_tokens": "50421e30bef398f9"
657
+ },
658
+ "truncated": 0,
659
+ "non-truncated": 400,
660
+ "padded": 400,
661
+ "non-padded": 0,
662
+ "effective_few_shots": 5.0,
663
+ "num_truncated_few_shots": 0
664
+ },
665
+ "harness|hendrycksTest-college_medicine|5": {
666
+ "hashes": {
667
+ "hash_examples": "dd69cc33381275af",
668
+ "hash_full_prompts": "ad2a53e5250ab46e",
669
+ "hash_input_tokens": "081bb2b524defd1c",
670
+ "hash_cont_tokens": "1f88b00d41957d82"
671
+ },
672
+ "truncated": 0,
673
+ "non-truncated": 692,
674
+ "padded": 692,
675
+ "non-padded": 0,
676
+ "effective_few_shots": 5.0,
677
+ "num_truncated_few_shots": 0
678
+ },
679
+ "harness|hendrycksTest-college_physics|5": {
680
+ "hashes": {
681
+ "hash_examples": "875dd26d22655b0d",
682
+ "hash_full_prompts": "833a0d7b55aed500",
683
+ "hash_input_tokens": "5421d9a1af86cbd4",
684
+ "hash_cont_tokens": "f7b8097afc16a47c"
685
+ },
686
+ "truncated": 0,
687
+ "non-truncated": 408,
688
+ "padded": 408,
689
+ "non-padded": 0,
690
+ "effective_few_shots": 5.0,
691
+ "num_truncated_few_shots": 0
692
+ },
693
+ "harness|hendrycksTest-computer_security|5": {
694
+ "hashes": {
695
+ "hash_examples": "006451eedc0ededb",
696
+ "hash_full_prompts": "94034c97e85d8f46",
697
+ "hash_input_tokens": "5e6b70ecb333cf18",
698
+ "hash_cont_tokens": "50421e30bef398f9"
699
+ },
700
+ "truncated": 0,
701
+ "non-truncated": 400,
702
+ "padded": 400,
703
+ "non-padded": 0,
704
+ "effective_few_shots": 5.0,
705
+ "num_truncated_few_shots": 0
706
+ },
707
+ "harness|hendrycksTest-conceptual_physics|5": {
708
+ "hashes": {
709
+ "hash_examples": "8874ece872d2ca4c",
710
+ "hash_full_prompts": "e40d15a34640d6fa",
711
+ "hash_input_tokens": "c2ef11a87264ceed",
712
+ "hash_cont_tokens": "aa0e8bc655f2f641"
713
+ },
714
+ "truncated": 0,
715
+ "non-truncated": 940,
716
+ "padded": 940,
717
+ "non-padded": 0,
718
+ "effective_few_shots": 5.0,
719
+ "num_truncated_few_shots": 0
720
+ },
721
+ "harness|hendrycksTest-econometrics|5": {
722
+ "hashes": {
723
+ "hash_examples": "64d3623b0bfaa43f",
724
+ "hash_full_prompts": "612f340fae41338d",
725
+ "hash_input_tokens": "ecaccd912a4c3978",
726
+ "hash_cont_tokens": "bfb7e3c3c88313f1"
727
+ },
728
+ "truncated": 0,
729
+ "non-truncated": 456,
730
+ "padded": 456,
731
+ "non-padded": 0,
732
+ "effective_few_shots": 5.0,
733
+ "num_truncated_few_shots": 0
734
+ },
735
+ "harness|hendrycksTest-electrical_engineering|5": {
736
+ "hashes": {
737
+ "hash_examples": "e98f51780c674d7e",
738
+ "hash_full_prompts": "10275b312d812ae6",
739
+ "hash_input_tokens": "1590c84291399be8",
740
+ "hash_cont_tokens": "2425a3f084a591ef"
741
+ },
742
+ "truncated": 0,
743
+ "non-truncated": 580,
744
+ "padded": 580,
745
+ "non-padded": 0,
746
+ "effective_few_shots": 5.0,
747
+ "num_truncated_few_shots": 0
748
+ },
749
+ "harness|hendrycksTest-elementary_mathematics|5": {
750
+ "hashes": {
751
+ "hash_examples": "fc48208a5ac1c0ce",
752
+ "hash_full_prompts": "5ec274c6c82aca23",
753
+ "hash_input_tokens": "3269597f715b0da1",
754
+ "hash_cont_tokens": "f52691aef15a407b"
755
+ },
756
+ "truncated": 0,
757
+ "non-truncated": 1512,
758
+ "padded": 1512,
759
+ "non-padded": 0,
760
+ "effective_few_shots": 5.0,
761
+ "num_truncated_few_shots": 0
762
+ },
763
+ "harness|hendrycksTest-formal_logic|5": {
764
+ "hashes": {
765
+ "hash_examples": "5a6525665f63ea72",
766
+ "hash_full_prompts": "07b92638c4a6b500",
767
+ "hash_input_tokens": "a2800d20f3ab8d7c",
768
+ "hash_cont_tokens": "f515d598d9c21263"
769
+ },
770
+ "truncated": 0,
771
+ "non-truncated": 504,
772
+ "padded": 504,
773
+ "non-padded": 0,
774
+ "effective_few_shots": 5.0,
775
+ "num_truncated_few_shots": 0
776
+ },
777
+ "harness|hendrycksTest-global_facts|5": {
778
+ "hashes": {
779
+ "hash_examples": "371d70d743b2b89b",
780
+ "hash_full_prompts": "332fdee50a1921b4",
781
+ "hash_input_tokens": "94ed44b3772505ad",
782
+ "hash_cont_tokens": "50421e30bef398f9"
783
+ },
784
+ "truncated": 0,
785
+ "non-truncated": 400,
786
+ "padded": 400,
787
+ "non-padded": 0,
788
+ "effective_few_shots": 5.0,
789
+ "num_truncated_few_shots": 0
790
+ },
791
+ "harness|hendrycksTest-high_school_biology|5": {
792
+ "hashes": {
793
+ "hash_examples": "a79e1018b1674052",
794
+ "hash_full_prompts": "e624e26ede922561",
795
+ "hash_input_tokens": "24423acb928db768",
796
+ "hash_cont_tokens": "bd85a4156a3613ee"
797
+ },
798
+ "truncated": 0,
799
+ "non-truncated": 1240,
800
+ "padded": 1240,
801
+ "non-padded": 0,
802
+ "effective_few_shots": 5.0,
803
+ "num_truncated_few_shots": 0
804
+ },
805
+ "harness|hendrycksTest-high_school_chemistry|5": {
806
+ "hashes": {
807
+ "hash_examples": "44bfc25c389f0e03",
808
+ "hash_full_prompts": "0e3e5f5d9246482a",
809
+ "hash_input_tokens": "831ff35c474e5cef",
810
+ "hash_cont_tokens": "a95c97af1c14e068"
811
+ },
812
+ "truncated": 0,
813
+ "non-truncated": 812,
814
+ "padded": 812,
815
+ "non-padded": 0,
816
+ "effective_few_shots": 5.0,
817
+ "num_truncated_few_shots": 0
818
+ },
819
+ "harness|hendrycksTest-high_school_computer_science|5": {
820
+ "hashes": {
821
+ "hash_examples": "8b8cdb1084f24169",
822
+ "hash_full_prompts": "c00487e67c1813cc",
823
+ "hash_input_tokens": "a20a96b44dcc5b30",
824
+ "hash_cont_tokens": "8abfedef914e33c9"
825
+ },
826
+ "truncated": 0,
827
+ "non-truncated": 400,
828
+ "padded": 400,
829
+ "non-padded": 0,
830
+ "effective_few_shots": 5.0,
831
+ "num_truncated_few_shots": 0
832
+ },
833
+ "harness|hendrycksTest-high_school_european_history|5": {
834
+ "hashes": {
835
+ "hash_examples": "11cd32d0ef440171",
836
+ "hash_full_prompts": "318f4513c537c6bf",
837
+ "hash_input_tokens": "5002f4ac8b1562ca",
838
+ "hash_cont_tokens": "674fc454bdc5ac93"
839
+ },
840
+ "truncated": 0,
841
+ "non-truncated": 660,
842
+ "padded": 656,
843
+ "non-padded": 4,
844
+ "effective_few_shots": 5.0,
845
+ "num_truncated_few_shots": 0
846
+ },
847
+ "harness|hendrycksTest-high_school_geography|5": {
848
+ "hashes": {
849
+ "hash_examples": "b60019b9e80b642f",
850
+ "hash_full_prompts": "ee5789fcc1a81b1e",
851
+ "hash_input_tokens": "7c5547c7da5bc793",
852
+ "hash_cont_tokens": "03a5012b916274ea"
853
+ },
854
+ "truncated": 0,
855
+ "non-truncated": 792,
856
+ "padded": 792,
857
+ "non-padded": 0,
858
+ "effective_few_shots": 5.0,
859
+ "num_truncated_few_shots": 0
860
+ },
861
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
862
+ "hashes": {
863
+ "hash_examples": "d221ec983d143dc3",
864
+ "hash_full_prompts": "ac42d888e1ce1155",
865
+ "hash_input_tokens": "f62991cb6a496b05",
866
+ "hash_cont_tokens": "a83effb8f76b7d7c"
867
+ },
868
+ "truncated": 0,
869
+ "non-truncated": 772,
870
+ "padded": 772,
871
+ "non-padded": 0,
872
+ "effective_few_shots": 5.0,
873
+ "num_truncated_few_shots": 0
874
+ },
875
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
876
+ "hashes": {
877
+ "hash_examples": "59c2915cacfd3fbb",
878
+ "hash_full_prompts": "c6bd9d25158abd0e",
879
+ "hash_input_tokens": "4cef2aff6e3d59ed",
880
+ "hash_cont_tokens": "c583432ad27fcfe0"
881
+ },
882
+ "truncated": 0,
883
+ "non-truncated": 1560,
884
+ "padded": 1560,
885
+ "non-padded": 0,
886
+ "effective_few_shots": 5.0,
887
+ "num_truncated_few_shots": 0
888
+ },
889
+ "harness|hendrycksTest-high_school_mathematics|5": {
890
+ "hashes": {
891
+ "hash_examples": "1f8ac897608de342",
892
+ "hash_full_prompts": "5d88f41fc2d643a8",
893
+ "hash_input_tokens": "6e2577ea4082ed2b",
894
+ "hash_cont_tokens": "24f5dc613660300b"
895
+ },
896
+ "truncated": 0,
897
+ "non-truncated": 1080,
898
+ "padded": 1080,
899
+ "non-padded": 0,
900
+ "effective_few_shots": 5.0,
901
+ "num_truncated_few_shots": 0
902
+ },
903
+ "harness|hendrycksTest-high_school_microeconomics|5": {
904
+ "hashes": {
905
+ "hash_examples": "ead6a0f2f6c83370",
906
+ "hash_full_prompts": "bfc393381298609e",
907
+ "hash_input_tokens": "c5fc9aeb1079c8e4",
908
+ "hash_cont_tokens": "f47f041de50333b9"
909
+ },
910
+ "truncated": 0,
911
+ "non-truncated": 952,
912
+ "padded": 952,
913
+ "non-padded": 0,
914
+ "effective_few_shots": 5.0,
915
+ "num_truncated_few_shots": 0
916
+ },
917
+ "harness|hendrycksTest-high_school_physics|5": {
918
+ "hashes": {
919
+ "hash_examples": "c3f2025990afec64",
920
+ "hash_full_prompts": "fc78b4997e436734",
921
+ "hash_input_tokens": "555fc385cffa84ca",
922
+ "hash_cont_tokens": "ba2efcd283e938cc"
923
+ },
924
+ "truncated": 0,
925
+ "non-truncated": 604,
926
+ "padded": 604,
927
+ "non-padded": 0,
928
+ "effective_few_shots": 5.0,
929
+ "num_truncated_few_shots": 0
930
+ },
931
+ "harness|hendrycksTest-high_school_psychology|5": {
932
+ "hashes": {
933
+ "hash_examples": "21f8aab618f6d636",
934
+ "hash_full_prompts": "d5c76aa40b9dbc43",
935
+ "hash_input_tokens": "febd23cbf9973b7f",
936
+ "hash_cont_tokens": "942069cd363844d9"
937
+ },
938
+ "truncated": 0,
939
+ "non-truncated": 2180,
940
+ "padded": 2180,
941
+ "non-padded": 0,
942
+ "effective_few_shots": 5.0,
943
+ "num_truncated_few_shots": 0
944
+ },
945
+ "harness|hendrycksTest-high_school_statistics|5": {
946
+ "hashes": {
947
+ "hash_examples": "2386a60a11fc5de3",
948
+ "hash_full_prompts": "4c5c8be5aafac432",
949
+ "hash_input_tokens": "400e55b56ee6fbd7",
950
+ "hash_cont_tokens": "955ed42b6f7fa019"
951
+ },
952
+ "truncated": 0,
953
+ "non-truncated": 864,
954
+ "padded": 864,
955
+ "non-padded": 0,
956
+ "effective_few_shots": 5.0,
957
+ "num_truncated_few_shots": 0
958
+ },
959
+ "harness|hendrycksTest-high_school_us_history|5": {
960
+ "hashes": {
961
+ "hash_examples": "74961543be40f04f",
962
+ "hash_full_prompts": "5d5ca4840131ba21",
963
+ "hash_input_tokens": "c639cce12a46ebad",
964
+ "hash_cont_tokens": "cdd0b3dc06d933e5"
965
+ },
966
+ "truncated": 0,
967
+ "non-truncated": 816,
968
+ "padded": 816,
969
+ "non-padded": 0,
970
+ "effective_few_shots": 5.0,
971
+ "num_truncated_few_shots": 0
972
+ },
973
+ "harness|hendrycksTest-high_school_world_history|5": {
974
+ "hashes": {
975
+ "hash_examples": "2ad2f6b7198b2234",
976
+ "hash_full_prompts": "11845057459afd72",
977
+ "hash_input_tokens": "b9762065cce6f3a6",
978
+ "hash_cont_tokens": "9a864184946033ac"
979
+ },
980
+ "truncated": 0,
981
+ "non-truncated": 948,
982
+ "padded": 948,
983
+ "non-padded": 0,
984
+ "effective_few_shots": 5.0,
985
+ "num_truncated_few_shots": 0
986
+ },
987
+ "harness|hendrycksTest-human_aging|5": {
988
+ "hashes": {
989
+ "hash_examples": "1a7199dc733e779b",
990
+ "hash_full_prompts": "756b9096b8eaf892",
991
+ "hash_input_tokens": "541a75f071dcf579",
992
+ "hash_cont_tokens": "142a4a8a1138a214"
993
+ },
994
+ "truncated": 0,
995
+ "non-truncated": 892,
996
+ "padded": 892,
997
+ "non-padded": 0,
998
+ "effective_few_shots": 5.0,
999
+ "num_truncated_few_shots": 0
1000
+ },
1001
+ "harness|hendrycksTest-human_sexuality|5": {
1002
+ "hashes": {
1003
+ "hash_examples": "7acb8fdad97f88a6",
1004
+ "hash_full_prompts": "731a52ff15b8cfdb",
1005
+ "hash_input_tokens": "04269e5c5a257dd9",
1006
+ "hash_cont_tokens": "bc54813e809b796d"
1007
+ },
1008
+ "truncated": 0,
1009
+ "non-truncated": 524,
1010
+ "padded": 524,
1011
+ "non-padded": 0,
1012
+ "effective_few_shots": 5.0,
1013
+ "num_truncated_few_shots": 0
1014
+ },
1015
+ "harness|hendrycksTest-international_law|5": {
1016
+ "hashes": {
1017
+ "hash_examples": "1300bfd0dfc59114",
1018
+ "hash_full_prompts": "db2aefbff5eec996",
1019
+ "hash_input_tokens": "d93ba9d9d38e4397",
1020
+ "hash_cont_tokens": "dc45b45fcda18e5d"
1021
+ },
1022
+ "truncated": 0,
1023
+ "non-truncated": 484,
1024
+ "padded": 484,
1025
+ "non-padded": 0,
1026
+ "effective_few_shots": 5.0,
1027
+ "num_truncated_few_shots": 0
1028
+ },
1029
+ "harness|hendrycksTest-jurisprudence|5": {
1030
+ "hashes": {
1031
+ "hash_examples": "083b1e4904c48dc2",
1032
+ "hash_full_prompts": "0f89ee3fe03d6a21",
1033
+ "hash_input_tokens": "9eeaccd2698b4f5a",
1034
+ "hash_cont_tokens": "e3a8cd951b6e3469"
1035
+ },
1036
+ "truncated": 0,
1037
+ "non-truncated": 432,
1038
+ "padded": 432,
1039
+ "non-padded": 0,
1040
+ "effective_few_shots": 5.0,
1041
+ "num_truncated_few_shots": 0
1042
+ },
1043
+ "harness|hendrycksTest-logical_fallacies|5": {
1044
+ "hashes": {
1045
+ "hash_examples": "709128f9926a634c",
1046
+ "hash_full_prompts": "98a04b1f8f841069",
1047
+ "hash_input_tokens": "b4f08f544f2b7576",
1048
+ "hash_cont_tokens": "1e80dbd30f6453d5"
1049
+ },
1050
+ "truncated": 0,
1051
+ "non-truncated": 652,
1052
+ "padded": 648,
1053
+ "non-padded": 4,
1054
+ "effective_few_shots": 5.0,
1055
+ "num_truncated_few_shots": 0
1056
+ },
1057
+ "harness|hendrycksTest-machine_learning|5": {
1058
+ "hashes": {
1059
+ "hash_examples": "88f22a636029ae47",
1060
+ "hash_full_prompts": "2e1c8d4b1e0cc921",
1061
+ "hash_input_tokens": "900c2a51f1174b9f",
1062
+ "hash_cont_tokens": "9b37da7777378ca9"
1063
+ },
1064
+ "truncated": 0,
1065
+ "non-truncated": 448,
1066
+ "padded": 448,
1067
+ "non-padded": 0,
1068
+ "effective_few_shots": 5.0,
1069
+ "num_truncated_few_shots": 0
1070
+ },
1071
+ "harness|hendrycksTest-management|5": {
1072
+ "hashes": {
1073
+ "hash_examples": "8c8a1e07a2151dca",
1074
+ "hash_full_prompts": "f51611f514b265b0",
1075
+ "hash_input_tokens": "6b36efb4689c6eca",
1076
+ "hash_cont_tokens": "a01d6d39a83c4597"
1077
+ },
1078
+ "truncated": 0,
1079
+ "non-truncated": 412,
1080
+ "padded": 412,
1081
+ "non-padded": 0,
1082
+ "effective_few_shots": 5.0,
1083
+ "num_truncated_few_shots": 0
1084
+ },
1085
+ "harness|hendrycksTest-marketing|5": {
1086
+ "hashes": {
1087
+ "hash_examples": "2668953431f91e96",
1088
+ "hash_full_prompts": "77562bef997c7650",
1089
+ "hash_input_tokens": "2aaac78a0cfed47a",
1090
+ "hash_cont_tokens": "6aeaed4d823c98aa"
1091
+ },
1092
+ "truncated": 0,
1093
+ "non-truncated": 936,
1094
+ "padded": 936,
1095
+ "non-padded": 0,
1096
+ "effective_few_shots": 5.0,
1097
+ "num_truncated_few_shots": 0
1098
+ },
1099
+ "harness|hendrycksTest-medical_genetics|5": {
1100
+ "hashes": {
1101
+ "hash_examples": "9c2dda34a2ea4fd2",
1102
+ "hash_full_prompts": "202139046daa118f",
1103
+ "hash_input_tokens": "886ca823b41c094a",
1104
+ "hash_cont_tokens": "50421e30bef398f9"
1105
+ },
1106
+ "truncated": 0,
1107
+ "non-truncated": 400,
1108
+ "padded": 400,
1109
+ "non-padded": 0,
1110
+ "effective_few_shots": 5.0,
1111
+ "num_truncated_few_shots": 0
1112
+ },
1113
+ "harness|hendrycksTest-miscellaneous|5": {
1114
+ "hashes": {
1115
+ "hash_examples": "41adb694024809c2",
1116
+ "hash_full_prompts": "bffec9fc237bcf93",
1117
+ "hash_input_tokens": "72fd71de7675e7d0",
1118
+ "hash_cont_tokens": "9b0ab02a64603081"
1119
+ },
1120
+ "truncated": 0,
1121
+ "non-truncated": 3132,
1122
+ "padded": 3132,
1123
+ "non-padded": 0,
1124
+ "effective_few_shots": 5.0,
1125
+ "num_truncated_few_shots": 0
1126
+ },
1127
+ "harness|hendrycksTest-moral_disputes|5": {
1128
+ "hashes": {
1129
+ "hash_examples": "3171c13ba3c594c4",
1130
+ "hash_full_prompts": "170831fc36f1d59e",
1131
+ "hash_input_tokens": "f3ca0dd8e7a1eb09",
1132
+ "hash_cont_tokens": "8badf768f7b0467a"
1133
+ },
1134
+ "truncated": 0,
1135
+ "non-truncated": 1384,
1136
+ "padded": 1354,
1137
+ "non-padded": 30,
1138
+ "effective_few_shots": 5.0,
1139
+ "num_truncated_few_shots": 0
1140
+ },
1141
+ "harness|hendrycksTest-moral_scenarios|5": {
1142
+ "hashes": {
1143
+ "hash_examples": "9873e077e83e0546",
1144
+ "hash_full_prompts": "08f4ceba3131a068",
1145
+ "hash_input_tokens": "3e793631e951f23c",
1146
+ "hash_cont_tokens": "32ae620376b2bbba"
1147
+ },
1148
+ "truncated": 0,
1149
+ "non-truncated": 3580,
1150
+ "padded": 3580,
1151
+ "non-padded": 0,
1152
+ "effective_few_shots": 5.0,
1153
+ "num_truncated_few_shots": 0
1154
+ },
1155
+ "harness|hendrycksTest-nutrition|5": {
1156
+ "hashes": {
1157
+ "hash_examples": "7db1d8142ec14323",
1158
+ "hash_full_prompts": "4c0e68e3586cb453",
1159
+ "hash_input_tokens": "59753c2144ea93af",
1160
+ "hash_cont_tokens": "3071def75bacc404"
1161
+ },
1162
+ "truncated": 0,
1163
+ "non-truncated": 1224,
1164
+ "padded": 1224,
1165
+ "non-padded": 0,
1166
+ "effective_few_shots": 5.0,
1167
+ "num_truncated_few_shots": 0
1168
+ },
1169
+ "harness|hendrycksTest-philosophy|5": {
1170
+ "hashes": {
1171
+ "hash_examples": "9b455b7d72811cc8",
1172
+ "hash_full_prompts": "e467f822d8a0d3ff",
1173
+ "hash_input_tokens": "bd8d3dbed15a8c34",
1174
+ "hash_cont_tokens": "9f6ff69d23a48783"
1175
+ },
1176
+ "truncated": 0,
1177
+ "non-truncated": 1244,
1178
+ "padded": 1244,
1179
+ "non-padded": 0,
1180
+ "effective_few_shots": 5.0,
1181
+ "num_truncated_few_shots": 0
1182
+ },
1183
+ "harness|hendrycksTest-prehistory|5": {
1184
+ "hashes": {
1185
+ "hash_examples": "8be90d0f538f1560",
1186
+ "hash_full_prompts": "152187949bcd0921",
1187
+ "hash_input_tokens": "3573cd87facbb7c5",
1188
+ "hash_cont_tokens": "de469d2b981e32a3"
1189
+ },
1190
+ "truncated": 0,
1191
+ "non-truncated": 1296,
1192
+ "padded": 1296,
1193
+ "non-padded": 0,
1194
+ "effective_few_shots": 5.0,
1195
+ "num_truncated_few_shots": 0
1196
+ },
1197
+ "harness|hendrycksTest-professional_accounting|5": {
1198
+ "hashes": {
1199
+ "hash_examples": "8d377597916cd07e",
1200
+ "hash_full_prompts": "0eb7345d6144ee0d",
1201
+ "hash_input_tokens": "17e721bc1a7cbb47",
1202
+ "hash_cont_tokens": "c46f74d2dfc7b13b"
1203
+ },
1204
+ "truncated": 0,
1205
+ "non-truncated": 1128,
1206
+ "padded": 1128,
1207
+ "non-padded": 0,
1208
+ "effective_few_shots": 5.0,
1209
+ "num_truncated_few_shots": 0
1210
+ },
1211
+ "harness|hendrycksTest-professional_law|5": {
1212
+ "hashes": {
1213
+ "hash_examples": "cd9dbc52b3c932d6",
1214
+ "hash_full_prompts": "36ac764272bfb182",
1215
+ "hash_input_tokens": "c9f7583fff66d361",
1216
+ "hash_cont_tokens": "2e590029ef41fbcd"
1217
+ },
1218
+ "truncated": 0,
1219
+ "non-truncated": 6136,
1220
+ "padded": 6136,
1221
+ "non-padded": 0,
1222
+ "effective_few_shots": 5.0,
1223
+ "num_truncated_few_shots": 0
1224
+ },
1225
+ "harness|hendrycksTest-professional_medicine|5": {
1226
+ "hashes": {
1227
+ "hash_examples": "b20e4e816c1e383e",
1228
+ "hash_full_prompts": "7b8d69ea2acaf2f7",
1229
+ "hash_input_tokens": "40a933f829116f8d",
1230
+ "hash_cont_tokens": "fe35cfa9c6ca802e"
1231
+ },
1232
+ "truncated": 0,
1233
+ "non-truncated": 1088,
1234
+ "padded": 1088,
1235
+ "non-padded": 0,
1236
+ "effective_few_shots": 5.0,
1237
+ "num_truncated_few_shots": 0
1238
+ },
1239
+ "harness|hendrycksTest-professional_psychology|5": {
1240
+ "hashes": {
1241
+ "hash_examples": "d45b73b22f9cc039",
1242
+ "hash_full_prompts": "fe8937e9ffc99771",
1243
+ "hash_input_tokens": "0dfb73a8eb3f692c",
1244
+ "hash_cont_tokens": "f020fbddf72c8652"
1245
+ },
1246
+ "truncated": 0,
1247
+ "non-truncated": 2448,
1248
+ "padded": 2448,
1249
+ "non-padded": 0,
1250
+ "effective_few_shots": 5.0,
1251
+ "num_truncated_few_shots": 0
1252
+ },
1253
+ "harness|hendrycksTest-public_relations|5": {
1254
+ "hashes": {
1255
+ "hash_examples": "0d25072e1761652a",
1256
+ "hash_full_prompts": "f9adc39cfa9f42ba",
1257
+ "hash_input_tokens": "1710c6ba4c9f3cbd",
1258
+ "hash_cont_tokens": "568f585a259965c1"
1259
+ },
1260
+ "truncated": 0,
1261
+ "non-truncated": 440,
1262
+ "padded": 440,
1263
+ "non-padded": 0,
1264
+ "effective_few_shots": 5.0,
1265
+ "num_truncated_few_shots": 0
1266
+ },
1267
+ "harness|hendrycksTest-security_studies|5": {
1268
+ "hashes": {
1269
+ "hash_examples": "62bb8197e63d60d4",
1270
+ "hash_full_prompts": "869c9c3ae196b7c3",
1271
+ "hash_input_tokens": "32a03f1f22a6e103",
1272
+ "hash_cont_tokens": "cc6fd7cccd64cd5d"
1273
+ },
1274
+ "truncated": 0,
1275
+ "non-truncated": 980,
1276
+ "padded": 980,
1277
+ "non-padded": 0,
1278
+ "effective_few_shots": 5.0,
1279
+ "num_truncated_few_shots": 0
1280
+ },
1281
+ "harness|hendrycksTest-sociology|5": {
1282
+ "hashes": {
1283
+ "hash_examples": "e7959df87dea8672",
1284
+ "hash_full_prompts": "1a1fc00e17b3a52a",
1285
+ "hash_input_tokens": "828999f7624cbe7e",
1286
+ "hash_cont_tokens": "c3a3bdfd177eed5b"
1287
+ },
1288
+ "truncated": 0,
1289
+ "non-truncated": 804,
1290
+ "padded": 804,
1291
+ "non-padded": 0,
1292
+ "effective_few_shots": 5.0,
1293
+ "num_truncated_few_shots": 0
1294
+ },
1295
+ "harness|hendrycksTest-us_foreign_policy|5": {
1296
+ "hashes": {
1297
+ "hash_examples": "4a56a01ddca44dca",
1298
+ "hash_full_prompts": "0c7a7081c71c07b6",
1299
+ "hash_input_tokens": "42054621e718dbee",
1300
+ "hash_cont_tokens": "2568d0e8e36fa959"
1301
+ },
1302
+ "truncated": 0,
1303
+ "non-truncated": 400,
1304
+ "padded": 400,
1305
+ "non-padded": 0,
1306
+ "effective_few_shots": 5.0,
1307
+ "num_truncated_few_shots": 0
1308
+ },
1309
+ "harness|hendrycksTest-virology|5": {
1310
+ "hashes": {
1311
+ "hash_examples": "451cc86a8c4f4fe9",
1312
+ "hash_full_prompts": "01e95325d8b738e4",
1313
+ "hash_input_tokens": "6c4f0aa4dc859c04",
1314
+ "hash_cont_tokens": "926cf60b0891f374"
1315
+ },
1316
+ "truncated": 0,
1317
+ "non-truncated": 664,
1318
+ "padded": 664,
1319
+ "non-padded": 0,
1320
+ "effective_few_shots": 5.0,
1321
+ "num_truncated_few_shots": 0
1322
+ },
1323
+ "harness|hendrycksTest-world_religions|5": {
1324
+ "hashes": {
1325
+ "hash_examples": "3b29cfaf1a81c379",
1326
+ "hash_full_prompts": "e0d79a15083dfdff",
1327
+ "hash_input_tokens": "6c75d44e092ff24f",
1328
+ "hash_cont_tokens": "c525a5de974c1ea3"
1329
+ },
1330
+ "truncated": 0,
1331
+ "non-truncated": 684,
1332
+ "padded": 684,
1333
+ "non-padded": 0,
1334
+ "effective_few_shots": 5.0,
1335
+ "num_truncated_few_shots": 0
1336
+ },
1337
+ "harness|truthfulqa:mc|0": {
1338
+ "hashes": {
1339
+ "hash_examples": "23176c0531c7b867",
1340
+ "hash_full_prompts": "36a6d90e75d92d4a",
1341
+ "hash_input_tokens": "2738d7ed7075faa7",
1342
+ "hash_cont_tokens": "c014154380b74b9e"
1343
+ },
1344
+ "truncated": 0,
1345
+ "non-truncated": 9996,
1346
+ "padded": 9996,
1347
+ "non-padded": 0,
1348
+ "effective_few_shots": 0.0,
1349
+ "num_truncated_few_shots": 0
1350
+ }
1351
+ },
1352
+ "summary_general": {
1353
+ "hashes": {
1354
+ "hash_examples": "d84d18e9a963753d",
1355
+ "hash_full_prompts": "12b540783521a8e6",
1356
+ "hash_input_tokens": "5c73a7dce6ccf737",
1357
+ "hash_cont_tokens": "fb1646e2bdd5fc38"
1358
+ },
1359
+ "total_evaluation_time_secondes": "19912.24178814888",
1360
+ "truncated": 0,
1361
+ "non-truncated": 111019,
1362
+ "padded": 110926,
1363
+ "non-padded": 93,
1364
+ "num_truncated_few_shots": 0
1365
+ }
1366
+ }
meta-llama/Llama-2-7b-hf/results_2023-09-07T13-40-06.600532.json ADDED
@@ -0,0 +1,61 @@
1
+ {
2
+ "config_general": {
3
+ "model_name": "meta-llama/Llama-2-7b-hf",
4
+ "model_sha": "6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
5
+ "model_size": "12.61 GB",
6
+ "model_dtype": "torch.float16",
7
+ "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
8
+ "num_few_shot_default": 0,
9
+ "num_fewshot_seeds": 1,
10
+ "override_batch_size": 1,
11
+ "max_samples": null,
12
+ "job_id": ""
13
+ },
14
+ "results": {
15
+ "harness|winogrande|5": {
16
+ "acc": 0.7403314917127072,
17
+ "acc_stderr": 0.012322700705552667
18
+ },
19
+ "all": {
20
+ "acc": 0.7403314917127072,
21
+ "acc_stderr": 0.012322700705552667
22
+ }
23
+ },
24
+ "versions": {
25
+ "harness|winogrande|5": 0,
26
+ "all": 0
27
+ },
28
+ "config_tasks": {
29
+ "harness|winogrande": "LM Harness task"
30
+ },
31
+ "summary_tasks": {
32
+ "harness|winogrande|5": {
33
+ "hashes": {
34
+ "hash_examples": "aada0a176fd81218",
35
+ "hash_full_prompts": "c8655cbd12de8409",
36
+ "hash_input_tokens": "c0bedf98cb040854",
37
+ "hash_cont_tokens": "f08975ad6f2d5864"
38
+ },
39
+ "truncated": 0,
40
+ "non-truncated": 2534,
41
+ "padded": 2432,
42
+ "non-padded": 102,
43
+ "effective_few_shots": 5.0,
44
+ "num_truncated_few_shots": 0
45
+ }
46
+ },
47
+ "summary_general": {
48
+ "hashes": {
49
+ "hash_examples": "42f54c7ae3f28ef3",
50
+ "hash_full_prompts": "897c968b27a8c59a",
51
+ "hash_input_tokens": "ee5c3cb253d643d1",
52
+ "hash_cont_tokens": "273a70958f734c00"
53
+ },
54
+ "total_evaluation_time_secondes": "91.96187782287598",
55
+ "truncated": 0,
56
+ "non-truncated": 2534,
57
+ "padded": 2432,
58
+ "non-padded": 102,
59
+ "num_truncated_few_shots": 0
60
+ }
61
+ }
meta-llama/Llama-2-7b-hf/results_2023-09-08T17-00-44.389859.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "config_general": {
3
+ "model_name": "meta-llama/Llama-2-7b-hf",
4
+ "model_sha": "6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
5
+ "model_size": "12.61 GB",
6
+ "model_dtype": "torch.float16",
7
+ "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
8
+ "num_few_shot_default": 0,
9
+ "num_fewshot_seeds": 1,
10
+ "override_batch_size": 1,
11
+ "max_samples": null,
12
+ "job_id": ""
13
+ },
14
+ "results": {
15
+ "harness|drop|3": {
16
+ "em": 0.0012583892617449664,
17
+ "em_stderr": 0.00036305608931194434,
18
+ "f1": 0.055925964765100665,
19
+ "f1_stderr": 0.0013181664771628632
20
+ },
21
+ "harness|gsm8k|5": {
22
+ "acc": 0.0712661106899166,
23
+ "acc_stderr": 0.007086462127954491
24
+ },
25
+ "harness|winogrande|5": {
26
+ "acc": 0.7403314917127072,
27
+ "acc_stderr": 0.012322700705552667
28
+ },
29
+ "all": {
30
+ "em": 0.0012583892617449664,
31
+ "em_stderr": 0.00036305608931194434,
32
+ "f1": 0.055925964765100665,
33
+ "f1_stderr": 0.0013181664771628632,
34
+ "acc": 0.4057988012013119,
35
+ "acc_stderr": 0.00970458141675358
36
+ }
37
+ },
38
+ "versions": {
39
+ "harness|drop|3": 1,
40
+ "harness|gsm8k|5": 0,
41
+ "harness|winogrande|5": 0,
42
+ "all": 0
43
+ },
44
+ "config_tasks": {
45
+ "harness|drop": "LM Harness task",
46
+ "harness|gsm8k": "LM Harness task",
47
+ "harness|winogrande": "LM Harness task"
48
+ },
49
+ "summary_tasks": {
50
+ "harness|drop|3": {
51
+ "hashes": {
52
+ "hash_examples": "1d27416e8324e9a3",
53
+ "hash_full_prompts": "a5513ff9a741b385",
54
+ "hash_input_tokens": "42076f0efbb50aa6",
55
+ "hash_cont_tokens": "ef74ade15eb78da6"
56
+ },
57
+ "truncated": 3,
58
+ "non-truncated": 9533,
59
+ "padded": 0,
60
+ "non-padded": 9536,
61
+ "effective_few_shots": 3.0,
62
+ "num_truncated_few_shots": 0
63
+ },
64
+ "harness|gsm8k|5": {
65
+ "hashes": {
66
+ "hash_examples": "4c0843a5d99bcfdc",
67
+ "hash_full_prompts": "41d55e83abc0e02d",
68
+ "hash_input_tokens": "bda342e47b5099b2",
69
+ "hash_cont_tokens": "542d7b742ca594d0"
70
+ },
71
+ "truncated": 0,
72
+ "non-truncated": 1319,
73
+ "padded": 0,
74
+ "non-padded": 1319,
75
+ "effective_few_shots": 5.0,
76
+ "num_truncated_few_shots": 0
77
+ },
78
+ "harness|winogrande|5": {
79
+ "hashes": {
80
+ "hash_examples": "aada0a176fd81218",
81
+ "hash_full_prompts": "c8655cbd12de8409",
82
+ "hash_input_tokens": "c0bedf98cb040854",
83
+ "hash_cont_tokens": "f08975ad6f2d5864"
84
+ },
85
+ "truncated": 0,
86
+ "non-truncated": 2534,
87
+ "padded": 2432,
88
+ "non-padded": 102,
89
+ "effective_few_shots": 5.0,
90
+ "num_truncated_few_shots": 0
91
+ }
92
+ },
93
+ "summary_general": {
94
+ "hashes": {
95
+ "hash_examples": "9b4d8993161e637d",
96
+ "hash_full_prompts": "08215e527b7e60a5",
97
+ "hash_input_tokens": "a12f3e3c934bd78b",
98
+ "hash_cont_tokens": "58a2c19976e6dde8"
99
+ },
100
+ "total_evaluation_time_secondes": "4621.534999847412",
101
+ "truncated": 3,
102
+ "non-truncated": 13386,
103
+ "padded": 2432,
104
+ "non-padded": 10957,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ }
meta-llama/Llama-2-7b-hf/results_2023-09-09T12-32-30.613622.json ADDED
@@ -0,0 +1,107 @@
+ {
+ "config_general": {
+ "model_name": "meta-llama/Llama-2-7b-hf",
+ "model_sha": "6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
+ "model_size": "3.57 GB",
+ "model_dtype": "4bit",
+ "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9",
+ "num_few_shot_default": 0,
+ "num_fewshot_seeds": 1,
+ "override_batch_size": 1,
+ "max_samples": null,
+ "job_id": ""
+ },
+ "results": {
+ "harness|drop|3": {
+ "em": 0.0010486577181208054,
+ "em_stderr": 0.00033145814652191404,
+ "f1": 0.05131291946308739,
+ "f1_stderr": 0.0012542058656851648
+ },
+ "harness|gsm8k|5": {
+ "acc": 0.053828658074298714,
+ "acc_stderr": 0.006216328640238123
+ },
+ "harness|winogrande|5": {
+ "acc": 0.7458563535911602,
+ "acc_stderr": 0.012236307219708262
+ },
+ "all": {
+ "em": 0.0010486577181208054,
+ "em_stderr": 0.00033145814652191404,
+ "f1": 0.05131291946308739,
+ "f1_stderr": 0.0012542058656851648,
+ "acc": 0.39984250583272946,
+ "acc_stderr": 0.009226317929973193
+ }
+ },
+ "versions": {
+ "harness|drop|3": 1,
+ "harness|gsm8k|5": 0,
+ "harness|winogrande|5": 0,
+ "all": 0
+ },
+ "config_tasks": {
+ "harness|drop": "LM Harness task",
+ "harness|gsm8k": "LM Harness task",
+ "harness|winogrande": "LM Harness task"
+ },
+ "summary_tasks": {
+ "harness|drop|3": {
+ "hashes": {
+ "hash_examples": "1d27416e8324e9a3",
+ "hash_full_prompts": "a5513ff9a741b385",
+ "hash_input_tokens": "42076f0efbb50aa6",
+ "hash_cont_tokens": "586467d69620d89a"
+ },
+ "truncated": 3,
+ "non-truncated": 9533,
+ "padded": 0,
+ "non-padded": 9536,
+ "effective_few_shots": 3.0,
+ "num_truncated_few_shots": 0
+ },
+ "harness|gsm8k|5": {
+ "hashes": {
+ "hash_examples": "4c0843a5d99bcfdc",
+ "hash_full_prompts": "41d55e83abc0e02d",
+ "hash_input_tokens": "bda342e47b5099b2",
+ "hash_cont_tokens": "d10e414c90dc2a07"
+ },
+ "truncated": 0,
+ "non-truncated": 1319,
+ "padded": 0,
+ "non-padded": 1319,
+ "effective_few_shots": 5.0,
+ "num_truncated_few_shots": 0
+ },
+ "harness|winogrande|5": {
+ "hashes": {
+ "hash_examples": "aada0a176fd81218",
+ "hash_full_prompts": "c8655cbd12de8409",
+ "hash_input_tokens": "c0bedf98cb040854",
+ "hash_cont_tokens": "f08975ad6f2d5864"
+ },
+ "truncated": 0,
+ "non-truncated": 2534,
+ "padded": 2432,
+ "non-padded": 102,
+ "effective_few_shots": 5.0,
+ "num_truncated_few_shots": 0
+ }
+ },
+ "summary_general": {
+ "hashes": {
+ "hash_examples": "9b4d8993161e637d",
+ "hash_full_prompts": "08215e527b7e60a5",
+ "hash_input_tokens": "a12f3e3c934bd78b",
+ "hash_cont_tokens": "a4df4eaec2b09094"
+ },
+ "total_evaluation_time_secondes": "8584.129543542862",
+ "truncated": 3,
+ "non-truncated": 13386,
+ "padded": 2432,
+ "non-padded": 10957,
+ "num_truncated_few_shots": 0
+ }
+ }
meta-llama/Llama-2-7b-hf/results_2023-09-20T14-39-46.791628.json ADDED
@@ -0,0 +1,107 @@
+ {
+ "config_general": {
+ "model_name": "meta-llama/Llama-2-7b-hf",
+ "model_sha": "6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
+ "model_size": "12.61 GB",
+ "model_dtype": "torch.float16",
+ "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
+ "num_few_shot_default": 0,
+ "num_fewshot_seeds": 1,
+ "override_batch_size": 1,
+ "max_samples": null,
+ "job_id": ""
+ },
+ "results": {
+ "harness|drop|3": {
+ "em": 0.0012583892617449664,
+ "em_stderr": 0.00036305608931194434,
+ "f1": 0.055925964765100665,
+ "f1_stderr": 0.0013181664771628632
+ },
+ "harness|gsm8k|5": {
+ "acc": 0.0712661106899166,
+ "acc_stderr": 0.007086462127954491
+ },
+ "harness|winogrande|5": {
+ "acc": 0.7403314917127072,
+ "acc_stderr": 0.012322700705552667
+ },
+ "all": {
+ "em": 0.0012583892617449664,
+ "em_stderr": 0.00036305608931194434,
+ "f1": 0.055925964765100665,
+ "f1_stderr": 0.0013181664771628632,
+ "acc": 0.4057988012013119,
+ "acc_stderr": 0.00970458141675358
+ }
+ },
+ "versions": {
+ "harness|drop|3": 1,
+ "harness|gsm8k|5": 0,
+ "harness|winogrande|5": 0,
+ "all": 0
+ },
+ "config_tasks": {
+ "harness|drop": "LM Harness task",
+ "harness|gsm8k": "LM Harness task",
+ "harness|winogrande": "LM Harness task"
+ },
+ "summary_tasks": {
+ "harness|drop|3": {
+ "hashes": {
+ "hash_examples": "1d27416e8324e9a3",
+ "hash_full_prompts": "a5513ff9a741b385",
+ "hash_input_tokens": "42076f0efbb50aa6",
+ "hash_cont_tokens": "ef74ade15eb78da6"
+ },
+ "truncated": 3,
+ "non-truncated": 9533,
+ "padded": 0,
+ "non-padded": 9536,
+ "effective_few_shots": 3.0,
+ "num_truncated_few_shots": 0
+ },
+ "harness|gsm8k|5": {
+ "hashes": {
+ "hash_examples": "4c0843a5d99bcfdc",
+ "hash_full_prompts": "41d55e83abc0e02d",
+ "hash_input_tokens": "bda342e47b5099b2",
+ "hash_cont_tokens": "542d7b742ca594d0"
+ },
+ "truncated": 0,
+ "non-truncated": 1319,
+ "padded": 0,
+ "non-padded": 1319,
+ "effective_few_shots": 5.0,
+ "num_truncated_few_shots": 0
+ },
+ "harness|winogrande|5": {
+ "hashes": {
+ "hash_examples": "aada0a176fd81218",
+ "hash_full_prompts": "c8655cbd12de8409",
+ "hash_input_tokens": "c0bedf98cb040854",
+ "hash_cont_tokens": "f08975ad6f2d5864"
+ },
+ "truncated": 0,
+ "non-truncated": 2534,
+ "padded": 2432,
+ "non-padded": 102,
+ "effective_few_shots": 5.0,
+ "num_truncated_few_shots": 0
+ }
+ },
+ "summary_general": {
+ "hashes": {
+ "hash_examples": "9b4d8993161e637d",
+ "hash_full_prompts": "08215e527b7e60a5",
+ "hash_input_tokens": "a12f3e3c934bd78b",
+ "hash_cont_tokens": "58a2c19976e6dde8"
+ },
+ "total_evaluation_time_secondes": "9738.608674764633",
+ "truncated": 3,
+ "non-truncated": 13386,
+ "padded": 2432,
+ "non-padded": 10957,
+ "num_truncated_few_shots": 0
+ }
+ }