lomahony commited on
Commit
81330f2
1 Parent(s): e1a8190

Upload 2 files

Browse files
dpo-410m-eval-files/EleutherAI-pythia-410m-0shot-shelloutput.txt ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bootstrapping for stddev: perplexity
2
+ {
3
+ "results": {
4
+ "arc_challenge": {
5
+ "acc,none": 0.21416382252559726,
6
+ "acc_stderr,none": 0.011988383205966515,
7
+ "acc_norm,none": 0.2431740614334471,
8
+ "acc_norm_stderr,none": 0.012536554144587084
9
+ },
10
+ "arc_easy": {
11
+ "acc,none": 0.5189393939393939,
12
+ "acc_stderr,none": 0.01025242049689449,
13
+ "acc_norm,none": 0.45707070707070707,
14
+ "acc_norm_stderr,none": 0.010221897564256049
15
+ },
16
+ "boolq": {
17
+ "acc,none": 0.6058103975535168,
18
+ "acc_stderr,none": 0.008546995661233639
19
+ },
20
+ "hellaswag": {
21
+ "acc,none": 0.33718382792272455,
22
+ "acc_stderr,none": 0.004717820714968757,
23
+ "acc_norm,none": 0.4060944035052778,
24
+ "acc_norm_stderr,none": 0.004900988997414242
25
+ },
26
+ "lambada_openai": {
27
+ "perplexity,none": 10.780459714601333,
28
+ "perplexity_stderr,none": 0.32049412467424027,
29
+ "acc,none": 0.5163982146322531,
30
+ "acc_stderr,none": 0.006962230326368326
31
+ },
32
+ "openbookqa": {
33
+ "acc,none": 0.182,
34
+ "acc_stderr,none": 0.01727277329773045,
35
+ "acc_norm,none": 0.294,
36
+ "acc_norm_stderr,none": 0.020395095484936624
37
+ },
38
+ "piqa": {
39
+ "acc,none": 0.6670293797606094,
40
+ "acc_stderr,none": 0.010995648822619082,
41
+ "acc_norm,none": 0.6719260065288357,
42
+ "acc_norm_stderr,none": 0.010954487135124227
43
+ },
44
+ "sciq": {
45
+ "acc,none": 0.815,
46
+ "acc_stderr,none": 0.012285191326386667,
47
+ "acc_norm,none": 0.725,
48
+ "acc_norm_stderr,none": 0.014127086556490528
49
+ },
50
+ "wikitext": {
51
+ "word_perplexity,none": 34.50450469911897,
52
+ "byte_perplexity,none": 1.7927778872125213,
53
+ "bits_per_byte,none": 0.842196759334895
54
+ },
55
+ "winogrande": {
56
+ "acc,none": 0.5335438042620363,
57
+ "acc_stderr,none": 0.014020826677598103
58
+ }
59
+ },
60
+ "configs": {
61
+ "arc_challenge": {
62
+ "task": "arc_challenge",
63
+ "group": [
64
+ "ai2_arc",
65
+ "multiple_choice"
66
+ ],
67
+ "dataset_path": "ai2_arc",
68
+ "dataset_name": "ARC-Challenge",
69
+ "training_split": "train",
70
+ "validation_split": "validation",
71
+ "test_split": "test",
72
+ "doc_to_text": "Question: {{question}}\nAnswer:",
73
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
74
+ "doc_to_choice": "{{choices.text}}",
75
+ "description": "",
76
+ "target_delimiter": " ",
77
+ "fewshot_delimiter": "\n\n",
78
+ "num_fewshot": 0,
79
+ "metric_list": [
80
+ {
81
+ "metric": "acc",
82
+ "aggregation": "mean",
83
+ "higher_is_better": true
84
+ },
85
+ {
86
+ "metric": "acc_norm",
87
+ "aggregation": "mean",
88
+ "higher_is_better": true
89
+ }
90
+ ],
91
+ "output_type": "multiple_choice",
92
+ "repeats": 1,
93
+ "should_decontaminate": true,
94
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
95
+ },
96
+ "arc_easy": {
97
+ "task": "arc_easy",
98
+ "group": [
99
+ "ai2_arc",
100
+ "multiple_choice"
101
+ ],
102
+ "dataset_path": "ai2_arc",
103
+ "dataset_name": "ARC-Easy",
104
+ "training_split": "train",
105
+ "validation_split": "validation",
106
+ "test_split": "test",
107
+ "doc_to_text": "Question: {{question}}\nAnswer:",
108
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
109
+ "doc_to_choice": "{{choices.text}}",
110
+ "description": "",
111
+ "target_delimiter": " ",
112
+ "fewshot_delimiter": "\n\n",
113
+ "num_fewshot": 0,
114
+ "metric_list": [
115
+ {
116
+ "metric": "acc",
117
+ "aggregation": "mean",
118
+ "higher_is_better": true
119
+ },
120
+ {
121
+ "metric": "acc_norm",
122
+ "aggregation": "mean",
123
+ "higher_is_better": true
124
+ }
125
+ ],
126
+ "output_type": "multiple_choice",
127
+ "repeats": 1,
128
+ "should_decontaminate": true,
129
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
130
+ },
131
+ "boolq": {
132
+ "task": "boolq",
133
+ "group": [
134
+ "super-glue-lm-eval-v1"
135
+ ],
136
+ "dataset_path": "super_glue",
137
+ "dataset_name": "boolq",
138
+ "training_split": "train",
139
+ "validation_split": "validation",
140
+ "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
141
+ "doc_to_target": "label",
142
+ "doc_to_choice": [
143
+ "no",
144
+ "yes"
145
+ ],
146
+ "description": "",
147
+ "target_delimiter": " ",
148
+ "fewshot_delimiter": "\n\n",
149
+ "num_fewshot": 0,
150
+ "metric_list": [
151
+ {
152
+ "metric": "acc"
153
+ }
154
+ ],
155
+ "output_type": "multiple_choice",
156
+ "repeats": 1,
157
+ "should_decontaminate": true,
158
+ "doc_to_decontamination_query": "passage"
159
+ },
160
+ "hellaswag": {
161
+ "task": "hellaswag",
162
+ "group": [
163
+ "multiple_choice"
164
+ ],
165
+ "dataset_path": "hellaswag",
166
+ "training_split": "train",
167
+ "validation_split": "validation",
168
+ "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}",
169
+ "doc_to_target": "{{label}}",
170
+ "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}",
171
+ "description": "",
172
+ "target_delimiter": " ",
173
+ "fewshot_delimiter": "\n\n",
174
+ "num_fewshot": 0,
175
+ "metric_list": [
176
+ {
177
+ "metric": "acc",
178
+ "aggregation": "mean",
179
+ "higher_is_better": true
180
+ },
181
+ {
182
+ "metric": "acc_norm",
183
+ "aggregation": "mean",
184
+ "higher_is_better": true
185
+ }
186
+ ],
187
+ "output_type": "multiple_choice",
188
+ "repeats": 1,
189
+ "should_decontaminate": false
190
+ },
191
+ "lambada_openai": {
192
+ "task": "lambada_openai",
193
+ "group": [
194
+ "lambada",
195
+ "loglikelihood",
196
+ "perplexity"
197
+ ],
198
+ "dataset_path": "EleutherAI/lambada_openai",
199
+ "dataset_name": "default",
200
+ "test_split": "test",
201
+ "template_aliases": "",
202
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
203
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
204
+ "description": "",
205
+ "target_delimiter": " ",
206
+ "fewshot_delimiter": "\n\n",
207
+ "num_fewshot": 0,
208
+ "metric_list": [
209
+ {
210
+ "metric": "perplexity",
211
+ "aggregation": "perplexity",
212
+ "higher_is_better": false
213
+ },
214
+ {
215
+ "metric": "acc",
216
+ "aggregation": "mean",
217
+ "higher_is_better": true
218
+ }
219
+ ],
220
+ "output_type": "loglikelihood",
221
+ "repeats": 1,
222
+ "should_decontaminate": true,
223
+ "doc_to_decontamination_query": "{{text}}"
224
+ },
225
+ "openbookqa": {
226
+ "task": "openbookqa",
227
+ "group": [
228
+ "multiple_choice"
229
+ ],
230
+ "dataset_path": "openbookqa",
231
+ "dataset_name": "main",
232
+ "training_split": "train",
233
+ "validation_split": "validation",
234
+ "test_split": "test",
235
+ "doc_to_text": "question_stem",
236
+ "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
237
+ "doc_to_choice": "{{choices.text}}",
238
+ "description": "",
239
+ "target_delimiter": " ",
240
+ "fewshot_delimiter": "\n\n",
241
+ "num_fewshot": 0,
242
+ "metric_list": [
243
+ {
244
+ "metric": "acc",
245
+ "aggregation": "mean",
246
+ "higher_is_better": true
247
+ },
248
+ {
249
+ "metric": "acc_norm",
250
+ "aggregation": "mean",
251
+ "higher_is_better": true
252
+ }
253
+ ],
254
+ "output_type": "multiple_choice",
255
+ "repeats": 1,
256
+ "should_decontaminate": true,
257
+ "doc_to_decontamination_query": "question_stem"
258
+ },
259
+ "piqa": {
260
+ "task": "piqa",
261
+ "group": [
262
+ "multiple_choice"
263
+ ],
264
+ "dataset_path": "piqa",
265
+ "training_split": "train",
266
+ "validation_split": "validation",
267
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
268
+ "doc_to_target": "label",
269
+ "doc_to_choice": "{{[sol1, sol2]}}",
270
+ "description": "",
271
+ "target_delimiter": " ",
272
+ "fewshot_delimiter": "\n\n",
273
+ "num_fewshot": 0,
274
+ "metric_list": [
275
+ {
276
+ "metric": "acc",
277
+ "aggregation": "mean",
278
+ "higher_is_better": true
279
+ },
280
+ {
281
+ "metric": "acc_norm",
282
+ "aggregation": "mean",
283
+ "higher_is_better": true
284
+ }
285
+ ],
286
+ "output_type": "multiple_choice",
287
+ "repeats": 1,
288
+ "should_decontaminate": true,
289
+ "doc_to_decontamination_query": "goal"
290
+ },
291
+ "sciq": {
292
+ "task": "sciq",
293
+ "group": [
294
+ "multiple_choice"
295
+ ],
296
+ "dataset_path": "sciq",
297
+ "training_split": "train",
298
+ "validation_split": "validation",
299
+ "test_split": "test",
300
+ "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
301
+ "doc_to_target": 3,
302
+ "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
303
+ "description": "",
304
+ "target_delimiter": " ",
305
+ "fewshot_delimiter": "\n\n",
306
+ "num_fewshot": 0,
307
+ "metric_list": [
308
+ {
309
+ "metric": "acc",
310
+ "aggregation": "mean",
311
+ "higher_is_better": true
312
+ },
313
+ {
314
+ "metric": "acc_norm",
315
+ "aggregation": "mean",
316
+ "higher_is_better": true
317
+ }
318
+ ],
319
+ "output_type": "multiple_choice",
320
+ "repeats": 1,
321
+ "should_decontaminate": true,
322
+ "doc_to_decontamination_query": "{{support}} {{question}}"
323
+ },
324
+ "wikitext": {
325
+ "task": "wikitext",
326
+ "group": [
327
+ "perplexity",
328
+ "loglikelihood_rolling"
329
+ ],
330
+ "dataset_path": "EleutherAI/wikitext_document_level",
331
+ "dataset_name": "wikitext-2-raw-v1",
332
+ "training_split": "train",
333
+ "validation_split": "validation",
334
+ "test_split": "test",
335
+ "template_aliases": "",
336
+ "doc_to_text": "",
337
+ "doc_to_target": "<function wikitext_detokenizer at 0x7fae1b130040>",
338
+ "description": "",
339
+ "target_delimiter": " ",
340
+ "fewshot_delimiter": "\n\n",
341
+ "num_fewshot": 0,
342
+ "metric_list": [
343
+ {
344
+ "metric": "word_perplexity"
345
+ },
346
+ {
347
+ "metric": "byte_perplexity"
348
+ },
349
+ {
350
+ "metric": "bits_per_byte"
351
+ }
352
+ ],
353
+ "output_type": "loglikelihood_rolling",
354
+ "repeats": 1,
355
+ "should_decontaminate": true,
356
+ "doc_to_decontamination_query": "{{page}}"
357
+ },
358
+ "winogrande": {
359
+ "task": "winogrande",
360
+ "dataset_path": "winogrande",
361
+ "dataset_name": "winogrande_xl",
362
+ "training_split": "train",
363
+ "validation_split": "validation",
364
+ "doc_to_text": "<function doc_to_text at 0x7fae1b102ef0>",
365
+ "doc_to_target": "<function doc_to_target at 0x7fae1b103370>",
366
+ "doc_to_choice": "<function doc_to_choice at 0x7fae1b1035b0>",
367
+ "description": "",
368
+ "target_delimiter": " ",
369
+ "fewshot_delimiter": "\n\n",
370
+ "num_fewshot": 0,
371
+ "metric_list": [
372
+ {
373
+ "metric": "acc",
374
+ "aggregation": "mean",
375
+ "higher_is_better": true
376
+ }
377
+ ],
378
+ "output_type": "multiple_choice",
379
+ "repeats": 1,
380
+ "should_decontaminate": false
381
+ }
382
+ },
383
+ "versions": {
384
+ "arc_challenge": "Yaml",
385
+ "arc_easy": "Yaml",
386
+ "boolq": "Yaml",
387
+ "hellaswag": "Yaml",
388
+ "lambada_openai": "Yaml",
389
+ "openbookqa": "Yaml",
390
+ "piqa": "Yaml",
391
+ "sciq": "Yaml",
392
+ "wikitext": "Yaml",
393
+ "winogrande": "Yaml"
394
+ },
395
+ "config": {
396
+ "model": "hf",
397
+ "model_args": "pretrained=EleutherAI/pythia-410m",
398
+ "num_fewshot": 0,
399
+ "batch_size": 16,
400
+ "batch_sizes": [],
401
+ "device": "cuda:0",
402
+ "use_cache": null,
403
+ "limit": null,
404
+ "bootstrap_iters": 100000
405
+ },
406
+ "git_hash": "4e44f0a"
407
+ }
408
+ hf (pretrained=EleutherAI/pythia-410m), limit: None, num_fewshot: 0, batch_size: 16
409
+ | Task |Version|Filter| Metric | Value | |Stderr|
410
+ |--------------|-------|------|---------------|------:|---|-----:|
411
+ |arc_challenge |Yaml |none |acc | 0.2142|± |0.0120|
412
+ | | |none |acc_norm | 0.2432|± |0.0125|
413
+ |arc_easy |Yaml |none |acc | 0.5189|± |0.0103|
414
+ | | |none |acc_norm | 0.4571|± |0.0102|
415
+ |boolq |Yaml |none |acc | 0.6058|± |0.0085|
416
+ |hellaswag |Yaml |none |acc | 0.3372|± |0.0047|
417
+ | | |none |acc_norm | 0.4061|± |0.0049|
418
+ |lambada_openai|Yaml |none |perplexity |10.7805|± |0.3205|
419
+ | | |none |acc | 0.5164|± |0.0070|
420
+ |openbookqa |Yaml |none |acc | 0.1820|± |0.0173|
421
+ | | |none |acc_norm | 0.2940|± |0.0204|
422
+ |piqa |Yaml |none |acc | 0.6670|± |0.0110|
423
+ | | |none |acc_norm | 0.6719|± |0.0110|
424
+ |sciq |Yaml |none |acc | 0.8150|± |0.0123|
425
+ | | |none |acc_norm | 0.7250|± |0.0141|
426
+ |wikitext |Yaml |none |word_perplexity|34.5045| | |
427
+ | | |none |byte_perplexity| 1.7928| | |
428
+ | | |none |bits_per_byte | 0.8422| | |
429
+ |winogrande |Yaml |none |acc | 0.5335|± |0.0140|
430
+
dpo-410m-eval-files/EleutherAI-pythia-410m-5shot-shelloutput.txt ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Downloading and preparing dataset super_glue/boolq to /home/laura/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...
2
+ Dataset super_glue downloaded and prepared to /home/laura/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.
3
+ Downloading and preparing dataset openbookqa/main to /home/laura/.cache/huggingface/datasets/openbookqa/main/1.0.1/f338ccacfbc86fb8c2de3aa1c06d2ce686933de3bca284dba97d32592c52b33f...
4
+ Dataset openbookqa downloaded and prepared to /home/laura/.cache/huggingface/datasets/openbookqa/main/1.0.1/f338ccacfbc86fb8c2de3aa1c06d2ce686933de3bca284dba97d32592c52b33f. Subsequent calls will reuse this data.
5
+ Downloading and preparing dataset piqa/plain_text to /home/laura/.cache/huggingface/datasets/piqa/plain_text/1.1.0/6c611c1a9bf220943c4174e117d3b660859665baf1d43156230116185312d011...
6
+ Dataset piqa downloaded and prepared to /home/laura/.cache/huggingface/datasets/piqa/plain_text/1.1.0/6c611c1a9bf220943c4174e117d3b660859665baf1d43156230116185312d011. Subsequent calls will reuse this data.
7
+ Downloading and preparing dataset sciq/default to /home/laura/.cache/huggingface/datasets/sciq/default/0.1.0/50e5c6e3795b55463819d399ec417bfd4c3c621105e00295ddb5f3633d708493...
8
+ Dataset sciq downloaded and prepared to /home/laura/.cache/huggingface/datasets/sciq/default/0.1.0/50e5c6e3795b55463819d399ec417bfd4c3c621105e00295ddb5f3633d708493. Subsequent calls will reuse this data.
9
+ Downloading and preparing dataset winogrande/winogrande_xl to /home/laura/.cache/huggingface/datasets/winogrande/winogrande_xl/1.1.0/a826c3d3506aefe0e9e9390dcb53271070536586bab95849876b2c1743df56e2...
10
+ Dataset winogrande downloaded and prepared to /home/laura/.cache/huggingface/datasets/winogrande/winogrande_xl/1.1.0/a826c3d3506aefe0e9e9390dcb53271070536586bab95849876b2c1743df56e2. Subsequent calls will reuse this data.
11
+ bootstrapping for stddev: perplexity
12
+ {
13
+ "results": {
14
+ "arc_challenge": {
15
+ "acc,none": 0.21843003412969283,
16
+ "acc_stderr,none": 0.012074291605700959,
17
+ "acc_norm,none": 0.2645051194539249,
18
+ "acc_norm_stderr,none": 0.012889272949313368
19
+ },
20
+ "arc_easy": {
21
+ "acc,none": 0.54503367003367,
22
+ "acc_stderr,none": 0.010218084454602589,
23
+ "acc_norm,none": 0.5370370370370371,
24
+ "acc_norm_stderr,none": 0.010231597249131058
25
+ },
26
+ "boolq": {
27
+ "acc,none": 0.4871559633027523,
28
+ "acc_stderr,none": 0.008742169169427067
29
+ },
30
+ "hellaswag": {
31
+ "acc,none": 0.33827922724556864,
32
+ "acc_stderr,none": 0.004721571443354456,
33
+ "acc_norm,none": 0.40818562039434375,
34
+ "acc_norm_stderr,none": 0.004904933500255884
35
+ },
36
+ "lambada_openai": {
37
+ "perplexity,none": 14.485555582236119,
38
+ "perplexity_stderr,none": 0.4358013409476018,
39
+ "acc,none": 0.4422666407917718,
40
+ "acc_stderr,none": 0.006919384666875831
41
+ },
42
+ "openbookqa": {
43
+ "acc,none": 0.188,
44
+ "acc_stderr,none": 0.01749067888034625,
45
+ "acc_norm,none": 0.28,
46
+ "acc_norm_stderr,none": 0.020099950647503237
47
+ },
48
+ "piqa": {
49
+ "acc,none": 0.6806311207834603,
50
+ "acc_stderr,none": 0.010877964076613737,
51
+ "acc_norm,none": 0.6692056583242655,
52
+ "acc_norm_stderr,none": 0.010977520584714429
53
+ },
54
+ "sciq": {
55
+ "acc,none": 0.892,
56
+ "acc_stderr,none": 0.009820001651345682,
57
+ "acc_norm,none": 0.887,
58
+ "acc_norm_stderr,none": 0.01001655286669685
59
+ },
60
+ "wikitext": {
61
+ "word_perplexity,none": 34.50450469911897,
62
+ "byte_perplexity,none": 1.7927778872125213,
63
+ "bits_per_byte,none": 0.842196759334895
64
+ },
65
+ "winogrande": {
66
+ "acc,none": 0.5335438042620363,
67
+ "acc_stderr,none": 0.014020826677598103
68
+ }
69
+ },
70
+ "configs": {
71
+ "arc_challenge": {
72
+ "task": "arc_challenge",
73
+ "group": [
74
+ "ai2_arc",
75
+ "multiple_choice"
76
+ ],
77
+ "dataset_path": "ai2_arc",
78
+ "dataset_name": "ARC-Challenge",
79
+ "training_split": "train",
80
+ "validation_split": "validation",
81
+ "test_split": "test",
82
+ "doc_to_text": "Question: {{question}}\nAnswer:",
83
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
84
+ "doc_to_choice": "{{choices.text}}",
85
+ "description": "",
86
+ "target_delimiter": " ",
87
+ "fewshot_delimiter": "\n\n",
88
+ "num_fewshot": 5,
89
+ "metric_list": [
90
+ {
91
+ "metric": "acc",
92
+ "aggregation": "mean",
93
+ "higher_is_better": true
94
+ },
95
+ {
96
+ "metric": "acc_norm",
97
+ "aggregation": "mean",
98
+ "higher_is_better": true
99
+ }
100
+ ],
101
+ "output_type": "multiple_choice",
102
+ "repeats": 1,
103
+ "should_decontaminate": true,
104
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
105
+ },
106
+ "arc_easy": {
107
+ "task": "arc_easy",
108
+ "group": [
109
+ "ai2_arc",
110
+ "multiple_choice"
111
+ ],
112
+ "dataset_path": "ai2_arc",
113
+ "dataset_name": "ARC-Easy",
114
+ "training_split": "train",
115
+ "validation_split": "validation",
116
+ "test_split": "test",
117
+ "doc_to_text": "Question: {{question}}\nAnswer:",
118
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
119
+ "doc_to_choice": "{{choices.text}}",
120
+ "description": "",
121
+ "target_delimiter": " ",
122
+ "fewshot_delimiter": "\n\n",
123
+ "num_fewshot": 5,
124
+ "metric_list": [
125
+ {
126
+ "metric": "acc",
127
+ "aggregation": "mean",
128
+ "higher_is_better": true
129
+ },
130
+ {
131
+ "metric": "acc_norm",
132
+ "aggregation": "mean",
133
+ "higher_is_better": true
134
+ }
135
+ ],
136
+ "output_type": "multiple_choice",
137
+ "repeats": 1,
138
+ "should_decontaminate": true,
139
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
140
+ },
141
+ "boolq": {
142
+ "task": "boolq",
143
+ "group": [
144
+ "super-glue-lm-eval-v1"
145
+ ],
146
+ "dataset_path": "super_glue",
147
+ "dataset_name": "boolq",
148
+ "training_split": "train",
149
+ "validation_split": "validation",
150
+ "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
151
+ "doc_to_target": "label",
152
+ "doc_to_choice": [
153
+ "no",
154
+ "yes"
155
+ ],
156
+ "description": "",
157
+ "target_delimiter": " ",
158
+ "fewshot_delimiter": "\n\n",
159
+ "num_fewshot": 5,
160
+ "metric_list": [
161
+ {
162
+ "metric": "acc"
163
+ }
164
+ ],
165
+ "output_type": "multiple_choice",
166
+ "repeats": 1,
167
+ "should_decontaminate": true,
168
+ "doc_to_decontamination_query": "passage"
169
+ },
170
+ "hellaswag": {
171
+ "task": "hellaswag",
172
+ "group": [
173
+ "multiple_choice"
174
+ ],
175
+ "dataset_path": "hellaswag",
176
+ "training_split": "train",
177
+ "validation_split": "validation",
178
+ "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}",
179
+ "doc_to_target": "{{label}}",
180
+ "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}",
181
+ "description": "",
182
+ "target_delimiter": " ",
183
+ "fewshot_delimiter": "\n\n",
184
+ "num_fewshot": 5,
185
+ "metric_list": [
186
+ {
187
+ "metric": "acc",
188
+ "aggregation": "mean",
189
+ "higher_is_better": true
190
+ },
191
+ {
192
+ "metric": "acc_norm",
193
+ "aggregation": "mean",
194
+ "higher_is_better": true
195
+ }
196
+ ],
197
+ "output_type": "multiple_choice",
198
+ "repeats": 1,
199
+ "should_decontaminate": false
200
+ },
201
+ "lambada_openai": {
202
+ "task": "lambada_openai",
203
+ "group": [
204
+ "lambada",
205
+ "loglikelihood",
206
+ "perplexity"
207
+ ],
208
+ "dataset_path": "EleutherAI/lambada_openai",
209
+ "dataset_name": "default",
210
+ "test_split": "test",
211
+ "template_aliases": "",
212
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
213
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
214
+ "description": "",
215
+ "target_delimiter": " ",
216
+ "fewshot_delimiter": "\n\n",
217
+ "num_fewshot": 5,
218
+ "metric_list": [
219
+ {
220
+ "metric": "perplexity",
221
+ "aggregation": "perplexity",
222
+ "higher_is_better": false
223
+ },
224
+ {
225
+ "metric": "acc",
226
+ "aggregation": "mean",
227
+ "higher_is_better": true
228
+ }
229
+ ],
230
+ "output_type": "loglikelihood",
231
+ "repeats": 1,
232
+ "should_decontaminate": true,
233
+ "doc_to_decontamination_query": "{{text}}"
234
+ },
235
+ "openbookqa": {
236
+ "task": "openbookqa",
237
+ "group": [
238
+ "multiple_choice"
239
+ ],
240
+ "dataset_path": "openbookqa",
241
+ "dataset_name": "main",
242
+ "training_split": "train",
243
+ "validation_split": "validation",
244
+ "test_split": "test",
245
+ "doc_to_text": "question_stem",
246
+ "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
247
+ "doc_to_choice": "{{choices.text}}",
248
+ "description": "",
249
+ "target_delimiter": " ",
250
+ "fewshot_delimiter": "\n\n",
251
+ "num_fewshot": 5,
252
+ "metric_list": [
253
+ {
254
+ "metric": "acc",
255
+ "aggregation": "mean",
256
+ "higher_is_better": true
257
+ },
258
+ {
259
+ "metric": "acc_norm",
260
+ "aggregation": "mean",
261
+ "higher_is_better": true
262
+ }
263
+ ],
264
+ "output_type": "multiple_choice",
265
+ "repeats": 1,
266
+ "should_decontaminate": true,
267
+ "doc_to_decontamination_query": "question_stem"
268
+ },
269
+ "piqa": {
270
+ "task": "piqa",
271
+ "group": [
272
+ "multiple_choice"
273
+ ],
274
+ "dataset_path": "piqa",
275
+ "training_split": "train",
276
+ "validation_split": "validation",
277
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
278
+ "doc_to_target": "label",
279
+ "doc_to_choice": "{{[sol1, sol2]}}",
280
+ "description": "",
281
+ "target_delimiter": " ",
282
+ "fewshot_delimiter": "\n\n",
283
+ "num_fewshot": 5,
284
+ "metric_list": [
285
+ {
286
+ "metric": "acc",
287
+ "aggregation": "mean",
288
+ "higher_is_better": true
289
+ },
290
+ {
291
+ "metric": "acc_norm",
292
+ "aggregation": "mean",
293
+ "higher_is_better": true
294
+ }
295
+ ],
296
+ "output_type": "multiple_choice",
297
+ "repeats": 1,
298
+ "should_decontaminate": true,
299
+ "doc_to_decontamination_query": "goal"
300
+ },
301
+ "sciq": {
302
+ "task": "sciq",
303
+ "group": [
304
+ "multiple_choice"
305
+ ],
306
+ "dataset_path": "sciq",
307
+ "training_split": "train",
308
+ "validation_split": "validation",
309
+ "test_split": "test",
310
+ "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
311
+ "doc_to_target": 3,
312
+ "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
313
+ "description": "",
314
+ "target_delimiter": " ",
315
+ "fewshot_delimiter": "\n\n",
316
+ "num_fewshot": 5,
317
+ "metric_list": [
318
+ {
319
+ "metric": "acc",
320
+ "aggregation": "mean",
321
+ "higher_is_better": true
322
+ },
323
+ {
324
+ "metric": "acc_norm",
325
+ "aggregation": "mean",
326
+ "higher_is_better": true
327
+ }
328
+ ],
329
+ "output_type": "multiple_choice",
330
+ "repeats": 1,
331
+ "should_decontaminate": true,
332
+ "doc_to_decontamination_query": "{{support}} {{question}}"
333
+ },
334
+ "wikitext": {
335
+ "task": "wikitext",
336
+ "group": [
337
+ "perplexity",
338
+ "loglikelihood_rolling"
339
+ ],
340
+ "dataset_path": "EleutherAI/wikitext_document_level",
341
+ "dataset_name": "wikitext-2-raw-v1",
342
+ "training_split": "train",
343
+ "validation_split": "validation",
344
+ "test_split": "test",
345
+ "template_aliases": "",
346
+ "doc_to_text": "",
347
+ "doc_to_target": "<function wikitext_detokenizer at 0x7efb86530040>",
348
+ "description": "",
349
+ "target_delimiter": " ",
350
+ "fewshot_delimiter": "\n\n",
351
+ "num_fewshot": 5,
352
+ "metric_list": [
353
+ {
354
+ "metric": "word_perplexity"
355
+ },
356
+ {
357
+ "metric": "byte_perplexity"
358
+ },
359
+ {
360
+ "metric": "bits_per_byte"
361
+ }
362
+ ],
363
+ "output_type": "loglikelihood_rolling",
364
+ "repeats": 1,
365
+ "should_decontaminate": true,
366
+ "doc_to_decontamination_query": "{{page}}"
367
+ },
368
+ "winogrande": {
369
+ "task": "winogrande",
370
+ "dataset_path": "winogrande",
371
+ "dataset_name": "winogrande_xl",
372
+ "training_split": "train",
373
+ "validation_split": "validation",
374
+ "doc_to_text": "<function doc_to_text at 0x7efb86502ef0>",
375
+ "doc_to_target": "<function doc_to_target at 0x7efb86503370>",
376
+ "doc_to_choice": "<function doc_to_choice at 0x7efb865035b0>",
377
+ "description": "",
378
+ "target_delimiter": " ",
379
+ "fewshot_delimiter": "\n\n",
380
+ "num_fewshot": 5,
381
+ "metric_list": [
382
+ {
383
+ "metric": "acc",
384
+ "aggregation": "mean",
385
+ "higher_is_better": true
386
+ }
387
+ ],
388
+ "output_type": "multiple_choice",
389
+ "repeats": 1,
390
+ "should_decontaminate": false
391
+ }
392
+ },
393
+ "versions": {
394
+ "arc_challenge": "Yaml",
395
+ "arc_easy": "Yaml",
396
+ "boolq": "Yaml",
397
+ "hellaswag": "Yaml",
398
+ "lambada_openai": "Yaml",
399
+ "openbookqa": "Yaml",
400
+ "piqa": "Yaml",
401
+ "sciq": "Yaml",
402
+ "wikitext": "Yaml",
403
+ "winogrande": "Yaml"
404
+ },
405
+ "config": {
406
+ "model": "hf",
407
+ "model_args": "pretrained=EleutherAI/pythia-410m",
408
+ "num_fewshot": 5,
409
+ "batch_size": 16,
410
+ "batch_sizes": [],
411
+ "device": "cuda:0",
412
+ "use_cache": null,
413
+ "limit": null,
414
+ "bootstrap_iters": 100000
415
+ },
416
+ "git_hash": "4e44f0a"
417
+ }
418
+ hf (pretrained=EleutherAI/pythia-410m), limit: None, num_fewshot: 5, batch_size: 16
419
+ | Task |Version|Filter| Metric | Value | |Stderr|
420
+ |--------------|-------|------|---------------|------:|---|-----:|
421
+ |arc_challenge |Yaml |none |acc | 0.2184|± |0.0121|
422
+ | | |none |acc_norm | 0.2645|± |0.0129|
423
+ |arc_easy |Yaml |none |acc | 0.5450|± |0.0102|
424
+ | | |none |acc_norm | 0.5370|± |0.0102|
425
+ |boolq |Yaml |none |acc | 0.4872|± |0.0087|
426
+ |hellaswag |Yaml |none |acc | 0.3383|± |0.0047|
427
+ | | |none |acc_norm | 0.4082|± |0.0049|
428
+ |lambada_openai|Yaml |none |perplexity |14.4856|± |0.4358|
429
+ | | |none |acc | 0.4423|± |0.0069|
430
+ |openbookqa |Yaml |none |acc | 0.1880|± |0.0175|
431
+ | | |none |acc_norm | 0.2800|± |0.0201|
432
+ |piqa |Yaml |none |acc | 0.6806|± |0.0109|
433
+ | | |none |acc_norm | 0.6692|± |0.0110|
434
+ |sciq |Yaml |none |acc | 0.8920|± |0.0098|
435
+ | | |none |acc_norm | 0.8870|± |0.0100|
436
+ |wikitext |Yaml |none |word_perplexity|34.5045| | |
437
+ | | |none |byte_perplexity| 1.7928| | |
438
+ | | |none |bits_per_byte | 0.8422| | |
439
+ |winogrande |Yaml |none |acc | 0.5335|± |0.0140|
440
+