Muennighoff commited on
Commit
d522938
1 Parent(s): 1aec95b
Files changed (50) hide show
  1. 2b855b11bc4/evaluation/rankeval/2b855b11bc4_2.csv +2 -0
  2. 2b855b11bc4/evaluation/rankeval/2b855b11bc4_2.json +8 -1
  3. 2b855b11bc4/evaluation/rankeval/2b855b11bc4_3.csv +9 -0
  4. 2b855b11bc4/evaluation/rankeval/2b855b11bc4_3.json +29 -1
  5. 2b855b11bc4/evaluation/rankeval/2b855b11bc4_4.csv +10 -0
  6. 2b855b11bc4/evaluation/rankeval/2b855b11bc4_4.json +34 -1
  7. 2b855b11bc4/evaluation/rankeval/2b855b11bc4_4_lm-eval_global_step52452_2023-01-31-17-30-37_4shots_backup.json +0 -54
  8. 2b855b11bc4/evaluation/rankeval/2b855b11bc4_5.csv +14 -0
  9. 2b855b11bc4/evaluation/rankeval/2b855b11bc4_5.json +56 -1
  10. 2b855b14bc4/evaluation/rankeval/2b855b14bc4_2.csv +2 -0
  11. 2b855b14bc4/evaluation/rankeval/2b855b14bc4_2.json +8 -1
  12. 2b855b14bc4/evaluation/rankeval/2b855b14bc4_3.csv +9 -0
  13. 2b855b14bc4/evaluation/rankeval/2b855b14bc4_3.json +34 -1
  14. 2b855b14bc4/evaluation/rankeval/2b855b14bc4_4.csv +9 -0
  15. 2b855b14bc4/evaluation/rankeval/2b855b14bc4_4.json +34 -1
  16. 2b855b14bc4/evaluation/rankeval/2b855b14bc4_5.csv +14 -0
  17. 2b855b14bc4/evaluation/rankeval/2b855b14bc4_5.json +56 -1
  18. 2b855b18bc4/evaluation/rankeval/2b855b18bc4_2.csv +2 -0
  19. 2b855b18bc4/evaluation/rankeval/2b855b18bc4_2.json +8 -1
  20. 2b855b18bc4/evaluation/rankeval/2b855b18bc4_3.csv +8 -0
  21. 2b855b18bc4/evaluation/rankeval/2b855b18bc4_3.json +29 -1
  22. 2b855b18bc4/evaluation/rankeval/2b855b18bc4_4.csv +10 -0
  23. 2b855b18bc4/evaluation/rankeval/2b855b18bc4_4.json +39 -1
  24. 2b855b18bc4/evaluation/rankeval/2b855b18bc4_5.csv +14 -0
  25. 2b855b18bc4/evaluation/rankeval/2b855b18bc4_5.json +56 -1
  26. 2b855b28bc4/evaluation/rankeval/2b855b28bc4_2.csv +2 -0
  27. 2b855b28bc4/evaluation/rankeval/2b855b28bc4_2.json +8 -1
  28. 2b855b28bc4/evaluation/rankeval/2b855b28bc4_3.csv +9 -0
  29. 2b855b28bc4/evaluation/rankeval/2b855b28bc4_3.json +34 -1
  30. 2b855b28bc4/evaluation/rankeval/2b855b28bc4_4.csv +9 -0
  31. 2b855b28bc4/evaluation/rankeval/2b855b28bc4_4.json +34 -1
  32. 2b855b28bc4/evaluation/rankeval/2b855b28bc4_5.csv +14 -0
  33. 2b855b28bc4/evaluation/rankeval/2b855b28bc4_5.json +56 -1
  34. 2b855b55bc4/evaluation/rankeval/2b855b55bc4_2.csv +2 -0
  35. 2b855b55bc4/evaluation/rankeval/2b855b55bc4_2.json +8 -1
  36. 2b855b55bc4/evaluation/rankeval/2b855b55bc4_3.csv +9 -0
  37. 2b855b55bc4/evaluation/rankeval/2b855b55bc4_3.json +29 -1
  38. 2b855b55bc4/evaluation/rankeval/2b855b55bc4_3_lm-eval_global_step52452_2023-01-31-17-30-37_3shots_backup.json +0 -59
  39. 2b855b55bc4/evaluation/rankeval/2b855b55bc4_4.csv +9 -0
  40. 2b855b55bc4/evaluation/rankeval/2b855b55bc4_4.json +34 -1
  41. 2b855b55bc4/evaluation/rankeval/2b855b55bc4_5.csv +14 -0
  42. 2b855b55bc4/evaluation/rankeval/2b855b55bc4_5.json +56 -1
  43. 2b855b9bc4/evaluation/rankeval/2b855b9bc4_2.csv +2 -0
  44. 2b855b9bc4/evaluation/rankeval/2b855b9bc4_2.json +8 -1
  45. 2b855b9bc4/evaluation/rankeval/2b855b9bc4_3.csv +8 -0
  46. 2b855b9bc4/evaluation/rankeval/2b855b9bc4_3.json +29 -1
  47. 2b855b9bc4/evaluation/rankeval/2b855b9bc4_4.csv +9 -0
  48. 2b855b9bc4/evaluation/rankeval/2b855b9bc4_4.json +34 -1
  49. 2b855b9bc4/evaluation/rankeval/2b855b9bc4_5.csv +14 -0
  50. 2b855b9bc4/evaluation/rankeval/2b855b9bc4_5.json +56 -1
2b855b11bc4/evaluation/rankeval/2b855b11bc4_2.csv CHANGED
@@ -12,6 +12,8 @@ cb,f1,0.24217687074829933,,1
12
  copa,acc,0.76,0.042923469599092816,0
13
  hellaswag,acc,0.4340768771161123,0.004946221512145273,0
14
  hellaswag,acc_norm,0.5575582553276239,0.004956609327218393,0
 
 
15
  rte,acc,0.4981949458483754,0.030096267148976633,0
16
  sciq,acc,0.883,0.01016928780271333,0
17
  sciq,acc_norm,0.867,0.01074366913239735,0
 
12
  copa,acc,0.76,0.042923469599092816,0
13
  hellaswag,acc,0.4340768771161123,0.004946221512145273,0
14
  hellaswag,acc_norm,0.5575582553276239,0.004956609327218393,0
15
+ piqa,acc,0.7431991294885746,0.010192864802278047,0
16
+ piqa,acc_norm,0.7453754080522307,0.010164432237060499,0
17
  rte,acc,0.4981949458483754,0.030096267148976633,0
18
  sciq,acc,0.883,0.01016928780271333,0
19
  sciq,acc_norm,0.867,0.01074366913239735,0
2b855b11bc4/evaluation/rankeval/2b855b11bc4_2.json CHANGED
@@ -60,6 +60,12 @@
60
  "acc_stderr": 0.01016928780271333,
61
  "acc_norm": 0.867,
62
  "acc_norm_stderr": 0.01074366913239735
 
 
 
 
 
 
63
  }
64
  },
65
  "versions": {
@@ -75,6 +81,7 @@
75
  "boolq": 1,
76
  "arc_easy": 0,
77
  "arc_challenge": 0,
78
- "sciq": 0
 
79
  }
80
  }
 
60
  "acc_stderr": 0.01016928780271333,
61
  "acc_norm": 0.867,
62
  "acc_norm_stderr": 0.01074366913239735
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7431991294885746,
66
+ "acc_stderr": 0.010192864802278047,
67
+ "acc_norm": 0.7453754080522307,
68
+ "acc_norm_stderr": 0.010164432237060499
69
  }
70
  },
71
  "versions": {
 
81
  "boolq": 1,
82
  "arc_easy": 0,
83
  "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b11bc4/evaluation/rankeval/2b855b11bc4_3.csv CHANGED
@@ -2,11 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.316,0.014709193056057147,0
3
  anli_r2,acc,0.352,0.015110404505648671,0
4
  anli_r3,acc,0.35083333333333333,0.013782212417178197,0
 
 
 
 
 
5
  cb,acc,0.42857142857142855,0.06672848092813058,1
6
  cb,f1,0.4271380793119923,,1
7
  copa,acc,0.73,0.0446196043338474,0
8
  hellaswag,acc,0.4331806413065126,0.0049450236570322765,0
9
  hellaswag,acc_norm,0.5667197769368651,0.004945157565218203,0
 
 
10
  rte,acc,0.5379061371841155,0.030009848912529113,0
 
 
11
  storycloze_2016,acc,0.6851950828433993,0.010740068943171381,0
12
  winogrande,acc,0.5659037095501184,0.013929882555694054,0
 
2
  anli_r1,acc,0.316,0.014709193056057147,0
3
  anli_r2,acc,0.352,0.015110404505648671,0
4
  anli_r3,acc,0.35083333333333333,0.013782212417178197,0
5
+ arc_challenge,acc,0.27303754266211605,0.013019332762635746,0
6
+ arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0
7
+ arc_easy,acc,0.5833333333333334,0.010116282977781247,0
8
+ arc_easy,acc_norm,0.5660774410774411,0.010169795770462104,0
9
+ boolq,acc,0.6033639143730887,0.008556148582032,1
10
  cb,acc,0.42857142857142855,0.06672848092813058,1
11
  cb,f1,0.4271380793119923,,1
12
  copa,acc,0.73,0.0446196043338474,0
13
  hellaswag,acc,0.4331806413065126,0.0049450236570322765,0
14
  hellaswag,acc_norm,0.5667197769368651,0.004945157565218203,0
15
+ piqa,acc,0.7415669205658324,0.010213971636773315,0
16
+ piqa,acc_norm,0.7377584330794341,0.010262502565172443,0
17
  rte,acc,0.5379061371841155,0.030009848912529113,0
18
+ sciq,acc,0.881,0.01024421514533666,0
19
+ sciq,acc_norm,0.864,0.01084535023047299,0
20
  storycloze_2016,acc,0.6851950828433993,0.010740068943171381,0
21
  winogrande,acc,0.5659037095501184,0.013929882555694054,0
2b855b11bc4/evaluation/rankeval/2b855b11bc4_3.json CHANGED
@@ -42,6 +42,30 @@
42
  "boolq": {
43
  "acc": 0.6033639143730887,
44
  "acc_stderr": 0.008556148582032
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
  },
47
  "versions": {
@@ -54,6 +78,10 @@
54
  "rte": 0,
55
  "winogrande": 0,
56
  "storycloze_2016": 0,
57
- "boolq": 1
 
 
 
 
58
  }
59
  }
 
42
  "boolq": {
43
  "acc": 0.6033639143730887,
44
  "acc_stderr": 0.008556148582032
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5833333333333334,
48
+ "acc_stderr": 0.010116282977781247,
49
+ "acc_norm": 0.5660774410774411,
50
+ "acc_norm_stderr": 0.010169795770462104
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.27303754266211605,
54
+ "acc_stderr": 0.013019332762635746,
55
+ "acc_norm": 0.2832764505119454,
56
+ "acc_norm_stderr": 0.013167478735134575
57
+ },
58
+ "sciq": {
59
+ "acc": 0.881,
60
+ "acc_stderr": 0.01024421514533666,
61
+ "acc_norm": 0.864,
62
+ "acc_norm_stderr": 0.01084535023047299
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7415669205658324,
66
+ "acc_stderr": 0.010213971636773315,
67
+ "acc_norm": 0.7377584330794341,
68
+ "acc_norm_stderr": 0.010262502565172443
69
  }
70
  },
71
  "versions": {
 
78
  "rte": 0,
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b11bc4/evaluation/rankeval/2b855b11bc4_4.csv CHANGED
@@ -2,10 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.336,0.014944140233795027,0
3
  anli_r2,acc,0.354,0.015129868238451772,0
4
  anli_r3,acc,0.3325,0.013605417345710526,0
 
 
 
 
 
5
  cb,acc,0.48214285714285715,0.0673769750864465,1
6
  cb,f1,0.40945083014048533,,1
7
  copa,acc,0.77,0.04229525846816506,0
8
  hellaswag,acc,0.4340768771161123,0.004946221512145273,0
9
  hellaswag,acc_norm,0.5635331607249552,0.004949335356881862,0
 
 
10
  rte,acc,0.49458483754512633,0.030094698123239966,0
 
 
 
11
  winogrande,acc,0.5453827940015785,0.013994481027065997,0
 
2
  anli_r1,acc,0.336,0.014944140233795027,0
3
  anli_r2,acc,0.354,0.015129868238451772,0
4
  anli_r3,acc,0.3325,0.013605417345710526,0
5
+ arc_challenge,acc,0.27986348122866894,0.013119040897725922,0
6
+ arc_challenge,acc_norm,0.29692832764505117,0.013352025976725223,0
7
+ arc_easy,acc,0.5841750841750841,0.010113348244647869,0
8
+ arc_easy,acc_norm,0.5614478114478114,0.010182010275471116,0
9
+ boolq,acc,0.6085626911314985,0.008536430524403957,1
10
  cb,acc,0.48214285714285715,0.0673769750864465,1
11
  cb,f1,0.40945083014048533,,1
12
  copa,acc,0.77,0.04229525846816506,0
13
  hellaswag,acc,0.4340768771161123,0.004946221512145273,0
14
  hellaswag,acc_norm,0.5635331607249552,0.004949335356881862,0
15
+ piqa,acc,0.736126224156692,0.010282996367695562,0
16
+ piqa,acc_norm,0.7421109902067464,0.010206956662056246,0
17
  rte,acc,0.49458483754512633,0.030094698123239966,0
18
+ sciq,acc,0.888,0.009977753031397234,0
19
+ sciq,acc_norm,0.869,0.010674874844837952,0
20
+ storycloze_2016,acc,0.6905398182789952,0.01068995674518907,0
21
  winogrande,acc,0.5453827940015785,0.013994481027065997,0
2b855b11bc4/evaluation/rankeval/2b855b11bc4_4.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.6905398182789952,
40
  "acc_stderr": 0.01068995674518907
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.6905398182789952,
40
  "acc_stderr": 0.01068995674518907
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6085626911314985,
44
+ "acc_stderr": 0.008536430524403957
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5841750841750841,
48
+ "acc_stderr": 0.010113348244647869,
49
+ "acc_norm": 0.5614478114478114,
50
+ "acc_norm_stderr": 0.010182010275471116
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.27986348122866894,
54
+ "acc_stderr": 0.013119040897725922,
55
+ "acc_norm": 0.29692832764505117,
56
+ "acc_norm_stderr": 0.013352025976725223
57
+ },
58
+ "sciq": {
59
+ "acc": 0.888,
60
+ "acc_stderr": 0.009977753031397234,
61
+ "acc_norm": 0.869,
62
+ "acc_norm_stderr": 0.010674874844837952
63
+ },
64
+ "piqa": {
65
+ "acc": 0.736126224156692,
66
+ "acc_stderr": 0.010282996367695562,
67
+ "acc_norm": 0.7421109902067464,
68
+ "acc_norm_stderr": 0.010206956662056246
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b11bc4/evaluation/rankeval/2b855b11bc4_4_lm-eval_global_step52452_2023-01-31-17-30-37_4shots_backup.json DELETED
@@ -1,54 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.336,
5
- "acc_stderr": 0.014944140233795027
6
- },
7
- "anli_r2": {
8
- "acc": 0.354,
9
- "acc_stderr": 0.015129868238451772
10
- },
11
- "anli_r3": {
12
- "acc": 0.3325,
13
- "acc_stderr": 0.013605417345710526
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.40945083014048533
19
- },
20
- "copa": {
21
- "acc": 0.77,
22
- "acc_stderr": 0.04229525846816506
23
- },
24
- "hellaswag": {
25
- "acc": 0.4340768771161123,
26
- "acc_stderr": 0.004946221512145273,
27
- "acc_norm": 0.5635331607249552,
28
- "acc_norm_stderr": 0.004949335356881862
29
- },
30
- "rte": {
31
- "acc": 0.49458483754512633,
32
- "acc_stderr": 0.030094698123239966
33
- },
34
- "winogrande": {
35
- "acc": 0.5453827940015785,
36
- "acc_stderr": 0.013994481027065997
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6905398182789952,
40
- "acc_stderr": 0.01068995674518907
41
- }
42
- },
43
- "versions": {
44
- "anli_r1": 0,
45
- "anli_r2": 0,
46
- "anli_r3": 0,
47
- "cb": 1,
48
- "copa": 0,
49
- "hellaswag": 0,
50
- "rte": 0,
51
- "winogrande": 0,
52
- "storycloze_2016": 0
53
- }
54
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b855b11bc4/evaluation/rankeval/2b855b11bc4_5.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.357,0.015158521721486773,0
3
  anli_r2,acc,0.361,0.015195720118175118,0
4
  anli_r3,acc,0.3525,0.013797164918918362,0
 
 
 
 
 
5
  cb,acc,0.4642857142857143,0.06724777654937658,1
6
  cb,f1,0.33484504913076335,,1
7
  copa,acc,0.76,0.04292346959909283,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.357,0.015158521721486773,0
3
  anli_r2,acc,0.361,0.015195720118175118,0
4
  anli_r3,acc,0.3525,0.013797164918918362,0
5
+ arc_challenge,acc,0.2815699658703072,0.013143376735009024,0
6
+ arc_challenge,acc_norm,0.2909556313993174,0.013273077865907597,0
7
+ arc_easy,acc,0.5862794612794613,0.010105878530238137,0
8
+ arc_easy,acc_norm,0.5622895622895623,0.010179856486006897,0
9
+ boolq,acc,0.6055045871559633,0.008548152025770934,1
10
  cb,acc,0.4642857142857143,0.06724777654937658,1
11
  cb,f1,0.33484504913076335,,1
12
  copa,acc,0.76,0.04292346959909283,0
13
+ hellaswag,acc,0.4325831507667795,0.004944215937021392,0
14
+ hellaswag,acc_norm,0.5673172674765983,0.004944351065545863,0
15
+ piqa,acc,0.7372143634385201,0.010269354068140767,0
16
+ piqa,acc_norm,0.7372143634385201,0.010269354068140777,0
17
+ rte,acc,0.5306859205776173,0.03003973059219781,0
18
+ sciq,acc,0.889,0.009938701010583726,0
19
+ sciq,acc_norm,0.886,0.010055103435823328,0
20
+ storycloze_2016,acc,0.6873329770176376,0.010720223172953168,0
21
+ winogrande,acc,0.569060773480663,0.01391779662333597,0
2b855b11bc4/evaluation/rankeval/2b855b11bc4_5.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.76,
22
  "acc_stderr": 0.04292346959909283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.76,
22
  "acc_stderr": 0.04292346959909283
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4325831507667795,
26
+ "acc_stderr": 0.004944215937021392,
27
+ "acc_norm": 0.5673172674765983,
28
+ "acc_norm_stderr": 0.004944351065545863
29
+ },
30
+ "rte": {
31
+ "acc": 0.5306859205776173,
32
+ "acc_stderr": 0.03003973059219781
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.569060773480663,
36
+ "acc_stderr": 0.01391779662333597
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6873329770176376,
40
+ "acc_stderr": 0.010720223172953168
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6055045871559633,
44
+ "acc_stderr": 0.008548152025770934
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5862794612794613,
48
+ "acc_stderr": 0.010105878530238137,
49
+ "acc_norm": 0.5622895622895623,
50
+ "acc_norm_stderr": 0.010179856486006897
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2815699658703072,
54
+ "acc_stderr": 0.013143376735009024,
55
+ "acc_norm": 0.2909556313993174,
56
+ "acc_norm_stderr": 0.013273077865907597
57
+ },
58
+ "sciq": {
59
+ "acc": 0.889,
60
+ "acc_stderr": 0.009938701010583726,
61
+ "acc_norm": 0.886,
62
+ "acc_norm_stderr": 0.010055103435823328
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7372143634385201,
66
+ "acc_stderr": 0.010269354068140767,
67
+ "acc_norm": 0.7372143634385201,
68
+ "acc_norm_stderr": 0.010269354068140777
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b14bc4/evaluation/rankeval/2b855b14bc4_2.csv CHANGED
@@ -12,6 +12,8 @@ cb,f1,0.2628346843527389,,1
12
  copa,acc,0.77,0.04229525846816506,0
13
  hellaswag,acc,0.42999402509460266,0.004940631135803533,0
14
  hellaswag,acc_norm,0.5637323242381995,0.0049490803348160245,0
 
 
15
  rte,acc,0.5090252707581228,0.030091559826331334,0
16
  sciq,acc,0.87,0.010640169792499347,0
17
  sciq,acc_norm,0.838,0.011657267771304413,0
 
12
  copa,acc,0.77,0.04229525846816506,0
13
  hellaswag,acc,0.42999402509460266,0.004940631135803533,0
14
  hellaswag,acc_norm,0.5637323242381995,0.0049490803348160245,0
15
+ piqa,acc,0.7328618063112078,0.010323440492612431,0
16
+ piqa,acc_norm,0.7295973884657236,0.01036316703162078,0
17
  rte,acc,0.5090252707581228,0.030091559826331334,0
18
  sciq,acc,0.87,0.010640169792499347,0
19
  sciq,acc_norm,0.838,0.011657267771304413,0
2b855b14bc4/evaluation/rankeval/2b855b14bc4_2.json CHANGED
@@ -60,6 +60,12 @@
60
  "acc_stderr": 0.010640169792499347,
61
  "acc_norm": 0.838,
62
  "acc_norm_stderr": 0.011657267771304413
 
 
 
 
 
 
63
  }
64
  },
65
  "versions": {
@@ -75,6 +81,7 @@
75
  "boolq": 1,
76
  "arc_easy": 0,
77
  "arc_challenge": 0,
78
- "sciq": 0
 
79
  }
80
  }
 
60
  "acc_stderr": 0.010640169792499347,
61
  "acc_norm": 0.838,
62
  "acc_norm_stderr": 0.011657267771304413
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7328618063112078,
66
+ "acc_stderr": 0.010323440492612431,
67
+ "acc_norm": 0.7295973884657236,
68
+ "acc_norm_stderr": 0.01036316703162078
69
  }
70
  },
71
  "versions": {
 
81
  "boolq": 1,
82
  "arc_easy": 0,
83
  "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b14bc4/evaluation/rankeval/2b855b14bc4_3.csv CHANGED
@@ -2,11 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.323,0.014794927843348633,0
3
  anli_r2,acc,0.357,0.015158521721486769,0
4
  anli_r3,acc,0.34833333333333333,0.013759437498874079,0
 
 
 
 
 
5
  cb,acc,0.4642857142857143,0.0672477765493766,1
6
  cb,f1,0.3162578162578163,,1
7
  copa,acc,0.81,0.03942772444036622,0
8
  hellaswag,acc,0.4329814777932683,0.004944755230598382,0
9
  hellaswag,acc_norm,0.5656243776140211,0.004946617138983511,0
 
 
10
  rte,acc,0.5054151624548736,0.030094698123239966,0
 
 
11
  storycloze_2016,acc,0.6900053447354356,0.010695042806212553,0
12
  winogrande,acc,0.5461720599842147,0.013992441563707067,0
 
2
  anli_r1,acc,0.323,0.014794927843348633,0
3
  anli_r2,acc,0.357,0.015158521721486769,0
4
  anli_r3,acc,0.34833333333333333,0.013759437498874079,0
5
+ arc_challenge,acc,0.2738907849829352,0.013032004972989505,0
6
+ arc_challenge,acc_norm,0.28071672354948807,0.013131238126975576,0
7
+ arc_easy,acc,0.5921717171717171,0.010083950240041216,0
8
+ arc_easy,acc_norm,0.5711279461279462,0.010155440652900152,0
9
+ boolq,acc,0.5941896024464832,0.008588486726385772,1
10
  cb,acc,0.4642857142857143,0.0672477765493766,1
11
  cb,f1,0.3162578162578163,,1
12
  copa,acc,0.81,0.03942772444036622,0
13
  hellaswag,acc,0.4329814777932683,0.004944755230598382,0
14
  hellaswag,acc_norm,0.5656243776140211,0.004946617138983511,0
15
+ piqa,acc,0.735582154515778,0.010289787244767168,0
16
+ piqa,acc_norm,0.7334058759521219,0.010316749863541365,0
17
  rte,acc,0.5054151624548736,0.030094698123239966,0
18
+ sciq,acc,0.872,0.010570133761108665,0
19
+ sciq,acc_norm,0.829,0.011912216456264613,0
20
  storycloze_2016,acc,0.6900053447354356,0.010695042806212553,0
21
  winogrande,acc,0.5461720599842147,0.013992441563707067,0
2b855b14bc4/evaluation/rankeval/2b855b14bc4_3.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.6900053447354356,
40
  "acc_stderr": 0.010695042806212553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.6900053447354356,
40
  "acc_stderr": 0.010695042806212553
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5941896024464832,
44
+ "acc_stderr": 0.008588486726385772
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5921717171717171,
48
+ "acc_stderr": 0.010083950240041216,
49
+ "acc_norm": 0.5711279461279462,
50
+ "acc_norm_stderr": 0.010155440652900152
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2738907849829352,
54
+ "acc_stderr": 0.013032004972989505,
55
+ "acc_norm": 0.28071672354948807,
56
+ "acc_norm_stderr": 0.013131238126975576
57
+ },
58
+ "sciq": {
59
+ "acc": 0.872,
60
+ "acc_stderr": 0.010570133761108665,
61
+ "acc_norm": 0.829,
62
+ "acc_norm_stderr": 0.011912216456264613
63
+ },
64
+ "piqa": {
65
+ "acc": 0.735582154515778,
66
+ "acc_stderr": 0.010289787244767168,
67
+ "acc_norm": 0.7334058759521219,
68
+ "acc_norm_stderr": 0.010316749863541365
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b14bc4/evaluation/rankeval/2b855b14bc4_4.csv CHANGED
@@ -2,11 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.36,0.015186527932040117,0
3
  anli_r2,acc,0.352,0.015110404505648673,0
4
  anli_r3,acc,0.35,0.013774667009018552,0
 
 
 
 
 
5
  cb,acc,0.48214285714285715,0.0673769750864465,1
6
  cb,f1,0.3338164251207729,,1
7
  copa,acc,0.79,0.040936018074033256,0
8
  hellaswag,acc,0.4314877514439355,0.004942716091996078,0
9
  hellaswag,acc_norm,0.5659231228838877,0.004946221512145289,0
 
 
10
  rte,acc,0.48375451263537905,0.030080573208738064,0
 
 
11
  storycloze_2016,acc,0.6916087653661144,0.010679734445487801,0
12
  winogrande,acc,0.569060773480663,0.013917796623335964,0
 
2
  anli_r1,acc,0.36,0.015186527932040117,0
3
  anli_r2,acc,0.352,0.015110404505648673,0
4
  anli_r3,acc,0.35,0.013774667009018552,0
5
+ arc_challenge,acc,0.2636518771331058,0.012875929151297056,0
6
+ arc_challenge,acc_norm,0.2841296928327645,0.013179442447653886,0
7
+ arc_easy,acc,0.5972222222222222,0.010063960494989161,0
8
+ arc_easy,acc_norm,0.5702861952861953,0.010157908005763676,0
9
+ boolq,acc,0.5951070336391437,0.008585393347962307,1
10
  cb,acc,0.48214285714285715,0.0673769750864465,1
11
  cb,f1,0.3338164251207729,,1
12
  copa,acc,0.79,0.040936018074033256,0
13
  hellaswag,acc,0.4314877514439355,0.004942716091996078,0
14
  hellaswag,acc_norm,0.5659231228838877,0.004946221512145289,0
15
+ piqa,acc,0.7301414581066377,0.010356595421852209,0
16
+ piqa,acc_norm,0.7377584330794341,0.010262502565172443,0
17
  rte,acc,0.48375451263537905,0.030080573208738064,0
18
+ sciq,acc,0.886,0.010055103435823328,0
19
+ sciq,acc_norm,0.86,0.010978183844357807,0
20
  storycloze_2016,acc,0.6916087653661144,0.010679734445487801,0
21
  winogrande,acc,0.569060773480663,0.013917796623335964,0
2b855b14bc4/evaluation/rankeval/2b855b14bc4_4.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.6916087653661144,
40
  "acc_stderr": 0.010679734445487801
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.6916087653661144,
40
  "acc_stderr": 0.010679734445487801
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5951070336391437,
44
+ "acc_stderr": 0.008585393347962307
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5972222222222222,
48
+ "acc_stderr": 0.010063960494989161,
49
+ "acc_norm": 0.5702861952861953,
50
+ "acc_norm_stderr": 0.010157908005763676
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2636518771331058,
54
+ "acc_stderr": 0.012875929151297056,
55
+ "acc_norm": 0.2841296928327645,
56
+ "acc_norm_stderr": 0.013179442447653886
57
+ },
58
+ "sciq": {
59
+ "acc": 0.886,
60
+ "acc_stderr": 0.010055103435823328,
61
+ "acc_norm": 0.86,
62
+ "acc_norm_stderr": 0.010978183844357807
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7301414581066377,
66
+ "acc_stderr": 0.010356595421852209,
67
+ "acc_norm": 0.7377584330794341,
68
+ "acc_norm_stderr": 0.010262502565172443
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b14bc4/evaluation/rankeval/2b855b14bc4_5.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.357,0.015158521721486773,0
3
  anli_r2,acc,0.35,0.015090650341444231,0
4
  anli_r3,acc,0.34833333333333333,0.01375943749887407,0
 
 
 
 
 
5
  cb,acc,0.4642857142857143,0.06724777654937658,1
6
  cb,f1,0.32226930320150665,,1
7
  copa,acc,0.79,0.040936018074033256,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.357,0.015158521721486773,0
3
  anli_r2,acc,0.35,0.015090650341444231,0
4
  anli_r3,acc,0.34833333333333333,0.01375943749887407,0
5
+ arc_challenge,acc,0.2687713310580205,0.012955065963710684,0
6
+ arc_challenge,acc_norm,0.2773037542662116,0.013082095839059374,0
7
+ arc_easy,acc,0.5909090909090909,0.010088775152615788,0
8
+ arc_easy,acc_norm,0.5664983164983165,0.010168640625454103,0
9
+ boolq,acc,0.5951070336391437,0.008585393347962317,1
10
  cb,acc,0.4642857142857143,0.06724777654937658,1
11
  cb,f1,0.32226930320150665,,1
12
  copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.43238398725353516,0.004943945069611458,0
14
+ hellaswag,acc_norm,0.5669189404501095,0.004944889545497962,0
15
+ piqa,acc,0.7263329706202394,0.010402184206229211,0
16
+ piqa,acc_norm,0.733949945593036,0.010310039263352824,0
17
+ rte,acc,0.5126353790613718,0.030086851767188564,0
18
+ sciq,acc,0.893,0.009779910359847169,0
19
+ sciq,acc_norm,0.862,0.010912152632504403,0
20
+ storycloze_2016,acc,0.6900053447354356,0.010695042806212555,0
21
+ winogrande,acc,0.5493291239147593,0.01398392886904024,0
2b855b14bc4/evaluation/rankeval/2b855b14bc4_5.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.79,
22
  "acc_stderr": 0.040936018074033256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.79,
22
  "acc_stderr": 0.040936018074033256
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.43238398725353516,
26
+ "acc_stderr": 0.004943945069611458,
27
+ "acc_norm": 0.5669189404501095,
28
+ "acc_norm_stderr": 0.004944889545497962
29
+ },
30
+ "rte": {
31
+ "acc": 0.5126353790613718,
32
+ "acc_stderr": 0.030086851767188564
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5493291239147593,
36
+ "acc_stderr": 0.01398392886904024
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6900053447354356,
40
+ "acc_stderr": 0.010695042806212555
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5951070336391437,
44
+ "acc_stderr": 0.008585393347962317
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5909090909090909,
48
+ "acc_stderr": 0.010088775152615788,
49
+ "acc_norm": 0.5664983164983165,
50
+ "acc_norm_stderr": 0.010168640625454103
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2687713310580205,
54
+ "acc_stderr": 0.012955065963710684,
55
+ "acc_norm": 0.2773037542662116,
56
+ "acc_norm_stderr": 0.013082095839059374
57
+ },
58
+ "sciq": {
59
+ "acc": 0.893,
60
+ "acc_stderr": 0.009779910359847169,
61
+ "acc_norm": 0.862,
62
+ "acc_norm_stderr": 0.010912152632504403
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7263329706202394,
66
+ "acc_stderr": 0.010402184206229211,
67
+ "acc_norm": 0.733949945593036,
68
+ "acc_norm_stderr": 0.010310039263352824
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b18bc4/evaluation/rankeval/2b855b18bc4_2.csv CHANGED
@@ -12,6 +12,8 @@ cb,f1,0.2674860989171788,,1
12
  copa,acc,0.73,0.0446196043338474,0
13
  hellaswag,acc,0.4281019717187811,0.004937924326742572,0
14
  hellaswag,acc_norm,0.559151563433579,0.004954740808837193,0
 
 
15
  rte,acc,0.5379061371841155,0.030009848912529117,0
16
  sciq,acc,0.883,0.010169287802713329,0
17
  sciq,acc_norm,0.872,0.010570133761108663,0
 
12
  copa,acc,0.73,0.0446196043338474,0
13
  hellaswag,acc,0.4281019717187811,0.004937924326742572,0
14
  hellaswag,acc_norm,0.559151563433579,0.004954740808837193,0
15
+ piqa,acc,0.733949945593036,0.010310039263352831,0
16
+ piqa,acc_norm,0.7334058759521219,0.010316749863541365,0
17
  rte,acc,0.5379061371841155,0.030009848912529117,0
18
  sciq,acc,0.883,0.010169287802713329,0
19
  sciq,acc_norm,0.872,0.010570133761108663,0
2b855b18bc4/evaluation/rankeval/2b855b18bc4_2.json CHANGED
@@ -60,6 +60,12 @@
60
  "acc_stderr": 0.010169287802713329,
61
  "acc_norm": 0.872,
62
  "acc_norm_stderr": 0.010570133761108663
 
 
 
 
 
 
63
  }
64
  },
65
  "versions": {
@@ -75,6 +81,7 @@
75
  "boolq": 1,
76
  "arc_easy": 0,
77
  "arc_challenge": 0,
78
- "sciq": 0
 
79
  }
80
  }
 
60
  "acc_stderr": 0.010169287802713329,
61
  "acc_norm": 0.872,
62
  "acc_norm_stderr": 0.010570133761108663
63
+ },
64
+ "piqa": {
65
+ "acc": 0.733949945593036,
66
+ "acc_stderr": 0.010310039263352831,
67
+ "acc_norm": 0.7334058759521219,
68
+ "acc_norm_stderr": 0.010316749863541365
69
  }
70
  },
71
  "versions": {
 
81
  "boolq": 1,
82
  "arc_easy": 0,
83
  "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b18bc4/evaluation/rankeval/2b855b18bc4_3.csv CHANGED
@@ -2,12 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.321,0.014770821817934642,0
3
  anli_r2,acc,0.348,0.01507060460376841,0
4
  anli_r3,acc,0.3283333333333333,0.013562032919529024,0
 
 
 
 
5
  boolq,acc,0.5770642201834862,0.008640558744656428,1
6
  cb,acc,0.4107142857142857,0.0663363415035954,1
7
  cb,f1,0.26246498599439777,,1
8
  copa,acc,0.75,0.04351941398892446,0
9
  hellaswag,acc,0.4303923521210914,0.00494119160731791,0
10
  hellaswag,acc_norm,0.5641306512646883,0.004948567856373856,0
 
 
11
  rte,acc,0.5306859205776173,0.03003973059219781,0
 
 
12
  storycloze_2016,acc,0.6910742918225548,0.01068485396626845,0
13
  winogrande,acc,0.5556432517758485,0.013965196769083555,0
 
2
  anli_r1,acc,0.321,0.014770821817934642,0
3
  anli_r2,acc,0.348,0.01507060460376841,0
4
  anli_r3,acc,0.3283333333333333,0.013562032919529024,0
5
+ arc_challenge,acc,0.26791808873720135,0.012942030195136438,0
6
+ arc_challenge,acc_norm,0.30631399317406144,0.013470584417276513,0
7
+ arc_easy,acc,0.6001683501683501,0.010051788039412925,0
8
+ arc_easy,acc_norm,0.5740740740740741,0.010146568651002255,0
9
  boolq,acc,0.5770642201834862,0.008640558744656428,1
10
  cb,acc,0.4107142857142857,0.0663363415035954,1
11
  cb,f1,0.26246498599439777,,1
12
  copa,acc,0.75,0.04351941398892446,0
13
  hellaswag,acc,0.4303923521210914,0.00494119160731791,0
14
  hellaswag,acc_norm,0.5641306512646883,0.004948567856373856,0
15
+ piqa,acc,0.7442872687704026,0.010178690109459857,0
16
+ piqa,acc_norm,0.7513601741022851,0.010084511234296855,0
17
  rte,acc,0.5306859205776173,0.03003973059219781,0
18
+ sciq,acc,0.888,0.009977753031397236,0
19
+ sciq,acc_norm,0.871,0.010605256784796572,0
20
  storycloze_2016,acc,0.6910742918225548,0.01068485396626845,0
21
  winogrande,acc,0.5556432517758485,0.013965196769083555,0
2b855b18bc4/evaluation/rankeval/2b855b18bc4_3.json CHANGED
@@ -42,6 +42,30 @@
42
  "boolq": {
43
  "acc": 0.5770642201834862,
44
  "acc_stderr": 0.008640558744656428
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
  },
47
  "versions": {
@@ -54,6 +78,10 @@
54
  "rte": 0,
55
  "winogrande": 0,
56
  "storycloze_2016": 0,
57
- "boolq": 1
 
 
 
 
58
  }
59
  }
 
42
  "boolq": {
43
  "acc": 0.5770642201834862,
44
  "acc_stderr": 0.008640558744656428
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6001683501683501,
48
+ "acc_stderr": 0.010051788039412925,
49
+ "acc_norm": 0.5740740740740741,
50
+ "acc_norm_stderr": 0.010146568651002255
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.26791808873720135,
54
+ "acc_stderr": 0.012942030195136438,
55
+ "acc_norm": 0.30631399317406144,
56
+ "acc_norm_stderr": 0.013470584417276513
57
+ },
58
+ "sciq": {
59
+ "acc": 0.888,
60
+ "acc_stderr": 0.009977753031397236,
61
+ "acc_norm": 0.871,
62
+ "acc_norm_stderr": 0.010605256784796572
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7442872687704026,
66
+ "acc_stderr": 0.010178690109459857,
67
+ "acc_norm": 0.7513601741022851,
68
+ "acc_norm_stderr": 0.010084511234296855
69
  }
70
  },
71
  "versions": {
 
78
  "rte": 0,
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b18bc4/evaluation/rankeval/2b855b18bc4_4.csv CHANGED
@@ -2,10 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.36,0.015186527932040122,0
3
  anli_r2,acc,0.318,0.014734079309311901,0
4
  anli_r3,acc,0.3416666666666667,0.013696658778002508,0
 
 
 
 
 
5
  cb,acc,0.42857142857142855,0.06672848092813057,1
6
  cb,f1,0.33008658008658015,,1
7
  copa,acc,0.75,0.04351941398892446,0
8
  hellaswag,acc,0.4298944433379805,0.004940490508240647,0
9
  hellaswag,acc_norm,0.5628360884285999,0.004950221546187577,0
 
 
10
  rte,acc,0.5234657039711191,0.03006330041190266,0
 
 
 
11
  winogrande,acc,0.5588003157063931,0.013954975072834736,0
 
2
  anli_r1,acc,0.36,0.015186527932040122,0
3
  anli_r2,acc,0.318,0.014734079309311901,0
4
  anli_r3,acc,0.3416666666666667,0.013696658778002508,0
5
+ arc_challenge,acc,0.26109215017064846,0.012835523909473841,0
6
+ arc_challenge,acc_norm,0.295221843003413,0.013329750293382316,0
7
+ arc_easy,acc,0.6005892255892256,0.010050018228742122,0
8
+ arc_easy,acc_norm,0.5787037037037037,0.010131882498193127,0
9
+ boolq,acc,0.5660550458715596,0.008668405003744127,1
10
  cb,acc,0.42857142857142855,0.06672848092813057,1
11
  cb,f1,0.33008658008658015,,1
12
  copa,acc,0.75,0.04351941398892446,0
13
  hellaswag,acc,0.4298944433379805,0.004940490508240647,0
14
  hellaswag,acc_norm,0.5628360884285999,0.004950221546187577,0
15
+ piqa,acc,0.7328618063112078,0.010323440492612442,0
16
+ piqa,acc_norm,0.735582154515778,0.010289787244767166,0
17
  rte,acc,0.5234657039711191,0.03006330041190266,0
18
+ sciq,acc,0.899,0.009533618929340997,0
19
+ sciq,acc_norm,0.89,0.009899393819724425,0
20
+ storycloze_2016,acc,0.6878674505611972,0.010715220346279678,0
21
  winogrande,acc,0.5588003157063931,0.013954975072834736,0
2b855b18bc4/evaluation/rankeval/2b855b18bc4_4.json CHANGED
@@ -34,6 +34,38 @@
34
  "winogrande": {
35
  "acc": 0.5588003157063931,
36
  "acc_stderr": 0.013954975072834736
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  }
38
  },
39
  "versions": {
@@ -44,6 +76,12 @@
44
  "copa": 0,
45
  "hellaswag": 0,
46
  "rte": 0,
47
- "winogrande": 0
 
 
 
 
 
 
48
  }
49
  }
 
34
  "winogrande": {
35
  "acc": 0.5588003157063931,
36
  "acc_stderr": 0.013954975072834736
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6878674505611972,
40
+ "acc_stderr": 0.010715220346279678
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5660550458715596,
44
+ "acc_stderr": 0.008668405003744127
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6005892255892256,
48
+ "acc_stderr": 0.010050018228742122,
49
+ "acc_norm": 0.5787037037037037,
50
+ "acc_norm_stderr": 0.010131882498193127
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.26109215017064846,
54
+ "acc_stderr": 0.012835523909473841,
55
+ "acc_norm": 0.295221843003413,
56
+ "acc_norm_stderr": 0.013329750293382316
57
+ },
58
+ "sciq": {
59
+ "acc": 0.899,
60
+ "acc_stderr": 0.009533618929340997,
61
+ "acc_norm": 0.89,
62
+ "acc_norm_stderr": 0.009899393819724425
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7328618063112078,
66
+ "acc_stderr": 0.010323440492612442,
67
+ "acc_norm": 0.735582154515778,
68
+ "acc_norm_stderr": 0.010289787244767166
69
  }
70
  },
71
  "versions": {
 
76
  "copa": 0,
77
  "hellaswag": 0,
78
  "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b18bc4/evaluation/rankeval/2b855b18bc4_5.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.327,0.014842213153411249,0
3
  anli_r2,acc,0.324,0.014806864733738854,0
4
  anli_r3,acc,0.3175,0.013443538681348054,0
 
 
 
 
 
5
  cb,acc,0.5178571428571429,0.06737697508644648,1
6
  cb,f1,0.33564993564993567,,1
7
  copa,acc,0.76,0.04292346959909283,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.327,0.014842213153411249,0
3
  anli_r2,acc,0.324,0.014806864733738854,0
4
  anli_r3,acc,0.3175,0.013443538681348054,0
5
+ arc_challenge,acc,0.2696245733788396,0.01296804068686915,0
6
+ arc_challenge,acc_norm,0.2883959044368601,0.013238394422428175,0
7
+ arc_easy,acc,0.5951178451178452,0.010072423960395701,0
8
+ arc_easy,acc_norm,0.5803872053872053,0.010126315840891536,0
9
+ boolq,acc,0.5636085626911315,0.008674000467432073,1
10
  cb,acc,0.5178571428571429,0.06737697508644648,1
11
  cb,f1,0.33564993564993567,,1
12
  copa,acc,0.76,0.04292346959909283,0
13
+ hellaswag,acc,0.43158733320055764,0.004942853459371548,0
14
+ hellaswag,acc_norm,0.5655247958573989,0.004946748608271348,0
15
+ piqa,acc,0.7328618063112078,0.010323440492612437,0
16
+ piqa,acc_norm,0.7470076169749728,0.010142888698862453,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.903,0.009363689373248111,0
19
+ sciq,acc_norm,0.901,0.009449248027662747,0
20
+ storycloze_2016,acc,0.6889363976483164,0.010705164869803167,0
21
+ winogrande,acc,0.5564325177584846,0.0139626949076204,0
2b855b18bc4/evaluation/rankeval/2b855b18bc4_5.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.76,
22
  "acc_stderr": 0.04292346959909283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.76,
22
  "acc_stderr": 0.04292346959909283
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.43158733320055764,
26
+ "acc_stderr": 0.004942853459371548,
27
+ "acc_norm": 0.5655247958573989,
28
+ "acc_norm_stderr": 0.004946748608271348
29
+ },
30
+ "rte": {
31
+ "acc": 0.5270758122743683,
32
+ "acc_stderr": 0.030052303463143706
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5564325177584846,
36
+ "acc_stderr": 0.0139626949076204
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6889363976483164,
40
+ "acc_stderr": 0.010705164869803167
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5636085626911315,
44
+ "acc_stderr": 0.008674000467432073
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5951178451178452,
48
+ "acc_stderr": 0.010072423960395701,
49
+ "acc_norm": 0.5803872053872053,
50
+ "acc_norm_stderr": 0.010126315840891536
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2696245733788396,
54
+ "acc_stderr": 0.01296804068686915,
55
+ "acc_norm": 0.2883959044368601,
56
+ "acc_norm_stderr": 0.013238394422428175
57
+ },
58
+ "sciq": {
59
+ "acc": 0.903,
60
+ "acc_stderr": 0.009363689373248111,
61
+ "acc_norm": 0.901,
62
+ "acc_norm_stderr": 0.009449248027662747
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7328618063112078,
66
+ "acc_stderr": 0.010323440492612437,
67
+ "acc_norm": 0.7470076169749728,
68
+ "acc_norm_stderr": 0.010142888698862453
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b28bc4/evaluation/rankeval/2b855b28bc4_2.csv CHANGED
@@ -12,6 +12,8 @@ cb,f1,0.2880105401844532,,1
12
  copa,acc,0.75,0.04351941398892446,0
13
  hellaswag,acc,0.43308105954989046,0.00494488954549795,0
14
  hellaswag,acc_norm,0.5610436168094005,0.004952454721934803,0
 
 
15
  rte,acc,0.5270758122743683,0.030052303463143713,0
16
  sciq,acc,0.872,0.010570133761108654,0
17
  sciq,acc_norm,0.849,0.011328165223341673,0
 
12
  copa,acc,0.75,0.04351941398892446,0
13
  hellaswag,acc,0.43308105954989046,0.00494488954549795,0
14
  hellaswag,acc_norm,0.5610436168094005,0.004952454721934803,0
15
+ piqa,acc,0.7480957562568009,0.010128421335088681,0
16
+ piqa,acc_norm,0.7459194776931447,0.010157271999135046,0
17
  rte,acc,0.5270758122743683,0.030052303463143713,0
18
  sciq,acc,0.872,0.010570133761108654,0
19
  sciq,acc_norm,0.849,0.011328165223341673,0
2b855b28bc4/evaluation/rankeval/2b855b28bc4_2.json CHANGED
@@ -60,6 +60,12 @@
60
  "acc_stderr": 0.010570133761108654,
61
  "acc_norm": 0.849,
62
  "acc_norm_stderr": 0.011328165223341673
 
 
 
 
 
 
63
  }
64
  },
65
  "versions": {
@@ -75,6 +81,7 @@
75
  "boolq": 1,
76
  "arc_easy": 0,
77
  "arc_challenge": 0,
78
- "sciq": 0
 
79
  }
80
  }
 
60
  "acc_stderr": 0.010570133761108654,
61
  "acc_norm": 0.849,
62
  "acc_norm_stderr": 0.011328165223341673
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7480957562568009,
66
+ "acc_stderr": 0.010128421335088681,
67
+ "acc_norm": 0.7459194776931447,
68
+ "acc_norm_stderr": 0.010157271999135046
69
  }
70
  },
71
  "versions": {
 
81
  "boolq": 1,
82
  "arc_easy": 0,
83
  "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b28bc4/evaluation/rankeval/2b855b28bc4_3.csv CHANGED
@@ -2,11 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.319,0.014746404865473468,0
3
  anli_r2,acc,0.347,0.01506047203170662,0
4
  anli_r3,acc,0.3333333333333333,0.013613950010225594,0
 
 
 
 
 
5
  cb,acc,0.4107142857142857,0.0663363415035954,1
6
  cb,f1,0.3536644846617893,,1
7
  copa,acc,0.81,0.03942772444036623,0
8
  hellaswag,acc,0.4311890061740689,0.0049423027680021055,0
9
  hellaswag,acc_norm,0.5632344154550887,0.004949716368890495,0
 
 
10
  rte,acc,0.5270758122743683,0.030052303463143706,0
 
 
11
  storycloze_2016,acc,0.6996258685195083,0.010600915927985028,0
12
  winogrande,acc,0.5461720599842147,0.013992441563707068,0
 
2
  anli_r1,acc,0.319,0.014746404865473468,0
3
  anli_r2,acc,0.347,0.01506047203170662,0
4
  anli_r3,acc,0.3333333333333333,0.013613950010225594,0
5
+ arc_challenge,acc,0.2568259385665529,0.012766923794116801,0
6
+ arc_challenge,acc_norm,0.2977815699658703,0.013363080107244489,0
7
+ arc_easy,acc,0.5883838383838383,0.01009821864671491,0
8
+ arc_easy,acc_norm,0.563973063973064,0.010175459582759738,0
9
+ boolq,acc,0.6045871559633027,0.008551600109082895,1
10
  cb,acc,0.4107142857142857,0.0663363415035954,1
11
  cb,f1,0.3536644846617893,,1
12
  copa,acc,0.81,0.03942772444036623,0
13
  hellaswag,acc,0.4311890061740689,0.0049423027680021055,0
14
  hellaswag,acc_norm,0.5632344154550887,0.004949716368890495,0
15
+ piqa,acc,0.7442872687704026,0.010178690109459862,0
16
+ piqa,acc_norm,0.7524483133841132,0.010069703966857116,0
17
  rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.876,0.010427498872343961,0
19
+ sciq,acc_norm,0.855,0.011139977517890132,0
20
  storycloze_2016,acc,0.6996258685195083,0.010600915927985028,0
21
  winogrande,acc,0.5461720599842147,0.013992441563707068,0
2b855b28bc4/evaluation/rankeval/2b855b28bc4_3.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.6996258685195083,
40
  "acc_stderr": 0.010600915927985028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.6996258685195083,
40
  "acc_stderr": 0.010600915927985028
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6045871559633027,
44
+ "acc_stderr": 0.008551600109082895
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5883838383838383,
48
+ "acc_stderr": 0.01009821864671491,
49
+ "acc_norm": 0.563973063973064,
50
+ "acc_norm_stderr": 0.010175459582759738
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2568259385665529,
54
+ "acc_stderr": 0.012766923794116801,
55
+ "acc_norm": 0.2977815699658703,
56
+ "acc_norm_stderr": 0.013363080107244489
57
+ },
58
+ "sciq": {
59
+ "acc": 0.876,
60
+ "acc_stderr": 0.010427498872343961,
61
+ "acc_norm": 0.855,
62
+ "acc_norm_stderr": 0.011139977517890132
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7442872687704026,
66
+ "acc_stderr": 0.010178690109459862,
67
+ "acc_norm": 0.7524483133841132,
68
+ "acc_norm_stderr": 0.010069703966857116
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b28bc4/evaluation/rankeval/2b855b28bc4_4.csv CHANGED
@@ -2,11 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.359,0.015177264224798592,0
3
  anli_r2,acc,0.349,0.0150806639915631,0
4
  anli_r3,acc,0.3383333333333333,0.013664144006618275,0
 
 
 
 
 
5
  cb,acc,0.44642857142857145,0.067031892279424,1
6
  cb,f1,0.3171262699564586,,1
7
  copa,acc,0.8,0.040201512610368445,0
8
  hellaswag,acc,0.42869946225851424,0.004938787067611805,0
9
  hellaswag,acc_norm,0.5677155945030871,0.004943809330692693,0
 
 
10
  rte,acc,0.5090252707581228,0.030091559826331334,0
 
 
11
  storycloze_2016,acc,0.6969535008017104,0.010627613073376717,0
12
  winogrande,acc,0.5627466456195738,0.013941393310695918,0
 
2
  anli_r1,acc,0.359,0.015177264224798592,0
3
  anli_r2,acc,0.349,0.0150806639915631,0
4
  anli_r3,acc,0.3383333333333333,0.013664144006618275,0
5
+ arc_challenge,acc,0.2645051194539249,0.012889272949313366,0
6
+ arc_challenge,acc_norm,0.302901023890785,0.013428241573185349,0
7
+ arc_easy,acc,0.5968013468013468,0.01006566857679479,0
8
+ arc_easy,acc_norm,0.5585016835016835,0.010189314382749936,0
9
+ boolq,acc,0.6039755351681957,0.008553881336813413,1
10
  cb,acc,0.44642857142857145,0.067031892279424,1
11
  cb,f1,0.3171262699564586,,1
12
  copa,acc,0.8,0.040201512610368445,0
13
  hellaswag,acc,0.42869946225851424,0.004938787067611805,0
14
  hellaswag,acc_norm,0.5677155945030871,0.004943809330692693,0
15
+ piqa,acc,0.7388465723612623,0.010248738649935587,0
16
+ piqa,acc_norm,0.7393906420021763,0.010241826155811632,0
17
  rte,acc,0.5090252707581228,0.030091559826331334,0
18
+ sciq,acc,0.889,0.009938701010583726,0
19
+ sciq,acc_norm,0.87,0.01064016979249935,0
20
  storycloze_2016,acc,0.6969535008017104,0.010627613073376717,0
21
  winogrande,acc,0.5627466456195738,0.013941393310695918,0
2b855b28bc4/evaluation/rankeval/2b855b28bc4_4.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.6969535008017104,
40
  "acc_stderr": 0.010627613073376717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.6969535008017104,
40
  "acc_stderr": 0.010627613073376717
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6039755351681957,
44
+ "acc_stderr": 0.008553881336813413
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5968013468013468,
48
+ "acc_stderr": 0.01006566857679479,
49
+ "acc_norm": 0.5585016835016835,
50
+ "acc_norm_stderr": 0.010189314382749936
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2645051194539249,
54
+ "acc_stderr": 0.012889272949313366,
55
+ "acc_norm": 0.302901023890785,
56
+ "acc_norm_stderr": 0.013428241573185349
57
+ },
58
+ "sciq": {
59
+ "acc": 0.889,
60
+ "acc_stderr": 0.009938701010583726,
61
+ "acc_norm": 0.87,
62
+ "acc_norm_stderr": 0.01064016979249935
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7388465723612623,
66
+ "acc_stderr": 0.010248738649935587,
67
+ "acc_norm": 0.7393906420021763,
68
+ "acc_norm_stderr": 0.010241826155811632
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b28bc4/evaluation/rankeval/2b855b28bc4_5.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.348,0.015070604603768408,0
3
  anli_r2,acc,0.325,0.014818724459095527,0
4
  anli_r3,acc,0.3458333333333333,0.013736245342311012,0
 
 
 
 
 
5
  cb,acc,0.42857142857142855,0.06672848092813057,1
6
  cb,f1,0.3018867924528302,,1
7
  copa,acc,0.77,0.04229525846816505,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.348,0.015070604603768408,0
3
  anli_r2,acc,0.325,0.014818724459095527,0
4
  anli_r3,acc,0.3458333333333333,0.013736245342311012,0
5
+ arc_challenge,acc,0.26023890784982934,0.012821930225112563,0
6
+ arc_challenge,acc_norm,0.295221843003413,0.013329750293382316,0
7
+ arc_easy,acc,0.5942760942760943,0.010075755540128876,0
8
+ arc_easy,acc_norm,0.5736531986531986,0.010147858603835144,0
9
+ boolq,acc,0.6110091743119266,0.008526800159503202,1
10
  cb,acc,0.42857142857142855,0.06672848092813057,1
11
  cb,f1,0.3018867924528302,,1
12
  copa,acc,0.77,0.04229525846816505,0
13
+ hellaswag,acc,0.4307906791475802,0.004941748817682304,0
14
+ hellaswag,acc_norm,0.5699063931487751,0.0049407715594755024,0
15
+ piqa,acc,0.7383025027203483,0.01025563077270823,0
16
+ piqa,acc_norm,0.7377584330794341,0.010262502565172442,0
17
+ rte,acc,0.5523465703971119,0.029931070362939526,0
18
+ sciq,acc,0.89,0.009899393819724439,0
19
+ sciq,acc_norm,0.876,0.01042749887234396,0
20
+ storycloze_2016,acc,0.6969535008017104,0.01062761307337672,0
21
+ winogrande,acc,0.5469613259668509,0.0139903666321481,0
2b855b28bc4/evaluation/rankeval/2b855b28bc4_5.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.77,
22
  "acc_stderr": 0.04229525846816505
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.77,
22
  "acc_stderr": 0.04229525846816505
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4307906791475802,
26
+ "acc_stderr": 0.004941748817682304,
27
+ "acc_norm": 0.5699063931487751,
28
+ "acc_norm_stderr": 0.0049407715594755024
29
+ },
30
+ "rte": {
31
+ "acc": 0.5523465703971119,
32
+ "acc_stderr": 0.029931070362939526
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5469613259668509,
36
+ "acc_stderr": 0.0139903666321481
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6969535008017104,
40
+ "acc_stderr": 0.01062761307337672
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6110091743119266,
44
+ "acc_stderr": 0.008526800159503202
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5942760942760943,
48
+ "acc_stderr": 0.010075755540128876,
49
+ "acc_norm": 0.5736531986531986,
50
+ "acc_norm_stderr": 0.010147858603835144
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.26023890784982934,
54
+ "acc_stderr": 0.012821930225112563,
55
+ "acc_norm": 0.295221843003413,
56
+ "acc_norm_stderr": 0.013329750293382316
57
+ },
58
+ "sciq": {
59
+ "acc": 0.89,
60
+ "acc_stderr": 0.009899393819724439,
61
+ "acc_norm": 0.876,
62
+ "acc_norm_stderr": 0.01042749887234396
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7383025027203483,
66
+ "acc_stderr": 0.01025563077270823,
67
+ "acc_norm": 0.7377584330794341,
68
+ "acc_norm_stderr": 0.010262502565172442
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b55bc4/evaluation/rankeval/2b855b55bc4_2.csv CHANGED
@@ -12,6 +12,8 @@ cb,f1,0.29390681003584224,,1
12
  copa,acc,0.8,0.040201512610368445,0
13
  hellaswag,acc,0.42949611631149176,0.004939925958728869,0
14
  hellaswag,acc_norm,0.5636327424815774,0.004949207947265914,0
 
 
15
  rte,acc,0.5306859205776173,0.030039730592197812,0
16
  sciq,acc,0.886,0.010055103435823332,0
17
  sciq,acc_norm,0.873,0.010534798620855762,0
 
12
  copa,acc,0.8,0.040201512610368445,0
13
  hellaswag,acc,0.42949611631149176,0.004939925958728869,0
14
  hellaswag,acc_norm,0.5636327424815774,0.004949207947265914,0
15
+ piqa,acc,0.736126224156692,0.010282996367695562,0
16
+ piqa,acc_norm,0.7377584330794341,0.010262502565172443,0
17
  rte,acc,0.5306859205776173,0.030039730592197812,0
18
  sciq,acc,0.886,0.010055103435823332,0
19
  sciq,acc_norm,0.873,0.010534798620855762,0
2b855b55bc4/evaluation/rankeval/2b855b55bc4_2.json CHANGED
@@ -60,6 +60,12 @@
60
  "acc_stderr": 0.010055103435823332,
61
  "acc_norm": 0.873,
62
  "acc_norm_stderr": 0.010534798620855762
 
 
 
 
 
 
63
  }
64
  },
65
  "versions": {
@@ -75,6 +81,7 @@
75
  "boolq": 1,
76
  "arc_easy": 0,
77
  "arc_challenge": 0,
78
- "sciq": 0
 
79
  }
80
  }
 
60
  "acc_stderr": 0.010055103435823332,
61
  "acc_norm": 0.873,
62
  "acc_norm_stderr": 0.010534798620855762
63
+ },
64
+ "piqa": {
65
+ "acc": 0.736126224156692,
66
+ "acc_stderr": 0.010282996367695562,
67
+ "acc_norm": 0.7377584330794341,
68
+ "acc_norm_stderr": 0.010262502565172443
69
  }
70
  },
71
  "versions": {
 
81
  "boolq": 1,
82
  "arc_easy": 0,
83
  "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b55bc4/evaluation/rankeval/2b855b55bc4_3.csv CHANGED
@@ -2,11 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.328,0.01485384248727033,0
3
  anli_r2,acc,0.364,0.015222868840522022,0
4
  anli_r3,acc,0.3516666666666667,0.013789711695404785,0
 
 
 
 
 
5
  cb,acc,0.48214285714285715,0.0673769750864465,1
6
  cb,f1,0.4085858585858586,,1
7
  copa,acc,0.79,0.040936018074033256,0
8
  hellaswag,acc,0.43069109739095796,0.004941609820763584,0
9
  hellaswag,acc_norm,0.5651264688309102,0.004947272454226218,0
 
 
10
  rte,acc,0.5595667870036101,0.029882123363118712,0
 
 
11
  storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0
12
  winogrande,acc,0.5706393054459353,0.013911537499969158,0
 
2
  anli_r1,acc,0.328,0.01485384248727033,0
3
  anli_r2,acc,0.364,0.015222868840522022,0
4
  anli_r3,acc,0.3516666666666667,0.013789711695404785,0
5
+ arc_challenge,acc,0.257679180887372,0.012780770562768402,0
6
+ arc_challenge,acc_norm,0.2738907849829352,0.013032004972989501,0
7
+ arc_easy,acc,0.5829124579124579,0.010117738967781995,0
8
+ arc_easy,acc_norm,0.5782828282828283,0.010133255284012316,0
9
+ boolq,acc,0.6070336391437309,0.008542335147970571,1
10
  cb,acc,0.48214285714285715,0.0673769750864465,1
11
  cb,f1,0.4085858585858586,,1
12
  copa,acc,0.79,0.040936018074033256,0
13
  hellaswag,acc,0.43069109739095796,0.004941609820763584,0
14
  hellaswag,acc_norm,0.5651264688309102,0.004947272454226218,0
15
+ piqa,acc,0.7372143634385201,0.010269354068140767,0
16
+ piqa,acc_norm,0.7415669205658324,0.01021397163677331,0
17
  rte,acc,0.5595667870036101,0.029882123363118712,0
18
+ sciq,acc,0.893,0.009779910359847167,0
19
+ sciq,acc_norm,0.884,0.010131468138756998,0
20
  storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0
21
  winogrande,acc,0.5706393054459353,0.013911537499969158,0
2b855b55bc4/evaluation/rankeval/2b855b55bc4_3.json CHANGED
@@ -42,6 +42,30 @@
42
  "boolq": {
43
  "acc": 0.6070336391437309,
44
  "acc_stderr": 0.008542335147970571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
  },
47
  "versions": {
@@ -54,6 +78,10 @@
54
  "rte": 0,
55
  "winogrande": 0,
56
  "storycloze_2016": 0,
57
- "boolq": 1
 
 
 
 
58
  }
59
  }
 
42
  "boolq": {
43
  "acc": 0.6070336391437309,
44
  "acc_stderr": 0.008542335147970571
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5829124579124579,
48
+ "acc_stderr": 0.010117738967781995,
49
+ "acc_norm": 0.5782828282828283,
50
+ "acc_norm_stderr": 0.010133255284012316
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.257679180887372,
54
+ "acc_stderr": 0.012780770562768402,
55
+ "acc_norm": 0.2738907849829352,
56
+ "acc_norm_stderr": 0.013032004972989501
57
+ },
58
+ "sciq": {
59
+ "acc": 0.893,
60
+ "acc_stderr": 0.009779910359847167,
61
+ "acc_norm": 0.884,
62
+ "acc_norm_stderr": 0.010131468138756998
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7372143634385201,
66
+ "acc_stderr": 0.010269354068140767,
67
+ "acc_norm": 0.7415669205658324,
68
+ "acc_norm_stderr": 0.01021397163677331
69
  }
70
  },
71
  "versions": {
 
78
  "rte": 0,
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b55bc4/evaluation/rankeval/2b855b55bc4_3_lm-eval_global_step52452_2023-01-31-17-30-37_3shots_backup.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.328,
5
- "acc_stderr": 0.01485384248727033
6
- },
7
- "anli_r2": {
8
- "acc": 0.364,
9
- "acc_stderr": 0.015222868840522022
10
- },
11
- "anli_r3": {
12
- "acc": 0.3516666666666667,
13
- "acc_stderr": 0.013789711695404785
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.4085858585858586
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.43069109739095796,
26
- "acc_stderr": 0.004941609820763584,
27
- "acc_norm": 0.5651264688309102,
28
- "acc_norm_stderr": 0.004947272454226218
29
- },
30
- "rte": {
31
- "acc": 0.5595667870036101,
32
- "acc_stderr": 0.029882123363118712
33
- },
34
- "winogrande": {
35
- "acc": 0.5706393054459353,
36
- "acc_stderr": 0.013911537499969158
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6932121859967931,
40
- "acc_stderr": 0.010664275190473634
41
- },
42
- "boolq": {
43
- "acc": 0.6070336391437309,
44
- "acc_stderr": 0.008542335147970571
45
- }
46
- },
47
- "versions": {
48
- "anli_r1": 0,
49
- "anli_r2": 0,
50
- "anli_r3": 0,
51
- "cb": 1,
52
- "copa": 0,
53
- "hellaswag": 0,
54
- "rte": 0,
55
- "winogrande": 0,
56
- "storycloze_2016": 0,
57
- "boolq": 1
58
- }
59
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b855b55bc4/evaluation/rankeval/2b855b55bc4_4.csv CHANGED
@@ -2,11 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.356,0.015149042659306626,0
3
  anli_r2,acc,0.327,0.014842213153411247,0
4
  anli_r3,acc,0.35083333333333333,0.013782212417178193,0
 
 
 
 
 
5
  cb,acc,0.4642857142857143,0.06724777654937658,1
6
  cb,f1,0.3249662618083671,,1
7
  copa,acc,0.79,0.040936018074033256,0
8
  hellaswag,acc,0.4312885879306911,0.004942440746328494,0
9
  hellaswag,acc_norm,0.5671181039633539,0.004944620712318273,0
 
 
10
  rte,acc,0.5523465703971119,0.029931070362939526,0
 
 
11
  storycloze_2016,acc,0.6980224478888295,0.010616985436073357,0
12
  winogrande,acc,0.56353591160221,0.013938569465677019,0
 
2
  anli_r1,acc,0.356,0.015149042659306626,0
3
  anli_r2,acc,0.327,0.014842213153411247,0
4
  anli_r3,acc,0.35083333333333333,0.013782212417178193,0
5
+ arc_challenge,acc,0.24232081911262798,0.012521593295800116,0
6
+ arc_challenge,acc_norm,0.2815699658703072,0.013143376735009022,0
7
+ arc_easy,acc,0.5820707070707071,0.010120628211017888,0
8
+ arc_easy,acc_norm,0.5698653198653199,0.010159130445178511,0
9
+ boolq,acc,0.6131498470948012,0.008518188340844748,1
10
  cb,acc,0.4642857142857143,0.06724777654937658,1
11
  cb,f1,0.3249662618083671,,1
12
  copa,acc,0.79,0.040936018074033256,0
13
  hellaswag,acc,0.4312885879306911,0.004942440746328494,0
14
  hellaswag,acc_norm,0.5671181039633539,0.004944620712318273,0
15
+ piqa,acc,0.7388465723612623,0.010248738649935574,0
16
+ piqa,acc_norm,0.7415669205658324,0.010213971636773306,0
17
  rte,acc,0.5523465703971119,0.029931070362939526,0
18
+ sciq,acc,0.891,0.009859828407037186,0
19
+ sciq,acc_norm,0.895,0.00969892102602495,0
20
  storycloze_2016,acc,0.6980224478888295,0.010616985436073357,0
21
  winogrande,acc,0.56353591160221,0.013938569465677019,0
2b855b55bc4/evaluation/rankeval/2b855b55bc4_4.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.6980224478888295,
40
  "acc_stderr": 0.010616985436073357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.6980224478888295,
40
  "acc_stderr": 0.010616985436073357
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6131498470948012,
44
+ "acc_stderr": 0.008518188340844748
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5820707070707071,
48
+ "acc_stderr": 0.010120628211017888,
49
+ "acc_norm": 0.5698653198653199,
50
+ "acc_norm_stderr": 0.010159130445178511
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.24232081911262798,
54
+ "acc_stderr": 0.012521593295800116,
55
+ "acc_norm": 0.2815699658703072,
56
+ "acc_norm_stderr": 0.013143376735009022
57
+ },
58
+ "sciq": {
59
+ "acc": 0.891,
60
+ "acc_stderr": 0.009859828407037186,
61
+ "acc_norm": 0.895,
62
+ "acc_norm_stderr": 0.00969892102602495
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7388465723612623,
66
+ "acc_stderr": 0.010248738649935574,
67
+ "acc_norm": 0.7415669205658324,
68
+ "acc_norm_stderr": 0.010213971636773306
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b55bc4/evaluation/rankeval/2b855b55bc4_5.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.354,0.015129868238451773,0
3
  anli_r2,acc,0.334,0.014922019523732965,0
4
  anli_r3,acc,0.3525,0.013797164918918355,0
 
 
 
 
 
5
  cb,acc,0.5178571428571429,0.06737697508644647,1
6
  cb,f1,0.34887334887334887,,1
7
  copa,acc,0.75,0.04351941398892446,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.354,0.015129868238451773,0
3
  anli_r2,acc,0.334,0.014922019523732965,0
4
  anli_r3,acc,0.3525,0.013797164918918355,0
5
+ arc_challenge,acc,0.24744027303754265,0.012610352663292673,0
6
+ arc_challenge,acc_norm,0.27559726962457337,0.013057169655761838,0
7
+ arc_easy,acc,0.5867003367003367,0.010104361780747513,0
8
+ arc_easy,acc_norm,0.5774410774410774,0.010135978222981075,0
9
+ boolq,acc,0.6128440366972477,0.008519429207594416,1
10
  cb,acc,0.5178571428571429,0.06737697508644647,1
11
  cb,f1,0.34887334887334887,,1
12
  copa,acc,0.75,0.04351941398892446,0
13
+ hellaswag,acc,0.4297948615813583,0.004940349676769321,0
14
+ hellaswag,acc_norm,0.5647281418044214,0.004947793051042662,0
15
+ piqa,acc,0.733949945593036,0.010310039263352831,0
16
+ piqa,acc_norm,0.7421109902067464,0.010206956662056232,0
17
+ rte,acc,0.5523465703971119,0.029931070362939526,0
18
+ sciq,acc,0.901,0.009449248027662732,0
19
+ sciq,acc_norm,0.912,0.008963053962592085,0
20
+ storycloze_2016,acc,0.6905398182789952,0.010689956745189072,0
21
+ winogrande,acc,0.5611681136543015,0.013946933444507032,0
2b855b55bc4/evaluation/rankeval/2b855b55bc4_5.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.75,
22
  "acc_stderr": 0.04351941398892446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.75,
22
  "acc_stderr": 0.04351941398892446
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4297948615813583,
26
+ "acc_stderr": 0.004940349676769321,
27
+ "acc_norm": 0.5647281418044214,
28
+ "acc_norm_stderr": 0.004947793051042662
29
+ },
30
+ "rte": {
31
+ "acc": 0.5523465703971119,
32
+ "acc_stderr": 0.029931070362939526
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5611681136543015,
36
+ "acc_stderr": 0.013946933444507032
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6905398182789952,
40
+ "acc_stderr": 0.010689956745189072
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6128440366972477,
44
+ "acc_stderr": 0.008519429207594416
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5867003367003367,
48
+ "acc_stderr": 0.010104361780747513,
49
+ "acc_norm": 0.5774410774410774,
50
+ "acc_norm_stderr": 0.010135978222981075
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.24744027303754265,
54
+ "acc_stderr": 0.012610352663292673,
55
+ "acc_norm": 0.27559726962457337,
56
+ "acc_norm_stderr": 0.013057169655761838
57
+ },
58
+ "sciq": {
59
+ "acc": 0.901,
60
+ "acc_stderr": 0.009449248027662732,
61
+ "acc_norm": 0.912,
62
+ "acc_norm_stderr": 0.008963053962592085
63
+ },
64
+ "piqa": {
65
+ "acc": 0.733949945593036,
66
+ "acc_stderr": 0.010310039263352831,
67
+ "acc_norm": 0.7421109902067464,
68
+ "acc_norm_stderr": 0.010206956662056232
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4/evaluation/rankeval/2b855b9bc4_2.csv CHANGED
@@ -12,6 +12,8 @@ cb,f1,0.24454009245974814,,1
12
  copa,acc,0.76,0.04292346959909283,0
13
  hellaswag,acc,0.4312885879306911,0.004942440746328495,0
14
  hellaswag,acc_norm,0.55646285600478,0.004957863944093132,0
 
 
15
  rte,acc,0.516245487364621,0.030080573208738064,0
16
  sciq,acc,0.88,0.01028132801274739,0
17
  sciq,acc_norm,0.843,0.011510146979230189,0
 
12
  copa,acc,0.76,0.04292346959909283,0
13
  hellaswag,acc,0.4312885879306911,0.004942440746328495,0
14
  hellaswag,acc_norm,0.55646285600478,0.004957863944093132,0
15
+ piqa,acc,0.7301414581066377,0.010356595421852197,0
16
+ piqa,acc_norm,0.735038084874864,0.010296557993316049,0
17
  rte,acc,0.516245487364621,0.030080573208738064,0
18
  sciq,acc,0.88,0.01028132801274739,0
19
  sciq,acc_norm,0.843,0.011510146979230189,0
2b855b9bc4/evaluation/rankeval/2b855b9bc4_2.json CHANGED
@@ -60,6 +60,12 @@
60
  "acc_stderr": 0.01028132801274739,
61
  "acc_norm": 0.843,
62
  "acc_norm_stderr": 0.011510146979230189
 
 
 
 
 
 
63
  }
64
  },
65
  "versions": {
@@ -75,6 +81,7 @@
75
  "boolq": 1,
76
  "arc_easy": 0,
77
  "arc_challenge": 0,
78
- "sciq": 0
 
79
  }
80
  }
 
60
  "acc_stderr": 0.01028132801274739,
61
  "acc_norm": 0.843,
62
  "acc_norm_stderr": 0.011510146979230189
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7301414581066377,
66
+ "acc_stderr": 0.010356595421852197,
67
+ "acc_norm": 0.735038084874864,
68
+ "acc_norm_stderr": 0.010296557993316049
69
  }
70
  },
71
  "versions": {
 
81
  "boolq": 1,
82
  "arc_easy": 0,
83
  "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4/evaluation/rankeval/2b855b9bc4_3.csv CHANGED
@@ -2,12 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.322,0.014782913600996664,0
3
  anli_r2,acc,0.353,0.015120172605483689,0
4
  anli_r3,acc,0.3333333333333333,0.013613950010225593,0
 
 
 
 
5
  boolq,acc,0.5834862385321101,0.008622288020674003,1
6
  cb,acc,0.375,0.06527912098338669,1
7
  cb,f1,0.34540644540644544,,1
8
  copa,acc,0.77,0.04229525846816506,0
9
  hellaswag,acc,0.4303923521210914,0.004941191607317909,0
10
  hellaswag,acc_norm,0.5595498904600678,0.004954265595373475,0
 
 
11
  rte,acc,0.49458483754512633,0.030094698123239966,0
 
 
12
  storycloze_2016,acc,0.6905398182789952,0.01068995674518907,0
13
  winogrande,acc,0.5390686661404893,0.014009521680980316,0
 
2
  anli_r1,acc,0.322,0.014782913600996664,0
3
  anli_r2,acc,0.353,0.015120172605483689,0
4
  anli_r3,acc,0.3333333333333333,0.013613950010225593,0
5
+ arc_challenge,acc,0.2525597269624573,0.012696728980207706,0
6
+ arc_challenge,acc_norm,0.28242320819112626,0.013155456884097222,0
7
+ arc_easy,acc,0.5765993265993266,0.010138671005289045,0
8
+ arc_easy,acc_norm,0.5517676767676768,0.010204645126856942,0
9
  boolq,acc,0.5834862385321101,0.008622288020674003,1
10
  cb,acc,0.375,0.06527912098338669,1
11
  cb,f1,0.34540644540644544,,1
12
  copa,acc,0.77,0.04229525846816506,0
13
  hellaswag,acc,0.4303923521210914,0.004941191607317909,0
14
  hellaswag,acc_norm,0.5595498904600678,0.004954265595373475,0
15
+ piqa,acc,0.7377584330794341,0.010262502565172449,0
16
+ piqa,acc_norm,0.7475516866158868,0.010135665547362355,0
17
  rte,acc,0.49458483754512633,0.030094698123239966,0
18
+ sciq,acc,0.881,0.01024421514533666,0
19
+ sciq,acc_norm,0.856,0.01110798754893915,0
20
  storycloze_2016,acc,0.6905398182789952,0.01068995674518907,0
21
  winogrande,acc,0.5390686661404893,0.014009521680980316,0
2b855b9bc4/evaluation/rankeval/2b855b9bc4_3.json CHANGED
@@ -42,6 +42,30 @@
42
  "boolq": {
43
  "acc": 0.5834862385321101,
44
  "acc_stderr": 0.008622288020674003
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
  },
47
  "versions": {
@@ -54,6 +78,10 @@
54
  "rte": 0,
55
  "winogrande": 0,
56
  "storycloze_2016": 0,
57
- "boolq": 1
 
 
 
 
58
  }
59
  }
 
42
  "boolq": {
43
  "acc": 0.5834862385321101,
44
  "acc_stderr": 0.008622288020674003
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5765993265993266,
48
+ "acc_stderr": 0.010138671005289045,
49
+ "acc_norm": 0.5517676767676768,
50
+ "acc_norm_stderr": 0.010204645126856942
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2525597269624573,
54
+ "acc_stderr": 0.012696728980207706,
55
+ "acc_norm": 0.28242320819112626,
56
+ "acc_norm_stderr": 0.013155456884097222
57
+ },
58
+ "sciq": {
59
+ "acc": 0.881,
60
+ "acc_stderr": 0.01024421514533666,
61
+ "acc_norm": 0.856,
62
+ "acc_norm_stderr": 0.01110798754893915
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7377584330794341,
66
+ "acc_stderr": 0.010262502565172449,
67
+ "acc_norm": 0.7475516866158868,
68
+ "acc_norm_stderr": 0.010135665547362355
69
  }
70
  },
71
  "versions": {
 
78
  "rte": 0,
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4/evaluation/rankeval/2b855b9bc4_4.csv CHANGED
@@ -2,11 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.348,0.01507060460376841,0
3
  anli_r2,acc,0.346,0.015050266127564448,0
4
  anli_r3,acc,0.355,0.013819249004047308,0
 
 
 
 
 
5
  cb,acc,0.30357142857142855,0.06199938655510754,1
6
  cb,f1,0.24443052074631022,,1
7
  copa,acc,0.77,0.04229525846816506,0
8
  hellaswag,acc,0.43158733320055764,0.00494285345937155,0
9
  hellaswag,acc_norm,0.5638319059948218,0.004948952519517514,0
 
 
10
  rte,acc,0.48736462093862815,0.030086851767188564,0
 
 
11
  storycloze_2016,acc,0.692143238909674,0.010674598158758179,0
12
  winogrande,acc,0.5469613259668509,0.013990366632148088,0
 
2
  anli_r1,acc,0.348,0.01507060460376841,0
3
  anli_r2,acc,0.346,0.015050266127564448,0
4
  anli_r3,acc,0.355,0.013819249004047308,0
5
+ arc_challenge,acc,0.26535836177474403,0.012902554762313964,0
6
+ arc_challenge,acc_norm,0.2977815699658703,0.013363080107244487,0
7
+ arc_easy,acc,0.5774410774410774,0.01013597822298108,0
8
+ arc_easy,acc_norm,0.555976430976431,0.010195285580783954,0
9
+ boolq,acc,0.5776758409785933,0.008638883260317736,1
10
  cb,acc,0.30357142857142855,0.06199938655510754,1
11
  cb,f1,0.24443052074631022,,1
12
  copa,acc,0.77,0.04229525846816506,0
13
  hellaswag,acc,0.43158733320055764,0.00494285345937155,0
14
  hellaswag,acc_norm,0.5638319059948218,0.004948952519517514,0
15
+ piqa,acc,0.736126224156692,0.010282996367695562,0
16
+ piqa,acc_norm,0.7404787812840044,0.010227939888173925,0
17
  rte,acc,0.48736462093862815,0.030086851767188564,0
18
+ sciq,acc,0.887,0.01001655286669686,0
19
+ sciq,acc_norm,0.878,0.010354864712936701,0
20
  storycloze_2016,acc,0.692143238909674,0.010674598158758179,0
21
  winogrande,acc,0.5469613259668509,0.013990366632148088,0
2b855b9bc4/evaluation/rankeval/2b855b9bc4_4.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.692143238909674,
40
  "acc_stderr": 0.010674598158758179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.692143238909674,
40
  "acc_stderr": 0.010674598158758179
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5776758409785933,
44
+ "acc_stderr": 0.008638883260317736
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5774410774410774,
48
+ "acc_stderr": 0.01013597822298108,
49
+ "acc_norm": 0.555976430976431,
50
+ "acc_norm_stderr": 0.010195285580783954
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.26535836177474403,
54
+ "acc_stderr": 0.012902554762313964,
55
+ "acc_norm": 0.2977815699658703,
56
+ "acc_norm_stderr": 0.013363080107244487
57
+ },
58
+ "sciq": {
59
+ "acc": 0.887,
60
+ "acc_stderr": 0.01001655286669686,
61
+ "acc_norm": 0.878,
62
+ "acc_norm_stderr": 0.010354864712936701
63
+ },
64
+ "piqa": {
65
+ "acc": 0.736126224156692,
66
+ "acc_stderr": 0.010282996367695562,
67
+ "acc_norm": 0.7404787812840044,
68
+ "acc_norm_stderr": 0.010227939888173925
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4/evaluation/rankeval/2b855b9bc4_5.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.337,0.014955087918653616,0
3
  anli_r2,acc,0.328,0.014853842487270334,0
4
  anli_r3,acc,0.33916666666666667,0.013672343491681822,0
 
 
 
 
 
5
  cb,acc,0.32142857142857145,0.06297362289056341,1
6
  cb,f1,0.24285714285714288,,1
7
  copa,acc,0.74,0.04408440022768078,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.337,0.014955087918653616,0
3
  anli_r2,acc,0.328,0.014853842487270334,0
4
  anli_r3,acc,0.33916666666666667,0.013672343491681822,0
5
+ arc_challenge,acc,0.2696245733788396,0.012968040686869143,0
6
+ arc_challenge,acc_norm,0.28924914675767915,0.013250012579393443,0
7
+ arc_easy,acc,0.57996632996633,0.010127718838529321,0
8
+ arc_easy,acc_norm,0.5681818181818182,0.010163945352271733,0
9
+ boolq,acc,0.5804281345565749,0.008631175489166726,1
10
  cb,acc,0.32142857142857145,0.06297362289056341,1
11
  cb,f1,0.24285714285714288,,1
12
  copa,acc,0.74,0.04408440022768078,0
13
+ hellaswag,acc,0.4298944433379805,0.004940490508240647,0
14
+ hellaswag,acc_norm,0.5665206134236208,0.004945424771611602,0
15
+ piqa,acc,0.7334058759521219,0.010316749863541367,0
16
+ piqa,acc_norm,0.7486398258977149,0.010121156016819245,0
17
+ rte,acc,0.4657039711191336,0.030025579819366426,0
18
+ sciq,acc,0.891,0.00985982840703719,0
19
+ sciq,acc_norm,0.882,0.01020686926438179,0
20
+ storycloze_2016,acc,0.6910742918225548,0.010684853966268455,0
21
+ winogrande,acc,0.5461720599842147,0.01399244156370706,0
2b855b9bc4/evaluation/rankeval/2b855b9bc4_5.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.74,
22
  "acc_stderr": 0.04408440022768078
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.74,
22
  "acc_stderr": 0.04408440022768078
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4298944433379805,
26
+ "acc_stderr": 0.004940490508240647,
27
+ "acc_norm": 0.5665206134236208,
28
+ "acc_norm_stderr": 0.004945424771611602
29
+ },
30
+ "rte": {
31
+ "acc": 0.4657039711191336,
32
+ "acc_stderr": 0.030025579819366426
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5461720599842147,
36
+ "acc_stderr": 0.01399244156370706
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6910742918225548,
40
+ "acc_stderr": 0.010684853966268455
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5804281345565749,
44
+ "acc_stderr": 0.008631175489166726
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.57996632996633,
48
+ "acc_stderr": 0.010127718838529321,
49
+ "acc_norm": 0.5681818181818182,
50
+ "acc_norm_stderr": 0.010163945352271733
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2696245733788396,
54
+ "acc_stderr": 0.012968040686869143,
55
+ "acc_norm": 0.28924914675767915,
56
+ "acc_norm_stderr": 0.013250012579393443
57
+ },
58
+ "sciq": {
59
+ "acc": 0.891,
60
+ "acc_stderr": 0.00985982840703719,
61
+ "acc_norm": 0.882,
62
+ "acc_norm_stderr": 0.01020686926438179
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7334058759521219,
66
+ "acc_stderr": 0.010316749863541367,
67
+ "acc_norm": 0.7486398258977149,
68
+ "acc_norm_stderr": 0.010121156016819245
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }