Muennighoff commited on
Commit
595d78a
1 Parent(s): 64820e9
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.346,0.015050266127564443,0
3
+ anli_r2,acc,0.354,0.015129868238451773,0
4
+ anli_r3,acc,0.3516666666666667,0.013789711695404794,0
5
+ arc_challenge,acc,0.22696245733788395,0.012240491536132873,0
6
+ arc_challenge,acc_norm,0.27474402730375425,0.013044617212771227,0
7
+ arc_easy,acc,0.4663299663299663,0.010236494647406476,0
8
+ arc_easy,acc_norm,0.42845117845117847,0.010154195733990968,0
9
+ boolq,acc,0.4954128440366973,0.008744686941762907,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.30404040404040406,,1
12
+ copa,acc,0.65,0.0479372485441102,0
13
+ hellaswag,acc,0.3588926508663613,0.00478695314665708,0
14
+ hellaswag,acc_norm,0.4307906791475802,0.0049417488176823005,0
15
+ piqa,acc,0.6877040261153428,0.010812581599154424,0
16
+ piqa,acc_norm,0.6849836779107725,0.010838072746240652,0
17
+ rte,acc,0.49097472924187724,0.030091559826331334,0
18
+ sciq,acc,0.679,0.014770821817934649,0
19
+ sciq,acc_norm,0.617,0.015380102325652715,0
20
+ storycloze_2016,acc,0.6285408872260823,0.011173814890350133,0
21
+ winogrande,acc,0.5185477505919495,0.014042813708888378,0
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_0_lm-eval_global_step80108_2023-05-04-10-24-43_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.346,
5
- "acc_stderr": 0.015050266127564443
6
- },
7
- "anli_r2": {
8
- "acc": 0.354,
9
- "acc_stderr": 0.015129868238451773
10
- },
11
- "anli_r3": {
12
- "acc": 0.3516666666666667,
13
- "acc_stderr": 0.013789711695404794
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.30404040404040406
19
- },
20
- "copa": {
21
- "acc": 0.65,
22
- "acc_stderr": 0.0479372485441102
23
- },
24
- "hellaswag": {
25
- "acc": 0.3588926508663613,
26
- "acc_stderr": 0.00478695314665708,
27
- "acc_norm": 0.4307906791475802,
28
- "acc_norm_stderr": 0.0049417488176823005
29
- },
30
- "rte": {
31
- "acc": 0.49097472924187724,
32
- "acc_stderr": 0.030091559826331334
33
- },
34
- "winogrande": {
35
- "acc": 0.5185477505919495,
36
- "acc_stderr": 0.014042813708888378
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6285408872260823,
40
- "acc_stderr": 0.011173814890350133
41
- },
42
- "boolq": {
43
- "acc": 0.4954128440366973,
44
- "acc_stderr": 0.008744686941762907
45
- },
46
- "arc_easy": {
47
- "acc": 0.4663299663299663,
48
- "acc_stderr": 0.010236494647406476,
49
- "acc_norm": 0.42845117845117847,
50
- "acc_norm_stderr": 0.010154195733990968
51
- },
52
- "arc_challenge": {
53
- "acc": 0.22696245733788395,
54
- "acc_stderr": 0.012240491536132873,
55
- "acc_norm": 0.27474402730375425,
56
- "acc_norm_stderr": 0.013044617212771227
57
- },
58
- "sciq": {
59
- "acc": 0.679,
60
- "acc_stderr": 0.014770821817934649,
61
- "acc_norm": 0.617,
62
- "acc_norm_stderr": 0.015380102325652715
63
- },
64
- "piqa": {
65
- "acc": 0.6877040261153428,
66
- "acc_stderr": 0.010812581599154424,
67
- "acc_norm": 0.6849836779107725,
68
- "acc_norm_stderr": 0.010838072746240652
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.347,0.015060472031706622,0
3
+ anli_r2,acc,0.331,0.014888272588203934,0
4
+ anli_r3,acc,0.33916666666666667,0.013672343491681819,0
5
+ arc_challenge,acc,0.22866894197952217,0.012272853582540802,0
6
+ arc_challenge,acc_norm,0.26706484641638223,0.012928933196496349,0
7
+ arc_easy,acc,0.4684343434343434,0.010239317603199502,0
8
+ arc_easy,acc_norm,0.45496632996632996,0.010218084454602578,0
9
+ boolq,acc,0.41804281345565747,0.008626774352070744,1
10
+ cb,acc,0.4107142857142857,0.06633634150359541,1
11
+ cb,f1,0.27807807807807805,,1
12
+ copa,acc,0.63,0.04852365870939099,0
13
+ hellaswag,acc,0.3569010157339175,0.004781061390873926,0
14
+ hellaswag,acc_norm,0.42421828321051586,0.004932137126625413,0
15
+ piqa,acc,0.676822633297062,0.01091197412428213,0
16
+ piqa,acc_norm,0.6773667029379761,0.010907166359856616,0
17
+ rte,acc,0.48375451263537905,0.030080573208738064,0
18
+ sciq,acc,0.692,0.01460648312734276,0
19
+ sciq,acc_norm,0.671,0.014865395385928357,0
20
+ storycloze_2016,acc,0.6162479957242116,0.011245591019345448,0
21
+ winogrande,acc,0.5130228887134964,0.014047718393997667,0
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_1_lm-eval_global_step80108_2023-05-04-10-21-10_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.347,
5
- "acc_stderr": 0.015060472031706622
6
- },
7
- "anli_r2": {
8
- "acc": 0.331,
9
- "acc_stderr": 0.014888272588203934
10
- },
11
- "anli_r3": {
12
- "acc": 0.33916666666666667,
13
- "acc_stderr": 0.013672343491681819
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.06633634150359541,
18
- "f1": 0.27807807807807805
19
- },
20
- "copa": {
21
- "acc": 0.63,
22
- "acc_stderr": 0.04852365870939099
23
- },
24
- "hellaswag": {
25
- "acc": 0.3569010157339175,
26
- "acc_stderr": 0.004781061390873926,
27
- "acc_norm": 0.42421828321051586,
28
- "acc_norm_stderr": 0.004932137126625413
29
- },
30
- "rte": {
31
- "acc": 0.48375451263537905,
32
- "acc_stderr": 0.030080573208738064
33
- },
34
- "winogrande": {
35
- "acc": 0.5130228887134964,
36
- "acc_stderr": 0.014047718393997667
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6162479957242116,
40
- "acc_stderr": 0.011245591019345448
41
- },
42
- "boolq": {
43
- "acc": 0.41804281345565747,
44
- "acc_stderr": 0.008626774352070744
45
- },
46
- "arc_easy": {
47
- "acc": 0.4684343434343434,
48
- "acc_stderr": 0.010239317603199502,
49
- "acc_norm": 0.45496632996632996,
50
- "acc_norm_stderr": 0.010218084454602578
51
- },
52
- "arc_challenge": {
53
- "acc": 0.22866894197952217,
54
- "acc_stderr": 0.012272853582540802,
55
- "acc_norm": 0.26706484641638223,
56
- "acc_norm_stderr": 0.012928933196496349
57
- },
58
- "sciq": {
59
- "acc": 0.692,
60
- "acc_stderr": 0.01460648312734276,
61
- "acc_norm": 0.671,
62
- "acc_norm_stderr": 0.014865395385928357
63
- },
64
- "piqa": {
65
- "acc": 0.676822633297062,
66
- "acc_stderr": 0.01091197412428213,
67
- "acc_norm": 0.6773667029379761,
68
- "acc_norm_stderr": 0.010907166359856616
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.336,0.014944140233795021,0
3
+ anli_r2,acc,0.34,0.014987482264363937,0
4
+ anli_r3,acc,0.32166666666666666,0.013490095282989523,0
5
+ arc_challenge,acc,0.24232081911262798,0.012521593295800116,0
6
+ arc_challenge,acc_norm,0.2832764505119454,0.013167478735134576,0
7
+ arc_easy,acc,0.4654882154882155,0.01023531423896939,0
8
+ arc_easy,acc_norm,0.4553872053872054,0.010218861787618726,0
9
+ boolq,acc,0.4091743119266055,0.00859956344239735,1
10
+ cb,acc,0.4107142857142857,0.06633634150359541,1
11
+ cb,f1,0.24291938997821352,,1
12
+ copa,acc,0.66,0.04760952285695237,0
13
+ hellaswag,acc,0.3552081258713404,0.004775982650355913,0
14
+ hellaswag,acc_norm,0.42561242780322645,0.004934250390879783,0
15
+ piqa,acc,0.675734494015234,0.010921539041347987,0
16
+ piqa,acc_norm,0.6773667029379761,0.010907166359856616,0
17
+ rte,acc,0.4693140794223827,0.03003973059219781,0
18
+ sciq,acc,0.702,0.014470846741134703,0
19
+ sciq,acc_norm,0.68,0.014758652303574872,0
20
+ storycloze_2016,acc,0.6173169428113309,0.011239653231976822,0
21
+ winogrande,acc,0.5153906866614049,0.014045826789783668,0
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_2_lm-eval_global_step80108_2023-05-04-10-24-43_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.336,
5
- "acc_stderr": 0.014944140233795021
6
- },
7
- "anli_r2": {
8
- "acc": 0.34,
9
- "acc_stderr": 0.014987482264363937
10
- },
11
- "anli_r3": {
12
- "acc": 0.32166666666666666,
13
- "acc_stderr": 0.013490095282989523
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.06633634150359541,
18
- "f1": 0.24291938997821352
19
- },
20
- "copa": {
21
- "acc": 0.66,
22
- "acc_stderr": 0.04760952285695237
23
- },
24
- "hellaswag": {
25
- "acc": 0.3552081258713404,
26
- "acc_stderr": 0.004775982650355913,
27
- "acc_norm": 0.42561242780322645,
28
- "acc_norm_stderr": 0.004934250390879783
29
- },
30
- "rte": {
31
- "acc": 0.4693140794223827,
32
- "acc_stderr": 0.03003973059219781
33
- },
34
- "winogrande": {
35
- "acc": 0.5153906866614049,
36
- "acc_stderr": 0.014045826789783668
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6173169428113309,
40
- "acc_stderr": 0.011239653231976822
41
- },
42
- "boolq": {
43
- "acc": 0.4091743119266055,
44
- "acc_stderr": 0.00859956344239735
45
- },
46
- "arc_easy": {
47
- "acc": 0.4654882154882155,
48
- "acc_stderr": 0.01023531423896939,
49
- "acc_norm": 0.4553872053872054,
50
- "acc_norm_stderr": 0.010218861787618726
51
- },
52
- "arc_challenge": {
53
- "acc": 0.24232081911262798,
54
- "acc_stderr": 0.012521593295800116,
55
- "acc_norm": 0.2832764505119454,
56
- "acc_norm_stderr": 0.013167478735134576
57
- },
58
- "sciq": {
59
- "acc": 0.702,
60
- "acc_stderr": 0.014470846741134703,
61
- "acc_norm": 0.68,
62
- "acc_norm_stderr": 0.014758652303574872
63
- },
64
- "piqa": {
65
- "acc": 0.675734494015234,
66
- "acc_stderr": 0.010921539041347987,
67
- "acc_norm": 0.6773667029379761,
68
- "acc_norm_stderr": 0.010907166359856616
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.355,0.015139491543780532,0
3
+ anli_r2,acc,0.365,0.01523177622626491,0
4
+ anli_r3,acc,0.3375,0.013655897185463652,0
5
+ arc_challenge,acc,0.23378839590443687,0.012368225378507142,0
6
+ arc_challenge,acc_norm,0.2713310580204778,0.012993807727545794,0
7
+ arc_easy,acc,0.47053872053872053,0.01024195772840968,0
8
+ arc_easy,acc_norm,0.4642255892255892,0.010233488709726547,0
9
+ boolq,acc,0.41651376146788993,0.00862228802067401,1
10
+ cb,acc,0.4107142857142857,0.06633634150359541,1
11
+ cb,f1,0.2889767237593324,,1
12
+ copa,acc,0.62,0.04878317312145632,0
13
+ hellaswag,acc,0.358195578570006,0.004784901248558721,0
14
+ hellaswag,acc_norm,0.424317864967138,0.004932289405608946,0
15
+ piqa,acc,0.6621327529923831,0.011035474307853841,0
16
+ piqa,acc_norm,0.676822633297062,0.010911974124282128,0
17
+ rte,acc,0.4729241877256318,0.030052303463143706,0
18
+ sciq,acc,0.715,0.014282120955200484,0
19
+ sciq,acc_norm,0.687,0.014671272822977886,0
20
+ storycloze_2016,acc,0.6194548369855692,0.011227604968407471,0
21
+ winogrande,acc,0.5043409629044988,0.014051956064076892,0
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_3_lm-eval_global_step80108_2023-05-04-10-24-43_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.355,
5
- "acc_stderr": 0.015139491543780532
6
- },
7
- "anli_r2": {
8
- "acc": 0.365,
9
- "acc_stderr": 0.01523177622626491
10
- },
11
- "anli_r3": {
12
- "acc": 0.3375,
13
- "acc_stderr": 0.013655897185463652
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.06633634150359541,
18
- "f1": 0.2889767237593324
19
- },
20
- "copa": {
21
- "acc": 0.62,
22
- "acc_stderr": 0.04878317312145632
23
- },
24
- "hellaswag": {
25
- "acc": 0.358195578570006,
26
- "acc_stderr": 0.004784901248558721,
27
- "acc_norm": 0.424317864967138,
28
- "acc_norm_stderr": 0.004932289405608946
29
- },
30
- "rte": {
31
- "acc": 0.4729241877256318,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.5043409629044988,
36
- "acc_stderr": 0.014051956064076892
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6194548369855692,
40
- "acc_stderr": 0.011227604968407471
41
- },
42
- "boolq": {
43
- "acc": 0.41651376146788993,
44
- "acc_stderr": 0.00862228802067401
45
- },
46
- "arc_easy": {
47
- "acc": 0.47053872053872053,
48
- "acc_stderr": 0.01024195772840968,
49
- "acc_norm": 0.4642255892255892,
50
- "acc_norm_stderr": 0.010233488709726547
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23378839590443687,
54
- "acc_stderr": 0.012368225378507142,
55
- "acc_norm": 0.2713310580204778,
56
- "acc_norm_stderr": 0.012993807727545794
57
- },
58
- "sciq": {
59
- "acc": 0.715,
60
- "acc_stderr": 0.014282120955200484,
61
- "acc_norm": 0.687,
62
- "acc_norm_stderr": 0.014671272822977886
63
- },
64
- "piqa": {
65
- "acc": 0.6621327529923831,
66
- "acc_stderr": 0.011035474307853841,
67
- "acc_norm": 0.676822633297062,
68
- "acc_norm_stderr": 0.010911974124282128
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.369,0.015266698139154617,0
3
+ anli_r2,acc,0.348,0.01507060460376841,0
4
+ anli_r3,acc,0.3425,0.013704669762934732,0
5
+ arc_challenge,acc,0.23464163822525597,0.012383873560768682,0
6
+ arc_challenge,acc_norm,0.2883959044368601,0.01323839442242818,0
7
+ arc_easy,acc,0.45791245791245794,0.010223371342195897,0
8
+ arc_easy,acc_norm,0.4612794612794613,0.010228972678389606,0
9
+ boolq,acc,0.41804281345565747,0.008626774352070744,1
10
+ cb,acc,0.4642857142857143,0.0672477765493766,1
11
+ cb,f1,0.32222222222222224,,1
12
+ copa,acc,0.68,0.04688261722621504,0
13
+ hellaswag,acc,0.359788886675961,0.004789575163418652,0
14
+ hellaswag,acc_norm,0.42401911969727146,0.004931831953800041,0
15
+ piqa,acc,0.6730141458106638,0.010945157126978217,0
16
+ piqa,acc_norm,0.6702937976060935,0.010968357083095152,0
17
+ rte,acc,0.48014440433212996,0.0300727231673172,0
18
+ sciq,acc,0.729,0.014062601350986186,0
19
+ sciq,acc_norm,0.707,0.014399942998441268,0
20
+ storycloze_2016,acc,0.6173169428113309,0.011239653231976824,0
21
+ winogrande,acc,0.5177584846093133,0.014043619596174959,0
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_4_lm-eval_global_step80108_2023-05-04-10-24-43_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.369,
5
- "acc_stderr": 0.015266698139154617
6
- },
7
- "anli_r2": {
8
- "acc": 0.348,
9
- "acc_stderr": 0.01507060460376841
10
- },
11
- "anli_r3": {
12
- "acc": 0.3425,
13
- "acc_stderr": 0.013704669762934732
14
- },
15
- "cb": {
16
- "acc": 0.4642857142857143,
17
- "acc_stderr": 0.0672477765493766,
18
- "f1": 0.32222222222222224
19
- },
20
- "copa": {
21
- "acc": 0.68,
22
- "acc_stderr": 0.04688261722621504
23
- },
24
- "hellaswag": {
25
- "acc": 0.359788886675961,
26
- "acc_stderr": 0.004789575163418652,
27
- "acc_norm": 0.42401911969727146,
28
- "acc_norm_stderr": 0.004931831953800041
29
- },
30
- "rte": {
31
- "acc": 0.48014440433212996,
32
- "acc_stderr": 0.0300727231673172
33
- },
34
- "winogrande": {
35
- "acc": 0.5177584846093133,
36
- "acc_stderr": 0.014043619596174959
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6173169428113309,
40
- "acc_stderr": 0.011239653231976824
41
- },
42
- "boolq": {
43
- "acc": 0.41804281345565747,
44
- "acc_stderr": 0.008626774352070744
45
- },
46
- "arc_easy": {
47
- "acc": 0.45791245791245794,
48
- "acc_stderr": 0.010223371342195897,
49
- "acc_norm": 0.4612794612794613,
50
- "acc_norm_stderr": 0.010228972678389606
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23464163822525597,
54
- "acc_stderr": 0.012383873560768682,
55
- "acc_norm": 0.2883959044368601,
56
- "acc_norm_stderr": 0.01323839442242818
57
- },
58
- "sciq": {
59
- "acc": 0.729,
60
- "acc_stderr": 0.014062601350986186,
61
- "acc_norm": 0.707,
62
- "acc_norm_stderr": 0.014399942998441268
63
- },
64
- "piqa": {
65
- "acc": 0.6730141458106638,
66
- "acc_stderr": 0.010945157126978217,
67
- "acc_norm": 0.6702937976060935,
68
- "acc_norm_stderr": 0.010968357083095152
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.363,0.015213890444671283,0
3
+ anli_r2,acc,0.35,0.015090650341444233,0
4
+ anli_r3,acc,0.3333333333333333,0.013613950010225617,0
5
+ arc_challenge,acc,0.23293515358361774,0.012352507042617391,0
6
+ arc_challenge,acc_norm,0.2773037542662116,0.013082095839059374,0
7
+ arc_easy,acc,0.468013468013468,0.010238767643185723,0
8
+ arc_easy,acc_norm,0.4511784511784512,0.010210757101073475,0
9
+ boolq,acc,0.40948012232415903,0.00860054975132092,1
10
+ cb,acc,0.32142857142857145,0.06297362289056341,1
11
+ cb,f1,0.2379385964912281,,1
12
+ copa,acc,0.65,0.0479372485441102,0
13
+ hellaswag,acc,0.3551085441147182,0.004775681871529861,0
14
+ hellaswag,acc_norm,0.42869946225851424,0.004938787067611805,0
15
+ piqa,acc,0.6659412404787813,0.011004613886336733,0
16
+ piqa,acc_norm,0.6751904243743199,0.010926296238294038,0
17
+ rte,acc,0.4729241877256318,0.030052303463143706,0
18
+ sciq,acc,0.731,0.014029819522568196,0
19
+ sciq,acc_norm,0.71,0.014356395999905694,0
20
+ storycloze_2016,acc,0.615713522180652,0.011248538366952603,0
21
+ winogrande,acc,0.516179952644041,0.014045126130978603,0
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_5_lm-eval_global_step80108_2023-05-04-10-24-43_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.363,
5
- "acc_stderr": 0.015213890444671283
6
- },
7
- "anli_r2": {
8
- "acc": 0.35,
9
- "acc_stderr": 0.015090650341444233
10
- },
11
- "anli_r3": {
12
- "acc": 0.3333333333333333,
13
- "acc_stderr": 0.013613950010225617
14
- },
15
- "cb": {
16
- "acc": 0.32142857142857145,
17
- "acc_stderr": 0.06297362289056341,
18
- "f1": 0.2379385964912281
19
- },
20
- "copa": {
21
- "acc": 0.65,
22
- "acc_stderr": 0.0479372485441102
23
- },
24
- "hellaswag": {
25
- "acc": 0.3551085441147182,
26
- "acc_stderr": 0.004775681871529861,
27
- "acc_norm": 0.42869946225851424,
28
- "acc_norm_stderr": 0.004938787067611805
29
- },
30
- "rte": {
31
- "acc": 0.4729241877256318,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.516179952644041,
36
- "acc_stderr": 0.014045126130978603
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.615713522180652,
40
- "acc_stderr": 0.011248538366952603
41
- },
42
- "boolq": {
43
- "acc": 0.40948012232415903,
44
- "acc_stderr": 0.00860054975132092
45
- },
46
- "arc_easy": {
47
- "acc": 0.468013468013468,
48
- "acc_stderr": 0.010238767643185723,
49
- "acc_norm": 0.4511784511784512,
50
- "acc_norm_stderr": 0.010210757101073475
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23293515358361774,
54
- "acc_stderr": 0.012352507042617391,
55
- "acc_norm": 0.2773037542662116,
56
- "acc_norm_stderr": 0.013082095839059374
57
- },
58
- "sciq": {
59
- "acc": 0.731,
60
- "acc_stderr": 0.014029819522568196,
61
- "acc_norm": 0.71,
62
- "acc_norm_stderr": 0.014356395999905694
63
- },
64
- "piqa": {
65
- "acc": 0.6659412404787813,
66
- "acc_stderr": 0.011004613886336733,
67
- "acc_norm": 0.6751904243743199,
68
- "acc_norm_stderr": 0.010926296238294038
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }