Commit
•
595d78a
1
Parent(s):
64820e9
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_0.csv +21 -0
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_0_lm-eval_global_step80108_2023-05-04-10-24-43_0shots_backup.json +0 -87
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_1.csv +21 -0
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_1_lm-eval_global_step80108_2023-05-04-10-21-10_1shots_backup.json +0 -87
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_2.csv +21 -0
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_2_lm-eval_global_step80108_2023-05-04-10-24-43_2shots_backup.json +0 -87
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_3.csv +21 -0
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_3_lm-eval_global_step80108_2023-05-04-10-24-43_3shots_backup.json +0 -87
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_4.csv +21 -0
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_4_lm-eval_global_step80108_2023-05-04-10-24-43_4shots_backup.json +0 -87
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_5.csv +21 -0
- 4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_5_lm-eval_global_step80108_2023-05-04-10-24-43_5shots_backup.json +0 -87
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.346,0.015050266127564443,0
|
3 |
+
anli_r2,acc,0.354,0.015129868238451773,0
|
4 |
+
anli_r3,acc,0.3516666666666667,0.013789711695404794,0
|
5 |
+
arc_challenge,acc,0.22696245733788395,0.012240491536132873,0
|
6 |
+
arc_challenge,acc_norm,0.27474402730375425,0.013044617212771227,0
|
7 |
+
arc_easy,acc,0.4663299663299663,0.010236494647406476,0
|
8 |
+
arc_easy,acc_norm,0.42845117845117847,0.010154195733990968,0
|
9 |
+
boolq,acc,0.4954128440366973,0.008744686941762907,1
|
10 |
+
cb,acc,0.39285714285714285,0.0658538889806635,1
|
11 |
+
cb,f1,0.30404040404040406,,1
|
12 |
+
copa,acc,0.65,0.0479372485441102,0
|
13 |
+
hellaswag,acc,0.3588926508663613,0.00478695314665708,0
|
14 |
+
hellaswag,acc_norm,0.4307906791475802,0.0049417488176823005,0
|
15 |
+
piqa,acc,0.6877040261153428,0.010812581599154424,0
|
16 |
+
piqa,acc_norm,0.6849836779107725,0.010838072746240652,0
|
17 |
+
rte,acc,0.49097472924187724,0.030091559826331334,0
|
18 |
+
sciq,acc,0.679,0.014770821817934649,0
|
19 |
+
sciq,acc_norm,0.617,0.015380102325652715,0
|
20 |
+
storycloze_2016,acc,0.6285408872260823,0.011173814890350133,0
|
21 |
+
winogrande,acc,0.5185477505919495,0.014042813708888378,0
|
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_0_lm-eval_global_step80108_2023-05-04-10-24-43_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.346,
|
5 |
-
"acc_stderr": 0.015050266127564443
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.354,
|
9 |
-
"acc_stderr": 0.015129868238451773
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3516666666666667,
|
13 |
-
"acc_stderr": 0.013789711695404794
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.30404040404040406
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.65,
|
22 |
-
"acc_stderr": 0.0479372485441102
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3588926508663613,
|
26 |
-
"acc_stderr": 0.00478695314665708,
|
27 |
-
"acc_norm": 0.4307906791475802,
|
28 |
-
"acc_norm_stderr": 0.0049417488176823005
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.49097472924187724,
|
32 |
-
"acc_stderr": 0.030091559826331334
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5185477505919495,
|
36 |
-
"acc_stderr": 0.014042813708888378
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6285408872260823,
|
40 |
-
"acc_stderr": 0.011173814890350133
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.4954128440366973,
|
44 |
-
"acc_stderr": 0.008744686941762907
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.4663299663299663,
|
48 |
-
"acc_stderr": 0.010236494647406476,
|
49 |
-
"acc_norm": 0.42845117845117847,
|
50 |
-
"acc_norm_stderr": 0.010154195733990968
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.22696245733788395,
|
54 |
-
"acc_stderr": 0.012240491536132873,
|
55 |
-
"acc_norm": 0.27474402730375425,
|
56 |
-
"acc_norm_stderr": 0.013044617212771227
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.679,
|
60 |
-
"acc_stderr": 0.014770821817934649,
|
61 |
-
"acc_norm": 0.617,
|
62 |
-
"acc_norm_stderr": 0.015380102325652715
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.6877040261153428,
|
66 |
-
"acc_stderr": 0.010812581599154424,
|
67 |
-
"acc_norm": 0.6849836779107725,
|
68 |
-
"acc_norm_stderr": 0.010838072746240652
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.347,0.015060472031706622,0
|
3 |
+
anli_r2,acc,0.331,0.014888272588203934,0
|
4 |
+
anli_r3,acc,0.33916666666666667,0.013672343491681819,0
|
5 |
+
arc_challenge,acc,0.22866894197952217,0.012272853582540802,0
|
6 |
+
arc_challenge,acc_norm,0.26706484641638223,0.012928933196496349,0
|
7 |
+
arc_easy,acc,0.4684343434343434,0.010239317603199502,0
|
8 |
+
arc_easy,acc_norm,0.45496632996632996,0.010218084454602578,0
|
9 |
+
boolq,acc,0.41804281345565747,0.008626774352070744,1
|
10 |
+
cb,acc,0.4107142857142857,0.06633634150359541,1
|
11 |
+
cb,f1,0.27807807807807805,,1
|
12 |
+
copa,acc,0.63,0.04852365870939099,0
|
13 |
+
hellaswag,acc,0.3569010157339175,0.004781061390873926,0
|
14 |
+
hellaswag,acc_norm,0.42421828321051586,0.004932137126625413,0
|
15 |
+
piqa,acc,0.676822633297062,0.01091197412428213,0
|
16 |
+
piqa,acc_norm,0.6773667029379761,0.010907166359856616,0
|
17 |
+
rte,acc,0.48375451263537905,0.030080573208738064,0
|
18 |
+
sciq,acc,0.692,0.01460648312734276,0
|
19 |
+
sciq,acc_norm,0.671,0.014865395385928357,0
|
20 |
+
storycloze_2016,acc,0.6162479957242116,0.011245591019345448,0
|
21 |
+
winogrande,acc,0.5130228887134964,0.014047718393997667,0
|
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_1_lm-eval_global_step80108_2023-05-04-10-21-10_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.347,
|
5 |
-
"acc_stderr": 0.015060472031706622
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.331,
|
9 |
-
"acc_stderr": 0.014888272588203934
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.33916666666666667,
|
13 |
-
"acc_stderr": 0.013672343491681819
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4107142857142857,
|
17 |
-
"acc_stderr": 0.06633634150359541,
|
18 |
-
"f1": 0.27807807807807805
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.63,
|
22 |
-
"acc_stderr": 0.04852365870939099
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3569010157339175,
|
26 |
-
"acc_stderr": 0.004781061390873926,
|
27 |
-
"acc_norm": 0.42421828321051586,
|
28 |
-
"acc_norm_stderr": 0.004932137126625413
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48375451263537905,
|
32 |
-
"acc_stderr": 0.030080573208738064
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5130228887134964,
|
36 |
-
"acc_stderr": 0.014047718393997667
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6162479957242116,
|
40 |
-
"acc_stderr": 0.011245591019345448
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.41804281345565747,
|
44 |
-
"acc_stderr": 0.008626774352070744
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.4684343434343434,
|
48 |
-
"acc_stderr": 0.010239317603199502,
|
49 |
-
"acc_norm": 0.45496632996632996,
|
50 |
-
"acc_norm_stderr": 0.010218084454602578
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.22866894197952217,
|
54 |
-
"acc_stderr": 0.012272853582540802,
|
55 |
-
"acc_norm": 0.26706484641638223,
|
56 |
-
"acc_norm_stderr": 0.012928933196496349
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.692,
|
60 |
-
"acc_stderr": 0.01460648312734276,
|
61 |
-
"acc_norm": 0.671,
|
62 |
-
"acc_norm_stderr": 0.014865395385928357
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.676822633297062,
|
66 |
-
"acc_stderr": 0.01091197412428213,
|
67 |
-
"acc_norm": 0.6773667029379761,
|
68 |
-
"acc_norm_stderr": 0.010907166359856616
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.336,0.014944140233795021,0
|
3 |
+
anli_r2,acc,0.34,0.014987482264363937,0
|
4 |
+
anli_r3,acc,0.32166666666666666,0.013490095282989523,0
|
5 |
+
arc_challenge,acc,0.24232081911262798,0.012521593295800116,0
|
6 |
+
arc_challenge,acc_norm,0.2832764505119454,0.013167478735134576,0
|
7 |
+
arc_easy,acc,0.4654882154882155,0.01023531423896939,0
|
8 |
+
arc_easy,acc_norm,0.4553872053872054,0.010218861787618726,0
|
9 |
+
boolq,acc,0.4091743119266055,0.00859956344239735,1
|
10 |
+
cb,acc,0.4107142857142857,0.06633634150359541,1
|
11 |
+
cb,f1,0.24291938997821352,,1
|
12 |
+
copa,acc,0.66,0.04760952285695237,0
|
13 |
+
hellaswag,acc,0.3552081258713404,0.004775982650355913,0
|
14 |
+
hellaswag,acc_norm,0.42561242780322645,0.004934250390879783,0
|
15 |
+
piqa,acc,0.675734494015234,0.010921539041347987,0
|
16 |
+
piqa,acc_norm,0.6773667029379761,0.010907166359856616,0
|
17 |
+
rte,acc,0.4693140794223827,0.03003973059219781,0
|
18 |
+
sciq,acc,0.702,0.014470846741134703,0
|
19 |
+
sciq,acc_norm,0.68,0.014758652303574872,0
|
20 |
+
storycloze_2016,acc,0.6173169428113309,0.011239653231976822,0
|
21 |
+
winogrande,acc,0.5153906866614049,0.014045826789783668,0
|
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_2_lm-eval_global_step80108_2023-05-04-10-24-43_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.336,
|
5 |
-
"acc_stderr": 0.014944140233795021
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.34,
|
9 |
-
"acc_stderr": 0.014987482264363937
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.32166666666666666,
|
13 |
-
"acc_stderr": 0.013490095282989523
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4107142857142857,
|
17 |
-
"acc_stderr": 0.06633634150359541,
|
18 |
-
"f1": 0.24291938997821352
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.66,
|
22 |
-
"acc_stderr": 0.04760952285695237
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3552081258713404,
|
26 |
-
"acc_stderr": 0.004775982650355913,
|
27 |
-
"acc_norm": 0.42561242780322645,
|
28 |
-
"acc_norm_stderr": 0.004934250390879783
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4693140794223827,
|
32 |
-
"acc_stderr": 0.03003973059219781
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5153906866614049,
|
36 |
-
"acc_stderr": 0.014045826789783668
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6173169428113309,
|
40 |
-
"acc_stderr": 0.011239653231976822
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.4091743119266055,
|
44 |
-
"acc_stderr": 0.00859956344239735
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.4654882154882155,
|
48 |
-
"acc_stderr": 0.01023531423896939,
|
49 |
-
"acc_norm": 0.4553872053872054,
|
50 |
-
"acc_norm_stderr": 0.010218861787618726
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24232081911262798,
|
54 |
-
"acc_stderr": 0.012521593295800116,
|
55 |
-
"acc_norm": 0.2832764505119454,
|
56 |
-
"acc_norm_stderr": 0.013167478735134576
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.702,
|
60 |
-
"acc_stderr": 0.014470846741134703,
|
61 |
-
"acc_norm": 0.68,
|
62 |
-
"acc_norm_stderr": 0.014758652303574872
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.675734494015234,
|
66 |
-
"acc_stderr": 0.010921539041347987,
|
67 |
-
"acc_norm": 0.6773667029379761,
|
68 |
-
"acc_norm_stderr": 0.010907166359856616
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.355,0.015139491543780532,0
|
3 |
+
anli_r2,acc,0.365,0.01523177622626491,0
|
4 |
+
anli_r3,acc,0.3375,0.013655897185463652,0
|
5 |
+
arc_challenge,acc,0.23378839590443687,0.012368225378507142,0
|
6 |
+
arc_challenge,acc_norm,0.2713310580204778,0.012993807727545794,0
|
7 |
+
arc_easy,acc,0.47053872053872053,0.01024195772840968,0
|
8 |
+
arc_easy,acc_norm,0.4642255892255892,0.010233488709726547,0
|
9 |
+
boolq,acc,0.41651376146788993,0.00862228802067401,1
|
10 |
+
cb,acc,0.4107142857142857,0.06633634150359541,1
|
11 |
+
cb,f1,0.2889767237593324,,1
|
12 |
+
copa,acc,0.62,0.04878317312145632,0
|
13 |
+
hellaswag,acc,0.358195578570006,0.004784901248558721,0
|
14 |
+
hellaswag,acc_norm,0.424317864967138,0.004932289405608946,0
|
15 |
+
piqa,acc,0.6621327529923831,0.011035474307853841,0
|
16 |
+
piqa,acc_norm,0.676822633297062,0.010911974124282128,0
|
17 |
+
rte,acc,0.4729241877256318,0.030052303463143706,0
|
18 |
+
sciq,acc,0.715,0.014282120955200484,0
|
19 |
+
sciq,acc_norm,0.687,0.014671272822977886,0
|
20 |
+
storycloze_2016,acc,0.6194548369855692,0.011227604968407471,0
|
21 |
+
winogrande,acc,0.5043409629044988,0.014051956064076892,0
|
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_3_lm-eval_global_step80108_2023-05-04-10-24-43_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.355,
|
5 |
-
"acc_stderr": 0.015139491543780532
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.365,
|
9 |
-
"acc_stderr": 0.01523177622626491
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3375,
|
13 |
-
"acc_stderr": 0.013655897185463652
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4107142857142857,
|
17 |
-
"acc_stderr": 0.06633634150359541,
|
18 |
-
"f1": 0.2889767237593324
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.62,
|
22 |
-
"acc_stderr": 0.04878317312145632
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.358195578570006,
|
26 |
-
"acc_stderr": 0.004784901248558721,
|
27 |
-
"acc_norm": 0.424317864967138,
|
28 |
-
"acc_norm_stderr": 0.004932289405608946
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4729241877256318,
|
32 |
-
"acc_stderr": 0.030052303463143706
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5043409629044988,
|
36 |
-
"acc_stderr": 0.014051956064076892
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6194548369855692,
|
40 |
-
"acc_stderr": 0.011227604968407471
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.41651376146788993,
|
44 |
-
"acc_stderr": 0.00862228802067401
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.47053872053872053,
|
48 |
-
"acc_stderr": 0.01024195772840968,
|
49 |
-
"acc_norm": 0.4642255892255892,
|
50 |
-
"acc_norm_stderr": 0.010233488709726547
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.23378839590443687,
|
54 |
-
"acc_stderr": 0.012368225378507142,
|
55 |
-
"acc_norm": 0.2713310580204778,
|
56 |
-
"acc_norm_stderr": 0.012993807727545794
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.715,
|
60 |
-
"acc_stderr": 0.014282120955200484,
|
61 |
-
"acc_norm": 0.687,
|
62 |
-
"acc_norm_stderr": 0.014671272822977886
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.6621327529923831,
|
66 |
-
"acc_stderr": 0.011035474307853841,
|
67 |
-
"acc_norm": 0.676822633297062,
|
68 |
-
"acc_norm_stderr": 0.010911974124282128
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.369,0.015266698139154617,0
|
3 |
+
anli_r2,acc,0.348,0.01507060460376841,0
|
4 |
+
anli_r3,acc,0.3425,0.013704669762934732,0
|
5 |
+
arc_challenge,acc,0.23464163822525597,0.012383873560768682,0
|
6 |
+
arc_challenge,acc_norm,0.2883959044368601,0.01323839442242818,0
|
7 |
+
arc_easy,acc,0.45791245791245794,0.010223371342195897,0
|
8 |
+
arc_easy,acc_norm,0.4612794612794613,0.010228972678389606,0
|
9 |
+
boolq,acc,0.41804281345565747,0.008626774352070744,1
|
10 |
+
cb,acc,0.4642857142857143,0.0672477765493766,1
|
11 |
+
cb,f1,0.32222222222222224,,1
|
12 |
+
copa,acc,0.68,0.04688261722621504,0
|
13 |
+
hellaswag,acc,0.359788886675961,0.004789575163418652,0
|
14 |
+
hellaswag,acc_norm,0.42401911969727146,0.004931831953800041,0
|
15 |
+
piqa,acc,0.6730141458106638,0.010945157126978217,0
|
16 |
+
piqa,acc_norm,0.6702937976060935,0.010968357083095152,0
|
17 |
+
rte,acc,0.48014440433212996,0.0300727231673172,0
|
18 |
+
sciq,acc,0.729,0.014062601350986186,0
|
19 |
+
sciq,acc_norm,0.707,0.014399942998441268,0
|
20 |
+
storycloze_2016,acc,0.6173169428113309,0.011239653231976824,0
|
21 |
+
winogrande,acc,0.5177584846093133,0.014043619596174959,0
|
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_4_lm-eval_global_step80108_2023-05-04-10-24-43_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.369,
|
5 |
-
"acc_stderr": 0.015266698139154617
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.348,
|
9 |
-
"acc_stderr": 0.01507060460376841
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3425,
|
13 |
-
"acc_stderr": 0.013704669762934732
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4642857142857143,
|
17 |
-
"acc_stderr": 0.0672477765493766,
|
18 |
-
"f1": 0.32222222222222224
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.68,
|
22 |
-
"acc_stderr": 0.04688261722621504
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.359788886675961,
|
26 |
-
"acc_stderr": 0.004789575163418652,
|
27 |
-
"acc_norm": 0.42401911969727146,
|
28 |
-
"acc_norm_stderr": 0.004931831953800041
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48014440433212996,
|
32 |
-
"acc_stderr": 0.0300727231673172
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5177584846093133,
|
36 |
-
"acc_stderr": 0.014043619596174959
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6173169428113309,
|
40 |
-
"acc_stderr": 0.011239653231976824
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.41804281345565747,
|
44 |
-
"acc_stderr": 0.008626774352070744
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.45791245791245794,
|
48 |
-
"acc_stderr": 0.010223371342195897,
|
49 |
-
"acc_norm": 0.4612794612794613,
|
50 |
-
"acc_norm_stderr": 0.010228972678389606
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.23464163822525597,
|
54 |
-
"acc_stderr": 0.012383873560768682,
|
55 |
-
"acc_norm": 0.2883959044368601,
|
56 |
-
"acc_norm_stderr": 0.01323839442242818
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.729,
|
60 |
-
"acc_stderr": 0.014062601350986186,
|
61 |
-
"acc_norm": 0.707,
|
62 |
-
"acc_norm_stderr": 0.014399942998441268
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.6730141458106638,
|
66 |
-
"acc_stderr": 0.010945157126978217,
|
67 |
-
"acc_norm": 0.6702937976060935,
|
68 |
-
"acc_norm_stderr": 0.010968357083095152
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.363,0.015213890444671283,0
|
3 |
+
anli_r2,acc,0.35,0.015090650341444233,0
|
4 |
+
anli_r3,acc,0.3333333333333333,0.013613950010225617,0
|
5 |
+
arc_challenge,acc,0.23293515358361774,0.012352507042617391,0
|
6 |
+
arc_challenge,acc_norm,0.2773037542662116,0.013082095839059374,0
|
7 |
+
arc_easy,acc,0.468013468013468,0.010238767643185723,0
|
8 |
+
arc_easy,acc_norm,0.4511784511784512,0.010210757101073475,0
|
9 |
+
boolq,acc,0.40948012232415903,0.00860054975132092,1
|
10 |
+
cb,acc,0.32142857142857145,0.06297362289056341,1
|
11 |
+
cb,f1,0.2379385964912281,,1
|
12 |
+
copa,acc,0.65,0.0479372485441102,0
|
13 |
+
hellaswag,acc,0.3551085441147182,0.004775681871529861,0
|
14 |
+
hellaswag,acc_norm,0.42869946225851424,0.004938787067611805,0
|
15 |
+
piqa,acc,0.6659412404787813,0.011004613886336733,0
|
16 |
+
piqa,acc_norm,0.6751904243743199,0.010926296238294038,0
|
17 |
+
rte,acc,0.4729241877256318,0.030052303463143706,0
|
18 |
+
sciq,acc,0.731,0.014029819522568196,0
|
19 |
+
sciq,acc_norm,0.71,0.014356395999905694,0
|
20 |
+
storycloze_2016,acc,0.615713522180652,0.011248538366952603,0
|
21 |
+
winogrande,acc,0.516179952644041,0.014045126130978603,0
|
4b284b1b9oscar/evaluation/rankeval/4b284b1b9oscar_5_lm-eval_global_step80108_2023-05-04-10-24-43_5shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.363,
|
5 |
-
"acc_stderr": 0.015213890444671283
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.35,
|
9 |
-
"acc_stderr": 0.015090650341444233
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3333333333333333,
|
13 |
-
"acc_stderr": 0.013613950010225617
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.32142857142857145,
|
17 |
-
"acc_stderr": 0.06297362289056341,
|
18 |
-
"f1": 0.2379385964912281
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.65,
|
22 |
-
"acc_stderr": 0.0479372485441102
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3551085441147182,
|
26 |
-
"acc_stderr": 0.004775681871529861,
|
27 |
-
"acc_norm": 0.42869946225851424,
|
28 |
-
"acc_norm_stderr": 0.004938787067611805
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4729241877256318,
|
32 |
-
"acc_stderr": 0.030052303463143706
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.516179952644041,
|
36 |
-
"acc_stderr": 0.014045126130978603
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.615713522180652,
|
40 |
-
"acc_stderr": 0.011248538366952603
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.40948012232415903,
|
44 |
-
"acc_stderr": 0.00860054975132092
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.468013468013468,
|
48 |
-
"acc_stderr": 0.010238767643185723,
|
49 |
-
"acc_norm": 0.4511784511784512,
|
50 |
-
"acc_norm_stderr": 0.010210757101073475
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.23293515358361774,
|
54 |
-
"acc_stderr": 0.012352507042617391,
|
55 |
-
"acc_norm": 0.2773037542662116,
|
56 |
-
"acc_norm_stderr": 0.013082095839059374
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.731,
|
60 |
-
"acc_stderr": 0.014029819522568196,
|
61 |
-
"acc_norm": 0.71,
|
62 |
-
"acc_norm_stderr": 0.014356395999905694
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.6659412404787813,
|
66 |
-
"acc_stderr": 0.011004613886336733,
|
67 |
-
"acc_norm": 0.6751904243743199,
|
68 |
-
"acc_norm_stderr": 0.010926296238294038
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|