Commit
•
79391b8
1
Parent(s):
fdef977
Add files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.csv +21 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.json +87 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0_lm-eval_global_step52452_2023-02-09-23-24-23_0shots_backup.json +87 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1.json +87 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1_lm-eval_global_step52452_2023-02-09-23-24-23_1shots_backup.json +87 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2.json +87 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2_lm-eval_global_step52452_2023-02-09-23-24-23_2shots_backup.json +87 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3.json +87 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3_lm-eval_global_step52452_2023-02-09-23-24-23_3shots_backup.json +87 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4.json +87 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4_lm-eval_global_step52452_2023-02-09-23-24-23_4shots_backup.json +87 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_5.json +87 -0
- evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_5_lm-eval_global_step52452_2023-02-09-23-24-23_5shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.csv +21 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0_lm-eval_global_step52452_2023-02-09-17-38-12_0shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1_lm-eval_global_step52452_2023-02-09-17-38-12_1shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2_lm-eval_global_step52452_2023-02-09-17-36-57_2shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3_lm-eval_global_step52452_2023-02-09-17-36-57_3shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4_lm-eval_global_step52452_2023-02-09-17-38-12_4shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_5.json +87 -0
- evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_5_lm-eval_global_step52452_2023-02-09-17-38-12_5shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_0.csv +21 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_0.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_0_lm-eval_global_step42000_2023-02-08-13-42-29_0shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_1.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_1_lm-eval_global_step42000_2023-02-08-13-42-29_1shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_2.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_2_lm-eval_global_step42000_2023-02-08-13-42-29_2shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_3.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_3_lm-eval_global_step42000_2023-02-08-13-42-29_3shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_4.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_4_lm-eval_global_step42000_2023-02-08-13-42-29_4shots_backup.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_5.json +87 -0
- evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_5_lm-eval_global_step42000_2023-02-08-13-42-29_5shots_backup.json +87 -0
- evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.csv +21 -0
- evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.json +87 -0
- evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0_lm-eval_global_step52452_2023-02-09-23-08-31_0shots_backup.json +87 -0
- evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1.json +87 -0
- evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1_lm-eval_global_step52452_2023-02-09-23-08-31_1shots_backup.json +87 -0
- evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2.json +87 -0
- evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2_lm-eval_global_step52452_2023-02-09-23-08-31_2shots_backup.json +87 -0
- evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3.json +87 -0
- evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3_lm-eval_global_step52452_2023-02-09-23-08-31_3shots_backup.json +87 -0
- evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4.json +87 -0
- evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4_lm-eval_global_step52452_2023-02-09-23-08-31_4shots_backup.json +87 -0
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.339,0.01497675877162034,0
|
3 |
+
anli_r2,acc,0.335,0.014933117490932573,0
|
4 |
+
anli_r3,acc,0.33416666666666667,0.013622434813136783,0
|
5 |
+
arc_challenge,acc,0.17918088737201365,0.011207045216615674,0
|
6 |
+
arc_challenge,acc_norm,0.2235494880546075,0.012174896631202614,0
|
7 |
+
arc_easy,acc,0.4335016835016835,0.010168640625454107,0
|
8 |
+
arc_easy,acc_norm,0.3846801346801347,0.009983171707009006,0
|
9 |
+
boolq,acc,0.5938837920489297,0.008589510943787407,1
|
10 |
+
cb,acc,0.4107142857142857,0.0663363415035954,1
|
11 |
+
cb,f1,0.1940928270042194,,1
|
12 |
+
copa,acc,0.62,0.04878317312145632,0
|
13 |
+
hellaswag,acc,0.2951603266281617,0.004551826272978059,0
|
14 |
+
hellaswag,acc_norm,0.3241386178052181,0.004670955399641126,0
|
15 |
+
piqa,acc,0.6218715995647442,0.011313980666854535,0
|
16 |
+
piqa,acc_norm,0.6267682263329706,0.011284653078254898,0
|
17 |
+
rte,acc,0.5234657039711191,0.03006330041190266,0
|
18 |
+
sciq,acc,0.732,0.01401329270272948,0
|
19 |
+
sciq,acc_norm,0.669,0.01488827258820394,0
|
20 |
+
storycloze_2016,acc,0.5873864243719936,0.011384472322969045,0
|
21 |
+
winogrande,acc,0.5059194948697711,0.01405150083848581,0
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.339,
|
5 |
+
"acc_stderr": 0.01497675877162034
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.335,
|
9 |
+
"acc_stderr": 0.014933117490932573
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.33416666666666667,
|
13 |
+
"acc_stderr": 0.013622434813136783
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.1940928270042194
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145632
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2951603266281617,
|
26 |
+
"acc_stderr": 0.004551826272978059,
|
27 |
+
"acc_norm": 0.3241386178052181,
|
28 |
+
"acc_norm_stderr": 0.004670955399641126
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5234657039711191,
|
32 |
+
"acc_stderr": 0.03006330041190266
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5059194948697711,
|
36 |
+
"acc_stderr": 0.01405150083848581
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5873864243719936,
|
40 |
+
"acc_stderr": 0.011384472322969045
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5938837920489297,
|
44 |
+
"acc_stderr": 0.008589510943787407
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4335016835016835,
|
48 |
+
"acc_stderr": 0.010168640625454107,
|
49 |
+
"acc_norm": 0.3846801346801347,
|
50 |
+
"acc_norm_stderr": 0.009983171707009006
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.17918088737201365,
|
54 |
+
"acc_stderr": 0.011207045216615674,
|
55 |
+
"acc_norm": 0.2235494880546075,
|
56 |
+
"acc_norm_stderr": 0.012174896631202614
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.732,
|
60 |
+
"acc_stderr": 0.01401329270272948,
|
61 |
+
"acc_norm": 0.669,
|
62 |
+
"acc_norm_stderr": 0.01488827258820394
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6218715995647442,
|
66 |
+
"acc_stderr": 0.011313980666854535,
|
67 |
+
"acc_norm": 0.6267682263329706,
|
68 |
+
"acc_norm_stderr": 0.011284653078254898
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0_lm-eval_global_step52452_2023-02-09-23-24-23_0shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.339,
|
5 |
+
"acc_stderr": 0.01497675877162034
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.335,
|
9 |
+
"acc_stderr": 0.014933117490932573
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.33416666666666667,
|
13 |
+
"acc_stderr": 0.013622434813136783
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.1940928270042194
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145632
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2951603266281617,
|
26 |
+
"acc_stderr": 0.004551826272978059,
|
27 |
+
"acc_norm": 0.3241386178052181,
|
28 |
+
"acc_norm_stderr": 0.004670955399641126
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5234657039711191,
|
32 |
+
"acc_stderr": 0.03006330041190266
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5059194948697711,
|
36 |
+
"acc_stderr": 0.01405150083848581
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5873864243719936,
|
40 |
+
"acc_stderr": 0.011384472322969045
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5938837920489297,
|
44 |
+
"acc_stderr": 0.008589510943787407
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4335016835016835,
|
48 |
+
"acc_stderr": 0.010168640625454107,
|
49 |
+
"acc_norm": 0.3846801346801347,
|
50 |
+
"acc_norm_stderr": 0.009983171707009006
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.17918088737201365,
|
54 |
+
"acc_stderr": 0.011207045216615674,
|
55 |
+
"acc_norm": 0.2235494880546075,
|
56 |
+
"acc_norm_stderr": 0.012174896631202614
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.732,
|
60 |
+
"acc_stderr": 0.01401329270272948,
|
61 |
+
"acc_norm": 0.669,
|
62 |
+
"acc_norm_stderr": 0.01488827258820394
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6218715995647442,
|
66 |
+
"acc_stderr": 0.011313980666854535,
|
67 |
+
"acc_norm": 0.6267682263329706,
|
68 |
+
"acc_norm_stderr": 0.011284653078254898
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.36,
|
5 |
+
"acc_stderr": 0.015186527932040122
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.349,
|
9 |
+
"acc_stderr": 0.015080663991563102
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.32166666666666666,
|
13 |
+
"acc_stderr": 0.01349009528298952
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.28651292802236195
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.64,
|
22 |
+
"acc_stderr": 0.048241815132442176
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2913762198765186,
|
26 |
+
"acc_stderr": 0.004534677750102734,
|
27 |
+
"acc_norm": 0.3249352718581956,
|
28 |
+
"acc_norm_stderr": 0.0046739348371504464
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5523465703971119,
|
32 |
+
"acc_stderr": 0.02993107036293953
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.494869771112865,
|
36 |
+
"acc_stderr": 0.014051745961790513
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.01141582799434265
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5110091743119266,
|
44 |
+
"acc_stderr": 0.008742934884517647
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4297138047138047,
|
48 |
+
"acc_stderr": 0.010157908005763678,
|
49 |
+
"acc_norm": 0.3792087542087542,
|
50 |
+
"acc_norm_stderr": 0.00995589166886556
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.1825938566552901,
|
54 |
+
"acc_stderr": 0.011289730684564982,
|
55 |
+
"acc_norm": 0.21928327645051193,
|
56 |
+
"acc_norm_stderr": 0.012091245787615734
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.705,
|
60 |
+
"acc_stderr": 0.014428554438445517,
|
61 |
+
"acc_norm": 0.658,
|
62 |
+
"acc_norm_stderr": 0.015008706182121731
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6218715995647442,
|
66 |
+
"acc_stderr": 0.011313980666854535,
|
67 |
+
"acc_norm": 0.6109902067464635,
|
68 |
+
"acc_norm_stderr": 0.011374774974447464
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1_lm-eval_global_step52452_2023-02-09-23-24-23_1shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.36,
|
5 |
+
"acc_stderr": 0.015186527932040122
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.349,
|
9 |
+
"acc_stderr": 0.015080663991563102
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.32166666666666666,
|
13 |
+
"acc_stderr": 0.01349009528298952
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.28651292802236195
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.64,
|
22 |
+
"acc_stderr": 0.048241815132442176
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2913762198765186,
|
26 |
+
"acc_stderr": 0.004534677750102734,
|
27 |
+
"acc_norm": 0.3249352718581956,
|
28 |
+
"acc_norm_stderr": 0.0046739348371504464
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5523465703971119,
|
32 |
+
"acc_stderr": 0.02993107036293953
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.494869771112865,
|
36 |
+
"acc_stderr": 0.014051745961790513
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.01141582799434265
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5110091743119266,
|
44 |
+
"acc_stderr": 0.008742934884517647
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4297138047138047,
|
48 |
+
"acc_stderr": 0.010157908005763678,
|
49 |
+
"acc_norm": 0.3792087542087542,
|
50 |
+
"acc_norm_stderr": 0.00995589166886556
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.1825938566552901,
|
54 |
+
"acc_stderr": 0.011289730684564982,
|
55 |
+
"acc_norm": 0.21928327645051193,
|
56 |
+
"acc_norm_stderr": 0.012091245787615734
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.705,
|
60 |
+
"acc_stderr": 0.014428554438445517,
|
61 |
+
"acc_norm": 0.658,
|
62 |
+
"acc_norm_stderr": 0.015008706182121731
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6218715995647442,
|
66 |
+
"acc_stderr": 0.011313980666854535,
|
67 |
+
"acc_norm": 0.6109902067464635,
|
68 |
+
"acc_norm_stderr": 0.011374774974447464
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.311,
|
5 |
+
"acc_stderr": 0.014645596385722695
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.356,
|
9 |
+
"acc_stderr": 0.015149042659306625
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.33666666666666667,
|
13 |
+
"acc_stderr": 0.01364760294240639
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4642857142857143,
|
17 |
+
"acc_stderr": 0.06724777654937658,
|
18 |
+
"f1": 0.316548463356974
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.63,
|
22 |
+
"acc_stderr": 0.048523658709391
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29047998406691894,
|
26 |
+
"acc_stderr": 0.004530560646902538,
|
27 |
+
"acc_norm": 0.3179645488946425,
|
28 |
+
"acc_norm_stderr": 0.004647338877642189
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.48736462093862815,
|
32 |
+
"acc_stderr": 0.030086851767188564
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5098658247829518,
|
36 |
+
"acc_stderr": 0.014049749833367596
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.011415827994342655
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4746177370030581,
|
44 |
+
"acc_stderr": 0.008733779541853504
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42297979797979796,
|
48 |
+
"acc_stderr": 0.010137328382209104,
|
49 |
+
"acc_norm": 0.39057239057239057,
|
50 |
+
"acc_norm_stderr": 0.010011059112064229
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18515358361774745,
|
54 |
+
"acc_stderr": 0.011350774438389699,
|
55 |
+
"acc_norm": 0.22525597269624573,
|
56 |
+
"acc_norm_stderr": 0.01220783999540731
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.727,
|
60 |
+
"acc_stderr": 0.014095022868717607,
|
61 |
+
"acc_norm": 0.677,
|
62 |
+
"acc_norm_stderr": 0.014794927843348635
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6316648531011969,
|
66 |
+
"acc_stderr": 0.011254089354334373,
|
67 |
+
"acc_norm": 0.6294885745375408,
|
68 |
+
"acc_norm_stderr": 0.01126782647544766
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2_lm-eval_global_step52452_2023-02-09-23-24-23_2shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.311,
|
5 |
+
"acc_stderr": 0.014645596385722695
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.356,
|
9 |
+
"acc_stderr": 0.015149042659306625
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.33666666666666667,
|
13 |
+
"acc_stderr": 0.01364760294240639
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4642857142857143,
|
17 |
+
"acc_stderr": 0.06724777654937658,
|
18 |
+
"f1": 0.316548463356974
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.63,
|
22 |
+
"acc_stderr": 0.048523658709391
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29047998406691894,
|
26 |
+
"acc_stderr": 0.004530560646902538,
|
27 |
+
"acc_norm": 0.3179645488946425,
|
28 |
+
"acc_norm_stderr": 0.004647338877642189
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.48736462093862815,
|
32 |
+
"acc_stderr": 0.030086851767188564
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5098658247829518,
|
36 |
+
"acc_stderr": 0.014049749833367596
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.011415827994342655
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4746177370030581,
|
44 |
+
"acc_stderr": 0.008733779541853504
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42297979797979796,
|
48 |
+
"acc_stderr": 0.010137328382209104,
|
49 |
+
"acc_norm": 0.39057239057239057,
|
50 |
+
"acc_norm_stderr": 0.010011059112064229
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18515358361774745,
|
54 |
+
"acc_stderr": 0.011350774438389699,
|
55 |
+
"acc_norm": 0.22525597269624573,
|
56 |
+
"acc_norm_stderr": 0.01220783999540731
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.727,
|
60 |
+
"acc_stderr": 0.014095022868717607,
|
61 |
+
"acc_norm": 0.677,
|
62 |
+
"acc_norm_stderr": 0.014794927843348635
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6316648531011969,
|
66 |
+
"acc_stderr": 0.011254089354334373,
|
67 |
+
"acc_norm": 0.6294885745375408,
|
68 |
+
"acc_norm_stderr": 0.01126782647544766
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.312,
|
5 |
+
"acc_stderr": 0.014658474370509008
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.344,
|
9 |
+
"acc_stderr": 0.015029633724408947
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3425,
|
13 |
+
"acc_stderr": 0.013704669762934725
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5178571428571429,
|
17 |
+
"acc_stderr": 0.06737697508644647,
|
18 |
+
"f1": 0.3422885572139303
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.6,
|
22 |
+
"acc_stderr": 0.049236596391733084
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.291575383389763,
|
26 |
+
"acc_stderr": 0.004535589759202657,
|
27 |
+
"acc_norm": 0.32284405496912966,
|
28 |
+
"acc_norm_stderr": 0.004666080865179641
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5054151624548736,
|
32 |
+
"acc_stderr": 0.030094698123239966
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5011838989739542,
|
36 |
+
"acc_stderr": 0.014052446290529015
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5740245857830037,
|
40 |
+
"acc_stderr": 0.011435014262181197
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4688073394495413,
|
44 |
+
"acc_stderr": 0.008728020822889253
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42424242424242425,
|
48 |
+
"acc_stderr": 0.010141333654958574,
|
49 |
+
"acc_norm": 0.38425925925925924,
|
50 |
+
"acc_norm_stderr": 0.009981120724601443
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18686006825938567,
|
54 |
+
"acc_stderr": 0.011391015649694391,
|
55 |
+
"acc_norm": 0.22440273037542663,
|
56 |
+
"acc_norm_stderr": 0.012191404938603838
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.723,
|
60 |
+
"acc_stderr": 0.014158794845306265,
|
61 |
+
"acc_norm": 0.682,
|
62 |
+
"acc_norm_stderr": 0.014734079309311901
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6284004352557128,
|
66 |
+
"acc_stderr": 0.011274603006724743,
|
67 |
+
"acc_norm": 0.6196953210010882,
|
68 |
+
"acc_norm_stderr": 0.011326620892570314
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3_lm-eval_global_step52452_2023-02-09-23-24-23_3shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.312,
|
5 |
+
"acc_stderr": 0.014658474370509008
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.344,
|
9 |
+
"acc_stderr": 0.015029633724408947
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3425,
|
13 |
+
"acc_stderr": 0.013704669762934725
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5178571428571429,
|
17 |
+
"acc_stderr": 0.06737697508644647,
|
18 |
+
"f1": 0.3422885572139303
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.6,
|
22 |
+
"acc_stderr": 0.049236596391733084
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.291575383389763,
|
26 |
+
"acc_stderr": 0.004535589759202657,
|
27 |
+
"acc_norm": 0.32284405496912966,
|
28 |
+
"acc_norm_stderr": 0.004666080865179641
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5054151624548736,
|
32 |
+
"acc_stderr": 0.030094698123239966
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5011838989739542,
|
36 |
+
"acc_stderr": 0.014052446290529015
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5740245857830037,
|
40 |
+
"acc_stderr": 0.011435014262181197
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4688073394495413,
|
44 |
+
"acc_stderr": 0.008728020822889253
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42424242424242425,
|
48 |
+
"acc_stderr": 0.010141333654958574,
|
49 |
+
"acc_norm": 0.38425925925925924,
|
50 |
+
"acc_norm_stderr": 0.009981120724601443
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18686006825938567,
|
54 |
+
"acc_stderr": 0.011391015649694391,
|
55 |
+
"acc_norm": 0.22440273037542663,
|
56 |
+
"acc_norm_stderr": 0.012191404938603838
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.723,
|
60 |
+
"acc_stderr": 0.014158794845306265,
|
61 |
+
"acc_norm": 0.682,
|
62 |
+
"acc_norm_stderr": 0.014734079309311901
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6284004352557128,
|
66 |
+
"acc_stderr": 0.011274603006724743,
|
67 |
+
"acc_norm": 0.6196953210010882,
|
68 |
+
"acc_norm_stderr": 0.011326620892570314
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.336,
|
5 |
+
"acc_stderr": 0.014944140233795025
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.352,
|
9 |
+
"acc_stderr": 0.01511040450564867
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3383333333333333,
|
13 |
+
"acc_stderr": 0.013664144006618266
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5178571428571429,
|
17 |
+
"acc_stderr": 0.06737697508644645,
|
18 |
+
"f1": 0.3362023995826813
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.6,
|
22 |
+
"acc_stderr": 0.049236596391733084
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29645488946425014,
|
26 |
+
"acc_stderr": 0.004557606227194286,
|
27 |
+
"acc_norm": 0.3234415455088628,
|
28 |
+
"acc_norm_stderr": 0.004668335725410298
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.49458483754512633,
|
32 |
+
"acc_stderr": 0.030094698123239966
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.4877663772691397,
|
36 |
+
"acc_stderr": 0.01404827882040562
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5788348476750401,
|
40 |
+
"acc_stderr": 0.011417808278216117
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.44434250764525995,
|
44 |
+
"acc_stderr": 0.00869070599067338
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42845117845117847,
|
48 |
+
"acc_stderr": 0.010154195733990975,
|
49 |
+
"acc_norm": 0.3930976430976431,
|
50 |
+
"acc_norm_stderr": 0.010022540618945312
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.1962457337883959,
|
54 |
+
"acc_stderr": 0.01160601988141629,
|
55 |
+
"acc_norm": 0.22781569965870307,
|
56 |
+
"acc_norm_stderr": 0.012256708602326905
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.719,
|
60 |
+
"acc_stderr": 0.014221154708434929,
|
61 |
+
"acc_norm": 0.686,
|
62 |
+
"acc_norm_stderr": 0.014683991951087967
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6273122959738846,
|
66 |
+
"acc_stderr": 0.01128131833289774,
|
67 |
+
"acc_norm": 0.6169749727965179,
|
68 |
+
"acc_norm_stderr": 0.01134208170908285
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4_lm-eval_global_step52452_2023-02-09-23-24-23_4shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.336,
|
5 |
+
"acc_stderr": 0.014944140233795025
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.352,
|
9 |
+
"acc_stderr": 0.01511040450564867
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3383333333333333,
|
13 |
+
"acc_stderr": 0.013664144006618266
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5178571428571429,
|
17 |
+
"acc_stderr": 0.06737697508644645,
|
18 |
+
"f1": 0.3362023995826813
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.6,
|
22 |
+
"acc_stderr": 0.049236596391733084
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29645488946425014,
|
26 |
+
"acc_stderr": 0.004557606227194286,
|
27 |
+
"acc_norm": 0.3234415455088628,
|
28 |
+
"acc_norm_stderr": 0.004668335725410298
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.49458483754512633,
|
32 |
+
"acc_stderr": 0.030094698123239966
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.4877663772691397,
|
36 |
+
"acc_stderr": 0.01404827882040562
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5788348476750401,
|
40 |
+
"acc_stderr": 0.011417808278216117
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.44434250764525995,
|
44 |
+
"acc_stderr": 0.00869070599067338
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42845117845117847,
|
48 |
+
"acc_stderr": 0.010154195733990975,
|
49 |
+
"acc_norm": 0.3930976430976431,
|
50 |
+
"acc_norm_stderr": 0.010022540618945312
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.1962457337883959,
|
54 |
+
"acc_stderr": 0.01160601988141629,
|
55 |
+
"acc_norm": 0.22781569965870307,
|
56 |
+
"acc_norm_stderr": 0.012256708602326905
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.719,
|
60 |
+
"acc_stderr": 0.014221154708434929,
|
61 |
+
"acc_norm": 0.686,
|
62 |
+
"acc_norm_stderr": 0.014683991951087967
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6273122959738846,
|
66 |
+
"acc_stderr": 0.01128131833289774,
|
67 |
+
"acc_norm": 0.6169749727965179,
|
68 |
+
"acc_norm_stderr": 0.01134208170908285
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_5.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.375,
|
5 |
+
"acc_stderr": 0.015316971293620996
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.339,
|
9 |
+
"acc_stderr": 0.014976758771620344
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3408333333333333,
|
13 |
+
"acc_stderr": 0.01368860079329693
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5535714285714286,
|
17 |
+
"acc_stderr": 0.06703189227942395,
|
18 |
+
"f1": 0.3464373464373464
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29486158135829516,
|
26 |
+
"acc_stderr": 0.0045504861860190746,
|
27 |
+
"acc_norm": 0.32304321848237405,
|
28 |
+
"acc_norm_stderr": 0.0046668334527961925
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5018050541516246,
|
32 |
+
"acc_stderr": 0.030096267148976626
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5027624309392266,
|
36 |
+
"acc_stderr": 0.014052271211616441
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5809727418492785,
|
40 |
+
"acc_stderr": 0.011409804749706194
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.44128440366972477,
|
44 |
+
"acc_stderr": 0.008684548127832634
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4297138047138047,
|
48 |
+
"acc_stderr": 0.010157908005763676,
|
49 |
+
"acc_norm": 0.3985690235690236,
|
50 |
+
"acc_norm_stderr": 0.010046455400477931
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19197952218430034,
|
54 |
+
"acc_stderr": 0.011509598906598112,
|
55 |
+
"acc_norm": 0.22525597269624573,
|
56 |
+
"acc_norm_stderr": 0.012207839995407303
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.721,
|
60 |
+
"acc_stderr": 0.014190150117612032,
|
61 |
+
"acc_norm": 0.682,
|
62 |
+
"acc_norm_stderr": 0.014734079309311901
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6305767138193689,
|
66 |
+
"acc_stderr": 0.011260988628572347,
|
67 |
+
"acc_norm": 0.6180631120783461,
|
68 |
+
"acc_norm_stderr": 0.011335942557505228
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_r_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_5_lm-eval_global_step52452_2023-02-09-23-24-23_5shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.375,
|
5 |
+
"acc_stderr": 0.015316971293620996
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.339,
|
9 |
+
"acc_stderr": 0.014976758771620344
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3408333333333333,
|
13 |
+
"acc_stderr": 0.01368860079329693
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5535714285714286,
|
17 |
+
"acc_stderr": 0.06703189227942395,
|
18 |
+
"f1": 0.3464373464373464
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29486158135829516,
|
26 |
+
"acc_stderr": 0.0045504861860190746,
|
27 |
+
"acc_norm": 0.32304321848237405,
|
28 |
+
"acc_norm_stderr": 0.0046668334527961925
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5018050541516246,
|
32 |
+
"acc_stderr": 0.030096267148976626
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5027624309392266,
|
36 |
+
"acc_stderr": 0.014052271211616441
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5809727418492785,
|
40 |
+
"acc_stderr": 0.011409804749706194
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.44128440366972477,
|
44 |
+
"acc_stderr": 0.008684548127832634
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4297138047138047,
|
48 |
+
"acc_stderr": 0.010157908005763676,
|
49 |
+
"acc_norm": 0.3985690235690236,
|
50 |
+
"acc_norm_stderr": 0.010046455400477931
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19197952218430034,
|
54 |
+
"acc_stderr": 0.011509598906598112,
|
55 |
+
"acc_norm": 0.22525597269624573,
|
56 |
+
"acc_norm_stderr": 0.012207839995407303
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.721,
|
60 |
+
"acc_stderr": 0.014190150117612032,
|
61 |
+
"acc_norm": 0.682,
|
62 |
+
"acc_norm_stderr": 0.014734079309311901
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6305767138193689,
|
66 |
+
"acc_stderr": 0.011260988628572347,
|
67 |
+
"acc_norm": 0.6180631120783461,
|
68 |
+
"acc_norm_stderr": 0.011335942557505228
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.339,0.01497675877162034,0
|
3 |
+
anli_r2,acc,0.336,0.014944140233795027,0
|
4 |
+
anli_r3,acc,0.3358333333333333,0.01363926119093288,0
|
5 |
+
arc_challenge,acc,0.1885665529010239,0.011430897647675803,0
|
6 |
+
arc_challenge,acc_norm,0.22610921501706485,0.01222420209706328,0
|
7 |
+
arc_easy,acc,0.43308080808080807,0.010167478013701799,0
|
8 |
+
arc_easy,acc_norm,0.38173400673400676,0.009968648851839667,0
|
9 |
+
boolq,acc,0.5944954128440367,0.008587459055441612,1
|
10 |
+
cb,acc,0.4107142857142857,0.0663363415035954,1
|
11 |
+
cb,f1,0.1940928270042194,,1
|
12 |
+
copa,acc,0.63,0.04852365870939099,0
|
13 |
+
hellaswag,acc,0.297450707030472,0.004562022467161891,0
|
14 |
+
hellaswag,acc_norm,0.32374029077872934,0.004669459891917689,0
|
15 |
+
piqa,acc,0.6158868335146899,0.011348160741479148,0
|
16 |
+
piqa,acc_norm,0.6218715995647442,0.011313980666854533,0
|
17 |
+
rte,acc,0.5234657039711191,0.03006330041190266,0
|
18 |
+
sciq,acc,0.735,0.013963164754809953,0
|
19 |
+
sciq,acc_norm,0.668,0.014899597242811476,0
|
20 |
+
storycloze_2016,acc,0.5905932656333511,0.01137105952719707,0
|
21 |
+
winogrande,acc,0.5090765588003157,0.014050170094497707,0
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.339,
|
5 |
+
"acc_stderr": 0.01497675877162034
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.336,
|
9 |
+
"acc_stderr": 0.014944140233795027
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3358333333333333,
|
13 |
+
"acc_stderr": 0.01363926119093288
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.1940928270042194
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.63,
|
22 |
+
"acc_stderr": 0.04852365870939099
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.297450707030472,
|
26 |
+
"acc_stderr": 0.004562022467161891,
|
27 |
+
"acc_norm": 0.32374029077872934,
|
28 |
+
"acc_norm_stderr": 0.004669459891917689
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5234657039711191,
|
32 |
+
"acc_stderr": 0.03006330041190266
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5090765588003157,
|
36 |
+
"acc_stderr": 0.014050170094497707
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5905932656333511,
|
40 |
+
"acc_stderr": 0.01137105952719707
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5944954128440367,
|
44 |
+
"acc_stderr": 0.008587459055441612
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.43308080808080807,
|
48 |
+
"acc_stderr": 0.010167478013701799,
|
49 |
+
"acc_norm": 0.38173400673400676,
|
50 |
+
"acc_norm_stderr": 0.009968648851839667
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.1885665529010239,
|
54 |
+
"acc_stderr": 0.011430897647675803,
|
55 |
+
"acc_norm": 0.22610921501706485,
|
56 |
+
"acc_norm_stderr": 0.01222420209706328
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.735,
|
60 |
+
"acc_stderr": 0.013963164754809953,
|
61 |
+
"acc_norm": 0.668,
|
62 |
+
"acc_norm_stderr": 0.014899597242811476
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6158868335146899,
|
66 |
+
"acc_stderr": 0.011348160741479148,
|
67 |
+
"acc_norm": 0.6218715995647442,
|
68 |
+
"acc_norm_stderr": 0.011313980666854533
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0_lm-eval_global_step52452_2023-02-09-17-38-12_0shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.339,
|
5 |
+
"acc_stderr": 0.01497675877162034
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.336,
|
9 |
+
"acc_stderr": 0.014944140233795027
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3358333333333333,
|
13 |
+
"acc_stderr": 0.01363926119093288
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.1940928270042194
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.63,
|
22 |
+
"acc_stderr": 0.04852365870939099
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.297450707030472,
|
26 |
+
"acc_stderr": 0.004562022467161891,
|
27 |
+
"acc_norm": 0.32374029077872934,
|
28 |
+
"acc_norm_stderr": 0.004669459891917689
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5234657039711191,
|
32 |
+
"acc_stderr": 0.03006330041190266
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5090765588003157,
|
36 |
+
"acc_stderr": 0.014050170094497707
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5905932656333511,
|
40 |
+
"acc_stderr": 0.01137105952719707
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5944954128440367,
|
44 |
+
"acc_stderr": 0.008587459055441612
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.43308080808080807,
|
48 |
+
"acc_stderr": 0.010167478013701799,
|
49 |
+
"acc_norm": 0.38173400673400676,
|
50 |
+
"acc_norm_stderr": 0.009968648851839667
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.1885665529010239,
|
54 |
+
"acc_stderr": 0.011430897647675803,
|
55 |
+
"acc_norm": 0.22610921501706485,
|
56 |
+
"acc_norm_stderr": 0.01222420209706328
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.735,
|
60 |
+
"acc_stderr": 0.013963164754809953,
|
61 |
+
"acc_norm": 0.668,
|
62 |
+
"acc_norm_stderr": 0.014899597242811476
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6158868335146899,
|
66 |
+
"acc_stderr": 0.011348160741479148,
|
67 |
+
"acc_norm": 0.6218715995647442,
|
68 |
+
"acc_norm_stderr": 0.011313980666854533
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.358,
|
5 |
+
"acc_stderr": 0.015167928865407559
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.35,
|
9 |
+
"acc_stderr": 0.015090650341444236
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.32,
|
13 |
+
"acc_stderr": 0.013471620929769152
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.28651292802236195
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.68,
|
22 |
+
"acc_stderr": 0.04688261722621504
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2930691097390958,
|
26 |
+
"acc_stderr": 0.004542396269999213,
|
27 |
+
"acc_norm": 0.3207528380800637,
|
28 |
+
"acc_norm_stderr": 0.004658120152230808
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5631768953068592,
|
32 |
+
"acc_stderr": 0.02985524739031495
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.4964483030781373,
|
36 |
+
"acc_stderr": 0.01405213114691586
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.01141582799434265
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5125382262996941,
|
44 |
+
"acc_stderr": 0.008742304974218311
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4313973063973064,
|
48 |
+
"acc_stderr": 0.010162752847747498,
|
49 |
+
"acc_norm": 0.38341750841750843,
|
50 |
+
"acc_norm_stderr": 0.009976995068264717
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19027303754266212,
|
54 |
+
"acc_stderr": 0.011470424179225709,
|
55 |
+
"acc_norm": 0.22610921501706485,
|
56 |
+
"acc_norm_stderr": 0.012224202097063274
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.704,
|
60 |
+
"acc_stderr": 0.014442734941575022,
|
61 |
+
"acc_norm": 0.658,
|
62 |
+
"acc_norm_stderr": 0.015008706182121731
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6240478781284005,
|
66 |
+
"acc_stderr": 0.011301098166895732,
|
67 |
+
"acc_norm": 0.6158868335146899,
|
68 |
+
"acc_norm_stderr": 0.011348160741479136
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1_lm-eval_global_step52452_2023-02-09-17-38-12_1shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.358,
|
5 |
+
"acc_stderr": 0.015167928865407559
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.35,
|
9 |
+
"acc_stderr": 0.015090650341444236
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.32,
|
13 |
+
"acc_stderr": 0.013471620929769152
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.28651292802236195
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.68,
|
22 |
+
"acc_stderr": 0.04688261722621504
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2930691097390958,
|
26 |
+
"acc_stderr": 0.004542396269999213,
|
27 |
+
"acc_norm": 0.3207528380800637,
|
28 |
+
"acc_norm_stderr": 0.004658120152230808
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5631768953068592,
|
32 |
+
"acc_stderr": 0.02985524739031495
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.4964483030781373,
|
36 |
+
"acc_stderr": 0.01405213114691586
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.01141582799434265
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5125382262996941,
|
44 |
+
"acc_stderr": 0.008742304974218311
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4313973063973064,
|
48 |
+
"acc_stderr": 0.010162752847747498,
|
49 |
+
"acc_norm": 0.38341750841750843,
|
50 |
+
"acc_norm_stderr": 0.009976995068264717
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19027303754266212,
|
54 |
+
"acc_stderr": 0.011470424179225709,
|
55 |
+
"acc_norm": 0.22610921501706485,
|
56 |
+
"acc_norm_stderr": 0.012224202097063274
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.704,
|
60 |
+
"acc_stderr": 0.014442734941575022,
|
61 |
+
"acc_norm": 0.658,
|
62 |
+
"acc_norm_stderr": 0.015008706182121731
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6240478781284005,
|
66 |
+
"acc_stderr": 0.011301098166895732,
|
67 |
+
"acc_norm": 0.6158868335146899,
|
68 |
+
"acc_norm_stderr": 0.011348160741479136
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.308,
|
5 |
+
"acc_stderr": 0.014606483127342763
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.353,
|
9 |
+
"acc_stderr": 0.015120172605483696
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3375,
|
13 |
+
"acc_stderr": 0.013655897185463653
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4642857142857143,
|
17 |
+
"acc_stderr": 0.06724777654937658,
|
18 |
+
"f1": 0.316548463356974
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.61,
|
22 |
+
"acc_stderr": 0.04902071300001975
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2929695279824736,
|
26 |
+
"acc_stderr": 0.004541944342035901,
|
27 |
+
"acc_norm": 0.31866162119099783,
|
28 |
+
"acc_norm_stderr": 0.00465005215009441
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.47653429602888087,
|
32 |
+
"acc_stderr": 0.030063300411902652
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5011838989739542,
|
36 |
+
"acc_stderr": 0.014052446290529012
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5836451095670764,
|
40 |
+
"acc_stderr": 0.011399490926937005
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4776758409785933,
|
44 |
+
"acc_stderr": 0.00873633411558504
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42634680134680136,
|
48 |
+
"acc_stderr": 0.010147858603835139,
|
49 |
+
"acc_norm": 0.3926767676767677,
|
50 |
+
"acc_norm_stderr": 0.010020646555538686
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18344709897610922,
|
54 |
+
"acc_stderr": 0.011310170179554543,
|
55 |
+
"acc_norm": 0.22696245733788395,
|
56 |
+
"acc_norm_stderr": 0.012240491536132879
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.73,
|
60 |
+
"acc_stderr": 0.014046255632633915,
|
61 |
+
"acc_norm": 0.677,
|
62 |
+
"acc_norm_stderr": 0.014794927843348633
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6349292709466812,
|
66 |
+
"acc_stderr": 0.011233021830554829,
|
67 |
+
"acc_norm": 0.6251360174102285,
|
68 |
+
"acc_norm_stderr": 0.011294565805619019
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2_lm-eval_global_step52452_2023-02-09-17-36-57_2shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.308,
|
5 |
+
"acc_stderr": 0.014606483127342763
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.353,
|
9 |
+
"acc_stderr": 0.015120172605483696
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3375,
|
13 |
+
"acc_stderr": 0.013655897185463653
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4642857142857143,
|
17 |
+
"acc_stderr": 0.06724777654937658,
|
18 |
+
"f1": 0.316548463356974
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.61,
|
22 |
+
"acc_stderr": 0.04902071300001975
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2929695279824736,
|
26 |
+
"acc_stderr": 0.004541944342035901,
|
27 |
+
"acc_norm": 0.31866162119099783,
|
28 |
+
"acc_norm_stderr": 0.00465005215009441
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.47653429602888087,
|
32 |
+
"acc_stderr": 0.030063300411902652
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5011838989739542,
|
36 |
+
"acc_stderr": 0.014052446290529012
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5836451095670764,
|
40 |
+
"acc_stderr": 0.011399490926937005
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4776758409785933,
|
44 |
+
"acc_stderr": 0.00873633411558504
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42634680134680136,
|
48 |
+
"acc_stderr": 0.010147858603835139,
|
49 |
+
"acc_norm": 0.3926767676767677,
|
50 |
+
"acc_norm_stderr": 0.010020646555538686
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18344709897610922,
|
54 |
+
"acc_stderr": 0.011310170179554543,
|
55 |
+
"acc_norm": 0.22696245733788395,
|
56 |
+
"acc_norm_stderr": 0.012240491536132879
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.73,
|
60 |
+
"acc_stderr": 0.014046255632633915,
|
61 |
+
"acc_norm": 0.677,
|
62 |
+
"acc_norm_stderr": 0.014794927843348633
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6349292709466812,
|
66 |
+
"acc_stderr": 0.011233021830554829,
|
67 |
+
"acc_norm": 0.6251360174102285,
|
68 |
+
"acc_norm_stderr": 0.011294565805619019
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.314,
|
5 |
+
"acc_stderr": 0.014683991951087966
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.341,
|
9 |
+
"acc_stderr": 0.014998131348402704
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3466666666666667,
|
13 |
+
"acc_stderr": 0.013744022550571949
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5178571428571429,
|
17 |
+
"acc_stderr": 0.06737697508644648,
|
18 |
+
"f1": 0.347985347985348
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.59,
|
22 |
+
"acc_stderr": 0.04943110704237101
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2953594901414061,
|
26 |
+
"acc_stderr": 0.004552718360513099,
|
27 |
+
"acc_norm": 0.3241386178052181,
|
28 |
+
"acc_norm_stderr": 0.0046709553996411276
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5090252707581228,
|
32 |
+
"acc_stderr": 0.030091559826331334
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.494869771112865,
|
36 |
+
"acc_stderr": 0.014051745961790513
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5788348476750401,
|
40 |
+
"acc_stderr": 0.011417808278216117
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4694189602446483,
|
44 |
+
"acc_stderr": 0.008728682900189723
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4276094276094276,
|
48 |
+
"acc_stderr": 0.010151683397430679,
|
49 |
+
"acc_norm": 0.39141414141414144,
|
50 |
+
"acc_norm_stderr": 0.010014917532627812
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19027303754266212,
|
54 |
+
"acc_stderr": 0.011470424179225698,
|
55 |
+
"acc_norm": 0.2235494880546075,
|
56 |
+
"acc_norm_stderr": 0.012174896631202607
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.716,
|
60 |
+
"acc_stderr": 0.014267009061031314,
|
61 |
+
"acc_norm": 0.679,
|
62 |
+
"acc_norm_stderr": 0.014770821817934645
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6289445048966268,
|
66 |
+
"acc_stderr": 0.011271222398600525,
|
67 |
+
"acc_norm": 0.6202393906420022,
|
68 |
+
"acc_norm_stderr": 0.011323483504715843
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3_lm-eval_global_step52452_2023-02-09-17-36-57_3shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.314,
|
5 |
+
"acc_stderr": 0.014683991951087966
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.341,
|
9 |
+
"acc_stderr": 0.014998131348402704
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3466666666666667,
|
13 |
+
"acc_stderr": 0.013744022550571949
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5178571428571429,
|
17 |
+
"acc_stderr": 0.06737697508644648,
|
18 |
+
"f1": 0.347985347985348
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.59,
|
22 |
+
"acc_stderr": 0.04943110704237101
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2953594901414061,
|
26 |
+
"acc_stderr": 0.004552718360513099,
|
27 |
+
"acc_norm": 0.3241386178052181,
|
28 |
+
"acc_norm_stderr": 0.0046709553996411276
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5090252707581228,
|
32 |
+
"acc_stderr": 0.030091559826331334
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.494869771112865,
|
36 |
+
"acc_stderr": 0.014051745961790513
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5788348476750401,
|
40 |
+
"acc_stderr": 0.011417808278216117
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4694189602446483,
|
44 |
+
"acc_stderr": 0.008728682900189723
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4276094276094276,
|
48 |
+
"acc_stderr": 0.010151683397430679,
|
49 |
+
"acc_norm": 0.39141414141414144,
|
50 |
+
"acc_norm_stderr": 0.010014917532627812
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19027303754266212,
|
54 |
+
"acc_stderr": 0.011470424179225698,
|
55 |
+
"acc_norm": 0.2235494880546075,
|
56 |
+
"acc_norm_stderr": 0.012174896631202607
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.716,
|
60 |
+
"acc_stderr": 0.014267009061031314,
|
61 |
+
"acc_norm": 0.679,
|
62 |
+
"acc_norm_stderr": 0.014770821817934645
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6289445048966268,
|
66 |
+
"acc_stderr": 0.011271222398600525,
|
67 |
+
"acc_norm": 0.6202393906420022,
|
68 |
+
"acc_norm_stderr": 0.011323483504715843
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.336,
|
5 |
+
"acc_stderr": 0.014944140233795025
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.353,
|
9 |
+
"acc_stderr": 0.01512017260548369
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3375,
|
13 |
+
"acc_stderr": 0.01365589718546366
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5178571428571429,
|
17 |
+
"acc_stderr": 0.06737697508644645,
|
18 |
+
"f1": 0.3362023995826813
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.6,
|
22 |
+
"acc_stderr": 0.04923659639173309
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2938657637920733,
|
26 |
+
"acc_stderr": 0.004546002255456781,
|
27 |
+
"acc_norm": 0.32204740091615214,
|
28 |
+
"acc_norm_stderr": 0.00466306082837678
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.49097472924187724,
|
32 |
+
"acc_stderr": 0.030091559826331334
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.48303078137332284,
|
36 |
+
"acc_stderr": 0.014044390401612969
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.011415827994342653
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4437308868501529,
|
44 |
+
"acc_stderr": 0.008689501105367405
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.43013468013468015,
|
48 |
+
"acc_stderr": 0.010159130445178514,
|
49 |
+
"acc_norm": 0.39225589225589225,
|
50 |
+
"acc_norm_stderr": 0.010018744689650043
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19795221843003413,
|
54 |
+
"acc_stderr": 0.011643990971573395,
|
55 |
+
"acc_norm": 0.23122866894197952,
|
56 |
+
"acc_norm_stderr": 0.012320858834772266
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.721,
|
60 |
+
"acc_stderr": 0.01419015011761203,
|
61 |
+
"acc_norm": 0.686,
|
62 |
+
"acc_norm_stderr": 0.014683991951087967
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6322089227421109,
|
66 |
+
"acc_stderr": 0.011250616646678797,
|
67 |
+
"acc_norm": 0.6240478781284005,
|
68 |
+
"acc_norm_stderr": 0.011301098166895724
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4_lm-eval_global_step52452_2023-02-09-17-38-12_4shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.336,
|
5 |
+
"acc_stderr": 0.014944140233795025
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.353,
|
9 |
+
"acc_stderr": 0.01512017260548369
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3375,
|
13 |
+
"acc_stderr": 0.01365589718546366
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5178571428571429,
|
17 |
+
"acc_stderr": 0.06737697508644645,
|
18 |
+
"f1": 0.3362023995826813
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.6,
|
22 |
+
"acc_stderr": 0.04923659639173309
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2938657637920733,
|
26 |
+
"acc_stderr": 0.004546002255456781,
|
27 |
+
"acc_norm": 0.32204740091615214,
|
28 |
+
"acc_norm_stderr": 0.00466306082837678
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.49097472924187724,
|
32 |
+
"acc_stderr": 0.030091559826331334
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.48303078137332284,
|
36 |
+
"acc_stderr": 0.014044390401612969
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.011415827994342653
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4437308868501529,
|
44 |
+
"acc_stderr": 0.008689501105367405
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.43013468013468015,
|
48 |
+
"acc_stderr": 0.010159130445178514,
|
49 |
+
"acc_norm": 0.39225589225589225,
|
50 |
+
"acc_norm_stderr": 0.010018744689650043
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19795221843003413,
|
54 |
+
"acc_stderr": 0.011643990971573395,
|
55 |
+
"acc_norm": 0.23122866894197952,
|
56 |
+
"acc_norm_stderr": 0.012320858834772266
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.721,
|
60 |
+
"acc_stderr": 0.01419015011761203,
|
61 |
+
"acc_norm": 0.686,
|
62 |
+
"acc_norm_stderr": 0.014683991951087967
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6322089227421109,
|
66 |
+
"acc_stderr": 0.011250616646678797,
|
67 |
+
"acc_norm": 0.6240478781284005,
|
68 |
+
"acc_norm_stderr": 0.011301098166895724
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_5.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.368,
|
5 |
+
"acc_stderr": 0.015258073561521802
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.335,
|
9 |
+
"acc_stderr": 0.014933117490932577
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3416666666666667,
|
13 |
+
"acc_stderr": 0.013696658778002515
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5535714285714286,
|
17 |
+
"acc_stderr": 0.06703189227942395,
|
18 |
+
"f1": 0.3459575611066344
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2944632543318064,
|
26 |
+
"acc_stderr": 0.00454869574962096,
|
27 |
+
"acc_norm": 0.32423819956184025,
|
28 |
+
"acc_norm_stderr": 0.0046713286732178
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.4981949458483754,
|
32 |
+
"acc_stderr": 0.030096267148976633
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.4972375690607735,
|
36 |
+
"acc_stderr": 0.014052271211616441
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5836451095670764,
|
40 |
+
"acc_stderr": 0.011399490926937006
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.44128440366972477,
|
44 |
+
"acc_stderr": 0.008684548127832634
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4297138047138047,
|
48 |
+
"acc_stderr": 0.010157908005763678,
|
49 |
+
"acc_norm": 0.39941077441077444,
|
50 |
+
"acc_norm_stderr": 0.010050018228742115
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19283276450511946,
|
54 |
+
"acc_stderr": 0.011529055465663338,
|
55 |
+
"acc_norm": 0.22696245733788395,
|
56 |
+
"acc_norm_stderr": 0.012240491536132873
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.717,
|
60 |
+
"acc_stderr": 0.014251810906481735,
|
61 |
+
"acc_norm": 0.68,
|
62 |
+
"acc_norm_stderr": 0.014758652303574883
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6278563656147987,
|
66 |
+
"acc_stderr": 0.01127796831359274,
|
67 |
+
"acc_norm": 0.6207834602829162,
|
68 |
+
"acc_norm_stderr": 0.011320331012905077
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_5_lm-eval_global_step52452_2023-02-09-17-38-12_5shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.368,
|
5 |
+
"acc_stderr": 0.015258073561521802
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.335,
|
9 |
+
"acc_stderr": 0.014933117490932577
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3416666666666667,
|
13 |
+
"acc_stderr": 0.013696658778002515
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5535714285714286,
|
17 |
+
"acc_stderr": 0.06703189227942395,
|
18 |
+
"f1": 0.3459575611066344
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2944632543318064,
|
26 |
+
"acc_stderr": 0.00454869574962096,
|
27 |
+
"acc_norm": 0.32423819956184025,
|
28 |
+
"acc_norm_stderr": 0.0046713286732178
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.4981949458483754,
|
32 |
+
"acc_stderr": 0.030096267148976633
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.4972375690607735,
|
36 |
+
"acc_stderr": 0.014052271211616441
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5836451095670764,
|
40 |
+
"acc_stderr": 0.011399490926937006
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.44128440366972477,
|
44 |
+
"acc_stderr": 0.008684548127832634
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4297138047138047,
|
48 |
+
"acc_stderr": 0.010157908005763678,
|
49 |
+
"acc_norm": 0.39941077441077444,
|
50 |
+
"acc_norm_stderr": 0.010050018228742115
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19283276450511946,
|
54 |
+
"acc_stderr": 0.011529055465663338,
|
55 |
+
"acc_norm": 0.22696245733788395,
|
56 |
+
"acc_norm_stderr": 0.012240491536132873
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.717,
|
60 |
+
"acc_stderr": 0.014251810906481735,
|
61 |
+
"acc_norm": 0.68,
|
62 |
+
"acc_norm_stderr": 0.014758652303574883
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6278563656147987,
|
66 |
+
"acc_stderr": 0.01127796831359274,
|
67 |
+
"acc_norm": 0.6207834602829162,
|
68 |
+
"acc_norm_stderr": 0.011320331012905077
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.333,0.014910846164229868,0
|
3 |
+
anli_r2,acc,0.339,0.014976758771620344,0
|
4 |
+
anli_r3,acc,0.33916666666666667,0.013672343491681819,0
|
5 |
+
arc_challenge,acc,0.18600682593856654,0.011370940183266749,0
|
6 |
+
arc_challenge,acc_norm,0.22610921501706485,0.01222420209706328,0
|
7 |
+
arc_easy,acc,0.42003367003367004,0.010127718838529398,0
|
8 |
+
arc_easy,acc_norm,0.3728956228956229,0.009922743197129255,0
|
9 |
+
boolq,acc,0.6051987767584098,0.008549304887647411,1
|
10 |
+
cb,acc,0.4107142857142857,0.0663363415035954,1
|
11 |
+
cb,f1,0.1940928270042194,,1
|
12 |
+
copa,acc,0.62,0.04878317312145632,0
|
13 |
+
hellaswag,acc,0.2949611631149173,0.004550933142528753,0
|
14 |
+
hellaswag,acc_norm,0.32463652658832903,0.004672819355838551,0
|
15 |
+
piqa,acc,0.6251360174102285,0.011294565805619017,0
|
16 |
+
piqa,acc_norm,0.6224156692056583,0.011310782787145772,0
|
17 |
+
rte,acc,0.5342960288808665,0.030025579819366422,0
|
18 |
+
sciq,acc,0.735,0.013963164754809949,0
|
19 |
+
sciq,acc_norm,0.656,0.015029633724408945,0
|
20 |
+
storycloze_2016,acc,0.5873864243719936,0.011384472322969045,0
|
21 |
+
winogrande,acc,0.516179952644041,0.014045126130978601,0
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_0.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.333,
|
5 |
+
"acc_stderr": 0.014910846164229868
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.339,
|
9 |
+
"acc_stderr": 0.014976758771620344
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.33916666666666667,
|
13 |
+
"acc_stderr": 0.013672343491681819
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.1940928270042194
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145632
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2949611631149173,
|
26 |
+
"acc_stderr": 0.004550933142528753,
|
27 |
+
"acc_norm": 0.32463652658832903,
|
28 |
+
"acc_norm_stderr": 0.004672819355838551
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5342960288808665,
|
32 |
+
"acc_stderr": 0.030025579819366422
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.516179952644041,
|
36 |
+
"acc_stderr": 0.014045126130978601
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5873864243719936,
|
40 |
+
"acc_stderr": 0.011384472322969045
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.6051987767584098,
|
44 |
+
"acc_stderr": 0.008549304887647411
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42003367003367004,
|
48 |
+
"acc_stderr": 0.010127718838529398,
|
49 |
+
"acc_norm": 0.3728956228956229,
|
50 |
+
"acc_norm_stderr": 0.009922743197129255
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18600682593856654,
|
54 |
+
"acc_stderr": 0.011370940183266749,
|
55 |
+
"acc_norm": 0.22610921501706485,
|
56 |
+
"acc_norm_stderr": 0.01222420209706328
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.735,
|
60 |
+
"acc_stderr": 0.013963164754809949,
|
61 |
+
"acc_norm": 0.656,
|
62 |
+
"acc_norm_stderr": 0.015029633724408945
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6251360174102285,
|
66 |
+
"acc_stderr": 0.011294565805619017,
|
67 |
+
"acc_norm": 0.6224156692056583,
|
68 |
+
"acc_norm_stderr": 0.011310782787145772
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_0_lm-eval_global_step42000_2023-02-08-13-42-29_0shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.333,
|
5 |
+
"acc_stderr": 0.014910846164229868
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.339,
|
9 |
+
"acc_stderr": 0.014976758771620344
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.33916666666666667,
|
13 |
+
"acc_stderr": 0.013672343491681819
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.1940928270042194
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145632
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2949611631149173,
|
26 |
+
"acc_stderr": 0.004550933142528753,
|
27 |
+
"acc_norm": 0.32463652658832903,
|
28 |
+
"acc_norm_stderr": 0.004672819355838551
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5342960288808665,
|
32 |
+
"acc_stderr": 0.030025579819366422
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.516179952644041,
|
36 |
+
"acc_stderr": 0.014045126130978601
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5873864243719936,
|
40 |
+
"acc_stderr": 0.011384472322969045
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.6051987767584098,
|
44 |
+
"acc_stderr": 0.008549304887647411
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42003367003367004,
|
48 |
+
"acc_stderr": 0.010127718838529398,
|
49 |
+
"acc_norm": 0.3728956228956229,
|
50 |
+
"acc_norm_stderr": 0.009922743197129255
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18600682593856654,
|
54 |
+
"acc_stderr": 0.011370940183266749,
|
55 |
+
"acc_norm": 0.22610921501706485,
|
56 |
+
"acc_norm_stderr": 0.01222420209706328
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.735,
|
60 |
+
"acc_stderr": 0.013963164754809949,
|
61 |
+
"acc_norm": 0.656,
|
62 |
+
"acc_norm_stderr": 0.015029633724408945
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6251360174102285,
|
66 |
+
"acc_stderr": 0.011294565805619017,
|
67 |
+
"acc_norm": 0.6224156692056583,
|
68 |
+
"acc_norm_stderr": 0.011310782787145772
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_1.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.348,
|
5 |
+
"acc_stderr": 0.01507060460376841
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.335,
|
9 |
+
"acc_stderr": 0.014933117490932575
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3416666666666667,
|
13 |
+
"acc_stderr": 0.013696658778002505
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.44642857142857145,
|
17 |
+
"acc_stderr": 0.06703189227942398,
|
18 |
+
"f1": 0.2712571726656234
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.65,
|
22 |
+
"acc_stderr": 0.04793724854411019
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29396534554869547,
|
26 |
+
"acc_stderr": 0.004546451825028366,
|
27 |
+
"acc_norm": 0.3170683130850428,
|
28 |
+
"acc_norm_stderr": 0.004643832742876639
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5487364620938628,
|
32 |
+
"acc_stderr": 0.029953149241808946
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5067087608524072,
|
36 |
+
"acc_stderr": 0.014051220692330349
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5783003741314805,
|
40 |
+
"acc_stderr": 0.011419774841868156
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5587155963302752,
|
44 |
+
"acc_stderr": 0.008684548127832634
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.41203703703703703,
|
48 |
+
"acc_stderr": 0.010099765857562773,
|
49 |
+
"acc_norm": 0.3720538720538721,
|
50 |
+
"acc_norm_stderr": 0.009918187193096468
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.181740614334471,
|
54 |
+
"acc_stderr": 0.011269198948880236,
|
55 |
+
"acc_norm": 0.2167235494880546,
|
56 |
+
"acc_norm_stderr": 0.012040156713481192
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.685,
|
60 |
+
"acc_stderr": 0.014696631960792492,
|
61 |
+
"acc_norm": 0.632,
|
62 |
+
"acc_norm_stderr": 0.0152580735615218
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6294885745375408,
|
66 |
+
"acc_stderr": 0.011267826475447665,
|
67 |
+
"acc_norm": 0.6262241566920566,
|
68 |
+
"acc_norm_stderr": 0.011287972563201017
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_1_lm-eval_global_step42000_2023-02-08-13-42-29_1shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.348,
|
5 |
+
"acc_stderr": 0.01507060460376841
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.335,
|
9 |
+
"acc_stderr": 0.014933117490932575
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3416666666666667,
|
13 |
+
"acc_stderr": 0.013696658778002505
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.44642857142857145,
|
17 |
+
"acc_stderr": 0.06703189227942398,
|
18 |
+
"f1": 0.2712571726656234
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.65,
|
22 |
+
"acc_stderr": 0.04793724854411019
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29396534554869547,
|
26 |
+
"acc_stderr": 0.004546451825028366,
|
27 |
+
"acc_norm": 0.3170683130850428,
|
28 |
+
"acc_norm_stderr": 0.004643832742876639
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5487364620938628,
|
32 |
+
"acc_stderr": 0.029953149241808946
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5067087608524072,
|
36 |
+
"acc_stderr": 0.014051220692330349
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5783003741314805,
|
40 |
+
"acc_stderr": 0.011419774841868156
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5587155963302752,
|
44 |
+
"acc_stderr": 0.008684548127832634
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.41203703703703703,
|
48 |
+
"acc_stderr": 0.010099765857562773,
|
49 |
+
"acc_norm": 0.3720538720538721,
|
50 |
+
"acc_norm_stderr": 0.009918187193096468
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.181740614334471,
|
54 |
+
"acc_stderr": 0.011269198948880236,
|
55 |
+
"acc_norm": 0.2167235494880546,
|
56 |
+
"acc_norm_stderr": 0.012040156713481192
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.685,
|
60 |
+
"acc_stderr": 0.014696631960792492,
|
61 |
+
"acc_norm": 0.632,
|
62 |
+
"acc_norm_stderr": 0.0152580735615218
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6294885745375408,
|
66 |
+
"acc_stderr": 0.011267826475447665,
|
67 |
+
"acc_norm": 0.6262241566920566,
|
68 |
+
"acc_norm_stderr": 0.011287972563201017
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_2.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.313,
|
5 |
+
"acc_stderr": 0.014671272822977892
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.336,
|
9 |
+
"acc_stderr": 0.014944140233795023
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3425,
|
13 |
+
"acc_stderr": 0.013704669762934732
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.42857142857142855,
|
17 |
+
"acc_stderr": 0.06672848092813058,
|
18 |
+
"f1": 0.2791044776119403
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2887870942043418,
|
26 |
+
"acc_stderr": 0.004522725412556968,
|
27 |
+
"acc_norm": 0.31517625970922125,
|
28 |
+
"acc_norm_stderr": 0.004636365534819762
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.48014440433212996,
|
32 |
+
"acc_stderr": 0.0300727231673172
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5011838989739542,
|
36 |
+
"acc_stderr": 0.014052446290529012
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5809727418492785,
|
40 |
+
"acc_stderr": 0.011409804749706194
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5483180428134556,
|
44 |
+
"acc_stderr": 0.008704126206159355
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.398989898989899,
|
48 |
+
"acc_stderr": 0.010048240683798759,
|
49 |
+
"acc_norm": 0.36784511784511786,
|
50 |
+
"acc_norm_stderr": 0.009894923464455196
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18515358361774745,
|
54 |
+
"acc_stderr": 0.011350774438389695,
|
55 |
+
"acc_norm": 0.22781569965870307,
|
56 |
+
"acc_norm_stderr": 0.012256708602326914
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.717,
|
60 |
+
"acc_stderr": 0.014251810906481735,
|
61 |
+
"acc_norm": 0.634,
|
62 |
+
"acc_norm_stderr": 0.015240612726405756
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6327529923830251,
|
66 |
+
"acc_stderr": 0.011247128539690563,
|
67 |
+
"acc_norm": 0.6175190424374319,
|
68 |
+
"acc_norm_stderr": 0.011339019654272345
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_2_lm-eval_global_step42000_2023-02-08-13-42-29_2shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.313,
|
5 |
+
"acc_stderr": 0.014671272822977892
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.336,
|
9 |
+
"acc_stderr": 0.014944140233795023
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3425,
|
13 |
+
"acc_stderr": 0.013704669762934732
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.42857142857142855,
|
17 |
+
"acc_stderr": 0.06672848092813058,
|
18 |
+
"f1": 0.2791044776119403
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2887870942043418,
|
26 |
+
"acc_stderr": 0.004522725412556968,
|
27 |
+
"acc_norm": 0.31517625970922125,
|
28 |
+
"acc_norm_stderr": 0.004636365534819762
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.48014440433212996,
|
32 |
+
"acc_stderr": 0.0300727231673172
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5011838989739542,
|
36 |
+
"acc_stderr": 0.014052446290529012
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5809727418492785,
|
40 |
+
"acc_stderr": 0.011409804749706194
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5483180428134556,
|
44 |
+
"acc_stderr": 0.008704126206159355
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.398989898989899,
|
48 |
+
"acc_stderr": 0.010048240683798759,
|
49 |
+
"acc_norm": 0.36784511784511786,
|
50 |
+
"acc_norm_stderr": 0.009894923464455196
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18515358361774745,
|
54 |
+
"acc_stderr": 0.011350774438389695,
|
55 |
+
"acc_norm": 0.22781569965870307,
|
56 |
+
"acc_norm_stderr": 0.012256708602326914
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.717,
|
60 |
+
"acc_stderr": 0.014251810906481735,
|
61 |
+
"acc_norm": 0.634,
|
62 |
+
"acc_norm_stderr": 0.015240612726405756
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6327529923830251,
|
66 |
+
"acc_stderr": 0.011247128539690563,
|
67 |
+
"acc_norm": 0.6175190424374319,
|
68 |
+
"acc_norm_stderr": 0.011339019654272345
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_3.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.321,
|
5 |
+
"acc_stderr": 0.01477082181793465
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.343,
|
9 |
+
"acc_stderr": 0.015019206922356953
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.33916666666666667,
|
13 |
+
"acc_stderr": 0.013672343491681822
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.39285714285714285,
|
17 |
+
"acc_stderr": 0.0658538889806635,
|
18 |
+
"f1": 0.2593406593406593
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29087831109340767,
|
26 |
+
"acc_stderr": 0.004532393111248679,
|
27 |
+
"acc_norm": 0.3136825333598885,
|
28 |
+
"acc_norm_stderr": 0.004630407476835188
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5090252707581228,
|
32 |
+
"acc_stderr": 0.030091559826331334
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5146014206787688,
|
36 |
+
"acc_stderr": 0.014046492383275835
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5783003741314805,
|
40 |
+
"acc_stderr": 0.011419774841868156
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5495412844036697,
|
44 |
+
"acc_stderr": 0.008702022442950878
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4090909090909091,
|
48 |
+
"acc_stderr": 0.010088775152615779,
|
49 |
+
"acc_norm": 0.3686868686868687,
|
50 |
+
"acc_norm_stderr": 0.009899640855681038
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18771331058020477,
|
54 |
+
"acc_stderr": 0.011411001314155136,
|
55 |
+
"acc_norm": 0.22098976109215018,
|
56 |
+
"acc_norm_stderr": 0.012124929206818258
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.694,
|
60 |
+
"acc_stderr": 0.014580006055436972,
|
61 |
+
"acc_norm": 0.652,
|
62 |
+
"acc_norm_stderr": 0.015070604603768408
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6354733405875952,
|
66 |
+
"acc_stderr": 0.011229456510295966,
|
67 |
+
"acc_norm": 0.6262241566920566,
|
68 |
+
"acc_norm_stderr": 0.011287972563201014
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_3_lm-eval_global_step42000_2023-02-08-13-42-29_3shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.321,
|
5 |
+
"acc_stderr": 0.01477082181793465
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.343,
|
9 |
+
"acc_stderr": 0.015019206922356953
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.33916666666666667,
|
13 |
+
"acc_stderr": 0.013672343491681822
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.39285714285714285,
|
17 |
+
"acc_stderr": 0.0658538889806635,
|
18 |
+
"f1": 0.2593406593406593
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29087831109340767,
|
26 |
+
"acc_stderr": 0.004532393111248679,
|
27 |
+
"acc_norm": 0.3136825333598885,
|
28 |
+
"acc_norm_stderr": 0.004630407476835188
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5090252707581228,
|
32 |
+
"acc_stderr": 0.030091559826331334
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5146014206787688,
|
36 |
+
"acc_stderr": 0.014046492383275835
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5783003741314805,
|
40 |
+
"acc_stderr": 0.011419774841868156
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5495412844036697,
|
44 |
+
"acc_stderr": 0.008702022442950878
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4090909090909091,
|
48 |
+
"acc_stderr": 0.010088775152615779,
|
49 |
+
"acc_norm": 0.3686868686868687,
|
50 |
+
"acc_norm_stderr": 0.009899640855681038
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.18771331058020477,
|
54 |
+
"acc_stderr": 0.011411001314155136,
|
55 |
+
"acc_norm": 0.22098976109215018,
|
56 |
+
"acc_norm_stderr": 0.012124929206818258
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.694,
|
60 |
+
"acc_stderr": 0.014580006055436972,
|
61 |
+
"acc_norm": 0.652,
|
62 |
+
"acc_norm_stderr": 0.015070604603768408
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6354733405875952,
|
66 |
+
"acc_stderr": 0.011229456510295966,
|
67 |
+
"acc_norm": 0.6262241566920566,
|
68 |
+
"acc_norm_stderr": 0.011287972563201014
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_4.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.34,
|
5 |
+
"acc_stderr": 0.014987482264363937
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.341,
|
9 |
+
"acc_stderr": 0.014998131348402697
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.355,
|
13 |
+
"acc_stderr": 0.013819249004047298
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.44642857142857145,
|
17 |
+
"acc_stderr": 0.06703189227942397,
|
18 |
+
"f1": 0.29572649572649573
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.57,
|
22 |
+
"acc_stderr": 0.04975698519562428
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.287293367855009,
|
26 |
+
"acc_stderr": 0.004515748192605717,
|
27 |
+
"acc_norm": 0.3125871340370444,
|
28 |
+
"acc_norm_stderr": 0.004626002828389158
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.4657039711191336,
|
32 |
+
"acc_stderr": 0.030025579819366426
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5169692186266772,
|
36 |
+
"acc_stderr": 0.014044390401612976
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5750935328701229,
|
40 |
+
"acc_stderr": 0.011431286492205843
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5513761467889908,
|
44 |
+
"acc_stderr": 0.008698767182005272
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4057239057239057,
|
48 |
+
"acc_stderr": 0.010075755540128876,
|
49 |
+
"acc_norm": 0.37626262626262624,
|
50 |
+
"acc_norm_stderr": 0.009940646221513786
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.1962457337883959,
|
54 |
+
"acc_stderr": 0.011606019881416286,
|
55 |
+
"acc_norm": 0.22781569965870307,
|
56 |
+
"acc_norm_stderr": 0.012256708602326905
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.691,
|
60 |
+
"acc_stderr": 0.014619600977206486,
|
61 |
+
"acc_norm": 0.658,
|
62 |
+
"acc_norm_stderr": 0.01500870618212173
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6305767138193689,
|
66 |
+
"acc_stderr": 0.011260988628572341,
|
67 |
+
"acc_norm": 0.6175190424374319,
|
68 |
+
"acc_norm_stderr": 0.011339019654272347
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_4_lm-eval_global_step42000_2023-02-08-13-42-29_4shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.34,
|
5 |
+
"acc_stderr": 0.014987482264363937
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.341,
|
9 |
+
"acc_stderr": 0.014998131348402697
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.355,
|
13 |
+
"acc_stderr": 0.013819249004047298
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.44642857142857145,
|
17 |
+
"acc_stderr": 0.06703189227942397,
|
18 |
+
"f1": 0.29572649572649573
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.57,
|
22 |
+
"acc_stderr": 0.04975698519562428
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.287293367855009,
|
26 |
+
"acc_stderr": 0.004515748192605717,
|
27 |
+
"acc_norm": 0.3125871340370444,
|
28 |
+
"acc_norm_stderr": 0.004626002828389158
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.4657039711191336,
|
32 |
+
"acc_stderr": 0.030025579819366426
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5169692186266772,
|
36 |
+
"acc_stderr": 0.014044390401612976
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5750935328701229,
|
40 |
+
"acc_stderr": 0.011431286492205843
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5513761467889908,
|
44 |
+
"acc_stderr": 0.008698767182005272
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4057239057239057,
|
48 |
+
"acc_stderr": 0.010075755540128876,
|
49 |
+
"acc_norm": 0.37626262626262624,
|
50 |
+
"acc_norm_stderr": 0.009940646221513786
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.1962457337883959,
|
54 |
+
"acc_stderr": 0.011606019881416286,
|
55 |
+
"acc_norm": 0.22781569965870307,
|
56 |
+
"acc_norm_stderr": 0.012256708602326905
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.691,
|
60 |
+
"acc_stderr": 0.014619600977206486,
|
61 |
+
"acc_norm": 0.658,
|
62 |
+
"acc_norm_stderr": 0.01500870618212173
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6305767138193689,
|
66 |
+
"acc_stderr": 0.011260988628572341,
|
67 |
+
"acc_norm": 0.6175190424374319,
|
68 |
+
"acc_norm_stderr": 0.011339019654272347
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_5.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.353,
|
5 |
+
"acc_stderr": 0.01512017260548369
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.342,
|
9 |
+
"acc_stderr": 0.01500870618212173
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3575,
|
13 |
+
"acc_stderr": 0.013840921245257794
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5,
|
17 |
+
"acc_stderr": 0.06741998624632421,
|
18 |
+
"f1": 0.34521263958184845
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.56,
|
22 |
+
"acc_stderr": 0.04988876515698589
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2891854212308305,
|
26 |
+
"acc_stderr": 0.004524575892952968,
|
27 |
+
"acc_norm": 0.3157737502489544,
|
28 |
+
"acc_norm_stderr": 0.004638733202373885
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5379061371841155,
|
32 |
+
"acc_stderr": 0.030009848912529117
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5019731649565904,
|
36 |
+
"acc_stderr": 0.014052376259225632
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.011415827994342657
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5370030581039755,
|
44 |
+
"acc_stderr": 0.008721074177479658
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4031986531986532,
|
48 |
+
"acc_stderr": 0.010065668576794787,
|
49 |
+
"acc_norm": 0.37457912457912457,
|
50 |
+
"acc_norm_stderr": 0.00993175882041061
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.189419795221843,
|
54 |
+
"acc_stderr": 0.01145070511591077,
|
55 |
+
"acc_norm": 0.22696245733788395,
|
56 |
+
"acc_norm_stderr": 0.012240491536132873
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.71,
|
60 |
+
"acc_stderr": 0.01435639599990569,
|
61 |
+
"acc_norm": 0.665,
|
62 |
+
"acc_norm_stderr": 0.014933117490932572
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6229597388465724,
|
66 |
+
"acc_stderr": 0.011307569752543902,
|
67 |
+
"acc_norm": 0.6126224156692056,
|
68 |
+
"acc_norm_stderr": 0.011366038083435908
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_s_denoiser_44b/checkpoints_2b855b55bc4ul2ndfixnew_5_lm-eval_global_step42000_2023-02-08-13-42-29_5shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.353,
|
5 |
+
"acc_stderr": 0.01512017260548369
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.342,
|
9 |
+
"acc_stderr": 0.01500870618212173
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3575,
|
13 |
+
"acc_stderr": 0.013840921245257794
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5,
|
17 |
+
"acc_stderr": 0.06741998624632421,
|
18 |
+
"f1": 0.34521263958184845
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.56,
|
22 |
+
"acc_stderr": 0.04988876515698589
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2891854212308305,
|
26 |
+
"acc_stderr": 0.004524575892952968,
|
27 |
+
"acc_norm": 0.3157737502489544,
|
28 |
+
"acc_norm_stderr": 0.004638733202373885
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5379061371841155,
|
32 |
+
"acc_stderr": 0.030009848912529117
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5019731649565904,
|
36 |
+
"acc_stderr": 0.014052376259225632
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.011415827994342657
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5370030581039755,
|
44 |
+
"acc_stderr": 0.008721074177479658
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4031986531986532,
|
48 |
+
"acc_stderr": 0.010065668576794787,
|
49 |
+
"acc_norm": 0.37457912457912457,
|
50 |
+
"acc_norm_stderr": 0.00993175882041061
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.189419795221843,
|
54 |
+
"acc_stderr": 0.01145070511591077,
|
55 |
+
"acc_norm": 0.22696245733788395,
|
56 |
+
"acc_norm_stderr": 0.012240491536132873
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.71,
|
60 |
+
"acc_stderr": 0.01435639599990569,
|
61 |
+
"acc_norm": 0.665,
|
62 |
+
"acc_norm_stderr": 0.014933117490932572
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6229597388465724,
|
66 |
+
"acc_stderr": 0.011307569752543902,
|
67 |
+
"acc_norm": 0.6126224156692056,
|
68 |
+
"acc_norm_stderr": 0.011366038083435908
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.338,0.014965960710224473,0
|
3 |
+
anli_r2,acc,0.333,0.01491084616422986,0
|
4 |
+
anli_r3,acc,0.335,0.01363087184382148,0
|
5 |
+
arc_challenge,acc,0.181740614334471,0.011269198948880236,0
|
6 |
+
arc_challenge,acc_norm,0.22098976109215018,0.012124929206818258,0
|
7 |
+
arc_easy,acc,0.43434343434343436,0.010170943451269425,0
|
8 |
+
arc_easy,acc_norm,0.382996632996633,0.009974920384536472,0
|
9 |
+
boolq,acc,0.5935779816513761,0.008590531708882188,1
|
10 |
+
cb,acc,0.4107142857142857,0.0663363415035954,1
|
11 |
+
cb,f1,0.1940928270042194,,1
|
12 |
+
copa,acc,0.63,0.048523658709391,0
|
13 |
+
hellaswag,acc,0.29725154351722766,0.004561141293448468,0
|
14 |
+
hellaswag,acc_norm,0.3256323441545509,0.004676529200753,0
|
15 |
+
piqa,acc,0.6300326441784548,0.011264415223415281,0
|
16 |
+
piqa,acc_norm,0.6322089227421109,0.011250616646678792,0
|
17 |
+
rte,acc,0.5234657039711191,0.03006330041190266,0
|
18 |
+
sciq,acc,0.736,0.013946271849440472,0
|
19 |
+
sciq,acc_norm,0.668,0.014899597242811476,0
|
20 |
+
storycloze_2016,acc,0.5916622127204704,0.011366477562142522,0
|
21 |
+
winogrande,acc,0.5090765588003157,0.01405017009449771,0
|
evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.338,
|
5 |
+
"acc_stderr": 0.014965960710224473
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.333,
|
9 |
+
"acc_stderr": 0.01491084616422986
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.335,
|
13 |
+
"acc_stderr": 0.01363087184382148
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.1940928270042194
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.63,
|
22 |
+
"acc_stderr": 0.048523658709391
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29725154351722766,
|
26 |
+
"acc_stderr": 0.004561141293448468,
|
27 |
+
"acc_norm": 0.3256323441545509,
|
28 |
+
"acc_norm_stderr": 0.004676529200753
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5234657039711191,
|
32 |
+
"acc_stderr": 0.03006330041190266
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5090765588003157,
|
36 |
+
"acc_stderr": 0.01405017009449771
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5916622127204704,
|
40 |
+
"acc_stderr": 0.011366477562142522
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5935779816513761,
|
44 |
+
"acc_stderr": 0.008590531708882188
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.43434343434343436,
|
48 |
+
"acc_stderr": 0.010170943451269425,
|
49 |
+
"acc_norm": 0.382996632996633,
|
50 |
+
"acc_norm_stderr": 0.009974920384536472
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.181740614334471,
|
54 |
+
"acc_stderr": 0.011269198948880236,
|
55 |
+
"acc_norm": 0.22098976109215018,
|
56 |
+
"acc_norm_stderr": 0.012124929206818258
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.736,
|
60 |
+
"acc_stderr": 0.013946271849440472,
|
61 |
+
"acc_norm": 0.668,
|
62 |
+
"acc_norm_stderr": 0.014899597242811476
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6300326441784548,
|
66 |
+
"acc_stderr": 0.011264415223415281,
|
67 |
+
"acc_norm": 0.6322089227421109,
|
68 |
+
"acc_norm_stderr": 0.011250616646678792
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_0_lm-eval_global_step52452_2023-02-09-23-08-31_0shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.338,
|
5 |
+
"acc_stderr": 0.014965960710224473
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.333,
|
9 |
+
"acc_stderr": 0.01491084616422986
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.335,
|
13 |
+
"acc_stderr": 0.01363087184382148
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.4107142857142857,
|
17 |
+
"acc_stderr": 0.0663363415035954,
|
18 |
+
"f1": 0.1940928270042194
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.63,
|
22 |
+
"acc_stderr": 0.048523658709391
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29725154351722766,
|
26 |
+
"acc_stderr": 0.004561141293448468,
|
27 |
+
"acc_norm": 0.3256323441545509,
|
28 |
+
"acc_norm_stderr": 0.004676529200753
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5234657039711191,
|
32 |
+
"acc_stderr": 0.03006330041190266
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5090765588003157,
|
36 |
+
"acc_stderr": 0.01405017009449771
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5916622127204704,
|
40 |
+
"acc_stderr": 0.011366477562142522
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5935779816513761,
|
44 |
+
"acc_stderr": 0.008590531708882188
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.43434343434343436,
|
48 |
+
"acc_stderr": 0.010170943451269425,
|
49 |
+
"acc_norm": 0.382996632996633,
|
50 |
+
"acc_norm_stderr": 0.009974920384536472
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.181740614334471,
|
54 |
+
"acc_stderr": 0.011269198948880236,
|
55 |
+
"acc_norm": 0.22098976109215018,
|
56 |
+
"acc_norm_stderr": 0.012124929206818258
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.736,
|
60 |
+
"acc_stderr": 0.013946271849440472,
|
61 |
+
"acc_norm": 0.668,
|
62 |
+
"acc_norm_stderr": 0.014899597242811476
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6300326441784548,
|
66 |
+
"acc_stderr": 0.011264415223415281,
|
67 |
+
"acc_norm": 0.6322089227421109,
|
68 |
+
"acc_norm_stderr": 0.011250616646678792
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.359,
|
5 |
+
"acc_stderr": 0.015177264224798594
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.349,
|
9 |
+
"acc_stderr": 0.015080663991563102
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.32,
|
13 |
+
"acc_stderr": 0.01347162092976915
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.39285714285714285,
|
17 |
+
"acc_stderr": 0.0658538889806635,
|
18 |
+
"f1": 0.27365967365967364
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.64,
|
22 |
+
"acc_stderr": 0.048241815132442176
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2909778928500299,
|
26 |
+
"acc_stderr": 0.004532850566893522,
|
27 |
+
"acc_norm": 0.31955785700059747,
|
28 |
+
"acc_norm_stderr": 0.004653523038369371
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5523465703971119,
|
32 |
+
"acc_stderr": 0.02993107036293953
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5019731649565904,
|
36 |
+
"acc_stderr": 0.014052376259225632
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5783003741314805,
|
40 |
+
"acc_stderr": 0.011419774841868156
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5122324159021406,
|
44 |
+
"acc_stderr": 0.008742437504570405
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.43097643097643096,
|
48 |
+
"acc_stderr": 0.010161552863493744,
|
49 |
+
"acc_norm": 0.3792087542087542,
|
50 |
+
"acc_norm_stderr": 0.00995589166886556
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.1885665529010239,
|
54 |
+
"acc_stderr": 0.0114308976476758,
|
55 |
+
"acc_norm": 0.2158703071672355,
|
56 |
+
"acc_norm_stderr": 0.012022975360030668
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.703,
|
60 |
+
"acc_stderr": 0.0144568322948011,
|
61 |
+
"acc_norm": 0.659,
|
62 |
+
"acc_norm_stderr": 0.014998131348402706
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6224156692056583,
|
66 |
+
"acc_stderr": 0.011310782787145781,
|
67 |
+
"acc_norm": 0.6158868335146899,
|
68 |
+
"acc_norm_stderr": 0.011348160741479136
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_1_lm-eval_global_step52452_2023-02-09-23-08-31_1shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.359,
|
5 |
+
"acc_stderr": 0.015177264224798594
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.349,
|
9 |
+
"acc_stderr": 0.015080663991563102
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.32,
|
13 |
+
"acc_stderr": 0.01347162092976915
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.39285714285714285,
|
17 |
+
"acc_stderr": 0.0658538889806635,
|
18 |
+
"f1": 0.27365967365967364
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.64,
|
22 |
+
"acc_stderr": 0.048241815132442176
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2909778928500299,
|
26 |
+
"acc_stderr": 0.004532850566893522,
|
27 |
+
"acc_norm": 0.31955785700059747,
|
28 |
+
"acc_norm_stderr": 0.004653523038369371
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5523465703971119,
|
32 |
+
"acc_stderr": 0.02993107036293953
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5019731649565904,
|
36 |
+
"acc_stderr": 0.014052376259225632
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5783003741314805,
|
40 |
+
"acc_stderr": 0.011419774841868156
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5122324159021406,
|
44 |
+
"acc_stderr": 0.008742437504570405
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.43097643097643096,
|
48 |
+
"acc_stderr": 0.010161552863493744,
|
49 |
+
"acc_norm": 0.3792087542087542,
|
50 |
+
"acc_norm_stderr": 0.00995589166886556
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.1885665529010239,
|
54 |
+
"acc_stderr": 0.0114308976476758,
|
55 |
+
"acc_norm": 0.2158703071672355,
|
56 |
+
"acc_norm_stderr": 0.012022975360030668
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.703,
|
60 |
+
"acc_stderr": 0.0144568322948011,
|
61 |
+
"acc_norm": 0.659,
|
62 |
+
"acc_norm_stderr": 0.014998131348402706
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6224156692056583,
|
66 |
+
"acc_stderr": 0.011310782787145781,
|
67 |
+
"acc_norm": 0.6158868335146899,
|
68 |
+
"acc_norm_stderr": 0.011348160741479136
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.312,
|
5 |
+
"acc_stderr": 0.014658474370509001
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.348,
|
9 |
+
"acc_stderr": 0.01507060460376841
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3358333333333333,
|
13 |
+
"acc_stderr": 0.013639261190932889
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.44642857142857145,
|
17 |
+
"acc_stderr": 0.06703189227942398,
|
18 |
+
"f1": 0.30579096045197734
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29286994622585144,
|
26 |
+
"acc_stderr": 0.004541492151639241,
|
27 |
+
"acc_norm": 0.31736705835490936,
|
28 |
+
"acc_norm_stderr": 0.004645003662067885
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.48375451263537905,
|
32 |
+
"acc_stderr": 0.030080573208738064
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5082872928176796,
|
36 |
+
"acc_stderr": 0.014050555322824189
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.011415827994342657
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4709480122324159,
|
44 |
+
"acc_stderr": 0.008730280528451546
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4212962962962963,
|
48 |
+
"acc_stderr": 0.010131882498193131,
|
49 |
+
"acc_norm": 0.3888888888888889,
|
50 |
+
"acc_norm_stderr": 0.010003248335313755
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.181740614334471,
|
54 |
+
"acc_stderr": 0.011269198948880236,
|
55 |
+
"acc_norm": 0.22866894197952217,
|
56 |
+
"acc_norm_stderr": 0.0122728535825408
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.727,
|
60 |
+
"acc_stderr": 0.014095022868717605,
|
61 |
+
"acc_norm": 0.676,
|
62 |
+
"acc_norm_stderr": 0.01480686473373886
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6349292709466812,
|
66 |
+
"acc_stderr": 0.011233021830554826,
|
67 |
+
"acc_norm": 0.6240478781284005,
|
68 |
+
"acc_norm_stderr": 0.011301098166895727
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_2_lm-eval_global_step52452_2023-02-09-23-08-31_2shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.312,
|
5 |
+
"acc_stderr": 0.014658474370509001
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.348,
|
9 |
+
"acc_stderr": 0.01507060460376841
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3358333333333333,
|
13 |
+
"acc_stderr": 0.013639261190932889
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.44642857142857145,
|
17 |
+
"acc_stderr": 0.06703189227942398,
|
18 |
+
"f1": 0.30579096045197734
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29286994622585144,
|
26 |
+
"acc_stderr": 0.004541492151639241,
|
27 |
+
"acc_norm": 0.31736705835490936,
|
28 |
+
"acc_norm_stderr": 0.004645003662067885
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.48375451263537905,
|
32 |
+
"acc_stderr": 0.030080573208738064
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5082872928176796,
|
36 |
+
"acc_stderr": 0.014050555322824189
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5793693212185996,
|
40 |
+
"acc_stderr": 0.011415827994342657
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4709480122324159,
|
44 |
+
"acc_stderr": 0.008730280528451546
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.4212962962962963,
|
48 |
+
"acc_stderr": 0.010131882498193131,
|
49 |
+
"acc_norm": 0.3888888888888889,
|
50 |
+
"acc_norm_stderr": 0.010003248335313755
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.181740614334471,
|
54 |
+
"acc_stderr": 0.011269198948880236,
|
55 |
+
"acc_norm": 0.22866894197952217,
|
56 |
+
"acc_norm_stderr": 0.0122728535825408
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.727,
|
60 |
+
"acc_stderr": 0.014095022868717605,
|
61 |
+
"acc_norm": 0.676,
|
62 |
+
"acc_norm_stderr": 0.01480686473373886
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6349292709466812,
|
66 |
+
"acc_stderr": 0.011233021830554826,
|
67 |
+
"acc_norm": 0.6240478781284005,
|
68 |
+
"acc_norm_stderr": 0.011301098166895727
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.314,
|
5 |
+
"acc_stderr": 0.014683991951087967
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.343,
|
9 |
+
"acc_stderr": 0.015019206922356951
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3458333333333333,
|
13 |
+
"acc_stderr": 0.013736245342311012
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.48214285714285715,
|
17 |
+
"acc_stderr": 0.0673769750864465,
|
18 |
+
"f1": 0.3218559218559219
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29286994622585144,
|
26 |
+
"acc_stderr": 0.004541492151639243,
|
27 |
+
"acc_norm": 0.3227444732125075,
|
28 |
+
"acc_norm_stderr": 0.004665704208339039
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5054151624548736,
|
32 |
+
"acc_stderr": 0.030094698123239966
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.4925019731649566,
|
36 |
+
"acc_stderr": 0.01405090552122858
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5729556386958845,
|
40 |
+
"acc_stderr": 0.01143868739857839
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4675840978593272,
|
44 |
+
"acc_stderr": 0.008726657178723137
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42297979797979796,
|
48 |
+
"acc_stderr": 0.010137328382209099,
|
49 |
+
"acc_norm": 0.3808922558922559,
|
50 |
+
"acc_norm_stderr": 0.009964428212260379
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19197952218430034,
|
54 |
+
"acc_stderr": 0.011509598906598098,
|
55 |
+
"acc_norm": 0.22525597269624573,
|
56 |
+
"acc_norm_stderr": 0.0122078399954073
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.72,
|
60 |
+
"acc_stderr": 0.014205696104091496,
|
61 |
+
"acc_norm": 0.679,
|
62 |
+
"acc_norm_stderr": 0.014770821817934645
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6273122959738846,
|
66 |
+
"acc_stderr": 0.011281318332897734,
|
67 |
+
"acc_norm": 0.6202393906420022,
|
68 |
+
"acc_norm_stderr": 0.01132348350471584
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_3_lm-eval_global_step52452_2023-02-09-23-08-31_3shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.314,
|
5 |
+
"acc_stderr": 0.014683991951087967
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.343,
|
9 |
+
"acc_stderr": 0.015019206922356951
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.3458333333333333,
|
13 |
+
"acc_stderr": 0.013736245342311012
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.48214285714285715,
|
17 |
+
"acc_stderr": 0.0673769750864465,
|
18 |
+
"f1": 0.3218559218559219
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.62,
|
22 |
+
"acc_stderr": 0.04878317312145633
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.29286994622585144,
|
26 |
+
"acc_stderr": 0.004541492151639243,
|
27 |
+
"acc_norm": 0.3227444732125075,
|
28 |
+
"acc_norm_stderr": 0.004665704208339039
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5054151624548736,
|
32 |
+
"acc_stderr": 0.030094698123239966
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.4925019731649566,
|
36 |
+
"acc_stderr": 0.01405090552122858
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5729556386958845,
|
40 |
+
"acc_stderr": 0.01143868739857839
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.4675840978593272,
|
44 |
+
"acc_stderr": 0.008726657178723137
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42297979797979796,
|
48 |
+
"acc_stderr": 0.010137328382209099,
|
49 |
+
"acc_norm": 0.3808922558922559,
|
50 |
+
"acc_norm_stderr": 0.009964428212260379
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19197952218430034,
|
54 |
+
"acc_stderr": 0.011509598906598098,
|
55 |
+
"acc_norm": 0.22525597269624573,
|
56 |
+
"acc_norm_stderr": 0.0122078399954073
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.72,
|
60 |
+
"acc_stderr": 0.014205696104091496,
|
61 |
+
"acc_norm": 0.679,
|
62 |
+
"acc_norm_stderr": 0.014770821817934645
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6273122959738846,
|
66 |
+
"acc_stderr": 0.011281318332897734,
|
67 |
+
"acc_norm": 0.6202393906420022,
|
68 |
+
"acc_norm_stderr": 0.01132348350471584
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.338,
|
5 |
+
"acc_stderr": 0.014965960710224482
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.346,
|
9 |
+
"acc_stderr": 0.015050266127564446
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.33916666666666667,
|
13 |
+
"acc_stderr": 0.013672343491681817
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5178571428571429,
|
17 |
+
"acc_stderr": 0.06737697508644645,
|
18 |
+
"f1": 0.3362023995826813
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.61,
|
22 |
+
"acc_stderr": 0.04902071300001975
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2954590718980283,
|
26 |
+
"acc_stderr": 0.004553164013379555,
|
27 |
+
"acc_norm": 0.32443736307508464,
|
28 |
+
"acc_norm_stderr": 0.004672074496749016
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.48375451263537905,
|
32 |
+
"acc_stderr": 0.030080573208738064
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.4846093133385951,
|
36 |
+
"acc_stderr": 0.014045826789783656
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5734901122394441,
|
40 |
+
"acc_stderr": 0.011436857656268697
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.44587155963302755,
|
44 |
+
"acc_stderr": 0.008693659886486845
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42634680134680136,
|
48 |
+
"acc_stderr": 0.010147858603835144,
|
49 |
+
"acc_norm": 0.39057239057239057,
|
50 |
+
"acc_norm_stderr": 0.010011059112064239
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19539249146757678,
|
54 |
+
"acc_stderr": 0.011586907189952911,
|
55 |
+
"acc_norm": 0.23037542662116042,
|
56 |
+
"acc_norm_stderr": 0.012304928418747611
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.718,
|
60 |
+
"acc_stderr": 0.014236526215291334,
|
61 |
+
"acc_norm": 0.687,
|
62 |
+
"acc_norm_stderr": 0.014671272822977883
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6289445048966268,
|
66 |
+
"acc_stderr": 0.011271222398600525,
|
67 |
+
"acc_norm": 0.6218715995647442,
|
68 |
+
"acc_norm_stderr": 0.011313980666854535
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|
evaluation/rankeval_x_denoiser/checkpoints_2b855b55bc4ul2ndfixnew_4_lm-eval_global_step52452_2023-02-09-23-08-31_4shots_backup.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"anli_r1": {
|
4 |
+
"acc": 0.338,
|
5 |
+
"acc_stderr": 0.014965960710224482
|
6 |
+
},
|
7 |
+
"anli_r2": {
|
8 |
+
"acc": 0.346,
|
9 |
+
"acc_stderr": 0.015050266127564446
|
10 |
+
},
|
11 |
+
"anli_r3": {
|
12 |
+
"acc": 0.33916666666666667,
|
13 |
+
"acc_stderr": 0.013672343491681817
|
14 |
+
},
|
15 |
+
"cb": {
|
16 |
+
"acc": 0.5178571428571429,
|
17 |
+
"acc_stderr": 0.06737697508644645,
|
18 |
+
"f1": 0.3362023995826813
|
19 |
+
},
|
20 |
+
"copa": {
|
21 |
+
"acc": 0.61,
|
22 |
+
"acc_stderr": 0.04902071300001975
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.2954590718980283,
|
26 |
+
"acc_stderr": 0.004553164013379555,
|
27 |
+
"acc_norm": 0.32443736307508464,
|
28 |
+
"acc_norm_stderr": 0.004672074496749016
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.48375451263537905,
|
32 |
+
"acc_stderr": 0.030080573208738064
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.4846093133385951,
|
36 |
+
"acc_stderr": 0.014045826789783656
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.5734901122394441,
|
40 |
+
"acc_stderr": 0.011436857656268697
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.44587155963302755,
|
44 |
+
"acc_stderr": 0.008693659886486845
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.42634680134680136,
|
48 |
+
"acc_stderr": 0.010147858603835144,
|
49 |
+
"acc_norm": 0.39057239057239057,
|
50 |
+
"acc_norm_stderr": 0.010011059112064239
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.19539249146757678,
|
54 |
+
"acc_stderr": 0.011586907189952911,
|
55 |
+
"acc_norm": 0.23037542662116042,
|
56 |
+
"acc_norm_stderr": 0.012304928418747611
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.718,
|
60 |
+
"acc_stderr": 0.014236526215291334,
|
61 |
+
"acc_norm": 0.687,
|
62 |
+
"acc_norm_stderr": 0.014671272822977883
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.6289445048966268,
|
66 |
+
"acc_stderr": 0.011271222398600525,
|
67 |
+
"acc_norm": 0.6218715995647442,
|
68 |
+
"acc_norm_stderr": 0.011313980666854535
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"versions": {
|
72 |
+
"anli_r1": 0,
|
73 |
+
"anli_r2": 0,
|
74 |
+
"anli_r3": 0,
|
75 |
+
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
+
}
|
87 |
+
}
|