Muennighoff commited on
Commit
b0240cf
1 Parent(s): 6f47304

Update scores

Browse files
Files changed (40) hide show
  1. 4b284b12boscar/evaluation/rankeval/4b284b12boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json +0 -87
  2. 4b284b12boscar/evaluation/rankeval/4b284b12boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json +0 -87
  3. 4b284b12boscar/evaluation/rankeval/4b284b12boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json +0 -87
  4. 4b284b12boscar/evaluation/rankeval/4b284b12boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json +0 -87
  5. 4b284b12boscar/evaluation/rankeval/4b284b12boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json +0 -87
  6. 4b284b12boscar/evaluation/rankeval/4b284b12boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json +0 -87
  7. 4b284b17boscar/evaluation/rankeval/4b284b17boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json +0 -87
  8. 4b284b17boscar/evaluation/rankeval/4b284b17boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json +0 -87
  9. 4b284b17boscar/evaluation/rankeval/4b284b17boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json +0 -87
  10. 4b284b17boscar/evaluation/rankeval/4b284b17boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json +0 -87
  11. 4b284b17boscar/evaluation/rankeval/4b284b17boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json +0 -87
  12. 4b284b17boscar/evaluation/rankeval/4b284b17boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json +0 -87
  13. 4b284b21boscar/evaluation/rankeval/4b284b21boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json +0 -87
  14. 4b284b21boscar/evaluation/rankeval/4b284b21boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json +0 -87
  15. 4b284b21boscar/evaluation/rankeval/4b284b21boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json +0 -87
  16. 4b284b21boscar/evaluation/rankeval/4b284b21boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json +0 -87
  17. 4b284b21boscar/evaluation/rankeval/4b284b21boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json +0 -87
  18. 4b284b21boscar/evaluation/rankeval/4b284b21boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json +0 -87
  19. 4b284b28boscar/evaluation/rankeval/4b284b28boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json +0 -87
  20. 4b284b28boscar/evaluation/rankeval/4b284b28boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json +0 -87
  21. 4b284b28boscar/evaluation/rankeval/4b284b28boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json +0 -87
  22. 4b284b28boscar/evaluation/rankeval/4b284b28boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json +0 -87
  23. 4b284b28boscar/evaluation/rankeval/4b284b28boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json +0 -87
  24. 4b284b28boscar/evaluation/rankeval/4b284b28boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json +0 -87
  25. 4b284b42boscar/evaluation/generation/merged.csv +3 -1
  26. 4b284b42boscar/evaluation/generation/merged.json +1 -1
  27. 4b284b42boscar/evaluation/rankeval/4b284b42boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json +0 -87
  28. 4b284b42boscar/evaluation/rankeval/4b284b42boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json +0 -87
  29. 4b284b42boscar/evaluation/rankeval/4b284b42boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json +0 -87
  30. 4b284b42boscar/evaluation/rankeval/4b284b42boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json +0 -87
  31. 4b284b42boscar/evaluation/rankeval/4b284b42boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json +0 -87
  32. 4b284b42boscar/evaluation/rankeval/4b284b42boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json +0 -87
  33. 4b284b84boscar/evaluation/generation/merged.csv +52 -0
  34. 4b284b84boscar/evaluation/generation/merged.json +1 -1
  35. 4b284b84boscar/evaluation/rankeval/4b284b84boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json +0 -87
  36. 4b284b84boscar/evaluation/rankeval/4b284b84boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json +0 -87
  37. 4b284b84boscar/evaluation/rankeval/4b284b84boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json +0 -87
  38. 4b284b84boscar/evaluation/rankeval/4b284b84boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json +0 -87
  39. 4b284b84boscar/evaluation/rankeval/4b284b84boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json +0 -87
  40. 4b284b84boscar/evaluation/rankeval/4b284b84boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json +0 -87
4b284b12boscar/evaluation/rankeval/4b284b12boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.324,
5
- "acc_stderr": 0.01480686473373886
6
- },
7
- "anli_r2": {
8
- "acc": 0.327,
9
- "acc_stderr": 0.014842213153411239
10
- },
11
- "anli_r3": {
12
- "acc": 0.34,
13
- "acc_stderr": 0.013680495725767785
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.3806146572104019
19
- },
20
- "copa": {
21
- "acc": 0.72,
22
- "acc_stderr": 0.045126085985421276
23
- },
24
- "hellaswag": {
25
- "acc": 0.4027086237801235,
26
- "acc_stderr": 0.004894407257215796,
27
- "acc_norm": 0.5084644493128859,
28
- "acc_norm_stderr": 0.004989066355449556
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.5595895816890292,
36
- "acc_stderr": 0.01395233031191561
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6755745590593266,
40
- "acc_stderr": 0.010826131344990888
41
- },
42
- "boolq": {
43
- "acc": 0.5608562691131499,
44
- "acc_stderr": 0.008680038923540374
45
- },
46
- "arc_easy": {
47
- "acc": 0.5488215488215489,
48
- "acc_stderr": 0.010210757101073482,
49
- "acc_norm": 0.48863636363636365,
50
- "acc_norm_stderr": 0.010257133441117115
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23378839590443687,
54
- "acc_stderr": 0.01236822537850715,
55
- "acc_norm": 0.27047781569965873,
56
- "acc_norm_stderr": 0.012980954547659554
57
- },
58
- "sciq": {
59
- "acc": 0.815,
60
- "acc_stderr": 0.012285191326386696,
61
- "acc_norm": 0.724,
62
- "acc_norm_stderr": 0.014142984975740668
63
- },
64
- "piqa": {
65
- "acc": 0.7187159956474428,
66
- "acc_stderr": 0.010490509832327423,
67
- "acc_norm": 0.7143634385201306,
68
- "acc_norm_stderr": 0.010539303948661915
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b12boscar/evaluation/rankeval/4b284b12boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.324,
5
- "acc_stderr": 0.014806864733738859
6
- },
7
- "anli_r2": {
8
- "acc": 0.325,
9
- "acc_stderr": 0.014818724459095526
10
- },
11
- "anli_r3": {
12
- "acc": 0.33166666666666667,
13
- "acc_stderr": 0.013596836729485163
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.3299501424501424
19
- },
20
- "copa": {
21
- "acc": 0.72,
22
- "acc_stderr": 0.04512608598542127
23
- },
24
- "hellaswag": {
25
- "acc": 0.40091615216092413,
26
- "acc_stderr": 0.004890824718530301,
27
- "acc_norm": 0.5123481378211512,
28
- "acc_norm_stderr": 0.0049882595304724655
29
- },
30
- "rte": {
31
- "acc": 0.5306859205776173,
32
- "acc_stderr": 0.03003973059219781
33
- },
34
- "winogrande": {
35
- "acc": 0.5461720599842147,
36
- "acc_stderr": 0.013992441563707063
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6632816675574559,
40
- "acc_stderr": 0.010928525619392454
41
- },
42
- "boolq": {
43
- "acc": 0.5525993883792049,
44
- "acc_stderr": 0.008696530539281539
45
- },
46
- "arc_easy": {
47
- "acc": 0.5837542087542088,
48
- "acc_stderr": 0.010114819404500866,
49
- "acc_norm": 0.5526094276094277,
50
- "acc_norm_stderr": 0.01020283238541565
51
- },
52
- "arc_challenge": {
53
- "acc": 0.25426621160409557,
54
- "acc_stderr": 0.012724999945157746,
55
- "acc_norm": 0.2773037542662116,
56
- "acc_norm_stderr": 0.013082095839059374
57
- },
58
- "sciq": {
59
- "acc": 0.883,
60
- "acc_stderr": 0.010169287802713329,
61
- "acc_norm": 0.874,
62
- "acc_norm_stderr": 0.010499249222408046
63
- },
64
- "piqa": {
65
- "acc": 0.7274211099020674,
66
- "acc_stderr": 0.010389256803296021,
67
- "acc_norm": 0.720348204570185,
68
- "acc_norm_stderr": 0.010471899530306559
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b12boscar/evaluation/rankeval/4b284b12boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.332,
5
- "acc_stderr": 0.014899597242811478
6
- },
7
- "anli_r2": {
8
- "acc": 0.338,
9
- "acc_stderr": 0.014965960710224487
10
- },
11
- "anli_r3": {
12
- "acc": 0.33916666666666667,
13
- "acc_stderr": 0.013672343491681813
14
- },
15
- "cb": {
16
- "acc": 0.375,
17
- "acc_stderr": 0.06527912098338669,
18
- "f1": 0.2570314675577834
19
- },
20
- "copa": {
21
- "acc": 0.74,
22
- "acc_stderr": 0.04408440022768079
23
- },
24
- "hellaswag": {
25
- "acc": 0.40151364270065726,
26
- "acc_stderr": 0.004892026457294707,
27
- "acc_norm": 0.5150368452499502,
28
- "acc_norm_stderr": 0.004987524454849712
29
- },
30
- "rte": {
31
- "acc": 0.51985559566787,
32
- "acc_stderr": 0.030072723167317184
33
- },
34
- "winogrande": {
35
- "acc": 0.5438042620363063,
36
- "acc_stderr": 0.013998453610924324
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6654195617316943,
40
- "acc_stderr": 0.010911318967127935
41
- },
42
- "boolq": {
43
- "acc": 0.5431192660550459,
44
- "acc_stderr": 0.008712475433089477
45
- },
46
- "arc_easy": {
47
- "acc": 0.5942760942760943,
48
- "acc_stderr": 0.010075755540128873,
49
- "acc_norm": 0.5782828282828283,
50
- "acc_norm_stderr": 0.010133255284012323
51
- },
52
- "arc_challenge": {
53
- "acc": 0.25853242320819114,
54
- "acc_stderr": 0.012794553754288686,
55
- "acc_norm": 0.2841296928327645,
56
- "acc_norm_stderr": 0.013179442447653886
57
- },
58
- "sciq": {
59
- "acc": 0.89,
60
- "acc_stderr": 0.009899393819724442,
61
- "acc_norm": 0.89,
62
- "acc_norm_stderr": 0.009899393819724453
63
- },
64
- "piqa": {
65
- "acc": 0.7257889009793254,
66
- "acc_stderr": 0.010408618664933382,
67
- "acc_norm": 0.7170837867247007,
68
- "acc_norm_stderr": 0.010508949177489678
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b12boscar/evaluation/rankeval/4b284b12boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.322,
5
- "acc_stderr": 0.014782913600996683
6
- },
7
- "anli_r2": {
8
- "acc": 0.341,
9
- "acc_stderr": 0.014998131348402699
10
- },
11
- "anli_r3": {
12
- "acc": 0.33166666666666667,
13
- "acc_stderr": 0.01359683672948516
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.3560833560833561
19
- },
20
- "copa": {
21
- "acc": 0.73,
22
- "acc_stderr": 0.044619604333847394
23
- },
24
- "hellaswag": {
25
- "acc": 0.4025094602668791,
26
- "acc_stderr": 0.004894012555642636,
27
- "acc_norm": 0.5155347540330611,
28
- "acc_norm_stderr": 0.004987372476207029
29
- },
30
- "rte": {
31
- "acc": 0.5379061371841155,
32
- "acc_stderr": 0.030009848912529117
33
- },
34
- "winogrande": {
35
- "acc": 0.55327545382794,
36
- "acc_stderr": 0.013972488371616687
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.677712453233565,
40
- "acc_stderr": 0.010807461374996361
41
- },
42
- "boolq": {
43
- "acc": 0.5311926605504587,
44
- "acc_stderr": 0.008728020822889253
45
- },
46
- "arc_easy": {
47
- "acc": 0.5858585858585859,
48
- "acc_stderr": 0.010107387673002531,
49
- "acc_norm": 0.571969696969697,
50
- "acc_norm_stderr": 0.01015294331642626
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2627986348122867,
54
- "acc_stderr": 0.012862523175351333,
55
- "acc_norm": 0.2935153583617747,
56
- "acc_norm_stderr": 0.013307250444941113
57
- },
58
- "sciq": {
59
- "acc": 0.898,
60
- "acc_stderr": 0.009575368801653876,
61
- "acc_norm": 0.899,
62
- "acc_norm_stderr": 0.009533618929340973
63
- },
64
- "piqa": {
65
- "acc": 0.7252448313384113,
66
- "acc_stderr": 0.010415033676676037,
67
- "acc_norm": 0.7285092491838956,
68
- "acc_norm_stderr": 0.010376251176596137
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b12boscar/evaluation/rankeval/4b284b12boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.333,
5
- "acc_stderr": 0.01491084616422987
6
- },
7
- "anli_r2": {
8
- "acc": 0.338,
9
- "acc_stderr": 0.014965960710224487
10
- },
11
- "anli_r3": {
12
- "acc": 0.3275,
13
- "acc_stderr": 0.01355321116725194
14
- },
15
- "cb": {
16
- "acc": 0.4642857142857143,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.3895559795009913
19
- },
20
- "copa": {
21
- "acc": 0.7,
22
- "acc_stderr": 0.046056618647183814
23
- },
24
- "hellaswag": {
25
- "acc": 0.4008165704043019,
26
- "acc_stderr": 0.0048906236932436216,
27
- "acc_norm": 0.5142401911969727,
28
- "acc_norm_stderr": 0.004987757314769834
29
- },
30
- "rte": {
31
- "acc": 0.5234657039711191,
32
- "acc_stderr": 0.03006330041190266
33
- },
34
- "winogrande": {
35
- "acc": 0.5540647198105761,
36
- "acc_stderr": 0.013970093482330704
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6707642971672902,
40
- "acc_stderr": 0.010867199207548977
41
- },
42
- "boolq": {
43
- "acc": 0.518960244648318,
44
- "acc_stderr": 0.008738765179491934
45
- },
46
- "arc_easy": {
47
- "acc": 0.5980639730639731,
48
- "acc_stderr": 0.010060521220920566,
49
- "acc_norm": 0.5854377104377104,
50
- "acc_norm_stderr": 0.010108889212447783
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26023890784982934,
54
- "acc_stderr": 0.012821930225112568,
55
- "acc_norm": 0.29266211604095566,
56
- "acc_norm_stderr": 0.01329591610361942
57
- },
58
- "sciq": {
59
- "acc": 0.908,
60
- "acc_stderr": 0.009144376393151108,
61
- "acc_norm": 0.913,
62
- "acc_norm_stderr": 0.008916866630745908
63
- },
64
- "piqa": {
65
- "acc": 0.721436343852013,
66
- "acc_stderr": 0.010459397235965189,
67
- "acc_norm": 0.7219804134929271,
68
- "acc_norm_stderr": 0.010453117358332814
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b12boscar/evaluation/rankeval/4b284b12boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.33,
5
- "acc_stderr": 0.014876872027456732
6
- },
7
- "anli_r2": {
8
- "acc": 0.328,
9
- "acc_stderr": 0.014853842487270336
10
- },
11
- "anli_r3": {
12
- "acc": 0.34833333333333333,
13
- "acc_stderr": 0.013759437498874073
14
- },
15
- "cb": {
16
- "acc": 0.375,
17
- "acc_stderr": 0.06527912098338669,
18
- "f1": 0.25000000000000006
19
- },
20
- "copa": {
21
- "acc": 0.73,
22
- "acc_stderr": 0.044619604333847394
23
- },
24
- "hellaswag": {
25
- "acc": 0.40380402310296754,
26
- "acc_stderr": 0.004896563126116814,
27
- "acc_norm": 0.5171280621390162,
28
- "acc_norm_stderr": 0.004986852842576728
29
- },
30
- "rte": {
31
- "acc": 0.5090252707581228,
32
- "acc_stderr": 0.030091559826331334
33
- },
34
- "winogrande": {
35
- "acc": 0.5374901341752171,
36
- "acc_stderr": 0.014012928183336578
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.677712453233565,
40
- "acc_stderr": 0.010807461374996361
41
- },
42
- "boolq": {
43
- "acc": 0.5070336391437309,
44
- "acc_stderr": 0.008744189661475107
45
- },
46
- "arc_easy": {
47
- "acc": 0.5959595959595959,
48
- "acc_stderr": 0.010069061649549547,
49
- "acc_norm": 0.5765993265993266,
50
- "acc_norm_stderr": 0.010138671005289049
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2687713310580205,
54
- "acc_stderr": 0.01295506596371069,
55
- "acc_norm": 0.2909556313993174,
56
- "acc_norm_stderr": 0.013273077865907586
57
- },
58
- "sciq": {
59
- "acc": 0.904,
60
- "acc_stderr": 0.009320454434783207,
61
- "acc_norm": 0.908,
62
- "acc_norm_stderr": 0.009144376393151127
63
- },
64
- "piqa": {
65
- "acc": 0.720892274211099,
66
- "acc_stderr": 0.010465657948498228,
67
- "acc_norm": 0.720892274211099,
68
- "acc_norm_stderr": 0.01046565794849823
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17boscar/evaluation/rankeval/4b284b17boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.348,
5
- "acc_stderr": 0.015070604603768408
6
- },
7
- "anli_r2": {
8
- "acc": 0.337,
9
- "acc_stderr": 0.014955087918653607
10
- },
11
- "anli_r3": {
12
- "acc": 0.3408333333333333,
13
- "acc_stderr": 0.013688600793296937
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.22058422058422059
19
- },
20
- "copa": {
21
- "acc": 0.74,
22
- "acc_stderr": 0.04408440022768077
23
- },
24
- "hellaswag": {
25
- "acc": 0.4071898028281219,
26
- "acc_stderr": 0.00490306663976195,
27
- "acc_norm": 0.5113523202549293,
28
- "acc_norm_stderr": 0.004988495127747283
29
- },
30
- "rte": {
31
- "acc": 0.5523465703971119,
32
- "acc_stderr": 0.029931070362939526
33
- },
34
- "winogrande": {
35
- "acc": 0.5335438042620363,
36
- "acc_stderr": 0.014020826677598096
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6771779796900054,
40
- "acc_stderr": 0.010812153082758843
41
- },
42
- "boolq": {
43
- "acc": 0.5137614678899083,
44
- "acc_stderr": 0.008741742106878659
45
- },
46
- "arc_easy": {
47
- "acc": 0.5538720538720538,
48
- "acc_stderr": 0.010200057828765008,
49
- "acc_norm": 0.5033670033670034,
50
- "acc_norm_stderr": 0.01025955089379893
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2380546075085324,
54
- "acc_stderr": 0.012445770028026208,
55
- "acc_norm": 0.2764505119453925,
56
- "acc_norm_stderr": 0.013069662474252427
57
- },
58
- "sciq": {
59
- "acc": 0.825,
60
- "acc_stderr": 0.012021627157731968,
61
- "acc_norm": 0.757,
62
- "acc_norm_stderr": 0.013569640199177457
63
- },
64
- "piqa": {
65
- "acc": 0.7230685527747551,
66
- "acc_stderr": 0.010440499969334526,
67
- "acc_norm": 0.7230685527747551,
68
- "acc_norm_stderr": 0.010440499969334542
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17boscar/evaluation/rankeval/4b284b17boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.356,
5
- "acc_stderr": 0.015149042659306625
6
- },
7
- "anli_r2": {
8
- "acc": 0.34,
9
- "acc_stderr": 0.014987482264363937
10
- },
11
- "anli_r3": {
12
- "acc": 0.3416666666666667,
13
- "acc_stderr": 0.013696658778002514
14
- },
15
- "cb": {
16
- "acc": 0.32142857142857145,
17
- "acc_stderr": 0.06297362289056341,
18
- "f1": 0.24382716049382716
19
- },
20
- "copa": {
21
- "acc": 0.76,
22
- "acc_stderr": 0.04292346959909282
23
- },
24
- "hellaswag": {
25
- "acc": 0.40380402310296754,
26
- "acc_stderr": 0.004896563126116815,
27
- "acc_norm": 0.522903804023103,
28
- "acc_norm_stderr": 0.004984543540932338
29
- },
30
- "rte": {
31
- "acc": 0.555956678700361,
32
- "acc_stderr": 0.029907396333795994
33
- },
34
- "winogrande": {
35
- "acc": 0.5469613259668509,
36
- "acc_stderr": 0.0139903666321481
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6691608765366115,
40
- "acc_stderr": 0.010880601338204657
41
- },
42
- "boolq": {
43
- "acc": 0.5321100917431193,
44
- "acc_stderr": 0.008727003026917804
45
- },
46
- "arc_easy": {
47
- "acc": 0.5841750841750841,
48
- "acc_stderr": 0.010113348244647866,
49
- "acc_norm": 0.5778619528619529,
50
- "acc_norm_stderr": 0.010134620524592268
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26109215017064846,
54
- "acc_stderr": 0.01283552390947384,
55
- "acc_norm": 0.30887372013651876,
56
- "acc_norm_stderr": 0.013501770929344003
57
- },
58
- "sciq": {
59
- "acc": 0.887,
60
- "acc_stderr": 0.010016552866696844,
61
- "acc_norm": 0.884,
62
- "acc_norm_stderr": 0.010131468138756993
63
- },
64
- "piqa": {
65
- "acc": 0.7236126224156693,
66
- "acc_stderr": 0.010434162388275619,
67
- "acc_norm": 0.7279651795429815,
68
- "acc_norm_stderr": 0.01038276378624739
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17boscar/evaluation/rankeval/4b284b17boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.339,
5
- "acc_stderr": 0.014976758771620347
6
- },
7
- "anli_r2": {
8
- "acc": 0.335,
9
- "acc_stderr": 0.014933117490932573
10
- },
11
- "anli_r3": {
12
- "acc": 0.31916666666666665,
13
- "acc_stderr": 0.013462309712005136
14
- },
15
- "cb": {
16
- "acc": 0.32142857142857145,
17
- "acc_stderr": 0.06297362289056341,
18
- "f1": 0.24941724941724944
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.045604802157206845
23
- },
24
- "hellaswag": {
25
- "acc": 0.40748854809798846,
26
- "acc_stderr": 0.004903628887264533,
27
- "acc_norm": 0.5238000398327026,
28
- "acc_norm_stderr": 0.004984125363319072
29
- },
30
- "rte": {
31
- "acc": 0.5090252707581228,
32
- "acc_stderr": 0.030091559826331334
33
- },
34
- "winogrande": {
35
- "acc": 0.5493291239147593,
36
- "acc_stderr": 0.01398392886904024
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6622127204703367,
40
- "acc_stderr": 0.010937034991003881
41
- },
42
- "boolq": {
43
- "acc": 0.5318042813455658,
44
- "acc_stderr": 0.008727345583419182
45
- },
46
- "arc_easy": {
47
- "acc": 0.6031144781144782,
48
- "acc_stderr": 0.010039236800583199,
49
- "acc_norm": 0.5862794612794613,
50
- "acc_norm_stderr": 0.010105878530238133
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2645051194539249,
54
- "acc_stderr": 0.012889272949313366,
55
- "acc_norm": 0.29948805460750855,
56
- "acc_norm_stderr": 0.01338502163731357
57
- },
58
- "sciq": {
59
- "acc": 0.895,
60
- "acc_stderr": 0.00969892102602495,
61
- "acc_norm": 0.901,
62
- "acc_norm_stderr": 0.009449248027662746
63
- },
64
- "piqa": {
65
- "acc": 0.7268770402611534,
66
- "acc_stderr": 0.010395730264453265,
67
- "acc_norm": 0.720892274211099,
68
- "acc_norm_stderr": 0.010465657948498228
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17boscar/evaluation/rankeval/4b284b17boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.329,
5
- "acc_stderr": 0.014865395385928359
6
- },
7
- "anli_r2": {
8
- "acc": 0.352,
9
- "acc_stderr": 0.01511040450564867
10
- },
11
- "anli_r3": {
12
- "acc": 0.3433333333333333,
13
- "acc_stderr": 0.01371263383046586
14
- },
15
- "cb": {
16
- "acc": 0.375,
17
- "acc_stderr": 0.06527912098338669,
18
- "f1": 0.33124459353967556
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.045604802157206845
23
- },
24
- "hellaswag": {
25
- "acc": 0.4063931487751444,
26
- "acc_stderr": 0.004901558132335524,
27
- "acc_norm": 0.5276837283409679,
28
- "acc_norm_stderr": 0.004982127315605216
29
- },
30
- "rte": {
31
- "acc": 0.5415162454873647,
32
- "acc_stderr": 0.029992535385373314
33
- },
34
- "winogrande": {
35
- "acc": 0.5540647198105761,
36
- "acc_stderr": 0.01397009348233069
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6718332442544094,
40
- "acc_stderr": 0.010858184920580582
41
- },
42
- "boolq": {
43
- "acc": 0.518960244648318,
44
- "acc_stderr": 0.008738765179491938
45
- },
46
- "arc_easy": {
47
- "acc": 0.5989057239057239,
48
- "acc_stderr": 0.010057051106534374,
49
- "acc_norm": 0.5946969696969697,
50
- "acc_norm_stderr": 0.010074093589739203
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2636518771331058,
54
- "acc_stderr": 0.012875929151297049,
55
- "acc_norm": 0.2977815699658703,
56
- "acc_norm_stderr": 0.013363080107244487
57
- },
58
- "sciq": {
59
- "acc": 0.901,
60
- "acc_stderr": 0.00944924802766277,
61
- "acc_norm": 0.894,
62
- "acc_norm_stderr": 0.009739551265785134
63
- },
64
- "piqa": {
65
- "acc": 0.7241566920565833,
66
- "acc_stderr": 0.010427805502729114,
67
- "acc_norm": 0.720348204570185,
68
- "acc_norm_stderr": 0.010471899530306555
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17boscar/evaluation/rankeval/4b284b17boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.32,
5
- "acc_stderr": 0.014758652303574883
6
- },
7
- "anli_r2": {
8
- "acc": 0.32,
9
- "acc_stderr": 0.014758652303574872
10
- },
11
- "anli_r3": {
12
- "acc": 0.335,
13
- "acc_stderr": 0.013630871843821476
14
- },
15
- "cb": {
16
- "acc": 0.25,
17
- "acc_stderr": 0.058387420812114225,
18
- "f1": 0.21497326203208558
19
- },
20
- "copa": {
21
- "acc": 0.69,
22
- "acc_stderr": 0.04648231987117316
23
- },
24
- "hellaswag": {
25
- "acc": 0.4060944035052778,
26
- "acc_stderr": 0.004900988997414223,
27
- "acc_norm": 0.5269866560446126,
28
- "acc_norm_stderr": 0.004982508198584269
29
- },
30
- "rte": {
31
- "acc": 0.4620938628158845,
32
- "acc_stderr": 0.030009848912529117
33
- },
34
- "winogrande": {
35
- "acc": 0.5398579321231255,
36
- "acc_stderr": 0.014007765428365165
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6702298236237306,
40
- "acc_stderr": 0.010871682471395135
41
- },
42
- "boolq": {
43
- "acc": 0.5217125382262997,
44
- "acc_stderr": 0.008736805647519948
45
- },
46
- "arc_easy": {
47
- "acc": 0.6022727272727273,
48
- "acc_stderr": 0.010042861602178058,
49
- "acc_norm": 0.5934343434343434,
50
- "acc_norm_stderr": 0.010079056419223525
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2721843003412969,
54
- "acc_stderr": 0.013006600406423709,
55
- "acc_norm": 0.2986348122866894,
56
- "acc_norm_stderr": 0.013374078615068747
57
- },
58
- "sciq": {
59
- "acc": 0.909,
60
- "acc_stderr": 0.009099549538400246,
61
- "acc_norm": 0.915,
62
- "acc_norm_stderr": 0.008823426366942305
63
- },
64
- "piqa": {
65
- "acc": 0.7230685527747551,
66
- "acc_stderr": 0.010440499969334523,
67
- "acc_norm": 0.7236126224156693,
68
- "acc_norm_stderr": 0.010434162388275598
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17boscar/evaluation/rankeval/4b284b17boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.346,
5
- "acc_stderr": 0.015050266127564445
6
- },
7
- "anli_r2": {
8
- "acc": 0.325,
9
- "acc_stderr": 0.014818724459095526
10
- },
11
- "anli_r3": {
12
- "acc": 0.32,
13
- "acc_stderr": 0.013471620929769149
14
- },
15
- "cb": {
16
- "acc": 0.26785714285714285,
17
- "acc_stderr": 0.05971290310957635,
18
- "f1": 0.23228120516499284
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.045604802157206845
23
- },
24
- "hellaswag": {
25
- "acc": 0.4085839474208325,
26
- "acc_stderr": 0.004905674408614017,
27
- "acc_norm": 0.5306711810396335,
28
- "acc_norm_stderr": 0.004980384575535375
29
- },
30
- "rte": {
31
- "acc": 0.5342960288808665,
32
- "acc_stderr": 0.030025579819366422
33
- },
34
- "winogrande": {
35
- "acc": 0.5398579321231255,
36
- "acc_stderr": 0.014007765428365165
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6686264029930519,
40
- "acc_stderr": 0.010885036980220164
41
- },
42
- "boolq": {
43
- "acc": 0.5107033639143731,
44
- "acc_stderr": 0.008743051044836891
45
- },
46
- "arc_easy": {
47
- "acc": 0.6085858585858586,
48
- "acc_stderr": 0.010014917532627819,
49
- "acc_norm": 0.601010101010101,
50
- "acc_norm_stderr": 0.010048240683798745
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27986348122866894,
54
- "acc_stderr": 0.013119040897725922,
55
- "acc_norm": 0.30887372013651876,
56
- "acc_norm_stderr": 0.013501770929344003
57
- },
58
- "sciq": {
59
- "acc": 0.911,
60
- "acc_stderr": 0.009008893392651516,
61
- "acc_norm": 0.912,
62
- "acc_norm_stderr": 0.00896305396259208
63
- },
64
- "piqa": {
65
- "acc": 0.720348204570185,
66
- "acc_stderr": 0.01047189953030656,
67
- "acc_norm": 0.720348204570185,
68
- "acc_norm_stderr": 0.010471899530306559
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21boscar/evaluation/rankeval/4b284b21boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.344,
5
- "acc_stderr": 0.015029633724408947
6
- },
7
- "anli_r2": {
8
- "acc": 0.331,
9
- "acc_stderr": 0.014888272588203941
10
- },
11
- "anli_r3": {
12
- "acc": 0.33166666666666667,
13
- "acc_stderr": 0.013596836729485163
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.18803418803418803
19
- },
20
- "copa": {
21
- "acc": 0.72,
22
- "acc_stderr": 0.04512608598542126
23
- },
24
- "hellaswag": {
25
- "acc": 0.40370444134634537,
26
- "acc_stderr": 0.00489636818576524,
27
- "acc_norm": 0.5094602668791077,
28
- "acc_norm_stderr": 0.004988888194063274
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.531965272296764,
36
- "acc_stderr": 0.014023739221166384
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6664885088188135,
40
- "acc_stderr": 0.01090262138991413
41
- },
42
- "boolq": {
43
- "acc": 0.573394495412844,
44
- "acc_stderr": 0.008650327037726273
45
- },
46
- "arc_easy": {
47
- "acc": 0.5643939393939394,
48
- "acc_stderr": 0.010174341733665226,
49
- "acc_norm": 0.5029461279461279,
50
- "acc_norm_stderr": 0.01025960541623758
51
- },
52
- "arc_challenge": {
53
- "acc": 0.25170648464163825,
54
- "acc_stderr": 0.012682496334042961,
55
- "acc_norm": 0.2790102389078498,
56
- "acc_norm_stderr": 0.013106784883601333
57
- },
58
- "sciq": {
59
- "acc": 0.844,
60
- "acc_stderr": 0.011480235006122358,
61
- "acc_norm": 0.765,
62
- "acc_norm_stderr": 0.013414729030247114
63
- },
64
- "piqa": {
65
- "acc": 0.7219804134929271,
66
- "acc_stderr": 0.01045311735833281,
67
- "acc_norm": 0.7236126224156693,
68
- "acc_norm_stderr": 0.0104341623882756
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21boscar/evaluation/rankeval/4b284b21boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.323,
5
- "acc_stderr": 0.014794927843348633
6
- },
7
- "anli_r2": {
8
- "acc": 0.339,
9
- "acc_stderr": 0.014976758771620342
10
- },
11
- "anli_r3": {
12
- "acc": 0.33,
13
- "acc_stderr": 0.013579531277800923
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.3268398268398269
19
- },
20
- "copa": {
21
- "acc": 0.75,
22
- "acc_stderr": 0.04351941398892446
23
- },
24
- "hellaswag": {
25
- "acc": 0.40360485958972314,
26
- "acc_stderr": 0.004896173035943315,
27
- "acc_norm": 0.5180242979486158,
28
- "acc_norm_stderr": 0.004986538243846636
29
- },
30
- "rte": {
31
- "acc": 0.555956678700361,
32
- "acc_stderr": 0.02990739633379599
33
- },
34
- "winogrande": {
35
- "acc": 0.5556432517758485,
36
- "acc_stderr": 0.013965196769083555
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6627471940138963,
40
- "acc_stderr": 0.010932788119436439
41
- },
42
- "boolq": {
43
- "acc": 0.5694189602446483,
44
- "acc_stderr": 0.008660360145988744
45
- },
46
- "arc_easy": {
47
- "acc": 0.577020202020202,
48
- "acc_stderr": 0.010137328382209094,
49
- "acc_norm": 0.5631313131313131,
50
- "acc_norm_stderr": 0.010177672928157694
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27047781569965873,
54
- "acc_stderr": 0.012980954547659554,
55
- "acc_norm": 0.2977815699658703,
56
- "acc_norm_stderr": 0.013363080107244487
57
- },
58
- "sciq": {
59
- "acc": 0.905,
60
- "acc_stderr": 0.009276910103103305,
61
- "acc_norm": 0.899,
62
- "acc_norm_stderr": 0.009533618929340997
63
- },
64
- "piqa": {
65
- "acc": 0.7241566920565833,
66
- "acc_stderr": 0.010427805502729115,
67
- "acc_norm": 0.719804134929271,
68
- "acc_norm_stderr": 0.010478122015577095
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21boscar/evaluation/rankeval/4b284b21boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.339,
5
- "acc_stderr": 0.014976758771620347
6
- },
7
- "anli_r2": {
8
- "acc": 0.348,
9
- "acc_stderr": 0.015070604603768408
10
- },
11
- "anli_r3": {
12
- "acc": 0.335,
13
- "acc_stderr": 0.013630871843821476
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.3373075012419275
19
- },
20
- "copa": {
21
- "acc": 0.75,
22
- "acc_stderr": 0.04351941398892446
23
- },
24
- "hellaswag": {
25
- "acc": 0.40440151364270066,
26
- "acc_stderr": 0.004897728370737246,
27
- "acc_norm": 0.5218084047002589,
28
- "acc_norm_stderr": 0.004985032806802431
29
- },
30
- "rte": {
31
- "acc": 0.4981949458483754,
32
- "acc_stderr": 0.030096267148976633
33
- },
34
- "winogrande": {
35
- "acc": 0.5485398579321231,
36
- "acc_stderr": 0.013986110301017759
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6616782469267771,
40
- "acc_stderr": 0.010941266252293478
41
- },
42
- "boolq": {
43
- "acc": 0.5636085626911315,
44
- "acc_stderr": 0.00867400046743208
45
- },
46
- "arc_easy": {
47
- "acc": 0.5837542087542088,
48
- "acc_stderr": 0.01011481940450087,
49
- "acc_norm": 0.5765993265993266,
50
- "acc_norm_stderr": 0.010138671005289049
51
- },
52
- "arc_challenge": {
53
- "acc": 0.28242320819112626,
54
- "acc_stderr": 0.013155456884097222,
55
- "acc_norm": 0.30119453924914674,
56
- "acc_norm_stderr": 0.013406741767847626
57
- },
58
- "sciq": {
59
- "acc": 0.909,
60
- "acc_stderr": 0.009099549538400246,
61
- "acc_norm": 0.909,
62
- "acc_norm_stderr": 0.009099549538400238
63
- },
64
- "piqa": {
65
- "acc": 0.7263329706202394,
66
- "acc_stderr": 0.010402184206229206,
67
- "acc_norm": 0.7187159956474428,
68
- "acc_norm_stderr": 0.010490509832327423
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21boscar/evaluation/rankeval/4b284b21boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.311,
5
- "acc_stderr": 0.014645596385722692
6
- },
7
- "anli_r2": {
8
- "acc": 0.335,
9
- "acc_stderr": 0.014933117490932572
10
- },
11
- "anli_r3": {
12
- "acc": 0.3408333333333333,
13
- "acc_stderr": 0.013688600793296939
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.067031892279424,
18
- "f1": 0.3012820512820513
19
- },
20
- "copa": {
21
- "acc": 0.74,
22
- "acc_stderr": 0.04408440022768078
23
- },
24
- "hellaswag": {
25
- "acc": 0.40509858593905596,
26
- "acc_stderr": 0.00489907830018425,
27
- "acc_norm": 0.522903804023103,
28
- "acc_norm_stderr": 0.004984543540932339
29
- },
30
- "rte": {
31
- "acc": 0.5018050541516246,
32
- "acc_stderr": 0.030096267148976633
33
- },
34
- "winogrande": {
35
- "acc": 0.5619573796369376,
36
- "acc_stderr": 0.013944181296470804
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6745056119722074,
40
- "acc_stderr": 0.010835369677013443
41
- },
42
- "boolq": {
43
- "acc": 0.5553516819571865,
44
- "acc_stderr": 0.008691303433317494
45
- },
46
- "arc_easy": {
47
- "acc": 0.5833333333333334,
48
- "acc_stderr": 0.010116282977781242,
49
- "acc_norm": 0.5816498316498316,
50
- "acc_norm_stderr": 0.010122061470742853
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2790102389078498,
54
- "acc_stderr": 0.013106784883601343,
55
- "acc_norm": 0.30631399317406144,
56
- "acc_norm_stderr": 0.013470584417276513
57
- },
58
- "sciq": {
59
- "acc": 0.913,
60
- "acc_stderr": 0.008916866630745921,
61
- "acc_norm": 0.908,
62
- "acc_norm_stderr": 0.009144376393151118
63
- },
64
- "piqa": {
65
- "acc": 0.7285092491838956,
66
- "acc_stderr": 0.010376251176596135,
67
- "acc_norm": 0.7176278563656148,
68
- "acc_norm_stderr": 0.010502821668555356
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21boscar/evaluation/rankeval/4b284b21boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.317,
5
- "acc_stderr": 0.014721675438880226
6
- },
7
- "anli_r2": {
8
- "acc": 0.339,
9
- "acc_stderr": 0.014976758771620345
10
- },
11
- "anli_r3": {
12
- "acc": 0.325,
13
- "acc_stderr": 0.013526454480351016
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942397,
18
- "f1": 0.3929292929292929
19
- },
20
- "copa": {
21
- "acc": 0.77,
22
- "acc_stderr": 0.04229525846816507
23
- },
24
- "hellaswag": {
25
- "acc": 0.40440151364270066,
26
- "acc_stderr": 0.0048977283707372496,
27
- "acc_norm": 0.5226050587532364,
28
- "acc_norm_stderr": 0.004984679359375623
29
- },
30
- "rte": {
31
- "acc": 0.48375451263537905,
32
- "acc_stderr": 0.030080573208738064
33
- },
34
- "winogrande": {
35
- "acc": 0.5501183898973955,
36
- "acc_stderr": 0.013981711904049733
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6793158738642437,
40
- "acc_stderr": 0.01079328909592361
41
- },
42
- "boolq": {
43
- "acc": 0.5428134556574924,
44
- "acc_stderr": 0.008712936764296237
45
- },
46
- "arc_easy": {
47
- "acc": 0.5900673400673401,
48
- "acc_stderr": 0.010091953527506251,
49
- "acc_norm": 0.5909090909090909,
50
- "acc_norm_stderr": 0.010088775152615786
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27047781569965873,
54
- "acc_stderr": 0.012980954547659556,
55
- "acc_norm": 0.30631399317406144,
56
- "acc_norm_stderr": 0.013470584417276513
57
- },
58
- "sciq": {
59
- "acc": 0.919,
60
- "acc_stderr": 0.00863212103213998,
61
- "acc_norm": 0.914,
62
- "acc_norm_stderr": 0.008870325962594766
63
- },
64
- "piqa": {
65
- "acc": 0.7285092491838956,
66
- "acc_stderr": 0.010376251176596137,
67
- "acc_norm": 0.7257889009793254,
68
- "acc_norm_stderr": 0.010408618664933382
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21boscar/evaluation/rankeval/4b284b21boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.345,
5
- "acc_stderr": 0.015039986742055233
6
- },
7
- "anli_r2": {
8
- "acc": 0.33,
9
- "acc_stderr": 0.014876872027456736
10
- },
11
- "anli_r3": {
12
- "acc": 0.3383333333333333,
13
- "acc_stderr": 0.013664144006618268
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942397,
18
- "f1": 0.2706349206349206
19
- },
20
- "copa": {
21
- "acc": 0.75,
22
- "acc_stderr": 0.04351941398892446
23
- },
24
- "hellaswag": {
25
- "acc": 0.40659231228838877,
26
- "acc_stderr": 0.0049019365115461205,
27
- "acc_norm": 0.5252937661820355,
28
- "acc_norm_stderr": 0.004983392650570959
29
- },
30
- "rte": {
31
- "acc": 0.5415162454873647,
32
- "acc_stderr": 0.029992535385373314
33
- },
34
- "winogrande": {
35
- "acc": 0.5453827940015785,
36
- "acc_stderr": 0.013994481027065998
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6718332442544094,
40
- "acc_stderr": 0.010858184920580584
41
- },
42
- "boolq": {
43
- "acc": 0.5311926605504587,
44
- "acc_stderr": 0.008728020822889253
45
- },
46
- "arc_easy": {
47
- "acc": 0.5875420875420876,
48
- "acc_stderr": 0.010101305447864778,
49
- "acc_norm": 0.5845959595959596,
50
- "acc_norm_stderr": 0.010111869494911512
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2815699658703072,
54
- "acc_stderr": 0.013143376735009024,
55
- "acc_norm": 0.310580204778157,
56
- "acc_norm_stderr": 0.013522292098053054
57
- },
58
- "sciq": {
59
- "acc": 0.91,
60
- "acc_stderr": 0.00905439020486644,
61
- "acc_norm": 0.91,
62
- "acc_norm_stderr": 0.00905439020486644
63
- },
64
- "piqa": {
65
- "acc": 0.7252448313384113,
66
- "acc_stderr": 0.010415033676676042,
67
- "acc_norm": 0.7219804134929271,
68
- "acc_norm_stderr": 0.010453117358332828
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28boscar/evaluation/rankeval/4b284b28boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.34,
5
- "acc_stderr": 0.014987482264363937
6
- },
7
- "anli_r2": {
8
- "acc": 0.34,
9
- "acc_stderr": 0.014987482264363937
10
- },
11
- "anli_r3": {
12
- "acc": 0.3408333333333333,
13
- "acc_stderr": 0.013688600793296939
14
- },
15
- "cb": {
16
- "acc": 0.375,
17
- "acc_stderr": 0.06527912098338669,
18
- "f1": 0.1818181818181818
19
- },
20
- "copa": {
21
- "acc": 0.77,
22
- "acc_stderr": 0.04229525846816506
23
- },
24
- "hellaswag": {
25
- "acc": 0.4060944035052778,
26
- "acc_stderr": 0.004900988997414227,
27
- "acc_norm": 0.5160326628161721,
28
- "acc_norm_stderr": 0.004987215542259667
29
- },
30
- "rte": {
31
- "acc": 0.5487364620938628,
32
- "acc_stderr": 0.029953149241808946
33
- },
34
- "winogrande": {
35
- "acc": 0.5430149960536701,
36
- "acc_stderr": 0.01400038676159829
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6739711384286478,
40
- "acc_stderr": 0.010839964752045184
41
- },
42
- "boolq": {
43
- "acc": 0.5318042813455658,
44
- "acc_stderr": 0.008727345583419184
45
- },
46
- "arc_easy": {
47
- "acc": 0.5694444444444444,
48
- "acc_stderr": 0.010160345396860075,
49
- "acc_norm": 0.5151515151515151,
50
- "acc_norm_stderr": 0.010255071794531504
51
- },
52
- "arc_challenge": {
53
- "acc": 0.24744027303754265,
54
- "acc_stderr": 0.01261035266329267,
55
- "acc_norm": 0.2858361774744027,
56
- "acc_norm_stderr": 0.013203196088537369
57
- },
58
- "sciq": {
59
- "acc": 0.833,
60
- "acc_stderr": 0.011800434324644594,
61
- "acc_norm": 0.754,
62
- "acc_norm_stderr": 0.013626065817750636
63
- },
64
- "piqa": {
65
- "acc": 0.7219804134929271,
66
- "acc_stderr": 0.010453117358332802,
67
- "acc_norm": 0.7247007616974973,
68
- "acc_norm_stderr": 0.01042142927736953
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28boscar/evaluation/rankeval/4b284b28boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.336,
5
- "acc_stderr": 0.014944140233795023
6
- },
7
- "anli_r2": {
8
- "acc": 0.326,
9
- "acc_stderr": 0.014830507204541037
10
- },
11
- "anli_r3": {
12
- "acc": 0.33,
13
- "acc_stderr": 0.013579531277800922
14
- },
15
- "cb": {
16
- "acc": 0.25,
17
- "acc_stderr": 0.058387420812114225,
18
- "f1": 0.2095321637426901
19
- },
20
- "copa": {
21
- "acc": 0.72,
22
- "acc_stderr": 0.045126085985421276
23
- },
24
- "hellaswag": {
25
- "acc": 0.4051981676956781,
26
- "acc_stderr": 0.004899270310557984,
27
- "acc_norm": 0.5231029675363473,
28
- "acc_norm_stderr": 0.004984452002563928
29
- },
30
- "rte": {
31
- "acc": 0.49097472924187724,
32
- "acc_stderr": 0.030091559826331334
33
- },
34
- "winogrande": {
35
- "acc": 0.5469613259668509,
36
- "acc_stderr": 0.013990366632148104
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6632816675574559,
40
- "acc_stderr": 0.010928525619392455
41
- },
42
- "boolq": {
43
- "acc": 0.57217125382263,
44
- "acc_stderr": 0.008653474894637182
45
- },
46
- "arc_easy": {
47
- "acc": 0.5854377104377104,
48
- "acc_stderr": 0.010108889212447769,
49
- "acc_norm": 0.5723905723905723,
50
- "acc_norm_stderr": 0.010151683397430677
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2721843003412969,
54
- "acc_stderr": 0.013006600406423707,
55
- "acc_norm": 0.29436860068259385,
56
- "acc_norm_stderr": 0.013318528460539422
57
- },
58
- "sciq": {
59
- "acc": 0.891,
60
- "acc_stderr": 0.009859828407037188,
61
- "acc_norm": 0.883,
62
- "acc_norm_stderr": 0.010169287802713327
63
- },
64
- "piqa": {
65
- "acc": 0.721436343852013,
66
- "acc_stderr": 0.010459397235965182,
67
- "acc_norm": 0.719260065288357,
68
- "acc_norm_stderr": 0.010484325438311827
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28boscar/evaluation/rankeval/4b284b28boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.341,
5
- "acc_stderr": 0.014998131348402707
6
- },
7
- "anli_r2": {
8
- "acc": 0.343,
9
- "acc_stderr": 0.015019206922356953
10
- },
11
- "anli_r3": {
12
- "acc": 0.32666666666666666,
13
- "acc_stderr": 0.013544340907003665
14
- },
15
- "cb": {
16
- "acc": 0.25,
17
- "acc_stderr": 0.058387420812114225,
18
- "f1": 0.2075098814229249
19
- },
20
- "copa": {
21
- "acc": 0.73,
22
- "acc_stderr": 0.044619604333847394
23
- },
24
- "hellaswag": {
25
- "acc": 0.40728938458474406,
26
- "acc_stderr": 0.0049032542641776235,
27
- "acc_norm": 0.5259908384783908,
28
- "acc_norm_stderr": 0.004983035420235712
29
- },
30
- "rte": {
31
- "acc": 0.5054151624548736,
32
- "acc_stderr": 0.030094698123239966
33
- },
34
- "winogrande": {
35
- "acc": 0.5516969218626677,
36
- "acc_stderr": 0.013977171307126345
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6675574559059326,
40
- "acc_stderr": 0.010893860778343539
41
- },
42
- "boolq": {
43
- "acc": 0.5795107033639144,
44
- "acc_stderr": 0.008633775332463619
45
- },
46
- "arc_easy": {
47
- "acc": 0.6031144781144782,
48
- "acc_stderr": 0.010039236800583206,
49
- "acc_norm": 0.5858585858585859,
50
- "acc_norm_stderr": 0.01010738767300251
51
- },
52
- "arc_challenge": {
53
- "acc": 0.25853242320819114,
54
- "acc_stderr": 0.012794553754288692,
55
- "acc_norm": 0.29948805460750855,
56
- "acc_norm_stderr": 0.013385021637313572
57
- },
58
- "sciq": {
59
- "acc": 0.902,
60
- "acc_stderr": 0.00940661918462124,
61
- "acc_norm": 0.9,
62
- "acc_norm_stderr": 0.009491579957525044
63
- },
64
- "piqa": {
65
- "acc": 0.7219804134929271,
66
- "acc_stderr": 0.010453117358332795,
67
- "acc_norm": 0.7236126224156693,
68
- "acc_norm_stderr": 0.010434162388275608
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28boscar/evaluation/rankeval/4b284b28boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.332,
5
- "acc_stderr": 0.01489959724281149
6
- },
7
- "anli_r2": {
8
- "acc": 0.362,
9
- "acc_stderr": 0.015204840912919503
10
- },
11
- "anli_r3": {
12
- "acc": 0.33416666666666667,
13
- "acc_stderr": 0.013622434813136788
14
- },
15
- "cb": {
16
- "acc": 0.4642857142857143,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.4217687074829932
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.045604802157206845
23
- },
24
- "hellaswag": {
25
- "acc": 0.40420235012945627,
26
- "acc_stderr": 0.004897340793314381,
27
- "acc_norm": 0.5269866560446126,
28
- "acc_norm_stderr": 0.004982508198584267
29
- },
30
- "rte": {
31
- "acc": 0.5776173285198556,
32
- "acc_stderr": 0.02973162264649588
33
- },
34
- "winogrande": {
35
- "acc": 0.5351223362273086,
36
- "acc_stderr": 0.014017773120881585
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6675574559059326,
40
- "acc_stderr": 0.01089386077834354
41
- },
42
- "boolq": {
43
- "acc": 0.5688073394495413,
44
- "acc_stderr": 0.008661853128165595
45
- },
46
- "arc_easy": {
47
- "acc": 0.6014309764309764,
48
- "acc_stderr": 0.010046455400477943,
49
- "acc_norm": 0.585016835016835,
50
- "acc_norm_stderr": 0.01011038315196114
51
- },
52
- "arc_challenge": {
53
- "acc": 0.28071672354948807,
54
- "acc_stderr": 0.013131238126975578,
55
- "acc_norm": 0.3046075085324232,
56
- "acc_norm_stderr": 0.013449522109932489
57
- },
58
- "sciq": {
59
- "acc": 0.918,
60
- "acc_stderr": 0.008680515615523727,
61
- "acc_norm": 0.908,
62
- "acc_norm_stderr": 0.009144376393151098
63
- },
64
- "piqa": {
65
- "acc": 0.7274211099020674,
66
- "acc_stderr": 0.010389256803296023,
67
- "acc_norm": 0.7290533188248096,
68
- "acc_norm_stderr": 0.010369718937426844
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28boscar/evaluation/rankeval/4b284b28boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.336,
5
- "acc_stderr": 0.014944140233795027
6
- },
7
- "anli_r2": {
8
- "acc": 0.34,
9
- "acc_stderr": 0.014987482264363937
10
- },
11
- "anli_r3": {
12
- "acc": 0.3516666666666667,
13
- "acc_stderr": 0.013789711695404801
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.33413848631239934
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.045604802157206845
23
- },
24
- "hellaswag": {
25
- "acc": 0.4049990041824338,
26
- "acc_stderr": 0.004898886080687925,
27
- "acc_norm": 0.5279824736108345,
28
- "acc_norm_stderr": 0.004981961097590808
29
- },
30
- "rte": {
31
- "acc": 0.49097472924187724,
32
- "acc_stderr": 0.030091559826331334
33
- },
34
- "winogrande": {
35
- "acc": 0.5422257300710339,
36
- "acc_stderr": 0.014002284504422438
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6734366648850882,
40
- "acc_stderr": 0.010844543793668893
41
- },
42
- "boolq": {
43
- "acc": 0.5605504587155963,
44
- "acc_stderr": 0.008680693125810188
45
- },
46
- "arc_easy": {
47
- "acc": 0.6064814814814815,
48
- "acc_stderr": 0.010024426884292557,
49
- "acc_norm": 0.5917508417508418,
50
- "acc_norm_stderr": 0.010085566195791252
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26109215017064846,
54
- "acc_stderr": 0.012835523909473847,
55
- "acc_norm": 0.29948805460750855,
56
- "acc_norm_stderr": 0.013385021637313572
57
- },
58
- "sciq": {
59
- "acc": 0.915,
60
- "acc_stderr": 0.00882342636694232,
61
- "acc_norm": 0.911,
62
- "acc_norm_stderr": 0.009008893392651525
63
- },
64
- "piqa": {
65
- "acc": 0.7165397170837867,
66
- "acc_stderr": 0.010515057791152076,
67
- "acc_norm": 0.7236126224156693,
68
- "acc_norm_stderr": 0.01043416238827561
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28boscar/evaluation/rankeval/4b284b28boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.359,
5
- "acc_stderr": 0.0151772642247986
6
- },
7
- "anli_r2": {
8
- "acc": 0.363,
9
- "acc_stderr": 0.015213890444671283
10
- },
11
- "anli_r3": {
12
- "acc": 0.3358333333333333,
13
- "acc_stderr": 0.013639261190932887
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.3307297277885513
19
- },
20
- "copa": {
21
- "acc": 0.7,
22
- "acc_stderr": 0.046056618647183814
23
- },
24
- "hellaswag": {
25
- "acc": 0.40599482174865564,
26
- "acc_stderr": 0.004900798868048132,
27
- "acc_norm": 0.5313682533359888,
28
- "acc_norm_stderr": 0.004979952166595542
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.5461720599842147,
36
- "acc_stderr": 0.01399244156370707
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6766435061464458,
40
- "acc_stderr": 0.010816828633068225
41
- },
42
- "boolq": {
43
- "acc": 0.5620795107033639,
44
- "acc_stderr": 0.008677388652709263
45
- },
46
- "arc_easy": {
47
- "acc": 0.5963804713804713,
48
- "acc_stderr": 0.010067368960348216,
49
- "acc_norm": 0.5904882154882155,
50
- "acc_norm_stderr": 0.010090368160990062
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2764505119453925,
54
- "acc_stderr": 0.013069662474252428,
55
- "acc_norm": 0.310580204778157,
56
- "acc_norm_stderr": 0.013522292098053055
57
- },
58
- "sciq": {
59
- "acc": 0.917,
60
- "acc_stderr": 0.00872852720607479,
61
- "acc_norm": 0.912,
62
- "acc_norm_stderr": 0.008963053962592081
63
- },
64
- "piqa": {
65
- "acc": 0.720892274211099,
66
- "acc_stderr": 0.010465657948498228,
67
- "acc_norm": 0.7274211099020674,
68
- "acc_norm_stderr": 0.010389256803296009
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b42boscar/evaluation/generation/merged.csv CHANGED
@@ -9,7 +9,9 @@ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2589859331129998
9
  e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2589859331129998
10
  e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2613413893640116
11
  e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2613413893640116
12
- e2e_nlg_cleaned,4,average,multiple,0.2171095046484443
 
 
13
  gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04328828501096498
14
  gem_xsum,0,median,rouge2_fmeasure,0.04328828501096498
15
  gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04534620252044498
 
9
  e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2589859331129998
10
  e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2613413893640116
11
  e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2613413893640116
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.26306733403610516
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.26306733403610516
14
+ e2e_nlg_cleaned,5,average,multiple,0.2247691428797211
15
  gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04328828501096498
16
  gem_xsum,0,median,rouge2_fmeasure,0.04328828501096498
17
  gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04534620252044498
4b284b42boscar/evaluation/generation/merged.json CHANGED
@@ -1 +1 @@
1
- {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3920095188313853, "bleu_stderr": 0.05821741416066471, "rouge1_fmeasure": 0.10974192875795953, "rouge1_fmeasure_stderr": 0.002108481379704356, "rouge1_precision": 0.07689265316885617, "rouge1_precision_stderr": 0.0021889644764877354, "rouge1_recall": 0.3353633364771332, "rouge1_recall_stderr": 0.005464034559346198, "rouge2_fmeasure": 0.04944058280627724, "rouge2_fmeasure_stderr": 0.0012733709222344247, "rouge2_precision": 0.03467177168437793, "rouge2_precision_stderr": 0.001263966142663848, "rouge2_recall": 0.1501022471408039, "rouge2_recall_stderr": 0.0034656326105627303, "rougeL_fmeasure": 0.10487292177099015, "rougeL_fmeasure_stderr": 0.0019145881199173131, "rougeL_precision": 0.07300762564486572, "rougeL_precision_stderr": 0.0019791385357395687, "rougeL_recall": 0.3239944481490108, "rougeL_recall_stderr": 0.005278665955249544, "rougeLsum_fmeasure": 0.10223587057095268, "rougeLsum_fmeasure_stderr": 0.001953957091865035, "rougeLsum_precision": 0.07173881674469755, "rougeLsum_precision_stderr": 0.0020327580166630914, "rougeLsum_recall": 0.3108299199198791, "rougeLsum_recall_stderr": 0.0050441398235876655}}, "1": {"PALM_prompt": {"bleu": 0.5923583934046589, "bleu_stderr": 0.04413735939354442, "rouge1_fmeasure": 0.1697750677390444, "rouge1_fmeasure_stderr": 0.0038772039426472724, "rouge1_precision": 0.16010050130964712, "rouge1_precision_stderr": 0.005125042778102285, "rouge1_recall": 0.3196355072168386, "rouge1_recall_stderr": 0.005119850964520928, "rouge2_fmeasure": 0.08671853497591266, "rouge2_fmeasure_stderr": 0.002631475278488475, "rouge2_precision": 0.08438981052215226, "rouge2_precision_stderr": 0.0035428710754014017, "rouge2_recall": 0.16375709757632376, "rouge2_recall_stderr": 0.0035801758170607334, "rougeL_fmeasure": 0.15421967175389004, "rougeL_fmeasure_stderr": 0.003335409750319338, "rougeL_precision": 0.14464852171051198, "rougeL_precision_stderr": 0.004587920039107864, "rougeL_recall": 0.2991072637349091, "rougeL_recall_stderr": 0.004697289445064134, "rougeLsum_fmeasure": 0.15698432412836977, "rougeLsum_fmeasure_stderr": 0.0034258507511626024, "rougeLsum_precision": 0.14773643541948675, "rougeLsum_precision_stderr": 0.004697318924441809, "rougeLsum_recall": 0.3022253960230997, "rougeLsum_recall_stderr": 0.004739701115545982}}, "2": {"PALM_prompt": {"bleu": 0.87481699127398, "bleu_stderr": 0.059582002804657114, "rouge1_fmeasure": 0.2091307571621392, "rouge1_fmeasure_stderr": 0.00438758838932181, "rouge1_precision": 0.199466981650156, "rouge1_precision_stderr": 0.005680133053279315, "rouge1_recall": 0.36987834122063556, "rouge1_recall_stderr": 0.004990035433475184, "rouge2_fmeasure": 0.1125014602113282, "rouge2_fmeasure_stderr": 0.0030818329485191713, "rouge2_precision": 0.1097332483868951, "rouge2_precision_stderr": 0.003880260643680688, "rouge2_recall": 0.19989822315140882, "rouge2_recall_stderr": 0.003891995132588205, "rougeL_fmeasure": 0.18777604196623215, "rougeL_fmeasure_stderr": 0.0037686741944170416, "rougeL_precision": 0.17708218948059182, "rougeL_precision_stderr": 0.004959206002368757, "rougeL_recall": 0.34473442974785634, "rougeL_recall_stderr": 0.004653107328809787, "rougeLsum_fmeasure": 0.19268792375031765, "rougeLsum_fmeasure_stderr": 0.00391282714541826, "rougeLsum_precision": 0.18275767079289593, "rougeLsum_precision_stderr": 0.005158519349266604, "rougeLsum_recall": 0.3495549840141228, "rougeLsum_recall_stderr": 0.00470047440035223}}, "3": {"PALM_prompt": {"bleu": 0.917052522385174, "bleu_stderr": 0.04515993268519611, "rouge1_fmeasure": 0.21372711407619938, "rouge1_fmeasure_stderr": 0.004343895044489938, "rouge1_precision": 0.20888904165496103, "rouge1_precision_stderr": 0.005880585457367179, "rouge1_recall": 0.37814776640994463, "rouge1_recall_stderr": 0.004965150831364198, "rouge2_fmeasure": 0.11478946258154298, "rouge2_fmeasure_stderr": 0.003021059673039041, "rouge2_precision": 0.1168574620986129, "rouge2_precision_stderr": 0.004100382644887659, "rouge2_recall": 0.2033520821796969, "rouge2_recall_stderr": 0.00383298408380243, "rougeL_fmeasure": 0.19232862909601875, "rougeL_fmeasure_stderr": 0.0036812800257055574, "rougeL_precision": 0.18605811290952207, "rougeL_precision_stderr": 0.005155344402670075, "rougeL_recall": 0.35240104768920266, "rougeL_recall_stderr": 0.004560857947768766, "rougeLsum_fmeasure": 0.1967270339578505, "rougeLsum_fmeasure_stderr": 0.0038181408662667426, "rougeLsum_precision": 0.19147784744930493, "rougeLsum_precision_stderr": 0.0053469173736333826, "rougeLsum_recall": 0.3571044253500232, "rougeLsum_recall_stderr": 0.004630285578564224}}, "4": {"PALM_prompt": {"bleu": 1.0770812969732624, "bleu_stderr": 0.057150178907157206, "rouge1_fmeasure": 0.23556795007723105, "rouge1_fmeasure_stderr": 0.004533277698287348, "rouge1_precision": 0.22981660265238718, "rouge1_precision_stderr": 0.0060091983198828705, "rouge1_recall": 0.39885875556534467, "rouge1_recall_stderr": 0.004954969825753986, "rouge2_fmeasure": 0.12790770467752932, "rouge2_fmeasure_stderr": 0.0031721283738045283, "rouge2_precision": 0.12750847088352804, "rouge2_precision_stderr": 0.004054232891970157, "rouge2_recall": 0.21938886368566654, "rouge2_recall_stderr": 0.003991587985278903, "rougeL_fmeasure": 0.20953786614403025, "rougeL_fmeasure_stderr": 0.0038198330880285674, "rougeL_precision": 0.2007562947847461, "rougeL_precision_stderr": 0.005097419240769892, "rougeL_recall": 0.3691187056685199, "rougeL_recall_stderr": 0.004545072578815252, "rougeLsum_fmeasure": 0.21611300792836619, "rougeLsum_fmeasure_stderr": 0.003996019817069793, "rougeLsum_precision": 0.2090073734779189, "rougeLsum_precision_stderr": 0.0053710149363667935, "rougeLsum_recall": 0.3759411168888221, "rougeLsum_recall_stderr": 0.004622209838017953}}, "5": {"PALM_prompt": {"bleu": 1.1504048600305352, "bleu_stderr": 0.0755822004199928, "rouge1_fmeasure": 0.24203388956598934, "rouge1_fmeasure_stderr": 0.004720213442449205, "rouge1_precision": 0.24380689117778276, "rouge1_precision_stderr": 0.006371373530201665, "rouge1_recall": 0.3952201821451148, "rouge1_recall_stderr": 0.005033224197794555, "rouge2_fmeasure": 0.13474976693002946, "rouge2_fmeasure_stderr": 0.003415392488984557, "rouge2_precision": 0.1418504594039488, "rouge2_precision_stderr": 0.004575582136553598, "rouge2_recall": 0.21936500213450158, "rouge2_recall_stderr": 0.004020020751334439, "rougeL_fmeasure": 0.21539137146469606, "rougeL_fmeasure_stderr": 0.004028386529865914, "rougeL_precision": 0.21441718014302874, "rougeL_precision_stderr": 0.005537317704553182, "rougeL_recall": 0.36454653195485753, "rougeL_recall_stderr": 0.004645366026599087, "rougeLsum_fmeasure": 0.222229551042884, "rougeLsum_fmeasure_stderr": 0.004199234047990457, "rougeLsum_precision": 0.22298936285767557, "rougeLsum_precision_stderr": 0.005807349708841443, "rougeLsum_recall": 0.37135465454110034, "rougeLsum_recall_stderr": 0.004699420865300012}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 3.0868626526463756, "bleu_stderr": 0.10334620651927755, "rouge1_fmeasure": 0.13547708241929612, "rouge1_fmeasure_stderr": 0.0026556069493799896, "rouge1_precision": 0.13065355742673593, "rouge1_precision_stderr": 0.003004776280579319, "rouge1_recall": 0.1786059794590456, "rouge1_recall_stderr": 0.003480497255310532, "rouge2_fmeasure": 0.0352234336154376, "rouge2_fmeasure_stderr": 0.0010798725770486046, "rouge2_precision": 0.03291780593276675, "rouge2_precision_stderr": 0.0010760829551416996, "rouge2_recall": 0.04663674589032859, "rouge2_recall_stderr": 0.001504992918818976, "rougeL_fmeasure": 0.10336654675385076, "rougeL_fmeasure_stderr": 0.001968971023431868, "rougeL_precision": 0.09987150235228559, "rougeL_precision_stderr": 0.0023685337113851174, "rougeL_recall": 0.13959473564876165, "rougeL_recall_stderr": 0.0027581190918920103, "rougeLsum_fmeasure": 0.12705867398208964, "rougeLsum_fmeasure_stderr": 0.002506787857940285, "rougeLsum_precision": 0.12288788441024788, "rougeLsum_precision_stderr": 0.0028720387887482485, "rougeLsum_recall": 0.16759622234914887, "rougeLsum_recall_stderr": 0.0032867178590762366}}, "1": {"tldr_en": {"bleu": 2.5514775583592173, "bleu_stderr": 0.11284261690010776, "rouge1_fmeasure": 0.16580650031794167, "rouge1_fmeasure_stderr": 0.0021193004105035614, "rouge1_precision": 0.19126686092152348, "rouge1_precision_stderr": 0.003082884423906421, "rouge1_recall": 0.2013507500089625, "rouge1_recall_stderr": 0.0029925213818961095, "rouge2_fmeasure": 0.035746404962808155, "rouge2_fmeasure_stderr": 0.0011192977008782524, "rouge2_precision": 0.044582320866862424, "rouge2_precision_stderr": 0.0018321118047088484, "rouge2_recall": 0.04442769282883079, "rouge2_recall_stderr": 0.0014782140733801531, "rougeL_fmeasure": 0.1274347165087169, "rougeL_fmeasure_stderr": 0.0015926173758816556, "rougeL_precision": 0.14988887413463797, "rougeL_precision_stderr": 0.002586002929489674, "rougeL_recall": 0.15563614848002308, "rougeL_recall_stderr": 0.0023345594461348924, "rougeLsum_fmeasure": 0.15656227686437413, "rougeLsum_fmeasure_stderr": 0.0019872168695675768, "rougeLsum_precision": 0.18119105294163743, "rougeLsum_precision_stderr": 0.002938341173461725, "rougeLsum_recall": 0.19011415446724061, "rougeLsum_recall_stderr": 0.002811944286183604}}, "2": {"tldr_en": {"bleu": 4.160843322149366, "bleu_stderr": 0.1298621943311205, "rouge1_fmeasure": 0.227527675745211, "rouge1_fmeasure_stderr": 0.002262727638760964, "rouge1_precision": 0.27518353034811777, "rouge1_precision_stderr": 0.003681886785126964, "rouge1_recall": 0.26994010795497553, "rouge1_recall_stderr": 0.003076759192772023, "rouge2_fmeasure": 0.06444156598905733, "rouge2_fmeasure_stderr": 0.001316135695695516, "rouge2_precision": 0.08493527530542791, "rouge2_precision_stderr": 0.002342907743897752, "rouge2_recall": 0.07562522541046568, "rouge2_recall_stderr": 0.0016670921194054875, "rougeL_fmeasure": 0.17356357206141904, "rougeL_fmeasure_stderr": 0.0017138094026774846, "rougeL_precision": 0.21378447499783487, "rougeL_precision_stderr": 0.003086057725507419, "rougeL_recall": 0.20706303996123673, "rougeL_recall_stderr": 0.002420883132061566, "rougeLsum_fmeasure": 0.21385148741041204, "rougeLsum_fmeasure_stderr": 0.0021335293397074695, "rougeLsum_precision": 0.2594677476381253, "rougeLsum_precision_stderr": 0.0035263951440822477, "rougeLsum_recall": 0.25382240907884707, "rougeLsum_recall_stderr": 0.0029022538349274}}, "3": {"tldr_en": {"bleu": 3.550269268318738, "bleu_stderr": 0.123976667805652, "rouge1_fmeasure": 0.19542809253461213, "rouge1_fmeasure_stderr": 0.002567437085860653, "rouge1_precision": 0.25197379414278026, "rouge1_precision_stderr": 0.004061114481390069, "rouge1_recall": 0.2241545120711431, "rouge1_recall_stderr": 0.003349453782690701, "rouge2_fmeasure": 0.055649339745118376, "rouge2_fmeasure_stderr": 0.0013436864732712206, "rouge2_precision": 0.07748731951936201, "rouge2_precision_stderr": 0.0024143985835070676, "rouge2_recall": 0.06396850914384923, "rouge2_recall_stderr": 0.001702594494628315, "rougeL_fmeasure": 0.1493802268370947, "rougeL_fmeasure_stderr": 0.001958022255827534, "rougeL_precision": 0.19629979985917206, "rougeL_precision_stderr": 0.00335622576475892, "rougeL_recall": 0.17220070773118773, "rougeL_recall_stderr": 0.0026273647249788665, "rougeLsum_fmeasure": 0.18360399651239948, "rougeLsum_fmeasure_stderr": 0.002422426009983074, "rougeLsum_precision": 0.2374958020095092, "rougeLsum_precision_stderr": 0.0038790177970112016, "rougeLsum_recall": 0.2104711045184697, "rougeLsum_recall_stderr": 0.0031578834852280147}}, "4": {"tldr_en": {"bleu": 0.18727975729565796, "bleu_stderr": 0.024743677554065382, "rouge1_fmeasure": 0.06369530530813045, "rouge1_fmeasure_stderr": 0.002225322220412946, "rouge1_precision": 0.08736393590844754, "rouge1_precision_stderr": 0.003320236333651585, "rouge1_recall": 0.07170734676784853, "rouge1_recall_stderr": 0.0026863536115591326, "rouge2_fmeasure": 0.018728966913288573, "rouge2_fmeasure_stderr": 0.0009803445596888972, "rouge2_precision": 0.026539844024013316, "rouge2_precision_stderr": 0.0016227872678637359, "rouge2_recall": 0.021485803029836836, "rouge2_recall_stderr": 0.0012234458979275497, "rougeL_fmeasure": 0.0499467876025126, "rougeL_fmeasure_stderr": 0.0017576059173842731, "rougeL_precision": 0.07026197905523278, "rougeL_precision_stderr": 0.002775489585285312, "rougeL_recall": 0.056214726744091746, "rougeL_recall_stderr": 0.0021292176604441565, "rougeLsum_fmeasure": 0.05976166581566339, "rougeLsum_fmeasure_stderr": 0.0020932837351865146, "rougeLsum_precision": 0.08238350525240551, "rougeLsum_precision_stderr": 0.00315393172421982, "rougeLsum_recall": 0.06716878885751017, "rougeLsum_recall_stderr": 0.002524004286911468}}, "5": {"tldr_en": {"bleu": 5.577157241826622e-13, "bleu_stderr": 1.1569346422902409e-11, "rouge1_fmeasure": 0.01043565444271075, "rouge1_fmeasure_stderr": 0.0010239464330389257, "rouge1_precision": 0.016308055800076632, "rouge1_precision_stderr": 0.0016664962689093485, "rouge1_recall": 0.011189655817252247, "rouge1_recall_stderr": 0.0011802896184760587, "rouge2_fmeasure": 0.003212219718708495, "rouge2_fmeasure_stderr": 0.00044504346236305713, "rouge2_precision": 0.004996639391364696, "rouge2_precision_stderr": 0.0007412751630573523, "rouge2_recall": 0.0035850495924414864, "rouge2_recall_stderr": 0.0005653334006649237, "rougeL_fmeasure": 0.008463888867128002, "rougeL_fmeasure_stderr": 0.0008498965362098245, "rougeL_precision": 0.013336917042039756, "rougeL_precision_stderr": 0.0013953475029781011, "rougeL_recall": 0.009180912567410115, "rougeL_recall_stderr": 0.001001081279334972, "rougeLsum_fmeasure": 0.009948416660924693, "rougeLsum_fmeasure_stderr": 0.000978895492286139, "rougeLsum_precision": 0.015695607203614483, "rougeLsum_precision_stderr": 0.0016199770379973, "rougeLsum_recall": 0.010692369328543307, "rougeLsum_recall_stderr": 0.0011385128958340267}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.1932091730586953, "bleu_stderr": 0.12204583279523666, "rouge1_fmeasure": 0.24046689564492205, "rouge1_fmeasure_stderr": 0.0026584396733124543, "rouge1_precision": 0.41337918705325344, "rouge1_precision_stderr": 0.0052760059693995, "rouge1_recall": 0.2309238228257254, "rouge1_recall_stderr": 0.0032774148724919335, "rouge2_fmeasure": 0.08042598706024275, "rouge2_fmeasure_stderr": 0.001263671869367697, "rouge2_precision": 0.23463707329236097, "rouge2_precision_stderr": 0.0062472430372265, "rouge2_recall": 0.07811161999290202, "rouge2_recall_stderr": 0.001551040325003802, "rougeL_fmeasure": 0.18994590486023305, "rougeL_fmeasure_stderr": 0.0016519612628498022, "rougeL_precision": 0.3599428455550758, "rougeL_precision_stderr": 0.005381765248981534, "rougeL_recall": 0.179931821205033, "rougeL_recall_stderr": 0.002310618989620828, "rougeLsum_fmeasure": 0.2231487737804936, "rougeLsum_fmeasure_stderr": 0.0024418581547524815, "rougeLsum_precision": 0.39508222499421797, "rougeLsum_precision_stderr": 0.005341940324256954, "rougeLsum_recall": 0.21269354787389044, "rougeLsum_recall_stderr": 0.003034851683547011}}, "1": {"generate_text_restaurant": {"bleu": 12.375900606698423, "bleu_stderr": 0.19606823444326352, "rouge1_fmeasure": 0.48293554827595836, "rouge1_fmeasure_stderr": 0.0023435879912405283, "rouge1_precision": 0.597454575764657, "rouge1_precision_stderr": 0.0032329061443020345, "rouge1_recall": 0.4434643604110928, "rouge1_recall_stderr": 0.0029889149083277117, "rouge2_fmeasure": 0.23118450232385498, "rouge2_fmeasure_stderr": 0.0020415023824907675, "rouge2_precision": 0.2910060712314691, "rouge2_precision_stderr": 0.0027476178598673066, "rouge2_recall": 0.21176179817940102, "rouge2_recall_stderr": 0.0021589328308703566, "rougeL_fmeasure": 0.3518994031108773, "rougeL_fmeasure_stderr": 0.0021201781748616372, "rougeL_precision": 0.43888436474875336, "rougeL_precision_stderr": 0.0030457086517011057, "rougeL_recall": 0.3220136107735307, "rougeL_recall_stderr": 0.0024545810076108667, "rougeLsum_fmeasure": 0.39555654566655024, "rougeLsum_fmeasure_stderr": 0.0023470162024293896, "rougeLsum_precision": 0.4908609045585646, "rougeLsum_precision_stderr": 0.003223896911814539, "rougeLsum_recall": 0.3626278022836946, "rougeLsum_recall_stderr": 0.0027433883081054507}}, "2": {"generate_text_restaurant": {"bleu": 14.369325725727359, "bleu_stderr": 0.16247647633686452, "rouge1_fmeasure": 0.5050896808334919, "rouge1_fmeasure_stderr": 0.0022820908410676275, "rouge1_precision": 0.6089651864273457, "rouge1_precision_stderr": 0.003198435500725766, "rouge1_recall": 0.4684512148933178, "rouge1_recall_stderr": 0.0029309075224427496, "rouge2_fmeasure": 0.25360971138111243, "rouge2_fmeasure_stderr": 0.0021297710933538033, "rouge2_precision": 0.31006692351264853, "rouge2_precision_stderr": 0.002823237008286986, "rouge2_recall": 0.23489561514574075, "rouge2_recall_stderr": 0.0022593518873055855, "rougeL_fmeasure": 0.3737642937216708, "rougeL_fmeasure_stderr": 0.002190131174828545, "rougeL_precision": 0.4528063450828851, "rougeL_precision_stderr": 0.0030647195241319818, "rougeL_recall": 0.34593233406974633, "rougeL_recall_stderr": 0.0025245059893910897, "rougeLsum_fmeasure": 0.4229068329404585, "rougeLsum_fmeasure_stderr": 0.0023792336336982954, "rougeLsum_precision": 0.5104756682075146, "rougeLsum_precision_stderr": 0.0032393352108801977, "rougeLsum_recall": 0.39199556131381524, "rougeLsum_recall_stderr": 0.0027868585721285742}}, "3": {"generate_text_restaurant": {"bleu": 15.229236548887188, "bleu_stderr": 0.1893298480282073, "rouge1_fmeasure": 0.5094328850058195, "rouge1_fmeasure_stderr": 0.0023113991646470547, "rouge1_precision": 0.6019638812498068, "rouge1_precision_stderr": 0.003154041355931588, "rouge1_recall": 0.4769354854853059, "rouge1_recall_stderr": 0.002958348996182155, "rouge2_fmeasure": 0.2589859331129998, "rouge2_fmeasure_stderr": 0.0021744147637838404, "rouge2_precision": 0.30884118560646173, "rouge2_precision_stderr": 0.002746739827702377, "rouge2_recall": 0.24294392646302612, "rouge2_recall_stderr": 0.002367727486010716, "rougeL_fmeasure": 0.3764193148894467, "rougeL_fmeasure_stderr": 0.002186561194132359, "rougeL_precision": 0.4461308937388599, "rougeL_precision_stderr": 0.0029602114104730864, "rougeL_recall": 0.3521903496319005, "rougeL_recall_stderr": 0.002557452362321216, "rougeLsum_fmeasure": 0.428648276943535, "rougeLsum_fmeasure_stderr": 0.00241888010522652, "rougeLsum_precision": 0.5062370391801149, "rougeLsum_precision_stderr": 0.003164925596063054, "rougeLsum_recall": 0.4014335787816741, "rougeLsum_recall_stderr": 0.002844420919383205}}, "4": {"generate_text_restaurant": {"bleu": 15.5976388656912, "bleu_stderr": 0.21552554445196853, "rouge1_fmeasure": 0.5121277417214488, "rouge1_fmeasure_stderr": 0.0023435839886818115, "rouge1_precision": 0.5995295914989719, "rouge1_precision_stderr": 0.003160879006436628, "rouge1_recall": 0.48298984835020653, "rouge1_recall_stderr": 0.002984598478781118, "rouge2_fmeasure": 0.2613413893640116, "rouge2_fmeasure_stderr": 0.0022244910107010367, "rouge2_precision": 0.30839025650068297, "rouge2_precision_stderr": 0.002750823115926885, "rouge2_recall": 0.24686659194192842, "rouge2_recall_stderr": 0.0024081433509198598, "rougeL_fmeasure": 0.37763811148528253, "rougeL_fmeasure_stderr": 0.002256869338791484, "rougeL_precision": 0.4436724332909479, "rougeL_precision_stderr": 0.0029870463284766544, "rougeL_recall": 0.35568244613898303, "rougeL_recall_stderr": 0.0026122906695223214, "rougeLsum_fmeasure": 0.43177980537098043, "rougeLsum_fmeasure_stderr": 0.0024732350477868438, "rougeLsum_precision": 0.5052162393739371, "rougeLsum_precision_stderr": 0.0031736753796748323, "rougeLsum_recall": 0.4072510388482175, "rougeLsum_recall_stderr": 0.0029018213850476}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.7872267963517905, "bleu_stderr": 0.10045358377651255, "rouge1_fmeasure": 0.19062853936311658, "rouge1_fmeasure_stderr": 0.0029532646040511964, "rouge1_precision": 0.13999986827582567, "rouge1_precision_stderr": 0.0022474794503698063, "rouge1_recall": 0.3146848228074294, "rouge1_recall_stderr": 0.004975343003506539, "rouge2_fmeasure": 0.04328828501096498, "rouge2_fmeasure_stderr": 0.001566248398707883, "rouge2_precision": 0.03163976089485668, "rouge2_precision_stderr": 0.0011861306107240704, "rouge2_recall": 0.07306747424744006, "rouge2_recall_stderr": 0.002675800285732193, "rougeL_fmeasure": 0.139911595168302, "rougeL_fmeasure_stderr": 0.0021985427116768684, "rougeL_precision": 0.10267416140662887, "rougeL_precision_stderr": 0.0016806410262001903, "rougeL_recall": 0.23218112607700236, "rougeL_recall_stderr": 0.003798988027549183, "rougeLsum_fmeasure": 0.15161941662067227, "rougeLsum_fmeasure_stderr": 0.0024594718320593506, "rougeLsum_precision": 0.11113494233445374, "rougeLsum_precision_stderr": 0.0018618965914790858, "rougeLsum_recall": 0.2517399727563149, "rougeLsum_recall_stderr": 0.004226741179316262}}, "1": {"article_DOC_summary": {"bleu": 2.0535765112268263, "bleu_stderr": 0.11972597685190434, "rouge1_fmeasure": 0.21134911198456563, "rouge1_fmeasure_stderr": 0.003029508784148872, "rouge1_precision": 0.198389498815838, "rouge1_precision_stderr": 0.0036200528310182637, "rouge1_recall": 0.27297271430803394, "rouge1_recall_stderr": 0.004198789938626886, "rouge2_fmeasure": 0.04534620252044498, "rouge2_fmeasure_stderr": 0.0017858055311589297, "rouge2_precision": 0.043045220973661015, "rouge2_precision_stderr": 0.0018919242792135235, "rouge2_recall": 0.0597065055598483, "rouge2_recall_stderr": 0.0023731544903471393, "rougeL_fmeasure": 0.16064653466424578, "rougeL_fmeasure_stderr": 0.0024169691348135945, "rougeL_precision": 0.15080384564385396, "rougeL_precision_stderr": 0.0028920115769217117, "rougeL_recall": 0.2086058984013953, "rougeL_recall_stderr": 0.0033970509765821025, "rougeLsum_fmeasure": 0.1633349245997364, "rougeLsum_fmeasure_stderr": 0.002468028232419068, "rougeLsum_precision": 0.15275988244015698, "rougeLsum_precision_stderr": 0.0028982643948039046, "rougeLsum_recall": 0.21320242569675277, "rougeLsum_recall_stderr": 0.0035810539802842534}}, "2": {"article_DOC_summary": {"bleu": 3.0737243948901085, "bleu_stderr": 0.20707696514444188, "rouge1_fmeasure": 0.2444857696168434, "rouge1_fmeasure_stderr": 0.0034588270505411917, "rouge1_precision": 0.2607761013228957, "rouge1_precision_stderr": 0.004351702830837392, "rouge1_recall": 0.2607885042839857, "rouge1_recall_stderr": 0.004018215060642568, "rouge2_fmeasure": 0.05852877798479666, "rouge2_fmeasure_stderr": 0.0023072565365240597, "rouge2_precision": 0.06344356630530605, "rouge2_precision_stderr": 0.0026820157377482148, "rouge2_recall": 0.06209709779478734, "rouge2_recall_stderr": 0.00246191601941908, "rougeL_fmeasure": 0.18609020665428283, "rougeL_fmeasure_stderr": 0.002889852915757798, "rougeL_precision": 0.19907633090246687, "rougeL_precision_stderr": 0.0036157515137942723, "rougeL_recall": 0.19814987180436297, "rougeL_recall_stderr": 0.003287204314003516, "rougeLsum_fmeasure": 0.18820566058762211, "rougeLsum_fmeasure_stderr": 0.002901279776307625, "rougeLsum_precision": 0.20074551849299913, "rougeLsum_precision_stderr": 0.0036053665449958016, "rougeLsum_recall": 0.20155142761693595, "rougeLsum_recall_stderr": 0.003414189551327218}}, "3": {"article_DOC_summary": {"bleu": 3.3049815114104333, "bleu_stderr": 0.159904818084904, "rouge1_fmeasure": 0.23929443585701224, "rouge1_fmeasure_stderr": 0.0037168757909286886, "rouge1_precision": 0.26517071973676093, "rouge1_precision_stderr": 0.004671054222681604, "rouge1_recall": 0.24180770628858178, "rouge1_recall_stderr": 0.003940301667285207, "rouge2_fmeasure": 0.05849869572638163, "rouge2_fmeasure_stderr": 0.0023240414297435643, "rouge2_precision": 0.06573984745309881, "rouge2_precision_stderr": 0.0027796960958526426, "rouge2_recall": 0.05833153123977987, "rouge2_recall_stderr": 0.0023156387603366626, "rougeL_fmeasure": 0.1821366529554913, "rougeL_fmeasure_stderr": 0.0030775956513009466, "rougeL_precision": 0.203598693888488, "rougeL_precision_stderr": 0.0039976969321419975, "rougeL_recall": 0.18373116025857217, "rougeL_recall_stderr": 0.003225733424944883, "rougeLsum_fmeasure": 0.1835081526249607, "rougeLsum_fmeasure_stderr": 0.003085655347849612, "rougeLsum_precision": 0.20485233666950228, "rougeLsum_precision_stderr": 0.003995574052859589, "rougeLsum_recall": 0.1854022097149016, "rougeLsum_recall_stderr": 0.003252182345953945}}, "4": {"article_DOC_summary": {"bleu": 0.14790698979130756, "bleu_stderr": 0.04384286085154373, "rouge1_fmeasure": 0.06274888299344217, "rouge1_fmeasure_stderr": 0.00368347747261438, "rouge1_precision": 0.07320382597863412, "rouge1_precision_stderr": 0.004382339503654509, "rouge1_recall": 0.06154298544226265, "rouge1_recall_stderr": 0.0037288317198754368, "rouge2_fmeasure": 0.01636894923247962, "rouge2_fmeasure_stderr": 0.0015408484981787023, "rouge2_precision": 0.018885269050898535, "rouge2_precision_stderr": 0.0018067685701354235, "rouge2_recall": 0.016055891153780352, "rouge2_recall_stderr": 0.0015325179921096511, "rougeL_fmeasure": 0.04906837355261336, "rougeL_fmeasure_stderr": 0.002968163725283441, "rougeL_precision": 0.057742538854948004, "rougeL_precision_stderr": 0.003573097680574354, "rougeL_recall": 0.0479152284127263, "rougeL_recall_stderr": 0.0029805271711328637, "rougeLsum_fmeasure": 0.04921620084471268, "rougeLsum_fmeasure_stderr": 0.0029754368968071495, "rougeLsum_precision": 0.05793234813776215, "rougeLsum_precision_stderr": 0.0035819297580720103, "rougeLsum_recall": 0.04803187451993844, "rougeLsum_recall_stderr": 0.0029874284500312834}}, "5": {"article_DOC_summary": {"bleu": 1.0193423208210636e-40, "bleu_stderr": 1.482688743257922e-35, "rouge1_fmeasure": 0.0021324501644318945, "rouge1_fmeasure_stderr": 0.0005815454829014453, "rouge1_precision": 0.002313901471635568, "rouge1_precision_stderr": 0.0006326582061365616, "rouge1_recall": 0.0020282790639805184, "rouge1_recall_stderr": 0.0005520696211086723, "rouge2_fmeasure": 0.0003960521179313103, "rouge2_fmeasure_stderr": 0.00017805195394425088, "rouge2_precision": 0.0004152380160032476, "rouge2_precision_stderr": 0.00018397799224345848, "rouge2_recall": 0.00038092195139919884, "rouge2_recall_stderr": 0.00017352719125781168, "rougeL_fmeasure": 0.0018203546217228452, "rougeL_fmeasure_stderr": 0.0005047529402091446, "rougeL_precision": 0.0019597074784293666, "rougeL_precision_stderr": 0.0005429844277966419, "rougeL_recall": 0.0017448307905115785, "rougeL_recall_stderr": 0.00048409378403815574, "rougeLsum_fmeasure": 0.0019229642762192925, "rougeLsum_fmeasure_stderr": 0.0005389418518153393, "rougeLsum_precision": 0.0020637585328133834, "rougeLsum_precision_stderr": 0.0005760004715550667, "rougeLsum_recall": 0.0018460791228919861, "rougeLsum_recall_stderr": 0.0005185190358660706}}}}
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3920095188313853, "bleu_stderr": 0.05821741416066471, "rouge1_fmeasure": 0.10974192875795953, "rouge1_fmeasure_stderr": 0.002108481379704356, "rouge1_precision": 0.07689265316885617, "rouge1_precision_stderr": 0.0021889644764877354, "rouge1_recall": 0.3353633364771332, "rouge1_recall_stderr": 0.005464034559346198, "rouge2_fmeasure": 0.04944058280627724, "rouge2_fmeasure_stderr": 0.0012733709222344247, "rouge2_precision": 0.03467177168437793, "rouge2_precision_stderr": 0.001263966142663848, "rouge2_recall": 0.1501022471408039, "rouge2_recall_stderr": 0.0034656326105627303, "rougeL_fmeasure": 0.10487292177099015, "rougeL_fmeasure_stderr": 0.0019145881199173131, "rougeL_precision": 0.07300762564486572, "rougeL_precision_stderr": 0.0019791385357395687, "rougeL_recall": 0.3239944481490108, "rougeL_recall_stderr": 0.005278665955249544, "rougeLsum_fmeasure": 0.10223587057095268, "rougeLsum_fmeasure_stderr": 0.001953957091865035, "rougeLsum_precision": 0.07173881674469755, "rougeLsum_precision_stderr": 0.0020327580166630914, "rougeLsum_recall": 0.3108299199198791, "rougeLsum_recall_stderr": 0.0050441398235876655}}, "1": {"PALM_prompt": {"bleu": 0.5923583934046589, "bleu_stderr": 0.04413735939354442, "rouge1_fmeasure": 0.1697750677390444, "rouge1_fmeasure_stderr": 0.0038772039426472724, "rouge1_precision": 0.16010050130964712, "rouge1_precision_stderr": 0.005125042778102285, "rouge1_recall": 0.3196355072168386, "rouge1_recall_stderr": 0.005119850964520928, "rouge2_fmeasure": 0.08671853497591266, "rouge2_fmeasure_stderr": 0.002631475278488475, "rouge2_precision": 0.08438981052215226, "rouge2_precision_stderr": 0.0035428710754014017, "rouge2_recall": 0.16375709757632376, "rouge2_recall_stderr": 0.0035801758170607334, "rougeL_fmeasure": 0.15421967175389004, "rougeL_fmeasure_stderr": 0.003335409750319338, "rougeL_precision": 0.14464852171051198, "rougeL_precision_stderr": 0.004587920039107864, "rougeL_recall": 0.2991072637349091, "rougeL_recall_stderr": 0.004697289445064134, "rougeLsum_fmeasure": 0.15698432412836977, "rougeLsum_fmeasure_stderr": 0.0034258507511626024, "rougeLsum_precision": 0.14773643541948675, "rougeLsum_precision_stderr": 0.004697318924441809, "rougeLsum_recall": 0.3022253960230997, "rougeLsum_recall_stderr": 0.004739701115545982}}, "2": {"PALM_prompt": {"bleu": 0.87481699127398, "bleu_stderr": 0.059582002804657114, "rouge1_fmeasure": 0.2091307571621392, "rouge1_fmeasure_stderr": 0.00438758838932181, "rouge1_precision": 0.199466981650156, "rouge1_precision_stderr": 0.005680133053279315, "rouge1_recall": 0.36987834122063556, "rouge1_recall_stderr": 0.004990035433475184, "rouge2_fmeasure": 0.1125014602113282, "rouge2_fmeasure_stderr": 0.0030818329485191713, "rouge2_precision": 0.1097332483868951, "rouge2_precision_stderr": 0.003880260643680688, "rouge2_recall": 0.19989822315140882, "rouge2_recall_stderr": 0.003891995132588205, "rougeL_fmeasure": 0.18777604196623215, "rougeL_fmeasure_stderr": 0.0037686741944170416, "rougeL_precision": 0.17708218948059182, "rougeL_precision_stderr": 0.004959206002368757, "rougeL_recall": 0.34473442974785634, "rougeL_recall_stderr": 0.004653107328809787, "rougeLsum_fmeasure": 0.19268792375031765, "rougeLsum_fmeasure_stderr": 0.00391282714541826, "rougeLsum_precision": 0.18275767079289593, "rougeLsum_precision_stderr": 0.005158519349266604, "rougeLsum_recall": 0.3495549840141228, "rougeLsum_recall_stderr": 0.00470047440035223}}, "3": {"PALM_prompt": {"bleu": 0.917052522385174, "bleu_stderr": 0.04515993268519611, "rouge1_fmeasure": 0.21372711407619938, "rouge1_fmeasure_stderr": 0.004343895044489938, "rouge1_precision": 0.20888904165496103, "rouge1_precision_stderr": 0.005880585457367179, "rouge1_recall": 0.37814776640994463, "rouge1_recall_stderr": 0.004965150831364198, "rouge2_fmeasure": 0.11478946258154298, "rouge2_fmeasure_stderr": 0.003021059673039041, "rouge2_precision": 0.1168574620986129, "rouge2_precision_stderr": 0.004100382644887659, "rouge2_recall": 0.2033520821796969, "rouge2_recall_stderr": 0.00383298408380243, "rougeL_fmeasure": 0.19232862909601875, "rougeL_fmeasure_stderr": 0.0036812800257055574, "rougeL_precision": 0.18605811290952207, "rougeL_precision_stderr": 0.005155344402670075, "rougeL_recall": 0.35240104768920266, "rougeL_recall_stderr": 0.004560857947768766, "rougeLsum_fmeasure": 0.1967270339578505, "rougeLsum_fmeasure_stderr": 0.0038181408662667426, "rougeLsum_precision": 0.19147784744930493, "rougeLsum_precision_stderr": 0.0053469173736333826, "rougeLsum_recall": 0.3571044253500232, "rougeLsum_recall_stderr": 0.004630285578564224}}, "4": {"PALM_prompt": {"bleu": 1.0770812969732624, "bleu_stderr": 0.057150178907157206, "rouge1_fmeasure": 0.23556795007723105, "rouge1_fmeasure_stderr": 0.004533277698287348, "rouge1_precision": 0.22981660265238718, "rouge1_precision_stderr": 0.0060091983198828705, "rouge1_recall": 0.39885875556534467, "rouge1_recall_stderr": 0.004954969825753986, "rouge2_fmeasure": 0.12790770467752932, "rouge2_fmeasure_stderr": 0.0031721283738045283, "rouge2_precision": 0.12750847088352804, "rouge2_precision_stderr": 0.004054232891970157, "rouge2_recall": 0.21938886368566654, "rouge2_recall_stderr": 0.003991587985278903, "rougeL_fmeasure": 0.20953786614403025, "rougeL_fmeasure_stderr": 0.0038198330880285674, "rougeL_precision": 0.2007562947847461, "rougeL_precision_stderr": 0.005097419240769892, "rougeL_recall": 0.3691187056685199, "rougeL_recall_stderr": 0.004545072578815252, "rougeLsum_fmeasure": 0.21611300792836619, "rougeLsum_fmeasure_stderr": 0.003996019817069793, "rougeLsum_precision": 0.2090073734779189, "rougeLsum_precision_stderr": 0.0053710149363667935, "rougeLsum_recall": 0.3759411168888221, "rougeLsum_recall_stderr": 0.004622209838017953}}, "5": {"PALM_prompt": {"bleu": 1.1504048600305352, "bleu_stderr": 0.0755822004199928, "rouge1_fmeasure": 0.24203388956598934, "rouge1_fmeasure_stderr": 0.004720213442449205, "rouge1_precision": 0.24380689117778276, "rouge1_precision_stderr": 0.006371373530201665, "rouge1_recall": 0.3952201821451148, "rouge1_recall_stderr": 0.005033224197794555, "rouge2_fmeasure": 0.13474976693002946, "rouge2_fmeasure_stderr": 0.003415392488984557, "rouge2_precision": 0.1418504594039488, "rouge2_precision_stderr": 0.004575582136553598, "rouge2_recall": 0.21936500213450158, "rouge2_recall_stderr": 0.004020020751334439, "rougeL_fmeasure": 0.21539137146469606, "rougeL_fmeasure_stderr": 0.004028386529865914, "rougeL_precision": 0.21441718014302874, "rougeL_precision_stderr": 0.005537317704553182, "rougeL_recall": 0.36454653195485753, "rougeL_recall_stderr": 0.004645366026599087, "rougeLsum_fmeasure": 0.222229551042884, "rougeLsum_fmeasure_stderr": 0.004199234047990457, "rougeLsum_precision": 0.22298936285767557, "rougeLsum_precision_stderr": 0.005807349708841443, "rougeLsum_recall": 0.37135465454110034, "rougeLsum_recall_stderr": 0.004699420865300012}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 3.0868626526463756, "bleu_stderr": 0.10334620651927755, "rouge1_fmeasure": 0.13547708241929612, "rouge1_fmeasure_stderr": 0.0026556069493799896, "rouge1_precision": 0.13065355742673593, "rouge1_precision_stderr": 0.003004776280579319, "rouge1_recall": 0.1786059794590456, "rouge1_recall_stderr": 0.003480497255310532, "rouge2_fmeasure": 0.0352234336154376, "rouge2_fmeasure_stderr": 0.0010798725770486046, "rouge2_precision": 0.03291780593276675, "rouge2_precision_stderr": 0.0010760829551416996, "rouge2_recall": 0.04663674589032859, "rouge2_recall_stderr": 0.001504992918818976, "rougeL_fmeasure": 0.10336654675385076, "rougeL_fmeasure_stderr": 0.001968971023431868, "rougeL_precision": 0.09987150235228559, "rougeL_precision_stderr": 0.0023685337113851174, "rougeL_recall": 0.13959473564876165, "rougeL_recall_stderr": 0.0027581190918920103, "rougeLsum_fmeasure": 0.12705867398208964, "rougeLsum_fmeasure_stderr": 0.002506787857940285, "rougeLsum_precision": 0.12288788441024788, "rougeLsum_precision_stderr": 0.0028720387887482485, "rougeLsum_recall": 0.16759622234914887, "rougeLsum_recall_stderr": 0.0032867178590762366}}, "1": {"tldr_en": {"bleu": 2.5514775583592173, "bleu_stderr": 0.11284261690010776, "rouge1_fmeasure": 0.16580650031794167, "rouge1_fmeasure_stderr": 0.0021193004105035614, "rouge1_precision": 0.19126686092152348, "rouge1_precision_stderr": 0.003082884423906421, "rouge1_recall": 0.2013507500089625, "rouge1_recall_stderr": 0.0029925213818961095, "rouge2_fmeasure": 0.035746404962808155, "rouge2_fmeasure_stderr": 0.0011192977008782524, "rouge2_precision": 0.044582320866862424, "rouge2_precision_stderr": 0.0018321118047088484, "rouge2_recall": 0.04442769282883079, "rouge2_recall_stderr": 0.0014782140733801531, "rougeL_fmeasure": 0.1274347165087169, "rougeL_fmeasure_stderr": 0.0015926173758816556, "rougeL_precision": 0.14988887413463797, "rougeL_precision_stderr": 0.002586002929489674, "rougeL_recall": 0.15563614848002308, "rougeL_recall_stderr": 0.0023345594461348924, "rougeLsum_fmeasure": 0.15656227686437413, "rougeLsum_fmeasure_stderr": 0.0019872168695675768, "rougeLsum_precision": 0.18119105294163743, "rougeLsum_precision_stderr": 0.002938341173461725, "rougeLsum_recall": 0.19011415446724061, "rougeLsum_recall_stderr": 0.002811944286183604}}, "2": {"tldr_en": {"bleu": 4.160843322149366, "bleu_stderr": 0.1298621943311205, "rouge1_fmeasure": 0.227527675745211, "rouge1_fmeasure_stderr": 0.002262727638760964, "rouge1_precision": 0.27518353034811777, "rouge1_precision_stderr": 0.003681886785126964, "rouge1_recall": 0.26994010795497553, "rouge1_recall_stderr": 0.003076759192772023, "rouge2_fmeasure": 0.06444156598905733, "rouge2_fmeasure_stderr": 0.001316135695695516, "rouge2_precision": 0.08493527530542791, "rouge2_precision_stderr": 0.002342907743897752, "rouge2_recall": 0.07562522541046568, "rouge2_recall_stderr": 0.0016670921194054875, "rougeL_fmeasure": 0.17356357206141904, "rougeL_fmeasure_stderr": 0.0017138094026774846, "rougeL_precision": 0.21378447499783487, "rougeL_precision_stderr": 0.003086057725507419, "rougeL_recall": 0.20706303996123673, "rougeL_recall_stderr": 0.002420883132061566, "rougeLsum_fmeasure": 0.21385148741041204, "rougeLsum_fmeasure_stderr": 0.0021335293397074695, "rougeLsum_precision": 0.2594677476381253, "rougeLsum_precision_stderr": 0.0035263951440822477, "rougeLsum_recall": 0.25382240907884707, "rougeLsum_recall_stderr": 0.0029022538349274}}, "3": {"tldr_en": {"bleu": 3.550269268318738, "bleu_stderr": 0.123976667805652, "rouge1_fmeasure": 0.19542809253461213, "rouge1_fmeasure_stderr": 0.002567437085860653, "rouge1_precision": 0.25197379414278026, "rouge1_precision_stderr": 0.004061114481390069, "rouge1_recall": 0.2241545120711431, "rouge1_recall_stderr": 0.003349453782690701, "rouge2_fmeasure": 0.055649339745118376, "rouge2_fmeasure_stderr": 0.0013436864732712206, "rouge2_precision": 0.07748731951936201, "rouge2_precision_stderr": 0.0024143985835070676, "rouge2_recall": 0.06396850914384923, "rouge2_recall_stderr": 0.001702594494628315, "rougeL_fmeasure": 0.1493802268370947, "rougeL_fmeasure_stderr": 0.001958022255827534, "rougeL_precision": 0.19629979985917206, "rougeL_precision_stderr": 0.00335622576475892, "rougeL_recall": 0.17220070773118773, "rougeL_recall_stderr": 0.0026273647249788665, "rougeLsum_fmeasure": 0.18360399651239948, "rougeLsum_fmeasure_stderr": 0.002422426009983074, "rougeLsum_precision": 0.2374958020095092, "rougeLsum_precision_stderr": 0.0038790177970112016, "rougeLsum_recall": 0.2104711045184697, "rougeLsum_recall_stderr": 0.0031578834852280147}}, "4": {"tldr_en": {"bleu": 0.18727975729565796, "bleu_stderr": 0.024743677554065382, "rouge1_fmeasure": 0.06369530530813045, "rouge1_fmeasure_stderr": 0.002225322220412946, "rouge1_precision": 0.08736393590844754, "rouge1_precision_stderr": 0.003320236333651585, "rouge1_recall": 0.07170734676784853, "rouge1_recall_stderr": 0.0026863536115591326, "rouge2_fmeasure": 0.018728966913288573, "rouge2_fmeasure_stderr": 0.0009803445596888972, "rouge2_precision": 0.026539844024013316, "rouge2_precision_stderr": 0.0016227872678637359, "rouge2_recall": 0.021485803029836836, "rouge2_recall_stderr": 0.0012234458979275497, "rougeL_fmeasure": 0.0499467876025126, "rougeL_fmeasure_stderr": 0.0017576059173842731, "rougeL_precision": 0.07026197905523278, "rougeL_precision_stderr": 0.002775489585285312, "rougeL_recall": 0.056214726744091746, "rougeL_recall_stderr": 0.0021292176604441565, "rougeLsum_fmeasure": 0.05976166581566339, "rougeLsum_fmeasure_stderr": 0.0020932837351865146, "rougeLsum_precision": 0.08238350525240551, "rougeLsum_precision_stderr": 0.00315393172421982, "rougeLsum_recall": 0.06716878885751017, "rougeLsum_recall_stderr": 0.002524004286911468}}, "5": {"tldr_en": {"bleu": 5.577157241826622e-13, "bleu_stderr": 1.1569346422902409e-11, "rouge1_fmeasure": 0.01043565444271075, "rouge1_fmeasure_stderr": 0.0010239464330389257, "rouge1_precision": 0.016308055800076632, "rouge1_precision_stderr": 0.0016664962689093485, "rouge1_recall": 0.011189655817252247, "rouge1_recall_stderr": 0.0011802896184760587, "rouge2_fmeasure": 0.003212219718708495, "rouge2_fmeasure_stderr": 0.00044504346236305713, "rouge2_precision": 0.004996639391364696, "rouge2_precision_stderr": 0.0007412751630573523, "rouge2_recall": 0.0035850495924414864, "rouge2_recall_stderr": 0.0005653334006649237, "rougeL_fmeasure": 0.008463888867128002, "rougeL_fmeasure_stderr": 0.0008498965362098245, "rougeL_precision": 0.013336917042039756, "rougeL_precision_stderr": 0.0013953475029781011, "rougeL_recall": 0.009180912567410115, "rougeL_recall_stderr": 0.001001081279334972, "rougeLsum_fmeasure": 0.009948416660924693, "rougeLsum_fmeasure_stderr": 0.000978895492286139, "rougeLsum_precision": 0.015695607203614483, "rougeLsum_precision_stderr": 0.0016199770379973, "rougeLsum_recall": 0.010692369328543307, "rougeLsum_recall_stderr": 0.0011385128958340267}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.1932091730586953, "bleu_stderr": 0.12204583279523666, "rouge1_fmeasure": 0.24046689564492205, "rouge1_fmeasure_stderr": 0.0026584396733124543, "rouge1_precision": 0.41337918705325344, "rouge1_precision_stderr": 0.0052760059693995, "rouge1_recall": 0.2309238228257254, "rouge1_recall_stderr": 0.0032774148724919335, "rouge2_fmeasure": 0.08042598706024275, "rouge2_fmeasure_stderr": 0.001263671869367697, "rouge2_precision": 0.23463707329236097, "rouge2_precision_stderr": 0.0062472430372265, "rouge2_recall": 0.07811161999290202, "rouge2_recall_stderr": 0.001551040325003802, "rougeL_fmeasure": 0.18994590486023305, "rougeL_fmeasure_stderr": 0.0016519612628498022, "rougeL_precision": 0.3599428455550758, "rougeL_precision_stderr": 0.005381765248981534, "rougeL_recall": 0.179931821205033, "rougeL_recall_stderr": 0.002310618989620828, "rougeLsum_fmeasure": 0.2231487737804936, "rougeLsum_fmeasure_stderr": 0.0024418581547524815, "rougeLsum_precision": 0.39508222499421797, "rougeLsum_precision_stderr": 0.005341940324256954, "rougeLsum_recall": 0.21269354787389044, "rougeLsum_recall_stderr": 0.003034851683547011}}, "1": {"generate_text_restaurant": {"bleu": 12.375900606698423, "bleu_stderr": 0.19606823444326352, "rouge1_fmeasure": 0.48293554827595836, "rouge1_fmeasure_stderr": 0.0023435879912405283, "rouge1_precision": 0.597454575764657, "rouge1_precision_stderr": 0.0032329061443020345, "rouge1_recall": 0.4434643604110928, "rouge1_recall_stderr": 0.0029889149083277117, "rouge2_fmeasure": 0.23118450232385498, "rouge2_fmeasure_stderr": 0.0020415023824907675, "rouge2_precision": 0.2910060712314691, "rouge2_precision_stderr": 0.0027476178598673066, "rouge2_recall": 0.21176179817940102, "rouge2_recall_stderr": 0.0021589328308703566, "rougeL_fmeasure": 0.3518994031108773, "rougeL_fmeasure_stderr": 0.0021201781748616372, "rougeL_precision": 0.43888436474875336, "rougeL_precision_stderr": 0.0030457086517011057, "rougeL_recall": 0.3220136107735307, "rougeL_recall_stderr": 0.0024545810076108667, "rougeLsum_fmeasure": 0.39555654566655024, "rougeLsum_fmeasure_stderr": 0.0023470162024293896, "rougeLsum_precision": 0.4908609045585646, "rougeLsum_precision_stderr": 0.003223896911814539, "rougeLsum_recall": 0.3626278022836946, "rougeLsum_recall_stderr": 0.0027433883081054507}}, "2": {"generate_text_restaurant": {"bleu": 14.369325725727359, "bleu_stderr": 0.16247647633686452, "rouge1_fmeasure": 0.5050896808334919, "rouge1_fmeasure_stderr": 0.0022820908410676275, "rouge1_precision": 0.6089651864273457, "rouge1_precision_stderr": 0.003198435500725766, "rouge1_recall": 0.4684512148933178, "rouge1_recall_stderr": 0.0029309075224427496, "rouge2_fmeasure": 0.25360971138111243, "rouge2_fmeasure_stderr": 0.0021297710933538033, "rouge2_precision": 0.31006692351264853, "rouge2_precision_stderr": 0.002823237008286986, "rouge2_recall": 0.23489561514574075, "rouge2_recall_stderr": 0.0022593518873055855, "rougeL_fmeasure": 0.3737642937216708, "rougeL_fmeasure_stderr": 0.002190131174828545, "rougeL_precision": 0.4528063450828851, "rougeL_precision_stderr": 0.0030647195241319818, "rougeL_recall": 0.34593233406974633, "rougeL_recall_stderr": 0.0025245059893910897, "rougeLsum_fmeasure": 0.4229068329404585, "rougeLsum_fmeasure_stderr": 0.0023792336336982954, "rougeLsum_precision": 0.5104756682075146, "rougeLsum_precision_stderr": 0.0032393352108801977, "rougeLsum_recall": 0.39199556131381524, "rougeLsum_recall_stderr": 0.0027868585721285742}}, "3": {"generate_text_restaurant": {"bleu": 15.229236548887188, "bleu_stderr": 0.1893298480282073, "rouge1_fmeasure": 0.5094328850058195, "rouge1_fmeasure_stderr": 0.0023113991646470547, "rouge1_precision": 0.6019638812498068, "rouge1_precision_stderr": 0.003154041355931588, "rouge1_recall": 0.4769354854853059, "rouge1_recall_stderr": 0.002958348996182155, "rouge2_fmeasure": 0.2589859331129998, "rouge2_fmeasure_stderr": 0.0021744147637838404, "rouge2_precision": 0.30884118560646173, "rouge2_precision_stderr": 0.002746739827702377, "rouge2_recall": 0.24294392646302612, "rouge2_recall_stderr": 0.002367727486010716, "rougeL_fmeasure": 0.3764193148894467, "rougeL_fmeasure_stderr": 0.002186561194132359, "rougeL_precision": 0.4461308937388599, "rougeL_precision_stderr": 0.0029602114104730864, "rougeL_recall": 0.3521903496319005, "rougeL_recall_stderr": 0.002557452362321216, "rougeLsum_fmeasure": 0.428648276943535, "rougeLsum_fmeasure_stderr": 0.00241888010522652, "rougeLsum_precision": 0.5062370391801149, "rougeLsum_precision_stderr": 0.003164925596063054, "rougeLsum_recall": 0.4014335787816741, "rougeLsum_recall_stderr": 0.002844420919383205}}, "4": {"generate_text_restaurant": {"bleu": 15.5976388656912, "bleu_stderr": 0.21552554445196853, "rouge1_fmeasure": 0.5121277417214488, "rouge1_fmeasure_stderr": 0.0023435839886818115, "rouge1_precision": 0.5995295914989719, "rouge1_precision_stderr": 0.003160879006436628, "rouge1_recall": 0.48298984835020653, "rouge1_recall_stderr": 0.002984598478781118, "rouge2_fmeasure": 0.2613413893640116, "rouge2_fmeasure_stderr": 0.0022244910107010367, "rouge2_precision": 0.30839025650068297, "rouge2_precision_stderr": 0.002750823115926885, "rouge2_recall": 0.24686659194192842, "rouge2_recall_stderr": 0.0024081433509198598, "rougeL_fmeasure": 0.37763811148528253, "rougeL_fmeasure_stderr": 0.002256869338791484, "rougeL_precision": 0.4436724332909479, "rougeL_precision_stderr": 0.0029870463284766544, "rougeL_recall": 0.35568244613898303, "rougeL_recall_stderr": 0.0026122906695223214, "rougeLsum_fmeasure": 0.43177980537098043, "rougeLsum_fmeasure_stderr": 0.0024732350477868438, "rougeLsum_precision": 0.5052162393739371, "rougeLsum_precision_stderr": 0.0031736753796748323, "rougeLsum_recall": 0.4072510388482175, "rougeLsum_recall_stderr": 0.0029018213850476}}, "5": {"generate_text_restaurant": {"bleu": 15.687395974066053, "bleu_stderr": 0.20622943608143518, "rouge1_fmeasure": 0.5141607410794193, "rouge1_fmeasure_stderr": 0.0022659587311288916, "rouge1_precision": 0.6000616265181339, "rouge1_precision_stderr": 0.003133461735181107, "rouge1_recall": 0.4841217228002453, "rouge1_recall_stderr": 0.002858501032563205, "rouge2_fmeasure": 0.26306733403610516, "rouge2_fmeasure_stderr": 0.002177347942982272, "rouge2_precision": 0.3102090370387057, "rouge2_precision_stderr": 0.0027631962681951977, "rouge2_recall": 0.24783518322106535, "rouge2_recall_stderr": 0.002322540306661216, "rougeL_fmeasure": 0.3806224317408668, "rougeL_fmeasure_stderr": 0.00219416512525324, "rougeL_precision": 0.44553889899086446, "rougeL_precision_stderr": 0.0029429390968387604, "rougeL_recall": 0.35816627026075004, "rougeL_recall_stderr": 0.002525260477177778, "rougeLsum_fmeasure": 0.43430558211603404, "rougeLsum_fmeasure_stderr": 0.0024106090546087014, "rougeLsum_precision": 0.5071492219163851, "rougeLsum_precision_stderr": 0.003169349814561261, "rougeLsum_recall": 0.40886465620272155, "rougeLsum_recall_stderr": 0.0027986322707706374}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.7872267963517905, "bleu_stderr": 0.10045358377651255, "rouge1_fmeasure": 0.19062853936311658, "rouge1_fmeasure_stderr": 0.0029532646040511964, "rouge1_precision": 0.13999986827582567, "rouge1_precision_stderr": 0.0022474794503698063, "rouge1_recall": 0.3146848228074294, "rouge1_recall_stderr": 0.004975343003506539, "rouge2_fmeasure": 0.04328828501096498, "rouge2_fmeasure_stderr": 0.001566248398707883, "rouge2_precision": 0.03163976089485668, "rouge2_precision_stderr": 0.0011861306107240704, "rouge2_recall": 0.07306747424744006, "rouge2_recall_stderr": 0.002675800285732193, "rougeL_fmeasure": 0.139911595168302, "rougeL_fmeasure_stderr": 0.0021985427116768684, "rougeL_precision": 0.10267416140662887, "rougeL_precision_stderr": 0.0016806410262001903, "rougeL_recall": 0.23218112607700236, "rougeL_recall_stderr": 0.003798988027549183, "rougeLsum_fmeasure": 0.15161941662067227, "rougeLsum_fmeasure_stderr": 0.0024594718320593506, "rougeLsum_precision": 0.11113494233445374, "rougeLsum_precision_stderr": 0.0018618965914790858, "rougeLsum_recall": 0.2517399727563149, "rougeLsum_recall_stderr": 0.004226741179316262}}, "1": {"article_DOC_summary": {"bleu": 2.0535765112268263, "bleu_stderr": 0.11972597685190434, "rouge1_fmeasure": 0.21134911198456563, "rouge1_fmeasure_stderr": 0.003029508784148872, "rouge1_precision": 0.198389498815838, "rouge1_precision_stderr": 0.0036200528310182637, "rouge1_recall": 0.27297271430803394, "rouge1_recall_stderr": 0.004198789938626886, "rouge2_fmeasure": 0.04534620252044498, "rouge2_fmeasure_stderr": 0.0017858055311589297, "rouge2_precision": 0.043045220973661015, "rouge2_precision_stderr": 0.0018919242792135235, "rouge2_recall": 0.0597065055598483, "rouge2_recall_stderr": 0.0023731544903471393, "rougeL_fmeasure": 0.16064653466424578, "rougeL_fmeasure_stderr": 0.0024169691348135945, "rougeL_precision": 0.15080384564385396, "rougeL_precision_stderr": 0.0028920115769217117, "rougeL_recall": 0.2086058984013953, "rougeL_recall_stderr": 0.0033970509765821025, "rougeLsum_fmeasure": 0.1633349245997364, "rougeLsum_fmeasure_stderr": 0.002468028232419068, "rougeLsum_precision": 0.15275988244015698, "rougeLsum_precision_stderr": 0.0028982643948039046, "rougeLsum_recall": 0.21320242569675277, "rougeLsum_recall_stderr": 0.0035810539802842534}}, "2": {"article_DOC_summary": {"bleu": 3.0737243948901085, "bleu_stderr": 0.20707696514444188, "rouge1_fmeasure": 0.2444857696168434, "rouge1_fmeasure_stderr": 0.0034588270505411917, "rouge1_precision": 0.2607761013228957, "rouge1_precision_stderr": 0.004351702830837392, "rouge1_recall": 0.2607885042839857, "rouge1_recall_stderr": 0.004018215060642568, "rouge2_fmeasure": 0.05852877798479666, "rouge2_fmeasure_stderr": 0.0023072565365240597, "rouge2_precision": 0.06344356630530605, "rouge2_precision_stderr": 0.0026820157377482148, "rouge2_recall": 0.06209709779478734, "rouge2_recall_stderr": 0.00246191601941908, "rougeL_fmeasure": 0.18609020665428283, "rougeL_fmeasure_stderr": 0.002889852915757798, "rougeL_precision": 0.19907633090246687, "rougeL_precision_stderr": 0.0036157515137942723, "rougeL_recall": 0.19814987180436297, "rougeL_recall_stderr": 0.003287204314003516, "rougeLsum_fmeasure": 0.18820566058762211, "rougeLsum_fmeasure_stderr": 0.002901279776307625, "rougeLsum_precision": 0.20074551849299913, "rougeLsum_precision_stderr": 0.0036053665449958016, "rougeLsum_recall": 0.20155142761693595, "rougeLsum_recall_stderr": 0.003414189551327218}}, "3": {"article_DOC_summary": {"bleu": 3.3049815114104333, "bleu_stderr": 0.159904818084904, "rouge1_fmeasure": 0.23929443585701224, "rouge1_fmeasure_stderr": 0.0037168757909286886, "rouge1_precision": 0.26517071973676093, "rouge1_precision_stderr": 0.004671054222681604, "rouge1_recall": 0.24180770628858178, "rouge1_recall_stderr": 0.003940301667285207, "rouge2_fmeasure": 0.05849869572638163, "rouge2_fmeasure_stderr": 0.0023240414297435643, "rouge2_precision": 0.06573984745309881, "rouge2_precision_stderr": 0.0027796960958526426, "rouge2_recall": 0.05833153123977987, "rouge2_recall_stderr": 0.0023156387603366626, "rougeL_fmeasure": 0.1821366529554913, "rougeL_fmeasure_stderr": 0.0030775956513009466, "rougeL_precision": 0.203598693888488, "rougeL_precision_stderr": 0.0039976969321419975, "rougeL_recall": 0.18373116025857217, "rougeL_recall_stderr": 0.003225733424944883, "rougeLsum_fmeasure": 0.1835081526249607, "rougeLsum_fmeasure_stderr": 0.003085655347849612, "rougeLsum_precision": 0.20485233666950228, "rougeLsum_precision_stderr": 0.003995574052859589, "rougeLsum_recall": 0.1854022097149016, "rougeLsum_recall_stderr": 0.003252182345953945}}, "4": {"article_DOC_summary": {"bleu": 0.14790698979130756, "bleu_stderr": 0.04384286085154373, "rouge1_fmeasure": 0.06274888299344217, "rouge1_fmeasure_stderr": 0.00368347747261438, "rouge1_precision": 0.07320382597863412, "rouge1_precision_stderr": 0.004382339503654509, "rouge1_recall": 0.06154298544226265, "rouge1_recall_stderr": 0.0037288317198754368, "rouge2_fmeasure": 0.01636894923247962, "rouge2_fmeasure_stderr": 0.0015408484981787023, "rouge2_precision": 0.018885269050898535, "rouge2_precision_stderr": 0.0018067685701354235, "rouge2_recall": 0.016055891153780352, "rouge2_recall_stderr": 0.0015325179921096511, "rougeL_fmeasure": 0.04906837355261336, "rougeL_fmeasure_stderr": 0.002968163725283441, "rougeL_precision": 0.057742538854948004, "rougeL_precision_stderr": 0.003573097680574354, "rougeL_recall": 0.0479152284127263, "rougeL_recall_stderr": 0.0029805271711328637, "rougeLsum_fmeasure": 0.04921620084471268, "rougeLsum_fmeasure_stderr": 0.0029754368968071495, "rougeLsum_precision": 0.05793234813776215, "rougeLsum_precision_stderr": 0.0035819297580720103, "rougeLsum_recall": 0.04803187451993844, "rougeLsum_recall_stderr": 0.0029874284500312834}}, "5": {"article_DOC_summary": {"bleu": 1.0193423208210636e-40, "bleu_stderr": 1.482688743257922e-35, "rouge1_fmeasure": 0.0021324501644318945, "rouge1_fmeasure_stderr": 0.0005815454829014453, "rouge1_precision": 0.002313901471635568, "rouge1_precision_stderr": 0.0006326582061365616, "rouge1_recall": 0.0020282790639805184, "rouge1_recall_stderr": 0.0005520696211086723, "rouge2_fmeasure": 0.0003960521179313103, "rouge2_fmeasure_stderr": 0.00017805195394425088, "rouge2_precision": 0.0004152380160032476, "rouge2_precision_stderr": 0.00018397799224345848, "rouge2_recall": 0.00038092195139919884, "rouge2_recall_stderr": 0.00017352719125781168, "rougeL_fmeasure": 0.0018203546217228452, "rougeL_fmeasure_stderr": 0.0005047529402091446, "rougeL_precision": 0.0019597074784293666, "rougeL_precision_stderr": 0.0005429844277966419, "rougeL_recall": 0.0017448307905115785, "rougeL_recall_stderr": 0.00048409378403815574, "rougeLsum_fmeasure": 0.0019229642762192925, "rougeLsum_fmeasure_stderr": 0.0005389418518153393, "rougeLsum_precision": 0.0020637585328133834, "rougeLsum_precision_stderr": 0.0005760004715550667, "rougeLsum_recall": 0.0018460791228919861, "rougeLsum_recall_stderr": 0.0005185190358660706}}}}
4b284b42boscar/evaluation/rankeval/4b284b42boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.343,
5
- "acc_stderr": 0.015019206922356951
6
- },
7
- "anli_r2": {
8
- "acc": 0.344,
9
- "acc_stderr": 0.015029633724408945
10
- },
11
- "anli_r3": {
12
- "acc": 0.3491666666666667,
13
- "acc_stderr": 0.013767075395077244
14
- },
15
- "cb": {
16
- "acc": 0.32142857142857145,
17
- "acc_stderr": 0.06297362289056341,
18
- "f1": 0.1884169884169884
19
- },
20
- "copa": {
21
- "acc": 0.77,
22
- "acc_stderr": 0.04229525846816506
23
- },
24
- "hellaswag": {
25
- "acc": 0.40818562039434375,
26
- "acc_stderr": 0.0049049335002558855,
27
- "acc_norm": 0.5161322445727943,
28
- "acc_norm_stderr": 0.004987183560792757
29
- },
30
- "rte": {
31
- "acc": 0.5415162454873647,
32
- "acc_stderr": 0.029992535385373317
33
- },
34
- "winogrande": {
35
- "acc": 0.5540647198105761,
36
- "acc_stderr": 0.01397009348233069
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6841261357562801,
40
- "acc_stderr": 0.01074989282701111
41
- },
42
- "boolq": {
43
- "acc": 0.5568807339449541,
44
- "acc_stderr": 0.008688282882073796
45
- },
46
- "arc_easy": {
47
- "acc": 0.5736531986531986,
48
- "acc_stderr": 0.010147858603835136,
49
- "acc_norm": 0.5113636363636364,
50
- "acc_norm_stderr": 0.010257133441117103
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2525597269624573,
54
- "acc_stderr": 0.012696728980207708,
55
- "acc_norm": 0.28242320819112626,
56
- "acc_norm_stderr": 0.013155456884097224
57
- },
58
- "sciq": {
59
- "acc": 0.855,
60
- "acc_stderr": 0.011139977517890148,
61
- "acc_norm": 0.772,
62
- "acc_norm_stderr": 0.013273740700804474
63
- },
64
- "piqa": {
65
- "acc": 0.7247007616974973,
66
- "acc_stderr": 0.01042142927736953,
67
- "acc_norm": 0.7323177366702938,
68
- "acc_norm_stderr": 0.010330111189370418
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b42boscar/evaluation/rankeval/4b284b42boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.35,
5
- "acc_stderr": 0.015090650341444233
6
- },
7
- "anli_r2": {
8
- "acc": 0.338,
9
- "acc_stderr": 0.014965960710224489
10
- },
11
- "anli_r3": {
12
- "acc": 0.345,
13
- "acc_stderr": 0.013728421539454881
14
- },
15
- "cb": {
16
- "acc": 0.35714285714285715,
17
- "acc_stderr": 0.0646095738380922,
18
- "f1": 0.263246425567704
19
- },
20
- "copa": {
21
- "acc": 0.76,
22
- "acc_stderr": 0.04292346959909283
23
- },
24
- "hellaswag": {
25
- "acc": 0.40599482174865564,
26
- "acc_stderr": 0.004900798868048131,
27
- "acc_norm": 0.5247958573989245,
28
- "acc_norm_stderr": 0.004983641854351151
29
- },
30
- "rte": {
31
- "acc": 0.5776173285198556,
32
- "acc_stderr": 0.029731622646495887
33
- },
34
- "winogrande": {
35
- "acc": 0.5564325177584846,
36
- "acc_stderr": 0.0139626949076204
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6755745590593266,
40
- "acc_stderr": 0.010826131344990888
41
- },
42
- "boolq": {
43
- "acc": 0.5464831804281346,
44
- "acc_stderr": 0.008707182331111644
45
- },
46
- "arc_easy": {
47
- "acc": 0.6035353535353535,
48
- "acc_stderr": 0.010037412763064524,
49
- "acc_norm": 0.5757575757575758,
50
- "acc_norm_stderr": 0.010141333654958562
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27047781569965873,
54
- "acc_stderr": 0.012980954547659554,
55
- "acc_norm": 0.30204778156996587,
56
- "acc_norm_stderr": 0.013417519144716412
57
- },
58
- "sciq": {
59
- "acc": 0.897,
60
- "acc_stderr": 0.009616833339695796,
61
- "acc_norm": 0.893,
62
- "acc_norm_stderr": 0.009779910359847169
63
- },
64
- "piqa": {
65
- "acc": 0.7290533188248096,
66
- "acc_stderr": 0.010369718937426843,
67
- "acc_norm": 0.7301414581066377,
68
- "acc_norm_stderr": 0.010356595421852188
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b42boscar/evaluation/rankeval/4b284b42boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.325,
5
- "acc_stderr": 0.014818724459095524
6
- },
7
- "anli_r2": {
8
- "acc": 0.338,
9
- "acc_stderr": 0.014965960710224479
10
- },
11
- "anli_r3": {
12
- "acc": 0.3225,
13
- "acc_stderr": 0.013499258621103247
14
- },
15
- "cb": {
16
- "acc": 0.375,
17
- "acc_stderr": 0.06527912098338669,
18
- "f1": 0.2754385964912281
19
- },
20
- "copa": {
21
- "acc": 0.74,
22
- "acc_stderr": 0.04408440022768079
23
- },
24
- "hellaswag": {
25
- "acc": 0.4095797649870544,
26
- "acc_stderr": 0.004907512103128348,
27
- "acc_norm": 0.5251941844254132,
28
- "acc_norm_stderr": 0.004983442888677763
29
- },
30
- "rte": {
31
- "acc": 0.5306859205776173,
32
- "acc_stderr": 0.030039730592197816
33
- },
34
- "winogrande": {
35
- "acc": 0.5548539857932123,
36
- "acc_stderr": 0.013967662954355491
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6761090326028861,
40
- "acc_stderr": 0.010821488046867108
41
- },
42
- "boolq": {
43
- "acc": 0.572782874617737,
44
- "acc_stderr": 0.008651907722486108
45
- },
46
- "arc_easy": {
47
- "acc": 0.601010101010101,
48
- "acc_stderr": 0.01004824068379877,
49
- "acc_norm": 0.5904882154882155,
50
- "acc_norm_stderr": 0.010090368160990062
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2773037542662116,
54
- "acc_stderr": 0.013082095839059374,
55
- "acc_norm": 0.29948805460750855,
56
- "acc_norm_stderr": 0.013385021637313572
57
- },
58
- "sciq": {
59
- "acc": 0.908,
60
- "acc_stderr": 0.009144376393151108,
61
- "acc_norm": 0.905,
62
- "acc_norm_stderr": 0.009276910103103313
63
- },
64
- "piqa": {
65
- "acc": 0.7274211099020674,
66
- "acc_stderr": 0.010389256803296021,
67
- "acc_norm": 0.7241566920565833,
68
- "acc_norm_stderr": 0.01042780550272912
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b42boscar/evaluation/rankeval/4b284b42boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.329,
5
- "acc_stderr": 0.01486539538592837
6
- },
7
- "anli_r2": {
8
- "acc": 0.351,
9
- "acc_stderr": 0.015100563798316402
10
- },
11
- "anli_r3": {
12
- "acc": 0.35,
13
- "acc_stderr": 0.013774667009018558
14
- },
15
- "cb": {
16
- "acc": 0.5357142857142857,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.48858858858858856
19
- },
20
- "copa": {
21
- "acc": 0.75,
22
- "acc_stderr": 0.04351941398892446
23
- },
24
- "hellaswag": {
25
- "acc": 0.40748854809798846,
26
- "acc_stderr": 0.004903628887264533,
27
- "acc_norm": 0.5296753634734117,
28
- "acc_norm_stderr": 0.004980985384152898
29
- },
30
- "rte": {
31
- "acc": 0.4981949458483754,
32
- "acc_stderr": 0.030096267148976626
33
- },
34
- "winogrande": {
35
- "acc": 0.5485398579321231,
36
- "acc_stderr": 0.01398611030101776
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6798503474078034,
40
- "acc_stderr": 0.010788532546733105
41
- },
42
- "boolq": {
43
- "acc": 0.5737003058103975,
44
- "acc_stderr": 0.008649531625805677
45
- },
46
- "arc_easy": {
47
- "acc": 0.6014309764309764,
48
- "acc_stderr": 0.01004645540047794,
49
- "acc_norm": 0.5883838383838383,
50
- "acc_norm_stderr": 0.010098218646714906
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26109215017064846,
54
- "acc_stderr": 0.01283552390947384,
55
- "acc_norm": 0.2935153583617747,
56
- "acc_norm_stderr": 0.01330725044494112
57
- },
58
- "sciq": {
59
- "acc": 0.914,
60
- "acc_stderr": 0.008870325962594766,
61
- "acc_norm": 0.919,
62
- "acc_norm_stderr": 0.008632121032139986
63
- },
64
- "piqa": {
65
- "acc": 0.7334058759521219,
66
- "acc_stderr": 0.010316749863541367,
67
- "acc_norm": 0.7431991294885746,
68
- "acc_norm_stderr": 0.010192864802278039
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b42boscar/evaluation/rankeval/4b284b42boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.325,
5
- "acc_stderr": 0.014818724459095526
6
- },
7
- "anli_r2": {
8
- "acc": 0.348,
9
- "acc_stderr": 0.01507060460376841
10
- },
11
- "anli_r3": {
12
- "acc": 0.3525,
13
- "acc_stderr": 0.013797164918918359
14
- },
15
- "cb": {
16
- "acc": 0.5178571428571429,
17
- "acc_stderr": 0.06737697508644647,
18
- "f1": 0.406816186447442
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.04560480215720684
23
- },
24
- "hellaswag": {
25
- "acc": 0.4099780920135431,
26
- "acc_stderr": 0.0049082413543102095,
27
- "acc_norm": 0.5292770364469229,
28
- "acc_norm_stderr": 0.00498122013588233
29
- },
30
- "rte": {
31
- "acc": 0.48014440433212996,
32
- "acc_stderr": 0.0300727231673172
33
- },
34
- "winogrande": {
35
- "acc": 0.5548539857932123,
36
- "acc_stderr": 0.013967662954355493
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6830571886691609,
40
- "acc_stderr": 0.01075965095145212
41
- },
42
- "boolq": {
43
- "acc": 0.5636085626911315,
44
- "acc_stderr": 0.008674000467432073
45
- },
46
- "arc_easy": {
47
- "acc": 0.601010101010101,
48
- "acc_stderr": 0.01004824068379876,
49
- "acc_norm": 0.5841750841750841,
50
- "acc_norm_stderr": 0.01011334824464787
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2790102389078498,
54
- "acc_stderr": 0.013106784883601333,
55
- "acc_norm": 0.29692832764505117,
56
- "acc_norm_stderr": 0.013352025976725222
57
- },
58
- "sciq": {
59
- "acc": 0.925,
60
- "acc_stderr": 0.008333333333333352,
61
- "acc_norm": 0.917,
62
- "acc_norm_stderr": 0.00872852720607479
63
- },
64
- "piqa": {
65
- "acc": 0.7285092491838956,
66
- "acc_stderr": 0.010376251176596137,
67
- "acc_norm": 0.733949945593036,
68
- "acc_norm_stderr": 0.01031003926335282
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b42boscar/evaluation/rankeval/4b284b42boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.332,
5
- "acc_stderr": 0.014899597242811476
6
- },
7
- "anli_r2": {
8
- "acc": 0.353,
9
- "acc_stderr": 0.01512017260548369
10
- },
11
- "anli_r3": {
12
- "acc": 0.3375,
13
- "acc_stderr": 0.013655897185463652
14
- },
15
- "cb": {
16
- "acc": 0.5178571428571429,
17
- "acc_stderr": 0.06737697508644647,
18
- "f1": 0.40404606286959227
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.045604802157206845
23
- },
24
- "hellaswag": {
25
- "acc": 0.4084843656642103,
26
- "acc_stderr": 0.0049054894940050746,
27
- "acc_norm": 0.5316669986058554,
28
- "acc_norm_stderr": 0.004979763862134992
29
- },
30
- "rte": {
31
- "acc": 0.48736462093862815,
32
- "acc_stderr": 0.030086851767188564
33
- },
34
- "winogrande": {
35
- "acc": 0.5438042620363063,
36
- "acc_stderr": 0.013998453610924324
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6857295563869589,
40
- "acc_stderr": 0.010735132285108171
41
- },
42
- "boolq": {
43
- "acc": 0.5703363914373089,
44
- "acc_stderr": 0.00865809540849789
45
- },
46
- "arc_easy": {
47
- "acc": 0.6064814814814815,
48
- "acc_stderr": 0.010024426884292555,
49
- "acc_norm": 0.601010101010101,
50
- "acc_norm_stderr": 0.010048240683798747
51
- },
52
- "arc_challenge": {
53
- "acc": 0.28071672354948807,
54
- "acc_stderr": 0.013131238126975572,
55
- "acc_norm": 0.29692832764505117,
56
- "acc_norm_stderr": 0.013352025976725223
57
- },
58
- "sciq": {
59
- "acc": 0.919,
60
- "acc_stderr": 0.008632121032139985,
61
- "acc_norm": 0.915,
62
- "acc_norm_stderr": 0.008823426366942324
63
- },
64
- "piqa": {
65
- "acc": 0.7230685527747551,
66
- "acc_stderr": 0.010440499969334535,
67
- "acc_norm": 0.7257889009793254,
68
- "acc_norm_stderr": 0.010408618664933384
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84boscar/evaluation/generation/merged.csv CHANGED
@@ -1 +1,53 @@
1
  dataset,fewshots,prompt,metric,value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.11366020920928965
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.11366020920928965
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.21840708978516402
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.21840708978516402
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.24661876395459947
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.24661876395459947
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2590847138395995
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2590847138395995
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.26342970509252306
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.26342970509252306
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.265260656307001
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.265260656307001
14
+ e2e_nlg_cleaned,5,average,multiple,0.2277435230313628
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.05104998297888022
16
+ gem_xsum,0,median,rouge2_fmeasure,0.05104998297888022
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.041503622574202585
18
+ gem_xsum,1,median,rouge2_fmeasure,0.041503622574202585
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04934524005211476
20
+ gem_xsum,2,median,rouge2_fmeasure,0.04934524005211476
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.047001131672013766
22
+ gem_xsum,3,median,rouge2_fmeasure,0.047001131672013766
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.014128424244693396
24
+ gem_xsum,4,median,rouge2_fmeasure,0.014128424244693396
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003328471473444539
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0003328471473444539
27
+ gem_xsum,5,average,multiple,0.033893541444874865
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04533088635900367
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.04533088635900367
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0707256674518147
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.0707256674518147
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.09269395791934168
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.09269395791934168
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.10613152749319005
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.10613152749319005
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.10765686386731786
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.10765686386731786
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.11687710078916223
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.11687710078916223
40
+ web_nlg_en,5,average,multiple,0.08990266731330503
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03664384349201909
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03664384349201909
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.057131255472237095
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.057131255472237095
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.07316494206446221
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.07316494206446221
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.061844393074462746
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.061844393074462746
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.019499194268273016
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.019499194268273016
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0030060031859358545
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0030060031859358545
53
+ wiki_lingua_en,5,average,multiple,0.041881605259565004
4b284b84boscar/evaluation/generation/merged.json CHANGED
@@ -1 +1 @@
1
- {}
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3544571642681627, "bleu_stderr": 0.03407992444539609, "rouge1_fmeasure": 0.10020082045945918, "rouge1_fmeasure_stderr": 0.0019981291727618295, "rouge1_precision": 0.06761582940505043, "rouge1_precision_stderr": 0.001698628240993464, "rouge1_recall": 0.2835900568928793, "rouge1_recall_stderr": 0.0051423530685150995, "rouge2_fmeasure": 0.04533088635900367, "rouge2_fmeasure_stderr": 0.0011446673324507026, "rouge2_precision": 0.0298169184397825, "rouge2_precision_stderr": 0.0008893184105046417, "rouge2_recall": 0.1320847016852513, "rouge2_recall_stderr": 0.0030629028842595533, "rougeL_fmeasure": 0.09624293276078504, "rougeL_fmeasure_stderr": 0.001854737899845049, "rougeL_precision": 0.06475622065447084, "rougeL_precision_stderr": 0.001561020298095879, "rougeL_recall": 0.2752132257530675, "rougeL_recall_stderr": 0.004995344363048776, "rougeLsum_fmeasure": 0.09482991624917897, "rougeLsum_fmeasure_stderr": 0.00186076215487636, "rougeLsum_precision": 0.0640147635918145, "rougeLsum_precision_stderr": 0.001575258619523322, "rougeLsum_recall": 0.2681983425978889, "rougeLsum_recall_stderr": 0.004751089970308051}}, "1": {"PALM_prompt": {"bleu": 0.47989295807569715, "bleu_stderr": 0.0475397437178309, "rouge1_fmeasure": 0.14346476228827978, "rouge1_fmeasure_stderr": 0.0034815952103301458, "rouge1_precision": 0.11534724394991629, "rouge1_precision_stderr": 0.003827393590577779, "rouge1_recall": 0.3024780747165944, "rouge1_recall_stderr": 0.004967827146790581, "rouge2_fmeasure": 0.0707256674518147, "rouge2_fmeasure_stderr": 0.0022771008751978987, "rouge2_precision": 0.057998177860908384, "rouge2_precision_stderr": 0.0025743686817686295, "rouge2_recall": 0.15177356518097615, "rouge2_recall_stderr": 0.0034257177589283546, "rougeL_fmeasure": 0.13060275535305266, "rougeL_fmeasure_stderr": 0.0029918940675236053, "rougeL_precision": 0.10379078272811205, "rougeL_precision_stderr": 0.003315527005477984, "rougeL_recall": 0.28296523055686296, "rougeL_recall_stderr": 0.0045634431352543385, "rougeLsum_fmeasure": 0.13337871311137614, "rougeLsum_fmeasure_stderr": 0.0030768430402653476, "rougeLsum_precision": 0.10645036357421774, "rougeLsum_precision_stderr": 0.0034217554292272586, "rougeLsum_recall": 0.2866781623820036, "rougeLsum_recall_stderr": 0.0046215646233925805}}, "2": {"PALM_prompt": {"bleu": 0.666861601918144, "bleu_stderr": 0.026299047658385755, "rouge1_fmeasure": 0.17667190508199623, "rouge1_fmeasure_stderr": 0.00419053891681268, "rouge1_precision": 0.1547430839815294, "rouge1_precision_stderr": 0.004933321468949893, "rouge1_recall": 0.33478841436451773, "rouge1_recall_stderr": 0.005044795305603361, "rouge2_fmeasure": 0.09269395791934168, "rouge2_fmeasure_stderr": 0.002904096045529738, "rouge2_precision": 0.0836753208680069, "rouge2_precision_stderr": 0.0033827146122726165, "rouge2_recall": 0.17569109460452212, "rouge2_recall_stderr": 0.00376890003216499, "rougeL_fmeasure": 0.15988566101529644, "rougeL_fmeasure_stderr": 0.003641617023536087, "rougeL_precision": 0.13806190021996617, "rougeL_precision_stderr": 0.004291374157153462, "rougeL_recall": 0.31163920071713547, "rougeL_recall_stderr": 0.004598513995424206, "rougeLsum_fmeasure": 0.16376462584391532, "rougeLsum_fmeasure_stderr": 0.003764157053925168, "rougeLsum_precision": 0.14223938842195769, "rougeLsum_precision_stderr": 0.004446164564752725, "rougeLsum_recall": 0.31614503537746547, "rougeLsum_recall_stderr": 0.004685139330564806}}, "3": {"PALM_prompt": {"bleu": 0.7918596286549058, "bleu_stderr": 0.0505082941574267, "rouge1_fmeasure": 0.19582351163910977, "rouge1_fmeasure_stderr": 0.004516582135357114, "rouge1_precision": 0.17771262032081198, "rouge1_precision_stderr": 0.0054951572240111435, "rouge1_recall": 0.35311914964743174, "rouge1_recall_stderr": 0.005025197259419015, "rouge2_fmeasure": 0.10613152749319005, "rouge2_fmeasure_stderr": 0.0031819607620739436, "rouge2_precision": 0.10055990834825583, "rouge2_precision_stderr": 0.003908838679331289, "rouge2_recall": 0.18896553373557853, "rouge2_recall_stderr": 0.0038495013649217655, "rougeL_fmeasure": 0.1766613912727054, "rougeL_fmeasure_stderr": 0.003927425134752762, "rougeL_precision": 0.15835690076776687, "rougeL_precision_stderr": 0.004829192255619531, "rougeL_recall": 0.3282078865520143, "rougeL_recall_stderr": 0.004628481258729149, "rougeLsum_fmeasure": 0.18105604842604275, "rougeLsum_fmeasure_stderr": 0.004055025613896272, "rougeLsum_precision": 0.1631952722315149, "rougeLsum_precision_stderr": 0.005000959403714751, "rougeLsum_recall": 0.3330222709351221, "rougeLsum_recall_stderr": 0.00468575613523257}}, "4": {"PALM_prompt": {"bleu": 0.8540878153236116, "bleu_stderr": 0.04264209433083351, "rouge1_fmeasure": 0.19985774352152183, "rouge1_fmeasure_stderr": 0.004398277620735298, "rouge1_precision": 0.1824057188502487, "rouge1_precision_stderr": 0.005459901706601115, "rouge1_recall": 0.3648014741857513, "rouge1_recall_stderr": 0.004987141844898954, "rouge2_fmeasure": 0.10765686386731786, "rouge2_fmeasure_stderr": 0.0030775323292218954, "rouge2_precision": 0.10170190198280457, "rouge2_precision_stderr": 0.003802778312321268, "rouge2_recall": 0.1969832896256075, "rouge2_recall_stderr": 0.003898300177769583, "rougeL_fmeasure": 0.1790692812610404, "rougeL_fmeasure_stderr": 0.0037636080036965234, "rougeL_precision": 0.16138211318666076, "rougeL_precision_stderr": 0.004748850648521006, "rougeL_recall": 0.3387426515169364, "rougeL_recall_stderr": 0.004593853553959088, "rougeLsum_fmeasure": 0.18423836334787674, "rougeLsum_fmeasure_stderr": 0.003922969546378721, "rougeLsum_precision": 0.1672573483266801, "rougeLsum_precision_stderr": 0.0049642924071582155, "rougeLsum_recall": 0.34429093880762246, "rougeLsum_recall_stderr": 0.0046537632402660605}}, "5": {"PALM_prompt": {"bleu": 0.912001768808622, "bleu_stderr": 0.043254031735108306, "rouge1_fmeasure": 0.2092111019883349, "rouge1_fmeasure_stderr": 0.00460529861669814, "rouge1_precision": 0.19859783075889126, "rouge1_precision_stderr": 0.005919289804309579, "rouge1_recall": 0.36844512002888075, "rouge1_recall_stderr": 0.00505018826168294, "rouge2_fmeasure": 0.11687710078916223, "rouge2_fmeasure_stderr": 0.003314632884900664, "rouge2_precision": 0.11681205435376238, "rouge2_precision_stderr": 0.004339261455429453, "rouge2_recall": 0.20339458742529712, "rouge2_recall_stderr": 0.004013105858687102, "rougeL_fmeasure": 0.18815490160354775, "rougeL_fmeasure_stderr": 0.0039766285759192405, "rougeL_precision": 0.17708492516954327, "rougeL_precision_stderr": 0.005248751409406995, "rougeL_recall": 0.3425914027687954, "rougeL_recall_stderr": 0.004629793741986903, "rougeLsum_fmeasure": 0.19256627879558108, "rougeLsum_fmeasure_stderr": 0.004111657382117099, "rougeLsum_precision": 0.18241276096737613, "rougeLsum_precision_stderr": 0.0054404594348748435, "rougeLsum_recall": 0.3465669085097831, "rougeLsum_recall_stderr": 0.004652250039019875}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.1294656115380315, "bleu_stderr": 0.08608373097383684, "rouge1_fmeasure": 0.1681539808990386, "rouge1_fmeasure_stderr": 0.0022516781432784135, "rouge1_precision": 0.15239310002995923, "rouge1_precision_stderr": 0.002438796263618738, "rouge1_recall": 0.23281616338419592, "rouge1_recall_stderr": 0.003153441505325274, "rouge2_fmeasure": 0.03664384349201909, "rouge2_fmeasure_stderr": 0.0009444852473193031, "rouge2_precision": 0.032362606343306115, "rouge2_precision_stderr": 0.0008736804306871456, "rouge2_recall": 0.05277290954431128, "rouge2_recall_stderr": 0.0015535207326358358, "rougeL_fmeasure": 0.13117374932151898, "rougeL_fmeasure_stderr": 0.0016522658265504455, "rougeL_precision": 0.11810199150971258, "rougeL_precision_stderr": 0.0018590591408895979, "rougeL_recall": 0.18586688913934518, "rougeL_recall_stderr": 0.002536111883086673, "rougeLsum_fmeasure": 0.15416461307042828, "rougeLsum_fmeasure_stderr": 0.0020767003871997682, "rougeLsum_precision": 0.13989462081303852, "rougeLsum_precision_stderr": 0.0022791670344911403, "rougeLsum_recall": 0.21404920648714998, "rougeLsum_recall_stderr": 0.0029316839033584957}}, "1": {"tldr_en": {"bleu": 3.6252694677393182, "bleu_stderr": 0.11516306057568679, "rouge1_fmeasure": 0.2114035653216517, "rouge1_fmeasure_stderr": 0.0023246773063628615, "rouge1_precision": 0.2751823879663153, "rouge1_precision_stderr": 0.003881996275772322, "rouge1_recall": 0.2346682268668015, "rouge1_recall_stderr": 0.0030480805010752515, "rouge2_fmeasure": 0.057131255472237095, "rouge2_fmeasure_stderr": 0.0014085125325758205, "rouge2_precision": 0.07953360723989537, "rouge2_precision_stderr": 0.0023704352619084283, "rouge2_recall": 0.062365192117877284, "rouge2_recall_stderr": 0.0016572329309161098, "rougeL_fmeasure": 0.16173018381687715, "rougeL_fmeasure_stderr": 0.0017896383598985555, "rougeL_precision": 0.2144299891910755, "rougeL_precision_stderr": 0.0032459302277197937, "rougeL_recall": 0.17984636106508087, "rougeL_recall_stderr": 0.002382783710665373, "rougeLsum_fmeasure": 0.1991087226604085, "rougeLsum_fmeasure_stderr": 0.002183402133872785, "rougeLsum_precision": 0.2602893534943804, "rougeLsum_precision_stderr": 0.0037273296754777798, "rougeLsum_recall": 0.22087932595278062, "rougeLsum_recall_stderr": 0.002865409410302555}}, "2": {"tldr_en": {"bleu": 4.294396830160884, "bleu_stderr": 0.10510562701376282, "rouge1_fmeasure": 0.2415919978553291, "rouge1_fmeasure_stderr": 0.002275465997227811, "rouge1_precision": 0.3427708550618383, "rouge1_precision_stderr": 0.0039616432661503925, "rouge1_recall": 0.24353092504463666, "rouge1_recall_stderr": 0.0028368108328522975, "rouge2_fmeasure": 0.07316494206446221, "rouge2_fmeasure_stderr": 0.0014908416636519638, "rouge2_precision": 0.10981242271190325, "rouge2_precision_stderr": 0.002600254579986533, "rouge2_recall": 0.07299604533557322, "rouge2_recall_stderr": 0.0016818837531280511, "rougeL_fmeasure": 0.18830499327024094, "rougeL_fmeasure_stderr": 0.0018262223394083703, "rougeL_precision": 0.27052380671481613, "rougeL_precision_stderr": 0.003342423060950019, "rougeL_recall": 0.18975380555144886, "rougeL_recall_stderr": 0.002272746335013426, "rougeLsum_fmeasure": 0.22838437822057972, "rougeLsum_fmeasure_stderr": 0.0021677295739195527, "rougeLsum_precision": 0.3252291175905201, "rougeLsum_precision_stderr": 0.003832831712454371, "rougeLsum_recall": 0.2296884650140397, "rougeLsum_recall_stderr": 0.00267247029974853}}, "3": {"tldr_en": {"bleu": 2.9094995439916813, "bleu_stderr": 0.1020505721496997, "rouge1_fmeasure": 0.20299844928550523, "rouge1_fmeasure_stderr": 0.002687864968532052, "rouge1_precision": 0.3073965000868567, "rouge1_precision_stderr": 0.004532668529568856, "rouge1_recall": 0.19614307988052224, "rouge1_recall_stderr": 0.003080191271460992, "rouge2_fmeasure": 0.061844393074462746, "rouge2_fmeasure_stderr": 0.0014810001971562478, "rouge2_precision": 0.10017130648615265, "rouge2_precision_stderr": 0.002776734778018214, "rouge2_recall": 0.059152151232302004, "rouge2_recall_stderr": 0.0016020020538619843, "rougeL_fmeasure": 0.16011928521487392, "rougeL_fmeasure_stderr": 0.0021479359283275963, "rougeL_precision": 0.24589394056510158, "rougeL_precision_stderr": 0.0038062820175439435, "rougeL_recall": 0.15441816724846275, "rougeL_recall_stderr": 0.0024515259341510915, "rougeLsum_fmeasure": 0.19201708263502876, "rougeLsum_fmeasure_stderr": 0.002552311917295444, "rougeLsum_precision": 0.2921779950267841, "rougeLsum_precision_stderr": 0.004374364347847452, "rougeLsum_recall": 0.18494595132044844, "rougeLsum_recall_stderr": 0.002894976936245908}}, "4": {"tldr_en": {"bleu": 0.0372391578969955, "bleu_stderr": 0.006245634047152133, "rouge1_fmeasure": 0.06515084313802752, "rouge1_fmeasure_stderr": 0.0023301601298560937, "rouge1_precision": 0.1006627719845332, "rouge1_precision_stderr": 0.00368234304942406, "rouge1_recall": 0.06278686240158242, "rouge1_recall_stderr": 0.002429866089129886, "rouge2_fmeasure": 0.019499194268273016, "rouge2_fmeasure_stderr": 0.0010427160072790155, "rouge2_precision": 0.0324786744467653, "rouge2_precision_stderr": 0.0019107387952160683, "rouge2_recall": 0.018895906029094554, "rouge2_recall_stderr": 0.0011271396066362587, "rougeL_fmeasure": 0.0523000871859911, "rougeL_fmeasure_stderr": 0.0018885969396611308, "rougeL_precision": 0.0823028370721046, "rougeL_precision_stderr": 0.0031051929375046153, "rougeL_recall": 0.05042051526784097, "rougeL_recall_stderr": 0.0019878366229883104, "rougeLsum_fmeasure": 0.06128900015472603, "rougeLsum_fmeasure_stderr": 0.002200644241627669, "rougeLsum_precision": 0.0954288017222706, "rougeLsum_precision_stderr": 0.0035264644632323466, "rougeLsum_recall": 0.05883979010675358, "rougeLsum_recall_stderr": 0.002281234737704665}}, "5": {"tldr_en": {"bleu": 5.89885644425285e-18, "bleu_stderr": 2.573727805263297e-15, "rouge1_fmeasure": 0.009841170717213035, "rouge1_fmeasure_stderr": 0.0009774207438938338, "rouge1_precision": 0.016979177640177374, "rouge1_precision_stderr": 0.0017468463524776949, "rouge1_recall": 0.00933120437657692, "rouge1_recall_stderr": 0.0010188132703681308, "rouge2_fmeasure": 0.0030060031859358545, "rouge2_fmeasure_stderr": 0.0004356120556552399, "rouge2_precision": 0.00576381210470867, "rouge2_precision_stderr": 0.0009504290775495337, "rouge2_recall": 0.0028734319831093366, "rouge2_recall_stderr": 0.0005018903996908588, "rougeL_fmeasure": 0.008154099496089815, "rougeL_fmeasure_stderr": 0.0008271752653321118, "rougeL_precision": 0.0144844872691487, "rougeL_precision_stderr": 0.00156210488273622, "rougeL_recall": 0.00773643445858647, "rougeL_recall_stderr": 0.0008732235538470538, "rougeLsum_fmeasure": 0.009454743941186122, "rougeLsum_fmeasure_stderr": 0.0009383005713490151, "rougeLsum_precision": 0.01642901756992128, "rougeLsum_precision_stderr": 0.0016976705166155568, "rougeLsum_recall": 0.00896493050335643, "rougeLsum_recall_stderr": 0.0009870511987772768}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 6.333542682337677, "bleu_stderr": 0.10726439979644425, "rouge1_fmeasure": 0.2920852729385422, "rouge1_fmeasure_stderr": 0.002585917559501568, "rouge1_precision": 0.29450049619552965, "rouge1_precision_stderr": 0.003103286443917978, "rouge1_recall": 0.3531232647714125, "rouge1_recall_stderr": 0.0034741713736329807, "rouge2_fmeasure": 0.11366020920928965, "rouge2_fmeasure_stderr": 0.0016797755141133139, "rouge2_precision": 0.10697543573411866, "rouge2_precision_stderr": 0.0017227291388967737, "rouge2_recall": 0.14277215373452296, "rouge2_recall_stderr": 0.002226911641687255, "rougeL_fmeasure": 0.23998550760843057, "rougeL_fmeasure_stderr": 0.001884083170727271, "rougeL_precision": 0.24140263089446137, "rougeL_precision_stderr": 0.0024270505962341814, "rougeL_recall": 0.2922338485900498, "rougeL_recall_stderr": 0.0026699862026011607, "rougeLsum_fmeasure": 0.2615141040307818, "rougeLsum_fmeasure_stderr": 0.002531253935486819, "rougeLsum_precision": 0.26084716441716366, "rougeLsum_precision_stderr": 0.0028038576941327303, "rougeLsum_recall": 0.31762402755219166, "rougeLsum_recall_stderr": 0.0033985728561216327}}, "1": {"generate_text_restaurant": {"bleu": 11.806297456970103, "bleu_stderr": 0.14551133253272594, "rouge1_fmeasure": 0.4697116660037698, "rouge1_fmeasure_stderr": 0.0023603969460476203, "rouge1_precision": 0.5669733154538599, "rouge1_precision_stderr": 0.003166104003686594, "rouge1_recall": 0.44018022577601473, "rouge1_recall_stderr": 0.0030630676874763485, "rouge2_fmeasure": 0.21840708978516402, "rouge2_fmeasure_stderr": 0.0020602369546148685, "rouge2_precision": 0.26656594919484183, "rouge2_precision_stderr": 0.0026339804527222587, "rouge2_recall": 0.20480027468494613, "rouge2_recall_stderr": 0.0022153423750415576, "rougeL_fmeasure": 0.33677921954756435, "rougeL_fmeasure_stderr": 0.0020664076993835057, "rougeL_precision": 0.40943221799438445, "rougeL_precision_stderr": 0.0028646547374588187, "rougeL_recall": 0.31460122768740206, "rougeL_recall_stderr": 0.002459245774729076, "rougeLsum_fmeasure": 0.380038908785806, "rougeLsum_fmeasure_stderr": 0.00234012651128036, "rougeLsum_precision": 0.45952493691536156, "rougeLsum_precision_stderr": 0.003085366813391384, "rougeLsum_recall": 0.35573040920519855, "rougeLsum_recall_stderr": 0.0027873440001945633}}, "2": {"generate_text_restaurant": {"bleu": 14.008929768185011, "bleu_stderr": 0.22440533336667637, "rouge1_fmeasure": 0.4993958738165169, "rouge1_fmeasure_stderr": 0.002296786336619267, "rouge1_precision": 0.597056411617577, "rouge1_precision_stderr": 0.0031639255878135267, "rouge1_recall": 0.4669271432097342, "rouge1_recall_stderr": 0.0029680675695804874, "rouge2_fmeasure": 0.24661876395459947, "rouge2_fmeasure_stderr": 0.0021262352052310984, "rouge2_precision": 0.29845711503191086, "rouge2_precision_stderr": 0.002761178182657989, "rouge2_recall": 0.23063537341826645, "rouge2_recall_stderr": 0.0022864554379483197, "rougeL_fmeasure": 0.36623753572761475, "rougeL_fmeasure_stderr": 0.002138032094317196, "rougeL_precision": 0.4403342175867495, "rougeL_precision_stderr": 0.0029750295481156285, "rougeL_recall": 0.34158773617015814, "rougeL_recall_stderr": 0.002503496159612652, "rougeLsum_fmeasure": 0.4135155622957521, "rougeLsum_fmeasure_stderr": 0.0023712145913602815, "rougeLsum_precision": 0.4951601294309332, "rougeLsum_precision_stderr": 0.0031807126472612607, "rougeLsum_recall": 0.38625888194917374, "rougeLsum_recall_stderr": 0.002796003413056854}}, "3": {"generate_text_restaurant": {"bleu": 15.105211182371182, "bleu_stderr": 0.18451933366555634, "rouge1_fmeasure": 0.5090805906068294, "rouge1_fmeasure_stderr": 0.0023041964520437996, "rouge1_precision": 0.6051917320726051, "rouge1_precision_stderr": 0.003186903545451817, "rouge1_recall": 0.4753181139650836, "rouge1_recall_stderr": 0.0029509453548170993, "rouge2_fmeasure": 0.2590847138395995, "rouge2_fmeasure_stderr": 0.002198161898886385, "rouge2_precision": 0.31111017436171656, "rouge2_precision_stderr": 0.00280736353534738, "rouge2_recall": 0.24213349104334048, "rouge2_recall_stderr": 0.0023696571519957635, "rougeL_fmeasure": 0.376176843640381, "rougeL_fmeasure_stderr": 0.0021978084264396066, "rougeL_precision": 0.449516766264704, "rougeL_precision_stderr": 0.0030507269243126643, "rougeL_recall": 0.35056364105111965, "rougeL_recall_stderr": 0.0025410208161535013, "rougeLsum_fmeasure": 0.42660489724977463, "rougeLsum_fmeasure_stderr": 0.002422704060549194, "rougeLsum_precision": 0.5074249964876819, "rougeLsum_precision_stderr": 0.003227079004477513, "rougeLsum_recall": 0.39822024067404677, "rougeLsum_recall_stderr": 0.0028309325845659074}}, "4": {"generate_text_restaurant": {"bleu": 15.474611986847089, "bleu_stderr": 0.14973216958042582, "rouge1_fmeasure": 0.5131026297255403, "rouge1_fmeasure_stderr": 0.0023318809968242366, "rouge1_precision": 0.6085558629612119, "rouge1_precision_stderr": 0.0032072518394225774, "rouge1_recall": 0.47700323449919174, "rouge1_recall_stderr": 0.002886159999148568, "rouge2_fmeasure": 0.26342970509252306, "rouge2_fmeasure_stderr": 0.0022326066051085933, "rouge2_precision": 0.3159128148862684, "rouge2_precision_stderr": 0.002852006737172459, "rouge2_recall": 0.24477002962445538, "rouge2_recall_stderr": 0.0023552807653690055, "rougeL_fmeasure": 0.380814583402722, "rougeL_fmeasure_stderr": 0.0022426618892846866, "rougeL_precision": 0.45306373160845365, "rougeL_precision_stderr": 0.003027837117034534, "rougeL_recall": 0.3537262109331126, "rougeL_recall_stderr": 0.0025437784350140404, "rougeLsum_fmeasure": 0.4326211658968149, "rougeLsum_fmeasure_stderr": 0.002460044931103608, "rougeLsum_precision": 0.5130870759871387, "rougeLsum_precision_stderr": 0.0032394064012829364, "rougeLsum_recall": 0.40218523134077677, "rougeLsum_recall_stderr": 0.0028079212152934013}}, "5": {"generate_text_restaurant": {"bleu": 15.682367116615334, "bleu_stderr": 0.25213559215306164, "rouge1_fmeasure": 0.5145966431967747, "rouge1_fmeasure_stderr": 0.0022816662119259, "rouge1_precision": 0.6058475533177261, "rouge1_precision_stderr": 0.0031740863555628556, "rouge1_recall": 0.4800148399689924, "rouge1_recall_stderr": 0.002825932845056274, "rouge2_fmeasure": 0.265260656307001, "rouge2_fmeasure_stderr": 0.002214043917610533, "rouge2_precision": 0.31623066797668453, "rouge2_precision_stderr": 0.0028462039844017895, "rouge2_recall": 0.24702364656456957, "rouge2_recall_stderr": 0.0023166256664731376, "rougeL_fmeasure": 0.3813026321038327, "rougeL_fmeasure_stderr": 0.0022157680630405245, "rougeL_precision": 0.4504141035870135, "rougeL_precision_stderr": 0.0030013926360521865, "rougeL_recall": 0.3552692730806331, "rougeL_recall_stderr": 0.0025020557665565548, "rougeLsum_fmeasure": 0.43506423120687865, "rougeLsum_fmeasure_stderr": 0.002423656341895589, "rougeLsum_precision": 0.512482274282817, "rougeLsum_precision_stderr": 0.0032133729386090016, "rougeLsum_recall": 0.4057631051784554, "rougeLsum_recall_stderr": 0.0027687632042124708}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.018881442853279, "bleu_stderr": 0.1198347060919623, "rouge1_fmeasure": 0.2153075569633584, "rouge1_fmeasure_stderr": 0.0027773199670098106, "rouge1_precision": 0.1569943914968631, "rouge1_precision_stderr": 0.0021655546132824436, "rouge1_recall": 0.36095324775264914, "rouge1_recall_stderr": 0.004668719577415661, "rouge2_fmeasure": 0.05104998297888022, "rouge2_fmeasure_stderr": 0.0015800598536466126, "rouge2_precision": 0.03708686838166038, "rouge2_precision_stderr": 0.0012602221441622482, "rouge2_recall": 0.08846965963220199, "rouge2_recall_stderr": 0.00283080199253979, "rougeL_fmeasure": 0.15509756403131228, "rougeL_fmeasure_stderr": 0.0020293672794457597, "rougeL_precision": 0.11303186631574795, "rougeL_precision_stderr": 0.0016203168039502166, "rougeL_recall": 0.26188641959340075, "rougeL_recall_stderr": 0.003589938071974499, "rougeLsum_fmeasure": 0.1703530812291028, "rougeLsum_fmeasure_stderr": 0.0023104790446865997, "rougeLsum_precision": 0.12408813760298806, "rougeLsum_precision_stderr": 0.0018120535311226468, "rougeLsum_recall": 0.2872247357242771, "rougeLsum_recall_stderr": 0.004029862587537261}}, "1": {"article_DOC_summary": {"bleu": 1.8858887147145498, "bleu_stderr": 0.14401168622655466, "rouge1_fmeasure": 0.20505806920550534, "rouge1_fmeasure_stderr": 0.0032842375202117528, "rouge1_precision": 0.20579494303326157, "rouge1_precision_stderr": 0.004051163060208727, "rouge1_recall": 0.24161408729335127, "rouge1_recall_stderr": 0.0038629846635870923, "rouge2_fmeasure": 0.041503622574202585, "rouge2_fmeasure_stderr": 0.0019316323685802709, "rouge2_precision": 0.04302422220208535, "rouge2_precision_stderr": 0.0022086413426371967, "rouge2_recall": 0.04740997367205154, "rouge2_recall_stderr": 0.00213449836929217, "rougeL_fmeasure": 0.15474973660621036, "rougeL_fmeasure_stderr": 0.0024910059328029795, "rougeL_precision": 0.155249768985157, "rougeL_precision_stderr": 0.0031065288103969873, "rougeL_recall": 0.18302318903854045, "rougeL_recall_stderr": 0.0029523933556650545, "rougeLsum_fmeasure": 0.1576172218419311, "rougeLsum_fmeasure_stderr": 0.0025371395589236932, "rougeLsum_precision": 0.15741153110570363, "rougeLsum_precision_stderr": 0.0031082667915430405, "rougeLsum_recall": 0.18784045795172838, "rougeLsum_recall_stderr": 0.003159463678885708}}, "2": {"article_DOC_summary": {"bleu": 2.2458789198162017, "bleu_stderr": 0.1742473110428736, "rouge1_fmeasure": 0.22078072361061904, "rouge1_fmeasure_stderr": 0.003334382844575592, "rouge1_precision": 0.22569787787241932, "rouge1_precision_stderr": 0.004125302870311216, "rouge1_recall": 0.24825517453863355, "rouge1_recall_stderr": 0.0037122286921001967, "rouge2_fmeasure": 0.04934524005211476, "rouge2_fmeasure_stderr": 0.002100138042584609, "rouge2_precision": 0.05192675145501797, "rouge2_precision_stderr": 0.0023645411876905063, "rouge2_recall": 0.05351390792594154, "rouge2_recall_stderr": 0.002216269112397534, "rougeL_fmeasure": 0.16774207141216832, "rougeL_fmeasure_stderr": 0.002691419016293324, "rougeL_precision": 0.17160827341192383, "rougeL_precision_stderr": 0.0033267002849613177, "rougeL_recall": 0.1887814116454046, "rougeL_recall_stderr": 0.002953788514567365, "rougeLsum_fmeasure": 0.17009783088315292, "rougeLsum_fmeasure_stderr": 0.002696769898546311, "rougeLsum_precision": 0.17340733922548357, "rougeLsum_precision_stderr": 0.003311231497456564, "rougeLsum_recall": 0.19262857796948832, "rougeLsum_recall_stderr": 0.003071618374246459}}, "3": {"article_DOC_summary": {"bleu": 2.409483529873289, "bleu_stderr": 0.18624352036745506, "rouge1_fmeasure": 0.21169375438699195, "rouge1_fmeasure_stderr": 0.0036769584663103264, "rouge1_precision": 0.22042725300280533, "rouge1_precision_stderr": 0.004337361685140243, "rouge1_recall": 0.2304513543151878, "rouge1_recall_stderr": 0.0039858349476740084, "rouge2_fmeasure": 0.047001131672013766, "rouge2_fmeasure_stderr": 0.002098938383869797, "rouge2_precision": 0.0493940240244741, "rouge2_precision_stderr": 0.0023545425488460964, "rouge2_recall": 0.050316504605654816, "rouge2_recall_stderr": 0.002169331811953602, "rougeL_fmeasure": 0.1606547287936503, "rougeL_fmeasure_stderr": 0.0029102503972058716, "rougeL_precision": 0.1678584231793303, "rougeL_precision_stderr": 0.0035036978381175823, "rougeL_recall": 0.1752875336790613, "rougeL_recall_stderr": 0.003140466954264108, "rougeLsum_fmeasure": 0.1629790699497271, "rougeLsum_fmeasure_stderr": 0.0029227535963180563, "rougeLsum_precision": 0.1698133729917698, "rougeLsum_precision_stderr": 0.0034978460193711204, "rougeLsum_recall": 0.1786035650942303, "rougeLsum_recall_stderr": 0.0032260778602228465}}, "4": {"article_DOC_summary": {"bleu": 0.18746323959602532, "bleu_stderr": 0.054417035919397665, "rouge1_fmeasure": 0.05852312292259964, "rouge1_fmeasure_stderr": 0.003411745135977311, "rouge1_precision": 0.07074969836542556, "rouge1_precision_stderr": 0.004374846930886093, "rouge1_recall": 0.05935148359227344, "rouge1_recall_stderr": 0.0035290460576690463, "rouge2_fmeasure": 0.014128424244693396, "rouge2_fmeasure_stderr": 0.001372131743832138, "rouge2_precision": 0.017451535244963366, "rouge2_precision_stderr": 0.0018970209619056206, "rouge2_recall": 0.013916749485762063, "rouge2_recall_stderr": 0.0013341269055275502, "rougeL_fmeasure": 0.04473715844771224, "rougeL_fmeasure_stderr": 0.002669461412700664, "rougeL_precision": 0.05499178642750529, "rougeL_precision_stderr": 0.003534909374730744, "rougeL_recall": 0.04518795012564015, "rougeL_recall_stderr": 0.0027283021035140226, "rougeLsum_fmeasure": 0.045274962805013225, "rougeLsum_fmeasure_stderr": 0.002694404061113344, "rougeLsum_precision": 0.05547251868043394, "rougeLsum_precision_stderr": 0.0035488908061919494, "rougeLsum_recall": 0.045982840052559225, "rougeLsum_recall_stderr": 0.002791306716270555}}, "5": {"article_DOC_summary": {"bleu": 8.822788855803002e-48, "bleu_stderr": 1.2895291827584802e-40, "rouge1_fmeasure": 0.0023248125380148903, "rouge1_fmeasure_stderr": 0.0007115633852660155, "rouge1_precision": 0.0034544238838444107, "rouge1_precision_stderr": 0.0010758816471093306, "rouge1_recall": 0.001915294823020827, "rouge1_recall_stderr": 0.0005795152691274384, "rouge2_fmeasure": 0.0003328471473444539, "rouge2_fmeasure_stderr": 0.00015760010881621698, "rouge2_precision": 0.0006244865012900529, "rouge2_precision_stderr": 0.00030583312965056826, "rouge2_recall": 0.0002459647270968026, "rouge2_recall_stderr": 0.00011885497758694596, "rougeL_fmeasure": 0.0018202630319951564, "rougeL_fmeasure_stderr": 0.0005323287142674903, "rougeL_precision": 0.002672449885242423, "rougeL_precision_stderr": 0.000805779683887303, "rougeL_recall": 0.0015212204792498286, "rougeL_recall_stderr": 0.0004379233657056629, "rougeLsum_fmeasure": 0.001866621568919731, "rougeLsum_fmeasure_stderr": 0.000538219309246783, "rougeLsum_precision": 0.0027228988813074017, "rougeLsum_precision_stderr": 0.0008103635709126659, "rougeLsum_recall": 0.00156410212590506, "rougeLsum_recall_stderr": 0.0004440546710373052}}}}
4b284b84boscar/evaluation/rankeval/4b284b84boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.323,
5
- "acc_stderr": 0.01479492784334864
6
- },
7
- "anli_r2": {
8
- "acc": 0.326,
9
- "acc_stderr": 0.014830507204541031
10
- },
11
- "anli_r3": {
12
- "acc": 0.3333333333333333,
13
- "acc_stderr": 0.013613950010225596
14
- },
15
- "cb": {
16
- "acc": 0.5,
17
- "acc_stderr": 0.06741998624632421,
18
- "f1": 0.3323383084577114
19
- },
20
- "copa": {
21
- "acc": 0.73,
22
- "acc_stderr": 0.0446196043338474
23
- },
24
- "hellaswag": {
25
- "acc": 0.4069906393148775,
26
- "acc_stderr": 0.0049026907650664255,
27
- "acc_norm": 0.5218084047002589,
28
- "acc_norm_stderr": 0.004985032806802434
29
- },
30
- "rte": {
31
- "acc": 0.5234657039711191,
32
- "acc_stderr": 0.03006330041190266
33
- },
34
- "winogrande": {
35
- "acc": 0.5406471981057617,
36
- "acc_stderr": 0.014005973823825136
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6798503474078034,
40
- "acc_stderr": 0.010788532546733108
41
- },
42
- "boolq": {
43
- "acc": 0.5978593272171254,
44
- "acc_stderr": 0.008575926383211254
45
- },
46
- "arc_easy": {
47
- "acc": 0.5677609427609428,
48
- "acc_stderr": 0.010165130379698743,
49
- "acc_norm": 0.5109427609427609,
50
- "acc_norm_stderr": 0.010257326131172867
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26535836177474403,
54
- "acc_stderr": 0.012902554762313962,
55
- "acc_norm": 0.2832764505119454,
56
- "acc_norm_stderr": 0.013167478735134575
57
- },
58
- "sciq": {
59
- "acc": 0.857,
60
- "acc_stderr": 0.01107581480856704,
61
- "acc_norm": 0.788,
62
- "acc_norm_stderr": 0.01293148186493805
63
- },
64
- "piqa": {
65
- "acc": 0.7257889009793254,
66
- "acc_stderr": 0.010408618664933382,
67
- "acc_norm": 0.7426550598476604,
68
- "acc_norm_stderr": 0.01019992106479251
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84boscar/evaluation/rankeval/4b284b84boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.32,
5
- "acc_stderr": 0.01475865230357488
6
- },
7
- "anli_r2": {
8
- "acc": 0.329,
9
- "acc_stderr": 0.014865395385928367
10
- },
11
- "anli_r3": {
12
- "acc": 0.3433333333333333,
13
- "acc_stderr": 0.01371263383046586
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.28905472636815915
19
- },
20
- "copa": {
21
- "acc": 0.69,
22
- "acc_stderr": 0.04648231987117316
23
- },
24
- "hellaswag": {
25
- "acc": 0.40659231228838877,
26
- "acc_stderr": 0.004901936511546117,
27
- "acc_norm": 0.5297749452300339,
28
- "acc_norm_stderr": 0.0049809261987989835
29
- },
30
- "rte": {
31
- "acc": 0.5090252707581228,
32
- "acc_stderr": 0.030091559826331334
33
- },
34
- "winogrande": {
35
- "acc": 0.5509076558800315,
36
- "acc_stderr": 0.013979459389140842
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6702298236237306,
40
- "acc_stderr": 0.010871682471395132
41
- },
42
- "boolq": {
43
- "acc": 0.6033639143730887,
44
- "acc_stderr": 0.008556148582031995
45
- },
46
- "arc_easy": {
47
- "acc": 0.6005892255892256,
48
- "acc_stderr": 0.010050018228742127,
49
- "acc_norm": 0.5740740740740741,
50
- "acc_norm_stderr": 0.010146568651002257
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26109215017064846,
54
- "acc_stderr": 0.012835523909473841,
55
- "acc_norm": 0.3046075085324232,
56
- "acc_norm_stderr": 0.01344952210993249
57
- },
58
- "sciq": {
59
- "acc": 0.907,
60
- "acc_stderr": 0.009188875634996697,
61
- "acc_norm": 0.898,
62
- "acc_norm_stderr": 0.009575368801653892
63
- },
64
- "piqa": {
65
- "acc": 0.7181719260065288,
66
- "acc_stderr": 0.010496675231258164,
67
- "acc_norm": 0.7372143634385201,
68
- "acc_norm_stderr": 0.010269354068140779
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84boscar/evaluation/rankeval/4b284b84boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.315,
5
- "acc_stderr": 0.014696631960792508
6
- },
7
- "anli_r2": {
8
- "acc": 0.343,
9
- "acc_stderr": 0.015019206922356951
10
- },
11
- "anli_r3": {
12
- "acc": 0.33166666666666667,
13
- "acc_stderr": 0.013596836729485168
14
- },
15
- "cb": {
16
- "acc": 0.5,
17
- "acc_stderr": 0.06741998624632421,
18
- "f1": 0.3416488477072939
19
- },
20
- "copa": {
21
- "acc": 0.68,
22
- "acc_stderr": 0.04688261722621504
23
- },
24
- "hellaswag": {
25
- "acc": 0.40659231228838877,
26
- "acc_stderr": 0.004901936511546122,
27
- "acc_norm": 0.5274845648277235,
28
- "acc_norm_stderr": 0.004982237133409149
29
- },
30
- "rte": {
31
- "acc": 0.4657039711191336,
32
- "acc_stderr": 0.030025579819366426
33
- },
34
- "winogrande": {
35
- "acc": 0.5556432517758485,
36
- "acc_stderr": 0.013965196769083555
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6734366648850882,
40
- "acc_stderr": 0.010844543793668893
41
- },
42
- "boolq": {
43
- "acc": 0.6061162079510704,
44
- "acc_stderr": 0.00854583579261498
45
- },
46
- "arc_easy": {
47
- "acc": 0.6005892255892256,
48
- "acc_stderr": 0.010050018228742127,
49
- "acc_norm": 0.5875420875420876,
50
- "acc_norm_stderr": 0.010101305447864764
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26109215017064846,
54
- "acc_stderr": 0.012835523909473841,
55
- "acc_norm": 0.3054607508532423,
56
- "acc_norm_stderr": 0.013460080478002505
57
- },
58
- "sciq": {
59
- "acc": 0.915,
60
- "acc_stderr": 0.008823426366942328,
61
- "acc_norm": 0.911,
62
- "acc_norm_stderr": 0.009008893392651532
63
- },
64
- "piqa": {
65
- "acc": 0.721436343852013,
66
- "acc_stderr": 0.010459397235965175,
67
- "acc_norm": 0.7252448313384113,
68
- "acc_norm_stderr": 0.010415033676676051
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84boscar/evaluation/rankeval/4b284b84boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.314,
5
- "acc_stderr": 0.014683991951087976
6
- },
7
- "anli_r2": {
8
- "acc": 0.348,
9
- "acc_stderr": 0.01507060460376841
10
- },
11
- "anli_r3": {
12
- "acc": 0.3466666666666667,
13
- "acc_stderr": 0.013744022550571952
14
- },
15
- "cb": {
16
- "acc": 0.4642857142857143,
17
- "acc_stderr": 0.0672477765493766,
18
- "f1": 0.4123643651945539
19
- },
20
- "copa": {
21
- "acc": 0.68,
22
- "acc_stderr": 0.04688261722621504
23
- },
24
- "hellaswag": {
25
- "acc": 0.4064927305317666,
26
- "acc_stderr": 0.0049017474263317465,
27
- "acc_norm": 0.530372435769767,
28
- "acc_norm_stderr": 0.0049805669077904536
29
- },
30
- "rte": {
31
- "acc": 0.48014440433212996,
32
- "acc_stderr": 0.0300727231673172
33
- },
34
- "winogrande": {
35
- "acc": 0.5501183898973955,
36
- "acc_stderr": 0.013981711904049733
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6745056119722074,
40
- "acc_stderr": 0.010835369677013443
41
- },
42
- "boolq": {
43
- "acc": 0.6030581039755352,
44
- "acc_stderr": 0.00855727696467514
45
- },
46
- "arc_easy": {
47
- "acc": 0.6127946127946128,
48
- "acc_stderr": 0.00999531206589035,
49
- "acc_norm": 0.5938552188552189,
50
- "acc_norm_stderr": 0.010077409815364055
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26621160409556316,
54
- "acc_stderr": 0.0129157747815232,
55
- "acc_norm": 0.295221843003413,
56
- "acc_norm_stderr": 0.013329750293382316
57
- },
58
- "sciq": {
59
- "acc": 0.914,
60
- "acc_stderr": 0.008870325962594766,
61
- "acc_norm": 0.917,
62
- "acc_norm_stderr": 0.008728527206074792
63
- },
64
- "piqa": {
65
- "acc": 0.7252448313384113,
66
- "acc_stderr": 0.010415033676676042,
67
- "acc_norm": 0.735038084874864,
68
- "acc_norm_stderr": 0.01029655799331605
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84boscar/evaluation/rankeval/4b284b84boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.324,
5
- "acc_stderr": 0.014806864733738859
6
- },
7
- "anli_r2": {
8
- "acc": 0.346,
9
- "acc_stderr": 0.01505026612756444
10
- },
11
- "anli_r3": {
12
- "acc": 0.34,
13
- "acc_stderr": 0.013680495725767792
14
- },
15
- "cb": {
16
- "acc": 0.4642857142857143,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.3220736570490265
19
- },
20
- "copa": {
21
- "acc": 0.67,
22
- "acc_stderr": 0.04725815626252607
23
- },
24
- "hellaswag": {
25
- "acc": 0.4078868751244772,
26
- "acc_stderr": 0.004904375631128856,
27
- "acc_norm": 0.5286795459071898,
28
- "acc_norm_stderr": 0.004981566295189449
29
- },
30
- "rte": {
31
- "acc": 0.47653429602888087,
32
- "acc_stderr": 0.03006330041190266
33
- },
34
- "winogrande": {
35
- "acc": 0.5516969218626677,
36
- "acc_stderr": 0.013977171307126342
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6814537680384821,
40
- "acc_stderr": 0.010774165229761342
41
- },
42
- "boolq": {
43
- "acc": 0.6051987767584098,
44
- "acc_stderr": 0.008549304887647416
45
- },
46
- "arc_easy": {
47
- "acc": 0.6060606060606061,
48
- "acc_stderr": 0.01002630535598182,
49
- "acc_norm": 0.5972222222222222,
50
- "acc_norm_stderr": 0.010063960494989163
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2781569965870307,
54
- "acc_stderr": 0.013094469919538807,
55
- "acc_norm": 0.3122866894197952,
56
- "acc_norm_stderr": 0.013542598541688065
57
- },
58
- "sciq": {
59
- "acc": 0.92,
60
- "acc_stderr": 0.008583336977753656,
61
- "acc_norm": 0.927,
62
- "acc_norm_stderr": 0.008230354715244059
63
- },
64
- "piqa": {
65
- "acc": 0.7279651795429815,
66
- "acc_stderr": 0.010382763786247381,
67
- "acc_norm": 0.7317736670293797,
68
- "acc_norm_stderr": 0.010336761992404485
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84boscar/evaluation/rankeval/4b284b84boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.348,
5
- "acc_stderr": 0.015070604603768408
6
- },
7
- "anli_r2": {
8
- "acc": 0.355,
9
- "acc_stderr": 0.01513949154378053
10
- },
11
- "anli_r3": {
12
- "acc": 0.3591666666666667,
13
- "acc_stderr": 0.013855141559780354
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.3114219114219114
19
- },
20
- "copa": {
21
- "acc": 0.69,
22
- "acc_stderr": 0.04648231987117316
23
- },
24
- "hellaswag": {
25
- "acc": 0.4095797649870544,
26
- "acc_stderr": 0.00490751210312835,
27
- "acc_norm": 0.5337582154949213,
28
- "acc_norm_stderr": 0.004978395540514379
29
- },
30
- "rte": {
31
- "acc": 0.49458483754512633,
32
- "acc_stderr": 0.030094698123239966
33
- },
34
- "winogrande": {
35
- "acc": 0.5603788476716653,
36
- "acc_stderr": 0.01394964977601569
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6819882415820417,
40
- "acc_stderr": 0.010769343495248544
41
- },
42
- "boolq": {
43
- "acc": 0.6162079510703364,
44
- "acc_stderr": 0.008505584729104966
45
- },
46
- "arc_easy": {
47
- "acc": 0.6077441077441077,
48
- "acc_stderr": 0.010018744689650043,
49
- "acc_norm": 0.6085858585858586,
50
- "acc_norm_stderr": 0.010014917532627817
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27986348122866894,
54
- "acc_stderr": 0.013119040897725922,
55
- "acc_norm": 0.30716723549488056,
56
- "acc_norm_stderr": 0.013481034054980943
57
- },
58
- "sciq": {
59
- "acc": 0.915,
60
- "acc_stderr": 0.008823426366942324,
61
- "acc_norm": 0.918,
62
- "acc_norm_stderr": 0.00868051561552372
63
- },
64
- "piqa": {
65
- "acc": 0.7236126224156693,
66
- "acc_stderr": 0.010434162388275615,
67
- "acc_norm": 0.7328618063112078,
68
- "acc_norm_stderr": 0.010323440492612423
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }