Muennighoff commited on
Commit
eb2a9c4
1 Parent(s): 7aa58e1
Files changed (30) hide show
  1. perplexity25/evaluation/generation/merged.csv +53 -0
  2. perplexity25/evaluation/generation/merged.json +1 -0
  3. perplexity25/evaluation/rankeval/perplexity25_0.csv +21 -0
  4. perplexity25/evaluation/rankeval/perplexity25_0_lm-eval_global_step52452_2023-05-13-01-15-21_0shots_backup.json +0 -87
  5. perplexity25/evaluation/rankeval/perplexity25_1.csv +21 -0
  6. perplexity25/evaluation/rankeval/perplexity25_1_lm-eval_global_step52452_2023-05-13-01-15-21_1shots_backup.json +0 -87
  7. perplexity25/evaluation/rankeval/perplexity25_2.csv +21 -0
  8. perplexity25/evaluation/rankeval/perplexity25_2_lm-eval_global_step52452_2023-05-13-01-15-21_2shots_backup.json +0 -87
  9. perplexity25/evaluation/rankeval/perplexity25_3.csv +21 -0
  10. perplexity25/evaluation/rankeval/perplexity25_3_lm-eval_global_step52452_2023-05-13-01-15-21_3shots_backup.json +0 -87
  11. perplexity25/evaluation/rankeval/perplexity25_4.csv +21 -0
  12. perplexity25/evaluation/rankeval/perplexity25_4_lm-eval_global_step52452_2023-05-13-01-15-21_4shots_backup.json +0 -87
  13. perplexity25/evaluation/rankeval/perplexity25_5.csv +21 -0
  14. perplexity25/evaluation/rankeval/perplexity25_5_lm-eval_global_step52452_2023-05-13-01-15-21_5shots_backup.json +0 -87
  15. perplexity50/3511463.err +0 -0
  16. perplexity50/3511463.out +0 -0
  17. perplexity50/evaluation/generation/merged.csv +53 -0
  18. perplexity50/evaluation/generation/merged.json +1 -0
  19. perplexity50/evaluation/rankeval/perplexity50_0.csv +21 -0
  20. perplexity50/evaluation/rankeval/perplexity50_0_lm-eval_global_step52452_2023-05-13-01-15-21_0shots_backup.json +0 -87
  21. perplexity50/evaluation/rankeval/perplexity50_1.csv +21 -0
  22. perplexity50/evaluation/rankeval/perplexity50_1_lm-eval_global_step52452_2023-05-13-01-15-21_1shots_backup.json +0 -87
  23. perplexity50/evaluation/rankeval/perplexity50_2.csv +21 -0
  24. perplexity50/evaluation/rankeval/perplexity50_2_lm-eval_global_step52452_2023-05-13-01-15-21_2shots_backup.json +0 -87
  25. perplexity50/evaluation/rankeval/perplexity50_3.csv +21 -0
  26. perplexity50/evaluation/rankeval/perplexity50_3_lm-eval_global_step52452_2023-05-13-01-15-21_3shots_backup.json +0 -87
  27. perplexity50/evaluation/rankeval/perplexity50_4.csv +21 -0
  28. perplexity50/evaluation/rankeval/perplexity50_4_lm-eval_global_step52452_2023-05-13-01-15-21_4shots_backup.json +0 -87
  29. perplexity50/evaluation/rankeval/perplexity50_5.csv +21 -0
  30. perplexity50/evaluation/rankeval/perplexity50_5_lm-eval_global_step52452_2023-05-13-01-15-21_5shots_backup.json +0 -87
perplexity25/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.09329941078416822
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.09329941078416822
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.142023180586533
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.142023180586533
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.17045725388262045
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.17045725388262045
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.181728403672227
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.181728403672227
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.18641178852244134
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.18641178852244134
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.1917272606223869
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.1917272606223869
14
+ e2e_nlg_cleaned,5,average,multiple,0.1609412163450628
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04288345569115875
16
+ gem_xsum,0,median,rouge2_fmeasure,0.04288345569115875
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.035379638697656535
18
+ gem_xsum,1,median,rouge2_fmeasure,0.035379638697656535
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03502857896138652
20
+ gem_xsum,2,median,rouge2_fmeasure,0.03502857896138652
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.034720433489397975
22
+ gem_xsum,3,median,rouge2_fmeasure,0.034720433489397975
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.009740454133203196
24
+ gem_xsum,4,median,rouge2_fmeasure,0.009740454133203196
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003686455308029473
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0003686455308029473
27
+ gem_xsum,5,average,multiple,0.026353534417267652
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04369490354925837
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.04369490354925837
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0456950711509664
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.0456950711509664
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.048680867420328906
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.048680867420328906
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.04863415417606342
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.04863415417606342
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.04926079317014994
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.04926079317014994
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.04953443120550419
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.04953443120550419
40
+ web_nlg_en,5,average,multiple,0.047583370112045206
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.037881954687990986
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.037881954687990986
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.046870008000920206
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.046870008000920206
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.049493121373645386
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.049493121373645386
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04118261236702404
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04118261236702404
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013165816959927556
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.013165816959927556
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0024221607010753237
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0024221607010753237
53
+ wiki_lingua_en,5,average,multiple,0.031835945681763914
perplexity25/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.31224558314839784, "bleu_stderr": 0.03713618627951878, "rouge1_fmeasure": 0.09597672933276848, "rouge1_fmeasure_stderr": 0.002119566893316448, "rouge1_precision": 0.06416860697398828, "rouge1_precision_stderr": 0.0017145071144190034, "rouge1_recall": 0.27658877051081887, "rouge1_recall_stderr": 0.004689276935825928, "rouge2_fmeasure": 0.04369490354925837, "rouge2_fmeasure_stderr": 0.0012778025959773575, "rouge2_precision": 0.02898895600657147, "rouge2_precision_stderr": 0.0009732573595948091, "rouge2_recall": 0.12474346123394711, "rouge2_recall_stderr": 0.003103993275317445, "rougeL_fmeasure": 0.0916810654796967, "rougeL_fmeasure_stderr": 0.0019730758573202655, "rougeL_precision": 0.061168279447352224, "rougeL_precision_stderr": 0.0016042565824939442, "rougeL_recall": 0.2661016573130979, "rougeL_recall_stderr": 0.004476805597758023, "rougeLsum_fmeasure": 0.09077121152470446, "rougeLsum_fmeasure_stderr": 0.002000842365909719, "rougeLsum_precision": 0.06079097382146833, "rougeLsum_precision_stderr": 0.001638980434900789, "rougeLsum_recall": 0.2611818383395376, "rougeLsum_recall_stderr": 0.004408937318215278}}, "1": {"PALM_prompt": {"bleu": 0.3392027660661122, "bleu_stderr": 0.021974513144963268, "rouge1_fmeasure": 0.10098636709306053, "rouge1_fmeasure_stderr": 0.0018586237655962318, "rouge1_precision": 0.0649237002813453, "rouge1_precision_stderr": 0.0013836117996389116, "rouge1_recall": 0.3252507747834447, "rouge1_recall_stderr": 0.004830093238320235, "rouge2_fmeasure": 0.0456950711509664, "rouge2_fmeasure_stderr": 0.0011687980868032475, "rouge2_precision": 0.029412803554314693, "rouge2_precision_stderr": 0.000840010407149539, "rouge2_recall": 0.1489985792608451, "rouge2_recall_stderr": 0.0033169694716508446, "rougeL_fmeasure": 0.09597600266569703, "rougeL_fmeasure_stderr": 0.0017278112461683963, "rougeL_precision": 0.06161199072970022, "rougeL_precision_stderr": 0.001272442500492367, "rougeL_recall": 0.30851984711534297, "rougeL_recall_stderr": 0.004479724678989707, "rougeLsum_fmeasure": 0.09646257885608286, "rougeLsum_fmeasure_stderr": 0.0017756689280714777, "rougeLsum_precision": 0.062043590899161734, "rougeLsum_precision_stderr": 0.0013176307073547546, "rougeLsum_recall": 0.3089691495669342, "rougeLsum_recall_stderr": 0.004492168762956442}}, "2": {"PALM_prompt": {"bleu": 0.3654954806712323, "bleu_stderr": 0.025040063648794096, "rouge1_fmeasure": 0.10670235122995915, "rouge1_fmeasure_stderr": 0.001806209828663936, "rouge1_precision": 0.06820653358439158, "rouge1_precision_stderr": 0.0013447524023065525, "rouge1_recall": 0.3459044512732088, "rouge1_recall_stderr": 0.004653942809147225, "rouge2_fmeasure": 0.048680867420328906, "rouge2_fmeasure_stderr": 0.001161761999498929, "rouge2_precision": 0.03109751061942729, "rouge2_precision_stderr": 0.0008314173693090511, "rouge2_recall": 0.16361070245426873, "rouge2_recall_stderr": 0.0034348511874994717, "rougeL_fmeasure": 0.10152476531671561, "rougeL_fmeasure_stderr": 0.0016858485831521945, "rougeL_precision": 0.06484016065528389, "rougeL_precision_stderr": 0.0012401376869968777, "rougeL_recall": 0.3269715065567825, "rougeL_recall_stderr": 0.004272067994546378, "rougeLsum_fmeasure": 0.10225161251115494, "rougeLsum_fmeasure_stderr": 0.0017327367584781756, "rougeLsum_precision": 0.06536251718368276, "rougeLsum_precision_stderr": 0.0012825569934637642, "rougeLsum_recall": 0.32966818044940316, "rougeLsum_recall_stderr": 0.004368989276954383}}, "3": {"PALM_prompt": {"bleu": 0.3592609344640833, "bleu_stderr": 0.02304406751100301, "rouge1_fmeasure": 0.10565789241386328, "rouge1_fmeasure_stderr": 0.0017663252684883553, "rouge1_precision": 0.06727272384378305, "rouge1_precision_stderr": 0.0012989075786256402, "rouge1_recall": 0.34514411045024723, "rouge1_recall_stderr": 0.004734096982650293, "rouge2_fmeasure": 0.04863415417606342, "rouge2_fmeasure_stderr": 0.0011329644835331011, "rouge2_precision": 0.030916825454777475, "rouge2_precision_stderr": 0.0008056451738364325, "rouge2_recall": 0.1658153905976806, "rouge2_recall_stderr": 0.003481711090228483, "rougeL_fmeasure": 0.10036601085777433, "rougeL_fmeasure_stderr": 0.0016502859429412418, "rougeL_precision": 0.06386360352562777, "rougeL_precision_stderr": 0.0012027232237294206, "rougeL_recall": 0.32523617484648604, "rougeL_recall_stderr": 0.004324297311903695, "rougeLsum_fmeasure": 0.10090783330802172, "rougeLsum_fmeasure_stderr": 0.0016784875419119682, "rougeLsum_precision": 0.06426144521188693, "rougeLsum_precision_stderr": 0.0012323427980484077, "rougeLsum_recall": 0.32867237062843047, "rougeLsum_recall_stderr": 0.004443576841012128}}, "4": {"PALM_prompt": {"bleu": 0.37835540251359956, "bleu_stderr": 0.028435806983719093, "rouge1_fmeasure": 0.10677394877416337, "rouge1_fmeasure_stderr": 0.0017419426846328717, "rouge1_precision": 0.06785826306020468, "rouge1_precision_stderr": 0.0012639461669679325, "rouge1_recall": 0.3474520077766322, "rouge1_recall_stderr": 0.004635808596091852, "rouge2_fmeasure": 0.04926079317014994, "rouge2_fmeasure_stderr": 0.0011176582047592932, "rouge2_precision": 0.03118629908795949, "rouge2_precision_stderr": 0.0007824248126301933, "rouge2_recall": 0.1686112674466713, "rouge2_recall_stderr": 0.0034686723713827842, "rougeL_fmeasure": 0.1015063853834285, "rougeL_fmeasure_stderr": 0.0016330220290649304, "rougeL_precision": 0.06446018636445104, "rougeL_precision_stderr": 0.0011767040252516788, "rougeL_recall": 0.3290377644596475, "rougeL_recall_stderr": 0.00429945865540488, "rougeLsum_fmeasure": 0.10255214222995561, "rougeLsum_fmeasure_stderr": 0.0016742530012902649, "rougeLsum_precision": 0.06518870615697699, "rougeLsum_precision_stderr": 0.0012152427779437518, "rougeLsum_recall": 0.3330559516297193, "rougeLsum_recall_stderr": 0.004414312796704986}}, "5": {"PALM_prompt": {"bleu": 0.3707957491393801, "bleu_stderr": 0.03149803172288385, "rouge1_fmeasure": 0.10804595534435403, "rouge1_fmeasure_stderr": 0.0016819251634874286, "rouge1_precision": 0.06864134680013424, "rouge1_precision_stderr": 0.0012322679271972023, "rouge1_recall": 0.35035957224055103, "rouge1_recall_stderr": 0.004525153332145143, "rouge2_fmeasure": 0.04953443120550419, "rouge2_fmeasure_stderr": 0.0010974380500823572, "rouge2_precision": 0.03135388148292866, "rouge2_precision_stderr": 0.0007751648761981692, "rouge2_recall": 0.16896861616649583, "rouge2_recall_stderr": 0.003345397272046064, "rougeL_fmeasure": 0.10248439747284772, "rougeL_fmeasure_stderr": 0.0015852907542033721, "rougeL_precision": 0.06510326325941494, "rougeL_precision_stderr": 0.0011560260507405354, "rougeL_recall": 0.33044293852360024, "rougeL_recall_stderr": 0.004153133131273199, "rougeLsum_fmeasure": 0.10323556805167783, "rougeLsum_fmeasure_stderr": 0.0016059700043438213, "rougeLsum_precision": 0.0656014791935983, "rougeLsum_precision_stderr": 0.0011776642200350253, "rougeLsum_recall": 0.3339836287603389, "rougeLsum_recall_stderr": 0.004260131546440473}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.6375281102002284, "bleu_stderr": 0.049083205722565484, "rouge1_fmeasure": 0.1821170022700696, "rouge1_fmeasure_stderr": 0.0018365968765570223, "rouge1_precision": 0.15526703395478428, "rouge1_precision_stderr": 0.0019048385092560274, "rouge1_recall": 0.2657362987772207, "rouge1_recall_stderr": 0.002621491681886565, "rouge2_fmeasure": 0.037881954687990986, "rouge2_fmeasure_stderr": 0.0008453802181656135, "rouge2_precision": 0.03206185435985054, "rouge2_precision_stderr": 0.000759586624489302, "rouge2_recall": 0.057440011896085316, "rouge2_recall_stderr": 0.0014245492222374873, "rougeL_fmeasure": 0.1431674380673611, "rougeL_fmeasure_stderr": 0.0013181357642786014, "rougeL_precision": 0.12048492403451228, "rougeL_precision_stderr": 0.0013325755443150576, "rougeL_recall": 0.21415389684069874, "rougeL_recall_stderr": 0.002145834100436848, "rougeLsum_fmeasure": 0.1673439925521862, "rougeLsum_fmeasure_stderr": 0.0016912749684659687, "rougeLsum_precision": 0.14253479978773045, "rougeLsum_precision_stderr": 0.0017513462587583613, "rougeLsum_recall": 0.24495352588719183, "rougeLsum_recall_stderr": 0.0024490806612200763}}, "1": {"tldr_en": {"bleu": 2.3919433608178244, "bleu_stderr": 0.08548762264678461, "rouge1_fmeasure": 0.20800416624670884, "rouge1_fmeasure_stderr": 0.0019096560966773478, "rouge1_precision": 0.17980706675942354, "rouge1_precision_stderr": 0.002098175664461687, "rouge1_recall": 0.3009360804108007, "rouge1_recall_stderr": 0.002722466712920559, "rouge2_fmeasure": 0.046870008000920206, "rouge2_fmeasure_stderr": 0.0009635114797446192, "rouge2_precision": 0.040412609345037494, "rouge2_precision_stderr": 0.000890482142333157, "rouge2_recall": 0.0707443093699949, "rouge2_recall_stderr": 0.0016525596112716557, "rougeL_fmeasure": 0.1486340507538382, "rougeL_fmeasure_stderr": 0.0012723389369324837, "rougeL_precision": 0.1268309364742902, "rougeL_precision_stderr": 0.0013760396751293185, "rougeL_recall": 0.22173573063373433, "rougeL_recall_stderr": 0.0021719897093334284, "rougeLsum_fmeasure": 0.1951271284074896, "rougeLsum_fmeasure_stderr": 0.0017825943482372452, "rougeLsum_precision": 0.1684402130679538, "rougeLsum_precision_stderr": 0.00195351875600882, "rougeLsum_recall": 0.2830602164220028, "rougeLsum_recall_stderr": 0.0025790890750399965}}, "2": {"tldr_en": {"bleu": 2.5303113987588, "bleu_stderr": 0.06314873438274192, "rouge1_fmeasure": 0.21350784531146766, "rouge1_fmeasure_stderr": 0.0018372008047181686, "rouge1_precision": 0.19024342341910272, "rouge1_precision_stderr": 0.002270091248824654, "rouge1_recall": 0.30688686376798624, "rouge1_recall_stderr": 0.002645904846436756, "rouge2_fmeasure": 0.049493121373645386, "rouge2_fmeasure_stderr": 0.0009770655274603632, "rouge2_precision": 0.04485105522627462, "rouge2_precision_stderr": 0.0011025549402209852, "rouge2_recall": 0.07310002107093573, "rouge2_recall_stderr": 0.0016239815932843845, "rougeL_fmeasure": 0.15386240322179967, "rougeL_fmeasure_stderr": 0.001284009017533657, "rougeL_precision": 0.13675580244350435, "rougeL_precision_stderr": 0.0016853525652057188, "rougeL_recall": 0.22643200780218806, "rougeL_recall_stderr": 0.002155739575675694, "rougeLsum_fmeasure": 0.20135002510431343, "rougeLsum_fmeasure_stderr": 0.0017256845604918154, "rougeLsum_precision": 0.17941688106712367, "rougeLsum_precision_stderr": 0.002150275172122676, "rougeLsum_recall": 0.28991649578115825, "rougeLsum_recall_stderr": 0.0025125824058213043}}, "3": {"tldr_en": {"bleu": 2.5056315812266687, "bleu_stderr": 0.0987654628167384, "rouge1_fmeasure": 0.17905977454735247, "rouge1_fmeasure_stderr": 0.0021682988144671764, "rouge1_precision": 0.1667156419346577, "rouge1_precision_stderr": 0.0025223421653541435, "rouge1_recall": 0.2556133564452001, "rouge1_recall_stderr": 0.003255288422257417, "rouge2_fmeasure": 0.04118261236702404, "rouge2_fmeasure_stderr": 0.0009347304524167121, "rouge2_precision": 0.0388323173701326, "rouge2_precision_stderr": 0.0010940369607383355, "rouge2_recall": 0.06213727178884212, "rouge2_recall_stderr": 0.001669610738631409, "rougeL_fmeasure": 0.12952435483301103, "rougeL_fmeasure_stderr": 0.0015357281079718173, "rougeL_precision": 0.12089310071398937, "rougeL_precision_stderr": 0.0018705319801461653, "rougeL_recall": 0.18919380915956352, "rougeL_recall_stderr": 0.002567644889149163, "rougeLsum_fmeasure": 0.16910312476864786, "rougeLsum_fmeasure_stderr": 0.0020403281058382705, "rougeLsum_precision": 0.1573503817985137, "rougeLsum_precision_stderr": 0.0023805026761902592, "rougeLsum_recall": 0.24220269466463637, "rougeLsum_recall_stderr": 0.003114884743178747}}, "4": {"tldr_en": {"bleu": 0.558593061097989, "bleu_stderr": 0.04095297851543845, "rouge1_fmeasure": 0.05795016970693355, "rouge1_fmeasure_stderr": 0.001916255841264876, "rouge1_precision": 0.05512873037041646, "rouge1_precision_stderr": 0.002053021409006724, "rouge1_recall": 0.08653039568969556, "rouge1_recall_stderr": 0.002932898045655817, "rouge2_fmeasure": 0.013165816959927556, "rouge2_fmeasure_stderr": 0.0006382389780016135, "rouge2_precision": 0.012291396262431102, "rouge2_precision_stderr": 0.0006860904350077833, "rouge2_recall": 0.021084941019315177, "rouge2_recall_stderr": 0.001188230063769281, "rougeL_fmeasure": 0.043071887152266786, "rougeL_fmeasure_stderr": 0.001407113300208392, "rougeL_precision": 0.04126037678616895, "rougeL_precision_stderr": 0.0015664586581985208, "rougeL_recall": 0.06589363408283794, "rougeL_recall_stderr": 0.002292257294557098, "rougeLsum_fmeasure": 0.054561504253622946, "rougeLsum_fmeasure_stderr": 0.00180105400180901, "rougeLsum_precision": 0.0520537723141164, "rougeLsum_precision_stderr": 0.0019443085493386046, "rougeLsum_recall": 0.08158523490668657, "rougeLsum_recall_stderr": 0.0027709770644967647}}, "5": {"tldr_en": {"bleu": 2.49693387658237e-06, "bleu_stderr": 4.9254476844807124e-06, "rouge1_fmeasure": 0.009411385725263191, "rouge1_fmeasure_stderr": 0.0008471619506817471, "rouge1_precision": 0.009598793842482911, "rouge1_precision_stderr": 0.0010135227951615076, "rouge1_recall": 0.014751731366643654, "rouge1_recall_stderr": 0.0013823298706162432, "rouge2_fmeasure": 0.0024221607010753237, "rouge2_fmeasure_stderr": 0.00029561192228850174, "rouge2_precision": 0.0025860727447622914, "rouge2_precision_stderr": 0.00048780227870820193, "rouge2_recall": 0.004211421745829904, "rouge2_recall_stderr": 0.0005814098675929214, "rougeL_fmeasure": 0.007136631099672401, "rougeL_fmeasure_stderr": 0.0006388509886284223, "rougeL_precision": 0.007459585367896189, "rougeL_precision_stderr": 0.000834881509938252, "rougeL_recall": 0.011482514249452291, "rougeL_recall_stderr": 0.0011106961669246301, "rougeLsum_fmeasure": 0.008925101458585369, "rougeLsum_fmeasure_stderr": 0.0008030681343683619, "rougeLsum_precision": 0.009060670979252713, "rougeLsum_precision_stderr": 0.0009579283395946946, "rougeLsum_recall": 0.014087779059365814, "rougeLsum_recall_stderr": 0.0013315634718775852}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 4.780545170170601, "bleu_stderr": 0.07118121769393712, "rouge1_fmeasure": 0.23246731745402952, "rouge1_fmeasure_stderr": 0.0017141203715973089, "rouge1_precision": 0.1774633723287846, "rouge1_precision_stderr": 0.0015636360094331604, "rouge1_recall": 0.364075043069517, "rouge1_recall_stderr": 0.002322444967434633, "rouge2_fmeasure": 0.09329941078416822, "rouge2_fmeasure_stderr": 0.0011794536309790357, "rouge2_precision": 0.06978291968631191, "rouge2_precision_stderr": 0.0009315100071943132, "rouge2_recall": 0.14929846575132927, "rouge2_recall_stderr": 0.0018334497217483083, "rougeL_fmeasure": 0.2052968426221788, "rougeL_fmeasure_stderr": 0.0013740335300801536, "rougeL_precision": 0.1559347512157101, "rougeL_precision_stderr": 0.0012468570328765438, "rougeL_recall": 0.3246942073486663, "rougeL_recall_stderr": 0.002000868521767354, "rougeLsum_fmeasure": 0.20379402537245084, "rougeLsum_fmeasure_stderr": 0.0016105541434803087, "rougeLsum_precision": 0.15543450901475353, "rougeLsum_precision_stderr": 0.0014302666176517367, "rougeLsum_recall": 0.3196428693668727, "rougeLsum_recall_stderr": 0.002242319012699104}}, "1": {"generate_text_restaurant": {"bleu": 6.940570293175053, "bleu_stderr": 0.10104547668423318, "rouge1_fmeasure": 0.3575800722995913, "rouge1_fmeasure_stderr": 0.002127689173825895, "rouge1_precision": 0.3496601719591484, "rouge1_precision_stderr": 0.002849559103688844, "rouge1_recall": 0.42359261277664634, "rouge1_recall_stderr": 0.002763477540076445, "rouge2_fmeasure": 0.142023180586533, "rouge2_fmeasure_stderr": 0.0014957539772810853, "rouge2_precision": 0.1429410376669936, "rouge2_precision_stderr": 0.0020796619782422693, "rouge2_recall": 0.1705613877983603, "rouge2_recall_stderr": 0.0019090729632325418, "rougeL_fmeasure": 0.2540611971473509, "rougeL_fmeasure_stderr": 0.001515044497001407, "rougeL_precision": 0.24812697347482596, "rougeL_precision_stderr": 0.0022073871312320963, "rougeL_recall": 0.30667916863775024, "rougeL_recall_stderr": 0.002283669807725874, "rougeLsum_fmeasure": 0.2953396678867069, "rougeLsum_fmeasure_stderr": 0.0019610430628804647, "rougeLsum_precision": 0.29053903973532214, "rougeLsum_precision_stderr": 0.002642421636933054, "rougeLsum_recall": 0.34937076105340387, "rougeLsum_recall_stderr": 0.002510233957991074}}, "2": {"generate_text_restaurant": {"bleu": 9.578029851318268, "bleu_stderr": 0.201482009124794, "rouge1_fmeasure": 0.38938815590089454, "rouge1_fmeasure_stderr": 0.002015416977580809, "rouge1_precision": 0.39404892212892395, "rouge1_precision_stderr": 0.0028317434398928863, "rouge1_recall": 0.4314560771401939, "rouge1_recall_stderr": 0.002630398723273804, "rouge2_fmeasure": 0.17045725388262045, "rouge2_fmeasure_stderr": 0.0016711304351085842, "rouge2_precision": 0.17484313787847716, "rouge2_precision_stderr": 0.0021450695622140473, "rouge2_recall": 0.19062521235801827, "rouge2_recall_stderr": 0.0020301066688945897, "rougeL_fmeasure": 0.28720479387341713, "rougeL_fmeasure_stderr": 0.00166490138295134, "rougeL_precision": 0.2902089433918642, "rougeL_precision_stderr": 0.002309942464742442, "rougeL_recall": 0.3208704945592194, "rougeL_recall_stderr": 0.0022660826248778083, "rougeLsum_fmeasure": 0.3266334163588423, "rougeLsum_fmeasure_stderr": 0.001973231465683255, "rougeLsum_precision": 0.3310651908517372, "rougeLsum_precision_stderr": 0.002649650064405018, "rougeLsum_recall": 0.36193836102433935, "rougeLsum_recall_stderr": 0.002507014363898419}}, "3": {"generate_text_restaurant": {"bleu": 10.920454433065684, "bleu_stderr": 0.1316943729685155, "rouge1_fmeasure": 0.40421930965797104, "rouge1_fmeasure_stderr": 0.001965580619381596, "rouge1_precision": 0.4192958302681238, "rouge1_precision_stderr": 0.0027833911244728436, "rouge1_recall": 0.43191640398277037, "rouge1_recall_stderr": 0.0025958195715232047, "rouge2_fmeasure": 0.181728403672227, "rouge2_fmeasure_stderr": 0.0017180728141282062, "rouge2_precision": 0.19011172601557672, "rouge2_precision_stderr": 0.002113152497347813, "rouge2_recall": 0.19573521805211624, "rouge2_recall_stderr": 0.002055597667273278, "rougeL_fmeasure": 0.3004132022131482, "rougeL_fmeasure_stderr": 0.001744736531356786, "rougeL_precision": 0.31151155319807905, "rougeL_precision_stderr": 0.002356738120811969, "rougeL_recall": 0.3223236931614774, "rougeL_recall_stderr": 0.0022747777223757824, "rougeLsum_fmeasure": 0.34264109704469076, "rougeLsum_fmeasure_stderr": 0.001982280995883766, "rougeLsum_precision": 0.35498206085286044, "rougeLsum_precision_stderr": 0.002606574588757685, "rougeLsum_recall": 0.3668525311353958, "rougeLsum_recall_stderr": 0.0025367007147142026}}, "4": {"generate_text_restaurant": {"bleu": 11.500897320173257, "bleu_stderr": 0.1956034095087363, "rouge1_fmeasure": 0.41092706000969503, "rouge1_fmeasure_stderr": 0.002013505579199469, "rouge1_precision": 0.42930123866901915, "rouge1_precision_stderr": 0.002802161499400842, "rouge1_recall": 0.4331782904924909, "rouge1_recall_stderr": 0.002577476100131464, "rouge2_fmeasure": 0.18641178852244134, "rouge2_fmeasure_stderr": 0.0017617429227832714, "rouge2_precision": 0.1963741088023231, "rouge2_precision_stderr": 0.002127045475765985, "rouge2_recall": 0.19754483584480773, "rouge2_recall_stderr": 0.002036905880965759, "rougeL_fmeasure": 0.30804933657878797, "rougeL_fmeasure_stderr": 0.0018042408542884843, "rougeL_precision": 0.32184870299119356, "rougeL_precision_stderr": 0.0023948432966585135, "rougeL_recall": 0.32560487667250465, "rougeL_recall_stderr": 0.0022611096432227965, "rougeLsum_fmeasure": 0.35022280659725025, "rougeLsum_fmeasure_stderr": 0.002028073865541317, "rougeLsum_precision": 0.36559763624883224, "rougeLsum_precision_stderr": 0.0026458064257003032, "rougeLsum_recall": 0.3697574496463752, "rougeLsum_recall_stderr": 0.0025156597623565424}}, "5": {"generate_text_restaurant": {"bleu": 11.771270305618712, "bleu_stderr": 0.17412022732055477, "rouge1_fmeasure": 0.41830245139279826, "rouge1_fmeasure_stderr": 0.002021004042601933, "rouge1_precision": 0.43818215510000286, "rouge1_precision_stderr": 0.0028851585855894776, "rouge1_recall": 0.4399123520832634, "rouge1_recall_stderr": 0.0025733621012235895, "rouge2_fmeasure": 0.1917272606223869, "rouge2_fmeasure_stderr": 0.001783066182693068, "rouge2_precision": 0.20257411232135283, "rouge2_precision_stderr": 0.0021820200889344113, "rouge2_recall": 0.20260609597182416, "rouge2_recall_stderr": 0.0020654299321480334, "rougeL_fmeasure": 0.31474259806009264, "rougeL_fmeasure_stderr": 0.0018056447106615293, "rougeL_precision": 0.32996450967513996, "rougeL_precision_stderr": 0.0024791014115638364, "rougeL_recall": 0.331872138433623, "rougeL_recall_stderr": 0.0022597005060796707, "rougeLsum_fmeasure": 0.35845598302759274, "rougeLsum_fmeasure_stderr": 0.002046672331208988, "rougeLsum_precision": 0.37550892623756665, "rougeLsum_precision_stderr": 0.0027651776519762796, "rougeLsum_recall": 0.377168038065161, "rougeLsum_recall_stderr": 0.002496827725922857}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8174302706970622, "bleu_stderr": 0.11196728320067137, "rouge1_fmeasure": 0.20114774044747769, "rouge1_fmeasure_stderr": 0.0025269409205907075, "rouge1_precision": 0.1545989297987524, "rouge1_precision_stderr": 0.0022684654702864685, "rouge1_recall": 0.3221703515084542, "rouge1_recall_stderr": 0.004289581865346496, "rouge2_fmeasure": 0.04288345569115875, "rouge2_fmeasure_stderr": 0.00150423780993115, "rouge2_precision": 0.0319437568430621, "rouge2_precision_stderr": 0.0011634686267094517, "rouge2_recall": 0.07254495732539187, "rouge2_recall_stderr": 0.0026632962610127395, "rougeL_fmeasure": 0.15252407111150582, "rougeL_fmeasure_stderr": 0.001904423395546802, "rougeL_precision": 0.11668807171332557, "rougeL_precision_stderr": 0.0016652450570352347, "rougeL_recall": 0.24679085078761165, "rougeL_recall_stderr": 0.003463508045372628, "rougeLsum_fmeasure": 0.15637245537010086, "rougeLsum_fmeasure_stderr": 0.0020947672689738675, "rougeLsum_precision": 0.11942635042403643, "rougeLsum_precision_stderr": 0.0017706637778327492, "rougeLsum_recall": 0.25332562732507136, "rougeLsum_recall_stderr": 0.0037831592179613394}}, "1": {"article_DOC_summary": {"bleu": 1.552748129219279, "bleu_stderr": 0.09997239921399444, "rouge1_fmeasure": 0.17114372550513476, "rouge1_fmeasure_stderr": 0.0024604571497117834, "rouge1_precision": 0.12170201682595827, "rouge1_precision_stderr": 0.001826044289177986, "rouge1_recall": 0.30043563707916027, "rouge1_recall_stderr": 0.004233363825654979, "rouge2_fmeasure": 0.035379638697656535, "rouge2_fmeasure_stderr": 0.0014234913861855445, "rouge2_precision": 0.02491068120814432, "rouge2_precision_stderr": 0.0010056223310594503, "rouge2_recall": 0.06366125078253994, "rouge2_recall_stderr": 0.0026092736589792367, "rougeL_fmeasure": 0.13569122371994208, "rougeL_fmeasure_stderr": 0.0018778599571687325, "rougeL_precision": 0.09626651393384027, "rougeL_precision_stderr": 0.0013808637405283324, "rougeL_recall": 0.24005794800544122, "rougeL_recall_stderr": 0.0034002667798535, "rougeLsum_fmeasure": 0.13720484475514233, "rougeLsum_fmeasure_stderr": 0.0020361013127471544, "rougeLsum_precision": 0.09732882949507539, "rougeLsum_precision_stderr": 0.001491003899937052, "rougeLsum_recall": 0.24264339775958071, "rougeLsum_recall_stderr": 0.003652067540550535}}, "2": {"article_DOC_summary": {"bleu": 1.3601465950321643, "bleu_stderr": 0.0861315196923141, "rouge1_fmeasure": 0.17309895172368248, "rouge1_fmeasure_stderr": 0.002407562936005181, "rouge1_precision": 0.12278797123169674, "rouge1_precision_stderr": 0.001785131647152879, "rouge1_recall": 0.30543718274767545, "rouge1_recall_stderr": 0.004162923693892446, "rouge2_fmeasure": 0.03502857896138652, "rouge2_fmeasure_stderr": 0.0013726414538592514, "rouge2_precision": 0.0246072866788354, "rouge2_precision_stderr": 0.0009656088007101129, "rouge2_recall": 0.06340535253720762, "rouge2_recall_stderr": 0.0025614084712425903, "rougeL_fmeasure": 0.13712998116139988, "rougeL_fmeasure_stderr": 0.001853089467865337, "rougeL_precision": 0.09711482251051126, "rougeL_precision_stderr": 0.0013612129408171914, "rougeL_recall": 0.24338254656295813, "rougeL_recall_stderr": 0.0033638392897146316, "rougeLsum_fmeasure": 0.13797316536629922, "rougeLsum_fmeasure_stderr": 0.0020045397719122386, "rougeLsum_precision": 0.09765391570730775, "rougeLsum_precision_stderr": 0.0014675271246877423, "rougeLsum_recall": 0.24504917885902605, "rougeLsum_recall_stderr": 0.003598210298390468}}, "3": {"article_DOC_summary": {"bleu": 1.5289039831037767, "bleu_stderr": 0.08704589796771511, "rouge1_fmeasure": 0.17194468346922234, "rouge1_fmeasure_stderr": 0.0025657618680217883, "rouge1_precision": 0.12456236869961217, "rouge1_precision_stderr": 0.001979303032414647, "rouge1_recall": 0.2969986877323206, "rouge1_recall_stderr": 0.004379156286983872, "rouge2_fmeasure": 0.034720433489397975, "rouge2_fmeasure_stderr": 0.0013709396923174736, "rouge2_precision": 0.024705844817679887, "rouge2_precision_stderr": 0.000982394972116292, "rouge2_recall": 0.06152065262458846, "rouge2_recall_stderr": 0.002489362220975216, "rougeL_fmeasure": 0.13492154229574052, "rougeL_fmeasure_stderr": 0.0019677258292475, "rougeL_precision": 0.09749135882170305, "rougeL_precision_stderr": 0.0014954299008710392, "rougeL_recall": 0.23449083710611318, "rougeL_recall_stderr": 0.003493009897335876, "rougeLsum_fmeasure": 0.13730295741486614, "rougeLsum_fmeasure_stderr": 0.002115839730669299, "rougeLsum_precision": 0.09915225156332898, "rougeLsum_precision_stderr": 0.001597792391607745, "rougeLsum_recall": 0.23891448433195075, "rougeLsum_recall_stderr": 0.0037586328246274584}}, "4": {"article_DOC_summary": {"bleu": 0.7057885792239146, "bleu_stderr": 0.09633059952297361, "rouge1_fmeasure": 0.047178072114941544, "rouge1_fmeasure_stderr": 0.002616626424383668, "rouge1_precision": 0.038885594409457096, "rouge1_precision_stderr": 0.002307145421336748, "rouge1_recall": 0.07533535776380973, "rouge1_recall_stderr": 0.004278569489488177, "rouge2_fmeasure": 0.009740454133203196, "rouge2_fmeasure_stderr": 0.0008810391003143671, "rouge2_precision": 0.00767009703914481, "rouge2_precision_stderr": 0.0007436396195559962, "rouge2_recall": 0.016162073507849355, "rouge2_recall_stderr": 0.0015013180480267284, "rougeL_fmeasure": 0.03687428992461382, "rougeL_fmeasure_stderr": 0.0020423490604046203, "rougeL_precision": 0.030629416984214033, "rougeL_precision_stderr": 0.0018394051263934531, "rougeL_recall": 0.059033336787710354, "rougeL_recall_stderr": 0.0033568504886673342, "rougeLsum_fmeasure": 0.03798026533325551, "rougeLsum_fmeasure_stderr": 0.002122845512935527, "rougeLsum_precision": 0.03156727174883922, "rougeLsum_precision_stderr": 0.0018991973218119777, "rougeLsum_recall": 0.060734850890216574, "rougeLsum_recall_stderr": 0.003501714228117917}}, "5": {"article_DOC_summary": {"bleu": 1.4101701229157404e-16, "bleu_stderr": 7.284988438739496e-14, "rouge1_fmeasure": 0.0024087011729151905, "rouge1_fmeasure_stderr": 0.0006704731920197829, "rouge1_precision": 0.0020524139087283043, "rouge1_precision_stderr": 0.000588767108298375, "rouge1_recall": 0.003713621943684625, "rouge1_recall_stderr": 0.0010695365153013802, "rouge2_fmeasure": 0.0003686455308029473, "rouge2_fmeasure_stderr": 0.0001512143971336204, "rouge2_precision": 0.00028228247573819387, "rouge2_precision_stderr": 0.0001152152143834138, "rouge2_recall": 0.000611287528601624, "rouge2_recall_stderr": 0.0002553224416713872, "rougeL_fmeasure": 0.0017850468058708325, "rougeL_fmeasure_stderr": 0.0004908275382504802, "rougeL_precision": 0.0015369529867172342, "rougeL_precision_stderr": 0.00043863667780986944, "rougeL_recall": 0.0026922203626893403, "rougeL_recall_stderr": 0.0007595049688272029, "rougeLsum_fmeasure": 0.0019251742486571363, "rougeLsum_fmeasure_stderr": 0.0005450647907848718, "rougeLsum_precision": 0.0016840361597954422, "rougeLsum_precision_stderr": 0.0005020554020988375, "rougeLsum_recall": 0.0029125954622099127, "rougeLsum_recall_stderr": 0.0008341811116817512}}}}
perplexity25/evaluation/rankeval/perplexity25_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.337,0.014955087918653605,0
3
+ anli_r2,acc,0.339,0.014976758771620344,0
4
+ anli_r3,acc,0.3308333333333333,0.013588208070708986,0
5
+ arc_challenge,acc,0.2645051194539249,0.012889272949313366,0
6
+ arc_challenge,acc_norm,0.29180887372013653,0.013284525292403501,0
7
+ arc_easy,acc,0.5963804713804713,0.010067368960348226,0
8
+ arc_easy,acc_norm,0.5340909090909091,0.010235908103438687,0
9
+ boolq,acc,0.6125382262996942,0.00852066653613694,1
10
+ cb,acc,0.4107142857142857,0.0663363415035954,1
11
+ cb,f1,0.1940928270042194,,1
12
+ copa,acc,0.73,0.0446196043338474,0
13
+ hellaswag,acc,0.43706432981477794,0.00495009555596467,0
14
+ hellaswag,acc_norm,0.5617406891057558,0.004951594063272048,0
15
+ piqa,acc,0.719804134929271,0.010478122015577082,0
16
+ piqa,acc_norm,0.720892274211099,0.010465657948498233,0
17
+ rte,acc,0.5126353790613718,0.030086851767188564,0
18
+ sciq,acc,0.827,0.011967214137559933,0
19
+ sciq,acc_norm,0.751,0.013681600278702301,0
20
+ storycloze_2016,acc,0.7076429716729022,0.010518239729787741,0
21
+ winogrande,acc,0.5808997632202052,0.013867325192210117,0
perplexity25/evaluation/rankeval/perplexity25_0_lm-eval_global_step52452_2023-05-13-01-15-21_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.337,
5
- "acc_stderr": 0.014955087918653605
6
- },
7
- "anli_r2": {
8
- "acc": 0.339,
9
- "acc_stderr": 0.014976758771620344
10
- },
11
- "anli_r3": {
12
- "acc": 0.3308333333333333,
13
- "acc_stderr": 0.013588208070708986
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.1940928270042194
19
- },
20
- "copa": {
21
- "acc": 0.73,
22
- "acc_stderr": 0.0446196043338474
23
- },
24
- "hellaswag": {
25
- "acc": 0.43706432981477794,
26
- "acc_stderr": 0.00495009555596467,
27
- "acc_norm": 0.5617406891057558,
28
- "acc_norm_stderr": 0.004951594063272048
29
- },
30
- "rte": {
31
- "acc": 0.5126353790613718,
32
- "acc_stderr": 0.030086851767188564
33
- },
34
- "winogrande": {
35
- "acc": 0.5808997632202052,
36
- "acc_stderr": 0.013867325192210117
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7076429716729022,
40
- "acc_stderr": 0.010518239729787741
41
- },
42
- "boolq": {
43
- "acc": 0.6125382262996942,
44
- "acc_stderr": 0.00852066653613694
45
- },
46
- "arc_easy": {
47
- "acc": 0.5963804713804713,
48
- "acc_stderr": 0.010067368960348226,
49
- "acc_norm": 0.5340909090909091,
50
- "acc_norm_stderr": 0.010235908103438687
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2645051194539249,
54
- "acc_stderr": 0.012889272949313366,
55
- "acc_norm": 0.29180887372013653,
56
- "acc_norm_stderr": 0.013284525292403501
57
- },
58
- "sciq": {
59
- "acc": 0.827,
60
- "acc_stderr": 0.011967214137559933,
61
- "acc_norm": 0.751,
62
- "acc_norm_stderr": 0.013681600278702301
63
- },
64
- "piqa": {
65
- "acc": 0.719804134929271,
66
- "acc_stderr": 0.010478122015577082,
67
- "acc_norm": 0.720892274211099,
68
- "acc_norm_stderr": 0.010465657948498233
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity25/evaluation/rankeval/perplexity25_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.336,0.014944140233795023,0
3
+ anli_r2,acc,0.338,0.014965960710224484,0
4
+ anli_r3,acc,0.33916666666666667,0.013672343491681817,0
5
+ arc_challenge,acc,0.27303754266211605,0.013019332762635744,0
6
+ arc_challenge,acc_norm,0.2960750853242321,0.013340916085246258,0
7
+ arc_easy,acc,0.6014309764309764,0.010046455400477945,0
8
+ arc_easy,acc_norm,0.5597643097643098,0.010186228624515656,0
9
+ boolq,acc,0.5868501529051988,0.00861211754780358,1
10
+ cb,acc,0.4107142857142857,0.0663363415035954,1
11
+ cb,f1,0.2283333333333333,,1
12
+ copa,acc,0.73,0.044619604333847394,0
13
+ hellaswag,acc,0.43457478589922327,0.004946879874422678,0
14
+ hellaswag,acc_norm,0.5644293965345548,0.0049481813670249584,0
15
+ piqa,acc,0.7127312295973884,0.010557291761528637,0
16
+ piqa,acc_norm,0.7159956474428727,0.010521147542454213,0
17
+ rte,acc,0.5342960288808665,0.030025579819366426,0
18
+ sciq,acc,0.863,0.010878848714333308,0
19
+ sciq,acc_norm,0.837,0.01168621271274684,0
20
+ storycloze_2016,acc,0.706574024585783,0.01052948933474447,0
21
+ winogrande,acc,0.5698500394632992,0.013914685094716701,0
perplexity25/evaluation/rankeval/perplexity25_1_lm-eval_global_step52452_2023-05-13-01-15-21_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.336,
5
- "acc_stderr": 0.014944140233795023
6
- },
7
- "anli_r2": {
8
- "acc": 0.338,
9
- "acc_stderr": 0.014965960710224484
10
- },
11
- "anli_r3": {
12
- "acc": 0.33916666666666667,
13
- "acc_stderr": 0.013672343491681817
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.2283333333333333
19
- },
20
- "copa": {
21
- "acc": 0.73,
22
- "acc_stderr": 0.044619604333847394
23
- },
24
- "hellaswag": {
25
- "acc": 0.43457478589922327,
26
- "acc_stderr": 0.004946879874422678,
27
- "acc_norm": 0.5644293965345548,
28
- "acc_norm_stderr": 0.0049481813670249584
29
- },
30
- "rte": {
31
- "acc": 0.5342960288808665,
32
- "acc_stderr": 0.030025579819366426
33
- },
34
- "winogrande": {
35
- "acc": 0.5698500394632992,
36
- "acc_stderr": 0.013914685094716701
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.706574024585783,
40
- "acc_stderr": 0.01052948933474447
41
- },
42
- "boolq": {
43
- "acc": 0.5868501529051988,
44
- "acc_stderr": 0.00861211754780358
45
- },
46
- "arc_easy": {
47
- "acc": 0.6014309764309764,
48
- "acc_stderr": 0.010046455400477945,
49
- "acc_norm": 0.5597643097643098,
50
- "acc_norm_stderr": 0.010186228624515656
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27303754266211605,
54
- "acc_stderr": 0.013019332762635744,
55
- "acc_norm": 0.2960750853242321,
56
- "acc_norm_stderr": 0.013340916085246258
57
- },
58
- "sciq": {
59
- "acc": 0.863,
60
- "acc_stderr": 0.010878848714333308,
61
- "acc_norm": 0.837,
62
- "acc_norm_stderr": 0.01168621271274684
63
- },
64
- "piqa": {
65
- "acc": 0.7127312295973884,
66
- "acc_stderr": 0.010557291761528637,
67
- "acc_norm": 0.7159956474428727,
68
- "acc_norm_stderr": 0.010521147542454213
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity25/evaluation/rankeval/perplexity25_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.33,0.014876872027456732,0
3
+ anli_r2,acc,0.337,0.014955087918653598,0
4
+ anli_r3,acc,0.3375,0.013655897185463657,0
5
+ arc_challenge,acc,0.27986348122866894,0.013119040897725922,0
6
+ arc_challenge,acc_norm,0.2986348122866894,0.01337407861506875,0
7
+ arc_easy,acc,0.6056397306397306,0.010028176038393,0
8
+ arc_easy,acc_norm,0.5808080808080808,0.010124905282491183,0
9
+ boolq,acc,0.5825688073394495,0.008624990050216684,1
10
+ cb,acc,0.44642857142857145,0.06703189227942398,1
11
+ cb,f1,0.2706349206349206,,1
12
+ copa,acc,0.78,0.04163331998932261,0
13
+ hellaswag,acc,0.43487353116908983,0.004947272454226208,0
14
+ hellaswag,acc_norm,0.5603465445130452,0.004953305461311746,0
15
+ piqa,acc,0.7149075081610446,0.010533270588738937,0
16
+ piqa,acc_norm,0.7110990206746464,0.010575111841364908,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.875,0.010463483381956722,0
19
+ sciq,acc_norm,0.842,0.011539894677559559,0
20
+ storycloze_2016,acc,0.7033671833244255,0.010562819181563227,0
21
+ winogrande,acc,0.5785319652722968,0.013878072377497606,0
perplexity25/evaluation/rankeval/perplexity25_2_lm-eval_global_step52452_2023-05-13-01-15-21_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.33,
5
- "acc_stderr": 0.014876872027456732
6
- },
7
- "anli_r2": {
8
- "acc": 0.337,
9
- "acc_stderr": 0.014955087918653598
10
- },
11
- "anli_r3": {
12
- "acc": 0.3375,
13
- "acc_stderr": 0.013655897185463657
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.2706349206349206
19
- },
20
- "copa": {
21
- "acc": 0.78,
22
- "acc_stderr": 0.04163331998932261
23
- },
24
- "hellaswag": {
25
- "acc": 0.43487353116908983,
26
- "acc_stderr": 0.004947272454226208,
27
- "acc_norm": 0.5603465445130452,
28
- "acc_norm_stderr": 0.004953305461311746
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.5785319652722968,
36
- "acc_stderr": 0.013878072377497606
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7033671833244255,
40
- "acc_stderr": 0.010562819181563227
41
- },
42
- "boolq": {
43
- "acc": 0.5825688073394495,
44
- "acc_stderr": 0.008624990050216684
45
- },
46
- "arc_easy": {
47
- "acc": 0.6056397306397306,
48
- "acc_stderr": 0.010028176038393,
49
- "acc_norm": 0.5808080808080808,
50
- "acc_norm_stderr": 0.010124905282491183
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27986348122866894,
54
- "acc_stderr": 0.013119040897725922,
55
- "acc_norm": 0.2986348122866894,
56
- "acc_norm_stderr": 0.01337407861506875
57
- },
58
- "sciq": {
59
- "acc": 0.875,
60
- "acc_stderr": 0.010463483381956722,
61
- "acc_norm": 0.842,
62
- "acc_norm_stderr": 0.011539894677559559
63
- },
64
- "piqa": {
65
- "acc": 0.7149075081610446,
66
- "acc_stderr": 0.010533270588738937,
67
- "acc_norm": 0.7110990206746464,
68
- "acc_norm_stderr": 0.010575111841364908
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity25/evaluation/rankeval/perplexity25_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.325,0.014818724459095524,0
3
+ anli_r2,acc,0.328,0.014853842487270336,0
4
+ anli_r3,acc,0.3416666666666667,0.013696658778002519,0
5
+ arc_challenge,acc,0.2764505119453925,0.013069662474252427,0
6
+ arc_challenge,acc_norm,0.3003412969283277,0.013395909309957,0
7
+ arc_easy,acc,0.6026936026936027,0.010041053078884277,0
8
+ arc_easy,acc_norm,0.5833333333333334,0.010116282977781253,0
9
+ boolq,acc,0.582262996941896,0.008625883905552707,1
10
+ cb,acc,0.44642857142857145,0.06703189227942397,1
11
+ cb,f1,0.28883861236802416,,1
12
+ copa,acc,0.82,0.038612291966536955,0
13
+ hellaswag,acc,0.4342760406293567,0.004946485466544626,0
14
+ hellaswag,acc_norm,0.5595498904600678,0.0049542655953734695,0
15
+ piqa,acc,0.7170837867247007,0.010508949177489686,0
16
+ piqa,acc_norm,0.7241566920565833,0.010427805502729119,0
17
+ rte,acc,0.5342960288808665,0.030025579819366426,0
18
+ sciq,acc,0.87,0.010640169792499344,0
19
+ sciq,acc_norm,0.849,0.011328165223341674,0
20
+ storycloze_2016,acc,0.7055050774986639,0.010540668963800296,0
21
+ winogrande,acc,0.5769534333070244,0.013885055359056476,0
perplexity25/evaluation/rankeval/perplexity25_3_lm-eval_global_step52452_2023-05-13-01-15-21_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.325,
5
- "acc_stderr": 0.014818724459095524
6
- },
7
- "anli_r2": {
8
- "acc": 0.328,
9
- "acc_stderr": 0.014853842487270336
10
- },
11
- "anli_r3": {
12
- "acc": 0.3416666666666667,
13
- "acc_stderr": 0.013696658778002519
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942397,
18
- "f1": 0.28883861236802416
19
- },
20
- "copa": {
21
- "acc": 0.82,
22
- "acc_stderr": 0.038612291966536955
23
- },
24
- "hellaswag": {
25
- "acc": 0.4342760406293567,
26
- "acc_stderr": 0.004946485466544626,
27
- "acc_norm": 0.5595498904600678,
28
- "acc_norm_stderr": 0.0049542655953734695
29
- },
30
- "rte": {
31
- "acc": 0.5342960288808665,
32
- "acc_stderr": 0.030025579819366426
33
- },
34
- "winogrande": {
35
- "acc": 0.5769534333070244,
36
- "acc_stderr": 0.013885055359056476
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7055050774986639,
40
- "acc_stderr": 0.010540668963800296
41
- },
42
- "boolq": {
43
- "acc": 0.582262996941896,
44
- "acc_stderr": 0.008625883905552707
45
- },
46
- "arc_easy": {
47
- "acc": 0.6026936026936027,
48
- "acc_stderr": 0.010041053078884277,
49
- "acc_norm": 0.5833333333333334,
50
- "acc_norm_stderr": 0.010116282977781253
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2764505119453925,
54
- "acc_stderr": 0.013069662474252427,
55
- "acc_norm": 0.3003412969283277,
56
- "acc_norm_stderr": 0.013395909309957
57
- },
58
- "sciq": {
59
- "acc": 0.87,
60
- "acc_stderr": 0.010640169792499344,
61
- "acc_norm": 0.849,
62
- "acc_norm_stderr": 0.011328165223341674
63
- },
64
- "piqa": {
65
- "acc": 0.7170837867247007,
66
- "acc_stderr": 0.010508949177489686,
67
- "acc_norm": 0.7241566920565833,
68
- "acc_norm_stderr": 0.010427805502729119
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity25/evaluation/rankeval/perplexity25_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.332,0.014899597242811488,0
3
+ anli_r2,acc,0.316,0.014709193056057118,0
4
+ anli_r3,acc,0.3308333333333333,0.013588208070708995,0
5
+ arc_challenge,acc,0.27303754266211605,0.013019332762635746,0
6
+ arc_challenge,acc_norm,0.29436860068259385,0.013318528460539426,0
7
+ arc_easy,acc,0.6136363636363636,0.009991296778159617,0
8
+ arc_easy,acc_norm,0.5812289562289562,0.010123487160167813,0
9
+ boolq,acc,0.5703363914373089,0.00865809540849789,1
10
+ cb,acc,0.44642857142857145,0.06703189227942398,1
11
+ cb,f1,0.28154851684263454,,1
12
+ copa,acc,0.81,0.03942772444036623,0
13
+ hellaswag,acc,0.4366660027882892,0.004949589567678892,0
14
+ hellaswag,acc_norm,0.5635331607249552,0.0049493353568818635,0
15
+ piqa,acc,0.7181719260065288,0.010496675231258166,0
16
+ piqa,acc_norm,0.7132752992383025,0.01055131450310808,0
17
+ rte,acc,0.5342960288808665,0.030025579819366426,0
18
+ sciq,acc,0.88,0.010281328012747386,0
19
+ sciq,acc_norm,0.863,0.010878848714333318,0
20
+ storycloze_2016,acc,0.7151256012827365,0.01043751398661171,0
21
+ winogrande,acc,0.5808997632202052,0.013867325192210117,0
perplexity25/evaluation/rankeval/perplexity25_4_lm-eval_global_step52452_2023-05-13-01-15-21_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.332,
5
- "acc_stderr": 0.014899597242811488
6
- },
7
- "anli_r2": {
8
- "acc": 0.316,
9
- "acc_stderr": 0.014709193056057118
10
- },
11
- "anli_r3": {
12
- "acc": 0.3308333333333333,
13
- "acc_stderr": 0.013588208070708995
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.28154851684263454
19
- },
20
- "copa": {
21
- "acc": 0.81,
22
- "acc_stderr": 0.03942772444036623
23
- },
24
- "hellaswag": {
25
- "acc": 0.4366660027882892,
26
- "acc_stderr": 0.004949589567678892,
27
- "acc_norm": 0.5635331607249552,
28
- "acc_norm_stderr": 0.0049493353568818635
29
- },
30
- "rte": {
31
- "acc": 0.5342960288808665,
32
- "acc_stderr": 0.030025579819366426
33
- },
34
- "winogrande": {
35
- "acc": 0.5808997632202052,
36
- "acc_stderr": 0.013867325192210117
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7151256012827365,
40
- "acc_stderr": 0.01043751398661171
41
- },
42
- "boolq": {
43
- "acc": 0.5703363914373089,
44
- "acc_stderr": 0.00865809540849789
45
- },
46
- "arc_easy": {
47
- "acc": 0.6136363636363636,
48
- "acc_stderr": 0.009991296778159617,
49
- "acc_norm": 0.5812289562289562,
50
- "acc_norm_stderr": 0.010123487160167813
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27303754266211605,
54
- "acc_stderr": 0.013019332762635746,
55
- "acc_norm": 0.29436860068259385,
56
- "acc_norm_stderr": 0.013318528460539426
57
- },
58
- "sciq": {
59
- "acc": 0.88,
60
- "acc_stderr": 0.010281328012747386,
61
- "acc_norm": 0.863,
62
- "acc_norm_stderr": 0.010878848714333318
63
- },
64
- "piqa": {
65
- "acc": 0.7181719260065288,
66
- "acc_stderr": 0.010496675231258166,
67
- "acc_norm": 0.7132752992383025,
68
- "acc_norm_stderr": 0.01055131450310808
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity25/evaluation/rankeval/perplexity25_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.336,0.014944140233795021,0
3
+ anli_r2,acc,0.332,0.014899597242811475,0
4
+ anli_r3,acc,0.3383333333333333,0.013664144006618268,0
5
+ arc_challenge,acc,0.28071672354948807,0.013131238126975576,0
6
+ arc_challenge,acc_norm,0.3003412969283277,0.013395909309956995,0
7
+ arc_easy,acc,0.6085858585858586,0.01001491753262781,0
8
+ arc_easy,acc_norm,0.5909090909090909,0.010088775152615786,0
9
+ boolq,acc,0.5779816513761468,0.008638040428462952,1
10
+ cb,acc,0.48214285714285715,0.0673769750864465,1
11
+ cb,f1,0.30810810810810807,,1
12
+ copa,acc,0.8,0.04020151261036845,0
13
+ hellaswag,acc,0.43198566022704643,0.004943400892881053,0
14
+ hellaswag,acc_norm,0.5660227046405099,0.004946089230153026,0
15
+ piqa,acc,0.7187159956474428,0.010490509832327423,0
16
+ piqa,acc_norm,0.7165397170837867,0.010515057791152041,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.895,0.009698921026024961,0
19
+ sciq,acc_norm,0.878,0.010354864712936694,0
20
+ storycloze_2016,acc,0.7097808658471406,0.010495529690730063,0
21
+ winogrande,acc,0.5730071033938438,0.013901878072575055,0
perplexity25/evaluation/rankeval/perplexity25_5_lm-eval_global_step52452_2023-05-13-01-15-21_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.336,
5
- "acc_stderr": 0.014944140233795021
6
- },
7
- "anli_r2": {
8
- "acc": 0.332,
9
- "acc_stderr": 0.014899597242811475
10
- },
11
- "anli_r3": {
12
- "acc": 0.3383333333333333,
13
- "acc_stderr": 0.013664144006618268
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.30810810810810807
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.04020151261036845
23
- },
24
- "hellaswag": {
25
- "acc": 0.43198566022704643,
26
- "acc_stderr": 0.004943400892881053,
27
- "acc_norm": 0.5660227046405099,
28
- "acc_norm_stderr": 0.004946089230153026
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.5730071033938438,
36
- "acc_stderr": 0.013901878072575055
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7097808658471406,
40
- "acc_stderr": 0.010495529690730063
41
- },
42
- "boolq": {
43
- "acc": 0.5779816513761468,
44
- "acc_stderr": 0.008638040428462952
45
- },
46
- "arc_easy": {
47
- "acc": 0.6085858585858586,
48
- "acc_stderr": 0.01001491753262781,
49
- "acc_norm": 0.5909090909090909,
50
- "acc_norm_stderr": 0.010088775152615786
51
- },
52
- "arc_challenge": {
53
- "acc": 0.28071672354948807,
54
- "acc_stderr": 0.013131238126975576,
55
- "acc_norm": 0.3003412969283277,
56
- "acc_norm_stderr": 0.013395909309956995
57
- },
58
- "sciq": {
59
- "acc": 0.895,
60
- "acc_stderr": 0.009698921026024961,
61
- "acc_norm": 0.878,
62
- "acc_norm_stderr": 0.010354864712936694
63
- },
64
- "piqa": {
65
- "acc": 0.7187159956474428,
66
- "acc_stderr": 0.010490509832327423,
67
- "acc_norm": 0.7165397170837867,
68
- "acc_norm_stderr": 0.010515057791152041
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/3511463.err ADDED
The diff for this file is too large to render. See raw diff
 
perplexity50/3511463.out ADDED
The diff for this file is too large to render. See raw diff
 
perplexity50/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.04470849859270079
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.04470849859270079
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1690464759571803
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1690464759571803
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.19315023086977673
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.19315023086977673
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.19799186403146818
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.19799186403146818
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.20046771550907586
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.20046771550907586
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.20204503717178296
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.20204503717178296
14
+ e2e_nlg_cleaned,5,average,multiple,0.16790163702199748
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04696069247147477
16
+ gem_xsum,0,median,rouge2_fmeasure,0.04696069247147477
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.0380992656895211
18
+ gem_xsum,1,median,rouge2_fmeasure,0.0380992656895211
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04236560512098692
20
+ gem_xsum,2,median,rouge2_fmeasure,0.04236560512098692
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04083587951791368
22
+ gem_xsum,3,median,rouge2_fmeasure,0.04083587951791368
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.011004483045456277
24
+ gem_xsum,4,median,rouge2_fmeasure,0.011004483045456277
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00046142390179033124
26
+ gem_xsum,5,median,rouge2_fmeasure,0.00046142390179033124
27
+ gem_xsum,5,average,multiple,0.029954558291190512
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.043062923489258886
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.043062923489258886
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05003844184852099
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.05003844184852099
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.052510412804003204
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.052510412804003204
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05249781726283433
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05249781726283433
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05356639642925066
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.05356639642925066
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.053747641849556244
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.053747641849556244
40
+ web_nlg_en,5,average,multiple,0.05090393894723739
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03605723207630101
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03605723207630101
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05061691983629231
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05061691983629231
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.052038438662649955
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.052038438662649955
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04434221317654373
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04434221317654373
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.014683899167593325
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.014683899167593325
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0025645649630508895
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0025645649630508895
53
+ wiki_lingua_en,5,average,multiple,0.0333838779804052
perplexity50/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.22823240153546337, "bleu_stderr": 0.019801328507653555, "rouge1_fmeasure": 0.09278812248464041, "rouge1_fmeasure_stderr": 0.0019451208940882462, "rouge1_precision": 0.06488132852178101, "rouge1_precision_stderr": 0.0019983200287404636, "rouge1_recall": 0.2694613973506335, "rouge1_recall_stderr": 0.004689917452744887, "rouge2_fmeasure": 0.043062923489258886, "rouge2_fmeasure_stderr": 0.0011955223112274191, "rouge2_precision": 0.03049969544111268, "rouge2_precision_stderr": 0.0014195164318779048, "rouge2_recall": 0.12616507029445545, "rouge2_recall_stderr": 0.003145765839594005, "rougeL_fmeasure": 0.08987712765401562, "rougeL_fmeasure_stderr": 0.0018570238876570958, "rougeL_precision": 0.06277531818573256, "rougeL_precision_stderr": 0.0019255483147167943, "rougeL_recall": 0.26204267960081423, "rougeL_recall_stderr": 0.004572745714241958, "rougeLsum_fmeasure": 0.08897530462885096, "rougeLsum_fmeasure_stderr": 0.0018637844480694857, "rougeLsum_precision": 0.062396574817472954, "rougeLsum_precision_stderr": 0.001949378798188583, "rougeLsum_recall": 0.2575118059057066, "rougeLsum_recall_stderr": 0.004447426802462746}}, "1": {"PALM_prompt": {"bleu": 0.3661841801825079, "bleu_stderr": 0.02563149821206728, "rouge1_fmeasure": 0.10942376581457419, "rouge1_fmeasure_stderr": 0.0018798780952399333, "rouge1_precision": 0.07083770111936527, "rouge1_precision_stderr": 0.0014702310183376884, "rouge1_recall": 0.3470278622959199, "rouge1_recall_stderr": 0.0049553242440100185, "rouge2_fmeasure": 0.05003844184852099, "rouge2_fmeasure_stderr": 0.0011747503323877196, "rouge2_precision": 0.03224100861794047, "rouge2_precision_stderr": 0.0008797529232351612, "rouge2_recall": 0.1629607516749881, "rouge2_recall_stderr": 0.003471799684879311, "rougeL_fmeasure": 0.10321985245908673, "rougeL_fmeasure_stderr": 0.0017162487713120534, "rougeL_precision": 0.06675253029318402, "rougeL_precision_stderr": 0.0013359347871140462, "rougeL_recall": 0.3264833708205708, "rougeL_recall_stderr": 0.004550418676968034, "rougeLsum_fmeasure": 0.10443938348190776, "rougeLsum_fmeasure_stderr": 0.0017943553372490961, "rougeLsum_precision": 0.06768091521368075, "rougeLsum_precision_stderr": 0.0014117297702072977, "rougeLsum_recall": 0.32967316098048005, "rougeLsum_recall_stderr": 0.004616088914018551}}, "2": {"PALM_prompt": {"bleu": 0.4407900141469654, "bleu_stderr": 0.0323167010621195, "rouge1_fmeasure": 0.11440451965537006, "rouge1_fmeasure_stderr": 0.0018124345601119877, "rouge1_precision": 0.07367342181261477, "rouge1_precision_stderr": 0.0014829437502766741, "rouge1_recall": 0.3719044219842431, "rouge1_recall_stderr": 0.00477089503485067, "rouge2_fmeasure": 0.052510412804003204, "rouge2_fmeasure_stderr": 0.001161074783078678, "rouge2_precision": 0.03399629764627388, "rouge2_precision_stderr": 0.0009924732443245534, "rouge2_recall": 0.17886553819187756, "rouge2_recall_stderr": 0.003495523044565052, "rougeL_fmeasure": 0.1081021771197844, "rougeL_fmeasure_stderr": 0.0016842278535025275, "rougeL_precision": 0.06963671261885444, "rougeL_precision_stderr": 0.001383680359402878, "rougeL_recall": 0.3486950741986897, "rougeL_recall_stderr": 0.0043656658470662135, "rougeLsum_fmeasure": 0.10922703731482176, "rougeLsum_fmeasure_stderr": 0.0017377919109765759, "rougeLsum_precision": 0.07041609903242399, "rougeLsum_precision_stderr": 0.0014339871134255062, "rougeLsum_recall": 0.3537894575591342, "rougeLsum_recall_stderr": 0.004485079469708994}}, "3": {"PALM_prompt": {"bleu": 0.4466003889413579, "bleu_stderr": 0.029566651496688107, "rouge1_fmeasure": 0.11485365257915954, "rouge1_fmeasure_stderr": 0.0017656536007023573, "rouge1_precision": 0.07299482462643084, "rouge1_precision_stderr": 0.0013027934781524187, "rouge1_recall": 0.3766877383768923, "rouge1_recall_stderr": 0.004855020535693255, "rouge2_fmeasure": 0.05249781726283433, "rouge2_fmeasure_stderr": 0.0011107151159447624, "rouge2_precision": 0.03319871477020421, "rouge2_precision_stderr": 0.0007814167138878587, "rouge2_recall": 0.18122070914268562, "rouge2_recall_stderr": 0.0035476324822892387, "rougeL_fmeasure": 0.10790582240037933, "rougeL_fmeasure_stderr": 0.0016126105841386515, "rougeL_precision": 0.06853966061247058, "rougeL_precision_stderr": 0.001173030755309074, "rougeL_recall": 0.35045572978503337, "rougeL_recall_stderr": 0.004340732169898299, "rougeLsum_fmeasure": 0.1093595107745124, "rougeLsum_fmeasure_stderr": 0.0016743636963225566, "rougeLsum_precision": 0.0695306625372233, "rougeLsum_precision_stderr": 0.0012361693266576337, "rougeLsum_recall": 0.3569668908645508, "rougeLsum_recall_stderr": 0.004507395219482628}}, "4": {"PALM_prompt": {"bleu": 0.487050312474933, "bleu_stderr": 0.03607205735455871, "rouge1_fmeasure": 0.11648487474193464, "rouge1_fmeasure_stderr": 0.0017721469641833675, "rouge1_precision": 0.07396468654034809, "rouge1_precision_stderr": 0.001288920146199216, "rouge1_recall": 0.3813221117629107, "rouge1_recall_stderr": 0.004861483986307508, "rouge2_fmeasure": 0.05356639642925066, "rouge2_fmeasure_stderr": 0.0011294372559632658, "rouge2_precision": 0.03382639123113972, "rouge2_precision_stderr": 0.0007889725664873102, "rouge2_recall": 0.18653704114196332, "rouge2_recall_stderr": 0.0036151812865906443, "rougeL_fmeasure": 0.10916764420003369, "rougeL_fmeasure_stderr": 0.001618727645899262, "rougeL_precision": 0.06930980722047882, "rougeL_precision_stderr": 0.0011715656957547833, "rougeL_recall": 0.3554813029943238, "rougeL_recall_stderr": 0.004410895398774957, "rougeLsum_fmeasure": 0.11097231310945427, "rougeLsum_fmeasure_stderr": 0.001686654148524125, "rougeLsum_precision": 0.07051571464958685, "rougeLsum_precision_stderr": 0.0012288916026023105, "rougeLsum_recall": 0.3623967921619407, "rougeLsum_recall_stderr": 0.004563172681319971}}, "5": {"PALM_prompt": {"bleu": 0.5397098988068925, "bleu_stderr": 0.036026287961208193, "rouge1_fmeasure": 0.11701396324144628, "rouge1_fmeasure_stderr": 0.0017917711999714296, "rouge1_precision": 0.07425114855720495, "rouge1_precision_stderr": 0.0013099248484394475, "rouge1_recall": 0.3830995799564378, "rouge1_recall_stderr": 0.00489237854555699, "rouge2_fmeasure": 0.053747641849556244, "rouge2_fmeasure_stderr": 0.0011442658690604846, "rouge2_precision": 0.03392002961701272, "rouge2_precision_stderr": 0.0008050168112872793, "rouge2_recall": 0.18691540396802492, "rouge2_recall_stderr": 0.0035729305211479787, "rougeL_fmeasure": 0.10873350954319214, "rougeL_fmeasure_stderr": 0.001623917279486683, "rougeL_precision": 0.06897611448091759, "rougeL_precision_stderr": 0.0011802923800557673, "rougeL_recall": 0.35391589444854826, "rougeL_recall_stderr": 0.004352932319637079, "rougeLsum_fmeasure": 0.11104846189066431, "rougeLsum_fmeasure_stderr": 0.0016911151955166914, "rougeLsum_precision": 0.07051724516851579, "rougeLsum_precision_stderr": 0.0012374326489376225, "rougeLsum_recall": 0.36206603074240407, "rougeLsum_recall_stderr": 0.004520194020448341}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.448281189434789, "bleu_stderr": 0.043687187905172974, "rouge1_fmeasure": 0.17597234365678907, "rouge1_fmeasure_stderr": 0.0018425883141842628, "rouge1_precision": 0.15008607008803437, "rouge1_precision_stderr": 0.0018601006191885032, "rouge1_recall": 0.2557790788700999, "rouge1_recall_stderr": 0.0026277056136050623, "rouge2_fmeasure": 0.03605723207630101, "rouge2_fmeasure_stderr": 0.0008357986241834882, "rouge2_precision": 0.030323067859554097, "rouge2_precision_stderr": 0.0007317604807993408, "rouge2_recall": 0.05443329398449302, "rouge2_recall_stderr": 0.00141532094214201, "rougeL_fmeasure": 0.1423375418286688, "rougeL_fmeasure_stderr": 0.0013578084929713739, "rougeL_precision": 0.12015006900667996, "rougeL_precision_stderr": 0.0013538934502534572, "rougeL_recall": 0.2114978327363309, "rougeL_recall_stderr": 0.002161888757192885, "rougeLsum_fmeasure": 0.16018030489449112, "rougeLsum_fmeasure_stderr": 0.001662237048905438, "rougeLsum_precision": 0.13635035795299258, "rougeLsum_precision_stderr": 0.0016762285015091921, "rougeLsum_recall": 0.23411616572646593, "rougeLsum_recall_stderr": 0.0024292424381029474}}, "1": {"tldr_en": {"bleu": 2.4771863151121267, "bleu_stderr": 0.06776329006842716, "rouge1_fmeasure": 0.20613079171954835, "rouge1_fmeasure_stderr": 0.00198790447206624, "rouge1_precision": 0.1799251922119011, "rouge1_precision_stderr": 0.0021296048274116225, "rouge1_recall": 0.29307731722151764, "rouge1_recall_stderr": 0.0028769505493505395, "rouge2_fmeasure": 0.05061691983629231, "rouge2_fmeasure_stderr": 0.0010260625512296302, "rouge2_precision": 0.04408218261083662, "rouge2_precision_stderr": 0.0009705977120723668, "rouge2_recall": 0.07439629666386674, "rouge2_recall_stderr": 0.0017087707910905313, "rougeL_fmeasure": 0.15685595746582018, "rougeL_fmeasure_stderr": 0.0013926553308231432, "rougeL_precision": 0.13570179337405314, "rougeL_precision_stderr": 0.001482289478470975, "rougeL_recall": 0.22754064537842192, "rougeL_recall_stderr": 0.0022783134373401074, "rougeLsum_fmeasure": 0.1915120511455412, "rougeLsum_fmeasure_stderr": 0.0018508952358904137, "rougeLsum_precision": 0.16695539260829864, "rougeLsum_precision_stderr": 0.0019794474426647083, "rougeLsum_recall": 0.27322645374414084, "rougeLsum_recall_stderr": 0.0027198090833989628}}, "2": {"tldr_en": {"bleu": 2.5971654922191916, "bleu_stderr": 0.07231754735552207, "rouge1_fmeasure": 0.20579252650321492, "rouge1_fmeasure_stderr": 0.0019347999276799514, "rouge1_precision": 0.18193541117316303, "rouge1_precision_stderr": 0.002129022757679657, "rouge1_recall": 0.2873271136787168, "rouge1_recall_stderr": 0.0027757324346008626, "rouge2_fmeasure": 0.052038438662649955, "rouge2_fmeasure_stderr": 0.001033768052074922, "rouge2_precision": 0.04585562124835071, "rouge2_precision_stderr": 0.000987613878134528, "rouge2_recall": 0.07485086144628574, "rouge2_recall_stderr": 0.0016500579154340113, "rougeL_fmeasure": 0.16233798956689643, "rougeL_fmeasure_stderr": 0.0014285375845216534, "rougeL_precision": 0.14212357929533947, "rougeL_precision_stderr": 0.001534668369261251, "rougeL_recall": 0.23092536679601577, "rougeL_recall_stderr": 0.0023083235968400194, "rougeLsum_fmeasure": 0.19060273378557263, "rougeLsum_fmeasure_stderr": 0.0017906790063786202, "rougeLsum_precision": 0.16802587348839354, "rougeLsum_precision_stderr": 0.0019533651000337255, "rougeLsum_recall": 0.26735252969355866, "rougeLsum_recall_stderr": 0.002625978487494257}}, "3": {"tldr_en": {"bleu": 2.6344153317715175, "bleu_stderr": 0.08503440436205431, "rouge1_fmeasure": 0.1743109316643425, "rouge1_fmeasure_stderr": 0.002206836232239465, "rouge1_precision": 0.16050894171948427, "rouge1_precision_stderr": 0.0024100514703960206, "rouge1_recall": 0.24283730708449638, "rouge1_recall_stderr": 0.0032126308751215795, "rouge2_fmeasure": 0.04434221317654373, "rouge2_fmeasure_stderr": 0.0010212597712348022, "rouge2_precision": 0.040363396793793804, "rouge2_precision_stderr": 0.0010438141177973716, "rouge2_recall": 0.0642798756210113, "rouge2_recall_stderr": 0.0016775914985579477, "rougeL_fmeasure": 0.13765815446156848, "rougeL_fmeasure_stderr": 0.001669109955926649, "rougeL_precision": 0.1263857167912172, "rougeL_precision_stderr": 0.0018569367920588345, "rougeL_recall": 0.19505495367647274, "rougeL_recall_stderr": 0.002639115007673249, "rougeLsum_fmeasure": 0.1625350254941212, "rougeLsum_fmeasure_stderr": 0.0020529206650028065, "rougeLsum_precision": 0.14962894160355625, "rougeLsum_precision_stderr": 0.0022515883799491287, "rougeLsum_recall": 0.2273490498335059, "rougeLsum_recall_stderr": 0.0030400460462734805}}, "4": {"tldr_en": {"bleu": 0.6935784123394136, "bleu_stderr": 0.037117118101581, "rouge1_fmeasure": 0.058334104546958664, "rouge1_fmeasure_stderr": 0.0019424473260893508, "rouge1_precision": 0.05430645822023474, "rouge1_precision_stderr": 0.0019806913087909678, "rouge1_recall": 0.08525381293697526, "rouge1_recall_stderr": 0.0029115181219816705, "rouge2_fmeasure": 0.014683899167593325, "rouge2_fmeasure_stderr": 0.0006904611688238195, "rouge2_precision": 0.013305245187707022, "rouge2_precision_stderr": 0.0006795482619014736, "rouge2_recall": 0.023040441919977284, "rouge2_recall_stderr": 0.0012094550673333144, "rougeL_fmeasure": 0.0453971126653514, "rougeL_fmeasure_stderr": 0.0014854698835850076, "rougeL_precision": 0.04223912463330309, "rougeL_precision_stderr": 0.0015456523042882138, "rougeL_recall": 0.06789064162959188, "rougeL_recall_stderr": 0.002360717874503064, "rougeLsum_fmeasure": 0.054378661639440586, "rougeLsum_fmeasure_stderr": 0.0018102104175546382, "rougeLsum_precision": 0.050625780842237005, "rougeLsum_precision_stderr": 0.001851652248586665, "rougeLsum_recall": 0.0798186917428021, "rougeLsum_recall_stderr": 0.0027367896999417234}}, "5": {"tldr_en": {"bleu": 4.942519396132508e-06, "bleu_stderr": 7.900655099204328e-06, "rouge1_fmeasure": 0.00973539490830721, "rouge1_fmeasure_stderr": 0.0008844226509773655, "rouge1_precision": 0.009892829148004599, "rouge1_precision_stderr": 0.0009735613257064788, "rouge1_recall": 0.014431890646129451, "rouge1_recall_stderr": 0.0013569083898125143, "rouge2_fmeasure": 0.0025645649630508895, "rouge2_fmeasure_stderr": 0.00032276345763389736, "rouge2_precision": 0.00252421633933382, "rouge2_precision_stderr": 0.000377998208069803, "rouge2_recall": 0.004045696439072071, "rouge2_recall_stderr": 0.0005345734135970881, "rougeL_fmeasure": 0.007755952948994762, "rougeL_fmeasure_stderr": 0.0007020625911996639, "rougeL_precision": 0.007913179064992113, "rougeL_precision_stderr": 0.0007973074868105818, "rougeL_recall": 0.011792588225184406, "rougeL_recall_stderr": 0.0011400182766348275, "rougeLsum_fmeasure": 0.009070666142968765, "rougeLsum_fmeasure_stderr": 0.0008231845717527685, "rougeLsum_precision": 0.00914536574048143, "rougeLsum_precision_stderr": 0.0008957548358299954, "rougeLsum_recall": 0.013493539148258968, "rougeLsum_recall_stderr": 0.0012759343183650173}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 2.1930690971863127, "bleu_stderr": 0.056687850782170965, "rouge1_fmeasure": 0.1878406531881343, "rouge1_fmeasure_stderr": 0.0012989451234580226, "rouge1_precision": 0.1429481399444759, "rouge1_precision_stderr": 0.0011565979029840442, "rouge1_recall": 0.29077511497038705, "rouge1_recall_stderr": 0.0018252280720869973, "rouge2_fmeasure": 0.04470849859270079, "rouge2_fmeasure_stderr": 0.0009406338502407552, "rouge2_precision": 0.03387362544764098, "rouge2_precision_stderr": 0.0007280854317065066, "rouge2_recall": 0.06980838714554283, "rouge2_recall_stderr": 0.0014665741377635003, "rougeL_fmeasure": 0.18167840406484337, "rougeL_fmeasure_stderr": 0.0011691157929579112, "rougeL_precision": 0.1379239272755364, "rougeL_precision_stderr": 0.0010249937289556548, "rougeL_recall": 0.282055407082142, "rougeL_recall_stderr": 0.0016756844442888477, "rougeLsum_fmeasure": 0.14684587587101583, "rougeLsum_fmeasure_stderr": 0.0012655964375177, "rougeLsum_precision": 0.11190322389127284, "rougeLsum_precision_stderr": 0.0010864937044464242, "rougeLsum_recall": 0.2269219536298954, "rougeLsum_recall_stderr": 0.0018279163341351945}}, "1": {"generate_text_restaurant": {"bleu": 9.418632832486113, "bleu_stderr": 0.15892088683907937, "rouge1_fmeasure": 0.3954201252360898, "rouge1_fmeasure_stderr": 0.0021957704553682567, "rouge1_precision": 0.47163188984542315, "rouge1_precision_stderr": 0.0032775365220683987, "rouge1_recall": 0.38270380914695573, "rouge1_recall_stderr": 0.0026942942607581883, "rouge2_fmeasure": 0.1690464759571803, "rouge2_fmeasure_stderr": 0.001791849078574847, "rouge2_precision": 0.20643841115958284, "rouge2_precision_stderr": 0.002452864241411027, "rouge2_recall": 0.16255723484968757, "rouge2_recall_stderr": 0.0018974959638824289, "rougeL_fmeasure": 0.2918320542459967, "rougeL_fmeasure_stderr": 0.0018703627623678988, "rougeL_precision": 0.3500552455605647, "rougeL_precision_stderr": 0.002819656762714209, "rougeL_recall": 0.2825970007466028, "rougeL_recall_stderr": 0.0022176737240157613, "rougeLsum_fmeasure": 0.326122364264301, "rougeLsum_fmeasure_stderr": 0.002124773392271949, "rougeLsum_precision": 0.3894346048213154, "rougeLsum_precision_stderr": 0.0030440975161392437, "rougeLsum_recall": 0.3156932402715717, "rougeLsum_recall_stderr": 0.0024964558829624114}}, "2": {"generate_text_restaurant": {"bleu": 11.157046042091494, "bleu_stderr": 0.22230912389093138, "rouge1_fmeasure": 0.41600816576099753, "rouge1_fmeasure_stderr": 0.002175752395461552, "rouge1_precision": 0.49125485302264194, "rouge1_precision_stderr": 0.003436345592117948, "rouge1_recall": 0.40692976602128633, "rouge1_recall_stderr": 0.0026701582120549137, "rouge2_fmeasure": 0.19315023086977673, "rouge2_fmeasure_stderr": 0.0018551400655270958, "rouge2_precision": 0.23319168418773684, "rouge2_precision_stderr": 0.0026655336361967345, "rouge2_recall": 0.18866131101543013, "rouge2_recall_stderr": 0.0020054934856301864, "rougeL_fmeasure": 0.31720505793101433, "rougeL_fmeasure_stderr": 0.0019047277615318289, "rougeL_precision": 0.37587753599102547, "rougeL_precision_stderr": 0.0030096046402435773, "rougeL_recall": 0.31127927557367574, "rougeL_recall_stderr": 0.0022995330104551115, "rougeLsum_fmeasure": 0.35124044195594184, "rougeLsum_fmeasure_stderr": 0.0021467843216000176, "rougeLsum_precision": 0.4153979997965865, "rougeLsum_precision_stderr": 0.003260911582709945, "rougeLsum_recall": 0.34370226391464437, "rougeLsum_recall_stderr": 0.0025225743998368595}}, "3": {"generate_text_restaurant": {"bleu": 11.85176645344793, "bleu_stderr": 0.2111782087019858, "rouge1_fmeasure": 0.41943531430791064, "rouge1_fmeasure_stderr": 0.0021527237387675337, "rouge1_precision": 0.491635847758295, "rouge1_precision_stderr": 0.0033990781081896203, "rouge1_recall": 0.4083984861443366, "rouge1_recall_stderr": 0.002572988732153086, "rouge2_fmeasure": 0.19799186403146818, "rouge2_fmeasure_stderr": 0.0018867520137521249, "rouge2_precision": 0.23811538412663988, "rouge2_precision_stderr": 0.0027210673636900535, "rouge2_recall": 0.19191761966553353, "rouge2_recall_stderr": 0.0019828033145899418, "rougeL_fmeasure": 0.3222366071758124, "rougeL_fmeasure_stderr": 0.0019270427825323075, "rougeL_precision": 0.3791841332466743, "rougeL_precision_stderr": 0.0030188679062324066, "rougeL_recall": 0.31428033033093916, "rougeL_recall_stderr": 0.0022404895896611845, "rougeLsum_fmeasure": 0.35705938021982625, "rougeLsum_fmeasure_stderr": 0.002152684592453826, "rougeLsum_precision": 0.41907603224901663, "rougeLsum_precision_stderr": 0.003233518240730724, "rougeLsum_recall": 0.34758989478390145, "rougeLsum_recall_stderr": 0.002468301879990052}}, "4": {"generate_text_restaurant": {"bleu": 12.213594934959904, "bleu_stderr": 0.17055645700844593, "rouge1_fmeasure": 0.42203788611472176, "rouge1_fmeasure_stderr": 0.002140927059483002, "rouge1_precision": 0.49389784953566224, "rouge1_precision_stderr": 0.003360677973991826, "rouge1_recall": 0.4084911763885813, "rouge1_recall_stderr": 0.0025681818250945453, "rouge2_fmeasure": 0.20046771550907586, "rouge2_fmeasure_stderr": 0.001900500158864706, "rouge2_precision": 0.24027533133594514, "rouge2_precision_stderr": 0.002717518145210136, "rouge2_recall": 0.1931689047942646, "rouge2_recall_stderr": 0.0019976751697372373, "rougeL_fmeasure": 0.3250827572900466, "rougeL_fmeasure_stderr": 0.0019242346482233237, "rougeL_precision": 0.3818307774217645, "rougeL_precision_stderr": 0.002997133945547209, "rougeL_recall": 0.314926852332832, "rougeL_recall_stderr": 0.00222944689068758, "rougeLsum_fmeasure": 0.36123637490493443, "rougeLsum_fmeasure_stderr": 0.0021695678925136796, "rougeLsum_precision": 0.4230876627442121, "rougeLsum_precision_stderr": 0.003228575618923689, "rougeLsum_recall": 0.3496643183193398, "rougeLsum_recall_stderr": 0.002491187104271206}}, "5": {"generate_text_restaurant": {"bleu": 12.236709448372414, "bleu_stderr": 0.14843010141725657, "rouge1_fmeasure": 0.4253127800982302, "rouge1_fmeasure_stderr": 0.0020707929304555938, "rouge1_precision": 0.4997594688814884, "rouge1_precision_stderr": 0.003305753668973342, "rouge1_recall": 0.4090412035280527, "rouge1_recall_stderr": 0.00251586811815331, "rouge2_fmeasure": 0.20204503717178296, "rouge2_fmeasure_stderr": 0.0018444156267695426, "rouge2_precision": 0.24277805797177893, "rouge2_precision_stderr": 0.0026344699743377476, "rouge2_recall": 0.19353472778536238, "rouge2_recall_stderr": 0.0019368139370174791, "rougeL_fmeasure": 0.3273140656892835, "rougeL_fmeasure_stderr": 0.0019260484275470976, "rougeL_precision": 0.3858614644050571, "rougeL_precision_stderr": 0.0029726208789718437, "rougeL_recall": 0.3148985662207896, "rougeL_recall_stderr": 0.002219870397709864, "rougeLsum_fmeasure": 0.36408266488946983, "rougeLsum_fmeasure_stderr": 0.0021286587722579102, "rougeLsum_precision": 0.4284452395301663, "rougeLsum_precision_stderr": 0.003202306041199407, "rougeLsum_recall": 0.34976945417244226, "rougeLsum_recall_stderr": 0.0024232597648873983}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.988238441001108, "bleu_stderr": 0.10949772127470961, "rouge1_fmeasure": 0.20686542088274673, "rouge1_fmeasure_stderr": 0.0024902036554014517, "rouge1_precision": 0.15302415683116935, "rouge1_precision_stderr": 0.0019851919515352376, "rouge1_recall": 0.3456886390551113, "rouge1_recall_stderr": 0.004315559637897738, "rouge2_fmeasure": 0.04696069247147477, "rouge2_fmeasure_stderr": 0.0015159747704938896, "rouge2_precision": 0.034018860231617355, "rouge2_precision_stderr": 0.001109162098832389, "rouge2_recall": 0.08143980891307401, "rouge2_recall_stderr": 0.002687566610952875, "rougeL_fmeasure": 0.15803246501095908, "rougeL_fmeasure_stderr": 0.001921548169434307, "rougeL_precision": 0.11681806758450688, "rougeL_precision_stderr": 0.0015393348907235753, "rougeL_recall": 0.2653922138123509, "rougeL_recall_stderr": 0.0034550152868841446, "rougeLsum_fmeasure": 0.16176874488861362, "rougeLsum_fmeasure_stderr": 0.0021510210289138135, "rougeLsum_precision": 0.11940647407681726, "rougeLsum_precision_stderr": 0.0016788400209633819, "rougeLsum_recall": 0.27201835686506154, "rougeLsum_recall_stderr": 0.003841749314846245}}, "1": {"article_DOC_summary": {"bleu": 1.5909474225090259, "bleu_stderr": 0.08800468492254938, "rouge1_fmeasure": 0.18507867023037966, "rouge1_fmeasure_stderr": 0.0025273674673725126, "rouge1_precision": 0.13152298566138992, "rouge1_precision_stderr": 0.0018868845425239439, "rouge1_recall": 0.3254583697191521, "rouge1_recall_stderr": 0.004322661638768359, "rouge2_fmeasure": 0.0380992656895211, "rouge2_fmeasure_stderr": 0.0014717030228569784, "rouge2_precision": 0.02689834498641723, "rouge2_precision_stderr": 0.0010461133533795838, "rouge2_recall": 0.06825421680606394, "rouge2_recall_stderr": 0.002680033168291131, "rougeL_fmeasure": 0.14309819474429272, "rougeL_fmeasure_stderr": 0.0018603866545710576, "rougeL_precision": 0.10149961917396043, "rougeL_precision_stderr": 0.00137753127127783, "rougeL_recall": 0.2532452910717512, "rougeL_recall_stderr": 0.003347172355658309, "rougeLsum_fmeasure": 0.14818866993420235, "rougeLsum_fmeasure_stderr": 0.002096775864782495, "rougeLsum_precision": 0.10503370198820076, "rougeLsum_precision_stderr": 0.001543510074115588, "rougeLsum_recall": 0.26248948719670867, "rougeLsum_recall_stderr": 0.0037288758856769653}}, "2": {"article_DOC_summary": {"bleu": 1.6356065306879652, "bleu_stderr": 0.08830570761410386, "rouge1_fmeasure": 0.19218962833273628, "rouge1_fmeasure_stderr": 0.0024770217525976845, "rouge1_precision": 0.13666091199269473, "rouge1_precision_stderr": 0.0018515926312016866, "rouge1_recall": 0.33730135278754275, "rouge1_recall_stderr": 0.00425282063617627, "rouge2_fmeasure": 0.04236560512098692, "rouge2_fmeasure_stderr": 0.0014536675075888475, "rouge2_precision": 0.029820018955821724, "rouge2_precision_stderr": 0.0010279167305570138, "rouge2_recall": 0.07638292429540472, "rouge2_recall_stderr": 0.0026839104282915466, "rougeL_fmeasure": 0.1480568822982458, "rougeL_fmeasure_stderr": 0.001849879468332256, "rougeL_precision": 0.10503833134735464, "rougeL_precision_stderr": 0.0013653085071660164, "rougeL_recall": 0.261752589120252, "rougeL_recall_stderr": 0.003367463350710988, "rougeLsum_fmeasure": 0.1527273124518781, "rougeLsum_fmeasure_stderr": 0.0020547393393318906, "rougeLsum_precision": 0.10828638412884824, "rougeLsum_precision_stderr": 0.0015066731606435022, "rougeLsum_recall": 0.2701647127318581, "rougeLsum_recall_stderr": 0.003716055569742923}}, "3": {"article_DOC_summary": {"bleu": 1.686714514186135, "bleu_stderr": 0.05743658327131587, "rouge1_fmeasure": 0.1867578685619972, "rouge1_fmeasure_stderr": 0.0027076829707336327, "rouge1_precision": 0.13506962844381415, "rouge1_precision_stderr": 0.002067491453716875, "rouge1_recall": 0.3234728739069624, "rouge1_recall_stderr": 0.004727722874229317, "rouge2_fmeasure": 0.04083587951791368, "rouge2_fmeasure_stderr": 0.0014903605347383073, "rouge2_precision": 0.028931791862333082, "rouge2_precision_stderr": 0.0010588495462508073, "rouge2_recall": 0.07274733136509248, "rouge2_recall_stderr": 0.0027548263183176493, "rougeL_fmeasure": 0.14341449152788618, "rougeL_fmeasure_stderr": 0.0020607906605954966, "rougeL_precision": 0.10368955828025853, "rougeL_precision_stderr": 0.0015784859184918452, "rougeL_recall": 0.24943189152228584, "rougeL_recall_stderr": 0.0037105916710632896, "rougeLsum_fmeasure": 0.1474850025435713, "rougeLsum_fmeasure_stderr": 0.0022564487363751556, "rougeLsum_precision": 0.10646045098053993, "rougeLsum_precision_stderr": 0.0017003493427749877, "rougeLsum_recall": 0.2568065885506006, "rougeLsum_recall_stderr": 0.004046522118014673}}, "4": {"article_DOC_summary": {"bleu": 0.8129834334474599, "bleu_stderr": 0.15535127400795215, "rouge1_fmeasure": 0.05229837627979885, "rouge1_fmeasure_stderr": 0.0028280084619077004, "rouge1_precision": 0.043902442296154094, "rouge1_precision_stderr": 0.0025879990015873823, "rouge1_recall": 0.08288624748163502, "rouge1_recall_stderr": 0.004634354203292547, "rouge2_fmeasure": 0.011004483045456277, "rouge2_fmeasure_stderr": 0.0009603677039917673, "rouge2_precision": 0.009017448289244204, "rouge2_precision_stderr": 0.000928647656293355, "rouge2_recall": 0.018080481336643847, "rouge2_recall_stderr": 0.0016363270970734646, "rougeL_fmeasure": 0.04070040451617707, "rougeL_fmeasure_stderr": 0.002174967302991672, "rougeL_precision": 0.03488188104345912, "rougeL_precision_stderr": 0.002148001214796994, "rougeL_recall": 0.06435420549635136, "rougeL_recall_stderr": 0.003580029698198211, "rougeLsum_fmeasure": 0.04267039522633286, "rougeLsum_fmeasure_stderr": 0.0023258157432622987, "rougeLsum_precision": 0.036478926891635684, "rougeLsum_precision_stderr": 0.0022479298210774765, "rougeLsum_recall": 0.06756132264586166, "rougeLsum_recall_stderr": 0.0038580697422047013}}, "5": {"article_DOC_summary": {"bleu": 4.26014647829622e-17, "bleu_stderr": 7.166752191638402e-14, "rouge1_fmeasure": 0.002779538343382174, "rouge1_fmeasure_stderr": 0.0007508746959336377, "rouge1_precision": 0.0024186506858969573, "rouge1_precision_stderr": 0.0006967044143553388, "rouge1_recall": 0.0041229184244589556, "rouge1_recall_stderr": 0.0011500680823563949, "rouge2_fmeasure": 0.00046142390179033124, "rouge2_fmeasure_stderr": 0.00017271322964265627, "rouge2_precision": 0.00039931331341648866, "rouge2_precision_stderr": 0.00015818778079209174, "rouge2_recall": 0.0006722407263472746, "rouge2_recall_stderr": 0.00025410945724169213, "rougeL_fmeasure": 0.0019880056310394296, "rougeL_fmeasure_stderr": 0.0005217836114821875, "rougeL_precision": 0.0017122099344230371, "rougeL_precision_stderr": 0.00047036977131503896, "rougeL_recall": 0.002921391378644953, "rougeL_recall_stderr": 0.0007787714133802412, "rougeLsum_fmeasure": 0.002161633417710008, "rougeLsum_fmeasure_stderr": 0.0005959969166706748, "rougeLsum_precision": 0.001894605060970088, "rougeLsum_precision_stderr": 0.0005535101918445765, "rougeLsum_recall": 0.003119076993452435, "rougeLsum_recall_stderr": 0.0008855064599210934}}}}
perplexity50/evaluation/rankeval/perplexity50_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.336,0.014944140233795023,0
3
+ anli_r2,acc,0.331,0.014888272588203938,0
4
+ anli_r3,acc,0.3425,0.013704669762934727,0
5
+ arc_challenge,acc,0.257679180887372,0.012780770562768403,0
6
+ arc_challenge,acc_norm,0.28498293515358364,0.013191348179838793,0
7
+ arc_easy,acc,0.5883838383838383,0.01009821864671491,0
8
+ arc_easy,acc_norm,0.5193602693602694,0.010252089491165522,0
9
+ boolq,acc,0.5923547400611621,0.008594580270731615,1
10
+ cb,acc,0.44642857142857145,0.06703189227942398,1
11
+ cb,f1,0.2956393200295639,,1
12
+ copa,acc,0.78,0.04163331998932261,0
13
+ hellaswag,acc,0.44901414060944034,0.004963771168672082,0
14
+ hellaswag,acc_norm,0.5796654052977495,0.004926038197714521,0
15
+ piqa,acc,0.7274211099020674,0.01038925680329602,0
16
+ piqa,acc_norm,0.7372143634385201,0.010269354068140777,0
17
+ rte,acc,0.516245487364621,0.030080573208738064,0
18
+ sciq,acc,0.816,0.012259457340938584,0
19
+ sciq,acc_norm,0.734,0.013979965645145153,0
20
+ storycloze_2016,acc,0.7071084981293426,0.010523873293246305,0
21
+ winogrande,acc,0.569060773480663,0.013917796623335962,0
perplexity50/evaluation/rankeval/perplexity50_0_lm-eval_global_step52452_2023-05-13-01-15-21_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.336,
5
- "acc_stderr": 0.014944140233795023
6
- },
7
- "anli_r2": {
8
- "acc": 0.331,
9
- "acc_stderr": 0.014888272588203938
10
- },
11
- "anli_r3": {
12
- "acc": 0.3425,
13
- "acc_stderr": 0.013704669762934727
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.2956393200295639
19
- },
20
- "copa": {
21
- "acc": 0.78,
22
- "acc_stderr": 0.04163331998932261
23
- },
24
- "hellaswag": {
25
- "acc": 0.44901414060944034,
26
- "acc_stderr": 0.004963771168672082,
27
- "acc_norm": 0.5796654052977495,
28
- "acc_norm_stderr": 0.004926038197714521
29
- },
30
- "rte": {
31
- "acc": 0.516245487364621,
32
- "acc_stderr": 0.030080573208738064
33
- },
34
- "winogrande": {
35
- "acc": 0.569060773480663,
36
- "acc_stderr": 0.013917796623335962
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7071084981293426,
40
- "acc_stderr": 0.010523873293246305
41
- },
42
- "boolq": {
43
- "acc": 0.5923547400611621,
44
- "acc_stderr": 0.008594580270731615
45
- },
46
- "arc_easy": {
47
- "acc": 0.5883838383838383,
48
- "acc_stderr": 0.01009821864671491,
49
- "acc_norm": 0.5193602693602694,
50
- "acc_norm_stderr": 0.010252089491165522
51
- },
52
- "arc_challenge": {
53
- "acc": 0.257679180887372,
54
- "acc_stderr": 0.012780770562768403,
55
- "acc_norm": 0.28498293515358364,
56
- "acc_norm_stderr": 0.013191348179838793
57
- },
58
- "sciq": {
59
- "acc": 0.816,
60
- "acc_stderr": 0.012259457340938584,
61
- "acc_norm": 0.734,
62
- "acc_norm_stderr": 0.013979965645145153
63
- },
64
- "piqa": {
65
- "acc": 0.7274211099020674,
66
- "acc_stderr": 0.01038925680329602,
67
- "acc_norm": 0.7372143634385201,
68
- "acc_norm_stderr": 0.010269354068140777
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/evaluation/rankeval/perplexity50_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.34,0.014987482264363937,0
3
+ anli_r2,acc,0.327,0.014842213153411242,0
4
+ anli_r3,acc,0.33416666666666667,0.013622434813136783,0
5
+ arc_challenge,acc,0.27303754266211605,0.013019332762635746,0
6
+ arc_challenge,acc_norm,0.2841296928327645,0.013179442447653886,0
7
+ arc_easy,acc,0.5951178451178452,0.010072423960395703,0
8
+ arc_easy,acc_norm,0.561026936026936,0.01018307601297206,0
9
+ boolq,acc,0.5813455657492355,0.008628545022868549,1
10
+ cb,acc,0.4642857142857143,0.06724777654937658,1
11
+ cb,f1,0.32523809523809527,,1
12
+ copa,acc,0.74,0.04408440022768079,0
13
+ hellaswag,acc,0.4448317068313085,0.00495931519801116,0
14
+ hellaswag,acc_norm,0.578370842461661,0.004928105880776072,0
15
+ piqa,acc,0.7285092491838956,0.010376251176596137,0
16
+ piqa,acc_norm,0.735582154515778,0.01028978724476716,0
17
+ rte,acc,0.5451263537906137,0.029973636495415255,0
18
+ sciq,acc,0.862,0.0109121526325044,0
19
+ sciq,acc_norm,0.828,0.011939788882495321,0
20
+ storycloze_2016,acc,0.7033671833244255,0.01056281918156322,0
21
+ winogrande,acc,0.5588003157063931,0.013954975072834731,0
perplexity50/evaluation/rankeval/perplexity50_1_lm-eval_global_step52452_2023-05-13-01-15-21_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.34,
5
- "acc_stderr": 0.014987482264363937
6
- },
7
- "anli_r2": {
8
- "acc": 0.327,
9
- "acc_stderr": 0.014842213153411242
10
- },
11
- "anli_r3": {
12
- "acc": 0.33416666666666667,
13
- "acc_stderr": 0.013622434813136783
14
- },
15
- "cb": {
16
- "acc": 0.4642857142857143,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.32523809523809527
19
- },
20
- "copa": {
21
- "acc": 0.74,
22
- "acc_stderr": 0.04408440022768079
23
- },
24
- "hellaswag": {
25
- "acc": 0.4448317068313085,
26
- "acc_stderr": 0.00495931519801116,
27
- "acc_norm": 0.578370842461661,
28
- "acc_norm_stderr": 0.004928105880776072
29
- },
30
- "rte": {
31
- "acc": 0.5451263537906137,
32
- "acc_stderr": 0.029973636495415255
33
- },
34
- "winogrande": {
35
- "acc": 0.5588003157063931,
36
- "acc_stderr": 0.013954975072834731
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7033671833244255,
40
- "acc_stderr": 0.01056281918156322
41
- },
42
- "boolq": {
43
- "acc": 0.5813455657492355,
44
- "acc_stderr": 0.008628545022868549
45
- },
46
- "arc_easy": {
47
- "acc": 0.5951178451178452,
48
- "acc_stderr": 0.010072423960395703,
49
- "acc_norm": 0.561026936026936,
50
- "acc_norm_stderr": 0.01018307601297206
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27303754266211605,
54
- "acc_stderr": 0.013019332762635746,
55
- "acc_norm": 0.2841296928327645,
56
- "acc_norm_stderr": 0.013179442447653886
57
- },
58
- "sciq": {
59
- "acc": 0.862,
60
- "acc_stderr": 0.0109121526325044,
61
- "acc_norm": 0.828,
62
- "acc_norm_stderr": 0.011939788882495321
63
- },
64
- "piqa": {
65
- "acc": 0.7285092491838956,
66
- "acc_stderr": 0.010376251176596137,
67
- "acc_norm": 0.735582154515778,
68
- "acc_norm_stderr": 0.01028978724476716
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/evaluation/rankeval/perplexity50_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.343,0.015019206922356951,0
3
+ anli_r2,acc,0.321,0.014770821817934649,0
4
+ anli_r3,acc,0.3375,0.013655897185463658,0
5
+ arc_challenge,acc,0.2738907849829352,0.013032004972989501,0
6
+ arc_challenge,acc_norm,0.302901023890785,0.013428241573185349,0
7
+ arc_easy,acc,0.6153198653198653,0.009983171707009011,0
8
+ arc_easy,acc_norm,0.5888047138047138,0.010096663811817681,0
9
+ boolq,acc,0.5840978593272171,0.00862046960400103,1
10
+ cb,acc,0.42857142857142855,0.06672848092813058,1
11
+ cb,f1,0.31094339622641504,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.44612626966739694,0.004960732382255232,0
14
+ hellaswag,acc_norm,0.5825532762397929,0.004921300331285554,0
15
+ piqa,acc,0.7301414581066377,0.010356595421852199,0
16
+ piqa,acc_norm,0.7268770402611534,0.010395730264453262,0
17
+ rte,acc,0.5415162454873647,0.029992535385373314,0
18
+ sciq,acc,0.885,0.010093407594904617,0
19
+ sciq,acc_norm,0.843,0.011510146979230189,0
20
+ storycloze_2016,acc,0.7022982362373063,0.010573790208173062,0
21
+ winogrande,acc,0.5753749013417522,0.013891893150264227,0
perplexity50/evaluation/rankeval/perplexity50_2_lm-eval_global_step52452_2023-05-13-01-15-21_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.343,
5
- "acc_stderr": 0.015019206922356951
6
- },
7
- "anli_r2": {
8
- "acc": 0.321,
9
- "acc_stderr": 0.014770821817934649
10
- },
11
- "anli_r3": {
12
- "acc": 0.3375,
13
- "acc_stderr": 0.013655897185463658
14
- },
15
- "cb": {
16
- "acc": 0.42857142857142855,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.31094339622641504
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.44612626966739694,
26
- "acc_stderr": 0.004960732382255232,
27
- "acc_norm": 0.5825532762397929,
28
- "acc_norm_stderr": 0.004921300331285554
29
- },
30
- "rte": {
31
- "acc": 0.5415162454873647,
32
- "acc_stderr": 0.029992535385373314
33
- },
34
- "winogrande": {
35
- "acc": 0.5753749013417522,
36
- "acc_stderr": 0.013891893150264227
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7022982362373063,
40
- "acc_stderr": 0.010573790208173062
41
- },
42
- "boolq": {
43
- "acc": 0.5840978593272171,
44
- "acc_stderr": 0.00862046960400103
45
- },
46
- "arc_easy": {
47
- "acc": 0.6153198653198653,
48
- "acc_stderr": 0.009983171707009011,
49
- "acc_norm": 0.5888047138047138,
50
- "acc_norm_stderr": 0.010096663811817681
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2738907849829352,
54
- "acc_stderr": 0.013032004972989501,
55
- "acc_norm": 0.302901023890785,
56
- "acc_norm_stderr": 0.013428241573185349
57
- },
58
- "sciq": {
59
- "acc": 0.885,
60
- "acc_stderr": 0.010093407594904617,
61
- "acc_norm": 0.843,
62
- "acc_norm_stderr": 0.011510146979230189
63
- },
64
- "piqa": {
65
- "acc": 0.7301414581066377,
66
- "acc_stderr": 0.010356595421852199,
67
- "acc_norm": 0.7268770402611534,
68
- "acc_norm_stderr": 0.010395730264453262
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/evaluation/rankeval/perplexity50_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.33,0.014876872027456732,0
3
+ anli_r2,acc,0.34,0.014987482264363937,0
4
+ anli_r3,acc,0.35,0.013774667009018554,0
5
+ arc_challenge,acc,0.28071672354948807,0.01313123812697558,0
6
+ arc_challenge,acc_norm,0.30204778156996587,0.013417519144716417,0
7
+ arc_easy,acc,0.6165824915824916,0.009976995068264717,0
8
+ arc_easy,acc_norm,0.5917508417508418,0.010085566195791248,0
9
+ boolq,acc,0.5871559633027523,0.00861117243047287,1
10
+ cb,acc,0.375,0.06527912098338669,1
11
+ cb,f1,0.33730158730158727,,1
12
+ copa,acc,0.78,0.04163331998932261,0
13
+ hellaswag,acc,0.444035052778331,0.004958426152481896,0
14
+ hellaswag,acc_norm,0.58105954989046,0.004923772581848489,0
15
+ piqa,acc,0.73449401523395,0.010303308653024427,0
16
+ piqa,acc_norm,0.7372143634385201,0.010269354068140777,0
17
+ rte,acc,0.5342960288808665,0.030025579819366422,0
18
+ sciq,acc,0.883,0.010169287802713329,0
19
+ sciq,acc_norm,0.861,0.010945263761042967,0
20
+ storycloze_2016,acc,0.7103153393907001,0.01048980809194661,0
21
+ winogrande,acc,0.5864246250986582,0.013840971763195303,0
perplexity50/evaluation/rankeval/perplexity50_3_lm-eval_global_step52452_2023-05-13-01-15-21_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.33,
5
- "acc_stderr": 0.014876872027456732
6
- },
7
- "anli_r2": {
8
- "acc": 0.34,
9
- "acc_stderr": 0.014987482264363937
10
- },
11
- "anli_r3": {
12
- "acc": 0.35,
13
- "acc_stderr": 0.013774667009018554
14
- },
15
- "cb": {
16
- "acc": 0.375,
17
- "acc_stderr": 0.06527912098338669,
18
- "f1": 0.33730158730158727
19
- },
20
- "copa": {
21
- "acc": 0.78,
22
- "acc_stderr": 0.04163331998932261
23
- },
24
- "hellaswag": {
25
- "acc": 0.444035052778331,
26
- "acc_stderr": 0.004958426152481896,
27
- "acc_norm": 0.58105954989046,
28
- "acc_norm_stderr": 0.004923772581848489
29
- },
30
- "rte": {
31
- "acc": 0.5342960288808665,
32
- "acc_stderr": 0.030025579819366422
33
- },
34
- "winogrande": {
35
- "acc": 0.5864246250986582,
36
- "acc_stderr": 0.013840971763195303
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7103153393907001,
40
- "acc_stderr": 0.01048980809194661
41
- },
42
- "boolq": {
43
- "acc": 0.5871559633027523,
44
- "acc_stderr": 0.00861117243047287
45
- },
46
- "arc_easy": {
47
- "acc": 0.6165824915824916,
48
- "acc_stderr": 0.009976995068264717,
49
- "acc_norm": 0.5917508417508418,
50
- "acc_norm_stderr": 0.010085566195791248
51
- },
52
- "arc_challenge": {
53
- "acc": 0.28071672354948807,
54
- "acc_stderr": 0.01313123812697558,
55
- "acc_norm": 0.30204778156996587,
56
- "acc_norm_stderr": 0.013417519144716417
57
- },
58
- "sciq": {
59
- "acc": 0.883,
60
- "acc_stderr": 0.010169287802713329,
61
- "acc_norm": 0.861,
62
- "acc_norm_stderr": 0.010945263761042967
63
- },
64
- "piqa": {
65
- "acc": 0.73449401523395,
66
- "acc_stderr": 0.010303308653024427,
67
- "acc_norm": 0.7372143634385201,
68
- "acc_norm_stderr": 0.010269354068140777
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/evaluation/rankeval/perplexity50_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.335,0.014933117490932575,0
3
+ anli_r2,acc,0.327,0.014842213153411242,0
4
+ anli_r3,acc,0.3458333333333333,0.013736245342311012,0
5
+ arc_challenge,acc,0.2687713310580205,0.012955065963710691,0
6
+ arc_challenge,acc_norm,0.30204778156996587,0.013417519144716417,0
7
+ arc_easy,acc,0.6153198653198653,0.009983171707009008,0
8
+ arc_easy,acc_norm,0.5980639730639731,0.010060521220920566,0
9
+ boolq,acc,0.5908256880733945,0.008599563442397349,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.28503144654088053,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.4422425811591316,0.004956378590571537,0
14
+ hellaswag,acc_norm,0.5832503485361482,0.004920130733271772,0
15
+ piqa,acc,0.7230685527747551,0.010440499969334535,0
16
+ piqa,acc_norm,0.733949945593036,0.010310039263352826,0
17
+ rte,acc,0.5415162454873647,0.029992535385373314,0
18
+ sciq,acc,0.889,0.009938701010583726,0
19
+ sciq,acc_norm,0.862,0.010912152632504394,0
20
+ storycloze_2016,acc,0.7220737573490112,0.010359403651225854,0
21
+ winogrande,acc,0.5659037095501184,0.01392988255569405,0
perplexity50/evaluation/rankeval/perplexity50_4_lm-eval_global_step52452_2023-05-13-01-15-21_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.335,
5
- "acc_stderr": 0.014933117490932575
6
- },
7
- "anli_r2": {
8
- "acc": 0.327,
9
- "acc_stderr": 0.014842213153411242
10
- },
11
- "anli_r3": {
12
- "acc": 0.3458333333333333,
13
- "acc_stderr": 0.013736245342311012
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.28503144654088053
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4422425811591316,
26
- "acc_stderr": 0.004956378590571537,
27
- "acc_norm": 0.5832503485361482,
28
- "acc_norm_stderr": 0.004920130733271772
29
- },
30
- "rte": {
31
- "acc": 0.5415162454873647,
32
- "acc_stderr": 0.029992535385373314
33
- },
34
- "winogrande": {
35
- "acc": 0.5659037095501184,
36
- "acc_stderr": 0.01392988255569405
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7220737573490112,
40
- "acc_stderr": 0.010359403651225854
41
- },
42
- "boolq": {
43
- "acc": 0.5908256880733945,
44
- "acc_stderr": 0.008599563442397349
45
- },
46
- "arc_easy": {
47
- "acc": 0.6153198653198653,
48
- "acc_stderr": 0.009983171707009008,
49
- "acc_norm": 0.5980639730639731,
50
- "acc_norm_stderr": 0.010060521220920566
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2687713310580205,
54
- "acc_stderr": 0.012955065963710691,
55
- "acc_norm": 0.30204778156996587,
56
- "acc_norm_stderr": 0.013417519144716417
57
- },
58
- "sciq": {
59
- "acc": 0.889,
60
- "acc_stderr": 0.009938701010583726,
61
- "acc_norm": 0.862,
62
- "acc_norm_stderr": 0.010912152632504394
63
- },
64
- "piqa": {
65
- "acc": 0.7230685527747551,
66
- "acc_stderr": 0.010440499969334535,
67
- "acc_norm": 0.733949945593036,
68
- "acc_norm_stderr": 0.010310039263352826
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/evaluation/rankeval/perplexity50_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.352,0.015110404505648668,0
3
+ anli_r2,acc,0.327,0.014842213153411247,0
4
+ anli_r3,acc,0.3458333333333333,0.013736245342311012,0
5
+ arc_challenge,acc,0.27474402730375425,0.013044617212771227,0
6
+ arc_challenge,acc_norm,0.30887372013651876,0.013501770929344003,0
7
+ arc_easy,acc,0.6153198653198653,0.00998317170700901,0
8
+ arc_easy,acc_norm,0.5989057239057239,0.010057051106534385,0
9
+ boolq,acc,0.5960244648318043,0.008582268854021401,1
10
+ cb,acc,0.44642857142857145,0.06703189227942397,1
11
+ cb,f1,0.3134878193701723,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.4457279426409082,0.004960299952519412,0
14
+ hellaswag,acc_norm,0.5867357100179247,0.00491413085543178,0
15
+ piqa,acc,0.7301414581066377,0.0103565954218522,0
16
+ piqa,acc_norm,0.7312295973884657,0.01034339294009,0
17
+ rte,acc,0.5415162454873647,0.029992535385373314,0
18
+ sciq,acc,0.891,0.009859828407037191,0
19
+ sciq,acc_norm,0.863,0.010878848714333316,0
20
+ storycloze_2016,acc,0.7071084981293426,0.010523873293246309,0
21
+ winogrande,acc,0.5730071033938438,0.013901878072575055,0
perplexity50/evaluation/rankeval/perplexity50_5_lm-eval_global_step52452_2023-05-13-01-15-21_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.352,
5
- "acc_stderr": 0.015110404505648668
6
- },
7
- "anli_r2": {
8
- "acc": 0.327,
9
- "acc_stderr": 0.014842213153411247
10
- },
11
- "anli_r3": {
12
- "acc": 0.3458333333333333,
13
- "acc_stderr": 0.013736245342311012
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942397,
18
- "f1": 0.3134878193701723
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4457279426409082,
26
- "acc_stderr": 0.004960299952519412,
27
- "acc_norm": 0.5867357100179247,
28
- "acc_norm_stderr": 0.00491413085543178
29
- },
30
- "rte": {
31
- "acc": 0.5415162454873647,
32
- "acc_stderr": 0.029992535385373314
33
- },
34
- "winogrande": {
35
- "acc": 0.5730071033938438,
36
- "acc_stderr": 0.013901878072575055
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7071084981293426,
40
- "acc_stderr": 0.010523873293246309
41
- },
42
- "boolq": {
43
- "acc": 0.5960244648318043,
44
- "acc_stderr": 0.008582268854021401
45
- },
46
- "arc_easy": {
47
- "acc": 0.6153198653198653,
48
- "acc_stderr": 0.00998317170700901,
49
- "acc_norm": 0.5989057239057239,
50
- "acc_norm_stderr": 0.010057051106534385
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27474402730375425,
54
- "acc_stderr": 0.013044617212771227,
55
- "acc_norm": 0.30887372013651876,
56
- "acc_norm_stderr": 0.013501770929344003
57
- },
58
- "sciq": {
59
- "acc": 0.891,
60
- "acc_stderr": 0.009859828407037191,
61
- "acc_norm": 0.863,
62
- "acc_norm_stderr": 0.010878848714333316
63
- },
64
- "piqa": {
65
- "acc": 0.7301414581066377,
66
- "acc_stderr": 0.0103565954218522,
67
- "acc_norm": 0.7312295973884657,
68
- "acc_norm_stderr": 0.01034339294009
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }