Muennighoff commited on
Commit
35f9bea
1 Parent(s): d6d2c1d
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. 4b284b84b10c4py/evaluation/generation/merged.csv +53 -0
  2. 4b284b84b10c4py/evaluation/generation/merged.json +1 -0
  3. 4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_0.csv +21 -0
  4. 4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_1.csv +21 -0
  5. 4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_2.csv +21 -0
  6. 4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_3.csv +21 -0
  7. 4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_4.csv +21 -0
  8. 4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_5.csv +21 -0
  9. 4b284b84b20c4py/evaluation/generation/merged.csv +53 -0
  10. 4b284b84b20c4py/evaluation/generation/merged.json +1 -0
  11. 4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_0.csv +21 -0
  12. 4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_1.csv +21 -0
  13. 4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_2.csv +21 -0
  14. 4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_3.csv +21 -0
  15. 4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_4.csv +21 -0
  16. 4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_5.csv +21 -0
  17. 4b284b84b30c4py/evaluation/generation/merged.csv +53 -0
  18. 4b284b84b30c4py/evaluation/generation/merged.json +1 -0
  19. 4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_0.csv +21 -0
  20. 4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_1.csv +21 -0
  21. 4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_2.csv +21 -0
  22. 4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_3.csv +21 -0
  23. 4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_4.csv +21 -0
  24. 4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_5.csv +21 -0
  25. 4b284b84b40c4py/evaluation/generation/merged.csv +53 -0
  26. 4b284b84b40c4py/evaluation/generation/merged.json +1 -0
  27. 4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_0.csv +21 -0
  28. 4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_1.csv +21 -0
  29. 4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_2.csv +21 -0
  30. 4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_3.csv +21 -0
  31. 4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_4.csv +21 -0
  32. 4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_5.csv +21 -0
  33. 4b284b84b60c4py/evaluation/generation/merged.csv +53 -0
  34. 4b284b84b60c4py/evaluation/generation/merged.json +1 -0
  35. 4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_0.csv +21 -0
  36. 4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_1.csv +21 -0
  37. 4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_2.csv +21 -0
  38. 4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_3.csv +21 -0
  39. 4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_4.csv +21 -0
  40. 4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_5.csv +21 -0
  41. 4b284b84b70c4py/evaluation/generation/merged.csv +53 -0
  42. 4b284b84b70c4py/evaluation/generation/merged.json +1 -0
  43. 4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_0.csv +21 -0
  44. 4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_1.csv +21 -0
  45. 4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_2.csv +21 -0
  46. 4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_3.csv +21 -0
  47. 4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_4.csv +21 -0
  48. 4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_5.csv +21 -0
  49. 4b284b84b80c4py/evaluation/generation/merged.csv +53 -0
  50. 4b284b84b80c4py/evaluation/generation/merged.json +1 -0
4b284b84b10c4py/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.11599617848462068
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.11599617848462068
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.25244576500518445
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.25244576500518445
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.28087700716585834
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.28087700716585834
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.29137647462406413
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.29137647462406413
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.29718267067971493
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.29718267067971493
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2990249691536151
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2990249691536151
14
+ e2e_nlg_cleaned,5,average,multiple,0.25615051085217627
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03939889642394197
16
+ gem_xsum,0,median,rouge2_fmeasure,0.03939889642394197
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03225325166687549
18
+ gem_xsum,1,median,rouge2_fmeasure,0.03225325166687549
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03868362872963243
20
+ gem_xsum,2,median,rouge2_fmeasure,0.03868362872963243
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.035779638469116914
22
+ gem_xsum,3,median,rouge2_fmeasure,0.035779638469116914
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010566772575448751
24
+ gem_xsum,4,median,rouge2_fmeasure,0.010566772575448751
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0004013828039517138
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0004013828039517138
27
+ gem_xsum,5,average,multiple,0.026180595111494544
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05343668682534253
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05343668682534253
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08995358919930085
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.08995358919930085
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.10058078270140805
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.10058078270140805
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.10373196315248306
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.10373196315248306
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.10839195425794168
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.10839195425794168
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.11341187327748274
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.11341187327748274
40
+ web_nlg_en,5,average,multiple,0.09491780823565982
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.048105932388098856
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.048105932388098856
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04414011089261056
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.04414011089261056
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.056871066430790325
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.056871066430790325
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04854258938628169
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04854258938628169
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.016216361695071208
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.016216361695071208
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0027628200691328687
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0027628200691328687
53
+ wiki_lingua_en,5,average,multiple,0.03610648014366425
4b284b84b10c4py/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4813014742136315, "bleu_stderr": 0.029298284790512823, "rouge1_fmeasure": 0.12355058601433035, "rouge1_fmeasure_stderr": 0.001894994722809818, "rouge1_precision": 0.07814637483776633, "rouge1_precision_stderr": 0.0013388565211954557, "rouge1_recall": 0.39367165732976755, "rouge1_recall_stderr": 0.0052609219673642686, "rouge2_fmeasure": 0.05343668682534253, "rouge2_fmeasure_stderr": 0.0011505936382800977, "rouge2_precision": 0.03357663912471221, "rouge2_precision_stderr": 0.0007824220781305639, "rouge2_recall": 0.18024451705008498, "rouge2_recall_stderr": 0.0037047453236374803, "rougeL_fmeasure": 0.11573328163228513, "rougeL_fmeasure_stderr": 0.0017001940430562083, "rougeL_precision": 0.07298923635178714, "rougeL_precision_stderr": 0.0011896452042006365, "rougeL_recall": 0.3727466061438325, "rougeL_recall_stderr": 0.005005690787063604, "rougeLsum_fmeasure": 0.11322320217099578, "rougeLsum_fmeasure_stderr": 0.0017394583047356308, "rougeLsum_precision": 0.07168838309172086, "rougeLsum_precision_stderr": 0.0012342784180538735, "rougeLsum_recall": 0.3612267886931813, "rougeLsum_recall_stderr": 0.004790388470075587}}, "1": {"PALM_prompt": {"bleu": 0.7255238696408423, "bleu_stderr": 0.053879561708253726, "rouge1_fmeasure": 0.17553593173895937, "rouge1_fmeasure_stderr": 0.004096195356643218, "rouge1_precision": 0.1625223257514928, "rouge1_precision_stderr": 0.005111682257014667, "rouge1_recall": 0.32599686697536673, "rouge1_recall_stderr": 0.005056163504321368, "rouge2_fmeasure": 0.08995358919930085, "rouge2_fmeasure_stderr": 0.00276677511793771, "rouge2_precision": 0.08437320166589189, "rouge2_precision_stderr": 0.003472257493578088, "rouge2_recall": 0.17009103545543425, "rouge2_recall_stderr": 0.0036595655247239614, "rougeL_fmeasure": 0.1575106385009677, "rougeL_fmeasure_stderr": 0.0035058149961314554, "rougeL_precision": 0.1443002602162733, "rougeL_precision_stderr": 0.004479274553715264, "rougeL_recall": 0.3023393609518485, "rougeL_recall_stderr": 0.004598931367210058, "rougeLsum_fmeasure": 0.1613119682169033, "rougeLsum_fmeasure_stderr": 0.0036048812212870155, "rougeLsum_precision": 0.14870447627323272, "rougeLsum_precision_stderr": 0.00462570655861519, "rougeLsum_recall": 0.3067494379024135, "rougeLsum_recall_stderr": 0.004650958784315044}}, "2": {"PALM_prompt": {"bleu": 0.8687128586366801, "bleu_stderr": 0.04673010602404015, "rouge1_fmeasure": 0.19129452100187833, "rouge1_fmeasure_stderr": 0.004216273048303227, "rouge1_precision": 0.1701554452582214, "rouge1_precision_stderr": 0.005107771330279079, "rouge1_recall": 0.355832019011606, "rouge1_recall_stderr": 0.004884775439773983, "rouge2_fmeasure": 0.10058078270140805, "rouge2_fmeasure_stderr": 0.0029257899877312156, "rouge2_precision": 0.09088743631977127, "rouge2_precision_stderr": 0.003444014418662031, "rouge2_recall": 0.18939437656895264, "rouge2_recall_stderr": 0.0037355969446100823, "rougeL_fmeasure": 0.1700727632598896, "rougeL_fmeasure_stderr": 0.003579698843822934, "rougeL_precision": 0.14879189467464685, "rougeL_precision_stderr": 0.0043534563737425485, "rougeL_recall": 0.32802790773843415, "rougeL_recall_stderr": 0.004479264764219749, "rougeLsum_fmeasure": 0.17593789645790273, "rougeLsum_fmeasure_stderr": 0.0037452392698808686, "rougeLsum_precision": 0.1549450062748448, "rougeLsum_precision_stderr": 0.004557766264854074, "rougeLsum_recall": 0.334886885021682, "rougeLsum_recall_stderr": 0.004545614262741492}}, "3": {"PALM_prompt": {"bleu": 0.976110601716665, "bleu_stderr": 0.05147232952331574, "rouge1_fmeasure": 0.1955261278048012, "rouge1_fmeasure_stderr": 0.0042687987276028025, "rouge1_precision": 0.17650374199610544, "rouge1_precision_stderr": 0.005278588530178822, "rouge1_recall": 0.3620490484768146, "rouge1_recall_stderr": 0.004763724984545098, "rouge2_fmeasure": 0.10373196315248306, "rouge2_fmeasure_stderr": 0.002909773730879196, "rouge2_precision": 0.09649007165425032, "rouge2_precision_stderr": 0.003579903171459426, "rouge2_recall": 0.1932492774677527, "rouge2_recall_stderr": 0.0036237451702642704, "rougeL_fmeasure": 0.1729686164923851, "rougeL_fmeasure_stderr": 0.003539784845210468, "rougeL_precision": 0.15353988316234024, "rougeL_precision_stderr": 0.004424257414579797, "rougeL_recall": 0.3329632854608314, "rougeL_recall_stderr": 0.004284658072005821, "rougeLsum_fmeasure": 0.17904838371148374, "rougeLsum_fmeasure_stderr": 0.003744462902264531, "rougeLsum_precision": 0.1606790115172626, "rougeLsum_precision_stderr": 0.004723404323298352, "rougeLsum_recall": 0.33915285374439175, "rougeLsum_recall_stderr": 0.004340341067923443}}, "4": {"PALM_prompt": {"bleu": 1.0350167319929164, "bleu_stderr": 0.08089511258341531, "rouge1_fmeasure": 0.20005984608564545, "rouge1_fmeasure_stderr": 0.0043621938032114985, "rouge1_precision": 0.18224379597295554, "rouge1_precision_stderr": 0.0054869315349140284, "rouge1_recall": 0.3728518882868007, "rouge1_recall_stderr": 0.004805363061413696, "rouge2_fmeasure": 0.10839195425794168, "rouge2_fmeasure_stderr": 0.0030187532546412294, "rouge2_precision": 0.10200985474477063, "rouge2_precision_stderr": 0.0037380638769571885, "rouge2_recall": 0.20291177783621683, "rouge2_recall_stderr": 0.003801448242387614, "rougeL_fmeasure": 0.17627371459053626, "rougeL_fmeasure_stderr": 0.0036366682681705747, "rougeL_precision": 0.15805723338022432, "rougeL_precision_stderr": 0.004637765426393908, "rougeL_recall": 0.341594924355393, "rougeL_recall_stderr": 0.004322861765243894, "rougeLsum_fmeasure": 0.18370609844319763, "rougeLsum_fmeasure_stderr": 0.003859835340459462, "rougeLsum_precision": 0.16598914017637473, "rougeLsum_precision_stderr": 0.004915957758353431, "rougeLsum_recall": 0.35013405773908396, "rougeLsum_recall_stderr": 0.0044261493721537235}}, "5": {"PALM_prompt": {"bleu": 1.099500805669065, "bleu_stderr": 0.05128274555498221, "rouge1_fmeasure": 0.21085477597688443, "rouge1_fmeasure_stderr": 0.004537288839362036, "rouge1_precision": 0.18895154801724418, "rouge1_precision_stderr": 0.005525818321828389, "rouge1_recall": 0.38248844776913693, "rouge1_recall_stderr": 0.004934647163878857, "rouge2_fmeasure": 0.11341187327748274, "rouge2_fmeasure_stderr": 0.0031174298961979667, "rouge2_precision": 0.10447278232108755, "rouge2_precision_stderr": 0.003746341946804797, "rouge2_recall": 0.2070479482405224, "rouge2_recall_stderr": 0.003822643707608457, "rougeL_fmeasure": 0.18499664526019835, "rougeL_fmeasure_stderr": 0.0037604793276689013, "rougeL_precision": 0.16301359107525903, "rougeL_precision_stderr": 0.004631366227638678, "rougeL_recall": 0.3486223306539461, "rougeL_recall_stderr": 0.0043860650726248295, "rougeLsum_fmeasure": 0.19265091926675612, "rougeLsum_fmeasure_stderr": 0.003995741079720777, "rougeLsum_precision": 0.1713978535792387, "rougeLsum_precision_stderr": 0.00494124825637956, "rougeLsum_recall": 0.3573153383883979, "rougeLsum_recall_stderr": 0.004488735837365747}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.439034811249811, "bleu_stderr": 0.08814349202270069, "rouge1_fmeasure": 0.20438865109293125, "rouge1_fmeasure_stderr": 0.0020542271239135474, "rouge1_precision": 0.1824848450703605, "rouge1_precision_stderr": 0.002258769733514356, "rouge1_recall": 0.2829504579856686, "rouge1_recall_stderr": 0.002892378725128921, "rouge2_fmeasure": 0.048105932388098856, "rouge2_fmeasure_stderr": 0.0010139576129839956, "rouge2_precision": 0.043109912585657335, "rouge2_precision_stderr": 0.0009711440225665886, "rouge2_recall": 0.06736443489887266, "rouge2_recall_stderr": 0.0015632414031077888, "rougeL_fmeasure": 0.15441829967250942, "rougeL_fmeasure_stderr": 0.0014514089251988502, "rougeL_precision": 0.13644080096663336, "rougeL_precision_stderr": 0.0015842215640080625, "rougeL_recall": 0.21853544620158255, "rougeL_recall_stderr": 0.0022785408725319403, "rougeLsum_fmeasure": 0.18999551527620429, "rougeLsum_fmeasure_stderr": 0.0019185335839288905, "rougeLsum_precision": 0.16955964056676814, "rougeLsum_precision_stderr": 0.002110604546495203, "rougeLsum_recall": 0.2633994155788588, "rougeLsum_recall_stderr": 0.0027106937173640546}}, "1": {"tldr_en": {"bleu": 2.592511832648172, "bleu_stderr": 0.0883522873390386, "rouge1_fmeasure": 0.1817945339591683, "rouge1_fmeasure_stderr": 0.0021379348675478974, "rouge1_precision": 0.27650295097853705, "rouge1_precision_stderr": 0.00433037610306865, "rouge1_recall": 0.20123333179446745, "rouge1_recall_stderr": 0.0029592287408768563, "rouge2_fmeasure": 0.04414011089261056, "rouge2_fmeasure_stderr": 0.0011864515717073022, "rouge2_precision": 0.07927146370780327, "rouge2_precision_stderr": 0.0028993886109516787, "rouge2_recall": 0.048713843049408144, "rouge2_recall_stderr": 0.0014696170848375373, "rougeL_fmeasure": 0.14036723611181662, "rougeL_fmeasure_stderr": 0.001621088879287515, "rougeL_precision": 0.22325969773232404, "rougeL_precision_stderr": 0.0038282019630928436, "rougeL_recall": 0.15381843600559833, "rougeL_recall_stderr": 0.0022441425803852682, "rougeLsum_fmeasure": 0.17107746115835645, "rougeLsum_fmeasure_stderr": 0.002004425675535598, "rougeLsum_precision": 0.2620080990727989, "rougeLsum_precision_stderr": 0.0041842438687289705, "rougeLsum_recall": 0.18922669116358481, "rougeLsum_recall_stderr": 0.0027816420778518647}}, "2": {"tldr_en": {"bleu": 3.1897316776859443, "bleu_stderr": 0.10383641450422736, "rouge1_fmeasure": 0.201455981397101, "rouge1_fmeasure_stderr": 0.002215864605685192, "rouge1_precision": 0.33293478487683886, "rouge1_precision_stderr": 0.004638481088709241, "rouge1_recall": 0.20832991559189593, "rouge1_recall_stderr": 0.002882384230492161, "rouge2_fmeasure": 0.056871066430790325, "rouge2_fmeasure_stderr": 0.001341215790213451, "rouge2_precision": 0.10981600440845445, "rouge2_precision_stderr": 0.0032662143067739334, "rouge2_recall": 0.0571749108797057, "rouge2_recall_stderr": 0.0014697451697197228, "rougeL_fmeasure": 0.1572901498332904, "rougeL_fmeasure_stderr": 0.001727626060232697, "rougeL_precision": 0.2702435488101451, "rougeL_precision_stderr": 0.004129606571484203, "rougeL_recall": 0.1613125746864521, "rougeL_recall_stderr": 0.0022067815922600644, "rougeLsum_fmeasure": 0.1904207426584212, "rougeLsum_fmeasure_stderr": 0.0020985538580578374, "rougeLsum_precision": 0.31764168504184737, "rougeLsum_precision_stderr": 0.004542434958936431, "rougeLsum_recall": 0.19654070699277223, "rougeLsum_recall_stderr": 0.0027147097304651784}}, "3": {"tldr_en": {"bleu": 2.1373295852103618, "bleu_stderr": 0.09657133458196947, "rouge1_fmeasure": 0.17011994598152969, "rouge1_fmeasure_stderr": 0.0024811944029056563, "rouge1_precision": 0.2993896407068062, "rouge1_precision_stderr": 0.004961826525869491, "rouge1_recall": 0.17036445945014525, "rouge1_recall_stderr": 0.0030340761083348573, "rouge2_fmeasure": 0.04854258938628169, "rouge2_fmeasure_stderr": 0.0013429745795374734, "rouge2_precision": 0.10010009650819313, "rouge2_precision_stderr": 0.0032857461552203595, "rouge2_recall": 0.04756100531684024, "rouge2_recall_stderr": 0.0014329928419246308, "rougeL_fmeasure": 0.13474954073077003, "rougeL_fmeasure_stderr": 0.001970359077922383, "rougeL_precision": 0.2477387853206388, "rougeL_precision_stderr": 0.004400377182402362, "rougeL_recall": 0.13297516874095594, "rougeL_recall_stderr": 0.0023567485440671196, "rougeLsum_fmeasure": 0.16095317824352906, "rougeLsum_fmeasure_stderr": 0.0023610481534614897, "rougeLsum_precision": 0.28541851433610155, "rougeLsum_precision_stderr": 0.004807244705293582, "rougeLsum_recall": 0.16068089265916596, "rougeLsum_recall_stderr": 0.0028651117671434395}}, "4": {"tldr_en": {"bleu": 0.03779045940266729, "bleu_stderr": 0.00499486813251369, "rouge1_fmeasure": 0.05785530003387149, "rouge1_fmeasure_stderr": 0.002104739102006248, "rouge1_precision": 0.10233602202490566, "rouge1_precision_stderr": 0.003901148966833399, "rouge1_recall": 0.05690170219061377, "rouge1_recall_stderr": 0.002261392037062523, "rouge2_fmeasure": 0.016216361695071208, "rouge2_fmeasure_stderr": 0.0009795250112890484, "rouge2_precision": 0.032881948765465734, "rouge2_precision_stderr": 0.00214275596785197, "rouge2_recall": 0.015513277487513246, "rouge2_recall_stderr": 0.0009900569977354416, "rougeL_fmeasure": 0.046183940516235616, "rougeL_fmeasure_stderr": 0.0016983322564463742, "rougeL_precision": 0.0857146344396996, "rougeL_precision_stderr": 0.003416320759360737, "rougeL_recall": 0.0445264515835226, "rougeL_recall_stderr": 0.0017587334889400765, "rougeLsum_fmeasure": 0.054222605768747344, "rougeLsum_fmeasure_stderr": 0.001979311930128123, "rougeLsum_precision": 0.0972394371974616, "rougeLsum_precision_stderr": 0.003756250172857945, "rougeLsum_recall": 0.05298521225905992, "rougeLsum_recall_stderr": 0.0021031803096113336}}, "5": {"tldr_en": {"bleu": 6.058866159905281e-16, "bleu_stderr": 2.3900117794994548e-15, "rouge1_fmeasure": 0.00927977866130263, "rouge1_fmeasure_stderr": 0.0009046636518910162, "rouge1_precision": 0.01666321917537446, "rouge1_precision_stderr": 0.0017144844493535089, "rouge1_recall": 0.009157687542181802, "rouge1_recall_stderr": 0.0009736218557175297, "rouge2_fmeasure": 0.0027628200691328687, "rouge2_fmeasure_stderr": 0.00040474421977877023, "rouge2_precision": 0.005635566478053115, "rouge2_precision_stderr": 0.0009678519939045886, "rouge2_recall": 0.002607955654242479, "rouge2_recall_stderr": 0.0004157349477803617, "rougeL_fmeasure": 0.007640431942454017, "rougeL_fmeasure_stderr": 0.0007551641757705242, "rougeL_precision": 0.014505480826312199, "rougeL_precision_stderr": 0.0015703654365691406, "rougeL_recall": 0.007377263181545838, "rougeL_recall_stderr": 0.0007854353219556513, "rougeLsum_fmeasure": 0.008819513768437372, "rougeLsum_fmeasure_stderr": 0.0008640055278148719, "rougeLsum_precision": 0.01613430616214765, "rougeLsum_precision_stderr": 0.0016859400230554314, "rougeLsum_recall": 0.008638145878204216, "rougeLsum_recall_stderr": 0.0009157832781292763}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 2.021544193176019, "bleu_stderr": 0.04672525631492646, "rouge1_fmeasure": 0.33225583490736194, "rouge1_fmeasure_stderr": 0.0030225097754752873, "rouge1_precision": 0.3452949497006971, "rouge1_precision_stderr": 0.0031874336438570136, "rouge1_recall": 0.3412418289576795, "rouge1_recall_stderr": 0.0034398228533816144, "rouge2_fmeasure": 0.11599617848462068, "rouge2_fmeasure_stderr": 0.0015886302807693743, "rouge2_precision": 0.11855573943397561, "rouge2_precision_stderr": 0.0015761697052151904, "rouge2_recall": 0.12140620974140548, "rouge2_recall_stderr": 0.0018263607260067699, "rougeL_fmeasure": 0.2383281424265221, "rougeL_fmeasure_stderr": 0.002215264906335406, "rougeL_precision": 0.24616297725036965, "rougeL_precision_stderr": 0.0022797007734971847, "rougeL_recall": 0.24639726684479762, "rougeL_recall_stderr": 0.0025797032547250176, "rougeLsum_fmeasure": 0.3081149911342974, "rougeLsum_fmeasure_stderr": 0.0028594046078756815, "rougeLsum_precision": 0.3190175340171995, "rougeLsum_precision_stderr": 0.002958732504531792, "rougeLsum_recall": 0.31764816538422297, "rougeLsum_recall_stderr": 0.0033043767478234854}}, "1": {"generate_text_restaurant": {"bleu": 14.53743726129001, "bleu_stderr": 0.14445690934234265, "rouge1_fmeasure": 0.5180906148683028, "rouge1_fmeasure_stderr": 0.0024567692495659718, "rouge1_precision": 0.6082899833637005, "rouge1_precision_stderr": 0.0032135327715503183, "rouge1_recall": 0.4939991587989527, "rouge1_recall_stderr": 0.0031897368344663206, "rouge2_fmeasure": 0.25244576500518445, "rouge2_fmeasure_stderr": 0.0021987704228017047, "rouge2_precision": 0.2992841860287957, "rouge2_precision_stderr": 0.0027535069501413525, "rouge2_recall": 0.24073848287899435, "rouge2_recall_stderr": 0.0023692738232853716, "rougeL_fmeasure": 0.3702354270220447, "rougeL_fmeasure_stderr": 0.002229049797298906, "rougeL_precision": 0.4377558923516418, "rougeL_precision_stderr": 0.0030209364495319283, "rougeL_recall": 0.3522118606318514, "rougeL_recall_stderr": 0.0026145016755065214, "rougeLsum_fmeasure": 0.4215085599639745, "rougeLsum_fmeasure_stderr": 0.0024837056691902086, "rougeLsum_precision": 0.49559037424824853, "rougeLsum_precision_stderr": 0.0031867294805906312, "rougeLsum_recall": 0.4017134750190358, "rougeLsum_recall_stderr": 0.002963390841940911}}, "2": {"generate_text_restaurant": {"bleu": 16.6974233877375, "bleu_stderr": 0.2259702264159085, "rouge1_fmeasure": 0.550751959150263, "rouge1_fmeasure_stderr": 0.002334628388103356, "rouge1_precision": 0.6308114156673001, "rouge1_precision_stderr": 0.0030515635455948056, "rouge1_recall": 0.5241737976358826, "rouge1_recall_stderr": 0.0029634534710997382, "rouge2_fmeasure": 0.28087700716585834, "rouge2_fmeasure_stderr": 0.002229388349802515, "rouge2_precision": 0.3245854766100972, "rouge2_precision_stderr": 0.002738959218334304, "rouge2_recall": 0.26676548563429314, "rouge2_recall_stderr": 0.002350700363573182, "rougeL_fmeasure": 0.40000959435601263, "rougeL_fmeasure_stderr": 0.002278833446615684, "rougeL_precision": 0.4599329987100549, "rougeL_precision_stderr": 0.0029633837519416856, "rougeL_recall": 0.38012920627255303, "rougeL_recall_stderr": 0.002577942917596876, "rougeLsum_fmeasure": 0.4557773932977128, "rougeLsum_fmeasure_stderr": 0.0024812907319972668, "rougeLsum_precision": 0.52202474439705, "rougeLsum_precision_stderr": 0.0031213147774664753, "rougeLsum_recall": 0.4336862446313041, "rougeLsum_recall_stderr": 0.002861587766898658}}, "3": {"generate_text_restaurant": {"bleu": 17.234863759565464, "bleu_stderr": 0.10626094777357345, "rouge1_fmeasure": 0.5604782743289571, "rouge1_fmeasure_stderr": 0.0022490187814614418, "rouge1_precision": 0.6451638771759513, "rouge1_precision_stderr": 0.0029145371412898314, "rouge1_recall": 0.5273596630229687, "rouge1_recall_stderr": 0.0028796397050091654, "rouge2_fmeasure": 0.29137647462406413, "rouge2_fmeasure_stderr": 0.0022566745277518207, "rouge2_precision": 0.337823399687939, "rouge2_precision_stderr": 0.002721131963155123, "rouge2_recall": 0.2741776446898378, "rouge2_recall_stderr": 0.002401017670924296, "rougeL_fmeasure": 0.40840245354897814, "rougeL_fmeasure_stderr": 0.002267056969140555, "rougeL_precision": 0.4713022566374456, "rougeL_precision_stderr": 0.002890868676128115, "rougeL_recall": 0.38393681062737384, "rougeL_recall_stderr": 0.0025637744220680424, "rougeLsum_fmeasure": 0.4667947275839455, "rougeLsum_fmeasure_stderr": 0.002424767456065824, "rougeLsum_precision": 0.5372731680476394, "rougeLsum_precision_stderr": 0.0030240854085328223, "rougeLsum_recall": 0.4392611204948281, "rougeLsum_recall_stderr": 0.0028130487549137095}}, "4": {"generate_text_restaurant": {"bleu": 17.43221273186975, "bleu_stderr": 0.14852992258132883, "rouge1_fmeasure": 0.5644088431191606, "rouge1_fmeasure_stderr": 0.0022998414866846426, "rouge1_precision": 0.6566667386405739, "rouge1_precision_stderr": 0.0029901104310675994, "rouge1_recall": 0.5250310784733211, "rouge1_recall_stderr": 0.002880579788531886, "rouge2_fmeasure": 0.29718267067971493, "rouge2_fmeasure_stderr": 0.0023174404504521418, "rouge2_precision": 0.3489715445596063, "rouge2_precision_stderr": 0.0028379714911102717, "rouge2_recall": 0.2759061577612267, "rouge2_recall_stderr": 0.002418733555610404, "rougeL_fmeasure": 0.413398262443025, "rougeL_fmeasure_stderr": 0.0023490992885206218, "rougeL_precision": 0.48238946977859254, "rougeL_precision_stderr": 0.0030258886889425406, "rougeL_recall": 0.3841163890326242, "rougeL_recall_stderr": 0.0025948324351265347, "rougeLsum_fmeasure": 0.4722659980511927, "rougeLsum_fmeasure_stderr": 0.002493574854840473, "rougeLsum_precision": 0.5493173554754809, "rougeLsum_precision_stderr": 0.0031195526839171693, "rougeLsum_recall": 0.43936018496359347, "rougeLsum_recall_stderr": 0.0028421410516723516}}, "5": {"generate_text_restaurant": {"bleu": 17.27987171682975, "bleu_stderr": 0.22299466307707344, "rouge1_fmeasure": 0.5661613833598578, "rouge1_fmeasure_stderr": 0.0022266488649150696, "rouge1_precision": 0.6628915228196061, "rouge1_precision_stderr": 0.002895062323071317, "rouge1_recall": 0.5218597939831594, "rouge1_recall_stderr": 0.0027835407580159314, "rouge2_fmeasure": 0.2990249691536151, "rouge2_fmeasure_stderr": 0.0022661493792424697, "rouge2_precision": 0.3538900021766568, "rouge2_precision_stderr": 0.002806480832935743, "rouge2_recall": 0.27471917389658523, "rouge2_recall_stderr": 0.0023266477815265104, "rougeL_fmeasure": 0.41542888953754, "rougeL_fmeasure_stderr": 0.002290321145556158, "rougeL_precision": 0.4881090745767031, "rougeL_precision_stderr": 0.0029641049977624664, "rougeL_recall": 0.3820604186528641, "rougeL_recall_stderr": 0.0024902085041821334, "rougeLsum_fmeasure": 0.47431838670472687, "rougeLsum_fmeasure_stderr": 0.002431375830473114, "rougeLsum_precision": 0.5555907001865986, "rougeLsum_precision_stderr": 0.0030782994957697994, "rougeLsum_recall": 0.4369700778836929, "rougeLsum_recall_stderr": 0.002723697352083028}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.598333295912428, "bleu_stderr": 0.04369181828078937, "rouge1_fmeasure": 0.20154310340791382, "rouge1_fmeasure_stderr": 0.002503527086260468, "rouge1_precision": 0.16091335382563132, "rouge1_precision_stderr": 0.0023214839725675105, "rouge1_recall": 0.31192453482745763, "rouge1_recall_stderr": 0.00431663840907332, "rouge2_fmeasure": 0.03939889642394197, "rouge2_fmeasure_stderr": 0.0014269911312081135, "rouge2_precision": 0.030192336082604687, "rouge2_precision_stderr": 0.0011298757941540884, "rouge2_recall": 0.06486203625582514, "rouge2_recall_stderr": 0.002495687484786392, "rougeL_fmeasure": 0.14814803620232633, "rougeL_fmeasure_stderr": 0.0018194923690126705, "rougeL_precision": 0.11804593952699223, "rougeL_precision_stderr": 0.001705008234352379, "rougeL_recall": 0.23090498894651582, "rougeL_recall_stderr": 0.003262191317539296, "rougeLsum_fmeasure": 0.15559873712369335, "rougeLsum_fmeasure_stderr": 0.002082741430413441, "rougeLsum_precision": 0.12345041061371383, "rougeLsum_precision_stderr": 0.0018404072547649245, "rougeLsum_recall": 0.24335540258442262, "rougeLsum_recall_stderr": 0.0037333681065596135}}, "1": {"article_DOC_summary": {"bleu": 1.3695546970670758, "bleu_stderr": 0.11288065979583667, "rouge1_fmeasure": 0.18440937659488896, "rouge1_fmeasure_stderr": 0.003013311852017229, "rouge1_precision": 0.1740287506077371, "rouge1_precision_stderr": 0.0035762963549450144, "rouge1_recall": 0.23534981942590377, "rouge1_recall_stderr": 0.003659738471811391, "rouge2_fmeasure": 0.03225325166687549, "rouge2_fmeasure_stderr": 0.0016333078188436293, "rouge2_precision": 0.03131736257651789, "rouge2_precision_stderr": 0.0017657388713041011, "rouge2_recall": 0.03976502875447221, "rouge2_recall_stderr": 0.001888187751012635, "rougeL_fmeasure": 0.14145239281725078, "rougeL_fmeasure_stderr": 0.0022406116825912387, "rougeL_precision": 0.13323315845033426, "rougeL_precision_stderr": 0.0026925290035192807, "rougeL_recall": 0.18196243615159283, "rougeL_recall_stderr": 0.002782613837447707, "rougeLsum_fmeasure": 0.14336629014297197, "rougeLsum_fmeasure_stderr": 0.0023103743217644922, "rougeLsum_precision": 0.13483891468420867, "rougeLsum_precision_stderr": 0.0027354596616931065, "rougeLsum_recall": 0.18482502398325326, "rougeLsum_recall_stderr": 0.002947981471107514}}, "2": {"article_DOC_summary": {"bleu": 1.7308730289259378, "bleu_stderr": 0.17770532210337478, "rouge1_fmeasure": 0.20363897243347945, "rouge1_fmeasure_stderr": 0.0032917079133891604, "rouge1_precision": 0.19985236655576322, "rouge1_precision_stderr": 0.003978452096741706, "rouge1_recall": 0.243103956744537, "rouge1_recall_stderr": 0.0036939855243832025, "rouge2_fmeasure": 0.03868362872963243, "rouge2_fmeasure_stderr": 0.0018516242073862486, "rouge2_precision": 0.039138232698124914, "rouge2_precision_stderr": 0.002062532698520258, "rouge2_recall": 0.044204414475646464, "rouge2_recall_stderr": 0.002004713309978894, "rougeL_fmeasure": 0.15378100277488155, "rougeL_fmeasure_stderr": 0.0024750584853391777, "rougeL_precision": 0.1506477774402608, "rougeL_precision_stderr": 0.003020476793401718, "rougeL_recall": 0.18469817921156023, "rougeL_recall_stderr": 0.0027972563470297306, "rougeLsum_fmeasure": 0.15570322493810856, "rougeLsum_fmeasure_stderr": 0.0025139232679603388, "rougeLsum_precision": 0.15239590573770034, "rougeLsum_precision_stderr": 0.0030467162920539536, "rougeLsum_recall": 0.18749040242023876, "rougeLsum_recall_stderr": 0.0029321410628762915}}, "3": {"article_DOC_summary": {"bleu": 1.6177421822314764, "bleu_stderr": 0.16975721226529852, "rouge1_fmeasure": 0.1953341147235192, "rouge1_fmeasure_stderr": 0.003349630735723747, "rouge1_precision": 0.19589658794017453, "rouge1_precision_stderr": 0.0039853906513849366, "rouge1_recall": 0.22849167295097247, "rouge1_recall_stderr": 0.0038566308815145126, "rouge2_fmeasure": 0.035779638469116914, "rouge2_fmeasure_stderr": 0.0016750996803778595, "rouge2_precision": 0.03625824703902077, "rouge2_precision_stderr": 0.0018314388825531765, "rouge2_recall": 0.0410258003930899, "rouge2_recall_stderr": 0.0018846780449691557, "rougeL_fmeasure": 0.1480464200642175, "rougeL_fmeasure_stderr": 0.0025456089591690385, "rougeL_precision": 0.148688151900945, "rougeL_precision_stderr": 0.003095918669053127, "rougeL_recall": 0.17399714292187934, "rougeL_recall_stderr": 0.0029506577213898122, "rougeLsum_fmeasure": 0.150857629725387, "rougeLsum_fmeasure_stderr": 0.0025908796219687017, "rougeLsum_precision": 0.1509875423818125, "rougeLsum_precision_stderr": 0.003109411289001428, "rougeLsum_recall": 0.17818678675083813, "rougeLsum_recall_stderr": 0.003105658861483215}}, "4": {"article_DOC_summary": {"bleu": 0.21556427831831304, "bleu_stderr": 0.05992392947727259, "rouge1_fmeasure": 0.055186415576932185, "rouge1_fmeasure_stderr": 0.0031944085309462042, "rouge1_precision": 0.06152754510214762, "rouge1_precision_stderr": 0.003813964694018243, "rouge1_recall": 0.060593832681025625, "rouge1_recall_stderr": 0.0035841876676039335, "rouge2_fmeasure": 0.010566772575448751, "rouge2_fmeasure_stderr": 0.0010986586323403539, "rouge2_precision": 0.011572977582773758, "rouge2_precision_stderr": 0.0012872321658089607, "rouge2_recall": 0.01149657270593299, "rouge2_recall_stderr": 0.0011931029906206063, "rougeL_fmeasure": 0.041175984283283985, "rougeL_fmeasure_stderr": 0.0024057569727709742, "rougeL_precision": 0.046781635255582606, "rougeL_precision_stderr": 0.00298661745817834, "rougeL_recall": 0.04475132182365535, "rougeL_recall_stderr": 0.0026420061653404576, "rougeLsum_fmeasure": 0.04268034684571005, "rougeLsum_fmeasure_stderr": 0.002476327367349351, "rougeLsum_precision": 0.04812651646509195, "rougeLsum_precision_stderr": 0.0030363159363846065, "rougeLsum_recall": 0.04684919680777955, "rougeLsum_recall_stderr": 0.0027817704867131674}}, "5": {"article_DOC_summary": {"bleu": 1.1959919832512981e-41, "bleu_stderr": 4.418229601032825e-38, "rouge1_fmeasure": 0.0027771927477267732, "rouge1_fmeasure_stderr": 0.0007722613879690155, "rouge1_precision": 0.0031162728399716532, "rouge1_precision_stderr": 0.0008737460454355458, "rouge1_recall": 0.002628521163198049, "rouge1_recall_stderr": 0.0007303851868588104, "rouge2_fmeasure": 0.0004013828039517138, "rouge2_fmeasure_stderr": 0.00020138596141432264, "rouge2_precision": 0.000451588582831646, "rouge2_precision_stderr": 0.00021783265095521171, "rouge2_recall": 0.00036940260996864765, "rouge2_recall_stderr": 0.00019380741759243087, "rougeL_fmeasure": 0.002068253073845221, "rougeL_fmeasure_stderr": 0.0005744934556807894, "rougeL_precision": 0.0023063072540261626, "rougeL_precision_stderr": 0.000636589500329542, "rougeL_recall": 0.001989261212241742, "rougeL_recall_stderr": 0.0005658258080821268, "rougeLsum_fmeasure": 0.00228235191573137, "rougeLsum_fmeasure_stderr": 0.0006382890720853227, "rougeLsum_precision": 0.0025476363697054154, "rougeLsum_precision_stderr": 0.0007118711755257363, "rougeLsum_recall": 0.0021837068783855215, "rougeLsum_recall_stderr": 0.0006187556981975251}}}}
4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.333,0.014910846164229868,0
3
+ anli_r2,acc,0.338,0.014965960710224482,0
4
+ anli_r3,acc,0.3458333333333333,0.013736245342311012,0
5
+ arc_challenge,acc,0.21416382252559726,0.011988383205966499,0
6
+ arc_challenge,acc_norm,0.25597269624573377,0.012753013241244513,0
7
+ arc_easy,acc,0.5294612794612794,0.010241957728409686,0
8
+ arc_easy,acc_norm,0.45202020202020204,0.010212436978834111,0
9
+ boolq,acc,0.6048929663608563,0.008550454248280895,1
10
+ cb,acc,0.35714285714285715,0.0646095738380922,1
11
+ cb,f1,0.19814814814814818,,1
12
+ copa,acc,0.68,0.046882617226215034,0
13
+ hellaswag,acc,0.3619796853216491,0.00479590828258455,0
14
+ hellaswag,acc_norm,0.44632543318064133,0.0049609473885351,0
15
+ piqa,acc,0.6980413492927094,0.01071173289158835,0
16
+ piqa,acc_norm,0.7040261153427638,0.010650414317148128,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.821,0.012128730605719111,0
19
+ sciq,acc_norm,0.711,0.01434171135829618,0
20
+ storycloze_2016,acc,0.6504543025120256,0.01102654800403797,0
21
+ winogrande,acc,0.5201262825572218,0.014041096664344327,0
4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.316,0.014709193056057121,0
3
+ anli_r2,acc,0.324,0.014806864733738856,0
4
+ anli_r3,acc,0.3358333333333333,0.013639261190932887,0
5
+ arc_challenge,acc,0.22440273037542663,0.012191404938603843,0
6
+ arc_challenge,acc_norm,0.2568259385665529,0.0127669237941168,0
7
+ arc_easy,acc,0.5429292929292929,0.010221897564256052,0
8
+ arc_easy,acc_norm,0.5096801346801347,0.010257860554461127,0
9
+ boolq,acc,0.55565749235474,0.00869070599067338,1
10
+ cb,acc,0.48214285714285715,0.06737697508644648,1
11
+ cb,f1,0.3373075012419274,,1
12
+ copa,acc,0.75,0.04351941398892446,0
13
+ hellaswag,acc,0.3617805218084047,0.004795337009118191,0
14
+ hellaswag,acc_norm,0.44234216291575384,0.0049564940598648966,0
15
+ piqa,acc,0.6980413492927094,0.010711732891588357,0
16
+ piqa,acc_norm,0.691512513601741,0.010776164678037155,0
17
+ rte,acc,0.5306859205776173,0.03003973059219781,0
18
+ sciq,acc,0.885,0.010093407594904628,0
19
+ sciq,acc_norm,0.853,0.011203415395160336,0
20
+ storycloze_2016,acc,0.6221272047033671,0.01121221988713706,0
21
+ winogrande,acc,0.5169692186266772,0.014044390401612978,0
4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.332,0.014899597242811473,0
3
+ anli_r2,acc,0.332,0.01489959724281148,0
4
+ anli_r3,acc,0.3258333333333333,0.013535422043417462,0
5
+ arc_challenge,acc,0.2167235494880546,0.012040156713481189,0
6
+ arc_challenge,acc_norm,0.2593856655290102,0.012808273573927097,0
7
+ arc_easy,acc,0.5429292929292929,0.01022189756425605,0
8
+ arc_easy,acc_norm,0.5315656565656566,0.010239317603199512,0
9
+ boolq,acc,0.5559633027522936,0.00869010521492079,1
10
+ cb,acc,0.48214285714285715,0.0673769750864465,1
11
+ cb,f1,0.3268398268398269,,1
12
+ copa,acc,0.64,0.04824181513244218,0
13
+ hellaswag,acc,0.3599880501892053,0.004790155370993451,0
14
+ hellaswag,acc_norm,0.44911372236606256,0.004963872936857939,0
15
+ piqa,acc,0.6985854189336235,0.01070624824275376,0
16
+ piqa,acc_norm,0.6969532100108814,0.010722648689531501,0
17
+ rte,acc,0.5126353790613718,0.030086851767188564,0
18
+ sciq,acc,0.881,0.010244215145336662,0
19
+ sciq,acc_norm,0.877,0.010391293421849879,0
20
+ storycloze_2016,acc,0.6264029930518439,0.011186849693644696,0
21
+ winogrande,acc,0.5240726124704025,0.014036189665395134,0
4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.29,0.014356395999905689,0
3
+ anli_r2,acc,0.365,0.015231776226264903,0
4
+ anli_r3,acc,0.3333333333333333,0.013613950010225603,0
5
+ arc_challenge,acc,0.21843003412969283,0.01207429160570098,0
6
+ arc_challenge,acc_norm,0.25597269624573377,0.012753013241244518,0
7
+ arc_easy,acc,0.5370370370370371,0.010231597249131051,0
8
+ arc_easy,acc_norm,0.5172558922558923,0.010253671674754631,0
9
+ boolq,acc,0.5663608562691131,0.008667690464344683,1
10
+ cb,acc,0.6071428571428571,0.0658538889806635,1
11
+ cb,f1,0.4062342885872297,,1
12
+ copa,acc,0.73,0.044619604333847394,0
13
+ hellaswag,acc,0.36168094005178253,0.004795051037917719,0
14
+ hellaswag,acc_norm,0.45210117506472813,0.004966832553245038,0
15
+ piqa,acc,0.704570184983678,0.010644731559342464,0
16
+ piqa,acc_norm,0.705658324265506,0.010633311470347519,0
17
+ rte,acc,0.5379061371841155,0.030009848912529117,0
18
+ sciq,acc,0.889,0.009938701010583726,0
19
+ sciq,acc_norm,0.875,0.010463483381956722,0
20
+ storycloze_2016,acc,0.6344200962052379,0.011136758947688388,0
21
+ winogrande,acc,0.5122336227308603,0.01404827882040562,0
4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.329,0.014865395385928364,0
3
+ anli_r2,acc,0.338,0.014965960710224487,0
4
+ anli_r3,acc,0.3325,0.013605417345710528,0
5
+ arc_challenge,acc,0.22525597269624573,0.012207839995407309,0
6
+ arc_challenge,acc_norm,0.25853242320819114,0.012794553754288673,0
7
+ arc_easy,acc,0.5433501683501684,0.010221149650118182,0
8
+ arc_easy,acc_norm,0.523989898989899,0.010247967392742688,0
9
+ boolq,acc,0.5577981651376147,0.00868643052611449,1
10
+ cb,acc,0.5535714285714286,0.06703189227942395,1
11
+ cb,f1,0.3502252252252252,,1
12
+ copa,acc,0.68,0.04688261722621505,0
13
+ hellaswag,acc,0.3595897231627166,0.004788994060654276,0
14
+ hellaswag,acc_norm,0.44911372236606256,0.004963872936857938,0
15
+ piqa,acc,0.7083786724700761,0.01060444152742879,0
16
+ piqa,acc_norm,0.7007616974972797,0.010684130673134581,0
17
+ rte,acc,0.4657039711191336,0.030025579819366426,0
18
+ sciq,acc,0.895,0.009698921026024968,0
19
+ sciq,acc_norm,0.898,0.00957536880165389,0
20
+ storycloze_2016,acc,0.6365579903794762,0.011122841442059708,0
21
+ winogrande,acc,0.5224940805051302,0.014038257824059876,0
4b284b84b10c4py/evaluation/rankeval/4b284b84b10c4py_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.337,0.014955087918653607,0
3
+ anli_r2,acc,0.337,0.014955087918653605,0
4
+ anli_r3,acc,0.335,0.013630871843821474,0
5
+ arc_challenge,acc,0.23122866894197952,0.012320858834772273,0
6
+ arc_challenge,acc_norm,0.2636518771331058,0.012875929151297065,0
7
+ arc_easy,acc,0.5454545454545454,0.010217299762709419,0
8
+ arc_easy,acc_norm,0.5374579124579124,0.010230952104570801,0
9
+ boolq,acc,0.5657492354740061,0.008669116184243044,1
10
+ cb,acc,0.5892857142857143,0.06633634150359538,1
11
+ cb,f1,0.365874363327674,,1
12
+ copa,acc,0.71,0.04560480215720684,0
13
+ hellaswag,acc,0.36217884883489343,0.004796478664403837,0
14
+ hellaswag,acc_norm,0.4455287791276638,0.0049600825288524325,0
15
+ piqa,acc,0.705114254624592,0.010639030620156998,0
16
+ piqa,acc_norm,0.6974972796517954,0.010717199698083898,0
17
+ rte,acc,0.5667870036101083,0.029826764082138267,0
18
+ sciq,acc,0.9,0.009491579957525057,0
19
+ sciq,acc_norm,0.892,0.009820001651345694,0
20
+ storycloze_2016,acc,0.6419027258150721,0.011087006809925708,0
21
+ winogrande,acc,0.510655090765588,0.0140492945362904,0
4b284b84b20c4py/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.01661379695567998
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.01661379695567998
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.24536868897035313
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.24536868897035313
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2731108373811898
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2731108373811898
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.28644947317730374
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.28644947317730374
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2885935293923734
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2885935293923734
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.29047091894764593
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.29047091894764593
14
+ e2e_nlg_cleaned,5,average,multiple,0.233434540804091
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04236381811374163
16
+ gem_xsum,0,median,rouge2_fmeasure,0.04236381811374163
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.041266355576914736
18
+ gem_xsum,1,median,rouge2_fmeasure,0.041266355576914736
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.046526613891962755
20
+ gem_xsum,2,median,rouge2_fmeasure,0.046526613891962755
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04886471919044582
22
+ gem_xsum,3,median,rouge2_fmeasure,0.04886471919044582
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01224992383643623
24
+ gem_xsum,4,median,rouge2_fmeasure,0.01224992383643623
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,7.982297862229252e-05
26
+ gem_xsum,5,median,rouge2_fmeasure,7.982297862229252e-05
27
+ gem_xsum,5,average,multiple,0.03189187559802058
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05110539884351626
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05110539884351626
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08390644664058856
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.08390644664058856
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.09440270765882894
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.09440270765882894
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.10149124697611836
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.10149124697611836
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.11095071967065845
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.11095071967065845
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.11183554919456225
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.11183554919456225
40
+ web_nlg_en,5,average,multiple,0.0922820114973788
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04782253241118541
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.04782253241118541
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05830265243888475
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05830265243888475
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06455785570484715
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.06455785570484715
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05320436365331453
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.05320436365331453
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.017957660778235317
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.017957660778235317
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0032253758284172046
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0032253758284172046
53
+ wiki_lingua_en,5,average,multiple,0.0408450734691474
4b284b84b20c4py/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.5821529903006863, "bleu_stderr": 0.04918999046303147, "rouge1_fmeasure": 0.11648399520322475, "rouge1_fmeasure_stderr": 0.001962734009745805, "rouge1_precision": 0.07386641694533813, "rouge1_precision_stderr": 0.001473585372150955, "rouge1_recall": 0.39567846626033865, "rouge1_recall_stderr": 0.005414503089578944, "rouge2_fmeasure": 0.05110539884351626, "rouge2_fmeasure_stderr": 0.0011859538860275287, "rouge2_precision": 0.03248305080433679, "rouge2_precision_stderr": 0.0009633290737548379, "rouge2_recall": 0.1832552775488699, "rouge2_recall_stderr": 0.004031339201967509, "rougeL_fmeasure": 0.10857448831592628, "rougeL_fmeasure_stderr": 0.0017327757539863185, "rougeL_precision": 0.0686187411531768, "rougeL_precision_stderr": 0.0013133622767438772, "rougeL_recall": 0.37529249384927266, "rougeL_recall_stderr": 0.005205708562785746, "rougeLsum_fmeasure": 0.10680966736431895, "rougeLsum_fmeasure_stderr": 0.0018094617190232258, "rougeLsum_precision": 0.0678966567842991, "rougeLsum_precision_stderr": 0.0013783793387494858, "rougeLsum_recall": 0.3611732242180814, "rougeLsum_recall_stderr": 0.0048968470368908395}}, "1": {"PALM_prompt": {"bleu": 0.7223543082267028, "bleu_stderr": 0.04821949690776964, "rouge1_fmeasure": 0.16315293939256764, "rouge1_fmeasure_stderr": 0.00406606391553416, "rouge1_precision": 0.1544990164378509, "rouge1_precision_stderr": 0.005109936829222964, "rouge1_recall": 0.29750968177881815, "rouge1_recall_stderr": 0.0053921734698164, "rouge2_fmeasure": 0.08390644664058856, "rouge2_fmeasure_stderr": 0.002709652420328561, "rouge2_precision": 0.08128625147185826, "rouge2_precision_stderr": 0.003552808995225472, "rouge2_recall": 0.1541419771733494, "rouge2_recall_stderr": 0.003687653662432468, "rougeL_fmeasure": 0.14654695823818142, "rougeL_fmeasure_stderr": 0.003492184169587925, "rougeL_precision": 0.13931328945798263, "rougeL_precision_stderr": 0.004639745020801835, "rougeL_recall": 0.2742437769894535, "rougeL_recall_stderr": 0.004851674974111816, "rougeLsum_fmeasure": 0.14985047771566065, "rougeLsum_fmeasure_stderr": 0.0035906957715719674, "rougeLsum_precision": 0.1423726476383948, "rougeLsum_precision_stderr": 0.0047153719491208395, "rougeLsum_recall": 0.2782302214571647, "rougeLsum_recall_stderr": 0.004924012809456184}}, "2": {"PALM_prompt": {"bleu": 0.7378703499833328, "bleu_stderr": 0.05244772521664121, "rouge1_fmeasure": 0.18021130544779096, "rouge1_fmeasure_stderr": 0.004272941447400605, "rouge1_precision": 0.1563219057138968, "rouge1_precision_stderr": 0.004858791940078939, "rouge1_recall": 0.34096200412946576, "rouge1_recall_stderr": 0.0052420535074354235, "rouge2_fmeasure": 0.09440270765882894, "rouge2_fmeasure_stderr": 0.0028820359666008894, "rouge2_precision": 0.08318072395118746, "rouge2_precision_stderr": 0.003239501070654235, "rouge2_recall": 0.17995254809157346, "rouge2_recall_stderr": 0.0038237592317804203, "rougeL_fmeasure": 0.15883054738596875, "rougeL_fmeasure_stderr": 0.0035788011689506607, "rougeL_precision": 0.13592667591898291, "rougeL_precision_stderr": 0.004122751284116168, "rougeL_recall": 0.31128419898844045, "rougeL_recall_stderr": 0.0046882721291824836, "rougeLsum_fmeasure": 0.16488490511394102, "rougeLsum_fmeasure_stderr": 0.003755662348446356, "rougeLsum_precision": 0.14180027524312472, "rougeLsum_precision_stderr": 0.0043181462846973075, "rougeLsum_recall": 0.31911419618669157, "rougeLsum_recall_stderr": 0.004792670653494021}}, "3": {"PALM_prompt": {"bleu": 0.8455200709652629, "bleu_stderr": 0.04196246583354288, "rouge1_fmeasure": 0.18931101361421848, "rouge1_fmeasure_stderr": 0.00441175543523593, "rouge1_precision": 0.1657620132776089, "rouge1_precision_stderr": 0.005043163950083002, "rouge1_recall": 0.3530523030067534, "rouge1_recall_stderr": 0.005295085557460209, "rouge2_fmeasure": 0.10149124697611836, "rouge2_fmeasure_stderr": 0.0030597860330415136, "rouge2_precision": 0.09113130928821495, "rouge2_precision_stderr": 0.00352182450003332, "rouge2_recall": 0.19048234238417183, "rouge2_recall_stderr": 0.003997027920968389, "rougeL_fmeasure": 0.16703960549823277, "rougeL_fmeasure_stderr": 0.003744018185437013, "rougeL_precision": 0.1447000344818964, "rougeL_precision_stderr": 0.0043508515855694265, "rougeL_recall": 0.32151424349582025, "rougeL_recall_stderr": 0.004751436169912958, "rougeLsum_fmeasure": 0.1737022822557869, "rougeLsum_fmeasure_stderr": 0.003918765580791864, "rougeLsum_precision": 0.15117602656048074, "rougeLsum_precision_stderr": 0.004540593426899662, "rougeLsum_recall": 0.3302972415087775, "rougeLsum_recall_stderr": 0.0048599048694187045}}, "4": {"PALM_prompt": {"bleu": 1.0438894385302977, "bleu_stderr": 0.04793858954840552, "rouge1_fmeasure": 0.19902765425868502, "rouge1_fmeasure_stderr": 0.004577559193842631, "rouge1_precision": 0.17498650423231482, "rouge1_precision_stderr": 0.005226003580397081, "rouge1_recall": 0.36780447034284164, "rouge1_recall_stderr": 0.005431475165656425, "rouge2_fmeasure": 0.11095071967065845, "rouge2_fmeasure_stderr": 0.0032440936870027355, "rouge2_precision": 0.09974409410725339, "rouge2_precision_stderr": 0.003653629975759034, "rouge2_recall": 0.20509195524896126, "rouge2_recall_stderr": 0.004169873380236561, "rougeL_fmeasure": 0.17536389049505893, "rougeL_fmeasure_stderr": 0.0038982782803899446, "rougeL_precision": 0.15273278924825234, "rougeL_precision_stderr": 0.004512681594242828, "rougeL_recall": 0.3341937179899564, "rougeL_recall_stderr": 0.004863935699990443, "rougeLsum_fmeasure": 0.18302148806966395, "rougeLsum_fmeasure_stderr": 0.0041088067443544605, "rougeLsum_precision": 0.16022092144366937, "rougeLsum_precision_stderr": 0.0047478422151604575, "rougeLsum_recall": 0.3441177829627617, "rougeLsum_recall_stderr": 0.005004731636362767}}, "5": {"PALM_prompt": {"bleu": 1.0466750858653324, "bleu_stderr": 0.05396570601295072, "rouge1_fmeasure": 0.20551706538102096, "rouge1_fmeasure_stderr": 0.004610400264565149, "rouge1_precision": 0.18406549705329572, "rouge1_precision_stderr": 0.005371878868504491, "rouge1_recall": 0.3719286607456516, "rouge1_recall_stderr": 0.005364333907557689, "rouge2_fmeasure": 0.11183554919456225, "rouge2_fmeasure_stderr": 0.003125085276400022, "rouge2_precision": 0.10307937795923339, "rouge2_precision_stderr": 0.0036674716128668973, "rouge2_recall": 0.20381389809228248, "rouge2_recall_stderr": 0.004056676150979306, "rougeL_fmeasure": 0.18043825832369542, "rougeL_fmeasure_stderr": 0.0038675119824013696, "rougeL_precision": 0.1593440951510111, "rougeL_precision_stderr": 0.004547392408505862, "rougeL_recall": 0.33912003411872, "rougeL_recall_stderr": 0.004834061594195042, "rougeLsum_fmeasure": 0.1884167521241184, "rougeLsum_fmeasure_stderr": 0.0040990266910383025, "rougeLsum_precision": 0.16770822931202312, "rougeLsum_precision_stderr": 0.004841756735009651, "rougeLsum_recall": 0.34856549792968894, "rougeLsum_recall_stderr": 0.004968142545978921}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.8644959703383694, "bleu_stderr": 0.09840283116549842, "rouge1_fmeasure": 0.19105005971268485, "rouge1_fmeasure_stderr": 0.0023103077379001513, "rouge1_precision": 0.17666729161870398, "rouge1_precision_stderr": 0.002564314103617473, "rouge1_recall": 0.2592480740561813, "rouge1_recall_stderr": 0.003244297331582796, "rouge2_fmeasure": 0.04782253241118541, "rouge2_fmeasure_stderr": 0.0010918302847590757, "rouge2_precision": 0.04362509472364397, "rouge2_precision_stderr": 0.0010895791924211578, "rouge2_recall": 0.06638238429547004, "rouge2_recall_stderr": 0.0016626273617933336, "rougeL_fmeasure": 0.14392086336616933, "rougeL_fmeasure_stderr": 0.0016839270045610358, "rougeL_precision": 0.13256720312713946, "rougeL_precision_stderr": 0.001929533760984254, "rougeL_recall": 0.1991189509046663, "rougeL_recall_stderr": 0.002560502239462447, "rougeLsum_fmeasure": 0.17821464833251138, "rougeLsum_fmeasure_stderr": 0.002170363421403627, "rougeLsum_precision": 0.16494730429754728, "rougeLsum_precision_stderr": 0.0024267862201027405, "rougeLsum_recall": 0.24216666142985147, "rougeLsum_recall_stderr": 0.003059394679821919}}, "1": {"tldr_en": {"bleu": 3.2291930252196894, "bleu_stderr": 0.0813662472819421, "rouge1_fmeasure": 0.21138936951170206, "rouge1_fmeasure_stderr": 0.0021119436014965126, "rouge1_precision": 0.3284115430545649, "rouge1_precision_stderr": 0.0044703231640133324, "rouge1_recall": 0.23063838463945993, "rouge1_recall_stderr": 0.002967307451046764, "rouge2_fmeasure": 0.05830265243888475, "rouge2_fmeasure_stderr": 0.0013343876301249531, "rouge2_precision": 0.10638602617553335, "rouge2_precision_stderr": 0.0032259378569320244, "rouge2_recall": 0.06165529786910488, "rouge2_recall_stderr": 0.0015606126487480723, "rougeL_fmeasure": 0.1612495154940211, "rougeL_fmeasure_stderr": 0.00164535062263176, "rougeL_precision": 0.2614059247131426, "rougeL_precision_stderr": 0.004001018574808426, "rougeL_recall": 0.17470934598435137, "rougeL_recall_stderr": 0.002291915300777273, "rougeLsum_fmeasure": 0.19781460859524516, "rougeLsum_fmeasure_stderr": 0.0019870772811840932, "rougeLsum_precision": 0.30990128884139945, "rougeLsum_precision_stderr": 0.004333017412295463, "rougeLsum_recall": 0.21562666164817174, "rougeLsum_recall_stderr": 0.0027832898109490414}}, "2": {"tldr_en": {"bleu": 3.5225452851718835, "bleu_stderr": 0.11186704613999687, "rouge1_fmeasure": 0.22002063765271593, "rouge1_fmeasure_stderr": 0.002220610600873451, "rouge1_precision": 0.35516121344732815, "rouge1_precision_stderr": 0.004503744620650759, "rouge1_recall": 0.22637157441136155, "rouge1_recall_stderr": 0.002939317731121365, "rouge2_fmeasure": 0.06455785570484715, "rouge2_fmeasure_stderr": 0.001428269385467915, "rouge2_precision": 0.12089774581306544, "rouge2_precision_stderr": 0.0033596165725887587, "rouge2_recall": 0.06380383704660388, "rouge2_recall_stderr": 0.0015307708282999337, "rougeL_fmeasure": 0.1710811281214488, "rougeL_fmeasure_stderr": 0.001767537350938728, "rougeL_precision": 0.28750238667680816, "rougeL_precision_stderr": 0.00406689065814819, "rougeL_recall": 0.17391305276884944, "rougeL_recall_stderr": 0.00227935817197457, "rougeLsum_fmeasure": 0.20737584657014088, "rougeLsum_fmeasure_stderr": 0.002104069411424446, "rougeLsum_precision": 0.3377932043147537, "rougeLsum_precision_stderr": 0.004415802851933227, "rougeLsum_recall": 0.2129690629149275, "rougeLsum_recall_stderr": 0.0027739680144594427}}, "3": {"tldr_en": {"bleu": 2.36637952818202, "bleu_stderr": 0.09034253842241578, "rouge1_fmeasure": 0.1842523172993621, "rouge1_fmeasure_stderr": 0.0025270968369157606, "rouge1_precision": 0.30492171131094353, "rouge1_precision_stderr": 0.0047772838466778565, "rouge1_recall": 0.18429202449323562, "rouge1_recall_stderr": 0.0030852957702281756, "rouge2_fmeasure": 0.05320436365331453, "rouge2_fmeasure_stderr": 0.0013871858417417512, "rouge2_precision": 0.09877293469478084, "rouge2_precision_stderr": 0.003042979575375095, "rouge2_recall": 0.05222761350470907, "rouge2_recall_stderr": 0.0015128302917191373, "rougeL_fmeasure": 0.14440728205894401, "rougeL_fmeasure_stderr": 0.0019982150797446126, "rougeL_precision": 0.24798805756012632, "rougeL_precision_stderr": 0.004187155485268946, "rougeL_recall": 0.14302881144547364, "rougeL_recall_stderr": 0.002414714553340016, "rougeLsum_fmeasure": 0.17378946036020648, "rougeLsum_fmeasure_stderr": 0.0023823495138698175, "rougeLsum_precision": 0.2898125284191609, "rougeLsum_precision_stderr": 0.004617020091129872, "rougeLsum_recall": 0.17345207821084954, "rougeLsum_recall_stderr": 0.002903285955613911}}, "4": {"tldr_en": {"bleu": 0.06457564141470507, "bleu_stderr": 0.008329756004432651, "rouge1_fmeasure": 0.06118855220434953, "rouge1_fmeasure_stderr": 0.0021661323525031715, "rouge1_precision": 0.10059288555045104, "rouge1_precision_stderr": 0.0038158421954036007, "rouge1_recall": 0.0629899973616509, "rouge1_recall_stderr": 0.002443758406766456, "rouge2_fmeasure": 0.017957660778235317, "rouge2_fmeasure_stderr": 0.0009566364194150249, "rouge2_precision": 0.03429382030631058, "rouge2_precision_stderr": 0.002180070026084628, "rouge2_recall": 0.01817045547905002, "rouge2_recall_stderr": 0.0010725598013084623, "rougeL_fmeasure": 0.0489275300669287, "rougeL_fmeasure_stderr": 0.001745459259837189, "rougeL_precision": 0.08431661025154882, "rougeL_precision_stderr": 0.003371556566767307, "rougeL_recall": 0.049720662704722134, "rougeL_recall_stderr": 0.001944703599780324, "rougeLsum_fmeasure": 0.05752768159536108, "rougeLsum_fmeasure_stderr": 0.002029725801391926, "rougeLsum_precision": 0.09577435530540411, "rougeLsum_precision_stderr": 0.003676164831864219, "rougeLsum_recall": 0.05915809171587482, "rougeLsum_recall_stderr": 0.0022962397310322636}}, "5": {"tldr_en": {"bleu": 1.8380765632449567e-14, "bleu_stderr": 2.3549500616400215e-13, "rouge1_fmeasure": 0.009984147630959901, "rouge1_fmeasure_stderr": 0.0009984248511469506, "rouge1_precision": 0.016898017901863373, "rouge1_precision_stderr": 0.0017750990265208825, "rouge1_recall": 0.010082034086545925, "rouge1_recall_stderr": 0.0010784380652309026, "rouge2_fmeasure": 0.0032253758284172046, "rouge2_fmeasure_stderr": 0.00046285054343731663, "rouge2_precision": 0.0065299458009213975, "rouge2_precision_stderr": 0.001035120513731759, "rouge2_recall": 0.0031770613695000236, "rouge2_recall_stderr": 0.0004797988402385803, "rougeL_fmeasure": 0.008077159759742726, "rougeL_fmeasure_stderr": 0.0008280485118836229, "rougeL_precision": 0.014479800935916555, "rougeL_precision_stderr": 0.0016146849938303849, "rougeL_recall": 0.00801090505543412, "rougeL_recall_stderr": 0.0008700009740388975, "rougeLsum_fmeasure": 0.00943652067919951, "rougeLsum_fmeasure_stderr": 0.0009461154098825179, "rougeLsum_precision": 0.016184294097583637, "rougeLsum_precision_stderr": 0.001720217259749641, "rougeLsum_recall": 0.009470184917356883, "rougeLsum_recall_stderr": 0.0010100495348991784}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.43247677184685024, "bleu_stderr": 0.033875851652353954, "rouge1_fmeasure": 0.10950171614553199, "rouge1_fmeasure_stderr": 0.001695097915251923, "rouge1_precision": 0.10959397607392916, "rouge1_precision_stderr": 0.0015082486108122235, "rouge1_recall": 0.11864984727835637, "rouge1_recall_stderr": 0.0022176499929626235, "rouge2_fmeasure": 0.01661379695567998, "rouge2_fmeasure_stderr": 0.0007317122404629029, "rouge2_precision": 0.01527462996638652, "rouge2_precision_stderr": 0.0006638299650542848, "rouge2_recall": 0.019999829904043694, "rouge2_recall_stderr": 0.0009341184850982453, "rougeL_fmeasure": 0.09304292066966034, "rougeL_fmeasure_stderr": 0.0011571807694826105, "rougeL_precision": 0.09436066788883278, "rougeL_precision_stderr": 0.0010570471723853009, "rougeL_recall": 0.09932327469630105, "rougeL_recall_stderr": 0.0015360141973192033, "rougeLsum_fmeasure": 0.0998911245421451, "rougeLsum_fmeasure_stderr": 0.0014072933040335162, "rougeLsum_precision": 0.10077063698381258, "rougeLsum_precision_stderr": 0.00127965803739232, "rougeLsum_recall": 0.10731074612593083, "rougeLsum_recall_stderr": 0.0018385590198386557}}, "1": {"generate_text_restaurant": {"bleu": 14.138799978501053, "bleu_stderr": 0.15480830784903687, "rouge1_fmeasure": 0.5125167205611227, "rouge1_fmeasure_stderr": 0.0023651662897042755, "rouge1_precision": 0.5968366957436839, "rouge1_precision_stderr": 0.003015142456201993, "rouge1_recall": 0.48954973616011194, "rouge1_recall_stderr": 0.003154110033453677, "rouge2_fmeasure": 0.24536868897035313, "rouge2_fmeasure_stderr": 0.0021152678254568883, "rouge2_precision": 0.28811459984378174, "rouge2_precision_stderr": 0.0026040248998219972, "rouge2_recall": 0.23475714793970334, "rouge2_recall_stderr": 0.002314968393870581, "rougeL_fmeasure": 0.36458967944121407, "rougeL_fmeasure_stderr": 0.0021531883816726166, "rougeL_precision": 0.42787430905551216, "rougeL_precision_stderr": 0.0028633121423425455, "rougeL_recall": 0.3470673946354127, "rougeL_recall_stderr": 0.0025728563437977126, "rougeLsum_fmeasure": 0.417350163512538, "rougeLsum_fmeasure_stderr": 0.0024012705814720663, "rougeLsum_precision": 0.48668255912820346, "rougeLsum_precision_stderr": 0.00301880494587729, "rougeLsum_recall": 0.3985290767399889, "rougeLsum_recall_stderr": 0.0029315744162868814}}, "2": {"generate_text_restaurant": {"bleu": 16.67584789617738, "bleu_stderr": 0.24934972090088892, "rouge1_fmeasure": 0.5434492838820844, "rouge1_fmeasure_stderr": 0.0022798657306134494, "rouge1_precision": 0.6044891850192401, "rouge1_precision_stderr": 0.002962047333298037, "rouge1_recall": 0.5296091433441801, "rouge1_recall_stderr": 0.0029699728553327985, "rouge2_fmeasure": 0.2731108373811898, "rouge2_fmeasure_stderr": 0.0021911774712661926, "rouge2_precision": 0.3060578998541716, "rouge2_precision_stderr": 0.0026210349607320195, "rouge2_recall": 0.2663332824849255, "rouge2_recall_stderr": 0.0023940282146509124, "rougeL_fmeasure": 0.3886806544508872, "rougeL_fmeasure_stderr": 0.0021844936986815003, "rougeL_precision": 0.4339464538873811, "rougeL_precision_stderr": 0.0027896307194930727, "rougeL_recall": 0.37822202655327963, "rougeL_recall_stderr": 0.0025535736155600725, "rougeLsum_fmeasure": 0.44995734822290556, "rougeLsum_fmeasure_stderr": 0.002414589606781355, "rougeLsum_precision": 0.500797595703899, "rougeLsum_precision_stderr": 0.0030140205124491366, "rougeLsum_recall": 0.43833203269544485, "rougeLsum_recall_stderr": 0.0028575376559899297}}, "3": {"generate_text_restaurant": {"bleu": 17.9289834939611, "bleu_stderr": 0.2213809634052702, "rouge1_fmeasure": 0.5531025638271319, "rouge1_fmeasure_stderr": 0.002287431155068112, "rouge1_precision": 0.6095035638446965, "rouge1_precision_stderr": 0.002930771405927963, "rouge1_recall": 0.5400385856426181, "rouge1_recall_stderr": 0.002958534482681524, "rouge2_fmeasure": 0.28644947317730374, "rouge2_fmeasure_stderr": 0.0022634076265008176, "rouge2_precision": 0.31735677434972087, "rouge2_precision_stderr": 0.002652924486773051, "rouge2_recall": 0.2801209686683005, "rouge2_recall_stderr": 0.0024697622358377117, "rougeL_fmeasure": 0.4009270336830942, "rougeL_fmeasure_stderr": 0.0022477469014727212, "rougeL_precision": 0.44321547746696427, "rougeL_precision_stderr": 0.0028028512374731335, "rougeL_recall": 0.39096477845132616, "rougeL_recall_stderr": 0.0026123031329029745, "rougeLsum_fmeasure": 0.46312437456752364, "rougeLsum_fmeasure_stderr": 0.002440885766456274, "rougeLsum_precision": 0.5103701959134134, "rougeLsum_precision_stderr": 0.002984407196868822, "rougeLsum_recall": 0.4522481230027106, "rougeLsum_recall_stderr": 0.0028960497978095996}}, "4": {"generate_text_restaurant": {"bleu": 18.149147747916913, "bleu_stderr": 0.20148633900165086, "rouge1_fmeasure": 0.5535709465739721, "rouge1_fmeasure_stderr": 0.0022696074476917737, "rouge1_precision": 0.6065356554980276, "rouge1_precision_stderr": 0.0029114663522908494, "rouge1_recall": 0.5407131752166793, "rouge1_recall_stderr": 0.0028869233555941847, "rouge2_fmeasure": 0.2885935293923734, "rouge2_fmeasure_stderr": 0.00227878913447549, "rouge2_precision": 0.3175050521033487, "rouge2_precision_stderr": 0.0026446629424917993, "rouge2_recall": 0.28222258859716826, "rouge2_recall_stderr": 0.0024700817238289944, "rougeL_fmeasure": 0.40073709641210925, "rougeL_fmeasure_stderr": 0.002259079219646151, "rougeL_precision": 0.43968215201631833, "rougeL_precision_stderr": 0.0027559200740548723, "rougeL_recall": 0.3912634075317438, "rougeL_recall_stderr": 0.0026077143358096103, "rougeLsum_fmeasure": 0.46512455647571105, "rougeLsum_fmeasure_stderr": 0.00246162611684128, "rougeLsum_precision": 0.509403064079757, "rougeLsum_precision_stderr": 0.0029861554036300585, "rougeLsum_recall": 0.45442412908463375, "rougeLsum_recall_stderr": 0.0028784777463463844}}, "5": {"generate_text_restaurant": {"bleu": 18.293098564935043, "bleu_stderr": 0.2015089174052472, "rouge1_fmeasure": 0.5549735351709728, "rouge1_fmeasure_stderr": 0.002267759028289067, "rouge1_precision": 0.6077504578417503, "rouge1_precision_stderr": 0.0029261887327982562, "rouge1_recall": 0.5415558465280581, "rouge1_recall_stderr": 0.002846550944908392, "rouge2_fmeasure": 0.29047091894764593, "rouge2_fmeasure_stderr": 0.0023151393500169828, "rouge2_precision": 0.3196575755449254, "rouge2_precision_stderr": 0.0027013822082627096, "rouge2_recall": 0.28374451994336647, "rouge2_recall_stderr": 0.002479580564388698, "rougeL_fmeasure": 0.4038165297365153, "rougeL_fmeasure_stderr": 0.0022641228045925934, "rougeL_precision": 0.4429726081641192, "rougeL_precision_stderr": 0.002791198713554071, "rougeL_recall": 0.39397526144610207, "rougeL_recall_stderr": 0.0025934124726939927, "rougeLsum_fmeasure": 0.4692296832791923, "rougeLsum_fmeasure_stderr": 0.002459426145702881, "rougeLsum_precision": 0.5137249582242605, "rougeLsum_precision_stderr": 0.0030010894093243884, "rougeLsum_recall": 0.4579977387073725, "rougeLsum_recall_stderr": 0.0028534759678292203}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.7077858100284875, "bleu_stderr": 0.06318104051496665, "rouge1_fmeasure": 0.20585322846240078, "rouge1_fmeasure_stderr": 0.0025590515279682287, "rouge1_precision": 0.15957877260264566, "rouge1_precision_stderr": 0.002268991781760454, "rouge1_recall": 0.32811311012769145, "rouge1_recall_stderr": 0.00437654661974792, "rouge2_fmeasure": 0.04236381811374163, "rouge2_fmeasure_stderr": 0.0014754029359749594, "rouge2_precision": 0.0321492286692698, "rouge2_precision_stderr": 0.0011871798504177848, "rouge2_recall": 0.07023735951022572, "rouge2_recall_stderr": 0.0024919186942029548, "rougeL_fmeasure": 0.15149462132825195, "rougeL_fmeasure_stderr": 0.001896257149928836, "rougeL_precision": 0.11772435407501897, "rougeL_precision_stderr": 0.0017511876283900834, "rougeL_recall": 0.24205558291065699, "rougeL_recall_stderr": 0.0032633158272913202, "rougeLsum_fmeasure": 0.16193719293154057, "rougeLsum_fmeasure_stderr": 0.0021003252791642214, "rougeLsum_precision": 0.1252147373709473, "rougeLsum_precision_stderr": 0.0018424550712854318, "rougeLsum_recall": 0.2599266014362882, "rougeLsum_recall_stderr": 0.0036876361603540428}}, "1": {"article_DOC_summary": {"bleu": 1.8282472672146812, "bleu_stderr": 0.1020121938444088, "rouge1_fmeasure": 0.20198994819537205, "rouge1_fmeasure_stderr": 0.002984694518793648, "rouge1_precision": 0.18241736362419408, "rouge1_precision_stderr": 0.0035671793011444824, "rouge1_recall": 0.28088578439333156, "rouge1_recall_stderr": 0.004150802521339531, "rouge2_fmeasure": 0.041266355576914736, "rouge2_fmeasure_stderr": 0.0017486414825347912, "rouge2_precision": 0.03766732268521147, "rouge2_precision_stderr": 0.0018002611735446364, "rouge2_recall": 0.05744303705984819, "rouge2_recall_stderr": 0.002394118114694247, "rougeL_fmeasure": 0.1537399711922189, "rougeL_fmeasure_stderr": 0.0023040866331068065, "rougeL_precision": 0.13852250267452665, "rougeL_precision_stderr": 0.0027600347382047627, "rougeL_recall": 0.21557457573877506, "rougeL_recall_stderr": 0.0032748640854860915, "rougeLsum_fmeasure": 0.157801209736622, "rougeLsum_fmeasure_stderr": 0.0023904477003575016, "rougeLsum_precision": 0.14149023067517916, "rougeLsum_precision_stderr": 0.002779812720023245, "rougeLsum_recall": 0.2224713251909163, "rougeLsum_recall_stderr": 0.0035223603046659778}}, "2": {"article_DOC_summary": {"bleu": 2.0883920440199915, "bleu_stderr": 0.11737825754959746, "rouge1_fmeasure": 0.2195684919577384, "rouge1_fmeasure_stderr": 0.003254265138077691, "rouge1_precision": 0.21893354662011963, "rouge1_precision_stderr": 0.004048272249341602, "rouge1_recall": 0.260092993109534, "rouge1_recall_stderr": 0.003902306781441875, "rouge2_fmeasure": 0.046526613891962755, "rouge2_fmeasure_stderr": 0.00204903807224034, "rouge2_precision": 0.04791765027513919, "rouge2_precision_stderr": 0.0022980519753049485, "rouge2_recall": 0.053493298501079714, "rouge2_recall_stderr": 0.0023052325119618023, "rougeL_fmeasure": 0.16501217387582887, "rougeL_fmeasure_stderr": 0.0025424570393059753, "rougeL_precision": 0.16473734247474792, "rougeL_precision_stderr": 0.0031841165642595994, "rougeL_recall": 0.19604258368067826, "rougeL_recall_stderr": 0.0030444450297627, "rougeLsum_fmeasure": 0.16810787872676755, "rougeLsum_fmeasure_stderr": 0.0025882089916786253, "rougeLsum_precision": 0.1671435431989699, "rougeLsum_precision_stderr": 0.0031935413338326185, "rougeLsum_recall": 0.20091357776545432, "rougeLsum_recall_stderr": 0.003229501111978948}}, "3": {"article_DOC_summary": {"bleu": 2.4903444029414104, "bleu_stderr": 0.16888554758075064, "rouge1_fmeasure": 0.21513910465275415, "rouge1_fmeasure_stderr": 0.0036788747099070216, "rouge1_precision": 0.22154258705050656, "rouge1_precision_stderr": 0.004416958718487466, "rouge1_recall": 0.2427641584174357, "rouge1_recall_stderr": 0.00420050957184403, "rouge2_fmeasure": 0.04886471919044582, "rouge2_fmeasure_stderr": 0.002157338930931576, "rouge2_precision": 0.050821433347839264, "rouge2_precision_stderr": 0.0023945363772249644, "rouge2_recall": 0.05453094107822012, "rouge2_recall_stderr": 0.002408142744637484, "rougeL_fmeasure": 0.16128891406853352, "rougeL_fmeasure_stderr": 0.002880789024699209, "rougeL_precision": 0.16668887518076203, "rougeL_precision_stderr": 0.003514628068406048, "rougeL_recall": 0.18226837233438312, "rougeL_recall_stderr": 0.003295475230668282, "rougeLsum_fmeasure": 0.1642209135986658, "rougeLsum_fmeasure_stderr": 0.0029068460098697737, "rougeLsum_precision": 0.16896742575336499, "rougeLsum_precision_stderr": 0.00351461682461915, "rougeLsum_recall": 0.1869678121534825, "rougeLsum_recall_stderr": 0.003429064046115561}}, "4": {"article_DOC_summary": {"bleu": 0.215456941864469, "bleu_stderr": 0.06941440736262179, "rouge1_fmeasure": 0.0572325652119175, "rouge1_fmeasure_stderr": 0.0032468616776549866, "rouge1_precision": 0.06587717027902634, "rouge1_precision_stderr": 0.0039704424509340985, "rouge1_recall": 0.06115142824616809, "rouge1_recall_stderr": 0.003616406533465941, "rouge2_fmeasure": 0.01224992383643623, "rouge2_fmeasure_stderr": 0.0011777335201229031, "rouge2_precision": 0.014030183412219063, "rouge2_precision_stderr": 0.001506975166878615, "rouge2_recall": 0.013471932961216628, "rouge2_recall_stderr": 0.0013203271107582826, "rougeL_fmeasure": 0.04246528505415009, "rougeL_fmeasure_stderr": 0.0024489654144463574, "rougeL_precision": 0.050416403187604805, "rougeL_precision_stderr": 0.0032341383122656336, "rougeL_recall": 0.0450041617246973, "rougeL_recall_stderr": 0.002689245329612641, "rougeLsum_fmeasure": 0.04361018726263517, "rougeLsum_fmeasure_stderr": 0.0025148690575369646, "rougeLsum_precision": 0.05133449482703434, "rougeLsum_precision_stderr": 0.0032661771463960336, "rougeLsum_recall": 0.046814652800863985, "rougeLsum_recall_stderr": 0.002844770088636042}}, "5": {"article_DOC_summary": {"bleu": 4.047824391489386e-39, "bleu_stderr": 1.7044826243904522e-34, "rouge1_fmeasure": 0.00214003217049867, "rouge1_fmeasure_stderr": 0.0006281208747948269, "rouge1_precision": 0.0023814022915898026, "rouge1_precision_stderr": 0.0007101439263166566, "rouge1_recall": 0.0020217833554038362, "rouge1_recall_stderr": 0.0005950726577170537, "rouge2_fmeasure": 7.982297862229252e-05, "rouge2_fmeasure_stderr": 5.643442046339631e-05, "rouge2_precision": 9.874063374559899e-05, "rouge2_precision_stderr": 7.004654141515487e-05, "rouge2_recall": 6.729119936667107e-05, "rouge2_recall_stderr": 4.7570795295869234e-05, "rougeL_fmeasure": 0.0015217948267156234, "rougeL_fmeasure_stderr": 0.0004371612073436473, "rougeL_precision": 0.0017095297689958592, "rougeL_precision_stderr": 0.0005079974336227376, "rougeL_recall": 0.0014314059035737939, "rougeL_recall_stderr": 0.0004049546110440357, "rougeLsum_fmeasure": 0.001626133579867523, "rougeLsum_fmeasure_stderr": 0.00046047748478715914, "rougeLsum_precision": 0.0018171542939344795, "rougeLsum_precision_stderr": 0.0005294219136072831, "rougeLsum_recall": 0.0015326542359542015, "rougeLsum_recall_stderr": 0.0004286235600707048}}}}
4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.335,0.014933117490932573,0
3
+ anli_r2,acc,0.327,0.014842213153411245,0
4
+ anli_r3,acc,0.32916666666666666,0.01357080625843362,0
5
+ arc_challenge,acc,0.23378839590443687,0.012368225378507139,0
6
+ arc_challenge,acc_norm,0.257679180887372,0.012780770562768409,0
7
+ arc_easy,acc,0.5467171717171717,0.010214901516731618,0
8
+ arc_easy,acc_norm,0.48358585858585856,0.010254253565929301,0
9
+ boolq,acc,0.5948012232415902,0.008586427929715526,1
10
+ cb,acc,0.32142857142857145,0.06297362289056342,1
11
+ cb,f1,0.16901408450704228,,1
12
+ copa,acc,0.74,0.04408440022768077,0
13
+ hellaswag,acc,0.39613622784305913,0.004880937933163286,0
14
+ hellaswag,acc_norm,0.5023899621589325,0.0049897244086645216,0
15
+ piqa,acc,0.7132752992383025,0.010551314503108063,0
16
+ piqa,acc_norm,0.7154515778019587,0.010527218464130635,0
17
+ rte,acc,0.5306859205776173,0.03003973059219781,0
18
+ sciq,acc,0.843,0.011510146979230194,0
19
+ sciq,acc_norm,0.729,0.014062601350986186,0
20
+ storycloze_2016,acc,0.6648850881881346,0.010915644164980039,0
21
+ winogrande,acc,0.5382794001578532,0.014011242594964118,0
4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.327,0.014842213153411244,0
3
+ anli_r2,acc,0.313,0.014671272822977892,0
4
+ anli_r3,acc,0.345,0.013728421539454876,0
5
+ arc_challenge,acc,0.24658703071672355,0.012595726268790122,0
6
+ arc_challenge,acc_norm,0.26023890784982934,0.01282193022511256,0
7
+ arc_easy,acc,0.5681818181818182,0.010163945352271723,0
8
+ arc_easy,acc_norm,0.5349326599326599,0.010234713052723674,0
9
+ boolq,acc,0.581039755351682,0.008629425249245244,1
10
+ cb,acc,0.5535714285714286,0.06703189227942397,1
11
+ cb,f1,0.3862470862470862,,1
12
+ copa,acc,0.72,0.04512608598542127,0
13
+ hellaswag,acc,0.39494124676359293,0.004878390226591719,0
14
+ hellaswag,acc_norm,0.5043815972913762,0.004989589816180231,0
15
+ piqa,acc,0.7165397170837867,0.010515057791152058,0
16
+ piqa,acc_norm,0.7138193688792165,0.010545318576106659,0
17
+ rte,acc,0.5740072202166066,0.02976495674177765,0
18
+ sciq,acc,0.893,0.00977991035984717,0
19
+ sciq,acc_norm,0.881,0.010244215145336666,0
20
+ storycloze_2016,acc,0.6632816675574559,0.010928525619392455,0
21
+ winogrande,acc,0.5430149960536701,0.01400038676159829,0
4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.331,0.014888272588203947,0
3
+ anli_r2,acc,0.34,0.014987482264363935,0
4
+ anli_r3,acc,0.3308333333333333,0.013588208070709002,0
5
+ arc_challenge,acc,0.25170648464163825,0.01268249633404296,0
6
+ arc_challenge,acc_norm,0.2713310580204778,0.012993807727545794,0
7
+ arc_easy,acc,0.5782828282828283,0.01013325528401233,0
8
+ arc_easy,acc_norm,0.561026936026936,0.010183076012972057,0
9
+ boolq,acc,0.5743119266055046,0.008647930658219415,1
10
+ cb,acc,0.5535714285714286,0.06703189227942395,1
11
+ cb,f1,0.3899371069182391,,1
12
+ copa,acc,0.71,0.045604802157206845,0
13
+ hellaswag,acc,0.39384584744074885,0.004876028037941942,0
14
+ hellaswag,acc_norm,0.5052778331009758,0.004989503417767287,0
15
+ piqa,acc,0.7181719260065288,0.01049667523125817,0
16
+ piqa,acc_norm,0.7290533188248096,0.010369718937426844,0
17
+ rte,acc,0.5379061371841155,0.03000984891252912,0
18
+ sciq,acc,0.902,0.009406619184621249,0
19
+ sciq,acc_norm,0.902,0.009406619184621228,0
20
+ storycloze_2016,acc,0.6509887760555852,0.011022640519108541,0
21
+ winogrande,acc,0.5374901341752171,0.014012928183336573,0
4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.31,0.014632638658632896,0
3
+ anli_r2,acc,0.323,0.014794927843348635,0
4
+ anli_r3,acc,0.31916666666666665,0.013462309712005129,0
5
+ arc_challenge,acc,0.25170648464163825,0.012682496334042961,0
6
+ arc_challenge,acc_norm,0.26706484641638223,0.01292893319649635,0
7
+ arc_easy,acc,0.5728114478114478,0.010150415974210868,0
8
+ arc_easy,acc_norm,0.563973063973064,0.010175459582759732,0
9
+ boolq,acc,0.5685015290519878,0.008662594569027307,1
10
+ cb,acc,0.5714285714285714,0.06672848092813058,1
11
+ cb,f1,0.399067599067599,,1
12
+ copa,acc,0.74,0.04408440022768079,0
13
+ hellaswag,acc,0.39533957379008167,0.00487924284847347,0
14
+ hellaswag,acc_norm,0.510655247958574,0.004988648260010036,0
15
+ piqa,acc,0.7219804134929271,0.010453117358332799,0
16
+ piqa,acc_norm,0.720892274211099,0.010465657948498231,0
17
+ rte,acc,0.5487364620938628,0.02995314924180894,0
18
+ sciq,acc,0.906,0.009233052000787743,0
19
+ sciq,acc_norm,0.897,0.009616833339695796,0
20
+ storycloze_2016,acc,0.6622127204703367,0.01093703499100388,0
21
+ winogrande,acc,0.5509076558800315,0.013979459389140846,0
4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.342,0.015008706182121733,0
3
+ anli_r2,acc,0.327,0.014842213153411245,0
4
+ anli_r3,acc,0.32166666666666666,0.013490095282989521,0
5
+ arc_challenge,acc,0.23464163822525597,0.012383873560768678,0
6
+ arc_challenge,acc_norm,0.2696245733788396,0.012968040686869148,0
7
+ arc_easy,acc,0.577020202020202,0.010137328382209097,0
8
+ arc_easy,acc_norm,0.5664983164983165,0.010168640625454101,0
9
+ boolq,acc,0.5623853211009174,0.008676717715731629,1
10
+ cb,acc,0.48214285714285715,0.0673769750864465,1
11
+ cb,f1,0.3335679099225897,,1
12
+ copa,acc,0.71,0.04560480215720684,0
13
+ hellaswag,acc,0.39623580959968135,0.004881148866874186,0
14
+ hellaswag,acc_norm,0.5078669587731528,0.004989163747650759,0
15
+ piqa,acc,0.719804134929271,0.01047812201557708,0
16
+ piqa,acc_norm,0.7181719260065288,0.010496675231258152,0
17
+ rte,acc,0.5054151624548736,0.030094698123239966,0
18
+ sciq,acc,0.912,0.008963053962592065,0
19
+ sciq,acc_norm,0.903,0.009363689373248116,0
20
+ storycloze_2016,acc,0.65793693212186,0.01097044950238848,0
21
+ winogrande,acc,0.5453827940015785,0.013994481027065997,0
4b284b84b20c4py/evaluation/rankeval/4b284b84b20c4py_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.332,0.014899597242811476,0
3
+ anli_r2,acc,0.302,0.014526080235459557,0
4
+ anli_r3,acc,0.3516666666666667,0.013789711695404801,0
5
+ arc_challenge,acc,0.24829351535836178,0.012624912868089758,0
6
+ arc_challenge,acc_norm,0.26535836177474403,0.012902554762313967,0
7
+ arc_easy,acc,0.5791245791245792,0.01013050216406634,0
8
+ arc_easy,acc_norm,0.5732323232323232,0.010149141043955631,0
9
+ boolq,acc,0.5663608562691131,0.00866769046434468,1
10
+ cb,acc,0.4107142857142857,0.0663363415035954,1
11
+ cb,f1,0.28648018648018647,,1
12
+ copa,acc,0.74,0.04408440022768079,0
13
+ hellaswag,acc,0.3934475204142601,0.004875162699121656,0
14
+ hellaswag,acc_norm,0.5103565026887075,0.004988710917169336,0
15
+ piqa,acc,0.7219804134929271,0.010453117358332811,0
16
+ piqa,acc_norm,0.7230685527747551,0.010440499969334554,0
17
+ rte,acc,0.555956678700361,0.029907396333795994,0
18
+ sciq,acc,0.918,0.008680515615523706,0
19
+ sciq,acc_norm,0.913,0.0089168666307459,0
20
+ storycloze_2016,acc,0.6691608765366115,0.010880601338204657,0
21
+ winogrande,acc,0.5382794001578532,0.014011242594964122,0
4b284b84b30c4py/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.014492033405756072
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.014492033405756072
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.25522634040246184
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.25522634040246184
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2838818030589508
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2838818030589508
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.29715507735580454
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.29715507735580454
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.3006625146633337
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.3006625146633337
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.30151673174642446
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.30151673174642446
14
+ e2e_nlg_cleaned,5,average,multiple,0.24215575010545523
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04553378038807564
16
+ gem_xsum,0,median,rouge2_fmeasure,0.04553378038807564
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03643482767652127
18
+ gem_xsum,1,median,rouge2_fmeasure,0.03643482767652127
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.045363332570277484
20
+ gem_xsum,2,median,rouge2_fmeasure,0.045363332570277484
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04908388537481377
22
+ gem_xsum,3,median,rouge2_fmeasure,0.04908388537481377
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.012513607373749262
24
+ gem_xsum,4,median,rouge2_fmeasure,0.012513607373749262
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0005571287960967399
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0005571287960967399
27
+ gem_xsum,5,average,multiple,0.03158109369658903
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05160547340597557
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05160547340597557
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.10122848201026645
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.10122848201026645
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.12130233036408532
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.12130233036408532
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.12283930010286082
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.12283930010286082
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.12839815254113054
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.12839815254113054
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.1374442953526929
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.1374442953526929
40
+ web_nlg_en,5,average,multiple,0.1104696722961686
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04097769287043771
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.04097769287043771
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.061540538934318864
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.061540538934318864
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06961044165268082
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.06961044165268082
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.057306430176183934
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.057306430176183934
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01928210787933178
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.01928210787933178
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0032920016871554447
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0032920016871554447
53
+ wiki_lingua_en,5,average,multiple,0.042001535533351425
4b284b84b30c4py/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.5291030476124639, "bleu_stderr": 0.04197202611227597, "rouge1_fmeasure": 0.11611625065768792, "rouge1_fmeasure_stderr": 0.0020496853364554164, "rouge1_precision": 0.0755669969216671, "rouge1_precision_stderr": 0.0016066780990848701, "rouge1_recall": 0.3614964445579484, "rouge1_recall_stderr": 0.004796800609548473, "rouge2_fmeasure": 0.05160547340597557, "rouge2_fmeasure_stderr": 0.001305606139483706, "rouge2_precision": 0.03361637012774977, "rouge2_precision_stderr": 0.0009415823767637188, "rouge2_recall": 0.15950569466775388, "rouge2_recall_stderr": 0.0035094728487810923, "rougeL_fmeasure": 0.10824255638171551, "rougeL_fmeasure_stderr": 0.0018389871804153166, "rougeL_precision": 0.07020947284959439, "rougeL_precision_stderr": 0.00142789274706649, "rougeL_recall": 0.34081068773606127, "rougeL_recall_stderr": 0.004546933483026844, "rougeLsum_fmeasure": 0.10717711946390289, "rougeLsum_fmeasure_stderr": 0.0019225600139694417, "rougeLsum_precision": 0.06994815818201304, "rougeLsum_precision_stderr": 0.0015093631600602148, "rougeLsum_recall": 0.3319720321322399, "rougeLsum_recall_stderr": 0.004483327202743954}}, "1": {"PALM_prompt": {"bleu": 0.816008297767922, "bleu_stderr": 0.039578647327447286, "rouge1_fmeasure": 0.18847439618402229, "rouge1_fmeasure_stderr": 0.0043551864228643065, "rouge1_precision": 0.17351010848243043, "rouge1_precision_stderr": 0.005277286960823768, "rouge1_recall": 0.33501513478045125, "rouge1_recall_stderr": 0.005255847690933239, "rouge2_fmeasure": 0.10122848201026645, "rouge2_fmeasure_stderr": 0.0031094307144283895, "rouge2_precision": 0.09688460500952906, "rouge2_precision_stderr": 0.0038808402094324644, "rouge2_recall": 0.17930986784245564, "rouge2_recall_stderr": 0.003859146237410314, "rougeL_fmeasure": 0.16839433021909359, "rougeL_fmeasure_stderr": 0.0037253893635597144, "rougeL_precision": 0.15406283102476945, "rougeL_precision_stderr": 0.004666377429425112, "rougeL_recall": 0.30863039329762465, "rougeL_recall_stderr": 0.004749839903833515, "rougeLsum_fmeasure": 0.17289872216441404, "rougeLsum_fmeasure_stderr": 0.00383815524261692, "rougeLsum_precision": 0.15883805092079975, "rougeLsum_precision_stderr": 0.0047963831306514874, "rougeLsum_recall": 0.3136844044151392, "rougeLsum_recall_stderr": 0.004814998633951047}}, "2": {"PALM_prompt": {"bleu": 1.0874517105392074, "bleu_stderr": 0.05517474807219761, "rouge1_fmeasure": 0.22045252325070877, "rouge1_fmeasure_stderr": 0.0047715720871285994, "rouge1_precision": 0.20702377668412475, "rouge1_precision_stderr": 0.005856753433793659, "rouge1_recall": 0.3701850962094367, "rouge1_recall_stderr": 0.005118554273628172, "rouge2_fmeasure": 0.12130233036408532, "rouge2_fmeasure_stderr": 0.0033904789798544607, "rouge2_precision": 0.11745995038629221, "rouge2_precision_stderr": 0.0040972793894429645, "rouge2_recall": 0.20377481545578427, "rouge2_recall_stderr": 0.003977928105521273, "rougeL_fmeasure": 0.1943548401050752, "rougeL_fmeasure_stderr": 0.004059123307343083, "rougeL_precision": 0.18043706405573443, "rougeL_precision_stderr": 0.00504546321013034, "rougeL_recall": 0.3378526690077897, "rougeL_recall_stderr": 0.00459675285758023, "rougeLsum_fmeasure": 0.20082266435470564, "rougeLsum_fmeasure_stderr": 0.004224985170643138, "rougeLsum_precision": 0.18718784128186539, "rougeLsum_precision_stderr": 0.005238053935576105, "rougeLsum_recall": 0.34555708735479357, "rougeLsum_recall_stderr": 0.004721420537792257}}, "3": {"PALM_prompt": {"bleu": 1.1547949878110682, "bleu_stderr": 0.038315715594886474, "rouge1_fmeasure": 0.22251781407136842, "rouge1_fmeasure_stderr": 0.004747359850504829, "rouge1_precision": 0.2088312317468897, "rouge1_precision_stderr": 0.005867492598257474, "rouge1_recall": 0.37575969333224407, "rouge1_recall_stderr": 0.00497930483888963, "rouge2_fmeasure": 0.12283930010286082, "rouge2_fmeasure_stderr": 0.003325035383391392, "rouge2_precision": 0.11863907378151228, "rouge2_precision_stderr": 0.004058682037048737, "rouge2_recall": 0.2076798877974614, "rouge2_recall_stderr": 0.0039057748486226894, "rougeL_fmeasure": 0.194637506433406, "rougeL_fmeasure_stderr": 0.003931160067615202, "rougeL_precision": 0.18047631320315732, "rougeL_precision_stderr": 0.004955177644287909, "rougeL_recall": 0.34161179153114873, "rougeL_recall_stderr": 0.004411502210112619, "rougeLsum_fmeasure": 0.2020774724552748, "rougeLsum_fmeasure_stderr": 0.004153005153776811, "rougeLsum_precision": 0.18889486408271014, "rougeLsum_precision_stderr": 0.005251166286157393, "rougeLsum_recall": 0.35004448006634664, "rougeLsum_recall_stderr": 0.004548085604415196}}, "4": {"PALM_prompt": {"bleu": 1.2831840974421462, "bleu_stderr": 0.05033217844028594, "rouge1_fmeasure": 0.23228788333311123, "rouge1_fmeasure_stderr": 0.0047650615896163255, "rouge1_precision": 0.21509090184797933, "rouge1_precision_stderr": 0.005786012243464188, "rouge1_recall": 0.39075900091373755, "rouge1_recall_stderr": 0.004979444479868987, "rouge2_fmeasure": 0.12839815254113054, "rouge2_fmeasure_stderr": 0.003362792758070131, "rouge2_precision": 0.12174140590353577, "rouge2_precision_stderr": 0.004035409467808737, "rouge2_recall": 0.21738007732858777, "rouge2_recall_stderr": 0.003985610646127817, "rougeL_fmeasure": 0.2043246097878992, "rougeL_fmeasure_stderr": 0.004044808374925389, "rougeL_precision": 0.18709030645359226, "rougeL_precision_stderr": 0.004962036698521558, "rougeL_recall": 0.3551992661063061, "rougeL_recall_stderr": 0.004460630587408427, "rougeLsum_fmeasure": 0.21270006018010992, "rougeLsum_fmeasure_stderr": 0.004260014275831547, "rougeLsum_precision": 0.19672589534364512, "rougeLsum_precision_stderr": 0.005274152172955492, "rougeLsum_recall": 0.3644513652080519, "rougeLsum_recall_stderr": 0.00457749232292948}}, "5": {"PALM_prompt": {"bleu": 1.4005205832982575, "bleu_stderr": 0.06926000215952662, "rouge1_fmeasure": 0.24446201038709578, "rouge1_fmeasure_stderr": 0.005079320217162989, "rouge1_precision": 0.23464661799359093, "rouge1_precision_stderr": 0.006350773037843486, "rouge1_recall": 0.39377150276979583, "rouge1_recall_stderr": 0.005090363960528904, "rouge2_fmeasure": 0.1374442953526929, "rouge2_fmeasure_stderr": 0.003667286101436062, "rouge2_precision": 0.13649941034214283, "rouge2_precision_stderr": 0.004560065406271386, "rouge2_recall": 0.21962108843622127, "rouge2_recall_stderr": 0.003997729138604017, "rougeL_fmeasure": 0.21227050148808532, "rougeL_fmeasure_stderr": 0.0042533854910336635, "rougeL_precision": 0.20150575544209304, "rougeL_precision_stderr": 0.005436660437994448, "rougeL_recall": 0.35477162089877506, "rougeL_recall_stderr": 0.004463834844360038, "rougeLsum_fmeasure": 0.22200694795225623, "rougeLsum_fmeasure_stderr": 0.0044956653991661, "rougeLsum_precision": 0.21209426618977525, "rougeLsum_precision_stderr": 0.005733038228033396, "rougeLsum_recall": 0.3656630384895249, "rougeLsum_recall_stderr": 0.004619338322114935}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.930658245267006, "bleu_stderr": 0.05697492480316102, "rouge1_fmeasure": 0.19386940954770898, "rouge1_fmeasure_stderr": 0.001994698102106922, "rouge1_precision": 0.1727914576871466, "rouge1_precision_stderr": 0.0021351110231495087, "rouge1_recall": 0.2660682916333499, "rouge1_recall_stderr": 0.0026849446428291006, "rouge2_fmeasure": 0.04097769287043771, "rouge2_fmeasure_stderr": 0.0009426940598246375, "rouge2_precision": 0.03624809666333132, "rouge2_precision_stderr": 0.0008715161901282981, "rouge2_recall": 0.05744373910553745, "rouge2_recall_stderr": 0.0014157419279156214, "rougeL_fmeasure": 0.145750867603629, "rougeL_fmeasure_stderr": 0.0014228565663962664, "rougeL_precision": 0.1282702742126295, "rougeL_precision_stderr": 0.001477817398891479, "rougeL_recall": 0.20533525377010228, "rougeL_recall_stderr": 0.0021674082092200597, "rougeLsum_fmeasure": 0.17946061827488988, "rougeLsum_fmeasure_stderr": 0.001854650356945841, "rougeLsum_precision": 0.1596507235495731, "rougeLsum_precision_stderr": 0.001971335655776246, "rougeLsum_recall": 0.24708864548023515, "rougeLsum_recall_stderr": 0.0025296879150935704}}, "1": {"tldr_en": {"bleu": 2.6170047685197764, "bleu_stderr": 0.10132165559128563, "rouge1_fmeasure": 0.20977393883361076, "rouge1_fmeasure_stderr": 0.0022254766571720066, "rouge1_precision": 0.38449460728577606, "rouge1_precision_stderr": 0.0047054324032565625, "rouge1_recall": 0.19207344290265743, "rouge1_recall_stderr": 0.002629170272910203, "rouge2_fmeasure": 0.061540538934318864, "rouge2_fmeasure_stderr": 0.001442138550854557, "rouge2_precision": 0.12783991313824408, "rouge2_precision_stderr": 0.00341985688794511, "rouge2_recall": 0.05465738420487717, "rouge2_recall_stderr": 0.001433272452461301, "rougeL_fmeasure": 0.16530272812735328, "rougeL_fmeasure_stderr": 0.0017882125885449309, "rougeL_precision": 0.3124230043998015, "rougeL_precision_stderr": 0.004177882648440945, "rougeL_recall": 0.15029481037618114, "rougeL_recall_stderr": 0.002078055484599921, "rougeLsum_fmeasure": 0.1977113639485601, "rougeLsum_fmeasure_stderr": 0.002099547935718861, "rougeLsum_precision": 0.3650530159295627, "rougeLsum_precision_stderr": 0.0045705042050303395, "rougeLsum_recall": 0.1806802008213447, "rougeLsum_recall_stderr": 0.002463525306412132}}, "2": {"tldr_en": {"bleu": 3.083033051206459, "bleu_stderr": 0.05582952322326019, "rouge1_fmeasure": 0.22729532838677682, "rouge1_fmeasure_stderr": 0.002278573747439395, "rouge1_precision": 0.4032655656845897, "rouge1_precision_stderr": 0.004479627558844031, "rouge1_recall": 0.2049382529474769, "rouge1_recall_stderr": 0.002681669637287718, "rouge2_fmeasure": 0.06961044165268082, "rouge2_fmeasure_stderr": 0.0015202570622053406, "rouge2_precision": 0.1351671605771911, "rouge2_precision_stderr": 0.0032754902404022617, "rouge2_recall": 0.061272791853854175, "rouge2_recall_stderr": 0.0014820501363554317, "rougeL_fmeasure": 0.18038905985376455, "rougeL_fmeasure_stderr": 0.0018570287059750698, "rougeL_precision": 0.3282332800418989, "rougeL_precision_stderr": 0.004005725965978507, "rougeL_recall": 0.16159432180284913, "rougeL_recall_stderr": 0.0021344435988843486, "rougeLsum_fmeasure": 0.21569150736989062, "rougeLsum_fmeasure_stderr": 0.0021865283875637733, "rougeLsum_precision": 0.38453726169616403, "rougeLsum_precision_stderr": 0.004365802550533319, "rougeLsum_recall": 0.19401936112628262, "rougeLsum_recall_stderr": 0.0025374156051523327}}, "3": {"tldr_en": {"bleu": 2.093173740089239, "bleu_stderr": 0.04804462336336302, "rouge1_fmeasure": 0.1884840420361697, "rouge1_fmeasure_stderr": 0.0025860544825958565, "rouge1_precision": 0.3306053825100859, "rouge1_precision_stderr": 0.004859303428309578, "rouge1_recall": 0.16989164386666203, "rouge1_recall_stderr": 0.0028394105099450286, "rouge2_fmeasure": 0.057306430176183934, "rouge2_fmeasure_stderr": 0.0014618861427840235, "rouge2_precision": 0.11003967873460772, "rouge2_precision_stderr": 0.0031464698650606132, "rouge2_recall": 0.05095629186962663, "rouge2_recall_stderr": 0.0014425612991473366, "rougeL_fmeasure": 0.1495904617320012, "rougeL_fmeasure_stderr": 0.0020700698299425295, "rougeL_precision": 0.26893516543775503, "rougeL_precision_stderr": 0.0041976461470786845, "rougeL_recall": 0.13418506396515523, "rougeL_recall_stderr": 0.0022587351419021454, "rougeLsum_fmeasure": 0.17876376026134483, "rougeLsum_fmeasure_stderr": 0.002457444582263677, "rougeLsum_precision": 0.3153769862337482, "rougeLsum_precision_stderr": 0.00469618155894323, "rougeLsum_recall": 0.16064683373240374, "rougeLsum_recall_stderr": 0.002677404108882927}}, "4": {"tldr_en": {"bleu": 0.01342591715001171, "bleu_stderr": 0.004001407452183195, "rouge1_fmeasure": 0.06190312721435752, "rouge1_fmeasure_stderr": 0.0022453212014093074, "rouge1_precision": 0.11056363728035501, "rouge1_precision_stderr": 0.004078087294566337, "rouge1_recall": 0.05565752667360576, "rouge1_recall_stderr": 0.002228196068646721, "rouge2_fmeasure": 0.01928210787933178, "rouge2_fmeasure_stderr": 0.0010535821351118106, "rouge2_precision": 0.03823358445187834, "rouge2_precision_stderr": 0.002265586123627705, "rouge2_recall": 0.017029256146607465, "rouge2_recall_stderr": 0.001010805384777592, "rougeL_fmeasure": 0.05032197921505991, "rougeL_fmeasure_stderr": 0.0018381156651678721, "rougeL_precision": 0.09199293132037738, "rougeL_precision_stderr": 0.0035027299521763655, "rougeL_recall": 0.045072079383827686, "rougeL_recall_stderr": 0.0018206845822862608, "rougeLsum_fmeasure": 0.058416404243415425, "rougeLsum_fmeasure_stderr": 0.0021202689317476086, "rougeLsum_precision": 0.1054403051266621, "rougeLsum_precision_stderr": 0.003925170312170128, "rougeLsum_recall": 0.05235998421489741, "rougeLsum_recall_stderr": 0.0020961513213487155}}, "5": {"tldr_en": {"bleu": 1.296861751779121e-20, "bleu_stderr": 3.6442754943686004e-18, "rouge1_fmeasure": 0.009967132608929053, "rouge1_fmeasure_stderr": 0.0010055731539876701, "rouge1_precision": 0.018922785858028892, "rouge1_precision_stderr": 0.0019403990469699873, "rouge1_recall": 0.008975587262162682, "rouge1_recall_stderr": 0.0009872147516307268, "rouge2_fmeasure": 0.0032920016871554447, "rouge2_fmeasure_stderr": 0.0004596506927749954, "rouge2_precision": 0.006846984878698437, "rouge2_precision_stderr": 0.0010455084784967584, "rouge2_recall": 0.0029280528226017476, "rouge2_recall_stderr": 0.0004213012126424815, "rougeL_fmeasure": 0.008284461998593133, "rougeL_fmeasure_stderr": 0.0008347213758083215, "rougeL_precision": 0.01640793896701804, "rougeL_precision_stderr": 0.0017460251598696908, "rougeL_recall": 0.00741137813780437, "rougeL_recall_stderr": 0.0008080417855090419, "rougeLsum_fmeasure": 0.009575391795850074, "rougeLsum_fmeasure_stderr": 0.0009634559746070253, "rougeLsum_precision": 0.018392291454882706, "rougeLsum_precision_stderr": 0.0018972624951893056, "rougeLsum_recall": 0.008584757938961327, "rougeLsum_recall_stderr": 0.0009377330143675642}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.5875021094029641, "bleu_stderr": 0.03276340732837118, "rouge1_fmeasure": 0.13542593032443753, "rouge1_fmeasure_stderr": 0.0013818435725228544, "rouge1_precision": 0.1462332519069936, "rouge1_precision_stderr": 0.001201997877137717, "rouge1_recall": 0.13608182285447765, "rouge1_recall_stderr": 0.0018538120469736344, "rouge2_fmeasure": 0.014492033405756072, "rouge2_fmeasure_stderr": 0.0006689542384395832, "rouge2_precision": 0.013499709926694298, "rouge2_precision_stderr": 0.000598525698558085, "rouge2_recall": 0.01714105346982889, "rouge2_recall_stderr": 0.0008424236847666467, "rougeL_fmeasure": 0.11332549778567229, "rougeL_fmeasure_stderr": 0.0011123716447378896, "rougeL_precision": 0.12301116965343654, "rougeL_precision_stderr": 0.0009905504724211118, "rougeL_recall": 0.1134065162008498, "rougeL_recall_stderr": 0.0015013680445233953, "rougeLsum_fmeasure": 0.12253149321767402, "rougeLsum_fmeasure_stderr": 0.001312416630786411, "rougeLsum_precision": 0.13229355571816984, "rougeLsum_precision_stderr": 0.0011610994075002482, "rougeLsum_recall": 0.12319740065146932, "rougeLsum_recall_stderr": 0.0017345128865693803}}, "1": {"generate_text_restaurant": {"bleu": 14.232244328535277, "bleu_stderr": 0.23587107720067682, "rouge1_fmeasure": 0.5201864241755413, "rouge1_fmeasure_stderr": 0.002414364040627501, "rouge1_precision": 0.6148712325769302, "rouge1_precision_stderr": 0.0031045086947765403, "rouge1_recall": 0.4893924345662815, "rouge1_recall_stderr": 0.0031311058945645894, "rouge2_fmeasure": 0.25522634040246184, "rouge2_fmeasure_stderr": 0.002201786658646484, "rouge2_precision": 0.3047884290492825, "rouge2_precision_stderr": 0.0027480747372428324, "rouge2_recall": 0.24006394337233386, "rouge2_recall_stderr": 0.0023451326400918348, "rougeL_fmeasure": 0.36904290688831, "rougeL_fmeasure_stderr": 0.002220539945315262, "rougeL_precision": 0.439249556651525, "rougeL_precision_stderr": 0.0029626289987940402, "rougeL_recall": 0.3461182837453476, "rougeL_recall_stderr": 0.00257046135656233, "rougeLsum_fmeasure": 0.42278608800790923, "rougeLsum_fmeasure_stderr": 0.0024933744914271616, "rougeLsum_precision": 0.5001725889202292, "rougeLsum_precision_stderr": 0.0031476221294652043, "rougeLsum_recall": 0.39773640972924135, "rougeLsum_recall_stderr": 0.002946671345321774}}, "2": {"generate_text_restaurant": {"bleu": 16.887275962764303, "bleu_stderr": 0.2320462281621436, "rouge1_fmeasure": 0.5564976544403136, "rouge1_fmeasure_stderr": 0.0022595472107108713, "rouge1_precision": 0.6278939181678378, "rouge1_precision_stderr": 0.0029056056968577437, "rouge1_recall": 0.5306697903503012, "rouge1_recall_stderr": 0.0029063038898311674, "rouge2_fmeasure": 0.2838818030589508, "rouge2_fmeasure_stderr": 0.002242536844423417, "rouge2_precision": 0.3222139474860908, "rouge2_precision_stderr": 0.0026789978742530595, "rouge2_recall": 0.27087358968954384, "rouge2_recall_stderr": 0.0023919284150648304, "rougeL_fmeasure": 0.3971723386962624, "rougeL_fmeasure_stderr": 0.0022430653669999988, "rougeL_precision": 0.44917260724498614, "rougeL_precision_stderr": 0.0028148875164517367, "rougeL_recall": 0.37819866335501845, "rougeL_recall_stderr": 0.0025322793268973353, "rougeLsum_fmeasure": 0.4598398358662466, "rougeLsum_fmeasure_stderr": 0.002440212683541689, "rougeLsum_precision": 0.5185526365601005, "rougeLsum_precision_stderr": 0.0029868320619852976, "rougeLsum_recall": 0.43845723389636404, "rougeLsum_recall_stderr": 0.0028309828915384487}}, "3": {"generate_text_restaurant": {"bleu": 17.881132925926057, "bleu_stderr": 0.24331221464935227, "rouge1_fmeasure": 0.5693990418239955, "rouge1_fmeasure_stderr": 0.0022497883873787844, "rouge1_precision": 0.638558204425189, "rouge1_precision_stderr": 0.0028561731365242494, "rouge1_recall": 0.5417918907627629, "rouge1_recall_stderr": 0.002839412445787648, "rouge2_fmeasure": 0.29715507735580454, "rouge2_fmeasure_stderr": 0.002309950561543432, "rouge2_precision": 0.33507225639392146, "rouge2_precision_stderr": 0.002694881152132202, "rouge2_recall": 0.2827392626784024, "rouge2_recall_stderr": 0.0024512921892777743, "rougeL_fmeasure": 0.40768617762191184, "rougeL_fmeasure_stderr": 0.0022696184161343264, "rougeL_precision": 0.45834114038135865, "rougeL_precision_stderr": 0.0028021806357125157, "rougeL_recall": 0.3873396760056063, "rougeL_recall_stderr": 0.0025309151699083555, "rougeLsum_fmeasure": 0.47242282187982376, "rougeLsum_fmeasure_stderr": 0.0024594738133128253, "rougeLsum_precision": 0.5295403903479906, "rougeLsum_precision_stderr": 0.002952223036550424, "rougeLsum_recall": 0.44954601881032297, "rougeLsum_recall_stderr": 0.00282635305347299}}, "4": {"generate_text_restaurant": {"bleu": 18.176178151784033, "bleu_stderr": 0.17173354880624206, "rouge1_fmeasure": 0.5737969337422074, "rouge1_fmeasure_stderr": 0.0022430034562036178, "rouge1_precision": 0.6428304979924027, "rouge1_precision_stderr": 0.002878289118202974, "rouge1_recall": 0.5453482063114422, "rouge1_recall_stderr": 0.0027939400417403394, "rouge2_fmeasure": 0.3006625146633337, "rouge2_fmeasure_stderr": 0.0023334096671335657, "rouge2_precision": 0.33872467665379435, "rouge2_precision_stderr": 0.0027583524448216616, "rouge2_recall": 0.2856920671903507, "rouge2_recall_stderr": 0.002448195021463064, "rougeL_fmeasure": 0.4114892561640318, "rougeL_fmeasure_stderr": 0.002298706971006199, "rougeL_precision": 0.46182553147220984, "rougeL_precision_stderr": 0.0028240491213298363, "rougeL_recall": 0.3906784885545622, "rougeL_recall_stderr": 0.0025434310940566462, "rougeLsum_fmeasure": 0.4779412391272871, "rougeLsum_fmeasure_stderr": 0.0024862148673544127, "rougeLsum_precision": 0.5350589809511, "rougeLsum_precision_stderr": 0.0029986519326935216, "rougeLsum_recall": 0.45428916038276906, "rougeLsum_recall_stderr": 0.002814865590330368}}, "5": {"generate_text_restaurant": {"bleu": 17.78551604265496, "bleu_stderr": 0.2635749470688093, "rouge1_fmeasure": 0.5738131668848636, "rouge1_fmeasure_stderr": 0.0022481583216242256, "rouge1_precision": 0.649311924377427, "rouge1_precision_stderr": 0.0028669560542196265, "rouge1_recall": 0.5405285575978017, "rouge1_recall_stderr": 0.0028004073492855905, "rouge2_fmeasure": 0.30151673174642446, "rouge2_fmeasure_stderr": 0.0023279548457261537, "rouge2_precision": 0.34320855954861257, "rouge2_precision_stderr": 0.002720830070622915, "rouge2_recall": 0.28394139725033024, "rouge2_recall_stderr": 0.0024538141734712074, "rougeL_fmeasure": 0.41231320413610334, "rougeL_fmeasure_stderr": 0.002294726560339896, "rougeL_precision": 0.4668510985570315, "rougeL_precision_stderr": 0.002789241271717688, "rougeL_recall": 0.3883916681664931, "rougeL_recall_stderr": 0.002560663785520964, "rougeLsum_fmeasure": 0.4768250136491845, "rougeLsum_fmeasure_stderr": 0.0024666489811737786, "rougeLsum_precision": 0.5391824276449253, "rougeLsum_precision_stderr": 0.002973787805282262, "rougeLsum_recall": 0.44931643081079575, "rougeLsum_recall_stderr": 0.00280168000543349}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.682951622425175, "bleu_stderr": 0.0851603484366571, "rouge1_fmeasure": 0.2028122765212525, "rouge1_fmeasure_stderr": 0.002390883517791929, "rouge1_precision": 0.1465393283772569, "rouge1_precision_stderr": 0.0018440142786674523, "rouge1_recall": 0.34852125207926665, "rouge1_recall_stderr": 0.004176582428129474, "rouge2_fmeasure": 0.04553378038807564, "rouge2_fmeasure_stderr": 0.0014232037651747472, "rouge2_precision": 0.03237190436886122, "rouge2_precision_stderr": 0.0010186372979639458, "rouge2_recall": 0.08155513999057418, "rouge2_recall_stderr": 0.002654299050421707, "rougeL_fmeasure": 0.15173815616109243, "rougeL_fmeasure_stderr": 0.0017484592331627647, "rougeL_precision": 0.10927435729309919, "rougeL_precision_stderr": 0.0013232281161521923, "rougeL_recall": 0.2630588346762938, "rougeL_recall_stderr": 0.0032775160998989995, "rougeLsum_fmeasure": 0.15922778959106051, "rougeLsum_fmeasure_stderr": 0.0019919293929075467, "rougeLsum_precision": 0.11460229405876507, "rougeLsum_precision_stderr": 0.0014894111268745059, "rougeLsum_recall": 0.2761136449255996, "rougeLsum_recall_stderr": 0.0036855999708369533}}, "1": {"article_DOC_summary": {"bleu": 1.3944702564913742, "bleu_stderr": 0.1006715007044181, "rouge1_fmeasure": 0.18674323256729772, "rouge1_fmeasure_stderr": 0.0028962198010261913, "rouge1_precision": 0.15529855754434083, "rouge1_precision_stderr": 0.003197013485161652, "rouge1_recall": 0.28328408071830175, "rouge1_recall_stderr": 0.004095283443267801, "rouge2_fmeasure": 0.03643482767652127, "rouge2_fmeasure_stderr": 0.001615010422919914, "rouge2_precision": 0.031152103023872623, "rouge2_precision_stderr": 0.0016371933400431812, "rouge2_recall": 0.05472982360835645, "rouge2_recall_stderr": 0.0022833781923103074, "rougeL_fmeasure": 0.14255280668735085, "rougeL_fmeasure_stderr": 0.0021576392267215424, "rougeL_precision": 0.11843177633482352, "rougeL_precision_stderr": 0.002409552948308063, "rougeL_recall": 0.21763400321442317, "rougeL_recall_stderr": 0.003129499198247128, "rougeLsum_fmeasure": 0.1476471620111356, "rougeLsum_fmeasure_stderr": 0.002291024509660889, "rougeLsum_precision": 0.12213275130709968, "rougeLsum_precision_stderr": 0.002472334330381876, "rougeLsum_recall": 0.22627961659456908, "rougeLsum_recall_stderr": 0.0034141449111551652}}, "2": {"article_DOC_summary": {"bleu": 1.9222594192888955, "bleu_stderr": 0.14749409898511368, "rouge1_fmeasure": 0.21852367466812617, "rouge1_fmeasure_stderr": 0.0033445824150230403, "rouge1_precision": 0.21198884995411932, "rouge1_precision_stderr": 0.004014145881410469, "rouge1_recall": 0.26445123542071497, "rouge1_recall_stderr": 0.003817051442823084, "rouge2_fmeasure": 0.045363332570277484, "rouge2_fmeasure_stderr": 0.001984243048357356, "rouge2_precision": 0.04519757214182958, "rouge2_precision_stderr": 0.002124096891786233, "rouge2_recall": 0.05272699187995582, "rouge2_recall_stderr": 0.0021965613474949344, "rougeL_fmeasure": 0.16418461242259857, "rougeL_fmeasure_stderr": 0.002543511838527791, "rougeL_precision": 0.15886071081540448, "rougeL_precision_stderr": 0.003028374996503029, "rougeL_recall": 0.19968033748130481, "rougeL_recall_stderr": 0.0029024042977297817, "rougeLsum_fmeasure": 0.16683014810182795, "rougeLsum_fmeasure_stderr": 0.0025785714568837526, "rougeLsum_precision": 0.16082182630785, "rougeLsum_precision_stderr": 0.003033425743026839, "rougeLsum_recall": 0.20439672538270415, "rougeLsum_recall_stderr": 0.003106316702963481}}, "3": {"article_DOC_summary": {"bleu": 2.3051107454233657, "bleu_stderr": 0.13294402796846333, "rouge1_fmeasure": 0.22044754657110907, "rouge1_fmeasure_stderr": 0.0036816828414909354, "rouge1_precision": 0.22353929011189883, "rouge1_precision_stderr": 0.004419661484900206, "rouge1_recall": 0.25151365302811174, "rouge1_recall_stderr": 0.004022931331536087, "rouge2_fmeasure": 0.04908388537481377, "rouge2_fmeasure_stderr": 0.002101125106827758, "rouge2_precision": 0.05068583418207386, "rouge2_precision_stderr": 0.002274057043865072, "rouge2_recall": 0.05446707516646748, "rouge2_recall_stderr": 0.0022915422148587885, "rougeL_fmeasure": 0.16634527117743053, "rougeL_fmeasure_stderr": 0.002806196714917542, "rougeL_precision": 0.1683362752791849, "rougeL_precision_stderr": 0.0033749994789565216, "rougeL_recall": 0.19091666832782667, "rougeL_recall_stderr": 0.003108256435550969, "rougeLsum_fmeasure": 0.16814801555504616, "rougeLsum_fmeasure_stderr": 0.0028342059539106772, "rougeLsum_precision": 0.16982900423259714, "rougeLsum_precision_stderr": 0.0033786524957777875, "rougeLsum_recall": 0.1937368770717562, "rougeLsum_recall_stderr": 0.003240956961670264}}, "4": {"article_DOC_summary": {"bleu": 0.26083774647255553, "bleu_stderr": 0.049583832108113866, "rouge1_fmeasure": 0.05786293608582415, "rouge1_fmeasure_stderr": 0.0034026702106971125, "rouge1_precision": 0.06367196654851114, "rouge1_precision_stderr": 0.003962614910205744, "rouge1_recall": 0.06299530080475903, "rouge1_recall_stderr": 0.0037475427287808885, "rouge2_fmeasure": 0.012513607373749262, "rouge2_fmeasure_stderr": 0.001250609752118574, "rouge2_precision": 0.013635611245205721, "rouge2_precision_stderr": 0.0014792298855385423, "rouge2_recall": 0.01336383696893493, "rouge2_recall_stderr": 0.0012695510329812508, "rougeL_fmeasure": 0.04353546335037799, "rougeL_fmeasure_stderr": 0.0025802719961996994, "rougeL_precision": 0.048408895244731735, "rougeL_precision_stderr": 0.0030897746488708286, "rougeL_recall": 0.04747705562614682, "rougeL_recall_stderr": 0.0028345771577288153, "rougeLsum_fmeasure": 0.04414934016799974, "rougeLsum_fmeasure_stderr": 0.0026101400275025726, "rougeLsum_precision": 0.04896365056449781, "rougeLsum_precision_stderr": 0.0031126290923185673, "rougeLsum_recall": 0.04827223064379905, "rougeLsum_recall_stderr": 0.002881696104661706}}, "5": {"article_DOC_summary": {"bleu": 7.868752708746091e-44, "bleu_stderr": 1.2736983233832189e-41, "rouge1_fmeasure": 0.002843556933334533, "rouge1_fmeasure_stderr": 0.0007574576020821273, "rouge1_precision": 0.004566324612034057, "rouge1_precision_stderr": 0.0014730318250486575, "rouge1_recall": 0.002557303833094749, "rouge1_recall_stderr": 0.0006901355157863597, "rouge2_fmeasure": 0.0005571287960967399, "rouge2_fmeasure_stderr": 0.00022892269931771824, "rouge2_precision": 0.001489661496087105, "rouge2_precision_stderr": 0.0009052255446208827, "rouge2_recall": 0.00047034880275723784, "rouge2_recall_stderr": 0.0002063572366660285, "rougeL_fmeasure": 0.0020842072687093584, "rougeL_fmeasure_stderr": 0.0005511729003516569, "rougeL_precision": 0.003393761608318533, "rougeL_precision_stderr": 0.0011516288498803838, "rougeL_recall": 0.0018871215670564604, "rougeL_recall_stderr": 0.0005069615563777212, "rougeLsum_fmeasure": 0.0021808611389481343, "rougeLsum_fmeasure_stderr": 0.0005795738011449526, "rougeLsum_precision": 0.0034865464578064612, "rougeLsum_precision_stderr": 0.0011646228055732888, "rougeLsum_recall": 0.001988369899436868, "rougeLsum_recall_stderr": 0.0005399078494680753}}}}
4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.336,0.014944140233795023,0
3
+ anli_r2,acc,0.334,0.014922019523732963,0
4
+ anli_r3,acc,0.3325,0.013605417345710528,0
5
+ arc_challenge,acc,0.25597269624573377,0.012753013241244518,0
6
+ arc_challenge,acc_norm,0.27047781569965873,0.012980954547659554,0
7
+ arc_easy,acc,0.569023569023569,0.010161552863493755,0
8
+ arc_easy,acc_norm,0.5096801346801347,0.010257860554461127,0
9
+ boolq,acc,0.5886850152905199,0.008606395426309208,1
10
+ cb,acc,0.32142857142857145,0.06297362289056341,1
11
+ cb,f1,0.258974358974359,,1
12
+ copa,acc,0.71,0.04560480215720684,0
13
+ hellaswag,acc,0.42033459470225054,0.004926038197714533,0
14
+ hellaswag,acc_norm,0.5420235012945628,0.004972126523031945,0
15
+ piqa,acc,0.7328618063112078,0.010323440492612431,0
16
+ piqa,acc_norm,0.7383025027203483,0.01025563077270823,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.836,0.011715000693181326,0
19
+ sciq,acc_norm,0.74,0.013877773329774166,0
20
+ storycloze_2016,acc,0.6814537680384821,0.010774165229761351,0
21
+ winogrande,acc,0.5493291239147593,0.01398392886904024,0
4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.316,0.014709193056057135,0
3
+ anli_r2,acc,0.313,0.014671272822977892,0
4
+ anli_r3,acc,0.3416666666666667,0.013696658778002527,0
5
+ arc_challenge,acc,0.26535836177474403,0.012902554762313962,0
6
+ arc_challenge,acc_norm,0.2764505119453925,0.013069662474252425,0
7
+ arc_easy,acc,0.5837542087542088,0.010114819404500867,0
8
+ arc_easy,acc_norm,0.5622895622895623,0.010179856486006897,0
9
+ boolq,acc,0.5599388379204893,0.00868199149713359,1
10
+ cb,acc,0.4642857142857143,0.06724777654937658,1
11
+ cb,f1,0.2431168831168831,,1
12
+ copa,acc,0.67,0.047258156262526066,0
13
+ hellaswag,acc,0.42103166699860584,0.004927155882598188,0
14
+ hellaswag,acc_norm,0.5467038438558056,0.004967965810199984,0
15
+ piqa,acc,0.7181719260065288,0.010496675231258173,0
16
+ piqa,acc_norm,0.7306855277475517,0.010350004070588757,0
17
+ rte,acc,0.516245487364621,0.030080573208738064,0
18
+ sciq,acc,0.894,0.009739551265785134,0
19
+ sciq,acc_norm,0.886,0.01005510343582333,0
20
+ storycloze_2016,acc,0.6707642971672902,0.010867199207548977,0
21
+ winogrande,acc,0.5509076558800315,0.013979459389140844,0
4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.315,0.014696631960792501,0
3
+ anli_r2,acc,0.33,0.014876872027456729,0
4
+ anli_r3,acc,0.31583333333333335,0.013424568830356462,0
5
+ arc_challenge,acc,0.27986348122866894,0.013119040897725922,0
6
+ arc_challenge,acc_norm,0.29180887372013653,0.01328452529240351,0
7
+ arc_easy,acc,0.6077441077441077,0.010018744689650043,0
8
+ arc_easy,acc_norm,0.5955387205387206,0.010070746648278792,0
9
+ boolq,acc,0.5902140672782875,0.008601532621213527,1
10
+ cb,acc,0.44642857142857145,0.06703189227942398,1
11
+ cb,f1,0.3001349527665318,,1
12
+ copa,acc,0.71,0.045604802157206845,0
13
+ hellaswag,acc,0.42103166699860584,0.00492715588259819,0
14
+ hellaswag,acc_norm,0.5474009161521609,0.004967308254425745,0
15
+ piqa,acc,0.7317736670293797,0.010336761992404485,0
16
+ piqa,acc_norm,0.7437431991294886,0.010185787831565051,0
17
+ rte,acc,0.5487364620938628,0.029953149241808943,0
18
+ sciq,acc,0.905,0.009276910103103294,0
19
+ sciq,acc_norm,0.905,0.009276910103103305,0
20
+ storycloze_2016,acc,0.6819882415820417,0.010769343495248539,0
21
+ winogrande,acc,0.5430149960536701,0.01400038676159829,0
4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.333,0.014910846164229859,0
3
+ anli_r2,acc,0.345,0.015039986742055242,0
4
+ anli_r3,acc,0.3225,0.013499258621103245,0
5
+ arc_challenge,acc,0.26706484641638223,0.012928933196496357,0
6
+ arc_challenge,acc_norm,0.2901023890784983,0.013261573677520764,0
7
+ arc_easy,acc,0.6026936026936027,0.010041053078884272,0
8
+ arc_easy,acc_norm,0.5980639730639731,0.010060521220920566,0
9
+ boolq,acc,0.6033639143730887,0.008556148582031999,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.4572362278244631,,1
12
+ copa,acc,0.73,0.0446196043338474,0
13
+ hellaswag,acc,0.4221270663214499,0.0049288918958742935,0
14
+ hellaswag,acc_norm,0.5481975702051384,0.004966544724452231,0
15
+ piqa,acc,0.7334058759521219,0.010316749863541367,0
16
+ piqa,acc_norm,0.7404787812840044,0.010227939888173923,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.918,0.00868051561552372,0
19
+ sciq,acc_norm,0.914,0.008870325962594766,0
20
+ storycloze_2016,acc,0.6862640299305185,0.010730179119317628,0
21
+ winogrande,acc,0.5445935280189423,0.013996485037729796,0
4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.361,0.015195720118175118,0
3
+ anli_r2,acc,0.348,0.01507060460376841,0
4
+ anli_r3,acc,0.33416666666666667,0.013622434813136774,0
5
+ arc_challenge,acc,0.2687713310580205,0.01295506596371069,0
6
+ arc_challenge,acc_norm,0.28242320819112626,0.013155456884097222,0
7
+ arc_easy,acc,0.6064814814814815,0.010024426884292559,0
8
+ arc_easy,acc_norm,0.6064814814814815,0.01002442688429257,0
9
+ boolq,acc,0.6079510703363914,0.00853880291491199,1
10
+ cb,acc,0.5714285714285714,0.06672848092813058,1
11
+ cb,f1,0.43832345191040845,,1
12
+ copa,acc,0.71,0.04560480215720684,0
13
+ hellaswag,acc,0.4191396136227843,0.004924098711864577,0
14
+ hellaswag,acc_norm,0.545807608046206,0.004968796800410405,0
15
+ piqa,acc,0.7279651795429815,0.010382763786247381,0
16
+ piqa,acc_norm,0.7377584330794341,0.010262502565172445,0
17
+ rte,acc,0.51985559566787,0.030072723167317177,0
18
+ sciq,acc,0.923,0.00843458014024065,0
19
+ sciq,acc_norm,0.919,0.008632121032139981,0
20
+ storycloze_2016,acc,0.6873329770176376,0.01072022317295317,0
21
+ winogrande,acc,0.5595895816890292,0.0139523303119156,0
4b284b84b30c4py/evaluation/rankeval/4b284b84b30c4py_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.332,0.014899597242811476,0
3
+ anli_r2,acc,0.338,0.01496596071022448,0
4
+ anli_r3,acc,0.33666666666666667,0.013647602942406393,0
5
+ arc_challenge,acc,0.2738907849829352,0.013032004972989503,0
6
+ arc_challenge,acc_norm,0.30204778156996587,0.01341751914471642,0
7
+ arc_easy,acc,0.6056397306397306,0.010028176038393002,0
8
+ arc_easy,acc_norm,0.6132154882154882,0.009993308355370966,0
9
+ boolq,acc,0.6061162079510704,0.008545835792614984,1
10
+ cb,acc,0.5535714285714286,0.06703189227942395,1
11
+ cb,f1,0.44414735591206184,,1
12
+ copa,acc,0.76,0.042923469599092816,0
13
+ hellaswag,acc,0.41814379605656243,0.004922459820434768,0
14
+ hellaswag,acc_norm,0.5498904600677156,0.00496487956351331,0
15
+ piqa,acc,0.7279651795429815,0.010382763786247378,0
16
+ piqa,acc_norm,0.73449401523395,0.01030330865302443,0
17
+ rte,acc,0.5451263537906137,0.02997363649541526,0
18
+ sciq,acc,0.927,0.008230354715244062,0
19
+ sciq,acc_norm,0.923,0.008434580140240656,0
20
+ storycloze_2016,acc,0.6862640299305185,0.010730179119317621,0
21
+ winogrande,acc,0.5438042620363063,0.013998453610924324,0
4b284b84b40c4py/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.013582511719677682
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.013582511719677682
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.25473178118730816
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.25473178118730816
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2798784226346458
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2798784226346458
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2881030819423194
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2881030819423194
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.29650945041353777
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.29650945041353777
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2962156347894723
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2962156347894723
14
+ e2e_nlg_cleaned,5,average,multiple,0.23817014711449352
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.048108834032744206
16
+ gem_xsum,0,median,rouge2_fmeasure,0.048108834032744206
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.044832702231943945
18
+ gem_xsum,1,median,rouge2_fmeasure,0.044832702231943945
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.052437650370502395
20
+ gem_xsum,2,median,rouge2_fmeasure,0.052437650370502395
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05437609448479325
22
+ gem_xsum,3,median,rouge2_fmeasure,0.05437609448479325
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01342732886723235
24
+ gem_xsum,4,median,rouge2_fmeasure,0.01342732886723235
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0002394225271583762
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0002394225271583762
27
+ gem_xsum,5,average,multiple,0.03557033875239576
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05799475224539202
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05799475224539202
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.09451771999618133
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.09451771999618133
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.1117987450446678
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.1117987450446678
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.11529437108673728
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.11529437108673728
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.11543210637514889
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.11543210637514889
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.12449199494738342
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.12449199494738342
40
+ web_nlg_en,5,average,multiple,0.10325494828258512
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04504282783094545
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.04504282783094545
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05887762376904746
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05887762376904746
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06725027138313636
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.06725027138313636
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05569143713014329
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.05569143713014329
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.018778427314734628
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.018778427314734628
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.003584082140958806
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.003584082140958806
53
+ wiki_lingua_en,5,average,multiple,0.041537444928161
4b284b84b40c4py/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.6265144539832883, "bleu_stderr": 0.055288509436259625, "rouge1_fmeasure": 0.1302900347510512, "rouge1_fmeasure_stderr": 0.0022903688259625294, "rouge1_precision": 0.08409330916607455, "rouge1_precision_stderr": 0.001731188299143665, "rouge1_recall": 0.4063814200286807, "rouge1_recall_stderr": 0.005322562119580885, "rouge2_fmeasure": 0.05799475224539202, "rouge2_fmeasure_stderr": 0.0014369181291833464, "rouge2_precision": 0.03762110461790505, "rouge2_precision_stderr": 0.0010589063250982793, "rouge2_recall": 0.183072399910378, "rouge2_recall_stderr": 0.0036870814896147993, "rougeL_fmeasure": 0.11927894097594424, "rougeL_fmeasure_stderr": 0.001947115069680402, "rougeL_precision": 0.0765776549932987, "rougeL_precision_stderr": 0.0014476432428377694, "rougeL_recall": 0.3781969127331524, "rougeL_recall_stderr": 0.004954701342417325, "rougeLsum_fmeasure": 0.11994292892993323, "rougeLsum_fmeasure_stderr": 0.0020893576616311574, "rougeLsum_precision": 0.07751225034608102, "rougeLsum_precision_stderr": 0.0015835046272373015, "rougeLsum_recall": 0.3739147995747494, "rougeLsum_recall_stderr": 0.004813979853829334}}, "1": {"PALM_prompt": {"bleu": 0.8344374763199134, "bleu_stderr": 0.04109129330525337, "rouge1_fmeasure": 0.18139167901720007, "rouge1_fmeasure_stderr": 0.004050256601077393, "rouge1_precision": 0.16012902553944608, "rouge1_precision_stderr": 0.00493508649996749, "rouge1_recall": 0.350905960763819, "rouge1_recall_stderr": 0.0053197895642003975, "rouge2_fmeasure": 0.09451771999618133, "rouge2_fmeasure_stderr": 0.0027611715037577527, "rouge2_precision": 0.0863440035952581, "rouge2_precision_stderr": 0.003544966232778681, "rouge2_recall": 0.18688434185281602, "rouge2_recall_stderr": 0.003824515226005615, "rougeL_fmeasure": 0.1613351669794436, "rougeL_fmeasure_stderr": 0.0033869728261115917, "rougeL_precision": 0.14103817959421036, "rougeL_precision_stderr": 0.004295389718283515, "rougeL_recall": 0.32354798361564996, "rougeL_recall_stderr": 0.0048027744606671295, "rougeLsum_fmeasure": 0.1655148069300789, "rougeLsum_fmeasure_stderr": 0.0035114085522647727, "rougeLsum_precision": 0.14503143128774412, "rougeLsum_precision_stderr": 0.0044054243786893, "rougeLsum_recall": 0.3283949322633198, "rougeLsum_recall_stderr": 0.004870947512298453}}, "2": {"PALM_prompt": {"bleu": 0.9565960474218924, "bleu_stderr": 0.041963669547927344, "rouge1_fmeasure": 0.20428683815165033, "rouge1_fmeasure_stderr": 0.004528341721325581, "rouge1_precision": 0.18209107412737494, "rouge1_precision_stderr": 0.005392125769908202, "rouge1_recall": 0.36769121274819816, "rouge1_recall_stderr": 0.0051111822287508435, "rouge2_fmeasure": 0.1117987450446678, "rouge2_fmeasure_stderr": 0.003248764038283069, "rouge2_precision": 0.10308538953562238, "rouge2_precision_stderr": 0.00387520058460137, "rouge2_recall": 0.20157982153544848, "rouge2_recall_stderr": 0.003965637044656587, "rougeL_fmeasure": 0.1815614906008451, "rougeL_fmeasure_stderr": 0.0038632677743123687, "rougeL_precision": 0.1595369471218227, "rougeL_precision_stderr": 0.004636944175835897, "rougeL_recall": 0.3383597284132352, "rougeL_recall_stderr": 0.004651282237874797, "rougeLsum_fmeasure": 0.1873650020675151, "rougeLsum_fmeasure_stderr": 0.004004617036942983, "rougeLsum_precision": 0.16521257658083127, "rougeLsum_precision_stderr": 0.004801115532460492, "rougeLsum_recall": 0.3459845827341206, "rougeLsum_recall_stderr": 0.004742942627832055}}, "3": {"PALM_prompt": {"bleu": 1.0617439530707073, "bleu_stderr": 0.047024442532253234, "rouge1_fmeasure": 0.21061512880431646, "rouge1_fmeasure_stderr": 0.0045943037324396715, "rouge1_precision": 0.188036232874631, "rouge1_precision_stderr": 0.0054386379356263686, "rouge1_recall": 0.37827335680919916, "rouge1_recall_stderr": 0.005147590785038739, "rouge2_fmeasure": 0.11529437108673728, "rouge2_fmeasure_stderr": 0.0032569594098975905, "rouge2_precision": 0.10668916716284264, "rouge2_precision_stderr": 0.003846084176006283, "rouge2_recall": 0.20673523761199128, "rouge2_recall_stderr": 0.00397927968083411, "rougeL_fmeasure": 0.1863711190248487, "rougeL_fmeasure_stderr": 0.003915981778526946, "rougeL_precision": 0.16468743692744256, "rougeL_precision_stderr": 0.004704996032193491, "rougeL_recall": 0.3459854629448614, "rougeL_recall_stderr": 0.00462879248759252, "rougeLsum_fmeasure": 0.1931876648789347, "rougeLsum_fmeasure_stderr": 0.004108151816011205, "rougeLsum_precision": 0.1718266059828415, "rougeLsum_precision_stderr": 0.004935594786208739, "rougeLsum_recall": 0.3542260592411487, "rougeLsum_recall_stderr": 0.004741322152461482}}, "4": {"PALM_prompt": {"bleu": 1.1136180225565688, "bleu_stderr": 0.0494077452293476, "rouge1_fmeasure": 0.20993058927949035, "rouge1_fmeasure_stderr": 0.004473246729902349, "rouge1_precision": 0.18441223613152646, "rouge1_precision_stderr": 0.005237742207765776, "rouge1_recall": 0.3815914857028489, "rouge1_recall_stderr": 0.005078263914757465, "rouge2_fmeasure": 0.11543210637514889, "rouge2_fmeasure_stderr": 0.003168721672270811, "rouge2_precision": 0.10369502133226015, "rouge2_precision_stderr": 0.0036312629898113467, "rouge2_recall": 0.2114324572870707, "rouge2_recall_stderr": 0.004026762762569416, "rougeL_fmeasure": 0.18499501783138653, "rougeL_fmeasure_stderr": 0.003757293610248477, "rougeL_precision": 0.1598629331851015, "rougeL_precision_stderr": 0.004421837663247027, "rougeL_recall": 0.34928387082687623, "rougeL_recall_stderr": 0.004577573550259925, "rougeLsum_fmeasure": 0.19301994872871828, "rougeLsum_fmeasure_stderr": 0.003997084991726825, "rougeLsum_precision": 0.16799152271769707, "rougeLsum_precision_stderr": 0.0046996352327360585, "rougeLsum_recall": 0.35910045537819385, "rougeLsum_recall_stderr": 0.004702681988394943}}, "5": {"PALM_prompt": {"bleu": 1.2079287925546833, "bleu_stderr": 0.03843019866071557, "rouge1_fmeasure": 0.22411144484951734, "rouge1_fmeasure_stderr": 0.004729237862708754, "rouge1_precision": 0.2011088882864706, "rouge1_precision_stderr": 0.005614414300996843, "rouge1_recall": 0.39014306466931925, "rouge1_recall_stderr": 0.005104602655490083, "rouge2_fmeasure": 0.12449199494738342, "rouge2_fmeasure_stderr": 0.003421351799733747, "rouge2_precision": 0.11452634659390692, "rouge2_precision_stderr": 0.004008257112780822, "rouge2_recall": 0.21796180474727392, "rouge2_recall_stderr": 0.004154801349538914, "rougeL_fmeasure": 0.197365828998565, "rougeL_fmeasure_stderr": 0.004010251755555753, "rougeL_precision": 0.1747074582799755, "rougeL_precision_stderr": 0.0047968506723291525, "rougeL_recall": 0.3568477715296954, "rougeL_recall_stderr": 0.004626684942440914, "rougeLsum_fmeasure": 0.2057198966329756, "rougeLsum_fmeasure_stderr": 0.004244079124741885, "rougeLsum_precision": 0.18344461393785036, "rougeLsum_precision_stderr": 0.0050727899792715305, "rougeLsum_recall": 0.3663230068263453, "rougeLsum_recall_stderr": 0.004751513219414558}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.840678426820805, "bleu_stderr": 0.12694166716059652, "rouge1_fmeasure": 0.18392484483783464, "rouge1_fmeasure_stderr": 0.0023537177657535603, "rouge1_precision": 0.1677397932867038, "rouge1_precision_stderr": 0.002566350867924021, "rouge1_recall": 0.24946996915581057, "rouge1_recall_stderr": 0.00319451422614205, "rouge2_fmeasure": 0.04504282783094545, "rouge2_fmeasure_stderr": 0.0010952728613557881, "rouge2_precision": 0.0405629048534602, "rouge2_precision_stderr": 0.0010511496416343929, "rouge2_recall": 0.062009980427624525, "rouge2_recall_stderr": 0.0016080499352392, "rougeL_fmeasure": 0.14191048871638867, "rougeL_fmeasure_stderr": 0.0017377427041717932, "rougeL_precision": 0.1284063955670048, "rougeL_precision_stderr": 0.0019270098617566777, "rougeL_recall": 0.19661883074772485, "rougeL_recall_stderr": 0.0025681983640924964, "rougeLsum_fmeasure": 0.17072751234307404, "rougeLsum_fmeasure_stderr": 0.002207941731763039, "rougeLsum_precision": 0.15585306871531224, "rougeLsum_precision_stderr": 0.0024216361410119005, "rougeLsum_recall": 0.23173795351146165, "rougeLsum_recall_stderr": 0.003002941396047478}}, "1": {"tldr_en": {"bleu": 3.2067090455873224, "bleu_stderr": 0.11656826410567804, "rouge1_fmeasure": 0.20771219180814662, "rouge1_fmeasure_stderr": 0.0022643617432163915, "rouge1_precision": 0.3407413152948739, "rouge1_precision_stderr": 0.004783718812192723, "rouge1_recall": 0.21353399677147264, "rouge1_recall_stderr": 0.0028840090046386786, "rouge2_fmeasure": 0.05887762376904746, "rouge2_fmeasure_stderr": 0.0014431237833357007, "rouge2_precision": 0.11360325677041368, "rouge2_precision_stderr": 0.0034140372277618536, "rouge2_recall": 0.05825645797528758, "rouge2_recall_stderr": 0.0015416218332715277, "rougeL_fmeasure": 0.16061237034918419, "rougeL_fmeasure_stderr": 0.0017809679783284652, "rougeL_precision": 0.2735894085623474, "rougeL_precision_stderr": 0.004203057515018806, "rougeL_recall": 0.1636710938863396, "rougeL_recall_stderr": 0.0022229843323821395, "rougeLsum_fmeasure": 0.1952755581276307, "rougeLsum_fmeasure_stderr": 0.0021369910403940787, "rougeLsum_precision": 0.32216782210671335, "rougeLsum_precision_stderr": 0.004614139608985266, "rougeLsum_recall": 0.20065798508507215, "rougeLsum_recall_stderr": 0.0027198500682546994}}, "2": {"tldr_en": {"bleu": 3.650453022881501, "bleu_stderr": 0.07644896749881058, "rouge1_fmeasure": 0.22383562052748845, "rouge1_fmeasure_stderr": 0.0022512992176381185, "rouge1_precision": 0.37489027065854674, "rouge1_precision_stderr": 0.004653679921917693, "rouge1_recall": 0.22139371201517177, "rouge1_recall_stderr": 0.0028991483884995943, "rouge2_fmeasure": 0.06725027138313636, "rouge2_fmeasure_stderr": 0.0014719041196881448, "rouge2_precision": 0.12803033212041234, "rouge2_precision_stderr": 0.003371022901181868, "rouge2_recall": 0.0647863698257365, "rouge2_recall_stderr": 0.0015989835379964367, "rougeL_fmeasure": 0.17573433947211856, "rougeL_fmeasure_stderr": 0.0018043755197798281, "rougeL_precision": 0.3049926706764348, "rougeL_precision_stderr": 0.00414806419425682, "rougeL_recall": 0.1721864403939517, "rougeL_recall_stderr": 0.0022811554151050233, "rougeLsum_fmeasure": 0.21166423096958442, "rougeLsum_fmeasure_stderr": 0.00214097672527724, "rougeLsum_precision": 0.3569590135457494, "rougeLsum_precision_stderr": 0.0045221715924275395, "rougeLsum_recall": 0.2090381046171653, "rougeLsum_recall_stderr": 0.002740384369085583}}, "3": {"tldr_en": {"bleu": 2.5112437793985913, "bleu_stderr": 0.14040045439903112, "rouge1_fmeasure": 0.18936319712955674, "rouge1_fmeasure_stderr": 0.002593571338555069, "rouge1_precision": 0.32014187646603054, "rouge1_precision_stderr": 0.00488134967014187, "rouge1_recall": 0.18502696283491812, "rouge1_recall_stderr": 0.0030765259444242047, "rouge2_fmeasure": 0.05569143713014329, "rouge2_fmeasure_stderr": 0.0014457670655771454, "rouge2_precision": 0.10352546080846126, "rouge2_precision_stderr": 0.003090041248454077, "rouge2_recall": 0.05371435600535107, "rouge2_recall_stderr": 0.0015581965583080434, "rougeL_fmeasure": 0.14911760701154847, "rougeL_fmeasure_stderr": 0.0020683509596811874, "rougeL_precision": 0.26009935501544534, "rougeL_precision_stderr": 0.004234470567033635, "rougeL_recall": 0.1446676385219857, "rougeL_recall_stderr": 0.0024455654532861432, "rougeLsum_fmeasure": 0.17878053391296173, "rougeLsum_fmeasure_stderr": 0.002453979645558079, "rougeLsum_precision": 0.30440779153380226, "rougeLsum_precision_stderr": 0.0047270024679375855, "rougeLsum_recall": 0.17432619542183717, "rougeLsum_recall_stderr": 0.0028933407499825265}}, "4": {"tldr_en": {"bleu": 0.061354216703541076, "bleu_stderr": 0.0097430668139427, "rouge1_fmeasure": 0.06266806778314261, "rouge1_fmeasure_stderr": 0.0021919430354280715, "rouge1_precision": 0.10312269683887926, "rouge1_precision_stderr": 0.0038910187038941626, "rouge1_recall": 0.06401113472670934, "rouge1_recall_stderr": 0.002481421114430076, "rouge2_fmeasure": 0.018778427314734628, "rouge2_fmeasure_stderr": 0.0009813676255886164, "rouge2_precision": 0.03501323018553138, "rouge2_precision_stderr": 0.002132182844830649, "rouge2_recall": 0.019282445467239333, "rouge2_recall_stderr": 0.0011371428329548293, "rougeL_fmeasure": 0.050382052095246765, "rougeL_fmeasure_stderr": 0.0017922872292586594, "rougeL_precision": 0.08503067955831338, "rougeL_precision_stderr": 0.003324654538260431, "rougeL_recall": 0.051239926387538104, "rougeL_recall_stderr": 0.0020286236079580505, "rougeLsum_fmeasure": 0.05921503659210079, "rougeLsum_fmeasure_stderr": 0.002077262378330786, "rougeLsum_precision": 0.09812516865330335, "rougeLsum_precision_stderr": 0.003740801713533611, "rougeLsum_recall": 0.060428521297660256, "rougeLsum_recall_stderr": 0.002345004897412319}}, "5": {"tldr_en": {"bleu": 2.9574446288686244e-14, "bleu_stderr": 7.42461186082665e-13, "rouge1_fmeasure": 0.01047162915187438, "rouge1_fmeasure_stderr": 0.0010450320293293098, "rouge1_precision": 0.01656654257225165, "rouge1_precision_stderr": 0.0017305037314842436, "rouge1_recall": 0.011004210419708334, "rouge1_recall_stderr": 0.00118477734570795, "rouge2_fmeasure": 0.003584082140958806, "rouge2_fmeasure_stderr": 0.0004898638575125038, "rouge2_precision": 0.006097888819659734, "rouge2_precision_stderr": 0.0009228410368757805, "rouge2_recall": 0.003796333622075436, "rouge2_recall_stderr": 0.0005529276313461422, "rougeL_fmeasure": 0.008321979529841074, "rougeL_fmeasure_stderr": 0.0008501773787983126, "rougeL_precision": 0.013606829819008421, "rougeL_precision_stderr": 0.0014877869263599933, "rougeL_recall": 0.008733788463260665, "rougeL_recall_stderr": 0.0009644639613059458, "rougeLsum_fmeasure": 0.00982947752327106, "rougeLsum_fmeasure_stderr": 0.0009859010840668563, "rougeLsum_precision": 0.015681697775468174, "rougeLsum_precision_stderr": 0.0016575658374222873, "rougeLsum_recall": 0.01035945330589562, "rougeLsum_recall_stderr": 0.001125242740806582}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.12619078511586218, "bleu_stderr": 0.02503660481789887, "rouge1_fmeasure": 0.10310636416929035, "rouge1_fmeasure_stderr": 0.0012556554913362046, "rouge1_precision": 0.1132355112800607, "rouge1_precision_stderr": 0.0012718736909336596, "rouge1_recall": 0.10043053288356422, "rouge1_recall_stderr": 0.0014321553738130103, "rouge2_fmeasure": 0.013582511719677682, "rouge2_fmeasure_stderr": 0.00050869964533704, "rouge2_precision": 0.01347978484360828, "rouge2_precision_stderr": 0.0004956579366101666, "rouge2_recall": 0.014479098605062917, "rouge2_recall_stderr": 0.0005737487964794092, "rougeL_fmeasure": 0.08760663225600158, "rougeL_fmeasure_stderr": 0.001050066863353876, "rougeL_precision": 0.09702036725543593, "rougeL_precision_stderr": 0.0010914873840099798, "rougeL_recall": 0.08477718721124175, "rougeL_recall_stderr": 0.0011916437566732185, "rougeLsum_fmeasure": 0.10022114236190989, "rougeLsum_fmeasure_stderr": 0.001202232045606008, "rougeLsum_precision": 0.11018993891629533, "rougeLsum_precision_stderr": 0.0012193575585225534, "rougeLsum_recall": 0.09756840081312741, "rougeLsum_recall_stderr": 0.0013797292627960992}}, "1": {"generate_text_restaurant": {"bleu": 14.195894441929788, "bleu_stderr": 0.1898474604172003, "rouge1_fmeasure": 0.5169393508079786, "rouge1_fmeasure_stderr": 0.0023972549609494772, "rouge1_precision": 0.6197584666189688, "rouge1_precision_stderr": 0.0031541493434612414, "rouge1_recall": 0.48226172529271133, "rouge1_recall_stderr": 0.003109933232539138, "rouge2_fmeasure": 0.25473178118730816, "rouge2_fmeasure_stderr": 0.0021891064874697964, "rouge2_precision": 0.30901028694460597, "rouge2_precision_stderr": 0.0028079796282801384, "rouge2_recall": 0.23729665867102903, "rouge2_recall_stderr": 0.0023201124865564427, "rougeL_fmeasure": 0.36882650686781565, "rougeL_fmeasure_stderr": 0.002204681251618594, "rougeL_precision": 0.44530014852661626, "rougeL_precision_stderr": 0.0030178076881037736, "rougeL_recall": 0.3428922610954134, "rougeL_recall_stderr": 0.002544460708269765, "rougeLsum_fmeasure": 0.4206846158284769, "rougeLsum_fmeasure_stderr": 0.0024575700499847946, "rougeLsum_precision": 0.5056038833641523, "rougeLsum_precision_stderr": 0.0032229148183322434, "rougeLsum_recall": 0.3919088719268619, "rougeLsum_recall_stderr": 0.0028820837579924753}}, "2": {"generate_text_restaurant": {"bleu": 16.86712634160098, "bleu_stderr": 0.20595777968131967, "rouge1_fmeasure": 0.5481507172310653, "rouge1_fmeasure_stderr": 0.0022610388159171897, "rouge1_precision": 0.6172826437348891, "rouge1_precision_stderr": 0.0029847159553384657, "rouge1_recall": 0.5286285647287439, "rouge1_recall_stderr": 0.002971496665411117, "rouge2_fmeasure": 0.2798784226346458, "rouge2_fmeasure_stderr": 0.002225987158045661, "rouge2_precision": 0.31710772626229494, "rouge2_precision_stderr": 0.0026819576449413564, "rouge2_recall": 0.27050833783163336, "rouge2_recall_stderr": 0.0024465990819409, "rougeL_fmeasure": 0.39080370209384113, "rougeL_fmeasure_stderr": 0.002203350066360137, "rougeL_precision": 0.4417460343620748, "rougeL_precision_stderr": 0.002842179507994522, "rougeL_recall": 0.3761113050964613, "rougeL_recall_stderr": 0.00255557581948251, "rougeLsum_fmeasure": 0.4510349482724993, "rougeLsum_fmeasure_stderr": 0.0024238627019089686, "rougeLsum_precision": 0.5080968975292306, "rougeLsum_precision_stderr": 0.003032446790946152, "rougeLsum_recall": 0.4346529082415982, "rougeLsum_recall_stderr": 0.002865555128339054}}, "3": {"generate_text_restaurant": {"bleu": 17.56374362342707, "bleu_stderr": 0.25050597399216695, "rouge1_fmeasure": 0.5592809137055812, "rouge1_fmeasure_stderr": 0.0022620185875293105, "rouge1_precision": 0.623764742211529, "rouge1_precision_stderr": 0.002923612332826476, "rouge1_recall": 0.5399094505415863, "rouge1_recall_stderr": 0.002919139611658139, "rouge2_fmeasure": 0.2881030819423194, "rouge2_fmeasure_stderr": 0.002237533822044873, "rouge2_precision": 0.32247210648488667, "rouge2_precision_stderr": 0.002607759271064687, "rouge2_recall": 0.2786473236193413, "rouge2_recall_stderr": 0.0024315057816577438, "rougeL_fmeasure": 0.39893180563698516, "rougeL_fmeasure_stderr": 0.0022031921523099768, "rougeL_precision": 0.4457156218502052, "rougeL_precision_stderr": 0.002744253406926975, "rougeL_recall": 0.38477129842024516, "rougeL_recall_stderr": 0.0025376583332808664, "rougeLsum_fmeasure": 0.45992470785237377, "rougeLsum_fmeasure_stderr": 0.0024350491859283473, "rougeLsum_precision": 0.5128290677816093, "rougeLsum_precision_stderr": 0.002980025939462215, "rougeLsum_recall": 0.44377486379833864, "rougeLsum_recall_stderr": 0.002830112289804681}}, "4": {"generate_text_restaurant": {"bleu": 18.31918955508527, "bleu_stderr": 0.15754119471961942, "rouge1_fmeasure": 0.5656753031403037, "rouge1_fmeasure_stderr": 0.002278760944536904, "rouge1_precision": 0.6279517916032283, "rouge1_precision_stderr": 0.002944286438643164, "rouge1_recall": 0.5454770476528574, "rouge1_recall_stderr": 0.0028822547405913477, "rouge2_fmeasure": 0.29650945041353777, "rouge2_fmeasure_stderr": 0.002292708123648972, "rouge2_precision": 0.33045056182719734, "rouge2_precision_stderr": 0.0026614717534088747, "rouge2_recall": 0.28640873500577874, "rouge2_recall_stderr": 0.002476449555218096, "rougeL_fmeasure": 0.40442992029435154, "rougeL_fmeasure_stderr": 0.0022796677874249837, "rougeL_precision": 0.4497509296905579, "rougeL_precision_stderr": 0.0028164532935422443, "rougeL_recall": 0.3897848021335173, "rougeL_recall_stderr": 0.0025849718937575503, "rougeLsum_fmeasure": 0.4697683839494934, "rougeLsum_fmeasure_stderr": 0.0024834590153620813, "rougeLsum_precision": 0.5210649861273837, "rougeLsum_precision_stderr": 0.0030047651018879832, "rougeLsum_recall": 0.45323149287414854, "rougeLsum_recall_stderr": 0.0028802549273979528}}, "5": {"generate_text_restaurant": {"bleu": 17.960439045923064, "bleu_stderr": 0.24402641114299475, "rouge1_fmeasure": 0.5643903572481209, "rouge1_fmeasure_stderr": 0.002255280251343447, "rouge1_precision": 0.6315054405112993, "rouge1_precision_stderr": 0.002990673384548303, "rouge1_recall": 0.5400860046713761, "rouge1_recall_stderr": 0.0028106427092864637, "rouge2_fmeasure": 0.2962156347894723, "rouge2_fmeasure_stderr": 0.0022815792283599062, "rouge2_precision": 0.33281317061318083, "rouge2_precision_stderr": 0.002681864527165246, "rouge2_recall": 0.2838353975698396, "rouge2_recall_stderr": 0.0024339910599304676, "rougeL_fmeasure": 0.40473089019193104, "rougeL_fmeasure_stderr": 0.0022512705449731557, "rougeL_precision": 0.4530709399456242, "rougeL_precision_stderr": 0.002802105043256588, "rougeL_recall": 0.38739689995192905, "rougeL_recall_stderr": 0.002537083443039135, "rougeLsum_fmeasure": 0.46775324541885227, "rougeLsum_fmeasure_stderr": 0.002457212564884093, "rougeLsum_precision": 0.523061452351981, "rougeLsum_precision_stderr": 0.003042253489245001, "rougeLsum_recall": 0.44760138445342396, "rougeLsum_recall_stderr": 0.002786956104827745}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.898029014522741, "bleu_stderr": 0.0621045472883636, "rouge1_fmeasure": 0.2118395018429487, "rouge1_fmeasure_stderr": 0.0024636083114867977, "rouge1_precision": 0.15624368997481555, "rouge1_precision_stderr": 0.0020141995424361537, "rouge1_recall": 0.3583620701595724, "rouge1_recall_stderr": 0.0043618195269311604, "rouge2_fmeasure": 0.048108834032744206, "rouge2_fmeasure_stderr": 0.001491272188361829, "rouge2_precision": 0.034740667971456626, "rouge2_precision_stderr": 0.0010961297567670593, "rouge2_recall": 0.08492957945530763, "rouge2_recall_stderr": 0.0027292382773262203, "rougeL_fmeasure": 0.15648896116335023, "rougeL_fmeasure_stderr": 0.0018706059111361075, "rougeL_precision": 0.11541824844233431, "rougeL_precision_stderr": 0.001521805833672726, "rougeL_recall": 0.26583610504341093, "rougeL_recall_stderr": 0.0034547321451017464, "rougeLsum_fmeasure": 0.16831509506372735, "rougeLsum_fmeasure_stderr": 0.002080929001242235, "rougeLsum_precision": 0.1238053683612589, "rougeLsum_precision_stderr": 0.0016487142268722796, "rougeLsum_recall": 0.2866339063222549, "rougeLsum_recall_stderr": 0.0038338609987898604}}, "1": {"article_DOC_summary": {"bleu": 1.8244631279382701, "bleu_stderr": 0.12981973857276125, "rouge1_fmeasure": 0.2060082513501437, "rouge1_fmeasure_stderr": 0.002839389140182063, "rouge1_precision": 0.16767243137480217, "rouge1_precision_stderr": 0.002970087487920079, "rouge1_recall": 0.32093443528578147, "rouge1_recall_stderr": 0.004718174675688207, "rouge2_fmeasure": 0.044832702231943945, "rouge2_fmeasure_stderr": 0.0016608187602593493, "rouge2_precision": 0.03640318258985222, "rouge2_precision_stderr": 0.0015703236627444166, "rouge2_recall": 0.07214820343126738, "rouge2_recall_stderr": 0.0027048080703660857, "rougeL_fmeasure": 0.15171417699984252, "rougeL_fmeasure_stderr": 0.002153371397499807, "rougeL_precision": 0.12329685686001111, "rougeL_precision_stderr": 0.002263280214078096, "rougeL_recall": 0.23800740345423912, "rougeL_recall_stderr": 0.003682563146850884, "rougeLsum_fmeasure": 0.16101328880633298, "rougeLsum_fmeasure_stderr": 0.002319476234207615, "rougeLsum_precision": 0.1299915444490808, "rougeLsum_precision_stderr": 0.002316211636653606, "rougeLsum_recall": 0.25371872809692253, "rougeLsum_recall_stderr": 0.0040410570883259184}}, "2": {"article_DOC_summary": {"bleu": 2.1539575871422523, "bleu_stderr": 0.11580184343815451, "rouge1_fmeasure": 0.22475499686393385, "rouge1_fmeasure_stderr": 0.003147300726797165, "rouge1_precision": 0.20457563040395477, "rouge1_precision_stderr": 0.0037898985705208423, "rouge1_recall": 0.3021886296048394, "rouge1_recall_stderr": 0.0043334562764797665, "rouge2_fmeasure": 0.052437650370502395, "rouge2_fmeasure_stderr": 0.002029778109588179, "rouge2_precision": 0.04864954567970401, "rouge2_precision_stderr": 0.002162387697981618, "rouge2_recall": 0.07052036196534157, "rouge2_recall_stderr": 0.002631047710908095, "rougeL_fmeasure": 0.16907875182652785, "rougeL_fmeasure_stderr": 0.002537135293083765, "rougeL_precision": 0.15460406543450989, "rougeL_precision_stderr": 0.0031056157190172346, "rougeL_recall": 0.22714605831719337, "rougeL_recall_stderr": 0.0033750271831163086, "rougeLsum_fmeasure": 0.17633797791876762, "rougeLsum_fmeasure_stderr": 0.0026213379569355395, "rougeLsum_precision": 0.15994168668588624, "rougeLsum_precision_stderr": 0.0031066476351750685, "rougeLsum_recall": 0.23934654450491796, "rougeLsum_recall_stderr": 0.0037291639028117166}}, "3": {"article_DOC_summary": {"bleu": 2.470682014953078, "bleu_stderr": 0.13934016050087605, "rouge1_fmeasure": 0.22622378732366108, "rouge1_fmeasure_stderr": 0.0036155630216220336, "rouge1_precision": 0.22005905487382696, "rouge1_precision_stderr": 0.004296179845687528, "rouge1_recall": 0.276026401721913, "rouge1_recall_stderr": 0.00446755883668739, "rouge2_fmeasure": 0.05437609448479325, "rouge2_fmeasure_stderr": 0.0021959636247266957, "rouge2_precision": 0.053934310357718694, "rouge2_precision_stderr": 0.002427626877543396, "rouge2_recall": 0.06543019726976265, "rouge2_recall_stderr": 0.0025244452430399056, "rougeL_fmeasure": 0.1689632997247983, "rougeL_fmeasure_stderr": 0.0028464516598954892, "rougeL_precision": 0.16451176292128145, "rougeL_precision_stderr": 0.0033894141566561645, "rougeL_recall": 0.20659195912684952, "rougeL_recall_stderr": 0.0034854825418956115, "rougeLsum_fmeasure": 0.17338436552105183, "rougeLsum_fmeasure_stderr": 0.002896363525372315, "rougeLsum_precision": 0.16802358418874647, "rougeLsum_precision_stderr": 0.003405185479063602, "rougeLsum_recall": 0.21363367155842128, "rougeLsum_recall_stderr": 0.003711328238748103}}, "4": {"article_DOC_summary": {"bleu": 0.3436306654105826, "bleu_stderr": 0.0936524979713778, "rouge1_fmeasure": 0.05925928880419014, "rouge1_fmeasure_stderr": 0.0033702498704057536, "rouge1_precision": 0.06451009096482054, "rouge1_precision_stderr": 0.0039009593899421843, "rouge1_recall": 0.06836149108839051, "rouge1_recall_stderr": 0.004081552795683849, "rouge2_fmeasure": 0.01342732886723235, "rouge2_fmeasure_stderr": 0.0012658112590712524, "rouge2_precision": 0.015599489229205931, "rouge2_precision_stderr": 0.0017749720731952423, "rouge2_recall": 0.015587185299433712, "rouge2_recall_stderr": 0.0014828723762299963, "rougeL_fmeasure": 0.044804508673895696, "rougeL_fmeasure_stderr": 0.0025721087234640496, "rougeL_precision": 0.049783720599684944, "rougeL_precision_stderr": 0.0031440759650352753, "rougeL_recall": 0.05119332858191562, "rougeL_recall_stderr": 0.0030489073864090393, "rougeLsum_fmeasure": 0.046329199178483695, "rougeLsum_fmeasure_stderr": 0.0026578279644949776, "rougeLsum_precision": 0.0510047143661472, "rougeLsum_precision_stderr": 0.003188754503688182, "rougeLsum_recall": 0.05362228101565458, "rougeLsum_recall_stderr": 0.0032588056838317205}}, "5": {"article_DOC_summary": {"bleu": 2.929364234168567e-40, "bleu_stderr": 2.0861438219353014e-34, "rouge1_fmeasure": 0.002650336183188564, "rouge1_fmeasure_stderr": 0.0007090932375627381, "rouge1_precision": 0.0030133650011417404, "rouge1_precision_stderr": 0.0008188944540516045, "rouge1_recall": 0.0024623689285241295, "rouge1_recall_stderr": 0.0006581423998232249, "rouge2_fmeasure": 0.0002394225271583762, "rouge2_fmeasure_stderr": 0.00010800450922924677, "rouge2_precision": 0.00027200417045034135, "rouge2_precision_stderr": 0.0001215679497899001, "rouge2_recall": 0.00021798170383076042, "rouge2_recall_stderr": 9.996534184234093e-05, "rougeL_fmeasure": 0.0018757756274033561, "rougeL_fmeasure_stderr": 0.0004899327883188719, "rougeL_precision": 0.0021010494866298356, "rougeL_precision_stderr": 0.0005494572154834354, "rougeL_recall": 0.0017827055551384349, "rougeL_recall_stderr": 0.00048012261945421994, "rougeLsum_fmeasure": 0.0021075043872760177, "rougeLsum_fmeasure_stderr": 0.0005522309805428464, "rougeLsum_precision": 0.002363524402267903, "rougeLsum_precision_stderr": 0.0006208866028094235, "rougeLsum_recall": 0.0019919981207774844, "rougeLsum_recall_stderr": 0.0005326512222663613}}}}
4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.334,0.014922019523732965,0
3
+ anli_r2,acc,0.336,0.014944140233795027,0
4
+ anli_r3,acc,0.32666666666666666,0.013544340907003663,0
5
+ arc_challenge,acc,0.25426621160409557,0.012724999945157744,0
6
+ arc_challenge,acc_norm,0.2815699658703072,0.013143376735009019,0
7
+ arc_easy,acc,0.5833333333333334,0.010116282977781239,0
8
+ arc_easy,acc_norm,0.5109427609427609,0.010257326131172867,0
9
+ boolq,acc,0.5694189602446483,0.008660360145988744,1
10
+ cb,acc,0.3392857142857143,0.06384226561930825,1
11
+ cb,f1,0.30009608665269044,,1
12
+ copa,acc,0.73,0.0446196043338474,0
13
+ hellaswag,acc,0.43975303724357695,0.004953426186069828,0
14
+ hellaswag,acc_norm,0.5689105755825533,0.004942164585991472,0
15
+ piqa,acc,0.7290533188248096,0.010369718937426843,0
16
+ piqa,acc_norm,0.7383025027203483,0.010255630772708229,0
17
+ rte,acc,0.5740072202166066,0.029764956741777645,0
18
+ sciq,acc,0.839,0.011628164696727193,0
19
+ sciq,acc_norm,0.757,0.013569640199177451,0
20
+ storycloze_2016,acc,0.6990913949759487,0.010606289538707335,0
21
+ winogrande,acc,0.5651144435674822,0.013932814110418013,0
4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.323,0.014794927843348637,0
3
+ anli_r2,acc,0.315,0.01469663196079251,0
4
+ anli_r3,acc,0.3541666666666667,0.013811933499570958,0
5
+ arc_challenge,acc,0.2525597269624573,0.012696728980207706,0
6
+ arc_challenge,acc_norm,0.2883959044368601,0.013238394422428173,0
7
+ arc_easy,acc,0.5955387205387206,0.010070746648278773,0
8
+ arc_easy,acc_norm,0.5879629629629629,0.010099765857562759,0
9
+ boolq,acc,0.5165137614678899,0.00874028404648664,1
10
+ cb,acc,0.4642857142857143,0.06724777654937658,1
11
+ cb,f1,0.29523809523809524,,1
12
+ copa,acc,0.7,0.046056618647183814,0
13
+ hellaswag,acc,0.43815972913762197,0.004951470301995878,0
14
+ hellaswag,acc_norm,0.5702051384186417,0.004940349676769334,0
15
+ piqa,acc,0.7241566920565833,0.010427805502729117,0
16
+ piqa,acc_norm,0.7317736670293797,0.010336761992404485,0
17
+ rte,acc,0.516245487364621,0.030080573208738064,0
18
+ sciq,acc,0.912,0.008963053962592064,0
19
+ sciq,acc_norm,0.889,0.009938701010583726,0
20
+ storycloze_2016,acc,0.689470871191876,0.010700112173178448,0
21
+ winogrande,acc,0.5666929755327546,0.013926915052757347,0
4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.299,0.014484778521220478,0
3
+ anli_r2,acc,0.342,0.015008706182121733,0
4
+ anli_r3,acc,0.32416666666666666,0.013517438120881615,0
5
+ arc_challenge,acc,0.2687713310580205,0.01295506596371069,0
6
+ arc_challenge,acc_norm,0.29692832764505117,0.013352025976725223,0
7
+ arc_easy,acc,0.6102693602693603,0.010007169391797051,0
8
+ arc_easy,acc_norm,0.6098484848484849,0.010009118166667415,0
9
+ boolq,acc,0.5226299694189602,0.008736093428015823,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.2650662665666417,,1
12
+ copa,acc,0.7,0.046056618647183814,0
13
+ hellaswag,acc,0.4360685122485561,0.0049488245013554815,0
14
+ hellaswag,acc_norm,0.5723959370643298,0.0049371997599476905,0
15
+ piqa,acc,0.7295973884657236,0.01036316703162079,0
16
+ piqa,acc_norm,0.7410228509249184,0.010220966031405616,0
17
+ rte,acc,0.5379061371841155,0.030009848912529113,0
18
+ sciq,acc,0.913,0.008916866630745925,0
19
+ sciq,acc_norm,0.908,0.009144376393151108,0
20
+ storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0
21
+ winogrande,acc,0.5777426992896606,0.013881582030658542,0
4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.299,0.014484778521220458,0
3
+ anli_r2,acc,0.341,0.014998131348402699,0
4
+ anli_r3,acc,0.3283333333333333,0.013562032919529017,0
5
+ arc_challenge,acc,0.26109215017064846,0.012835523909473847,0
6
+ arc_challenge,acc_norm,0.29180887372013653,0.013284525292403508,0
7
+ arc_easy,acc,0.6102693602693603,0.010007169391797051,0
8
+ arc_easy,acc_norm,0.6140572390572391,0.009989277329503953,0
9
+ boolq,acc,0.5070336391437309,0.008744189661475105,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.3737373737373737,,1
12
+ copa,acc,0.73,0.0446196043338474,0
13
+ hellaswag,acc,0.4398526190001992,0.004953546708512332,0
14
+ hellaswag,acc_norm,0.5766779525990838,0.004930757390897343,0
15
+ piqa,acc,0.735582154515778,0.010289787244767163,0
16
+ piqa,acc_norm,0.735038084874864,0.010296557993316031,0
17
+ rte,acc,0.5703971119133574,0.02979666882912467,0
18
+ sciq,acc,0.922,0.008484573530118585,0
19
+ sciq,acc_norm,0.92,0.008583336977753653,0
20
+ storycloze_2016,acc,0.6916087653661144,0.0106797344454878,0
21
+ winogrande,acc,0.5785319652722968,0.013878072377497606,0
4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.322,0.014782913600996662,0
3
+ anli_r2,acc,0.37,0.01527525231651936,0
4
+ anli_r3,acc,0.3491666666666667,0.013767075395077245,0
5
+ arc_challenge,acc,0.2721843003412969,0.013006600406423704,0
6
+ arc_challenge,acc_norm,0.302901023890785,0.013428241573185349,0
7
+ arc_easy,acc,0.6127946127946128,0.00999531206589035,0
8
+ arc_easy,acc_norm,0.6195286195286195,0.00996230599205857,0
9
+ boolq,acc,0.4957186544342508,0.008744734378208071,1
10
+ cb,acc,0.48214285714285715,0.0673769750864465,1
11
+ cb,f1,0.32238805970149254,,1
12
+ copa,acc,0.76,0.04292346959909284,0
13
+ hellaswag,acc,0.43845847440748853,0.0049518409782196935,0
14
+ hellaswag,acc_norm,0.5737900816570405,0.004935143791573816,0
15
+ piqa,acc,0.7312295973884657,0.010343392940090011,0
16
+ piqa,acc_norm,0.7404787812840044,0.010227939888173923,0
17
+ rte,acc,0.516245487364621,0.030080573208738064,0
18
+ sciq,acc,0.92,0.008583336977753655,0
19
+ sciq,acc_norm,0.918,0.008680515615523729,0
20
+ storycloze_2016,acc,0.6964190272581507,0.01063290135851837,0
21
+ winogrande,acc,0.5611681136543015,0.013946933444507032,0
4b284b84b40c4py/evaluation/rankeval/4b284b84b40c4py_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.338,0.014965960710224492,0
3
+ anli_r2,acc,0.324,0.014806864733738859,0
4
+ anli_r3,acc,0.3258333333333333,0.013535422043417455,0
5
+ arc_challenge,acc,0.2721843003412969,0.013006600406423706,0
6
+ arc_challenge,acc_norm,0.3046075085324232,0.013449522109932492,0
7
+ arc_easy,acc,0.6144781144781145,0.009987250004629022,0
8
+ arc_easy,acc_norm,0.6144781144781145,0.009987250004629009,0
9
+ boolq,acc,0.5027522935779817,0.008744922485713838,1
10
+ cb,acc,0.5714285714285714,0.06672848092813058,1
11
+ cb,f1,0.3786646957378665,,1
12
+ copa,acc,0.76,0.04292346959909283,0
13
+ hellaswag,acc,0.43945429197371044,0.004953063404791446,0
14
+ hellaswag,acc_norm,0.5733917546305517,0.004935735300348865,0
15
+ piqa,acc,0.7334058759521219,0.010316749863541367,0
16
+ piqa,acc_norm,0.7366702937976061,0.010276185322196764,0
17
+ rte,acc,0.628158844765343,0.02909101849221743,0
18
+ sciq,acc,0.917,0.00872852720607479,0
19
+ sciq,acc_norm,0.915,0.008823426366942326,0
20
+ storycloze_2016,acc,0.7033671833244255,0.010562819181563219,0
21
+ winogrande,acc,0.5777426992896606,0.013881582030658556,0
4b284b84b60c4py/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.025640802615608842
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.025640802615608842
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.24335202781995585
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.24335202781995585
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2741924846784574
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2741924846784574
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2853194237803746
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2853194237803746
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.29042848350528283
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.29042848350528283
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.29064064925473093
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.29064064925473093
14
+ e2e_nlg_cleaned,5,average,multiple,0.23492897860906842
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.044732369431626645
16
+ gem_xsum,0,median,rouge2_fmeasure,0.044732369431626645
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04672709348462675
18
+ gem_xsum,1,median,rouge2_fmeasure,0.04672709348462675
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.057990701442539934
20
+ gem_xsum,2,median,rouge2_fmeasure,0.057990701442539934
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05897771350337035
22
+ gem_xsum,3,median,rouge2_fmeasure,0.05897771350337035
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.012680674818083815
24
+ gem_xsum,4,median,rouge2_fmeasure,0.012680674818083815
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0
27
+ gem_xsum,5,average,multiple,0.036851425446707914
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05397278892900303
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05397278892900303
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08887163803405253
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.08887163803405253
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.10672158624702527
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.10672158624702527
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.11249106553317505
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.11249106553317505
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.11667508023114785
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.11667508023114785
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.12289814464055694
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.12289814464055694
40
+ web_nlg_en,5,average,multiple,0.10027171726916011
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03882558823895396
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03882558823895396
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.06397045733420877
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.06397045733420877
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.07425667915290382
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.07425667915290382
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.0634569752805522
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.0634569752805522
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.020504615316899424
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.020504615316899424
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0036037511602081455
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0036037511602081455
53
+ wiki_lingua_en,5,average,multiple,0.044103011080621056
4b284b84b60c4py/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.36839650460443496, "bleu_stderr": 0.03693811466125887, "rouge1_fmeasure": 0.12179537949949117, "rouge1_fmeasure_stderr": 0.0017948647375230998, "rouge1_precision": 0.07922516293219223, "rouge1_precision_stderr": 0.0015737269203973088, "rouge1_recall": 0.3785113049770472, "rouge1_recall_stderr": 0.005292653431578969, "rouge2_fmeasure": 0.05397278892900303, "rouge2_fmeasure_stderr": 0.0011186386697521263, "rouge2_precision": 0.035157334401903065, "rouge2_precision_stderr": 0.001038292812328464, "rouge2_recall": 0.1740071715881754, "rouge2_recall_stderr": 0.0034373290428352135, "rougeL_fmeasure": 0.11411692569300623, "rougeL_fmeasure_stderr": 0.0016294969357995113, "rougeL_precision": 0.07429317599389638, "rougeL_precision_stderr": 0.0014825392272162208, "rougeL_recall": 0.35524737297189446, "rougeL_recall_stderr": 0.004866231441317311, "rougeLsum_fmeasure": 0.11398456193236862, "rougeLsum_fmeasure_stderr": 0.001665039306254528, "rougeLsum_precision": 0.07435656832928782, "rougeLsum_precision_stderr": 0.001510876836592405, "rougeLsum_recall": 0.3535235844422809, "rougeLsum_recall_stderr": 0.004855460300144671}}, "1": {"PALM_prompt": {"bleu": 0.708772476352869, "bleu_stderr": 0.04393031796058734, "rouge1_fmeasure": 0.17207517281643395, "rouge1_fmeasure_stderr": 0.003822251897680684, "rouge1_precision": 0.1458720629687531, "rouge1_precision_stderr": 0.00451504556268303, "rouge1_recall": 0.3521921831349989, "rouge1_recall_stderr": 0.005288865737427393, "rouge2_fmeasure": 0.08887163803405253, "rouge2_fmeasure_stderr": 0.0026144610518286987, "rouge2_precision": 0.07732803505568758, "rouge2_precision_stderr": 0.0031347730381023715, "rouge2_recall": 0.18280142068018543, "rouge2_recall_stderr": 0.0037694151864380836, "rougeL_fmeasure": 0.15481112377317344, "rougeL_fmeasure_stderr": 0.0032423689025774024, "rougeL_precision": 0.1300518109880955, "rougeL_precision_stderr": 0.0039374610751503275, "rougeL_recall": 0.3259584316759415, "rougeL_recall_stderr": 0.004757385276169493, "rougeLsum_fmeasure": 0.1584200505299405, "rougeLsum_fmeasure_stderr": 0.0033550764978907298, "rougeLsum_precision": 0.13344096336404151, "rougeLsum_precision_stderr": 0.004047605710023097, "rougeLsum_recall": 0.3307855376911941, "rougeLsum_recall_stderr": 0.0048395525583811464}}, "2": {"PALM_prompt": {"bleu": 0.7770723623195508, "bleu_stderr": 0.04176315583056231, "rouge1_fmeasure": 0.19711044409546197, "rouge1_fmeasure_stderr": 0.004294252460986334, "rouge1_precision": 0.17692396957662454, "rouge1_precision_stderr": 0.005223443663869332, "rouge1_recall": 0.3640751335837825, "rouge1_recall_stderr": 0.005009559230284251, "rouge2_fmeasure": 0.10672158624702527, "rouge2_fmeasure_stderr": 0.0030481247835459787, "rouge2_precision": 0.09868482696897406, "rouge2_precision_stderr": 0.003653042106982874, "rouge2_recall": 0.1971606563639153, "rouge2_recall_stderr": 0.0038459561576765873, "rougeL_fmeasure": 0.17696992361161118, "rougeL_fmeasure_stderr": 0.003654034239494166, "rougeL_precision": 0.1570598921962305, "rougeL_precision_stderr": 0.004531572534533353, "rougeL_recall": 0.3382539397790294, "rougeL_recall_stderr": 0.004571024122696908, "rougeLsum_fmeasure": 0.181654316729077, "rougeLsum_fmeasure_stderr": 0.0037953787082525593, "rougeLsum_precision": 0.16225382600990382, "rougeLsum_precision_stderr": 0.004723289424110052, "rougeLsum_recall": 0.34342990259729905, "rougeLsum_recall_stderr": 0.004641512448934588}}, "3": {"PALM_prompt": {"bleu": 0.9367128093036233, "bleu_stderr": 0.03579111165340011, "rouge1_fmeasure": 0.2048222329217373, "rouge1_fmeasure_stderr": 0.004533660282815979, "rouge1_precision": 0.18496669640320706, "rouge1_precision_stderr": 0.005448599910623124, "rouge1_recall": 0.37001173008491406, "rouge1_recall_stderr": 0.004982108375668671, "rouge2_fmeasure": 0.11249106553317505, "rouge2_fmeasure_stderr": 0.0032786441154541565, "rouge2_precision": 0.10426351077075448, "rouge2_precision_stderr": 0.0038464983057059105, "rouge2_recall": 0.20168533903248367, "rouge2_recall_stderr": 0.003935196680838187, "rougeL_fmeasure": 0.18227904731218098, "rougeL_fmeasure_stderr": 0.0037984019972325516, "rougeL_precision": 0.16237966039828178, "rougeL_precision_stderr": 0.004673370716029179, "rougeL_recall": 0.34236506268827255, "rougeL_recall_stderr": 0.00448598034706228, "rougeLsum_fmeasure": 0.18798817638675805, "rougeLsum_fmeasure_stderr": 0.003999262450258021, "rougeLsum_precision": 0.16918545233017043, "rougeLsum_precision_stderr": 0.004946077065758279, "rougeLsum_recall": 0.34783232870163594, "rougeLsum_recall_stderr": 0.004555165878898018}}, "4": {"PALM_prompt": {"bleu": 1.0092809699245466, "bleu_stderr": 0.06928673219809903, "rouge1_fmeasure": 0.2114324222905099, "rouge1_fmeasure_stderr": 0.004499354867610648, "rouge1_precision": 0.1937643219652831, "rouge1_precision_stderr": 0.005520380527696231, "rouge1_recall": 0.37732666900342693, "rouge1_recall_stderr": 0.004886546479505209, "rouge2_fmeasure": 0.11667508023114785, "rouge2_fmeasure_stderr": 0.003299041888108482, "rouge2_precision": 0.11000589837541341, "rouge2_precision_stderr": 0.003937288433393638, "rouge2_recall": 0.2080696615655487, "rouge2_recall_stderr": 0.003963224104038742, "rougeL_fmeasure": 0.18760366218337216, "rougeL_fmeasure_stderr": 0.0037747832335749444, "rougeL_precision": 0.16920030318742038, "rougeL_precision_stderr": 0.004680541970034783, "rougeL_recall": 0.3486193967767282, "rougeL_recall_stderr": 0.004424327241332651, "rougeLsum_fmeasure": 0.19492988802490385, "rougeLsum_fmeasure_stderr": 0.004014420843918751, "rougeLsum_precision": 0.17756680284440748, "rougeLsum_precision_stderr": 0.005000407035533921, "rougeLsum_recall": 0.3561503750877715, "rougeLsum_recall_stderr": 0.004515958205661666}}, "5": {"PALM_prompt": {"bleu": 1.1046318099210415, "bleu_stderr": 0.07157222979617221, "rouge1_fmeasure": 0.2219602030229684, "rouge1_fmeasure_stderr": 0.004622263542257135, "rouge1_precision": 0.20561675789235484, "rouge1_precision_stderr": 0.00572302618253094, "rouge1_recall": 0.38820033672553494, "rouge1_recall_stderr": 0.004901890534729385, "rouge2_fmeasure": 0.12289814464055694, "rouge2_fmeasure_stderr": 0.0033081688119242073, "rouge2_precision": 0.1171550679188743, "rouge2_precision_stderr": 0.003991472585370963, "rouge2_recall": 0.2140777082494364, "rouge2_recall_stderr": 0.003923458177659614, "rougeL_fmeasure": 0.1956381512562096, "rougeL_fmeasure_stderr": 0.0038400384984683153, "rougeL_precision": 0.1787127470917118, "rougeL_precision_stderr": 0.004849781714681424, "rougeL_recall": 0.3561757037878013, "rougeL_recall_stderr": 0.004412086857381878, "rougeLsum_fmeasure": 0.20338527358608494, "rougeLsum_fmeasure_stderr": 0.004088841622210635, "rougeLsum_precision": 0.1878332782023582, "rougeLsum_precision_stderr": 0.005187840313252927, "rougeLsum_recall": 0.36409738385890505, "rougeLsum_recall_stderr": 0.004526199710902854}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.9324378195827014, "bleu_stderr": 0.0922003845635032, "rouge1_fmeasure": 0.17988493654272839, "rouge1_fmeasure_stderr": 0.0020272336291818345, "rouge1_precision": 0.1587407821250192, "rouge1_precision_stderr": 0.002110133515609183, "rouge1_recall": 0.25133500451234014, "rouge1_recall_stderr": 0.002887496987715359, "rouge2_fmeasure": 0.03882558823895396, "rouge2_fmeasure_stderr": 0.00093704538880766, "rouge2_precision": 0.034019220383001, "rouge2_precision_stderr": 0.0008605712206971714, "rouge2_recall": 0.056015167620536196, "rouge2_recall_stderr": 0.0014904122848923534, "rougeL_fmeasure": 0.1405612415400349, "rougeL_fmeasure_stderr": 0.0014783931161189185, "rougeL_precision": 0.12274574507766053, "rougeL_precision_stderr": 0.0015200925341907296, "rougeL_recall": 0.20049226685984714, "rougeL_recall_stderr": 0.0022998855770912335, "rougeLsum_fmeasure": 0.16661717731404257, "rougeLsum_fmeasure_stderr": 0.0018778999537756222, "rougeLsum_precision": 0.14702400397695498, "rougeLsum_precision_stderr": 0.0019628489431421613, "rougeLsum_recall": 0.23325836780260698, "rougeLsum_recall_stderr": 0.0026879972958097345}}, "1": {"tldr_en": {"bleu": 3.41081833251504, "bleu_stderr": 0.06343743457569372, "rouge1_fmeasure": 0.2235532954064031, "rouge1_fmeasure_stderr": 0.002308173349020659, "rouge1_precision": 0.35639294302831104, "rouge1_precision_stderr": 0.004534803700821476, "rouge1_recall": 0.2228183578951572, "rouge1_recall_stderr": 0.0028955151858489525, "rouge2_fmeasure": 0.06397045733420877, "rouge2_fmeasure_stderr": 0.0014650196740003158, "rouge2_precision": 0.11328898491742163, "rouge2_precision_stderr": 0.0030875507010303047, "rouge2_recall": 0.062316781584024046, "rouge2_recall_stderr": 0.0015629418397744195, "rougeL_fmeasure": 0.17121348798068653, "rougeL_fmeasure_stderr": 0.0018230069098997413, "rougeL_precision": 0.28240631708879516, "rougeL_precision_stderr": 0.003968505621842457, "rougeL_recall": 0.16919013590257986, "rougeL_recall_stderr": 0.002237667599118631, "rougeLsum_fmeasure": 0.21003399487968705, "rougeLsum_fmeasure_stderr": 0.0021741417096366156, "rougeLsum_precision": 0.3364307156940188, "rougeLsum_precision_stderr": 0.004351581982419238, "rougeLsum_recall": 0.20914463341958509, "rougeLsum_recall_stderr": 0.0027193111199466493}}, "2": {"tldr_en": {"bleu": 3.6563610665636794, "bleu_stderr": 0.10787975783270783, "rouge1_fmeasure": 0.24112618033023753, "rouge1_fmeasure_stderr": 0.0022853285551182115, "rouge1_precision": 0.40174147363225615, "rouge1_precision_stderr": 0.0045403193404830846, "rouge1_recall": 0.2269767255767654, "rouge1_recall_stderr": 0.002767346678189479, "rouge2_fmeasure": 0.07425667915290382, "rouge2_fmeasure_stderr": 0.0015526421366264772, "rouge2_precision": 0.1373453104959268, "rouge2_precision_stderr": 0.003340863409866677, "rouge2_recall": 0.06737578253523521, "rouge2_recall_stderr": 0.0015292379603900957, "rougeL_fmeasure": 0.1878357273959955, "rougeL_fmeasure_stderr": 0.0018659871477781963, "rougeL_precision": 0.3215819498553555, "rougeL_precision_stderr": 0.004011574279712789, "rougeL_recall": 0.17516150008556522, "rougeL_recall_stderr": 0.0021922709009645704, "rougeLsum_fmeasure": 0.22773988060730666, "rougeLsum_fmeasure_stderr": 0.002176030108081828, "rougeLsum_precision": 0.3814738064825408, "rougeLsum_precision_stderr": 0.0044112779985563035, "rougeLsum_recall": 0.21392644170190447, "rougeLsum_recall_stderr": 0.002612141320903455}}, "3": {"tldr_en": {"bleu": 2.4978677805017724, "bleu_stderr": 0.11433415620279655, "rouge1_fmeasure": 0.2007865981136526, "rouge1_fmeasure_stderr": 0.0027085272543998233, "rouge1_precision": 0.34091967260653555, "rouge1_precision_stderr": 0.004974952628314982, "rouge1_recall": 0.18691495200266725, "rouge1_recall_stderr": 0.003023876240323042, "rouge2_fmeasure": 0.0634569752805522, "rouge2_fmeasure_stderr": 0.0015946482081264856, "rouge2_precision": 0.11836093862121347, "rouge2_precision_stderr": 0.003292320078133778, "rouge2_recall": 0.057668952389103974, "rouge2_recall_stderr": 0.0016015296485184332, "rougeL_fmeasure": 0.15883673382682365, "rougeL_fmeasure_stderr": 0.002214432406640885, "rougeL_precision": 0.2777306848059067, "rougeL_precision_stderr": 0.004354568917856371, "rougeL_recall": 0.14660151301366933, "rougeL_recall_stderr": 0.00243538969404984, "rougeLsum_fmeasure": 0.18979050790851731, "rougeLsum_fmeasure_stderr": 0.002575555003612789, "rougeLsum_precision": 0.32459839325522427, "rougeLsum_precision_stderr": 0.0048207258054161215, "rougeLsum_recall": 0.176110169169721, "rougeLsum_recall_stderr": 0.0028508082420462925}}, "4": {"tldr_en": {"bleu": 0.027296056063556173, "bleu_stderr": 0.005452504537000318, "rouge1_fmeasure": 0.06618326907121005, "rouge1_fmeasure_stderr": 0.002357199629069961, "rouge1_precision": 0.11240024847483139, "rouge1_precision_stderr": 0.004132576106531944, "rouge1_recall": 0.06216956239183844, "rouge1_recall_stderr": 0.002418982032497166, "rouge2_fmeasure": 0.020504615316899424, "rouge2_fmeasure_stderr": 0.0010938880601892506, "rouge2_precision": 0.03918166323751805, "rouge2_precision_stderr": 0.002232701270896771, "rouge2_recall": 0.01896169901075388, "rouge2_recall_stderr": 0.0011238187004286074, "rougeL_fmeasure": 0.053015069141547445, "rougeL_fmeasure_stderr": 0.0019054031762975026, "rougeL_precision": 0.09232725830549246, "rougeL_precision_stderr": 0.0035009608082627874, "rougeL_recall": 0.04944577132177331, "rougeL_recall_stderr": 0.0019370474215297743, "rougeLsum_fmeasure": 0.061925651353736, "rougeLsum_fmeasure_stderr": 0.0022134341623464993, "rougeLsum_precision": 0.10639512799831675, "rougeLsum_precision_stderr": 0.003959010336722984, "rougeLsum_recall": 0.0579688671559081, "rougeLsum_recall_stderr": 0.0022585926905087242}}, "5": {"tldr_en": {"bleu": 1.3395696093100012e-17, "bleu_stderr": 1.6188031570042197e-15, "rouge1_fmeasure": 0.010578856116046968, "rouge1_fmeasure_stderr": 0.0010587962974140785, "rouge1_precision": 0.018794965559043822, "rouge1_precision_stderr": 0.0019103856487032388, "rouge1_recall": 0.009739253987019405, "rouge1_recall_stderr": 0.001028022736543038, "rouge2_fmeasure": 0.0036037511602081455, "rouge2_fmeasure_stderr": 0.0004967762048401135, "rouge2_precision": 0.007035041192131134, "rouge2_precision_stderr": 0.0010544020631658667, "rouge2_recall": 0.003115860649260451, "rouge2_recall_stderr": 0.00042988568749742647, "rougeL_fmeasure": 0.008694132539875133, "rougeL_fmeasure_stderr": 0.0008868664008104218, "rougeL_precision": 0.01597995382276719, "rougeL_precision_stderr": 0.0016907973645642418, "rougeL_recall": 0.007932505786965948, "rougeL_recall_stderr": 0.0008492134765906592, "rougeLsum_fmeasure": 0.009885706837554721, "rougeLsum_fmeasure_stderr": 0.0009930372784227673, "rougeLsum_precision": 0.01766856835979721, "rougeLsum_precision_stderr": 0.001804010001840347, "rougeLsum_recall": 0.00911773298032118, "rougeLsum_recall_stderr": 0.0009710350459859406}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.42812449271200076, "bleu_stderr": 0.03593952863513392, "rouge1_fmeasure": 0.16111833165340633, "rouge1_fmeasure_stderr": 0.0013228073404982464, "rouge1_precision": 0.1704981126291811, "rouge1_precision_stderr": 0.0013338397083228846, "rouge1_recall": 0.16250827808613028, "rouge1_recall_stderr": 0.001695638847503166, "rouge2_fmeasure": 0.025640802615608842, "rouge2_fmeasure_stderr": 0.0007913648955649943, "rouge2_precision": 0.025202596042333106, "rouge2_precision_stderr": 0.0007271421139071685, "rouge2_recall": 0.028534248474045745, "rouge2_recall_stderr": 0.0009943145039016717, "rougeL_fmeasure": 0.12055722052982439, "rougeL_fmeasure_stderr": 0.0010184730270181702, "rougeL_precision": 0.12730412210960618, "rougeL_precision_stderr": 0.0010054473303058402, "rougeL_recall": 0.12210325694317206, "rougeL_recall_stderr": 0.0013280429419495875, "rougeLsum_fmeasure": 0.14772535858728192, "rougeLsum_fmeasure_stderr": 0.0012134722386092569, "rougeLsum_precision": 0.15639796181494445, "rougeLsum_precision_stderr": 0.001232965501375508, "rougeLsum_recall": 0.14898698501762475, "rougeLsum_recall_stderr": 0.00155443556811081}}, "1": {"generate_text_restaurant": {"bleu": 13.247221385803988, "bleu_stderr": 0.14220552756821703, "rouge1_fmeasure": 0.4973916877386909, "rouge1_fmeasure_stderr": 0.0023727417832446964, "rouge1_precision": 0.6086045777197723, "rouge1_precision_stderr": 0.003236210007098886, "rouge1_recall": 0.46003438384829903, "rouge1_recall_stderr": 0.003061540592911205, "rouge2_fmeasure": 0.24335202781995585, "rouge2_fmeasure_stderr": 0.0021397393667393345, "rouge2_precision": 0.30299801357226896, "rouge2_precision_stderr": 0.0028574994951330785, "rouge2_recall": 0.2244547532852298, "rouge2_recall_stderr": 0.002266111204059807, "rougeL_fmeasure": 0.3610792363091501, "rougeL_fmeasure_stderr": 0.0021708910521971844, "rougeL_precision": 0.4457052256579531, "rougeL_precision_stderr": 0.003102587687098008, "rougeL_recall": 0.33263854068854276, "rougeL_recall_stderr": 0.002514293143182877, "rougeLsum_fmeasure": 0.4070097927724991, "rougeLsum_fmeasure_stderr": 0.0024071167864800603, "rougeLsum_precision": 0.49973264184554195, "rougeLsum_precision_stderr": 0.0032800184973764096, "rougeLsum_recall": 0.3756533771172665, "rougeLsum_recall_stderr": 0.0028125276621560214}}, "2": {"generate_text_restaurant": {"bleu": 15.795325428190091, "bleu_stderr": 0.19354816394840407, "rouge1_fmeasure": 0.5348334029766927, "rouge1_fmeasure_stderr": 0.002302993203227098, "rouge1_precision": 0.6262977832855142, "rouge1_precision_stderr": 0.0030378843936176092, "rouge1_recall": 0.5012471507744696, "rouge1_recall_stderr": 0.002971147405537894, "rouge2_fmeasure": 0.2741924846784574, "rouge2_fmeasure_stderr": 0.00227334101090194, "rouge2_precision": 0.32480905479986955, "rouge2_precision_stderr": 0.002856490337910367, "rouge2_recall": 0.25668318234712745, "rouge2_recall_stderr": 0.0024157694030146693, "rougeL_fmeasure": 0.3885968924573731, "rougeL_fmeasure_stderr": 0.002278632944564752, "rougeL_precision": 0.4569375770685092, "rougeL_precision_stderr": 0.0029960044893023088, "rougeL_recall": 0.36349781442639345, "rougeL_recall_stderr": 0.002594278395414943, "rougeLsum_fmeasure": 0.4414536559352786, "rougeLsum_fmeasure_stderr": 0.0024590884505842894, "rougeLsum_precision": 0.5177649533471562, "rougeLsum_precision_stderr": 0.0031698207789464476, "rougeLsum_recall": 0.41324492047913813, "rougeLsum_recall_stderr": 0.0028470360934478253}}, "3": {"generate_text_restaurant": {"bleu": 16.843742302400223, "bleu_stderr": 0.17227935435033276, "rouge1_fmeasure": 0.5464856858081838, "rouge1_fmeasure_stderr": 0.0023122490369582955, "rouge1_precision": 0.6282949929704077, "rouge1_precision_stderr": 0.003059057789987167, "rouge1_recall": 0.51680840079262, "rouge1_recall_stderr": 0.002942470701126268, "rouge2_fmeasure": 0.2853194237803746, "rouge2_fmeasure_stderr": 0.0022905380381352523, "rouge2_precision": 0.33058645220476474, "rouge2_precision_stderr": 0.002790981489366873, "rouge2_recall": 0.2697773045898863, "rouge2_recall_stderr": 0.0024449939958337075, "rougeL_fmeasure": 0.39545828835235125, "rougeL_fmeasure_stderr": 0.002297979755546528, "rougeL_precision": 0.4560736776191547, "rougeL_precision_stderr": 0.002953788243801524, "rougeL_recall": 0.37322618247394634, "rougeL_recall_stderr": 0.0025866935045218294, "rougeLsum_fmeasure": 0.45203864297161794, "rougeLsum_fmeasure_stderr": 0.002475859302189518, "rougeLsum_precision": 0.5195157725542344, "rougeLsum_precision_stderr": 0.0031060960719298224, "rougeLsum_recall": 0.42733270921434857, "rougeLsum_recall_stderr": 0.0028523523692084675}}, "4": {"generate_text_restaurant": {"bleu": 17.399269756968444, "bleu_stderr": 0.16188339080546563, "rouge1_fmeasure": 0.5532251605346499, "rouge1_fmeasure_stderr": 0.0022847219763500745, "rouge1_precision": 0.634594502025717, "rouge1_precision_stderr": 0.0030656173711672854, "rouge1_recall": 0.5222402912263271, "rouge1_recall_stderr": 0.002886085844977843, "rouge2_fmeasure": 0.29042848350528283, "rouge2_fmeasure_stderr": 0.002349569754314753, "rouge2_precision": 0.3359798617820434, "rouge2_precision_stderr": 0.0028620896080978273, "rouge2_recall": 0.27410547644527544, "rouge2_recall_stderr": 0.002497771616842685, "rougeL_fmeasure": 0.4004341563068622, "rougeL_fmeasure_stderr": 0.002323893750882234, "rougeL_precision": 0.45985203916820844, "rougeL_precision_stderr": 0.0029511298780480707, "rougeL_recall": 0.37799957829619235, "rougeL_recall_stderr": 0.0026279876955639324, "rougeLsum_fmeasure": 0.4598816082988491, "rougeLsum_fmeasure_stderr": 0.002487013453825871, "rougeLsum_precision": 0.5267970840604689, "rougeLsum_precision_stderr": 0.0031119369072898856, "rougeLsum_recall": 0.43444392259381265, "rougeLsum_recall_stderr": 0.0028648632023860416}}, "5": {"generate_text_restaurant": {"bleu": 17.19780535137731, "bleu_stderr": 0.2075967898815662, "rouge1_fmeasure": 0.5541112966804755, "rouge1_fmeasure_stderr": 0.002255001054091324, "rouge1_precision": 0.6338905877705581, "rouge1_precision_stderr": 0.0030547047418891904, "rouge1_recall": 0.522789967000048, "rouge1_recall_stderr": 0.0028168748936535045, "rouge2_fmeasure": 0.29064064925473093, "rouge2_fmeasure_stderr": 0.0022903584986981493, "rouge2_precision": 0.33510143626605604, "rouge2_precision_stderr": 0.0028081812085845744, "rouge2_recall": 0.27421724337581194, "rouge2_recall_stderr": 0.0024212577256025832, "rougeL_fmeasure": 0.4023384160512089, "rougeL_fmeasure_stderr": 0.002289742273920384, "rougeL_precision": 0.461012242271556, "rougeL_precision_stderr": 0.0029540047890377285, "rougeL_recall": 0.37952909816891267, "rougeL_recall_stderr": 0.002563629564031177, "rougeLsum_fmeasure": 0.4610941761776966, "rougeLsum_fmeasure_stderr": 0.0024690225692075383, "rougeLsum_precision": 0.5274205387508322, "rougeLsum_precision_stderr": 0.0031342343594973837, "rougeLsum_recall": 0.43485769994303725, "rougeLsum_recall_stderr": 0.002791067097572193}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.82774643364302, "bleu_stderr": 0.09185502717273288, "rouge1_fmeasure": 0.21092842594846697, "rouge1_fmeasure_stderr": 0.0024422237278152676, "rouge1_precision": 0.16185818612685765, "rouge1_precision_stderr": 0.0021298548938891364, "rouge1_recall": 0.3417885033136733, "rouge1_recall_stderr": 0.0043386227237333114, "rouge2_fmeasure": 0.044732369431626645, "rouge2_fmeasure_stderr": 0.0014622575988439016, "rouge2_precision": 0.03344425091536045, "rouge2_precision_stderr": 0.001138155435797099, "rouge2_recall": 0.07619065688587215, "rouge2_recall_stderr": 0.0026227249124846155, "rougeL_fmeasure": 0.1537478229884277, "rougeL_fmeasure_stderr": 0.0018117237209179383, "rougeL_precision": 0.11803353022964283, "rougeL_precision_stderr": 0.0016063075124484745, "rougeL_recall": 0.25073105375655186, "rougeL_recall_stderr": 0.0033843541491142843, "rougeLsum_fmeasure": 0.16696764847671122, "rougeLsum_fmeasure_stderr": 0.0020396578647428917, "rougeLsum_precision": 0.12753567648031114, "rougeLsum_precision_stderr": 0.0017136718255235162, "rougeLsum_recall": 0.2731872954114564, "rougeLsum_recall_stderr": 0.0037952177465157954}}, "1": {"article_DOC_summary": {"bleu": 1.7770709996701801, "bleu_stderr": 0.11553547694333567, "rouge1_fmeasure": 0.2094533876793256, "rouge1_fmeasure_stderr": 0.003209359027680064, "rouge1_precision": 0.19351676396852305, "rouge1_precision_stderr": 0.0040092070739415385, "rouge1_recall": 0.2837904793617724, "rouge1_recall_stderr": 0.003977222043243443, "rouge2_fmeasure": 0.04672709348462675, "rouge2_fmeasure_stderr": 0.0019868362415111454, "rouge2_precision": 0.04499950744353185, "rouge2_precision_stderr": 0.00217424463498997, "rouge2_recall": 0.06095764821333711, "rouge2_recall_stderr": 0.0024452528240745852, "rougeL_fmeasure": 0.15884186810923107, "rougeL_fmeasure_stderr": 0.0025392450826167046, "rougeL_precision": 0.14655234786832474, "rougeL_precision_stderr": 0.0031482823845961926, "rougeL_recall": 0.21676483815893888, "rougeL_recall_stderr": 0.003210209647773703, "rougeLsum_fmeasure": 0.1637213451126967, "rougeLsum_fmeasure_stderr": 0.0025821353633863208, "rougeLsum_precision": 0.15016619700140588, "rougeLsum_precision_stderr": 0.0031473927196044387, "rougeLsum_recall": 0.22501230268304348, "rougeLsum_recall_stderr": 0.0034280059726790927}}, "2": {"article_DOC_summary": {"bleu": 2.7302243001064608, "bleu_stderr": 0.13092322861350983, "rouge1_fmeasure": 0.2425796498051972, "rouge1_fmeasure_stderr": 0.003644299173817006, "rouge1_precision": 0.2582865268725016, "rouge1_precision_stderr": 0.004545430952687285, "rouge1_recall": 0.2560290399304562, "rouge1_recall_stderr": 0.0037484892423799345, "rouge2_fmeasure": 0.057990701442539934, "rouge2_fmeasure_stderr": 0.002332398084011424, "rouge2_precision": 0.06385365074215316, "rouge2_precision_stderr": 0.002696782990365961, "rouge2_recall": 0.058943932082595595, "rouge2_recall_stderr": 0.00235735901100326, "rougeL_fmeasure": 0.18282395563529474, "rougeL_fmeasure_stderr": 0.002869149609053893, "rougeL_precision": 0.19469946748637232, "rougeL_precision_stderr": 0.0035803062809549023, "rougeL_recall": 0.19366376666848154, "rougeL_recall_stderr": 0.0029687037915645093, "rougeLsum_fmeasure": 0.18531683703581192, "rougeLsum_fmeasure_stderr": 0.0028851030877910795, "rougeLsum_precision": 0.1969779372180501, "rougeLsum_precision_stderr": 0.0035834644994916995, "rougeLsum_recall": 0.19685369864744215, "rougeLsum_recall_stderr": 0.0030365791580345657}}, "3": {"article_DOC_summary": {"bleu": 3.3527230846402456, "bleu_stderr": 0.1568059055742541, "rouge1_fmeasure": 0.23874337127435116, "rouge1_fmeasure_stderr": 0.003968049625410433, "rouge1_precision": 0.2620037574708844, "rouge1_precision_stderr": 0.00475771849789178, "rouge1_recall": 0.23956862708215287, "rouge1_recall_stderr": 0.004086077247863763, "rouge2_fmeasure": 0.05897771350337035, "rouge2_fmeasure_stderr": 0.0025160964826499567, "rouge2_precision": 0.06538030895524288, "rouge2_precision_stderr": 0.0028546535348718855, "rouge2_recall": 0.058252191699842505, "rouge2_recall_stderr": 0.0025252795163929805, "rougeL_fmeasure": 0.18155346847966253, "rougeL_fmeasure_stderr": 0.003198458165973925, "rougeL_precision": 0.19986216708164317, "rougeL_precision_stderr": 0.0038691972628647217, "rougeL_recall": 0.18210791064611975, "rougeL_recall_stderr": 0.003276204955871331, "rougeLsum_fmeasure": 0.18337069196001973, "rougeLsum_fmeasure_stderr": 0.003201131063755417, "rougeLsum_precision": 0.20159040068033138, "rougeLsum_precision_stderr": 0.003862917301607155, "rougeLsum_recall": 0.1843274963260961, "rougeLsum_recall_stderr": 0.00330910927545364}}, "4": {"article_DOC_summary": {"bleu": 0.05822825874081429, "bleu_stderr": 0.01644695163797456, "rouge1_fmeasure": 0.05583381136942088, "rouge1_fmeasure_stderr": 0.0033778520603231508, "rouge1_precision": 0.07204809256967028, "rouge1_precision_stderr": 0.004504905988140821, "rouge1_recall": 0.05223149780623666, "rouge1_recall_stderr": 0.0032037100337574767, "rouge2_fmeasure": 0.012680674818083815, "rouge2_fmeasure_stderr": 0.0012466206862505908, "rouge2_precision": 0.016683925104854617, "rouge2_precision_stderr": 0.0018492907460814511, "rouge2_recall": 0.011406975102705814, "rouge2_recall_stderr": 0.0011263344664469651, "rougeL_fmeasure": 0.04222746459224138, "rougeL_fmeasure_stderr": 0.0025625895022892376, "rougeL_precision": 0.05595959490072549, "rougeL_precision_stderr": 0.003627186961146684, "rougeL_recall": 0.0393015653450511, "rougeL_recall_stderr": 0.0024120043012467343, "rougeLsum_fmeasure": 0.042274058417037116, "rougeLsum_fmeasure_stderr": 0.002566919396984918, "rougeLsum_precision": 0.05598352171101143, "rougeLsum_precision_stderr": 0.003628615758216825, "rougeLsum_recall": 0.039335659315169115, "rougeLsum_recall_stderr": 0.0024152603185751574}}, "5": {"article_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, "rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}}}
4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.336,0.014944140233795023,0
3
+ anli_r2,acc,0.335,0.014933117490932573,0
4
+ anli_r3,acc,0.335,0.013630871843821467,0
5
+ arc_challenge,acc,0.26621160409556316,0.012915774781523209,0
6
+ arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0
7
+ arc_easy,acc,0.6022727272727273,0.010042861602178058,0
8
+ arc_easy,acc_norm,0.5353535353535354,0.010234104543411428,0
9
+ boolq,acc,0.5718654434250765,0.00865425341578108,1
10
+ cb,acc,0.35714285714285715,0.06460957383809221,1
11
+ cb,f1,0.24776800260671225,,1
12
+ copa,acc,0.77,0.04229525846816506,0
13
+ hellaswag,acc,0.46046604262099183,0.004974159561342699,0
14
+ hellaswag,acc_norm,0.6019717187811193,0.004884909544477094,0
15
+ piqa,acc,0.7459194776931447,0.010157271999135043,0
16
+ piqa,acc_norm,0.7616974972796517,0.009940334245876219,0
17
+ rte,acc,0.5595667870036101,0.029882123363118705,0
18
+ sciq,acc,0.857,0.01107581480856704,0
19
+ sciq,acc_norm,0.78,0.013106173040661766,0
20
+ storycloze_2016,acc,0.7097808658471406,0.010495529690730063,0
21
+ winogrande,acc,0.584846093133386,0.013848684086658588,0
4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.308,0.01460648312734276,0
3
+ anli_r2,acc,0.309,0.014619600977206498,0
4
+ anli_r3,acc,0.3441666666666667,0.013720551062295755,0
5
+ arc_challenge,acc,0.28498293515358364,0.013191348179838793,0
6
+ arc_challenge,acc_norm,0.30887372013651876,0.013501770929344003,0
7
+ arc_easy,acc,0.6182659932659933,0.009968648851839668,0
8
+ arc_easy,acc_norm,0.6026936026936027,0.010041053078884287,0
9
+ boolq,acc,0.5752293577981651,0.008645503833361103,1
10
+ cb,acc,0.48214285714285715,0.0673769750864465,1
11
+ cb,f1,0.3140096618357488,,1
12
+ copa,acc,0.73,0.0446196043338474,0
13
+ hellaswag,acc,0.457876916948815,0.004972042602001379,0
14
+ hellaswag,acc_norm,0.599681338378809,0.004889615413144207,0
15
+ piqa,acc,0.7410228509249184,0.010220966031405616,0
16
+ piqa,acc_norm,0.750272034820457,0.01009923296986747,0
17
+ rte,acc,0.592057761732852,0.02958195251960619,0
18
+ sciq,acc,0.911,0.009008893392651532,0
19
+ sciq,acc_norm,0.903,0.009363689373248113,0
20
+ storycloze_2016,acc,0.7012292891501871,0.010584692134739972,0
21
+ winogrande,acc,0.5895816890292028,0.01382510712003587,0
4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.301,0.014512395033543143,0
3
+ anli_r2,acc,0.355,0.015139491543780532,0
4
+ anli_r3,acc,0.33916666666666667,0.013672343491681815,0
5
+ arc_challenge,acc,0.2883959044368601,0.013238394422428182,0
6
+ arc_challenge,acc_norm,0.30716723549488056,0.013481034054980943,0
7
+ arc_easy,acc,0.6300505050505051,0.009906656266021153,0
8
+ arc_easy,acc_norm,0.6216329966329966,0.009951575683331952,0
9
+ boolq,acc,0.6015290519877676,0.00856286653334057,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.24787825577992387,,1
12
+ copa,acc,0.72,0.04512608598542128,0
13
+ hellaswag,acc,0.4596693885680143,0.004973522582431219,0
14
+ hellaswag,acc_norm,0.6048595897231627,0.004878816961012039,0
15
+ piqa,acc,0.7415669205658324,0.010213971636773315,0
16
+ piqa,acc_norm,0.7573449401523396,0.01000200256970869,0
17
+ rte,acc,0.5054151624548736,0.030094698123239966,0
18
+ sciq,acc,0.915,0.008823426366942323,0
19
+ sciq,acc_norm,0.914,0.008870325962594766,0
20
+ storycloze_2016,acc,0.7071084981293426,0.01052387329324631,0
21
+ winogrande,acc,0.5840568271507498,0.013852485356798254,0
4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.327,0.014842213153411242,0
3
+ anli_r2,acc,0.327,0.014842213153411244,0
4
+ anli_r3,acc,0.33166666666666667,0.013596836729485163,0
5
+ arc_challenge,acc,0.29692832764505117,0.013352025976725223,0
6
+ arc_challenge,acc_norm,0.32081911262798635,0.01364094309194653,0
7
+ arc_easy,acc,0.6380471380471381,0.009860991466688479,0
8
+ arc_easy,acc_norm,0.6321548821548821,0.009894923464455186,0
9
+ boolq,acc,0.5868501529051988,0.008612117547803595,1
10
+ cb,acc,0.5535714285714286,0.06703189227942397,1
11
+ cb,f1,0.48686323164501527,,1
12
+ copa,acc,0.8,0.04020151261036846,0
13
+ hellaswag,acc,0.45907189802828124,0.004973036453863712,0
14
+ hellaswag,acc_norm,0.6064528978291177,0.004875379352079826,0
15
+ piqa,acc,0.7459194776931447,0.010157271999135041,0
16
+ piqa,acc_norm,0.7568008705114254,0.01000961195385892,0
17
+ rte,acc,0.5631768953068592,0.02985524739031494,0
18
+ sciq,acc,0.918,0.008680515615523719,0
19
+ sciq,acc_norm,0.926,0.008282064512704164,0
20
+ storycloze_2016,acc,0.7145911277391769,0.01044339588406212,0
21
+ winogrande,acc,0.5911602209944752,0.013816954295135696,0
4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.324,0.01480686473373886,0
3
+ anli_r2,acc,0.349,0.015080663991563095,0
4
+ anli_r3,acc,0.35,0.013774667009018552,0
5
+ arc_challenge,acc,0.3097269624573379,0.01351205841523836,0
6
+ arc_challenge,acc_norm,0.3191126279863481,0.013621696119173307,0
7
+ arc_easy,acc,0.6422558922558923,0.00983577275734336,0
8
+ arc_easy,acc_norm,0.6308922558922558,0.00990198741024272,0
9
+ boolq,acc,0.5773700305810398,0.008639722698719017,1
10
+ cb,acc,0.44642857142857145,0.06703189227942398,1
11
+ cb,f1,0.3162501893078904,,1
12
+ copa,acc,0.75,0.04351941398892446,0
13
+ hellaswag,acc,0.4609639514041028,0.004974551179483944,0
14
+ hellaswag,acc_norm,0.608743278231428,0.004870342592915045,0
15
+ piqa,acc,0.7470076169749728,0.010142888698862458,0
16
+ piqa,acc_norm,0.7573449401523396,0.010002002569708688,0
17
+ rte,acc,0.5018050541516246,0.030096267148976626,0
18
+ sciq,acc,0.92,0.008583336977753653,0
19
+ sciq,acc_norm,0.921,0.008534156773333431,0
20
+ storycloze_2016,acc,0.7161945483698557,0.010425696279730919,0
21
+ winogrande,acc,0.595895816890292,0.013791610664670863,0
4b284b84b60c4py/evaluation/rankeval/4b284b84b60c4py_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.332,0.014899597242811476,0
3
+ anli_r2,acc,0.322,0.014782913600996655,0
4
+ anli_r3,acc,0.3466666666666667,0.013744022550571956,0
5
+ arc_challenge,acc,0.310580204778157,0.01352229209805305,0
6
+ arc_challenge,acc_norm,0.3302047781569966,0.013743085603760427,0
7
+ arc_easy,acc,0.6393097643097643,0.009853512108416737,0
8
+ arc_easy,acc_norm,0.6443602693602694,0.009822854395535483,0
9
+ boolq,acc,0.5706422018348624,0.008657333755353679,1
10
+ cb,acc,0.5535714285714286,0.06703189227942395,1
11
+ cb,f1,0.4210510374072391,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.45976897032463654,0.004973602904247799,0
14
+ hellaswag,acc_norm,0.610336586337383,0.004866772373029937,0
15
+ piqa,acc,0.7459194776931447,0.01015727199913504,0
16
+ piqa,acc_norm,0.7627856365614799,0.00992469493358637,0
17
+ rte,acc,0.5992779783393501,0.02949722923716315,0
18
+ sciq,acc,0.92,0.008583336977753656,0
19
+ sciq,acc_norm,0.919,0.00863212103213998,0
20
+ storycloze_2016,acc,0.7145911277391769,0.010443395884062118,0
21
+ winogrande,acc,0.5872138910812944,0.013837060648682089,0
4b284b84b70c4py/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.008098169031211164
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.008098169031211164
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.24123039025136075
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.24123039025136075
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2734427432548749
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2734427432548749
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.28763411022506213
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.28763411022506213
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2923497201090039
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2923497201090039
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2901019845755836
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2901019845755836
14
+ e2e_nlg_cleaned,5,average,multiple,0.23214285290784942
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04310473913800793
16
+ gem_xsum,0,median,rouge2_fmeasure,0.04310473913800793
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03804941171577492
18
+ gem_xsum,1,median,rouge2_fmeasure,0.03804941171577492
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04593830462341063
20
+ gem_xsum,2,median,rouge2_fmeasure,0.04593830462341063
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05057991242538114
22
+ gem_xsum,3,median,rouge2_fmeasure,0.05057991242538114
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.012864663909485747
24
+ gem_xsum,4,median,rouge2_fmeasure,0.012864663909485747
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00030536930193877015
26
+ gem_xsum,5,median,rouge2_fmeasure,0.00030536930193877015
27
+ gem_xsum,5,average,multiple,0.031807066852333185
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.052832415905400075
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.052832415905400075
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0882203961704786
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.0882203961704786
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.10607455474495636
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.10607455474495636
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.11011224841518844
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.11011224841518844
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.11134812760048587
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.11134812760048587
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.12320959484950564
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.12320959484950564
40
+ web_nlg_en,5,average,multiple,0.09863288961433583
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.035598696037042944
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.035598696037042944
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05627709313767569
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05627709313767569
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.0689053644873375
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.0689053644873375
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.059175735072984945
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.059175735072984945
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.020118442009674838
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.020118442009674838
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0030989992660247776
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0030989992660247776
53
+ wiki_lingua_en,5,average,multiple,0.04052905500179012
4b284b84b70c4py/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.45419048482755325, "bleu_stderr": 0.03343264900668279, "rouge1_fmeasure": 0.12050264484641364, "rouge1_fmeasure_stderr": 0.0019557997342759008, "rouge1_precision": 0.07656848458551854, "rouge1_precision_stderr": 0.0014013614212390725, "rouge1_recall": 0.4036790488281202, "rouge1_recall_stderr": 0.005315114093965629, "rouge2_fmeasure": 0.052832415905400075, "rouge2_fmeasure_stderr": 0.0011839640636500091, "rouge2_precision": 0.033428842308488534, "rouge2_precision_stderr": 0.0008128053819647495, "rouge2_recall": 0.18340526600302984, "rouge2_recall_stderr": 0.003727031912672351, "rougeL_fmeasure": 0.11109523222583549, "rougeL_fmeasure_stderr": 0.0017383601500938328, "rougeL_precision": 0.07054587229335195, "rougeL_precision_stderr": 0.0012459171875782095, "rougeL_recall": 0.37462626179690983, "rougeL_recall_stderr": 0.004966030287557139, "rougeLsum_fmeasure": 0.11161406434102886, "rougeLsum_fmeasure_stderr": 0.0018124941447083965, "rougeLsum_precision": 0.07110053256257555, "rougeLsum_precision_stderr": 0.0013096550789070658, "rougeLsum_recall": 0.3724362458459577, "rougeLsum_recall_stderr": 0.004863022252389103}}, "1": {"PALM_prompt": {"bleu": 0.5850021236636699, "bleu_stderr": 0.03819693214469707, "rouge1_fmeasure": 0.1677409926355529, "rouge1_fmeasure_stderr": 0.004030921408273657, "rouge1_precision": 0.15109588314127873, "rouge1_precision_stderr": 0.004953399230808383, "rouge1_recall": 0.325670897152676, "rouge1_recall_stderr": 0.005147150507406254, "rouge2_fmeasure": 0.0882203961704786, "rouge2_fmeasure_stderr": 0.0027304069330761974, "rouge2_precision": 0.08184320047143445, "rouge2_precision_stderr": 0.003449022357977633, "rouge2_recall": 0.17132523279025044, "rouge2_recall_stderr": 0.00367610399739497, "rougeL_fmeasure": 0.15245069014322057, "rougeL_fmeasure_stderr": 0.00345251550628098, "rougeL_precision": 0.13599215274261797, "rougeL_precision_stderr": 0.004360821088518568, "rougeL_recall": 0.3051575499297103, "rougeL_recall_stderr": 0.004723568587467038, "rougeLsum_fmeasure": 0.15494868008248852, "rougeLsum_fmeasure_stderr": 0.003528355370205954, "rougeLsum_precision": 0.13850001433454043, "rougeLsum_precision_stderr": 0.004436787945937485, "rougeLsum_recall": 0.30775803342597885, "rougeLsum_recall_stderr": 0.004752694198738615}}, "2": {"PALM_prompt": {"bleu": 0.7246766267092646, "bleu_stderr": 0.05216754042228656, "rouge1_fmeasure": 0.19302680914857126, "rouge1_fmeasure_stderr": 0.004550241366319154, "rouge1_precision": 0.16997361970056543, "rouge1_precision_stderr": 0.005336466087297649, "rouge1_recall": 0.3624866589291664, "rouge1_recall_stderr": 0.005051573422247833, "rouge2_fmeasure": 0.10607455474495636, "rouge2_fmeasure_stderr": 0.0031850991496843, "rouge2_precision": 0.09695911898895748, "rouge2_precision_stderr": 0.003772749560662997, "rouge2_recall": 0.19796654029903527, "rouge2_recall_stderr": 0.003829948250927247, "rougeL_fmeasure": 0.17343940636186816, "rougeL_fmeasure_stderr": 0.003866556548338404, "rougeL_precision": 0.1505856722307468, "rougeL_precision_stderr": 0.004601030271832378, "rougeL_recall": 0.33798352518669805, "rougeL_recall_stderr": 0.004605226083895095, "rougeLsum_fmeasure": 0.17821185760691358, "rougeLsum_fmeasure_stderr": 0.004029760056556632, "rougeLsum_precision": 0.15526534529621946, "rougeLsum_precision_stderr": 0.004777197445287186, "rougeLsum_recall": 0.34354280660789954, "rougeLsum_recall_stderr": 0.004693627364580387}}, "3": {"PALM_prompt": {"bleu": 0.912401774785461, "bleu_stderr": 0.05499172600355808, "rouge1_fmeasure": 0.2027515169133416, "rouge1_fmeasure_stderr": 0.004616866259808885, "rouge1_precision": 0.177739891947758, "rouge1_precision_stderr": 0.005377477441053506, "rouge1_recall": 0.37571301827708603, "rouge1_recall_stderr": 0.005155063555837091, "rouge2_fmeasure": 0.11011224841518844, "rouge2_fmeasure_stderr": 0.0032287748904178603, "rouge2_precision": 0.09925086869826542, "rouge2_precision_stderr": 0.0037360879884190974, "rouge2_recall": 0.2039607362114278, "rouge2_recall_stderr": 0.004009008079403255, "rougeL_fmeasure": 0.17817494628022673, "rougeL_fmeasure_stderr": 0.003810921024330032, "rougeL_precision": 0.15305639487114978, "rougeL_precision_stderr": 0.004468138904157244, "rougeL_recall": 0.34454827318886916, "rougeL_recall_stderr": 0.004597031286799378, "rougeLsum_fmeasure": 0.185206448541148, "rougeLsum_fmeasure_stderr": 0.004057582364193407, "rougeLsum_precision": 0.16073321051848882, "rougeLsum_precision_stderr": 0.004766423801226516, "rougeLsum_recall": 0.35191122444688017, "rougeLsum_recall_stderr": 0.004722893850454437}}, "4": {"PALM_prompt": {"bleu": 0.9362020995122441, "bleu_stderr": 0.07456263807984138, "rouge1_fmeasure": 0.20398642955440918, "rouge1_fmeasure_stderr": 0.0044990621443271265, "rouge1_precision": 0.1801767792651148, "rouge1_precision_stderr": 0.0053176999629491305, "rouge1_recall": 0.38030055324748463, "rouge1_recall_stderr": 0.005130695248034955, "rouge2_fmeasure": 0.11134812760048587, "rouge2_fmeasure_stderr": 0.0031257086692601643, "rouge2_precision": 0.10092720165635745, "rouge2_precision_stderr": 0.0036595986543100584, "rouge2_recall": 0.2093414664405995, "rouge2_recall_stderr": 0.004066301941452167, "rougeL_fmeasure": 0.18020750722912687, "rougeL_fmeasure_stderr": 0.0037228657207143273, "rougeL_precision": 0.1562295618700323, "rougeL_precision_stderr": 0.004447831421456536, "rougeL_recall": 0.3500026913388304, "rougeL_recall_stderr": 0.00457250105781178, "rougeLsum_fmeasure": 0.18803470835441047, "rougeLsum_fmeasure_stderr": 0.003990573990048456, "rougeLsum_precision": 0.1646531954640878, "rougeLsum_precision_stderr": 0.0047752145526076765, "rougeLsum_recall": 0.35859704442011836, "rougeLsum_recall_stderr": 0.004708359064341005}}, "5": {"PALM_prompt": {"bleu": 1.1473284737488645, "bleu_stderr": 0.07257359326088067, "rouge1_fmeasure": 0.22344029518114394, "rouge1_fmeasure_stderr": 0.0047579906981359355, "rouge1_precision": 0.19695121461258075, "rouge1_precision_stderr": 0.005543936314076539, "rouge1_recall": 0.39862816336821605, "rouge1_recall_stderr": 0.005230455011065989, "rouge2_fmeasure": 0.12320959484950564, "rouge2_fmeasure_stderr": 0.0033292695883422856, "rouge2_precision": 0.1118421615652518, "rouge2_precision_stderr": 0.003869244868009604, "rouge2_recall": 0.22068136349930514, "rouge2_recall_stderr": 0.004119383941992751, "rougeL_fmeasure": 0.19693845048375522, "rougeL_fmeasure_stderr": 0.003977166542530307, "rougeL_precision": 0.1712293574036339, "rougeL_precision_stderr": 0.004702901005328839, "rougeL_recall": 0.36465485161639194, "rougeL_recall_stderr": 0.004657456646320378, "rougeLsum_fmeasure": 0.20542437703481312, "rougeLsum_fmeasure_stderr": 0.004254659038175917, "rougeLsum_precision": 0.1800833361396532, "rougeLsum_precision_stderr": 0.00501549579335833, "rougeLsum_recall": 0.3739480609864003, "rougeLsum_recall_stderr": 0.004813185015660923}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.5247944854894806, "bleu_stderr": 0.0879841385659621, "rouge1_fmeasure": 0.14904907950315077, "rouge1_fmeasure_stderr": 0.0024558774712346303, "rouge1_precision": 0.1384834766411849, "rouge1_precision_stderr": 0.002631651930646105, "rouge1_recall": 0.1986797302552911, "rouge1_recall_stderr": 0.003301745364935552, "rouge2_fmeasure": 0.035598696037042944, "rouge2_fmeasure_stderr": 0.0010333687243820114, "rouge2_precision": 0.03225938464648384, "rouge2_precision_stderr": 0.000980163943008181, "rouge2_recall": 0.04862171566457941, "rouge2_recall_stderr": 0.0015116459850371242, "rougeL_fmeasure": 0.11791797807562439, "rougeL_fmeasure_stderr": 0.0018560003669397872, "rougeL_precision": 0.10896306382058518, "rougeL_precision_stderr": 0.002036973587521335, "rougeL_recall": 0.1604599066510129, "rougeL_recall_stderr": 0.0026801418820689106, "rougeLsum_fmeasure": 0.13814732526217105, "rougeLsum_fmeasure_stderr": 0.0022950179887594627, "rougeLsum_precision": 0.1285709971312458, "rougeLsum_precision_stderr": 0.0024834562723841675, "rougeLsum_recall": 0.1844269890660411, "rougeLsum_recall_stderr": 0.003094769735687356}}, "1": {"tldr_en": {"bleu": 3.0785238389209355, "bleu_stderr": 0.12100023309172617, "rouge1_fmeasure": 0.19932748144830872, "rouge1_fmeasure_stderr": 0.0023746278349365216, "rouge1_precision": 0.3077283488063351, "rouge1_precision_stderr": 0.004482468614194303, "rouge1_recall": 0.20156170597255982, "rouge1_recall_stderr": 0.0028613140412138494, "rouge2_fmeasure": 0.05627709313767569, "rouge2_fmeasure_stderr": 0.001427328834770283, "rouge2_precision": 0.09669759370794292, "rouge2_precision_stderr": 0.002865381089616566, "rouge2_recall": 0.0555415827995515, "rouge2_recall_stderr": 0.00153691704948828, "rougeL_fmeasure": 0.15700774611111312, "rougeL_fmeasure_stderr": 0.0019119812430907841, "rougeL_precision": 0.24842926652738595, "rougeL_precision_stderr": 0.0038947924223351827, "rougeL_recall": 0.15799128474542956, "rougeL_recall_stderr": 0.002265547666522197, "rougeLsum_fmeasure": 0.1874016150707306, "rougeLsum_fmeasure_stderr": 0.0022396190663859935, "rougeLsum_precision": 0.29106256830073524, "rougeLsum_precision_stderr": 0.004316037155064917, "rougeLsum_recall": 0.18921966019379916, "rougeLsum_recall_stderr": 0.0026890228631320456}}, "2": {"tldr_en": {"bleu": 3.652111781813624, "bleu_stderr": 0.10486391893167614, "rouge1_fmeasure": 0.2283435942520453, "rouge1_fmeasure_stderr": 0.002353258676899788, "rouge1_precision": 0.36924759348379244, "rouge1_precision_stderr": 0.004371000269537104, "rouge1_recall": 0.2161281279844684, "rouge1_recall_stderr": 0.0027903795763512965, "rouge2_fmeasure": 0.0689053644873375, "rouge2_fmeasure_stderr": 0.0015082619422947736, "rouge2_precision": 0.12155473826758335, "rouge2_precision_stderr": 0.0030290361967967947, "rouge2_recall": 0.06398344341696371, "rouge2_recall_stderr": 0.0015954786097649597, "rougeL_fmeasure": 0.1806325741208752, "rougeL_fmeasure_stderr": 0.0018958600514033847, "rougeL_precision": 0.29809803146126795, "rougeL_precision_stderr": 0.0037990348444261793, "rougeL_recall": 0.17039689340892664, "rougeL_recall_stderr": 0.0022336612525983126, "rougeLsum_fmeasure": 0.21643866687270513, "rougeLsum_fmeasure_stderr": 0.0022530258028476487, "rougeLsum_precision": 0.35169516629927816, "rougeLsum_precision_stderr": 0.004252566315780221, "rougeLsum_recall": 0.2045309406583435, "rougeLsum_recall_stderr": 0.0026458064196168626}}, "3": {"tldr_en": {"bleu": 2.30710280652967, "bleu_stderr": 0.09094590687124952, "rouge1_fmeasure": 0.1945336408406022, "rouge1_fmeasure_stderr": 0.0026446921596933503, "rouge1_precision": 0.3236204752072193, "rouge1_precision_stderr": 0.0047276363208619545, "rouge1_recall": 0.1778973939940185, "rouge1_recall_stderr": 0.002915566666838674, "rouge2_fmeasure": 0.059175735072984945, "rouge2_fmeasure_stderr": 0.0015008631332196428, "rouge2_precision": 0.10514218669712877, "rouge2_precision_stderr": 0.0029331277208536808, "rouge2_recall": 0.05366081871257144, "rouge2_recall_stderr": 0.0015514417031996504, "rougeL_fmeasure": 0.1547742780157689, "rougeL_fmeasure_stderr": 0.0021406675865622577, "rougeL_precision": 0.2617516967205688, "rougeL_precision_stderr": 0.004024319520519343, "rougeL_recall": 0.14136929228154757, "rougeL_recall_stderr": 0.0023577454595674853, "rougeLsum_fmeasure": 0.18473010709149373, "rougeLsum_fmeasure_stderr": 0.0025203297464613237, "rougeLsum_precision": 0.30869344193505416, "rougeLsum_precision_stderr": 0.004557374401231459, "rougeLsum_recall": 0.16856250527976735, "rougeLsum_recall_stderr": 0.0027602819187965205}}, "4": {"tldr_en": {"bleu": 0.01997228189371226, "bleu_stderr": 0.004164437604071004, "rouge1_fmeasure": 0.0642661368715624, "rouge1_fmeasure_stderr": 0.002311467107094036, "rouge1_precision": 0.10579363907416886, "rouge1_precision_stderr": 0.0038996170266521225, "rouge1_recall": 0.05989991930806105, "rouge1_recall_stderr": 0.002383270863537302, "rouge2_fmeasure": 0.020118442009674838, "rouge2_fmeasure_stderr": 0.001072025516558785, "rouge2_precision": 0.03603840886780009, "rouge2_precision_stderr": 0.0020500166307483576, "rouge2_recall": 0.01870103420296575, "rouge2_recall_stderr": 0.0011326153028213988, "rougeL_fmeasure": 0.052479963022099216, "rougeL_fmeasure_stderr": 0.0019019460257588422, "rougeL_precision": 0.08768306117325718, "rougeL_precision_stderr": 0.003312003426848069, "rougeL_recall": 0.04882499641240493, "rougeL_recall_stderr": 0.0019528616148385455, "rougeLsum_fmeasure": 0.06058158326550758, "rougeLsum_fmeasure_stderr": 0.0021794146863436045, "rougeLsum_precision": 0.10049299845393068, "rougeLsum_precision_stderr": 0.0037366898545692227, "rougeLsum_recall": 0.056172794670012766, "rougeLsum_recall_stderr": 0.0022208852083168873}}, "5": {"tldr_en": {"bleu": 6.720014938526705e-25, "bleu_stderr": 1.368232575017575e-23, "rouge1_fmeasure": 0.008551690929725, "rouge1_fmeasure_stderr": 0.0009563473253764333, "rouge1_precision": 0.015189081718840411, "rouge1_precision_stderr": 0.0016914095953933566, "rouge1_recall": 0.007820035481039486, "rouge1_recall_stderr": 0.0009666086539579351, "rouge2_fmeasure": 0.0030989992660247776, "rouge2_fmeasure_stderr": 0.00045069838543670873, "rouge2_precision": 0.005792228464995174, "rouge2_precision_stderr": 0.0008621781228084065, "rouge2_recall": 0.002930153771964047, "rouge2_recall_stderr": 0.0005187699239656753, "rougeL_fmeasure": 0.007168225022246425, "rougeL_fmeasure_stderr": 0.0008110437096733947, "rougeL_precision": 0.01284373251270546, "rougeL_precision_stderr": 0.0014549043740755003, "rougeL_recall": 0.0065817077301885156, "rougeL_recall_stderr": 0.0008324133884409117, "rougeLsum_fmeasure": 0.008101674490294078, "rougeLsum_fmeasure_stderr": 0.0009108848536289975, "rougeLsum_precision": 0.014339494923437973, "rougeLsum_precision_stderr": 0.0016066549399758809, "rougeLsum_recall": 0.0074702858797809375, "rougeLsum_recall_stderr": 0.0009377181101147522}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.2174856161906872, "bleu_stderr": 0.015923894878958702, "rouge1_fmeasure": 0.09849126682765423, "rouge1_fmeasure_stderr": 0.0014923228309475303, "rouge1_precision": 0.10975769964880429, "rouge1_precision_stderr": 0.0015870675246378444, "rouge1_recall": 0.09928819244999808, "rouge1_recall_stderr": 0.0017021351064544997, "rouge2_fmeasure": 0.008098169031211164, "rouge2_fmeasure_stderr": 0.0005382144957334649, "rouge2_precision": 0.007623464868246138, "rouge2_precision_stderr": 0.0005009913285230473, "rouge2_recall": 0.009301745941549075, "rouge2_recall_stderr": 0.0006509937552160252, "rougeL_fmeasure": 0.0736904154736711, "rougeL_fmeasure_stderr": 0.0010858349093197835, "rougeL_precision": 0.08356464635738844, "rougeL_precision_stderr": 0.0012285431730394058, "rougeL_recall": 0.07404637900035545, "rougeL_recall_stderr": 0.0012403578700447176, "rougeLsum_fmeasure": 0.09038375273205482, "rougeLsum_fmeasure_stderr": 0.0014028476096075255, "rougeLsum_precision": 0.1006889250238476, "rougeLsum_precision_stderr": 0.0014871743224185528, "rougeLsum_recall": 0.09132141334499083, "rougeLsum_recall_stderr": 0.0016087849316541365}}, "1": {"generate_text_restaurant": {"bleu": 13.603186674422444, "bleu_stderr": 0.16720918384264621, "rouge1_fmeasure": 0.49485627314549524, "rouge1_fmeasure_stderr": 0.0024544062699189814, "rouge1_precision": 0.5979241216990805, "rouge1_precision_stderr": 0.003249159766185611, "rouge1_recall": 0.4639078870638987, "rouge1_recall_stderr": 0.003148275112212192, "rouge2_fmeasure": 0.24123039025136075, "rouge2_fmeasure_stderr": 0.002180045932319959, "rouge2_precision": 0.2956478031764748, "rouge2_precision_stderr": 0.00278907091106282, "rouge2_recall": 0.22605648093866812, "rouge2_recall_stderr": 0.0023685889739160394, "rougeL_fmeasure": 0.35989371492803346, "rougeL_fmeasure_stderr": 0.0022080667678571124, "rougeL_precision": 0.43850922077428195, "rougeL_precision_stderr": 0.0030731074026139234, "rougeL_recall": 0.33651308226541454, "rougeL_recall_stderr": 0.0025991623791134935, "rougeLsum_fmeasure": 0.4091180712168849, "rougeLsum_fmeasure_stderr": 0.0024997075802474684, "rougeLsum_precision": 0.494681105830301, "rougeLsum_precision_stderr": 0.0032392131317364663, "rougeLsum_recall": 0.3838741776920783, "rougeLsum_recall_stderr": 0.002994592667557759}}, "2": {"generate_text_restaurant": {"bleu": 16.576086891653166, "bleu_stderr": 0.1984305930797547, "rouge1_fmeasure": 0.5406367365743524, "rouge1_fmeasure_stderr": 0.0022538009424502856, "rouge1_precision": 0.6107310269990832, "rouge1_precision_stderr": 0.003015577689861615, "rouge1_recall": 0.5221390359885365, "rouge1_recall_stderr": 0.0029512899488828535, "rouge2_fmeasure": 0.2734427432548749, "rouge2_fmeasure_stderr": 0.0022099035880405217, "rouge2_precision": 0.31025817915664383, "rouge2_precision_stderr": 0.0026272739666680675, "rouge2_recall": 0.2649860705420364, "rouge2_recall_stderr": 0.0024508343474972704, "rougeL_fmeasure": 0.39198077065688064, "rougeL_fmeasure_stderr": 0.002182514239974999, "rougeL_precision": 0.4439430176815307, "rougeL_precision_stderr": 0.0028358702473415052, "rougeL_recall": 0.3785721561496246, "rougeL_recall_stderr": 0.002595192473566268, "rougeLsum_fmeasure": 0.45178684887136245, "rougeLsum_fmeasure_stderr": 0.0024132990759076346, "rougeLsum_precision": 0.5096418612798311, "rougeLsum_precision_stderr": 0.0030192956186345364, "rougeLsum_recall": 0.43688135096476943, "rougeLsum_recall_stderr": 0.002910558794257935}}, "3": {"generate_text_restaurant": {"bleu": 18.0845433681664, "bleu_stderr": 0.1777813122477081, "rouge1_fmeasure": 0.5569933886386429, "rouge1_fmeasure_stderr": 0.002199268289167996, "rouge1_precision": 0.6137536153047778, "rouge1_precision_stderr": 0.002904157765234221, "rouge1_recall": 0.5442156551577437, "rouge1_recall_stderr": 0.002885723399368904, "rouge2_fmeasure": 0.28763411022506213, "rouge2_fmeasure_stderr": 0.0022477495833815196, "rouge2_precision": 0.31771337110021336, "rouge2_precision_stderr": 0.002612043327366063, "rouge2_recall": 0.2820443575315225, "rouge2_recall_stderr": 0.002486793380575716, "rougeL_fmeasure": 0.4035567493026598, "rougeL_fmeasure_stderr": 0.0021951888628519655, "rougeL_precision": 0.44521250737924556, "rougeL_precision_stderr": 0.002748868729929973, "rougeL_recall": 0.3945188323134259, "rougeL_recall_stderr": 0.0025907833781845116, "rougeLsum_fmeasure": 0.46891207559259884, "rougeLsum_fmeasure_stderr": 0.0024057218135253083, "rougeLsum_precision": 0.5161460637873506, "rougeLsum_precision_stderr": 0.0029604827844778962, "rougeLsum_recall": 0.45840539581253936, "rougeLsum_recall_stderr": 0.002872125220850784}}, "4": {"generate_text_restaurant": {"bleu": 18.51503235974728, "bleu_stderr": 0.137264758627985, "rouge1_fmeasure": 0.5613974208808915, "rouge1_fmeasure_stderr": 0.002229445477122744, "rouge1_precision": 0.6120787062374268, "rouge1_precision_stderr": 0.0029210193021996945, "rouge1_recall": 0.551131861972078, "rouge1_recall_stderr": 0.002878591933639494, "rouge2_fmeasure": 0.2923497201090039, "rouge2_fmeasure_stderr": 0.002320583768019888, "rouge2_precision": 0.31927245421777867, "rouge2_precision_stderr": 0.0026652064038851435, "rouge2_recall": 0.2880520521380086, "rouge2_recall_stderr": 0.002555894661960643, "rougeL_fmeasure": 0.4041359208213478, "rougeL_fmeasure_stderr": 0.002238652049241865, "rougeL_precision": 0.4410030855945479, "rougeL_precision_stderr": 0.0027444185755603635, "rougeL_recall": 0.3968570780449371, "rougeL_recall_stderr": 0.002615164584024417, "rougeLsum_fmeasure": 0.47289433229350847, "rougeLsum_fmeasure_stderr": 0.002447850484296981, "rougeLsum_precision": 0.5150329022926635, "rougeLsum_precision_stderr": 0.002968021858154739, "rougeLsum_recall": 0.46450424841976723, "rougeLsum_recall_stderr": 0.002904435277766539}}, "5": {"generate_text_restaurant": {"bleu": 18.31902300887679, "bleu_stderr": 0.21660929188675368, "rouge1_fmeasure": 0.560533216370765, "rouge1_fmeasure_stderr": 0.002197938637338113, "rouge1_precision": 0.6075691463019105, "rouge1_precision_stderr": 0.002877547740801876, "rouge1_recall": 0.5519116595173106, "rouge1_recall_stderr": 0.002825353291269693, "rouge2_fmeasure": 0.2901019845755836, "rouge2_fmeasure_stderr": 0.0022752845282833444, "rouge2_precision": 0.3145544773242325, "rouge2_precision_stderr": 0.002564671139370085, "rouge2_recall": 0.2871130624175668, "rouge2_recall_stderr": 0.002534307523052322, "rougeL_fmeasure": 0.4050393820619599, "rougeL_fmeasure_stderr": 0.0022368299932115226, "rougeL_precision": 0.43853503664166193, "rougeL_precision_stderr": 0.0026803764592891635, "rougeL_recall": 0.39978015472393064, "rougeL_recall_stderr": 0.0026469457358851683, "rougeLsum_fmeasure": 0.4728370986753031, "rougeLsum_fmeasure_stderr": 0.0024112597225851803, "rougeLsum_precision": 0.5118762996951014, "rougeLsum_precision_stderr": 0.0029189311646059903, "rougeLsum_recall": 0.46611352612043305, "rougeLsum_recall_stderr": 0.0028714705658076977}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.824437063172752, "bleu_stderr": 0.1295959938779033, "rouge1_fmeasure": 0.19064385039008325, "rouge1_fmeasure_stderr": 0.0026847919999498783, "rouge1_precision": 0.13889553377394204, "rouge1_precision_stderr": 0.0020470938512401043, "rouge1_recall": 0.3224174898964051, "rouge1_recall_stderr": 0.0047323832955649394, "rouge2_fmeasure": 0.04310473913800793, "rouge2_fmeasure_stderr": 0.0015909694285954996, "rouge2_precision": 0.030770546785531084, "rouge2_precision_stderr": 0.0011485507935108212, "rouge2_recall": 0.07627788908402264, "rouge2_recall_stderr": 0.0029161006551518855, "rougeL_fmeasure": 0.14803927722404647, "rougeL_fmeasure_stderr": 0.002024044592590105, "rougeL_precision": 0.1076726505455253, "rougeL_precision_stderr": 0.0015263290004438454, "rougeL_recall": 0.2516553160702365, "rougeL_recall_stderr": 0.0037251276937883744, "rougeLsum_fmeasure": 0.1506543367359999, "rougeLsum_fmeasure_stderr": 0.0022302335824208926, "rougeLsum_precision": 0.10954802882008673, "rougeLsum_precision_stderr": 0.001670554704114846, "rougeLsum_recall": 0.2563589341875558, "rougeLsum_recall_stderr": 0.004093935928498495}}, "1": {"article_DOC_summary": {"bleu": 1.7609819893980598, "bleu_stderr": 0.09673219339225483, "rouge1_fmeasure": 0.18745377329593618, "rouge1_fmeasure_stderr": 0.0028616444790333095, "rouge1_precision": 0.1556677591501949, "rouge1_precision_stderr": 0.0028313144238476637, "rouge1_recall": 0.28181566724348217, "rouge1_recall_stderr": 0.0046620008628106705, "rouge2_fmeasure": 0.03804941171577492, "rouge2_fmeasure_stderr": 0.0016796104857723759, "rouge2_precision": 0.03124209990455104, "rouge2_precision_stderr": 0.0015863039800586146, "rouge2_recall": 0.05959322682370301, "rouge2_recall_stderr": 0.0026033301572952875, "rougeL_fmeasure": 0.14217542198253097, "rougeL_fmeasure_stderr": 0.0021914393891234374, "rougeL_precision": 0.11865350137721012, "rougeL_precision_stderr": 0.002261907056323959, "rougeL_recall": 0.21376373660006684, "rougeL_recall_stderr": 0.00356630705358284, "rougeLsum_fmeasure": 0.15022526225239208, "rougeLsum_fmeasure_stderr": 0.0023683857139214465, "rougeLsum_precision": 0.12449107117970809, "rougeLsum_precision_stderr": 0.002329800484842638, "rougeLsum_recall": 0.227483012391418, "rougeLsum_recall_stderr": 0.003955993366379117}}, "2": {"article_DOC_summary": {"bleu": 2.0844403527225617, "bleu_stderr": 0.18103428706460614, "rouge1_fmeasure": 0.21367633738963315, "rouge1_fmeasure_stderr": 0.0033554954700050752, "rouge1_precision": 0.19935476907025304, "rouge1_precision_stderr": 0.0038598266568137785, "rouge1_recall": 0.2706926378266182, "rouge1_recall_stderr": 0.004156883055461473, "rouge2_fmeasure": 0.04593830462341063, "rouge2_fmeasure_stderr": 0.0020168943686124465, "rouge2_precision": 0.044027605832549326, "rouge2_precision_stderr": 0.002125094239368793, "rouge2_recall": 0.05675910399033515, "rouge2_recall_stderr": 0.0024265127143739836, "rougeL_fmeasure": 0.16184982863572556, "rougeL_fmeasure_stderr": 0.0026008492724763654, "rougeL_precision": 0.15107166602542463, "rougeL_precision_stderr": 0.0030007609375925313, "rougeL_recall": 0.20556159886471823, "rougeL_recall_stderr": 0.0032143700207118648, "rougeLsum_fmeasure": 0.1658443102086597, "rougeLsum_fmeasure_stderr": 0.002655700581774396, "rougeLsum_precision": 0.15408614394681422, "rougeLsum_precision_stderr": 0.003005649518177271, "rougeLsum_recall": 0.2120361542004191, "rougeLsum_recall_stderr": 0.0034289113513717964}}, "3": {"article_DOC_summary": {"bleu": 2.6157705705080483, "bleu_stderr": 0.1931328754738087, "rouge1_fmeasure": 0.21733976037468944, "rouge1_fmeasure_stderr": 0.0036238273509993928, "rouge1_precision": 0.21476053304129508, "rouge1_precision_stderr": 0.004210457868862111, "rouge1_recall": 0.2548785666224053, "rouge1_recall_stderr": 0.004327640907101733, "rouge2_fmeasure": 0.05057991242538114, "rouge2_fmeasure_stderr": 0.0021902733567216452, "rouge2_precision": 0.05108878519401023, "rouge2_precision_stderr": 0.00238600623033403, "rouge2_recall": 0.058040451447882946, "rouge2_recall_stderr": 0.002461257596090234, "rougeL_fmeasure": 0.1669642989094155, "rougeL_fmeasure_stderr": 0.0029609461708291133, "rougeL_precision": 0.16532395838276415, "rougeL_precision_stderr": 0.0034399821384294557, "rougeL_recall": 0.19583029758598305, "rougeL_recall_stderr": 0.0034969216930147515, "rougeLsum_fmeasure": 0.16989940378849605, "rougeLsum_fmeasure_stderr": 0.002996828651281411, "rougeLsum_precision": 0.16782197382314246, "rougeLsum_precision_stderr": 0.0034561917258272806, "rougeLsum_recall": 0.20028204694266308, "rougeLsum_recall_stderr": 0.0036507354923465485}}, "4": {"article_DOC_summary": {"bleu": 0.24290172552446196, "bleu_stderr": 0.056581491175426914, "rouge1_fmeasure": 0.05851076846542915, "rouge1_fmeasure_stderr": 0.0033565789806109434, "rouge1_precision": 0.06774270444285756, "rouge1_precision_stderr": 0.004232420643142785, "rouge1_recall": 0.06344049540701503, "rouge1_recall_stderr": 0.0037875385187947238, "rouge2_fmeasure": 0.012864663909485747, "rouge2_fmeasure_stderr": 0.0012115981535214609, "rouge2_precision": 0.015101385764716672, "rouge2_precision_stderr": 0.0015922733167297797, "rouge2_recall": 0.014184749823453857, "rouge2_recall_stderr": 0.0014191161123945839, "rougeL_fmeasure": 0.044310535444670965, "rougeL_fmeasure_stderr": 0.002566821173611681, "rougeL_precision": 0.05231134411266022, "rougeL_precision_stderr": 0.0034387489181517653, "rougeL_recall": 0.04797296934231772, "rougeL_recall_stderr": 0.0028959645784639894, "rougeLsum_fmeasure": 0.04489857322141054, "rougeLsum_fmeasure_stderr": 0.0025974168309322184, "rougeLsum_precision": 0.05280348194292997, "rougeLsum_precision_stderr": 0.003452891291164583, "rougeLsum_recall": 0.048834354415796526, "rougeLsum_recall_stderr": 0.00297075040562579}}, "5": {"article_DOC_summary": {"bleu": 6.130780440334243e-47, "bleu_stderr": 9.503426174895799e-44, "rouge1_fmeasure": 0.002244737752786088, "rouge1_fmeasure_stderr": 0.0006095570789287521, "rouge1_precision": 0.003117611880602497, "rouge1_precision_stderr": 0.0009842144959043254, "rouge1_recall": 0.0019488182977195217, "rouge1_recall_stderr": 0.0005249211324998764, "rouge2_fmeasure": 0.00030536930193877015, "rouge2_fmeasure_stderr": 0.0001813875271676511, "rouge2_precision": 0.0008079370976590439, "rouge2_precision_stderr": 0.0006502336039399843, "rouge2_recall": 0.0002226782651310953, "rouge2_recall_stderr": 0.00011959198904763001, "rougeL_fmeasure": 0.0016390641129216145, "rougeL_fmeasure_stderr": 0.00045172367777572114, "rougeL_precision": 0.00237811186000067, "rougeL_precision_stderr": 0.000838173352270679, "rougeL_recall": 0.001419537348620057, "rougeL_recall_stderr": 0.00038663520349704417, "rougeLsum_fmeasure": 0.0016975367824715818, "rougeLsum_fmeasure_stderr": 0.0004551946310914248, "rougeLsum_precision": 0.0024324846668707023, "rougeLsum_precision_stderr": 0.0008373236753933433, "rougeLsum_recall": 0.001479673753096084, "rougeLsum_recall_stderr": 0.00039335345523923025}}}}
4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.316,0.014709193056057123,0
3
+ anli_r2,acc,0.334,0.01492201952373296,0
4
+ anli_r3,acc,0.3375,0.01365589718546366,0
5
+ arc_challenge,acc,0.23976109215017063,0.012476304127453944,0
6
+ arc_challenge,acc_norm,0.2773037542662116,0.013082095839059374,0
7
+ arc_easy,acc,0.4238215488215488,0.01014000609521361,0
8
+ arc_easy,acc_norm,0.39646464646464646,0.010037412763064524,0
9
+ boolq,acc,0.5798165137614679,0.008632912118872552,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.27341794465009694,,1
12
+ copa,acc,0.8,0.04020151261036844,0
13
+ hellaswag,acc,0.42342162915753834,0.004930911515084788,0
14
+ hellaswag,acc_norm,0.5375423222465644,0.004975696076240854,0
15
+ piqa,acc,0.6692056583242655,0.010977520584714429,0
16
+ piqa,acc_norm,0.6686615886833515,0.01098207745895734,0
17
+ rte,acc,0.5451263537906137,0.029973636495415252,0
18
+ sciq,acc,0.769,0.013334797216936436,0
19
+ sciq,acc_norm,0.68,0.014758652303574876,0
20
+ storycloze_2016,acc,0.686798503474078,0.010725209422929404,0
21
+ winogrande,acc,0.5611681136543015,0.013946933444507034,0
4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.315,0.014696631960792505,0
3
+ anli_r2,acc,0.323,0.014794927843348633,0
4
+ anli_r3,acc,0.32416666666666666,0.013517438120881634,0
5
+ arc_challenge,acc,0.2773037542662116,0.013082095839059374,0
6
+ arc_challenge,acc_norm,0.2909556313993174,0.013273077865907588,0
7
+ arc_easy,acc,0.5366161616161617,0.010232235063933027,0
8
+ arc_easy,acc_norm,0.5227272727272727,0.01024917909060598,0
9
+ boolq,acc,0.5697247706422018,0.008659608602932495,1
10
+ cb,acc,0.44642857142857145,0.06703189227942397,1
11
+ cb,f1,0.4618941916389197,,1
12
+ copa,acc,0.78,0.04163331998932263,0
13
+ hellaswag,acc,0.4213304122684724,0.0049276318064775575,0
14
+ hellaswag,acc_norm,0.5533758215494922,0.004961268387512967,0
15
+ piqa,acc,0.7219804134929271,0.01045311735833281,0
16
+ piqa,acc_norm,0.7241566920565833,0.010427805502729119,0
17
+ rte,acc,0.5595667870036101,0.029882123363118716,0
18
+ sciq,acc,0.902,0.009406619184621252,0
19
+ sciq,acc_norm,0.887,0.01001655286669685,0
20
+ storycloze_2016,acc,0.689470871191876,0.010700112173178448,0
21
+ winogrande,acc,0.5753749013417522,0.013891893150264224,0
4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.32,0.014758652303574859,0
3
+ anli_r2,acc,0.346,0.01505026612756444,0
4
+ anli_r3,acc,0.3283333333333333,0.013562032919529015,0
5
+ arc_challenge,acc,0.2713310580204778,0.0129938077275458,0
6
+ arc_challenge,acc_norm,0.3122866894197952,0.013542598541688065,0
7
+ arc_easy,acc,0.5778619528619529,0.010134620524592271,0
8
+ arc_easy,acc_norm,0.5702861952861953,0.010157908005763676,0
9
+ boolq,acc,0.5712538226299694,0.008655800332760227,1
10
+ cb,acc,0.375,0.06527912098338669,1
11
+ cb,f1,0.26794380587484035,,1
12
+ copa,acc,0.78,0.04163331998932262,0
13
+ hellaswag,acc,0.4343756223859789,0.004946617138983515,0
14
+ hellaswag,acc_norm,0.5676160127464649,0.004943945069611461,0
15
+ piqa,acc,0.719260065288357,0.010484325438311827,0
16
+ piqa,acc_norm,0.721436343852013,0.010459397235965154,0
17
+ rte,acc,0.49458483754512633,0.030094698123239966,0
18
+ sciq,acc,0.903,0.00936368937324811,0
19
+ sciq,acc_norm,0.907,0.009188875634996705,0
20
+ storycloze_2016,acc,0.6862640299305185,0.010730179119317628,0
21
+ winogrande,acc,0.5643251775848461,0.01393570973961571,0
4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.327,0.01484221315341124,0
3
+ anli_r2,acc,0.346,0.015050266127564446,0
4
+ anli_r3,acc,0.345,0.013728421539454876,0
5
+ arc_challenge,acc,0.2841296928327645,0.013179442447653887,0
6
+ arc_challenge,acc_norm,0.3165529010238908,0.013592431519068079,0
7
+ arc_easy,acc,0.5921717171717171,0.01008395024004121,0
8
+ arc_easy,acc_norm,0.5707070707070707,0.010156678075911096,0
9
+ boolq,acc,0.572782874617737,0.00865190772248611,1
10
+ cb,acc,0.5178571428571429,0.06737697508644648,1
11
+ cb,f1,0.49780616853787585,,1
12
+ copa,acc,0.81,0.03942772444036622,0
13
+ hellaswag,acc,0.4426409081856204,0.004956839256162737,0
14
+ hellaswag,acc_norm,0.5813582951603267,0.004923281841828511,0
15
+ piqa,acc,0.721436343852013,0.010459397235965161,0
16
+ piqa,acc_norm,0.7404787812840044,0.010227939888173927,0
17
+ rte,acc,0.5667870036101083,0.029826764082138267,0
18
+ sciq,acc,0.92,0.008583336977753653,0
19
+ sciq,acc_norm,0.916,0.00877616208949112,0
20
+ storycloze_2016,acc,0.6958845537145911,0.010638172655194794,0
21
+ winogrande,acc,0.5651144435674822,0.013932814110418029,0
4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.32,0.014758652303574878,0
3
+ anli_r2,acc,0.357,0.015158521721486767,0
4
+ anli_r3,acc,0.3433333333333333,0.01371263383046586,0
5
+ arc_challenge,acc,0.28242320819112626,0.013155456884097222,0
6
+ arc_challenge,acc_norm,0.3225255972696246,0.01365998089427737,0
7
+ arc_easy,acc,0.6031144781144782,0.010039236800583206,0
8
+ arc_easy,acc_norm,0.5921717171717171,0.010083950240041214,0
9
+ boolq,acc,0.5657492354740061,0.00866911618424304,1
10
+ cb,acc,0.5,0.06741998624632421,1
11
+ cb,f1,0.3808102345415778,,1
12
+ copa,acc,0.78,0.04163331998932261,0
13
+ hellaswag,acc,0.44433379804819756,0.004958761056959784,0
14
+ hellaswag,acc_norm,0.5844453296156145,0.004918102168717935,0
15
+ piqa,acc,0.7225244831338411,0.010446818281039954,0
16
+ piqa,acc_norm,0.735582154515778,0.010289787244767168,0
17
+ rte,acc,0.5018050541516246,0.030096267148976633,0
18
+ sciq,acc,0.928,0.008178195576218681,0
19
+ sciq,acc_norm,0.925,0.008333333333333358,0
20
+ storycloze_2016,acc,0.6990913949759487,0.010606289538707337,0
21
+ winogrande,acc,0.5872138910812944,0.0138370606486821,0
4b284b84b70c4py/evaluation/rankeval/4b284b84b70c4py_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.327,0.014842213153411249,0
3
+ anli_r2,acc,0.335,0.014933117490932572,0
4
+ anli_r3,acc,0.34833333333333333,0.013759437498874066,0
5
+ arc_challenge,acc,0.29266211604095566,0.013295916103619411,0
6
+ arc_challenge,acc_norm,0.31399317406143346,0.013562691224726291,0
7
+ arc_easy,acc,0.5904882154882155,0.010090368160990062,0
8
+ arc_easy,acc_norm,0.5921717171717171,0.010083950240041224,0
9
+ boolq,acc,0.5678899082568807,0.008664067354619378,1
10
+ cb,acc,0.4642857142857143,0.06724777654937658,1
11
+ cb,f1,0.37575757575757573,,1
12
+ copa,acc,0.74,0.0440844002276808,0
13
+ hellaswag,acc,0.44981079466241786,0.004964579685712441,0
14
+ hellaswag,acc_norm,0.5979884485162318,0.004893022130229104,0
15
+ piqa,acc,0.7301414581066377,0.010356595421852195,0
16
+ piqa,acc_norm,0.7312295973884657,0.01034339294009,0
17
+ rte,acc,0.5703971119133574,0.02979666882912467,0
18
+ sciq,acc,0.926,0.008282064512704159,0
19
+ sciq,acc_norm,0.925,0.008333333333333347,0
20
+ storycloze_2016,acc,0.7022982362373063,0.010573790208173063,0
21
+ winogrande,acc,0.5895816890292028,0.013825107120035865,0
4b284b84b80c4py/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.013034000920115679
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.013034000920115679
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.13358035874530175
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.13358035874530175
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.22277617819890463
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.22277617819890463
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.22831433189515238
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.22831433189515238
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2331491796706113
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2331491796706113
14
+ e2e_nlg_cleaned,5,average,multiple,0.13847567490501428
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04598629367072178
16
+ gem_xsum,0,median,rouge2_fmeasure,0.04598629367072178
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04280890210939285
18
+ gem_xsum,1,median,rouge2_fmeasure,0.04280890210939285
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.054061483001217665
20
+ gem_xsum,2,median,rouge2_fmeasure,0.054061483001217665
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.054853778845451784
22
+ gem_xsum,3,median,rouge2_fmeasure,0.054853778845451784
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.014767437977446025
24
+ gem_xsum,4,median,rouge2_fmeasure,0.014767437977446025
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0002632456641890604
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0002632456641890604
27
+ gem_xsum,5,average,multiple,0.03545685687806986
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.048217817458998315
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.048217817458998315
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.06959492243303098
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.06959492243303098
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.08682731837424236
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.08682731837424236
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.09029622365274437
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.09029622365274437
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.10170657902284919
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.10170657902284919
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.10281580541714498
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.10281580541714498
40
+ web_nlg_en,5,average,multiple,0.08324311105983503
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.021777445931524943
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.021777445931524943
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05486345429358379
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05486345429358379
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06669343202973603
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.06669343202973603
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05642140770577982
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.05642140770577982
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.018135359558163425
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.018135359558163425
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0029772370336828623
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0029772370336828623
53
+ wiki_lingua_en,5,average,multiple,0.03681138942541181
4b284b84b80c4py/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2654494229344325, "bleu_stderr": 0.02042403331906659, "rouge1_fmeasure": 0.10270865851589617, "rouge1_fmeasure_stderr": 0.0018633348660556126, "rouge1_precision": 0.06884441562975974, "rouge1_precision_stderr": 0.001625074690502154, "rouge1_recall": 0.2864019956058112, "rouge1_recall_stderr": 0.004456037431923359, "rouge2_fmeasure": 0.048217817458998315, "rouge2_fmeasure_stderr": 0.0011793965314152368, "rouge2_precision": 0.03283019510867619, "rouge2_precision_stderr": 0.0011605897915615384, "rouge2_recall": 0.14006146078837917, "rouge2_recall_stderr": 0.003098754558238878, "rougeL_fmeasure": 0.09954469907709385, "rougeL_fmeasure_stderr": 0.0017476648824675284, "rougeL_precision": 0.06647135478840976, "rougeL_precision_stderr": 0.0014974291698116727, "rougeL_recall": 0.27961016599496186, "rougeL_recall_stderr": 0.004359691176586921, "rougeLsum_fmeasure": 0.09828931038760727, "rougeLsum_fmeasure_stderr": 0.0017568749980800627, "rougeLsum_precision": 0.06572757765232382, "rougeLsum_precision_stderr": 0.001504407454736008, "rougeLsum_recall": 0.2747092630663634, "rougeLsum_recall_stderr": 0.004204339446833014}}, "1": {"PALM_prompt": {"bleu": 0.4341998648443088, "bleu_stderr": 0.02931909495215452, "rouge1_fmeasure": 0.13856233448813582, "rouge1_fmeasure_stderr": 0.0033473280860128814, "rouge1_precision": 0.11688602814035473, "rouge1_precision_stderr": 0.003935770022981782, "rouge1_recall": 0.28720810418688403, "rouge1_recall_stderr": 0.0048849831746035485, "rouge2_fmeasure": 0.06959492243303098, "rouge2_fmeasure_stderr": 0.0022198617599453164, "rouge2_precision": 0.05973753564316913, "rouge2_precision_stderr": 0.002683788007664101, "rouge2_recall": 0.14802561352698737, "rouge2_recall_stderr": 0.0034313367488162144, "rougeL_fmeasure": 0.1275832647601733, "rougeL_fmeasure_stderr": 0.0029115175322049283, "rougeL_precision": 0.10645218467239395, "rougeL_precision_stderr": 0.003474676536995635, "rougeL_recall": 0.2709461862797011, "rougeL_recall_stderr": 0.004518921334055727, "rougeLsum_fmeasure": 0.1293122258071173, "rougeLsum_fmeasure_stderr": 0.00297986072627297, "rougeLsum_precision": 0.10865705434291732, "rougeLsum_precision_stderr": 0.0036059561504076425, "rougeLsum_recall": 0.2725754911180342, "rougeLsum_recall_stderr": 0.004526923371814666}}, "2": {"PALM_prompt": {"bleu": 0.540195807592588, "bleu_stderr": 0.016062705325417568, "rouge1_fmeasure": 0.16698242868460828, "rouge1_fmeasure_stderr": 0.003879108485606686, "rouge1_precision": 0.14540952314253452, "rouge1_precision_stderr": 0.004745902001566398, "rouge1_recall": 0.328727947556376, "rouge1_recall_stderr": 0.0048043462555501775, "rouge2_fmeasure": 0.08682731837424236, "rouge2_fmeasure_stderr": 0.0026421517017045306, "rouge2_precision": 0.07922006900478823, "rouge2_precision_stderr": 0.0033729424838290973, "rouge2_recall": 0.1729314235845569, "rouge2_recall_stderr": 0.0035720515220793716, "rougeL_fmeasure": 0.15100781291127027, "rougeL_fmeasure_stderr": 0.0033012121653411587, "rougeL_precision": 0.12981426632377385, "rougeL_precision_stderr": 0.004115558064535076, "rougeL_recall": 0.30716678048607904, "rougeL_recall_stderr": 0.004371914170063972, "rougeLsum_fmeasure": 0.1535308401357221, "rougeLsum_fmeasure_stderr": 0.003382020862859542, "rougeLsum_precision": 0.1327382198133406, "rougeLsum_precision_stderr": 0.004242541006964004, "rougeLsum_recall": 0.31045535547391245, "rougeLsum_recall_stderr": 0.004419769979750796}}, "3": {"PALM_prompt": {"bleu": 0.579303420349284, "bleu_stderr": 0.03070691818729103, "rouge1_fmeasure": 0.17192584224366533, "rouge1_fmeasure_stderr": 0.003981598663484289, "rouge1_precision": 0.14564236883492274, "rouge1_precision_stderr": 0.004570006061233058, "rouge1_recall": 0.34273155063758165, "rouge1_recall_stderr": 0.0049092564398085705, "rouge2_fmeasure": 0.09029622365274437, "rouge2_fmeasure_stderr": 0.0027516311728602635, "rouge2_precision": 0.07846599635038211, "rouge2_precision_stderr": 0.0031642057157483514, "rouge2_recall": 0.18308333983979522, "rouge2_recall_stderr": 0.0037611482801926245, "rougeL_fmeasure": 0.15460241770061486, "rougeL_fmeasure_stderr": 0.003337321935085864, "rougeL_precision": 0.12898664514808025, "rougeL_precision_stderr": 0.003874560467823768, "rougeL_recall": 0.3196500639450009, "rougeL_recall_stderr": 0.004463839362742239, "rougeLsum_fmeasure": 0.15832636884567902, "rougeLsum_fmeasure_stderr": 0.0034749866252351985, "rougeLsum_precision": 0.13294008530154622, "rougeLsum_precision_stderr": 0.004048051249427011, "rougeLsum_recall": 0.32359488765237576, "rougeLsum_recall_stderr": 0.004525961014634517}}, "4": {"PALM_prompt": {"bleu": 0.7508323900092823, "bleu_stderr": 0.051769496559376536, "rouge1_fmeasure": 0.1869024104643218, "rouge1_fmeasure_stderr": 0.004253968037546893, "rouge1_precision": 0.1576972285678001, "rouge1_precision_stderr": 0.00484503984635203, "rouge1_recall": 0.36394490355781656, "rouge1_recall_stderr": 0.0050308991182132345, "rouge2_fmeasure": 0.10170657902284919, "rouge2_fmeasure_stderr": 0.0030179631834473157, "rouge2_precision": 0.08770356291849396, "rouge2_precision_stderr": 0.0033611025435646164, "rouge2_recall": 0.2000290057900588, "rouge2_recall_stderr": 0.003953331921254398, "rougeL_fmeasure": 0.16743346298967118, "rougeL_fmeasure_stderr": 0.0036131612967653178, "rougeL_precision": 0.13896012083483686, "rougeL_precision_stderr": 0.004109905773203414, "rougeL_recall": 0.33722779081946314, "rougeL_recall_stderr": 0.00456715932509634, "rougeLsum_fmeasure": 0.1724905583779245, "rougeLsum_fmeasure_stderr": 0.0037847178446506636, "rougeLsum_precision": 0.14417324851937793, "rougeLsum_precision_stderr": 0.0043285030940103985, "rougeLsum_recall": 0.3433703383324822, "rougeLsum_recall_stderr": 0.004665980875994347}}, "5": {"PALM_prompt": {"bleu": 0.7585718867765139, "bleu_stderr": 0.03875579809012565, "rouge1_fmeasure": 0.1905467260711841, "rouge1_fmeasure_stderr": 0.004414987424299249, "rouge1_precision": 0.16283848668163364, "rouge1_precision_stderr": 0.005007785319382099, "rouge1_recall": 0.36291213320627547, "rouge1_recall_stderr": 0.005109556323070843, "rouge2_fmeasure": 0.10281580541714498, "rouge2_fmeasure_stderr": 0.0030780626856611935, "rouge2_precision": 0.08989826838593842, "rouge2_precision_stderr": 0.0034707168247376085, "rouge2_recall": 0.197506702227425, "rouge2_recall_stderr": 0.0039256445593364945, "rougeL_fmeasure": 0.1687224264092521, "rougeL_fmeasure_stderr": 0.003684562366384233, "rougeL_precision": 0.14144102299147554, "rougeL_precision_stderr": 0.004178467880093537, "rougeL_recall": 0.3343944651084762, "rougeL_recall_stderr": 0.004570275936684646, "rougeLsum_fmeasure": 0.17449394333145707, "rougeLsum_fmeasure_stderr": 0.003890562051611505, "rougeLsum_precision": 0.1475923003210134, "rougeLsum_precision_stderr": 0.004442064395172012, "rougeLsum_recall": 0.341002809904085, "rougeLsum_recall_stderr": 0.004689078879269339}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.342536584706368, "bleu_stderr": 0.08849297777910851, "rouge1_fmeasure": 0.11243999874212027, "rouge1_fmeasure_stderr": 0.002164767590792443, "rouge1_precision": 0.10084100701877136, "rouge1_precision_stderr": 0.0021822232536689488, "rouge1_recall": 0.1537148087959417, "rouge1_recall_stderr": 0.0029354730434610326, "rouge2_fmeasure": 0.021777445931524943, "rouge2_fmeasure_stderr": 0.000779211655168332, "rouge2_precision": 0.019569744916852705, "rouge2_precision_stderr": 0.0007931205203939353, "rouge2_recall": 0.030280499795756165, "rouge2_recall_stderr": 0.001141214709974854, "rougeL_fmeasure": 0.09116793015021148, "rougeL_fmeasure_stderr": 0.0016722585691851509, "rougeL_precision": 0.08143261981565801, "rougeL_precision_stderr": 0.001716843760236591, "rougeL_recall": 0.12657442687470177, "rougeL_recall_stderr": 0.002368616522700004, "rougeLsum_fmeasure": 0.10395880699124531, "rougeLsum_fmeasure_stderr": 0.0019942544791669574, "rougeLsum_precision": 0.09317871386007986, "rougeLsum_precision_stderr": 0.00202679291331779, "rougeLsum_recall": 0.14271836981299196, "rougeLsum_recall_stderr": 0.0027293604057795425}}, "1": {"tldr_en": {"bleu": 3.2004783899209834, "bleu_stderr": 0.1051547990688132, "rouge1_fmeasure": 0.19811974170046415, "rouge1_fmeasure_stderr": 0.002370481534408051, "rouge1_precision": 0.29220498640246884, "rouge1_precision_stderr": 0.004510483375329714, "rouge1_recall": 0.21056207914512326, "rouge1_recall_stderr": 0.002862993912076448, "rouge2_fmeasure": 0.05486345429358379, "rouge2_fmeasure_stderr": 0.0013983543393740758, "rouge2_precision": 0.09320868443715985, "rouge2_precision_stderr": 0.002951243018922277, "rouge2_recall": 0.0555993214685687, "rouge2_recall_stderr": 0.0014949682024923239, "rougeL_fmeasure": 0.15498144751222015, "rougeL_fmeasure_stderr": 0.0018789733170041269, "rougeL_precision": 0.23423603839828175, "rougeL_precision_stderr": 0.003904349603424068, "rougeL_recall": 0.16451539334697188, "rougeL_recall_stderr": 0.0022614983310227524, "rougeLsum_fmeasure": 0.18669768619880286, "rougeLsum_fmeasure_stderr": 0.0022412402690552744, "rougeLsum_precision": 0.27679621320346764, "rougeLsum_precision_stderr": 0.004345174609021784, "rougeLsum_recall": 0.19832595208410367, "rougeLsum_recall_stderr": 0.002698605664187125}}, "2": {"tldr_en": {"bleu": 3.6795406957109114, "bleu_stderr": 0.097399871906329, "rouge1_fmeasure": 0.22424297313492328, "rouge1_fmeasure_stderr": 0.002319036746211076, "rouge1_precision": 0.3525340746012154, "rouge1_precision_stderr": 0.004467804844257969, "rouge1_recall": 0.22264562345375213, "rouge1_recall_stderr": 0.002858154973416283, "rouge2_fmeasure": 0.06669343202973603, "rouge2_fmeasure_stderr": 0.0014962853551376627, "rouge2_precision": 0.11498341764248544, "rouge2_precision_stderr": 0.0030158658545186897, "rouge2_recall": 0.06423937910034487, "rouge2_recall_stderr": 0.0015779202639016519, "rougeL_fmeasure": 0.17754585692592612, "rougeL_fmeasure_stderr": 0.0018979417178323916, "rougeL_precision": 0.28463470891650217, "rougeL_precision_stderr": 0.0038915645253803526, "rougeL_recall": 0.17549568614773742, "rougeL_recall_stderr": 0.0022895404035897927, "rougeLsum_fmeasure": 0.21236591933537968, "rougeLsum_fmeasure_stderr": 0.0022121647472795004, "rougeLsum_precision": 0.335317909671499, "rougeLsum_precision_stderr": 0.004335252266828728, "rougeLsum_recall": 0.21052955540974397, "rougeLsum_recall_stderr": 0.002705259907732843}}, "3": {"tldr_en": {"bleu": 2.6529228625851395, "bleu_stderr": 0.06695145117325985, "rouge1_fmeasure": 0.18822521680672596, "rouge1_fmeasure_stderr": 0.002617296361824549, "rouge1_precision": 0.3049261199791451, "rouge1_precision_stderr": 0.0048083571132485625, "rouge1_recall": 0.1830127373796882, "rouge1_recall_stderr": 0.0029960157712551165, "rouge2_fmeasure": 0.05642140770577982, "rouge2_fmeasure_stderr": 0.0014287166524203647, "rouge2_precision": 0.10081001355275752, "rouge2_precision_stderr": 0.0029637521105942426, "rouge2_recall": 0.05354945333572987, "rouge2_recall_stderr": 0.0014886114498347263, "rougeL_fmeasure": 0.1492329166091523, "rougeL_fmeasure_stderr": 0.00209140087652073, "rougeL_precision": 0.24721479077809116, "rougeL_precision_stderr": 0.004125615926891615, "rougeL_recall": 0.14463835968636768, "rougeL_recall_stderr": 0.002389453896153913, "rougeLsum_fmeasure": 0.17816202805073278, "rougeLsum_fmeasure_stderr": 0.0024879553747040152, "rougeLsum_precision": 0.2904445190012587, "rougeLsum_precision_stderr": 0.004649224071030751, "rougeLsum_recall": 0.17268720279442448, "rougeLsum_recall_stderr": 0.002824015936150878}}, "4": {"tldr_en": {"bleu": 0.056804681949967316, "bleu_stderr": 0.010690345275534413, "rouge1_fmeasure": 0.06096360459839256, "rouge1_fmeasure_stderr": 0.0022037357882037844, "rouge1_precision": 0.10036011757294341, "rouge1_precision_stderr": 0.003828065967839756, "rouge1_recall": 0.05976859185068726, "rouge1_recall_stderr": 0.002374591625498158, "rouge2_fmeasure": 0.018135359558163425, "rouge2_fmeasure_stderr": 0.0009762156938567968, "rouge2_precision": 0.034172314432026826, "rouge2_precision_stderr": 0.0020947231263731035, "rouge2_recall": 0.017746804116333104, "rouge2_recall_stderr": 0.0010915110023364762, "rougeL_fmeasure": 0.048955315158774056, "rougeL_fmeasure_stderr": 0.0017762945473565504, "rougeL_precision": 0.08290724012011313, "rougeL_precision_stderr": 0.003283318294210223, "rougeL_recall": 0.04774180933859826, "rougeL_recall_stderr": 0.0019108742866020224, "rougeLsum_fmeasure": 0.05749923576482752, "rougeLsum_fmeasure_stderr": 0.0020730526611142247, "rougeLsum_precision": 0.09578765321068981, "rougeLsum_precision_stderr": 0.0036910942164747476, "rougeLsum_recall": 0.05616768754158701, "rougeLsum_recall_stderr": 0.002221027072581802}}, "5": {"tldr_en": {"bleu": 1.8602852496691957e-15, "bleu_stderr": 7.326278991313062e-14, "rouge1_fmeasure": 0.009040594267997637, "rouge1_fmeasure_stderr": 0.000910583028080826, "rouge1_precision": 0.016269144811300436, "rouge1_precision_stderr": 0.0017330627263105036, "rouge1_recall": 0.008992868782942299, "rouge1_recall_stderr": 0.0010023183417149886, "rouge2_fmeasure": 0.0029772370336828623, "rouge2_fmeasure_stderr": 0.00042374476014479795, "rouge2_precision": 0.00603444728982033, "rouge2_precision_stderr": 0.0009879493281734829, "rouge2_recall": 0.002907664570790127, "rouge2_recall_stderr": 0.00044404256219533566, "rougeL_fmeasure": 0.007363262152628281, "rougeL_fmeasure_stderr": 0.0007389345085151382, "rougeL_precision": 0.013636530561312142, "rougeL_precision_stderr": 0.0015048481430310666, "rougeL_recall": 0.007310481662486371, "rougeL_recall_stderr": 0.0008157930983246532, "rougeLsum_fmeasure": 0.008632369068789038, "rougeLsum_fmeasure_stderr": 0.0008708707742786176, "rougeLsum_precision": 0.0155806787512584, "rougeLsum_precision_stderr": 0.001671774406055537, "rougeLsum_recall": 0.00861961707329055, "rougeLsum_recall_stderr": 0.0009704867392879004}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, "rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}, "1": {"generate_text_restaurant": {"bleu": 0.004326822275913898, "bleu_stderr": 0.0014860868392970075, "rouge1_fmeasure": 0.03482971934623997, "rouge1_fmeasure_stderr": 0.0018674733169836441, "rouge1_precision": 0.058968309479278815, "rouge1_precision_stderr": 0.0030937332420023565, "rouge1_recall": 0.030823840874859645, "rouge1_recall_stderr": 0.001733437151299972, "rouge2_fmeasure": 0.013034000920115679, "rouge2_fmeasure_stderr": 0.0008234820993304426, "rouge2_precision": 0.026742477635741294, "rouge2_precision_stderr": 0.0018505815419979316, "rouge2_recall": 0.011324255646727236, "rouge2_recall_stderr": 0.0007596732722241375, "rougeL_fmeasure": 0.02743146688384779, "rougeL_fmeasure_stderr": 0.0014088315511498015, "rougeL_precision": 0.05010925528697452, "rougeL_precision_stderr": 0.0027377945669586406, "rougeL_recall": 0.024067231142752887, "rougeL_recall_stderr": 0.00130119291837391, "rougeLsum_fmeasure": 0.0317885909357075, "rougeLsum_fmeasure_stderr": 0.001711559137587224, "rougeLsum_precision": 0.05538025417425293, "rougeLsum_precision_stderr": 0.002960086478367767, "rougeLsum_recall": 0.027958304714186387, "rougeLsum_recall_stderr": 0.0015853148239341247}}, "2": {"generate_text_restaurant": {"bleu": 6.200047848676018, "bleu_stderr": 0.24054659400513925, "rouge1_fmeasure": 0.28844741889796105, "rouge1_fmeasure_stderr": 0.0045808299305253675, "rouge1_precision": 0.3214003289518496, "rouge1_precision_stderr": 0.005238100537720716, "rouge1_recall": 0.284478028112611, "rouge1_recall_stderr": 0.004743529526509979, "rouge2_fmeasure": 0.13358035874530175, "rouge2_fmeasure_stderr": 0.00249241678679737, "rouge2_precision": 0.15008595165910327, "rouge2_precision_stderr": 0.0028768202777924123, "rouge2_recall": 0.1322459611454104, "rouge2_recall_stderr": 0.0025975328144746245, "rougeL_fmeasure": 0.2033919638776235, "rougeL_fmeasure_stderr": 0.0033552917330628277, "rougeL_precision": 0.22779449309825947, "rougeL_precision_stderr": 0.0038970883792558597, "rougeL_recall": 0.2005511248209543, "rougeL_recall_stderr": 0.003483267890361711, "rougeLsum_fmeasure": 0.23275157582350312, "rougeLsum_fmeasure_stderr": 0.0038339428398296597, "rougeLsum_precision": 0.26024582037717414, "rougeLsum_precision_stderr": 0.004423604232728428, "rougeLsum_recall": 0.22918551883988122, "rougeLsum_recall_stderr": 0.003951066733561367}}, "3": {"generate_text_restaurant": {"bleu": 12.224128259289117, "bleu_stderr": 0.1131943064882355, "rouge1_fmeasure": 0.4766438856618967, "rouge1_fmeasure_stderr": 0.0025363961643216663, "rouge1_precision": 0.5543651713209715, "rouge1_precision_stderr": 0.0032935915949457687, "rouge1_recall": 0.4547427659659613, "rouge1_recall_stderr": 0.0031408322276990914, "rouge2_fmeasure": 0.22277617819890463, "rouge2_fmeasure_stderr": 0.002103801771067249, "rouge2_precision": 0.2607440662815023, "rouge2_precision_stderr": 0.002584890578187239, "rouge2_recall": 0.21365467393160184, "rouge2_recall_stderr": 0.0022995158726377913, "rougeL_fmeasure": 0.33469538109407226, "rougeL_fmeasure_stderr": 0.0021606704264442774, "rougeL_precision": 0.3918373597130441, "rougeL_precision_stderr": 0.002885461551537328, "rougeL_recall": 0.3186339099823638, "rougeL_recall_stderr": 0.0024998158606782906, "rougeLsum_fmeasure": 0.3800774546847204, "rougeLsum_fmeasure_stderr": 0.002431509578697271, "rougeLsum_precision": 0.44371023791903563, "rougeLsum_precision_stderr": 0.0031647756469437537, "rougeLsum_recall": 0.3621092150398747, "rougeLsum_recall_stderr": 0.0028249633749365005}}, "4": {"generate_text_restaurant": {"bleu": 12.817550214423665, "bleu_stderr": 0.15782925798606207, "rouge1_fmeasure": 0.48443702967140084, "rouge1_fmeasure_stderr": 0.0023494429444964656, "rouge1_precision": 0.5635073604223376, "rouge1_precision_stderr": 0.0031635297044165928, "rouge1_recall": 0.461114970168718, "rouge1_recall_stderr": 0.002943421071835856, "rouge2_fmeasure": 0.22831433189515238, "rouge2_fmeasure_stderr": 0.0021240715136561683, "rouge2_precision": 0.2672205748981713, "rouge2_precision_stderr": 0.0025895351630755756, "rouge2_recall": 0.2179187838829706, "rouge2_recall_stderr": 0.002285220106810577, "rougeL_fmeasure": 0.3425710261254338, "rougeL_fmeasure_stderr": 0.002101742060293759, "rougeL_precision": 0.3999713213111463, "rougeL_precision_stderr": 0.0028082247237324717, "rougeL_recall": 0.3259700616997284, "rougeL_recall_stderr": 0.0024434736091599415, "rougeLsum_fmeasure": 0.3894593959263041, "rougeLsum_fmeasure_stderr": 0.0023589628899129238, "rougeLsum_precision": 0.45324588701712326, "rougeLsum_precision_stderr": 0.003046010589730128, "rougeLsum_recall": 0.37080736877305864, "rougeLsum_recall_stderr": 0.002744994499618735}}, "5": {"generate_text_restaurant": {"bleu": 13.051174071516838, "bleu_stderr": 0.2110312895856706, "rouge1_fmeasure": 0.48855532878466623, "rouge1_fmeasure_stderr": 0.002375323146214924, "rouge1_precision": 0.5640539414895285, "rouge1_precision_stderr": 0.0031671577055334484, "rouge1_recall": 0.46564743766107397, "rouge1_recall_stderr": 0.0029379877152414316, "rouge2_fmeasure": 0.2331491796706113, "rouge2_fmeasure_stderr": 0.0021482124252456517, "rouge2_precision": 0.27153647342992987, "rouge2_precision_stderr": 0.002637174705922207, "rouge2_recall": 0.2222841755994956, "rouge2_recall_stderr": 0.002284857566592831, "rougeL_fmeasure": 0.34828358400400466, "rougeL_fmeasure_stderr": 0.0021652804436678532, "rougeL_precision": 0.40306890487172303, "rougeL_precision_stderr": 0.002821970160355803, "rougeL_recall": 0.33187532216109156, "rougeL_recall_stderr": 0.0024788769666433154, "rougeLsum_fmeasure": 0.39469458292739074, "rougeLsum_fmeasure_stderr": 0.002394817932044403, "rougeLsum_precision": 0.45577046674651234, "rougeLsum_precision_stderr": 0.0030568219622970076, "rougeLsum_recall": 0.37626439393943184, "rougeLsum_recall_stderr": 0.0027564177116925857}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8735006105340295, "bleu_stderr": 0.10320645407084378, "rouge1_fmeasure": 0.20135280851124157, "rouge1_fmeasure_stderr": 0.0024676671763789815, "rouge1_precision": 0.1466719087192455, "rouge1_precision_stderr": 0.0019539872335237757, "rouge1_recall": 0.3432282239187905, "rouge1_recall_stderr": 0.00430687923447409, "rouge2_fmeasure": 0.04598629367072178, "rouge2_fmeasure_stderr": 0.00153042075925482, "rouge2_precision": 0.03310445725784801, "rouge2_precision_stderr": 0.0011249607082800114, "rouge2_recall": 0.08089868525611035, "rouge2_recall_stderr": 0.0027717888057174154, "rougeL_fmeasure": 0.15345822797707173, "rougeL_fmeasure_stderr": 0.0018838649769431558, "rougeL_precision": 0.11159186184598226, "rougeL_precision_stderr": 0.001477636952497233, "rougeL_recall": 0.26327102447827944, "rougeL_recall_stderr": 0.003439337825926613, "rougeLsum_fmeasure": 0.15780611683822826, "rougeLsum_fmeasure_stderr": 0.0021423620363575256, "rougeLsum_precision": 0.11467397020173747, "rougeLsum_precision_stderr": 0.001645799192213771, "rougeLsum_recall": 0.2709104419548266, "rougeLsum_recall_stderr": 0.0038902222241056794}}, "1": {"article_DOC_summary": {"bleu": 1.6899997370466049, "bleu_stderr": 0.110619031864698, "rouge1_fmeasure": 0.19433316520820396, "rouge1_fmeasure_stderr": 0.0029534463597401307, "rouge1_precision": 0.16110321152835377, "rouge1_precision_stderr": 0.0033575248684591646, "rouge1_recall": 0.296650867747574, "rouge1_recall_stderr": 0.00414859635676739, "rouge2_fmeasure": 0.04280890210939285, "rouge2_fmeasure_stderr": 0.0018056604175163613, "rouge2_precision": 0.03699079787080637, "rouge2_precision_stderr": 0.0018578033358832057, "rouge2_recall": 0.06382029852295158, "rouge2_recall_stderr": 0.0025115117427719977, "rougeL_fmeasure": 0.1505154716699778, "rougeL_fmeasure_stderr": 0.0022990397739744138, "rougeL_precision": 0.12423313201559882, "rougeL_precision_stderr": 0.0026117553435255204, "rougeL_recall": 0.23181948242508482, "rougeL_recall_stderr": 0.0033284544605107205, "rougeLsum_fmeasure": 0.15157976809422116, "rougeLsum_fmeasure_stderr": 0.00239260501559873, "rougeLsum_precision": 0.12508664597678232, "rougeLsum_precision_stderr": 0.0026586305781588454, "rougeLsum_recall": 0.23346001764991783, "rougeLsum_recall_stderr": 0.003513030655878587}}, "2": {"article_DOC_summary": {"bleu": 2.3625362314641043, "bleu_stderr": 0.14750649952781789, "rouge1_fmeasure": 0.2296736514150804, "rouge1_fmeasure_stderr": 0.0034883498232922967, "rouge1_precision": 0.22300296135759085, "rouge1_precision_stderr": 0.0042012945059409414, "rouge1_recall": 0.277429168784763, "rouge1_recall_stderr": 0.004004826659234149, "rouge2_fmeasure": 0.054061483001217665, "rouge2_fmeasure_stderr": 0.002184485710060082, "rouge2_precision": 0.05375831447104746, "rouge2_precision_stderr": 0.002374418058021283, "rouge2_recall": 0.06405534076522049, "rouge2_recall_stderr": 0.00252850221748638, "rougeL_fmeasure": 0.1734125770645973, "rougeL_fmeasure_stderr": 0.0027461874651780096, "rougeL_precision": 0.16753114748509526, "rougeL_precision_stderr": 0.0032663751827037685, "rougeL_recall": 0.21151461032347418, "rougeL_recall_stderr": 0.0032308513968815305, "rougeLsum_fmeasure": 0.17467691666653457, "rougeLsum_fmeasure_stderr": 0.002795616182080349, "rougeLsum_precision": 0.168585227049474, "rougeLsum_precision_stderr": 0.0032851609020734607, "rougeLsum_recall": 0.21365023683762577, "rougeLsum_recall_stderr": 0.0034060433108231195}}, "3": {"article_DOC_summary": {"bleu": 2.777167100666753, "bleu_stderr": 0.25896124541561855, "rouge1_fmeasure": 0.2268153823830712, "rouge1_fmeasure_stderr": 0.0037756595891816386, "rouge1_precision": 0.23309113589842997, "rouge1_precision_stderr": 0.004495733518566236, "rouge1_recall": 0.25265960179525243, "rouge1_recall_stderr": 0.004243339934810083, "rouge2_fmeasure": 0.054853778845451784, "rouge2_fmeasure_stderr": 0.0021822842294678037, "rouge2_precision": 0.05791724618540958, "rouge2_precision_stderr": 0.0024966364806032018, "rouge2_recall": 0.059784534471793396, "rouge2_recall_stderr": 0.002427389911412999, "rougeL_fmeasure": 0.1726357010250971, "rougeL_fmeasure_stderr": 0.0030100972682100195, "rougeL_precision": 0.17773852708631135, "rougeL_precision_stderr": 0.003642815015026007, "rougeL_recall": 0.1933692923669615, "rougeL_recall_stderr": 0.0034166230979183706, "rougeLsum_fmeasure": 0.1738328006059657, "rougeLsum_fmeasure_stderr": 0.0030606580523114277, "rougeLsum_precision": 0.17903652416888485, "rougeLsum_precision_stderr": 0.003671959165016209, "rougeLsum_recall": 0.19455475712524214, "rougeLsum_recall_stderr": 0.0035367172360608948}}, "4": {"article_DOC_summary": {"bleu": 0.23114258991747624, "bleu_stderr": 0.0797853003835446, "rouge1_fmeasure": 0.05897317243356514, "rouge1_fmeasure_stderr": 0.0034748536443240967, "rouge1_precision": 0.06911456773901885, "rouge1_precision_stderr": 0.004187417057420757, "rouge1_recall": 0.06041108350866476, "rouge1_recall_stderr": 0.0036702344926798033, "rouge2_fmeasure": 0.014767437977446025, "rouge2_fmeasure_stderr": 0.0014371675702058476, "rouge2_precision": 0.017927410740768483, "rouge2_precision_stderr": 0.0019384500431498925, "rouge2_recall": 0.014851993476515502, "rouge2_recall_stderr": 0.0014351274374135084, "rougeL_fmeasure": 0.045230662051896874, "rougeL_fmeasure_stderr": 0.002702332048116945, "rougeL_precision": 0.05392252555894993, "rougeL_precision_stderr": 0.0033712857221675978, "rougeL_recall": 0.04602412020062712, "rougeL_recall_stderr": 0.002794124862823235, "rougeLsum_fmeasure": 0.045993855364812146, "rougeLsum_fmeasure_stderr": 0.002742078219418954, "rougeLsum_precision": 0.05456671018880701, "rougeLsum_precision_stderr": 0.00339292620603511, "rougeLsum_recall": 0.04713892804250872, "rougeLsum_recall_stderr": 0.0028855668767142633}}, "5": {"article_DOC_summary": {"bleu": 3.232878331759999e-53, "bleu_stderr": 9.090439382002912e-41, "rouge1_fmeasure": 0.001960934060128614, "rouge1_fmeasure_stderr": 0.0006100129143407545, "rouge1_precision": 0.0024976405336611164, "rouge1_precision_stderr": 0.0008018925400411893, "rouge1_recall": 0.00167361342000685, "rouge1_recall_stderr": 0.0005136897833313838, "rouge2_fmeasure": 0.0002632456641890604, "rouge2_fmeasure_stderr": 0.00014362991826181266, "rouge2_precision": 0.0003547044889997165, "rouge2_precision_stderr": 0.00020187650272638787, "rouge2_recall": 0.00021287674589561383, "rouge2_recall_stderr": 0.0001139201714225417, "rougeL_fmeasure": 0.0016906578735054498, "rougeL_fmeasure_stderr": 0.0005270751675662782, "rougeL_precision": 0.002131553171433103, "rougeL_precision_stderr": 0.0006859491324476021, "rougeL_recall": 0.0014558675347859545, "rougeL_recall_stderr": 0.0004490812299504436, "rougeLsum_fmeasure": 0.0017411068695704283, "rougeLsum_fmeasure_stderr": 0.0005435904933760577, "rougeLsum_precision": 0.0021851552297521425, "rougeLsum_precision_stderr": 0.0007003264434375038, "rougeLsum_recall": 0.001503513808847323, "rougeLsum_recall_stderr": 0.00046632363286219195}}}}