diff --git "a/4b284b28bc4/eval/merged.json" "b/4b284b28bc4/eval/merged.json" new file mode 100644--- /dev/null +++ "b/4b284b28bc4/eval/merged.json" @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2871925225988394, "bleu_stderr": 0.02880794237734816, "rouge1_fmeasure": 0.10707093959955763, "rouge1_fmeasure_stderr": 0.0019099253416430774, "rouge1_precision": 0.07031750338322859, "rouge1_precision_stderr": 0.0015362201736309874, "rouge1_recall": 0.3012155634284117, "rouge1_recall_stderr": 0.004546846231718025, "rouge2_fmeasure": 0.049917192299013896, "rouge2_fmeasure_stderr": 0.0012014538250113653, "rouge2_precision": 0.03252619427180376, "rouge2_precision_stderr": 0.0009041180535715348, "rouge2_recall": 0.14439430798437106, "rouge2_recall_stderr": 0.0030595449553106713, "rougeL_fmeasure": 0.10327080072990603, "rougeL_fmeasure_stderr": 0.0017814345648663893, "rougeL_precision": 0.06757287768017098, "rougeL_precision_stderr": 0.001414062975537199, "rougeL_recall": 0.2931917692240097, "rougeL_recall_stderr": 0.0044525581749204875, "rougeLsum_fmeasure": 0.10238598559666155, "rougeLsum_fmeasure_stderr": 0.001793711398293261, "rougeLsum_precision": 0.06720725044881726, "rougeLsum_precision_stderr": 0.0014407633141713585, "rougeLsum_recall": 0.2882066628071488, "rougeLsum_recall_stderr": 0.0042625595600572705}, "explicit-graph-description2": {"bleu": 0.16810516169972559, "bleu_stderr": 0.03835304479839481, "rouge1_fmeasure": 0.08821581167136193, "rouge1_fmeasure_stderr": 0.0020280559575042486, "rouge1_precision": 0.09435580335073716, "rouge1_precision_stderr": 0.002596237406338737, "rouge1_recall": 0.1516877183797435, "rouge1_recall_stderr": 0.002163617861294294, "rouge2_fmeasure": 0.00537375341399136, "rouge2_fmeasure_stderr": 0.0004491934033010578, "rouge2_precision": 0.0053159784831301035, "rouge2_precision_stderr": 0.00043010712261271663, "rouge2_recall": 0.009522281054005119, "rouge2_recall_stderr": 0.0010761109699630326, "rougeL_fmeasure": 0.07551459506478578, "rougeL_fmeasure_stderr": 0.0016129890419721987, "rougeL_precision": 0.07882470114338755, "rougeL_precision_stderr": 0.0020788842929298805, "rougeL_recall": 0.13940682930714066, "rougeL_recall_stderr": 0.0019495456097635088, "rougeLsum_fmeasure": 0.06986508795300185, "rougeLsum_fmeasure_stderr": 0.0017500362429631329, "rougeLsum_precision": 0.0777715925714806, "rougeLsum_precision_stderr": 0.002284643122449491, "rougeLsum_recall": 0.11130068960104114, "rougeLsum_recall_stderr": 0.0018656577889789815}, "implicit-graph-description": {"bleu": 0.13642739401224172, "bleu_stderr": 0.019132472003375194, "rouge1_fmeasure": 0.042903100190703786, "rouge1_fmeasure_stderr": 0.0007010978773798984, "rouge1_precision": 0.02512636113552703, "rouge1_precision_stderr": 0.0007036632388253778, "rouge1_recall": 0.21863243644000946, "rouge1_recall_stderr": 0.002521017541213538, "rouge2_fmeasure": 0.0034599979122394713, "rouge2_fmeasure_stderr": 0.00029151015532393696, "rouge2_precision": 0.002350066060168301, "rouge2_precision_stderr": 0.0004125290354570353, "rouge2_recall": 0.01850848216392922, "rouge2_recall_stderr": 0.001239122346922424, "rougeL_fmeasure": 0.04253972308092736, "rougeL_fmeasure_stderr": 0.0006683881645207197, "rougeL_precision": 0.024907100271538515, "rougeL_precision_stderr": 0.000690766936930914, "rougeL_recall": 0.21742396152099233, "rougeL_recall_stderr": 0.0024595518988296725, "rougeLsum_fmeasure": 0.028854953753925324, "rougeLsum_fmeasure_stderr": 0.0005590058998891695, "rougeLsum_precision": 0.017035317660162044, "rougeLsum_precision_stderr": 0.0006541825206071444, "rougeLsum_recall": 0.15198121220346103, "rougeLsum_recall_stderr": 0.002036913011848439}, "non-explicit-description": {"bleu": 0.009519886388116046, "bleu_stderr": 0.0030236756925875077, "rouge1_fmeasure": 0.023310980636958793, "rouge1_fmeasure_stderr": 0.000452486408355606, "rouge1_precision": 0.013350416749489698, "rouge1_precision_stderr": 0.00028057458287476263, "rouge1_recall": 0.11566374488953345, "rouge1_recall_stderr": 0.001749855882209205, "rouge2_fmeasure": 0.0007679014767691473, "rouge2_fmeasure_stderr": 0.00012819459126050034, "rouge2_precision": 0.0004485308245799727, "rouge2_precision_stderr": 7.901239884774927e-05, "rouge2_recall": 0.0036672786059082207, "rouge2_recall_stderr": 0.0005157087714575659, "rougeL_fmeasure": 0.02320710445968874, "rougeL_fmeasure_stderr": 0.00043026318572661666, "rougeL_precision": 0.013285581389076958, "rougeL_precision_stderr": 0.0002656851189258093, "rougeL_recall": 0.11534380952368356, "rougeL_recall_stderr": 0.0017118106609243738, "rougeLsum_fmeasure": 0.020705297270136803, "rougeLsum_fmeasure_stderr": 0.0003655496821761756, "rougeLsum_precision": 0.011828106724862774, "rougeLsum_precision_stderr": 0.00022725277636703424, "rougeLsum_recall": 0.10529252247490435, "rougeLsum_recall_stderr": 0.001497843448633412}, "very-explicit-description": {"bleu": 0.010910686930411773, "bleu_stderr": 0.00030602545126491815, "rouge1_fmeasure": 0.06975157019283658, "rouge1_fmeasure_stderr": 0.0009137011093224832, "rouge1_precision": 0.10655760753679919, "rouge1_precision_stderr": 0.0019077191437516404, "rouge1_recall": 0.12089604584843483, "rouge1_recall_stderr": 0.002329629845339565, "rouge2_fmeasure": 0.0005675109134810492, "rouge2_fmeasure_stderr": 5.471478866388383e-05, "rouge2_precision": 0.0003312213456760952, "rouge2_precision_stderr": 4.9493114517125344e-05, "rouge2_recall": 0.005980525954602008, "rouge2_recall_stderr": 0.0005793862846553182, "rougeL_fmeasure": 0.06626241359825397, "rougeL_fmeasure_stderr": 0.0008570407826010054, "rougeL_precision": 0.1004586755525386, "rougeL_precision_stderr": 0.0017955289781684024, "rougeL_recall": 0.1184027498174915, "rougeL_recall_stderr": 0.0023585973835660435, "rougeLsum_fmeasure": 0.06538396642694783, "rougeLsum_fmeasure_stderr": 0.0009096365360827909, "rougeLsum_precision": 0.1014087241228341, "rougeLsum_precision_stderr": 0.001861610563145598, "rougeLsum_recall": 0.10839827017927679, "rougeLsum_recall_stderr": 0.002044855763678236}}, "1": {"PALM_prompt": {"bleu": 0.4674006237665374, "bleu_stderr": 0.03675015156688127, "rouge1_fmeasure": 0.11843411039548267, "rouge1_fmeasure_stderr": 0.001895876971489225, "rouge1_precision": 0.07708396436923028, "rouge1_precision_stderr": 0.001529013971135644, "rouge1_recall": 0.36235242098066, "rouge1_recall_stderr": 0.00504305962540817, "rouge2_fmeasure": 0.05553061893758205, "rouge2_fmeasure_stderr": 0.0012200850248834274, "rouge2_precision": 0.035725346847754684, "rouge2_precision_stderr": 0.0008827277116185362, "rouge2_recall": 0.17866943863024684, "rouge2_recall_stderr": 0.003599106083841207, "rougeL_fmeasure": 0.11228525846628372, "rougeL_fmeasure_stderr": 0.0017420331023114827, "rougeL_precision": 0.07294152863590639, "rougeL_precision_stderr": 0.0014064426998360536, "rougeL_recall": 0.3437168036630356, "rougeL_recall_stderr": 0.004711682138202734, "rougeLsum_fmeasure": 0.11267135213673385, "rougeLsum_fmeasure_stderr": 0.0017806522868172628, "rougeLsum_precision": 0.07337404630465882, "rougeLsum_precision_stderr": 0.0014496612765965426, "rougeLsum_recall": 0.3436003452114997, "rougeLsum_recall_stderr": 0.004616609203187745}, "explicit-graph-description2": {"bleu": 2.6466191277757654, "bleu_stderr": 0.15100935076720456, "rouge1_fmeasure": 0.31536194630864184, "rouge1_fmeasure_stderr": 0.0048986014551623686, "rouge1_precision": 0.32500435790610316, "rouge1_precision_stderr": 0.006219134826883584, "rouge1_recall": 0.46243909963396884, "rouge1_recall_stderr": 0.005727997203394979, "rouge2_fmeasure": 0.1493719195270224, "rouge2_fmeasure_stderr": 0.0034781292314063805, "rouge2_precision": 0.15832835758132308, "rouge2_precision_stderr": 0.004379479319821069, "rouge2_recall": 0.21989028843694963, "rouge2_recall_stderr": 0.00443907246545659, "rougeL_fmeasure": 0.24943460339721188, "rougeL_fmeasure_stderr": 0.003903652479227547, "rougeL_precision": 0.25838083394802175, "rougeL_precision_stderr": 0.005195219904829289, "rougeL_recall": 0.3797932387763246, "rougeL_recall_stderr": 0.004882479498294952, "rougeLsum_fmeasure": 0.2742602935438785, "rougeLsum_fmeasure_stderr": 0.004363579640120447, "rougeLsum_precision": 0.28371843954863074, "rougeLsum_precision_stderr": 0.00558326351046381, "rougeLsum_recall": 0.4042180845615538, "rougeLsum_recall_stderr": 0.005247552299585261}, "implicit-graph-description": {"bleu": 1.2344773114465395, "bleu_stderr": 0.06144744886971806, "rouge1_fmeasure": 0.14240938766090952, "rouge1_fmeasure_stderr": 0.0026994539955292815, "rouge1_precision": 0.09359962530750512, "rouge1_precision_stderr": 0.0025105917511609984, "rouge1_recall": 0.522665803339837, "rouge1_recall_stderr": 0.004850735788452025, "rouge2_fmeasure": 0.06073259012334097, "rouge2_fmeasure_stderr": 0.0016141608477161177, "rouge2_precision": 0.04032025283529164, "rouge2_precision_stderr": 0.0014131816715617135, "rouge2_recall": 0.23323991686673376, "rouge2_recall_stderr": 0.004275513488096086, "rougeL_fmeasure": 0.11888572362932227, "rougeL_fmeasure_stderr": 0.001965744751513682, "rougeL_precision": 0.0769023862248869, "rougeL_precision_stderr": 0.0018572847386138004, "rougeL_recall": 0.4625095347149072, "rougeL_recall_stderr": 0.004342267829958794, "rougeLsum_fmeasure": 0.1245843999084538, "rougeLsum_fmeasure_stderr": 0.0024393680091218312, "rougeLsum_precision": 0.08184718121577335, "rougeLsum_precision_stderr": 0.002207850839836764, "rougeLsum_recall": 0.4608387006930489, "rougeLsum_recall_stderr": 0.004681197366055284}, "non-explicit-description": {"bleu": 2.191415616124935, "bleu_stderr": 0.09680100323362864, "rouge1_fmeasure": 0.25888123233754007, "rouge1_fmeasure_stderr": 0.0031768498729056785, "rouge1_precision": 0.18596600590889065, "rouge1_precision_stderr": 0.003645906867093087, "rouge1_recall": 0.7000841391242195, "rouge1_recall_stderr": 0.00407563035243576, "rouge2_fmeasure": 0.12187611759524282, "rouge2_fmeasure_stderr": 0.002198007938376146, "rouge2_precision": 0.08857846914965355, "rouge2_precision_stderr": 0.002396373719643474, "rouge2_recall": 0.34773296216867, "rouge2_recall_stderr": 0.004163769887333789, "rougeL_fmeasure": 0.20438070771678543, "rougeL_fmeasure_stderr": 0.002413318992124119, "rougeL_precision": 0.14607148785062837, "rougeL_precision_stderr": 0.0029509545373158728, "rougeL_recall": 0.582387391289673, "rougeL_recall_stderr": 0.004300891719123408, "rougeLsum_fmeasure": 0.22097954689386484, "rougeLsum_fmeasure_stderr": 0.0028005845900640268, "rougeLsum_precision": 0.1590760662106541, "rougeLsum_precision_stderr": 0.0032411687263806696, "rougeLsum_recall": 0.6045971668841404, "rougeLsum_recall_stderr": 0.003947144030126864}, "very-explicit-description": {"bleu": 1.5758893404824672, "bleu_stderr": 0.05552280312712737, "rouge1_fmeasure": 0.13287078506554892, "rouge1_fmeasure_stderr": 0.0017687380102785963, "rouge1_precision": 0.08062063267183606, "rouge1_precision_stderr": 0.0016464001390685112, "rouge1_recall": 0.5804077018598791, "rouge1_recall_stderr": 0.004705014575654514, "rouge2_fmeasure": 0.058019313464160664, "rouge2_fmeasure_stderr": 0.0010621558481173597, "rouge2_precision": 0.03519804726011007, "rouge2_precision_stderr": 0.0009691687220744844, "rouge2_recall": 0.284901492168855, "rouge2_recall_stderr": 0.004424090476216274, "rougeL_fmeasure": 0.12184770112424802, "rougeL_fmeasure_stderr": 0.0014608368603050287, "rougeL_precision": 0.07336501049936618, "rougeL_precision_stderr": 0.001340183479865301, "rougeL_recall": 0.545991165694801, "rougeL_recall_stderr": 0.004636447373365752, "rougeLsum_fmeasure": 0.11269533007372474, "rougeLsum_fmeasure_stderr": 0.0015437562768583104, "rougeLsum_precision": 0.06830124095667763, "rougeLsum_precision_stderr": 0.001429554164098288, "rougeLsum_recall": 0.5022686537167637, "rougeLsum_recall_stderr": 0.00445439816772978}}, "2": {"PALM_prompt": {"bleu": 0.5250778439407279, "bleu_stderr": 0.025625496064299234, "rouge1_fmeasure": 0.1222441134481138, "rouge1_fmeasure_stderr": 0.0019020155784340502, "rouge1_precision": 0.0786831611862841, "rouge1_precision_stderr": 0.0014969718758498687, "rouge1_recall": 0.3956998500897082, "rouge1_recall_stderr": 0.005131459404981971, "rouge2_fmeasure": 0.057331612844470456, "rouge2_fmeasure_stderr": 0.001226447389142751, "rouge2_precision": 0.03682232020364324, "rouge2_precision_stderr": 0.0009265440871287532, "rouge2_recall": 0.19779728011829195, "rouge2_recall_stderr": 0.0038163400420330356, "rougeL_fmeasure": 0.11353522248821303, "rougeL_fmeasure_stderr": 0.001703018837429438, "rougeL_precision": 0.07302274174131225, "rougeL_precision_stderr": 0.001338647584797816, "rougeL_recall": 0.36618568072469376, "rougeL_recall_stderr": 0.004622159858156629, "rougeLsum_fmeasure": 0.11628410210009521, "rougeLsum_fmeasure_stderr": 0.0017922853862293046, "rougeLsum_precision": 0.07488527729009754, "rougeLsum_precision_stderr": 0.001411146414679001, "rougeLsum_recall": 0.3754608054763451, "rougeLsum_recall_stderr": 0.004798151749531035}, "explicit-graph-description2": {"bleu": 7.5083947202338726, "bleu_stderr": 0.24848690178409746, "rouge1_fmeasure": 0.5130705309379091, "rouge1_fmeasure_stderr": 0.004347345042742489, "rouge1_precision": 0.616964955469801, "rouge1_precision_stderr": 0.00592579769672606, "rouge1_recall": 0.5254416101366304, "rouge1_recall_stderr": 0.004868870996231356, "rouge2_fmeasure": 0.29741471013327875, "rouge2_fmeasure_stderr": 0.004142260867688268, "rouge2_precision": 0.3649050910641516, "rouge2_precision_stderr": 0.005264819050488393, "rouge2_recall": 0.3044911557323109, "rouge2_recall_stderr": 0.00442897115331052, "rougeL_fmeasure": 0.41944130358080134, "rougeL_fmeasure_stderr": 0.004138185147643418, "rougeL_precision": 0.5063613364405887, "rougeL_precision_stderr": 0.005565487436877947, "rougeL_recall": 0.4323292288126833, "rougeL_recall_stderr": 0.004568270840466541, "rougeLsum_fmeasure": 0.4498297629937119, "rougeLsum_fmeasure_stderr": 0.004109002891379284, "rougeLsum_precision": 0.5436203304123873, "rougeLsum_precision_stderr": 0.005624136142894095, "rougeLsum_recall": 0.4628187968326305, "rougeLsum_recall_stderr": 0.004668999589191003}, "implicit-graph-description": {"bleu": 1.6773713320753056, "bleu_stderr": 0.055884247961949976, "rouge1_fmeasure": 0.2057962810551555, "rouge1_fmeasure_stderr": 0.003760812343326168, "rouge1_precision": 0.17374754160822523, "rouge1_precision_stderr": 0.0050939685408092975, "rouge1_recall": 0.5730195532268542, "rouge1_recall_stderr": 0.004521823239131311, "rouge2_fmeasure": 0.10509886424101751, "rouge2_fmeasure_stderr": 0.002496931131477509, "rouge2_precision": 0.09140223665380956, "rouge2_precision_stderr": 0.0032890628357242144, "rouge2_recall": 0.3074558179703969, "rouge2_recall_stderr": 0.004375074576693778, "rougeL_fmeasure": 0.16838171411089273, "rougeL_fmeasure_stderr": 0.0029883563268557796, "rougeL_precision": 0.14133526089953277, "rougeL_precision_stderr": 0.004177683006061348, "rougeL_recall": 0.49304734486333374, "rougeL_recall_stderr": 0.004509829618388591, "rougeLsum_fmeasure": 0.1825157877706738, "rougeLsum_fmeasure_stderr": 0.0033370782552026696, "rougeLsum_precision": 0.15428774706595433, "rougeLsum_precision_stderr": 0.004553930547293362, "rougeLsum_recall": 0.5140723424370337, "rougeLsum_recall_stderr": 0.004433969406651666}, "non-explicit-description": {"bleu": 2.8569076256551087, "bleu_stderr": 0.12052575313613716, "rouge1_fmeasure": 0.2805422037797494, "rouge1_fmeasure_stderr": 0.0033268027640946463, "rouge1_precision": 0.1980304639632034, "rouge1_precision_stderr": 0.003492199137777792, "rouge1_recall": 0.7112810316711771, "rouge1_recall_stderr": 0.00376166113477323, "rouge2_fmeasure": 0.13915045372935977, "rouge2_fmeasure_stderr": 0.0023234212020214113, "rouge2_precision": 0.09886442937653378, "rouge2_precision_stderr": 0.002345021679529973, "rouge2_recall": 0.37290220171207966, "rouge2_recall_stderr": 0.004228119092633622, "rougeL_fmeasure": 0.2169472494379018, "rougeL_fmeasure_stderr": 0.0025420675625343024, "rougeL_precision": 0.15240021506621623, "rougeL_precision_stderr": 0.002802864425131344, "rougeL_recall": 0.5764693524969581, "rougeL_recall_stderr": 0.004032933171743271, "rougeLsum_fmeasure": 0.24203406785259418, "rougeLsum_fmeasure_stderr": 0.0029372092841247473, "rougeLsum_precision": 0.17080807902531406, "rougeLsum_precision_stderr": 0.0031067593345175896, "rougeLsum_recall": 0.6220206309306955, "rougeLsum_recall_stderr": 0.0037970602252338824}, "very-explicit-description": {"bleu": 3.229929819791474, "bleu_stderr": 0.10809788252140991, "rouge1_fmeasure": 0.3340303326789718, "rouge1_fmeasure_stderr": 0.005770276552964151, "rouge1_precision": 0.3420015606083595, "rouge1_precision_stderr": 0.00759844434394234, "rouge1_recall": 0.6045685729269148, "rouge1_recall_stderr": 0.004463808851257848, "rouge2_fmeasure": 0.18536983565498788, "rouge2_fmeasure_stderr": 0.004299838463484588, "rouge2_precision": 0.19480755638535052, "rouge2_precision_stderr": 0.005308301829142384, "rouge2_recall": 0.33266720411338324, "rouge2_recall_stderr": 0.0045078698777430235, "rougeL_fmeasure": 0.27872869979256815, "rougeL_fmeasure_stderr": 0.004804872599934385, "rougeL_precision": 0.2821382753377313, "rougeL_precision_stderr": 0.0063280271310375115, "rougeL_recall": 0.532955297379096, "rougeL_recall_stderr": 0.004605656886036717, "rougeLsum_fmeasure": 0.2902240542226799, "rougeLsum_fmeasure_stderr": 0.005189448695060585, "rougeLsum_precision": 0.298538665654306, "rougeLsum_precision_stderr": 0.0067989076976235135, "rougeLsum_recall": 0.5279377913839256, "rougeLsum_recall_stderr": 0.004350056352869954}}, "3": {"PALM_prompt": {"bleu": 0.6240971401779115, "bleu_stderr": 0.03840020245332954, "rouge1_fmeasure": 0.12413425365513392, "rouge1_fmeasure_stderr": 0.0018801730074252724, "rouge1_precision": 0.07946921950052364, "rouge1_precision_stderr": 0.0015734983149428565, "rouge1_recall": 0.41443709705557313, "rouge1_recall_stderr": 0.0052063482293464285, "rouge2_fmeasure": 0.05836966723015618, "rouge2_fmeasure_stderr": 0.0012404189838209753, "rouge2_precision": 0.03727374629297188, "rouge2_precision_stderr": 0.0010068745541143666, "rouge2_recall": 0.20933728572588564, "rouge2_recall_stderr": 0.0039729036741121115, "rougeL_fmeasure": 0.11463438747216832, "rougeL_fmeasure_stderr": 0.0016925693943895671, "rougeL_precision": 0.07340984582992345, "rougeL_precision_stderr": 0.0014110667213403662, "rougeL_recall": 0.3803567082991153, "rougeL_recall_stderr": 0.004624202286888448, "rougeLsum_fmeasure": 0.11802649327609507, "rougeLsum_fmeasure_stderr": 0.0017907209912884955, "rougeLsum_precision": 0.07570628299965847, "rougeLsum_precision_stderr": 0.0015124490142500056, "rougeLsum_recall": 0.39213432148715394, "rougeLsum_recall_stderr": 0.004828378800565317}, "explicit-graph-description2": {"bleu": 9.28684502126115, "bleu_stderr": 0.5074769619297956, "rouge1_fmeasure": 0.5412014771452126, "rouge1_fmeasure_stderr": 0.004008094970635858, "rouge1_precision": 0.6407710071562776, "rouge1_precision_stderr": 0.005336259744860889, "rouge1_recall": 0.5360462579768369, "rouge1_recall_stderr": 0.004707261284923473, "rouge2_fmeasure": 0.31908027874923556, "rouge2_fmeasure_stderr": 0.004128300468278769, "rouge2_precision": 0.38414086267020836, "rouge2_precision_stderr": 0.00510054295724189, "rouge2_recall": 0.316436535679906, "rouge2_recall_stderr": 0.004429375693886751, "rougeL_fmeasure": 0.4439501498676375, "rougeL_fmeasure_stderr": 0.0040343196115624315, "rougeL_precision": 0.5276989567128523, "rougeL_precision_stderr": 0.005209222935383238, "rougeL_recall": 0.44075459787644217, "rougeL_recall_stderr": 0.0045004795324398335, "rougeLsum_fmeasure": 0.47592175675183224, "rougeLsum_fmeasure_stderr": 0.003941128663060527, "rougeLsum_precision": 0.5658065072497239, "rougeLsum_precision_stderr": 0.005198931528391741, "rougeLsum_recall": 0.47242630953998244, "rougeLsum_recall_stderr": 0.004540483814405851}, "implicit-graph-description": {"bleu": 1.688973732407593, "bleu_stderr": 0.0437940722424649, "rouge1_fmeasure": 0.23997327415870215, "rouge1_fmeasure_stderr": 0.00426770453497841, "rouge1_precision": 0.2263783160965279, "rouge1_precision_stderr": 0.006171991779324731, "rouge1_recall": 0.5624565194412028, "rouge1_recall_stderr": 0.0046331481122687, "rouge2_fmeasure": 0.12855310203356, "rouge2_fmeasure_stderr": 0.002943917792986252, "rouge2_precision": 0.12650521502940273, "rouge2_precision_stderr": 0.004187905052421771, "rouge2_recall": 0.3116380965308516, "rouge2_recall_stderr": 0.004367945574727286, "rougeL_fmeasure": 0.19630420632254614, "rougeL_fmeasure_stderr": 0.003553339268118638, "rougeL_precision": 0.18506548497150052, "rougeL_precision_stderr": 0.005182255852735757, "rougeL_recall": 0.47864760075928686, "rougeL_recall_stderr": 0.004626469457756582, "rougeLsum_fmeasure": 0.21346988106806253, "rougeLsum_fmeasure_stderr": 0.003833256489070223, "rougeLsum_precision": 0.20150484374278746, "rougeLsum_precision_stderr": 0.005543691944915573, "rougeLsum_recall": 0.5064505915998677, "rougeLsum_recall_stderr": 0.004511689198493829}, "non-explicit-description": {"bleu": 3.009544719305422, "bleu_stderr": 0.12091914294594716, "rouge1_fmeasure": 0.27975069293703747, "rouge1_fmeasure_stderr": 0.0031284036937637358, "rouge1_precision": 0.1933328502928586, "rouge1_precision_stderr": 0.003099530624368226, "rouge1_recall": 0.7011350938451281, "rouge1_recall_stderr": 0.0037491005183801886, "rouge2_fmeasure": 0.13862295274121508, "rouge2_fmeasure_stderr": 0.002135737901854583, "rouge2_precision": 0.09519751402031547, "rouge2_precision_stderr": 0.0020025816969127396, "rouge2_recall": 0.3723729566418955, "rouge2_recall_stderr": 0.0042697919686992426, "rougeL_fmeasure": 0.21407754529660591, "rougeL_fmeasure_stderr": 0.0023998887716300023, "rougeL_precision": 0.1470822171154783, "rougeL_precision_stderr": 0.0024663239297952433, "rougeL_recall": 0.5599189483902042, "rougeL_recall_stderr": 0.004020046406729631, "rougeLsum_fmeasure": 0.24241624159869068, "rougeLsum_fmeasure_stderr": 0.0027709885961488493, "rougeLsum_precision": 0.1673033495587541, "rougeLsum_precision_stderr": 0.002759668112872226, "rougeLsum_recall": 0.6165957546031788, "rougeLsum_recall_stderr": 0.0037550091734433974}, "very-explicit-description": {"bleu": 5.459046430007518, "bleu_stderr": 0.21326177813295816, "rouge1_fmeasure": 0.4624643695844432, "rouge1_fmeasure_stderr": 0.005423092759830048, "rouge1_precision": 0.5037779287889935, "rouge1_precision_stderr": 0.007188822322314131, "rouge1_recall": 0.6013235088491181, "rouge1_recall_stderr": 0.004702106644962896, "rouge2_fmeasure": 0.26767500932744387, "rouge2_fmeasure_stderr": 0.004580954162492676, "rouge2_precision": 0.2967411625801182, "rouge2_precision_stderr": 0.005575768515677308, "rouge2_recall": 0.3427937666222623, "rouge2_recall_stderr": 0.0045155157665415, "rougeL_fmeasure": 0.3727832655109453, "rougeL_fmeasure_stderr": 0.004858552003307264, "rougeL_precision": 0.4061070236594833, "rougeL_precision_stderr": 0.006289328700511993, "rougeL_recall": 0.49351648432022127, "rougeL_recall_stderr": 0.00462047959595513, "rougeLsum_fmeasure": 0.4039703218874318, "rougeLsum_fmeasure_stderr": 0.00498186857977818, "rougeLsum_precision": 0.4402820106855099, "rougeLsum_precision_stderr": 0.006558923231150342, "rougeLsum_recall": 0.5309933184677589, "rougeLsum_recall_stderr": 0.004638055543942346}}, "4": {"PALM_prompt": {"bleu": 0.6460958847523566, "bleu_stderr": 0.03922360003785139, "rouge1_fmeasure": 0.12269659017119199, "rouge1_fmeasure_stderr": 0.0017597424238451932, "rouge1_precision": 0.07760008602513783, "rouge1_precision_stderr": 0.001320690166501572, "rouge1_recall": 0.4207626130570328, "rouge1_recall_stderr": 0.0051343858175420766, "rouge2_fmeasure": 0.0577700863367864, "rouge2_fmeasure_stderr": 0.001158809400479912, "rouge2_precision": 0.03635274690781486, "rouge2_precision_stderr": 0.0008330185169416092, "rouge2_recall": 0.2129557453468375, "rouge2_recall_stderr": 0.003851289229933982, "rougeL_fmeasure": 0.11231459598044416, "rougeL_fmeasure_stderr": 0.0015694568258752962, "rougeL_precision": 0.07107029230772488, "rougeL_precision_stderr": 0.001171387599016451, "rougeL_recall": 0.38200909854824977, "rougeL_recall_stderr": 0.004464667845112728, "rougeLsum_fmeasure": 0.11625392090888889, "rougeLsum_fmeasure_stderr": 0.0016649015391818642, "rougeLsum_precision": 0.07356474812940257, "rougeLsum_precision_stderr": 0.0012438027129333135, "rougeLsum_recall": 0.39729543398826606, "rougeLsum_recall_stderr": 0.004755911479219925}, "explicit-graph-description2": {"bleu": 9.795110512629257, "bleu_stderr": 0.43643537944667626, "rouge1_fmeasure": 0.5464856118324216, "rouge1_fmeasure_stderr": 0.003910057721192816, "rouge1_precision": 0.6472264998572703, "rouge1_precision_stderr": 0.005232105513902178, "rouge1_recall": 0.5366498372396236, "rouge1_recall_stderr": 0.0046402228185357716, "rouge2_fmeasure": 0.32559945531385337, "rouge2_fmeasure_stderr": 0.004008792699742887, "rouge2_precision": 0.39097181878893417, "rouge2_precision_stderr": 0.005010695619680332, "rouge2_recall": 0.3220332631389833, "rouge2_recall_stderr": 0.004380478546023843, "rougeL_fmeasure": 0.4482338075737081, "rougeL_fmeasure_stderr": 0.0038829126617500277, "rougeL_precision": 0.5313681099482047, "rougeL_precision_stderr": 0.0050483302509796105, "rougeL_recall": 0.44300639511893825, "rougeL_recall_stderr": 0.004479689083403708, "rougeLsum_fmeasure": 0.47944694202870075, "rougeLsum_fmeasure_stderr": 0.0038182134462323098, "rougeLsum_precision": 0.569330259899693, "rougeLsum_precision_stderr": 0.005059793660411658, "rougeLsum_recall": 0.4721692944626995, "rougeLsum_recall_stderr": 0.0044818770782843265}, "implicit-graph-description": {"bleu": 1.670081659519132, "bleu_stderr": 0.04318517964945629, "rouge1_fmeasure": 0.24712078753708633, "rouge1_fmeasure_stderr": 0.004314381507879295, "rouge1_precision": 0.2397708671243343, "rouge1_precision_stderr": 0.0064228338007496924, "rouge1_recall": 0.5485806406380264, "rouge1_recall_stderr": 0.004720768862432214, "rouge2_fmeasure": 0.13327902883600057, "rouge2_fmeasure_stderr": 0.0029460826534397667, "rouge2_precision": 0.132848179955477, "rouge2_precision_stderr": 0.00420337986829722, "rouge2_recall": 0.30940781093399633, "rouge2_recall_stderr": 0.004367428964629067, "rougeL_fmeasure": 0.20127645493389038, "rougeL_fmeasure_stderr": 0.0035809225423920244, "rougeL_precision": 0.19425619955945797, "rougeL_precision_stderr": 0.005325133026757208, "rougeL_recall": 0.46533329832959947, "rougeL_recall_stderr": 0.004667920119273198, "rougeLsum_fmeasure": 0.21840163342192948, "rougeLsum_fmeasure_stderr": 0.0038191209490347515, "rougeLsum_precision": 0.21110404564884136, "rougeLsum_precision_stderr": 0.00569641155080923, "rougeLsum_recall": 0.4921812900394637, "rougeLsum_recall_stderr": 0.004548259054500709}, "non-explicit-description": {"bleu": 2.9471554297303775, "bleu_stderr": 0.09562350894285683, "rouge1_fmeasure": 0.2677925532107245, "rouge1_fmeasure_stderr": 0.002981957765622307, "rouge1_precision": 0.18185566077745052, "rouge1_precision_stderr": 0.0027072441708704876, "rouge1_recall": 0.6878760207615766, "rouge1_recall_stderr": 0.0039399069406699195, "rouge2_fmeasure": 0.13292917476889163, "rouge2_fmeasure_stderr": 0.002014006244700826, "rouge2_precision": 0.08953701779489043, "rouge2_precision_stderr": 0.001710625628350317, "rouge2_recall": 0.36720908036084704, "rouge2_recall_stderr": 0.004308436966018264, "rougeL_fmeasure": 0.20379974606633744, "rougeL_fmeasure_stderr": 0.0022399193895611676, "rougeL_precision": 0.13720570266102783, "rougeL_precision_stderr": 0.0020576199930684665, "rougeL_recall": 0.5490721849829181, "rougeL_recall_stderr": 0.0042054270913741984, "rougeLsum_fmeasure": 0.23315369915847917, "rougeLsum_fmeasure_stderr": 0.002653830698926144, "rougeLsum_precision": 0.15812569287913272, "rougeLsum_precision_stderr": 0.0024138703089301825, "rougeLsum_recall": 0.606968571968092, "rougeLsum_recall_stderr": 0.003889795075167132}, "very-explicit-description": {"bleu": 4.768286137370153, "bleu_stderr": 0.17352437696704526, "rouge1_fmeasure": 0.4340669909812108, "rouge1_fmeasure_stderr": 0.005456429814747237, "rouge1_precision": 0.45806774309488824, "rouge1_precision_stderr": 0.007318355183210045, "rouge1_recall": 0.6241468092977022, "rouge1_recall_stderr": 0.004668663969159961, "rouge2_fmeasure": 0.24713173271908043, "rouge2_fmeasure_stderr": 0.004395624838836395, "rouge2_precision": 0.26701579357766814, "rouge2_precision_stderr": 0.005503172843923408, "rouge2_recall": 0.35329352839698064, "rouge2_recall_stderr": 0.0044599483677181355, "rougeL_fmeasure": 0.3463395333726421, "rougeL_fmeasure_stderr": 0.004817987549566419, "rougeL_precision": 0.36755677377705864, "rougeL_precision_stderr": 0.006350405645219005, "rougeL_recall": 0.5032053457704092, "rougeL_recall_stderr": 0.0045314258456629845, "rougeLsum_fmeasure": 0.3800941783693374, "rougeLsum_fmeasure_stderr": 0.0049021744640361225, "rougeLsum_precision": 0.4016479351386044, "rougeLsum_precision_stderr": 0.006624569651668937, "rougeLsum_recall": 0.5561992816740847, "rougeLsum_recall_stderr": 0.004637830171648178}}, "5": {"PALM_prompt": {"bleu": 0.7283147266727299, "bleu_stderr": 0.03506630990313516, "rouge1_fmeasure": 0.12670702220068714, "rouge1_fmeasure_stderr": 0.0018353031821599214, "rouge1_precision": 0.0800529618846568, "rouge1_precision_stderr": 0.0014030415087213262, "rouge1_recall": 0.4360056147654019, "rouge1_recall_stderr": 0.005237161872126708, "rouge2_fmeasure": 0.05951196634046783, "rouge2_fmeasure_stderr": 0.0011799130523628795, "rouge2_precision": 0.037349148159645094, "rouge2_precision_stderr": 0.000853308549188945, "rouge2_recall": 0.2217598521275178, "rouge2_recall_stderr": 0.003994885901439153, "rougeL_fmeasure": 0.11485528722019342, "rougeL_fmeasure_stderr": 0.001589365738462336, "rougeL_precision": 0.07255538123365188, "rougeL_precision_stderr": 0.0012004887370386942, "rougeL_recall": 0.39357574348180674, "rougeL_recall_stderr": 0.004547413362244599, "rougeLsum_fmeasure": 0.11989099231027077, "rougeLsum_fmeasure_stderr": 0.0017190774588667075, "rougeLsum_precision": 0.07578798351325475, "rougeLsum_precision_stderr": 0.0012955589973183616, "rougeLsum_recall": 0.4104972172441367, "rougeLsum_recall_stderr": 0.0048123434799543785}, "explicit-graph-description2": {"bleu": 11.239061696658661, "bleu_stderr": 0.3211949581138453, "rouge1_fmeasure": 0.5525906757014365, "rouge1_fmeasure_stderr": 0.003963242953623838, "rouge1_precision": 0.6580257286903278, "rouge1_precision_stderr": 0.005154899649610462, "rouge1_recall": 0.5377144440002729, "rouge1_recall_stderr": 0.004751931821359596, "rouge2_fmeasure": 0.3330531711847648, "rouge2_fmeasure_stderr": 0.004179019933874763, "rouge2_precision": 0.4024984504821446, "rouge2_precision_stderr": 0.005105136616228292, "rouge2_recall": 0.3256015347638036, "rouge2_recall_stderr": 0.004546014562570848, "rougeL_fmeasure": 0.4568474950562419, "rougeL_fmeasure_stderr": 0.004002759515298098, "rougeL_precision": 0.5454660436185705, "rougeL_precision_stderr": 0.00507487374817911, "rougeL_recall": 0.4456656435992453, "rougeL_recall_stderr": 0.004577146376256788, "rougeLsum_fmeasure": 0.4862086181416132, "rougeLsum_fmeasure_stderr": 0.00389366720553536, "rougeLsum_precision": 0.5815729850402755, "rougeLsum_precision_stderr": 0.005083872222185427, "rougeLsum_recall": 0.4737226545590055, "rougeLsum_recall_stderr": 0.004553659067486879}, "implicit-graph-description": {"bleu": 1.7150662066576599, "bleu_stderr": 0.04867244279674884, "rouge1_fmeasure": 0.24413807121404715, "rouge1_fmeasure_stderr": 0.004196903213714051, "rouge1_precision": 0.23129723699957813, "rouge1_precision_stderr": 0.0062156402527426885, "rouge1_recall": 0.5532761590144792, "rouge1_recall_stderr": 0.004718580529051603, "rouge2_fmeasure": 0.13440771646832117, "rouge2_fmeasure_stderr": 0.0030479854784096305, "rouge2_precision": 0.13131503132685515, "rouge2_precision_stderr": 0.00424768261815168, "rouge2_recall": 0.31689123692271287, "rouge2_recall_stderr": 0.004479228620477408, "rougeL_fmeasure": 0.20077123718465184, "rougeL_fmeasure_stderr": 0.003610791227071776, "rougeL_precision": 0.19041781904289584, "rougeL_precision_stderr": 0.005317512197816564, "rougeL_recall": 0.468350868728647, "rougeL_recall_stderr": 0.0046258788947290365, "rougeLsum_fmeasure": 0.21698522412439886, "rougeLsum_fmeasure_stderr": 0.003796119059919846, "rougeLsum_precision": 0.20527402650433593, "rougeLsum_precision_stderr": 0.005591111023710644, "rougeLsum_recall": 0.49732487082024746, "rougeLsum_recall_stderr": 0.004602620634284861}, "non-explicit-description": {"bleu": 2.8828869742378753, "bleu_stderr": 0.09032299188645451, "rouge1_fmeasure": 0.26156722580164604, "rouge1_fmeasure_stderr": 0.0029559487433299535, "rouge1_precision": 0.17429298605765095, "rouge1_precision_stderr": 0.002453866849905799, "rouge1_recall": 0.6874234755183823, "rouge1_recall_stderr": 0.003958252758008644, "rouge2_fmeasure": 0.1292376475089326, "rouge2_fmeasure_stderr": 0.0019765919490193793, "rouge2_precision": 0.08498484032096375, "rouge2_precision_stderr": 0.0014987715737771314, "rouge2_recall": 0.36659589385018765, "rouge2_recall_stderr": 0.004439215725950152, "rougeL_fmeasure": 0.1982820935914217, "rougeL_fmeasure_stderr": 0.002156722479988033, "rougeL_precision": 0.130533611669738, "rougeL_precision_stderr": 0.001731325232954926, "rougeL_recall": 0.5475992486934416, "rougeL_recall_stderr": 0.004197239234834233, "rougeLsum_fmeasure": 0.22805092628919627, "rougeLsum_fmeasure_stderr": 0.002601257261272881, "rougeLsum_precision": 0.15152370925757377, "rougeLsum_precision_stderr": 0.0021398507103325367, "rougeLsum_recall": 0.6088523048467857, "rougeLsum_recall_stderr": 0.003930409593530016}, "very-explicit-description": {"bleu": 4.0649225050991795, "bleu_stderr": 0.14182226731160913, "rouge1_fmeasure": 0.4004976431750891, "rouge1_fmeasure_stderr": 0.005533519853929168, "rouge1_precision": 0.4012912849159947, "rouge1_precision_stderr": 0.007332717541607647, "rouge1_recall": 0.64320056686215, "rouge1_recall_stderr": 0.004496400900386177, "rouge2_fmeasure": 0.2256483945335245, "rouge2_fmeasure_stderr": 0.004326347089488998, "rouge2_precision": 0.23173467180213086, "rouge2_precision_stderr": 0.005322008089541369, "rouge2_recall": 0.36045746541298906, "rouge2_recall_stderr": 0.004418126556266405, "rougeL_fmeasure": 0.31629484401516644, "rougeL_fmeasure_stderr": 0.004837305063133512, "rougeL_precision": 0.3181899530902145, "rougeL_precision_stderr": 0.006233483121725614, "rougeL_recall": 0.5124106544670798, "rougeL_recall_stderr": 0.0044121171030032285, "rougeLsum_fmeasure": 0.3503718103768193, "rougeLsum_fmeasure_stderr": 0.00488155342773186, "rougeLsum_precision": 0.34997948174033583, "rougeLsum_precision_stderr": 0.006495798513744922, "rougeLsum_recall": 0.5748853093658587, "rougeLsum_recall_stderr": 0.004482155650671063}}}, "GEM/wiki_lingua_en": {"0": {"article_summary_en": {"bleu": 2.0534810991559866, "bleu_stderr": 0.0666110922750702, "rouge1_fmeasure": 0.2117111727875298, "rouge1_fmeasure_stderr": 0.0018438088393860322, "rouge1_precision": 0.17814038490300868, "rouge1_precision_stderr": 0.0019339544316495465, "rouge1_recall": 0.3141329562889899, "rouge1_recall_stderr": 0.0026819443880949302, "rouge2_fmeasure": 0.04599914076874335, "rouge2_fmeasure_stderr": 0.0009026810349493257, "rouge2_precision": 0.03845870479092204, "rouge2_precision_stderr": 0.0007912707782195452, "rouge2_recall": 0.07088130315681604, "rouge2_recall_stderr": 0.0015849066154849387, "rougeL_fmeasure": 0.14701777739136027, "rougeL_fmeasure_stderr": 0.001160350487915751, "rougeL_precision": 0.12177527865393714, "rougeL_precision_stderr": 0.0011744038036189456, "rougeL_recall": 0.22582257646250492, "rougeL_recall_stderr": 0.0020828750946991857, "rougeLsum_fmeasure": 0.1960912017363633, "rougeLsum_fmeasure_stderr": 0.001706624400289841, "rougeLsum_precision": 0.16477812405030756, "rougeLsum_precision_stderr": 0.0017822475337332542, "rougeLsum_recall": 0.29175188388878937, "rougeLsum_recall_stderr": 0.0025122798019773414}, "rephrase_en": {"bleu": 0.7056427126817756, "bleu_stderr": 0.036208763595939567, "rouge1_fmeasure": 0.10170268353078239, "rouge1_fmeasure_stderr": 0.001552854434242085, "rouge1_precision": 0.08751592392277548, "rouge1_precision_stderr": 0.0014885876589069494, "rouge1_recall": 0.14641105372315416, "rouge1_recall_stderr": 0.002277503755216182, "rouge2_fmeasure": 0.014876069868397498, "rouge2_fmeasure_stderr": 0.0005216997847218128, "rouge2_precision": 0.012652600762562841, "rouge2_precision_stderr": 0.0004575918826167268, "rouge2_recall": 0.022474938777637253, "rouge2_recall_stderr": 0.0009218950292896009, "rougeL_fmeasure": 0.08720494916711495, "rougeL_fmeasure_stderr": 0.0012395877618178168, "rougeL_precision": 0.0743006100872787, "rougeL_precision_stderr": 0.0011597328400413602, "rougeL_recall": 0.1279023695193692, "rougeL_recall_stderr": 0.001964736141764416, "rougeLsum_fmeasure": 0.09436348999877432, "rougeLsum_fmeasure_stderr": 0.0014294898556339143, "rougeLsum_precision": 0.0809683487363323, "rougeLsum_precision_stderr": 0.0013609972489967807, "rougeLsum_recall": 0.13640036278318976, "rougeLsum_recall_stderr": 0.0021233241725256875}, "summarize_above_en": {"bleu": 0.5934908400066984, "bleu_stderr": 0.04577158251574039, "rouge1_fmeasure": 0.13902820692376117, "rouge1_fmeasure_stderr": 0.0015977409115201353, "rouge1_precision": 0.12685534513451716, "rouge1_precision_stderr": 0.0018887365773073797, "rouge1_recall": 0.19257637263679828, "rouge1_recall_stderr": 0.0021883455691780597, "rouge2_fmeasure": 0.01758898085911187, "rouge2_fmeasure_stderr": 0.0005845142555911145, "rouge2_precision": 0.016462988916567202, "rouge2_precision_stderr": 0.0006707808949734338, "rouge2_recall": 0.02514451000634953, "rouge2_recall_stderr": 0.0009497451781410877, "rougeL_fmeasure": 0.11668382606343859, "rougeL_fmeasure_stderr": 0.0012164143612498303, "rougeL_precision": 0.10541733933259849, "rougeL_precision_stderr": 0.001488446061768874, "rougeL_recall": 0.16496936421708008, "rougeL_recall_stderr": 0.0018405198920376908, "rougeLsum_fmeasure": 0.1283175804360348, "rougeLsum_fmeasure_stderr": 0.0014569156963222744, "rougeLsum_precision": 0.11691349355480424, "rougeLsum_precision_stderr": 0.0017447856277329906, "rougeLsum_recall": 0.1787124652688377, "rougeLsum_recall_stderr": 0.002037393870462275}, "tldr_en": {"bleu": 1.6192080325529026, "bleu_stderr": 0.04050520870343064, "rouge1_fmeasure": 0.17809655126532312, "rouge1_fmeasure_stderr": 0.0018236388559653763, "rouge1_precision": 0.15222794945796614, "rouge1_precision_stderr": 0.001854391491208858, "rouge1_recall": 0.259279440153063, "rouge1_recall_stderr": 0.0027052349535794085, "rouge2_fmeasure": 0.03601951697280678, "rouge2_fmeasure_stderr": 0.0008300101222283796, "rouge2_precision": 0.030497998250497895, "rouge2_precision_stderr": 0.0007413435249853986, "rouge2_recall": 0.05425261740808977, "rouge2_recall_stderr": 0.001399269212332445, "rougeL_fmeasure": 0.1384741786574652, "rougeL_fmeasure_stderr": 0.0012894950527783321, "rougeL_precision": 0.11697977840134756, "rougeL_precision_stderr": 0.0012794261124303204, "rougeL_recall": 0.2062333696623697, "rougeL_recall_stderr": 0.0021597198817625285, "rougeLsum_fmeasure": 0.1628530750095549, "rougeLsum_fmeasure_stderr": 0.0016530415994811475, "rougeLsum_precision": 0.1389877568936686, "rougeLsum_precision_stderr": 0.0016763555392295747, "rougeLsum_recall": 0.23808538118300834, "rougeLsum_recall_stderr": 0.0025032807703488534}, "write_abstract_en": {"bleu": 0.7950296589720617, "bleu_stderr": 0.02515184874378254, "rouge1_fmeasure": 0.11796833391406458, "rouge1_fmeasure_stderr": 0.001679540196270818, "rouge1_precision": 0.10260040509975965, "rouge1_precision_stderr": 0.0016816885739169474, "rouge1_recall": 0.17029243506464356, "rouge1_recall_stderr": 0.0024565476844548746, "rouge2_fmeasure": 0.015984403876231276, "rouge2_fmeasure_stderr": 0.0006030736806193037, "rouge2_precision": 0.013771295330145835, "rouge2_precision_stderr": 0.0005405889426764763, "rouge2_recall": 0.024597820085107556, "rouge2_recall_stderr": 0.0010575366347068554, "rougeL_fmeasure": 0.09851081780240795, "rougeL_fmeasure_stderr": 0.0012485527286575612, "rougeL_precision": 0.08487885507917314, "rougeL_precision_stderr": 0.0012383575679524605, "rougeL_recall": 0.14493899736320978, "rougeL_recall_stderr": 0.0020037770730611786, "rougeLsum_fmeasure": 0.10980074384140354, "rougeLsum_fmeasure_stderr": 0.0015599379035532135, "rougeLsum_precision": 0.09525971665769668, "rougeLsum_precision_stderr": 0.0015577333350387276, "rougeLsum_recall": 0.15944167499687736, "rougeLsum_recall_stderr": 0.002320656358626716}}, "1": {"article_summary_en": {"bleu": 1.8865327529226577, "bleu_stderr": 0.04249976221266161, "rouge1_fmeasure": 0.19313302430200963, "rouge1_fmeasure_stderr": 0.0019316051118957926, "rouge1_precision": 0.164850593284305, "rouge1_precision_stderr": 0.002029018039576436, "rouge1_recall": 0.28405561359995446, "rouge1_recall_stderr": 0.002857993695751241, "rouge2_fmeasure": 0.040864899057913275, "rouge2_fmeasure_stderr": 0.0008899257763011835, "rouge2_precision": 0.03503377865570394, "rouge2_precision_stderr": 0.0008530621853281363, "rouge2_recall": 0.0623540484617142, "rouge2_recall_stderr": 0.001548287542059019, "rougeL_fmeasure": 0.14097329346444698, "rougeL_fmeasure_stderr": 0.0013041417812161638, "rougeL_precision": 0.11907730193169616, "rougeL_precision_stderr": 0.0013816606669132387, "rougeL_recall": 0.2131003193203166, "rougeL_recall_stderr": 0.002234666822976161, "rougeLsum_fmeasure": 0.1789179111545324, "rougeLsum_fmeasure_stderr": 0.001788238357802055, "rougeLsum_precision": 0.15259882161084798, "rougeLsum_precision_stderr": 0.001885756918977423, "rougeLsum_recall": 0.26389227182331143, "rougeLsum_recall_stderr": 0.0026742133450759734}, "rephrase_en": {"bleu": 1.1951746948073685, "bleu_stderr": 0.05498403783354158, "rouge1_fmeasure": 0.13464216389707748, "rouge1_fmeasure_stderr": 0.0018008035510529402, "rouge1_precision": 0.11704708597499369, "rouge1_precision_stderr": 0.0017208124581957302, "rouge1_recall": 0.192663652857006, "rouge1_recall_stderr": 0.0027090117742870658, "rouge2_fmeasure": 0.02017226312757468, "rouge2_fmeasure_stderr": 0.000714000343756126, "rouge2_precision": 0.01695970372606848, "rouge2_precision_stderr": 0.0006027560326870975, "rouge2_recall": 0.030780517480147162, "rouge2_recall_stderr": 0.0012354005199119644, "rougeL_fmeasure": 0.1019122729108625, "rougeL_fmeasure_stderr": 0.0012525770397120181, "rougeL_precision": 0.08787724537079063, "rougeL_precision_stderr": 0.0011884855038448482, "rougeL_recall": 0.14920400372412476, "rougeL_recall_stderr": 0.0020664312372579213, "rougeLsum_fmeasure": 0.12602710319462657, "rougeLsum_fmeasure_stderr": 0.0016626827697359793, "rougeLsum_precision": 0.10950085735058879, "rougeLsum_precision_stderr": 0.0015902682243807002, "rougeLsum_recall": 0.1807018882877703, "rougeLsum_recall_stderr": 0.002516295905690825}, "summarize_above_en": {"bleu": 1.3183556866717046, "bleu_stderr": 0.047701497933856064, "rouge1_fmeasure": 0.1575281230169748, "rouge1_fmeasure_stderr": 0.0017363328564521583, "rouge1_precision": 0.13945230313441068, "rouge1_precision_stderr": 0.0018821753367623763, "rouge1_recall": 0.22481790415194117, "rouge1_recall_stderr": 0.002565910341383482, "rouge2_fmeasure": 0.025193685294665726, "rouge2_fmeasure_stderr": 0.0007407727831149609, "rouge2_precision": 0.023062949689263944, "rouge2_precision_stderr": 0.0008856830610119579, "rouge2_recall": 0.03747209579391718, "rouge2_recall_stderr": 0.0012382175456694833, "rougeL_fmeasure": 0.11740619203596399, "rougeL_fmeasure_stderr": 0.001212497657479112, "rougeL_precision": 0.10342195238697749, "rougeL_precision_stderr": 0.0013835117875348828, "rougeL_recall": 0.17152534934129882, "rougeL_recall_stderr": 0.0019955254451948295, "rougeLsum_fmeasure": 0.14750316226109086, "rougeLsum_fmeasure_stderr": 0.0016133291787871627, "rougeLsum_precision": 0.1306018801081659, "rougeLsum_precision_stderr": 0.0017653906193587606, "rougeLsum_recall": 0.21110368454548847, "rougeLsum_recall_stderr": 0.00241423606851821}, "tldr_en": {"bleu": 2.545643626821724, "bleu_stderr": 0.048869510836124584, "rouge1_fmeasure": 0.20484942755932814, "rouge1_fmeasure_stderr": 0.001960405666708866, "rouge1_precision": 0.17909391212556103, "rouge1_precision_stderr": 0.0021684904700656395, "rouge1_recall": 0.2986428547454232, "rouge1_recall_stderr": 0.002901154619840759, "rouge2_fmeasure": 0.04757609861819433, "rouge2_fmeasure_stderr": 0.0009961877982725383, "rouge2_precision": 0.04098672752953724, "rouge2_precision_stderr": 0.0009402949935154375, "rouge2_recall": 0.07250823843480829, "rouge2_recall_stderr": 0.00171407343966866, "rougeL_fmeasure": 0.14525233898326748, "rougeL_fmeasure_stderr": 0.001303267897245817, "rougeL_precision": 0.12586154047693615, "rougeL_precision_stderr": 0.001447455908294373, "rougeL_recall": 0.21783174447787615, "rougeL_recall_stderr": 0.002284390945146576, "rougeLsum_fmeasure": 0.19265194446101522, "rougeLsum_fmeasure_stderr": 0.0018360173432108782, "rougeLsum_precision": 0.16826083836269604, "rougeLsum_precision_stderr": 0.0020324268022024173, "rougeLsum_recall": 0.28157870760475356, "rougeLsum_recall_stderr": 0.002752878166728396}, "write_abstract_en": {"bleu": 1.1020316820663745, "bleu_stderr": 0.05702217937857135, "rouge1_fmeasure": 0.1501712303062816, "rouge1_fmeasure_stderr": 0.0017196466104618196, "rouge1_precision": 0.13088003322560143, "rouge1_precision_stderr": 0.0017327225800732847, "rouge1_recall": 0.21418661344518894, "rouge1_recall_stderr": 0.002499792501862042, "rouge2_fmeasure": 0.020818331143036432, "rouge2_fmeasure_stderr": 0.0007112541235966938, "rouge2_precision": 0.017915022495063486, "rouge2_precision_stderr": 0.0006375485225038393, "rouge2_recall": 0.03152784532661522, "rouge2_recall_stderr": 0.001233712804278278, "rougeL_fmeasure": 0.10744769510485963, "rougeL_fmeasure_stderr": 0.0011180856260891987, "rougeL_precision": 0.0927062754049998, "rougeL_precision_stderr": 0.0011190335418292052, "rougeL_recall": 0.15760113826039301, "rougeL_recall_stderr": 0.0018824698988046046, "rougeLsum_fmeasure": 0.14140181753666214, "rougeLsum_fmeasure_stderr": 0.0015942864602239222, "rougeLsum_precision": 0.1231860191540673, "rougeLsum_precision_stderr": 0.0016113685375994105, "rougeLsum_recall": 0.20193059778802494, "rougeLsum_recall_stderr": 0.0023296824464227}}, "2": {"article_summary_en": {"bleu": 2.250874940153623, "bleu_stderr": 0.08256417332234947, "rouge1_fmeasure": 0.20240023881581198, "rouge1_fmeasure_stderr": 0.0019530480366369383, "rouge1_precision": 0.17401161171600613, "rouge1_precision_stderr": 0.0020881821450366707, "rouge1_recall": 0.29609796476117983, "rouge1_recall_stderr": 0.0029221116925695636, "rouge2_fmeasure": 0.04643912783550571, "rouge2_fmeasure_stderr": 0.0009627363006805538, "rouge2_precision": 0.039624014311144054, "rouge2_precision_stderr": 0.0008900870750819028, "rouge2_recall": 0.07078627112644985, "rouge2_recall_stderr": 0.0016722377383979908, "rougeL_fmeasure": 0.14993944604674414, "rougeL_fmeasure_stderr": 0.0013384024265687326, "rougeL_precision": 0.12735865745665292, "rougeL_precision_stderr": 0.0014154169873442247, "rougeL_recall": 0.225450514063261, "rougeL_recall_stderr": 0.0023420645348967563, "rougeLsum_fmeasure": 0.18742123863337812, "rougeLsum_fmeasure_stderr": 0.0018142650005546468, "rougeLsum_precision": 0.16080285795577998, "rougeLsum_precision_stderr": 0.0019296398265542425, "rougeLsum_recall": 0.2753859473796806, "rougeLsum_recall_stderr": 0.0027696876252129017}, "rephrase_en": {"bleu": 2.3341861618808797, "bleu_stderr": 0.07849200046628084, "rouge1_fmeasure": 0.17364781357215728, "rouge1_fmeasure_stderr": 0.0020669795912559206, "rouge1_precision": 0.15158367096384875, "rouge1_precision_stderr": 0.002160293878134524, "rouge1_recall": 0.24779508708098152, "rouge1_recall_stderr": 0.0029503505282398473, "rouge2_fmeasure": 0.040268213401871214, "rouge2_fmeasure_stderr": 0.0009739966672053971, "rouge2_precision": 0.03517592630819259, "rouge2_precision_stderr": 0.0009362444811665309, "rouge2_recall": 0.05870812861597759, "rouge2_recall_stderr": 0.0015348108636689321, "rougeL_fmeasure": 0.13599583814052624, "rougeL_fmeasure_stderr": 0.001496120401269599, "rougeL_precision": 0.11724613742268182, "rougeL_precision_stderr": 0.0015394113402297696, "rougeL_recall": 0.1987576493460671, "rougeL_recall_stderr": 0.002371525557855466, "rougeLsum_fmeasure": 0.16058302443864383, "rougeLsum_fmeasure_stderr": 0.0019230855651408392, "rougeLsum_precision": 0.14002594651258354, "rougeLsum_precision_stderr": 0.002012494612958899, "rougeLsum_recall": 0.22991508782822548, "rougeLsum_recall_stderr": 0.0027786907548584838}, "summarize_above_en": {"bleu": 2.251008452787017, "bleu_stderr": 0.07741675426485511, "rouge1_fmeasure": 0.1865965522646182, "rouge1_fmeasure_stderr": 0.0019292479914873961, "rouge1_precision": 0.17257284656874033, "rouge1_precision_stderr": 0.0023886955187102186, "rouge1_recall": 0.2624694392412262, "rouge1_recall_stderr": 0.0027556283158437244, "rouge2_fmeasure": 0.040241799290576724, "rouge2_fmeasure_stderr": 0.0009520200071847311, "rouge2_precision": 0.0386635351793387, "rouge2_precision_stderr": 0.0011380738975515533, "rouge2_recall": 0.058135448256113796, "rouge2_recall_stderr": 0.0015225329835211754, "rougeL_fmeasure": 0.14758577085564545, "rougeL_fmeasure_stderr": 0.0013996716496550154, "rougeL_precision": 0.13532659869410305, "rougeL_precision_stderr": 0.0017966336725582315, "rougeL_recall": 0.2128994118965257, "rougeL_recall_stderr": 0.00228475982913276, "rougeLsum_fmeasure": 0.17307145674730884, "rougeLsum_fmeasure_stderr": 0.0017845742440570694, "rougeLsum_precision": 0.1602504598601161, "rougeLsum_precision_stderr": 0.002234879306428014, "rougeLsum_recall": 0.24407970791374364, "rougeLsum_recall_stderr": 0.0025830465673148976}, "tldr_en": {"bleu": 3.123719853905042, "bleu_stderr": 0.09435758546693512, "rouge1_fmeasure": 0.21836409386191818, "rouge1_fmeasure_stderr": 0.001989518194370473, "rouge1_precision": 0.2332474242904849, "rouge1_precision_stderr": 0.0031768172038711467, "rouge1_recall": 0.29410268890498986, "rouge1_recall_stderr": 0.002958828414438297, "rouge2_fmeasure": 0.05650249608530642, "rouge2_fmeasure_stderr": 0.0011487340298214372, "rouge2_precision": 0.06566711930000235, "rouge2_precision_stderr": 0.0019413943466877388, "rouge2_recall": 0.07670871046547496, "rouge2_recall_stderr": 0.0016787739182338268, "rougeL_fmeasure": 0.1603606067900483, "rougeL_fmeasure_stderr": 0.0014157142739287894, "rougeL_precision": 0.1752085713341631, "rougeL_precision_stderr": 0.0026584779502129303, "rougeL_recall": 0.21819240455646285, "rougeL_recall_stderr": 0.002276168106614223, "rougeLsum_fmeasure": 0.20617794726883687, "rougeLsum_fmeasure_stderr": 0.0018867289872942854, "rougeLsum_precision": 0.22112816157170134, "rougeLsum_precision_stderr": 0.0030769042746772593, "rougeLsum_recall": 0.2776979400072693, "rougeLsum_recall_stderr": 0.002813335835445409}, "write_abstract_en": {"bleu": 1.2530485760326848, "bleu_stderr": 0.034281822221775095, "rouge1_fmeasure": 0.1444547659918946, "rouge1_fmeasure_stderr": 0.0018306868593415573, "rouge1_precision": 0.12906028226799743, "rouge1_precision_stderr": 0.0019154820247282277, "rouge1_recall": 0.20575231428257332, "rouge1_recall_stderr": 0.0026866298708369675, "rouge2_fmeasure": 0.0219486272286028, "rouge2_fmeasure_stderr": 0.000730523081944293, "rouge2_precision": 0.019627327385611428, "rouge2_precision_stderr": 0.0007723877693570129, "rouge2_recall": 0.0319483746867405, "rouge2_recall_stderr": 0.0011670173723061863, "rougeL_fmeasure": 0.10633076283099206, "rougeL_fmeasure_stderr": 0.0012465424058332557, "rougeL_precision": 0.09457002128612854, "rougeL_precision_stderr": 0.0013340023606244708, "rougeL_recall": 0.1551264273734134, "rougeL_recall_stderr": 0.0020478473866823725, "rougeLsum_fmeasure": 0.13526431674777373, "rougeLsum_fmeasure_stderr": 0.0016980499334396653, "rougeLsum_precision": 0.12068760136915536, "rougeLsum_precision_stderr": 0.0017801778755423962, "rougeLsum_recall": 0.19339826374990432, "rougeLsum_recall_stderr": 0.0025250499570330727}}, "3": {"article_summary_en": {"bleu": 2.419872373280389, "bleu_stderr": 0.0632053559231718, "rouge1_fmeasure": 0.1732009883115761, "rouge1_fmeasure_stderr": 0.0022644018750802943, "rouge1_precision": 0.15417391268004804, "rouge1_precision_stderr": 0.0023622060928554494, "rouge1_recall": 0.25177591335426297, "rouge1_recall_stderr": 0.003390550226163827, "rouge2_fmeasure": 0.0414745893981831, "rouge2_fmeasure_stderr": 0.0009648670616110777, "rouge2_precision": 0.03643560093458038, "rouge2_precision_stderr": 0.0009248695850334793, "rouge2_recall": 0.06270148510981541, "rouge2_recall_stderr": 0.0016134073336328298, "rougeL_fmeasure": 0.13035603600788886, "rougeL_fmeasure_stderr": 0.0016318784961578442, "rougeL_precision": 0.11525851431820534, "rougeL_precision_stderr": 0.0017153378060412703, "rougeL_recall": 0.1945255102820067, "rougeL_recall_stderr": 0.0027243009700905294, "rougeLsum_fmeasure": 0.16116163645425682, "rougeLsum_fmeasure_stderr": 0.0021129773225797956, "rougeLsum_precision": 0.1433709185752066, "rougeLsum_precision_stderr": 0.0021997928215724285, "rougeLsum_recall": 0.2347702913915573, "rougeLsum_recall_stderr": 0.0031942105153544165}, "rephrase_en": {"bleu": 2.5938240082605772, "bleu_stderr": 0.1039770016299349, "rouge1_fmeasure": 0.15258466894138525, "rouge1_fmeasure_stderr": 0.002231238932667558, "rouge1_precision": 0.14222703095938236, "rouge1_precision_stderr": 0.002476470176258003, "rouge1_recall": 0.21389332067876604, "rouge1_recall_stderr": 0.003224457836713378, "rouge2_fmeasure": 0.03686577574547315, "rouge2_fmeasure_stderr": 0.000986211178727786, "rouge2_precision": 0.03453726675386056, "rouge2_precision_stderr": 0.001111852850451432, "rouge2_recall": 0.053000124965507024, "rouge2_recall_stderr": 0.0015287742053960468, "rougeL_fmeasure": 0.12093827771832451, "rougeL_fmeasure_stderr": 0.0016818007172860084, "rougeL_precision": 0.11196872227962629, "rougeL_precision_stderr": 0.0018972303986128753, "rougeL_recall": 0.17347582553687865, "rougeL_recall_stderr": 0.0026374790749814484, "rougeLsum_fmeasure": 0.14180007256699717, "rougeLsum_fmeasure_stderr": 0.0020827290155971973, "rougeLsum_precision": 0.13231796413662955, "rougeLsum_precision_stderr": 0.002328686040271986, "rougeLsum_recall": 0.1992898897205879, "rougeLsum_recall_stderr": 0.003033843371726955}, "summarize_above_en": {"bleu": 2.699909605867917, "bleu_stderr": 0.10552509596678104, "rouge1_fmeasure": 0.16004715198678016, "rouge1_fmeasure_stderr": 0.0022371069572285113, "rouge1_precision": 0.15910068424231602, "rouge1_precision_stderr": 0.0028942512572503855, "rouge1_recall": 0.22204167759688737, "rouge1_recall_stderr": 0.0031930213084605565, "rouge2_fmeasure": 0.03722078217429424, "rouge2_fmeasure_stderr": 0.0009797813487457106, "rouge2_precision": 0.03901258316156372, "rouge2_precision_stderr": 0.001398659027827721, "rouge2_recall": 0.05258690577442277, "rouge2_recall_stderr": 0.0015262833449626498, "rougeL_fmeasure": 0.1282770493927655, "rougeL_fmeasure_stderr": 0.0017127973815793287, "rougeL_precision": 0.12723416561359413, "rougeL_precision_stderr": 0.0023377769258987246, "rougeL_recall": 0.18222910972085204, "rougeL_recall_stderr": 0.0026727913824271736, "rougeLsum_fmeasure": 0.14829642927866662, "rougeLsum_fmeasure_stderr": 0.0020736961426557118, "rougeLsum_precision": 0.14797898460424116, "rougeLsum_precision_stderr": 0.0027432966222511084, "rougeLsum_recall": 0.20640960313414, "rougeLsum_recall_stderr": 0.0030075792341701397}, "tldr_en": {"bleu": 3.5006949715383118, "bleu_stderr": 0.08071997944617754, "rouge1_fmeasure": 0.1819863863181494, "rouge1_fmeasure_stderr": 0.0024282963824572346, "rouge1_precision": 0.22344987630747953, "rouge1_precision_stderr": 0.0038946320752308397, "rouge1_recall": 0.228687908011972, "rouge1_recall_stderr": 0.0033634865916312945, "rouge2_fmeasure": 0.05005886014366939, "rouge2_fmeasure_stderr": 0.0012507484027525577, "rouge2_precision": 0.06821362116644611, "rouge2_precision_stderr": 0.0023091245896002433, "rouge2_recall": 0.06243742554002245, "rouge2_recall_stderr": 0.001665626169617889, "rougeL_fmeasure": 0.13680266609806765, "rougeL_fmeasure_stderr": 0.001821525871477116, "rougeL_precision": 0.17380086539073145, "rougeL_precision_stderr": 0.003294779021368952, "rougeL_recall": 0.1728818390227638, "rougeL_recall_stderr": 0.0026287282834976822, "rougeLsum_fmeasure": 0.17195845477450572, "rougeLsum_fmeasure_stderr": 0.0023005495305984053, "rougeLsum_precision": 0.21200019749157653, "rougeLsum_precision_stderr": 0.003735810714751958, "rougeLsum_recall": 0.21587781354640423, "rougeLsum_recall_stderr": 0.003186682013692883}, "write_abstract_en": {"bleu": 1.2880401565259663, "bleu_stderr": 0.02892700209020493, "rouge1_fmeasure": 0.11206218209748141, "rouge1_fmeasure_stderr": 0.0019274598908821678, "rouge1_precision": 0.10422127953279672, "rouge1_precision_stderr": 0.0020439479764111446, "rouge1_recall": 0.15837206489083266, "rouge1_recall_stderr": 0.0028160022730325317, "rouge2_fmeasure": 0.017248853072901214, "rouge2_fmeasure_stderr": 0.0006648333827724484, "rouge2_precision": 0.01585990327947335, "rouge2_precision_stderr": 0.00074827880117735, "rouge2_recall": 0.025799942301719232, "rouge2_recall_stderr": 0.0011077245741293905, "rougeL_fmeasure": 0.08351024617373054, "rougeL_fmeasure_stderr": 0.0013594205425130031, "rougeL_precision": 0.07759975361518856, "rougeL_precision_stderr": 0.001488357218316451, "rougeL_recall": 0.12105717330360508, "rougeL_recall_stderr": 0.002169577022250882, "rougeLsum_fmeasure": 0.10540334302928316, "rougeLsum_fmeasure_stderr": 0.0017989673027563617, "rougeLsum_precision": 0.09810666829139612, "rougeLsum_precision_stderr": 0.0019260722064131435, "rougeLsum_recall": 0.14946639296314937, "rougeLsum_recall_stderr": 0.002649765078910915}}, "4": {"article_summary_en": {"bleu": 0.5407149916214953, "bleu_stderr": 0.04531884877289389, "rouge1_fmeasure": 0.05560797193518763, "rouge1_fmeasure_stderr": 0.0019328666401600453, "rouge1_precision": 0.05190785653580835, "rouge1_precision_stderr": 0.0020577014263843353, "rouge1_recall": 0.08336762757152712, "rouge1_recall_stderr": 0.0029759640480461146, "rouge2_fmeasure": 0.013892451555974773, "rouge2_fmeasure_stderr": 0.0006715642301954771, "rouge2_precision": 0.01264978957372063, "rouge2_precision_stderr": 0.0006811352498881881, "rouge2_recall": 0.022501646103038838, "rouge2_recall_stderr": 0.0012662878267261353, "rougeL_fmeasure": 0.04291751591005431, "rougeL_fmeasure_stderr": 0.0014550040577974023, "rougeL_precision": 0.039660746703855165, "rougeL_precision_stderr": 0.0015372422364523625, "rougeL_recall": 0.06621442775842074, "rougeL_recall_stderr": 0.0023962464441846846, "rougeLsum_fmeasure": 0.05153615578362743, "rougeLsum_fmeasure_stderr": 0.001791767114164613, "rougeLsum_precision": 0.04808436429768246, "rougeLsum_precision_stderr": 0.0019131925346893503, "rougeLsum_recall": 0.07749318488885995, "rougeLsum_recall_stderr": 0.002775519992790014}, "rephrase_en": {"bleu": 0.4670924044746616, "bleu_stderr": 0.04946249100806658, "rouge1_fmeasure": 0.04871798574935136, "rouge1_fmeasure_stderr": 0.0017935599450487872, "rouge1_precision": 0.04866014150454847, "rouge1_precision_stderr": 0.0020483076366846933, "rouge1_recall": 0.07060422651814766, "rouge1_recall_stderr": 0.002660644139013408, "rouge2_fmeasure": 0.012358132206090394, "rouge2_fmeasure_stderr": 0.0006693685268812792, "rouge2_precision": 0.012614536951186476, "rouge2_precision_stderr": 0.0009004920229567338, "rouge2_recall": 0.01860558990495083, "rouge2_recall_stderr": 0.0010812577940661158, "rougeL_fmeasure": 0.039216854579977777, "rougeL_fmeasure_stderr": 0.001419722439788474, "rougeL_precision": 0.039145972316534536, "rougeL_precision_stderr": 0.001664987060904554, "rougeL_recall": 0.058137528796694114, "rougeL_recall_stderr": 0.002223198690894717, "rougeLsum_fmeasure": 0.045169216003218556, "rougeLsum_fmeasure_stderr": 0.0016700917030067347, "rougeLsum_precision": 0.04530693688620845, "rougeLsum_precision_stderr": 0.001936598783228226, "rougeLsum_recall": 0.06555591615366842, "rougeLsum_recall_stderr": 0.0024912463291508627}, "summarize_above_en": {"bleu": 0.3436540247746996, "bleu_stderr": 0.02229209860808525, "rouge1_fmeasure": 0.048912234414672136, "rouge1_fmeasure_stderr": 0.0018303249700878453, "rouge1_precision": 0.04968775822887688, "rouge1_precision_stderr": 0.0021399482740943475, "rouge1_recall": 0.07040614521918066, "rouge1_recall_stderr": 0.0026795450412985373, "rouge2_fmeasure": 0.011579301131776956, "rouge2_fmeasure_stderr": 0.0006430202421005574, "rouge2_precision": 0.012311807543146269, "rouge2_precision_stderr": 0.0008824124738308823, "rouge2_recall": 0.017322028166765847, "rouge2_recall_stderr": 0.0010491107468874295, "rougeL_fmeasure": 0.039162591548067456, "rougeL_fmeasure_stderr": 0.0014375174353609152, "rougeL_precision": 0.039942899566434244, "rougeL_precision_stderr": 0.001742840540946017, "rougeL_recall": 0.057791070453037226, "rougeL_recall_stderr": 0.0022301074454849456, "rougeLsum_fmeasure": 0.045047671152385296, "rougeLsum_fmeasure_stderr": 0.0016882686974440538, "rougeLsum_precision": 0.04604352091174109, "rougeLsum_precision_stderr": 0.0020115234889020277, "rougeLsum_recall": 0.06491403804508811, "rougeLsum_recall_stderr": 0.0024844589212858962}, "tldr_en": {"bleu": 0.4203638205606742, "bleu_stderr": 0.04062156232814086, "rouge1_fmeasure": 0.05800966504497334, "rouge1_fmeasure_stderr": 0.002055695120200674, "rouge1_precision": 0.06907034391802865, "rouge1_precision_stderr": 0.002842925157819051, "rouge1_recall": 0.07641790711530233, "rouge1_recall_stderr": 0.0028071608967584296, "rouge2_fmeasure": 0.015594437236270214, "rouge2_fmeasure_stderr": 0.0008033551932234715, "rouge2_precision": 0.020323118606363738, "rouge2_precision_stderr": 0.0013765538659170954, "rouge2_recall": 0.020689214217856734, "rouge2_recall_stderr": 0.0011098261782535187, "rougeL_fmeasure": 0.04363217232305618, "rougeL_fmeasure_stderr": 0.0015475509311173715, "rougeL_precision": 0.05326819078634553, "rougeL_precision_stderr": 0.002300594360643888, "rougeL_recall": 0.05823666899257084, "rougeL_recall_stderr": 0.0021806250615139192, "rougeLsum_fmeasure": 0.05441077080277394, "rougeLsum_fmeasure_stderr": 0.0019346325978603714, "rougeLsum_precision": 0.06512227133445396, "rougeLsum_precision_stderr": 0.002711627122094951, "rougeLsum_recall": 0.07184593801381658, "rougeLsum_recall_stderr": 0.0026558284841563095}, "write_abstract_en": {"bleu": 0.05493255544599618, "bleu_stderr": 0.004843136897563013, "rouge1_fmeasure": 0.02664836221163967, "rouge1_fmeasure_stderr": 0.0012443957611582865, "rouge1_precision": 0.025833527508072014, "rouge1_precision_stderr": 0.0013553627152093723, "rouge1_recall": 0.03917838397118692, "rouge1_recall_stderr": 0.0018502036256180985, "rouge2_fmeasure": 0.0040223276062826534, "rouge2_fmeasure_stderr": 0.0003523486305081411, "rouge2_precision": 0.003887727371925059, "rouge2_precision_stderr": 0.0004542038297200875, "rouge2_recall": 0.006181327739501699, "rouge2_recall_stderr": 0.0005609017845600248, "rougeL_fmeasure": 0.020358325637102978, "rougeL_fmeasure_stderr": 0.0009200768723482526, "rougeL_precision": 0.01966558062594936, "rougeL_precision_stderr": 0.001016340019434063, "rougeL_recall": 0.030934646851062786, "rougeL_recall_stderr": 0.0014674198673725897, "rougeLsum_fmeasure": 0.024854830906791463, "rougeLsum_fmeasure_stderr": 0.0011586779799544313, "rougeLsum_precision": 0.024201111892969286, "rougeLsum_precision_stderr": 0.0012739866254506131, "rougeLsum_recall": 0.03658490742322415, "rougeLsum_recall_stderr": 0.0017229024892846027}}, "5": {"article_summary_en": {"bleu": 3.875581456441756e-07, "bleu_stderr": 1.138882234754457e-06, "rouge1_fmeasure": 0.00858766610820728, "rouge1_fmeasure_stderr": 0.0008521488341725035, "rouge1_precision": 0.008747426074934984, "rouge1_precision_stderr": 0.0010334072389606014, "rouge1_recall": 0.013166466939228978, "rouge1_recall_stderr": 0.0013165673788761627, "rouge2_fmeasure": 0.002181790528509318, "rouge2_fmeasure_stderr": 0.00029155550977350524, "rouge2_precision": 0.001993543076039664, "rouge2_precision_stderr": 0.00032468030555408186, "rouge2_recall": 0.0037795884860438948, "rouge2_recall_stderr": 0.0005979090260362486, "rougeL_fmeasure": 0.006564407691410186, "rougeL_fmeasure_stderr": 0.0006360402903853286, "rougeL_precision": 0.006817828377343832, "rougeL_precision_stderr": 0.0008550758600300307, "rougeL_recall": 0.010456992886461796, "rougeL_recall_stderr": 0.0010697198590483594, "rougeLsum_fmeasure": 0.00811170366067559, "rougeLsum_fmeasure_stderr": 0.000800158042567113, "rougeLsum_precision": 0.008318710239963927, "rougeLsum_precision_stderr": 0.0009932928437688979, "rougeLsum_recall": 0.0125244294943219, "rougeLsum_recall_stderr": 0.0012566507407968427}, "rephrase_en": {"bleu": 1.7431796693295205e-08, "bleu_stderr": 5.3491931466841714e-08, "rouge1_fmeasure": 0.007375281725767917, "rouge1_fmeasure_stderr": 0.0007598214513616661, "rouge1_precision": 0.008041599675695846, "rouge1_precision_stderr": 0.0009393646375217224, "rouge1_recall": 0.010590450930500163, "rouge1_recall_stderr": 0.0011298193688755148, "rouge2_fmeasure": 0.0017789109729195612, "rouge2_fmeasure_stderr": 0.00023905673552756363, "rouge2_precision": 0.002035037570270077, "rouge2_precision_stderr": 0.0003375852776151478, "rouge2_recall": 0.0025466017472514603, "rouge2_recall_stderr": 0.0003590177860459669, "rougeL_fmeasure": 0.005916436260589178, "rougeL_fmeasure_stderr": 0.0005951477346238279, "rougeL_precision": 0.006463314313217977, "rougeL_precision_stderr": 0.0007367508919664071, "rougeL_recall": 0.008641735031241883, "rougeL_recall_stderr": 0.0009172071264531992, "rougeLsum_fmeasure": 0.006855742708837078, "rougeLsum_fmeasure_stderr": 0.0007014519003240475, "rougeLsum_precision": 0.0075302854587578905, "rougeLsum_precision_stderr": 0.0008806851003942225, "rougeLsum_recall": 0.009830760205551979, "rougeLsum_recall_stderr": 0.0010421634809987866}, "summarize_above_en": {"bleu": 7.509001958231661e-10, "bleu_stderr": 1.9581849644344306e-09, "rouge1_fmeasure": 0.006704558536864876, "rouge1_fmeasure_stderr": 0.0007268399140477246, "rouge1_precision": 0.006943053713544341, "rouge1_precision_stderr": 0.0008438126140001231, "rouge1_recall": 0.008894699789752037, "rouge1_recall_stderr": 0.0009656196544936773, "rouge2_fmeasure": 0.0015940878813423497, "rouge2_fmeasure_stderr": 0.0002489498244534957, "rouge2_precision": 0.0016201459969527312, "rouge2_precision_stderr": 0.0002822313186020891, "rouge2_recall": 0.002057657522078538, "rouge2_recall_stderr": 0.00031150670601567205, "rougeL_fmeasure": 0.005412786572883017, "rougeL_fmeasure_stderr": 0.0005765647323043862, "rougeL_precision": 0.005508592130950043, "rougeL_precision_stderr": 0.0006491909281406889, "rougeL_recall": 0.0073750235680540436, "rougeL_recall_stderr": 0.0008073064738323795, "rougeLsum_fmeasure": 0.006217775628514679, "rougeLsum_fmeasure_stderr": 0.000672203252135069, "rougeLsum_precision": 0.006472729964406455, "rougeLsum_precision_stderr": 0.0007966089744646425, "rougeLsum_recall": 0.008284399822459577, "rougeLsum_recall_stderr": 0.0009017405138743037}, "tldr_en": {"bleu": 1.610620675193976e-08, "bleu_stderr": 6.336079313722214e-08, "rouge1_fmeasure": 0.008814606254748622, "rouge1_fmeasure_stderr": 0.0008820141996097188, "rouge1_precision": 0.011141473368752484, "rouge1_precision_stderr": 0.0012984382630292146, "rouge1_recall": 0.011356368661606248, "rouge1_recall_stderr": 0.001166446625113568, "rouge2_fmeasure": 0.0024833328621297794, "rouge2_fmeasure_stderr": 0.0003381158590391299, "rouge2_precision": 0.0036979579695363266, "rouge2_precision_stderr": 0.00069734856115032, "rouge2_recall": 0.0030116660387216835, "rouge2_recall_stderr": 0.00039145176963269976, "rougeL_fmeasure": 0.0067361341505456805, "rougeL_fmeasure_stderr": 0.0006778741138953477, "rougeL_precision": 0.008749076310088717, "rougeL_precision_stderr": 0.0010822814361618575, "rougeL_recall": 0.008702418570223684, "rougeL_recall_stderr": 0.0008937526565969341, "rougeLsum_fmeasure": 0.008282398875850052, "rougeLsum_fmeasure_stderr": 0.0008304343325046589, "rougeLsum_precision": 0.010561687151292019, "rougeLsum_precision_stderr": 0.0012526496630397305, "rougeLsum_recall": 0.010693942253000091, "rougeLsum_recall_stderr": 0.001098581608402249}, "write_abstract_en": {"bleu": 6.481159719965521e-16, "bleu_stderr": 2.6449993743474514e-15, "rouge1_fmeasure": 0.0026710840623148636, "rouge1_fmeasure_stderr": 0.000399411300114256, "rouge1_precision": 0.002265233515641718, "rouge1_precision_stderr": 0.0003579734134523194, "rouge1_recall": 0.004150900437862034, "rouge1_recall_stderr": 0.000611002349583528, "rouge2_fmeasure": 0.0003338217989499952, "rouge2_fmeasure_stderr": 8.131145790626592e-05, "rouge2_precision": 0.00026974488596675497, "rouge2_precision_stderr": 6.71291598724476e-05, "rouge2_recall": 0.0005227590853525244, "rouge2_recall_stderr": 0.00013134134677067233, "rougeL_fmeasure": 0.002058167115063423, "rougeL_fmeasure_stderr": 0.00030026229552540865, "rougeL_precision": 0.001728264703351125, "rougeL_precision_stderr": 0.00026158511522949576, "rougeL_recall": 0.003287044984691778, "rougeL_recall_stderr": 0.00049276938287603, "rougeLsum_fmeasure": 0.0025284149541073293, "rougeLsum_fmeasure_stderr": 0.00037733824330625027, "rougeLsum_precision": 0.0021438711710157696, "rougeLsum_precision_stderr": 0.00033684195204710315, "rougeLsum_recall": 0.003922128301743887, "rougeLsum_recall_stderr": 0.0005753087418442966}}}, "anli_r1": {"0": {"GPT-3 style": {"acc": 0.329, "acc_norm": 0.32, "acc_norm_stderr": 0.014758652303574872, "acc_stderr": 0.01486539538592837, "subset": 1}, "MNLI crowdsource": {"acc": 0.334, "acc_norm": 0.349, "acc_norm_stderr": 0.0150806639915631, "acc_stderr": 0.014922019523732954, "subset": 1}, "can we infer": {"acc": 0.334, "acc_norm": 0.331, "acc_norm_stderr": 0.014888272588203941, "acc_stderr": 0.014922019523732954, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.332, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229859, "acc_stderr": 0.014899597242811494, "subset": 1}, "justified in saying": {"acc": 0.344, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229857, "acc_stderr": 0.015029633724408948, "subset": 1}}, "1": {"GPT-3 style": {"acc": 0.355, "acc_norm": 0.334, "acc_norm_stderr": 0.014922019523732963, "acc_stderr": 0.015139491543780532, "subset": 1}, "MNLI crowdsource": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014910846164229863, "subset": 1}, "can we infer": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014910846164229863, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.343, "acc_norm": 0.336, "acc_norm_stderr": 0.014944140233795025, "acc_stderr": 0.015019206922356953, "subset": 1}, "justified in saying": {"acc": 0.332, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014899597242811483, "subset": 1}}, "2": {"GPT-3 style": {"acc": 0.36, "acc_norm": 0.344, "acc_norm_stderr": 0.015029633724408947, "acc_stderr": 0.015186527932040117, "subset": 1}, "MNLI crowdsource": {"acc": 0.352, "acc_norm": 0.346, "acc_norm_stderr": 0.015050266127564436, "acc_stderr": 0.015110404505648658, "subset": 1}, "can we infer": {"acc": 0.355, "acc_norm": 0.332, "acc_norm_stderr": 0.014899597242811487, "acc_stderr": 0.015139491543780529, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.32, "acc_norm": 0.334, "acc_norm_stderr": 0.014922019523732967, "acc_stderr": 0.014758652303574878, "subset": 1}, "justified in saying": {"acc": 0.351, "acc_norm": 0.331, "acc_norm_stderr": 0.014888272588203933, "acc_stderr": 0.015100563798316405, "subset": 1}}, "3": {"GPT-3 style": {"acc": 0.363, "acc_norm": 0.353, "acc_norm_stderr": 0.015120172605483696, "acc_stderr": 0.015213890444671287, "subset": 1}, "MNLI crowdsource": {"acc": 0.361, "acc_norm": 0.343, "acc_norm_stderr": 0.015019206922356953, "acc_stderr": 0.015195720118175122, "subset": 1}, "can we infer": {"acc": 0.36, "acc_norm": 0.345, "acc_norm_stderr": 0.015039986742055237, "acc_stderr": 0.015186527932040119, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.326, "acc_norm": 0.331, "acc_norm_stderr": 0.014888272588203938, "acc_stderr": 0.014830507204541042, "subset": 1}, "justified in saying": {"acc": 0.347, "acc_norm": 0.342, "acc_norm_stderr": 0.01500870618212173, "acc_stderr": 0.015060472031706618, "subset": 1}}, "4": {"GPT-3 style": {"acc": 0.349, "acc_norm": 0.353, "acc_norm_stderr": 0.015120172605483694, "acc_stderr": 0.0150806639915631, "subset": 1}, "MNLI crowdsource": {"acc": 0.35, "acc_norm": 0.344, "acc_norm_stderr": 0.015029633724408945, "acc_stderr": 0.015090650341444233, "subset": 1}, "can we infer": {"acc": 0.334, "acc_norm": 0.337, "acc_norm_stderr": 0.014955087918653598, "acc_stderr": 0.014922019523732956, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.332, "acc_norm": 0.325, "acc_norm_stderr": 0.014818724459095526, "acc_stderr": 0.014899597242811476, "subset": 1}, "justified in saying": {"acc": 0.331, "acc_norm": 0.337, "acc_norm_stderr": 0.014955087918653593, "acc_stderr": 0.014888272588203933, "subset": 1}}, "5": {"GPT-3 style": {"acc": 0.364, "acc_norm": 0.344, "acc_norm_stderr": 0.015029633724408947, "acc_stderr": 0.015222868840522024, "subset": 1}, "MNLI crowdsource": {"acc": 0.351, "acc_norm": 0.364, "acc_norm_stderr": 0.015222868840522022, "acc_stderr": 0.015100563798316403, "subset": 1}, "can we infer": {"acc": 0.338, "acc_norm": 0.328, "acc_norm_stderr": 0.014853842487270333, "acc_stderr": 0.014965960710224482, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.33, "acc_norm": 0.337, "acc_norm_stderr": 0.014955087918653609, "acc_stderr": 0.014876872027456732, "subset": 1}, "justified in saying": {"acc": 0.33, "acc_norm": 0.317, "acc_norm_stderr": 0.014721675438880217, "acc_stderr": 0.014876872027456738, "subset": 1}}}, "anli_r2": {"0": {"GPT-3 style": {"acc": 0.333, "acc_norm": 0.342, "acc_norm_stderr": 0.015008706182121734, "acc_stderr": 0.014910846164229871, "subset": 2}, "MNLI crowdsource": {"acc": 0.334, "acc_norm": 0.344, "acc_norm_stderr": 0.015029633724408947, "acc_stderr": 0.014922019523732958, "subset": 2}, "can we infer": {"acc": 0.329, "acc_norm": 0.33, "acc_norm_stderr": 0.014876872027456727, "acc_stderr": 0.014865395385928369, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.333, "acc_norm": 0.334, "acc_norm_stderr": 0.01492201952373297, "acc_stderr": 0.014910846164229859, "subset": 2}, "justified in saying": {"acc": 0.331, "acc_norm": 0.333, "acc_norm_stderr": 0.01491084616422987, "acc_stderr": 0.014888272588203934, "subset": 2}}, "1": {"GPT-3 style": {"acc": 0.315, "acc_norm": 0.314, "acc_norm_stderr": 0.014683991951087973, "acc_stderr": 0.014696631960792515, "subset": 2}, "MNLI crowdsource": {"acc": 0.315, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014696631960792506, "subset": 2}, "can we infer": {"acc": 0.315, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014696631960792506, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.311, "acc_norm": 0.316, "acc_norm_stderr": 0.01470919305605713, "acc_stderr": 0.014645596385722692, "subset": 2}, "justified in saying": {"acc": 0.315, "acc_norm": 0.314, "acc_norm_stderr": 0.014683991951087973, "acc_stderr": 0.014696631960792506, "subset": 2}}, "2": {"GPT-3 style": {"acc": 0.334, "acc_norm": 0.317, "acc_norm_stderr": 0.014721675438880213, "acc_stderr": 0.014922019523732956, "subset": 2}, "MNLI crowdsource": {"acc": 0.316, "acc_norm": 0.323, "acc_norm_stderr": 0.014794927843348633, "acc_stderr": 0.014709193056057127, "subset": 2}, "can we infer": {"acc": 0.324, "acc_norm": 0.326, "acc_norm_stderr": 0.014830507204541026, "acc_stderr": 0.014806864733738868, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.326, "acc_norm": 0.32, "acc_norm_stderr": 0.014758652303574885, "acc_stderr": 0.014830507204541035, "subset": 2}, "justified in saying": {"acc": 0.32, "acc_norm": 0.322, "acc_norm_stderr": 0.014782913600996686, "acc_stderr": 0.014758652303574897, "subset": 2}}, "3": {"GPT-3 style": {"acc": 0.326, "acc_norm": 0.317, "acc_norm_stderr": 0.014721675438880215, "acc_stderr": 0.014830507204541035, "subset": 2}, "MNLI crowdsource": {"acc": 0.317, "acc_norm": 0.308, "acc_norm_stderr": 0.014606483127342761, "acc_stderr": 0.014721675438880219, "subset": 2}, "can we infer": {"acc": 0.324, "acc_norm": 0.328, "acc_norm_stderr": 0.014853842487270333, "acc_stderr": 0.014806864733738863, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.341, "acc_norm": 0.333, "acc_norm_stderr": 0.01491084616422987, "acc_stderr": 0.014998131348402709, "subset": 2}, "justified in saying": {"acc": 0.324, "acc_norm": 0.324, "acc_norm_stderr": 0.014806864733738864, "acc_stderr": 0.01480686473373886, "subset": 2}}, "4": {"GPT-3 style": {"acc": 0.34, "acc_norm": 0.304, "acc_norm_stderr": 0.01455320568795044, "acc_stderr": 0.014987482264363933, "subset": 2}, "MNLI crowdsource": {"acc": 0.32, "acc_norm": 0.312, "acc_norm_stderr": 0.014658474370509005, "acc_stderr": 0.014758652303574883, "subset": 2}, "can we infer": {"acc": 0.314, "acc_norm": 0.32, "acc_norm_stderr": 0.014758652303574888, "acc_stderr": 0.014683991951087976, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.332, "acc_norm": 0.336, "acc_norm_stderr": 0.014944140233795018, "acc_stderr": 0.014899597242811482, "subset": 2}, "justified in saying": {"acc": 0.317, "acc_norm": 0.309, "acc_norm_stderr": 0.014619600977206488, "acc_stderr": 0.014721675438880219, "subset": 2}}, "5": {"GPT-3 style": {"acc": 0.317, "acc_norm": 0.311, "acc_norm_stderr": 0.014645596385722694, "acc_stderr": 0.01472167543888022, "subset": 2}, "MNLI crowdsource": {"acc": 0.312, "acc_norm": 0.313, "acc_norm_stderr": 0.01467127282297789, "acc_stderr": 0.014658474370509007, "subset": 2}, "can we infer": {"acc": 0.321, "acc_norm": 0.314, "acc_norm_stderr": 0.014683991951087964, "acc_stderr": 0.014770821817934645, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.339, "acc_norm": 0.338, "acc_norm_stderr": 0.014965960710224482, "acc_stderr": 0.014976758771620345, "subset": 2}, "justified in saying": {"acc": 0.331, "acc_norm": 0.312, "acc_norm_stderr": 0.014658474370509012, "acc_stderr": 0.014888272588203931, "subset": 2}}}, "anli_r3": {"0": {"GPT-3 style": {"acc": 0.3275, "acc_norm": 0.345, "acc_norm_stderr": 0.013728421539454872, "acc_stderr": 0.013553211167251947, "subset": 3}, "MNLI crowdsource": {"acc": 0.3375, "acc_norm": 0.3308333333333333, "acc_norm_stderr": 0.013588208070709002, "acc_stderr": 0.013655897185463664, "subset": 3}, "can we infer": {"acc": 0.32666666666666666, "acc_norm": 0.335, "acc_norm_stderr": 0.013630871843821469, "acc_stderr": 0.013544340907003663, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3075, "acc_norm": 0.3283333333333333, "acc_norm_stderr": 0.013562032919529017, "acc_stderr": 0.013326707242912041, "subset": 3}, "justified in saying": {"acc": 0.3475, "acc_norm": 0.3325, "acc_norm_stderr": 0.013605417345710526, "acc_stderr": 0.013751753243291854, "subset": 3}}, "1": {"GPT-3 style": {"acc": 0.335, "acc_norm": 0.3375, "acc_norm_stderr": 0.013655897185463653, "acc_stderr": 0.013630871843821472, "subset": 3}, "MNLI crowdsource": {"acc": 0.335, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.01364760294240639, "acc_stderr": 0.013630871843821474, "subset": 3}, "can we infer": {"acc": 0.33666666666666667, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.013647602942406393, "acc_stderr": 0.013647602942406393, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3375, "acc_norm": 0.335, "acc_norm_stderr": 0.013630871843821472, "acc_stderr": 0.013655897185463653, "subset": 3}, "justified in saying": {"acc": 0.3358333333333333, "acc_norm": 0.3358333333333333, "acc_norm_stderr": 0.013639261190932887, "acc_stderr": 0.013639261190932889, "subset": 3}}, "2": {"GPT-3 style": {"acc": 0.32166666666666666, "acc_norm": 0.3225, "acc_norm_stderr": 0.013499258621103244, "acc_stderr": 0.013490095282989521, "subset": 3}, "MNLI crowdsource": {"acc": 0.32916666666666666, "acc_norm": 0.32666666666666666, "acc_norm_stderr": 0.013544340907003663, "acc_stderr": 0.013570806258433625, "subset": 3}, "can we infer": {"acc": 0.31333333333333335, "acc_norm": 0.305, "acc_norm_stderr": 0.013296358936471105, "acc_stderr": 0.013395739415639082, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.32083333333333336, "acc_norm": 0.3258333333333333, "acc_norm_stderr": 0.013535422043417459, "acc_stderr": 0.013480882752851548, "subset": 3}, "justified in saying": {"acc": 0.32, "acc_norm": 0.30916666666666665, "acc_norm_stderr": 0.013346684134591945, "acc_stderr": 0.013471620929769145, "subset": 3}}, "3": {"GPT-3 style": {"acc": 0.33166666666666667, "acc_norm": 0.32166666666666666, "acc_norm_stderr": 0.013490095282989521, "acc_stderr": 0.013596836729485163, "subset": 3}, "MNLI crowdsource": {"acc": 0.3425, "acc_norm": 0.3416666666666667, "acc_norm_stderr": 0.013696658778002515, "acc_stderr": 0.013704669762934723, "subset": 3}, "can we infer": {"acc": 0.3433333333333333, "acc_norm": 0.32416666666666666, "acc_norm_stderr": 0.013517438120881633, "acc_stderr": 0.01371263383046586, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3275, "acc_norm": 0.33, "acc_norm_stderr": 0.013579531277800922, "acc_stderr": 0.013553211167251956, "subset": 3}, "justified in saying": {"acc": 0.3525, "acc_norm": 0.32916666666666666, "acc_norm_stderr": 0.013570806258433623, "acc_stderr": 0.013797164918918366, "subset": 3}}, "4": {"GPT-3 style": {"acc": 0.32166666666666666, "acc_norm": 0.32083333333333336, "acc_norm_stderr": 0.013480882752851557, "acc_stderr": 0.013490095282989521, "subset": 3}, "MNLI crowdsource": {"acc": 0.335, "acc_norm": 0.3275, "acc_norm_stderr": 0.013553211167251947, "acc_stderr": 0.013630871843821474, "subset": 3}, "can we infer": {"acc": 0.3225, "acc_norm": 0.33166666666666667, "acc_norm_stderr": 0.013596836729485164, "acc_stderr": 0.013499258621103245, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3408333333333333, "acc_norm": 0.3408333333333333, "acc_norm_stderr": 0.013688600793296934, "acc_stderr": 0.013688600793296936, "subset": 3}, "justified in saying": {"acc": 0.31916666666666665, "acc_norm": 0.335, "acc_norm_stderr": 0.013630871843821472, "acc_stderr": 0.01346230971200514, "subset": 3}}, "5": {"GPT-3 style": {"acc": 0.315, "acc_norm": 0.33416666666666667, "acc_norm_stderr": 0.01362243481313678, "acc_stderr": 0.013415009084004864, "subset": 3}, "MNLI crowdsource": {"acc": 0.31833333333333336, "acc_norm": 0.32083333333333336, "acc_norm_stderr": 0.013480882752851553, "acc_stderr": 0.01345294899699629, "subset": 3}, "can we infer": {"acc": 0.31166666666666665, "acc_norm": 0.33166666666666667, "acc_norm_stderr": 0.01359683672948516, "acc_stderr": 0.01337626879098211, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3375, "acc_norm": 0.32916666666666666, "acc_norm_stderr": 0.01357080625843363, "acc_stderr": 0.013655897185463658, "subset": 3}, "justified in saying": {"acc": 0.315, "acc_norm": 0.3358333333333333, "acc_norm_stderr": 0.013639261190932879, "acc_stderr": 0.013415009084004866, "subset": 3}}}, "arc_easy": {"0": {"heres_a_problem": {"acc": 0.255050505050505, "acc_norm": 0.255050505050505, "acc_norm_stderr": 0.008944265906130714, "acc_stderr": 0.008944265906130714}, "i_am_hesitating": {"acc": 0.35185185185185186, "acc_norm": 0.32407407407407407, "acc_norm_stderr": 0.00960372885009539, "acc_stderr": 0.009799078929868707}, "multiple_choice": {"acc": 0.2354948805460751, "acc_norm": 0.27047781569965873, "acc_norm_stderr": 0.012980954547659556, "acc_stderr": 0.012399451855004752}, "pick_the_most_correct_option": {"acc": 0.2563131313131313, "acc_norm": 0.2563131313131313, "acc_norm_stderr": 0.008958775997918365, "acc_stderr": 0.008958775997918365}, "qa_options": {"acc": 0.35395622895622897, "acc_norm": 0.3148148148148148, "acc_norm_stderr": 0.009530150430975593, "acc_stderr": 0.00981237064417443}}, "1": {"heres_a_problem": {"acc": 0.23208191126279865, "acc_norm": 0.23208191126279865, "acc_norm_stderr": 0.012336718284948856, "acc_stderr": 0.012336718284948856}, "i_am_hesitating": {"acc": 0.2713310580204778, "acc_norm": 0.29692832764505117, "acc_norm_stderr": 0.013352025976725222, "acc_stderr": 0.012993807727545784}, "multiple_choice": {"acc": 0.25, "acc_norm": 0.26109215017064846, "acc_norm_stderr": 0.012835523909473857, "acc_stderr": 0.012653835621466646}, "pick_the_most_correct_option": {"acc": 0.24284511784511784, "acc_norm": 0.24284511784511784, "acc_norm_stderr": 0.00879883644422203, "acc_stderr": 0.00879883644422203}, "qa_options": {"acc": 0.3291245791245791, "acc_norm": 0.30345117845117847, "acc_norm_stderr": 0.009433837434252272, "acc_stderr": 0.009642048058060978}}, "2": {"heres_a_problem": {"acc": 0.2558922558922559, "acc_norm": 0.2558922558922559, "acc_norm_stderr": 0.008953950243013993, "acc_stderr": 0.008953950243013993}, "i_am_hesitating": {"acc": 0.3333333333333333, "acc_norm": 0.30303030303030304, "acc_norm_stderr": 0.009430140669278948, "acc_stderr": 0.009673016668133394}, "multiple_choice": {"acc": 0.3282828282828283, "acc_norm": 0.3291245791245791, "acc_norm_stderr": 0.009642048058060978, "acc_stderr": 0.00963574950926216}, "pick_the_most_correct_option": {"acc": 0.2563131313131313, "acc_norm": 0.2563131313131313, "acc_norm_stderr": 0.008958775997918354, "acc_stderr": 0.008958775997918354}, "qa_options": {"acc": 0.32154882154882153, "acc_norm": 0.30765993265993263, "acc_norm_stderr": 0.009470292575831183, "acc_stderr": 0.009584091575640627}}, "3": {"heres_a_problem": {"acc": 0.22866894197952217, "acc_norm": 0.22866894197952217, "acc_norm_stderr": 0.012272853582540806, "acc_stderr": 0.012272853582540806}, "i_am_hesitating": {"acc": 0.335016835016835, "acc_norm": 0.2984006734006734, "acc_norm_stderr": 0.009388855914040433, "acc_stderr": 0.009685160765932356}, "multiple_choice": {"acc": 0.26023890784982934, "acc_norm": 0.2696245733788396, "acc_norm_stderr": 0.012968040686869154, "acc_stderr": 0.012821930225112556}, "pick_the_most_correct_option": {"acc": 0.24621212121212122, "acc_norm": 0.24621212121212122, "acc_norm_stderr": 0.008839902656771866, "acc_stderr": 0.008839902656771866}, "qa_options": {"acc": 0.2738907849829352, "acc_norm": 0.2832764505119454, "acc_norm_stderr": 0.013167478735134576, "acc_stderr": 0.013032004972989503}}, "4": {"heres_a_problem": {"acc": 0.24061433447098976, "acc_norm": 0.24061433447098976, "acc_norm_stderr": 0.01249146853239057, "acc_stderr": 0.01249146853239057}, "i_am_hesitating": {"acc": 0.32407407407407407, "acc_norm": 0.3026094276094276, "acc_norm_stderr": 0.009426434542371227, "acc_stderr": 0.009603728850095384}, "multiple_choice": {"acc": 0.26535836177474403, "acc_norm": 0.26706484641638223, "acc_norm_stderr": 0.012928933196496342, "acc_stderr": 0.012902554762313964}, "pick_the_most_correct_option": {"acc": 0.24705387205387205, "acc_norm": 0.24705387205387205, "acc_norm_stderr": 0.008850055161459234, "acc_stderr": 0.008850055161459234}, "qa_options": {"acc": 0.26023890784982934, "acc_norm": 0.27303754266211605, "acc_norm_stderr": 0.013019332762635725, "acc_stderr": 0.012821930225112547}}, "5": {"heres_a_problem": {"acc": 0.24663299663299662, "acc_norm": 0.24663299663299662, "acc_norm_stderr": 0.008844984581934895, "acc_stderr": 0.008844984581934895}, "i_am_hesitating": {"acc": 0.3202861952861953, "acc_norm": 0.29797979797979796, "acc_norm_stderr": 0.00938504606669487, "acc_stderr": 0.00957415266873942}, "multiple_choice": {"acc": 0.257679180887372, "acc_norm": 0.26535836177474403, "acc_norm_stderr": 0.012902554762313962, "acc_stderr": 0.012780770562768405}, "pick_the_most_correct_option": {"acc": 0.25252525252525254, "acc_norm": 0.25252525252525254, "acc_norm_stderr": 0.00891494899149571, "acc_stderr": 0.00891494899149571}, "qa_options": {"acc": 0.3164983164983165, "acc_norm": 0.2962962962962963, "acc_norm_stderr": 0.009369711585684292, "acc_stderr": 0.009543851857323891}}}, "boolq": {"0": {"GPT-3 Style": {"acc": 0.589, "acc_norm": 0.6273333333333333, "acc_norm_stderr": 0.00882919733890307, "acc_stderr": 0.008984425782182318}, "after_reading": {"acc": 0.6206666666666667, "acc_norm": 0.4083333333333333, "acc_norm_stderr": 0.008975481073137033, "acc_stderr": 0.008860362324722527}, "exercise": {"acc": 0.6226666666666667, "acc_norm": 0.46266666666666667, "acc_norm_stderr": 0.009104744524973354, "acc_stderr": 0.00885120015653439}, "valid_binary": {"acc": 0.49766666666666665, "acc_norm": 0.38133333333333336, "acc_norm_stderr": 0.008869364649389163, "acc_stderr": 0.009130131705156546}, "yes_no_question": {"acc": 0.38966666666666666, "acc_norm": 0.6243333333333333, "acc_norm_stderr": 0.008843442555522142, "acc_stderr": 0.008905164372580985}}, "1": {"GPT-3 Style": {"acc": 0.6156666666666667, "acc_norm": 0.63, "acc_norm_stderr": 0.008816229842524025, "acc_stderr": 0.008882569490543049}, "after_reading": {"acc": 0.5406666666666666, "acc_norm": 0.5406666666666666, "acc_norm_stderr": 0.009099982269204863, "acc_stderr": 0.009099982269204863}, "exercise": {"acc": 0.5423333333333333, "acc_norm": 0.5413333333333333, "acc_norm_stderr": 0.009098980657278165, "acc_stderr": 0.009097447488896774}, "valid_binary": {"acc": 0.5426666666666666, "acc_norm": 0.5423333333333333, "acc_norm_stderr": 0.009097447488896775, "acc_stderr": 0.009096928229880421}, "yes_no_question": {"acc": 0.5406666666666666, "acc_norm": 0.5406666666666666, "acc_norm_stderr": 0.009099982269204863, "acc_stderr": 0.009099982269204863}}, "2": {"GPT-3 Style": {"acc": 0.6273333333333333, "acc_norm": 0.6273333333333333, "acc_norm_stderr": 0.008829197338903068, "acc_stderr": 0.008829197338903068}, "after_reading": {"acc": 0.5963333333333334, "acc_norm": 0.5913333333333334, "acc_norm_stderr": 0.008976614094836194, "acc_stderr": 0.008959169522662576}, "exercise": {"acc": 0.5473333333333333, "acc_norm": 0.531, "acc_norm_stderr": 0.009112665923139413, "acc_stderr": 0.00908922749948324}, "valid_binary": {"acc": 0.5913333333333334, "acc_norm": 0.588, "acc_norm_stderr": 0.008987709736566396, "acc_stderr": 0.00897661409483619}, "yes_no_question": {"acc": 0.595, "acc_norm": 0.5943333333333334, "acc_norm_stderr": 0.008966262991425923, "acc_stderr": 0.00896391565823638}}, "3": {"GPT-3 Style": {"acc": 0.6313333333333333, "acc_norm": 0.6336666666666667, "acc_norm_stderr": 0.008797928274394058, "acc_stderr": 0.008809638003862736}, "after_reading": {"acc": 0.613, "acc_norm": 0.605, "acc_norm_stderr": 0.008926639623340282, "acc_stderr": 0.008894007408882734}, "exercise": {"acc": 0.546, "acc_norm": 0.523, "acc_norm_stderr": 0.0091205662238016, "acc_stderr": 0.009091509877386517}, "valid_binary": {"acc": 0.6136666666666667, "acc_norm": 0.6033333333333334, "acc_norm_stderr": 0.008933122315228996, "acc_stderr": 0.008891174310695494}, "yes_no_question": {"acc": 0.6096666666666667, "acc_norm": 0.6126666666666667, "acc_norm_stderr": 0.008895417372116205, "acc_stderr": 0.008907909838637944}}, "4": {"GPT-3 Style": {"acc": 0.6323333333333333, "acc_norm": 0.631, "acc_norm_stderr": 0.008811292732995706, "acc_stderr": 0.008804646702971675}, "after_reading": {"acc": 0.6173333333333333, "acc_norm": 0.604, "acc_norm_stderr": 0.008930542249025189, "acc_stderr": 0.008875277637761267}, "exercise": {"acc": 0.5476666666666666, "acc_norm": 0.5156666666666667, "acc_norm_stderr": 0.009125748094153249, "acc_stderr": 0.009088646624339614}, "valid_binary": {"acc": 0.6156666666666667, "acc_norm": 0.603, "acc_norm_stderr": 0.00893440584870012, "acc_stderr": 0.008882569490543054}, "yes_no_question": {"acc": 0.6206666666666667, "acc_norm": 0.6236666666666667, "acc_norm_stderr": 0.008846558976258924, "acc_stderr": 0.008860362324722528}}, "5": {"GPT-3 Style": {"acc": 0.6276666666666667, "acc_norm": 0.6323333333333333, "acc_norm_stderr": 0.00880464670297168, "acc_stderr": 0.008827592133099664}, "after_reading": {"acc": 0.62, "acc_norm": 0.6096666666666667, "acc_norm_stderr": 0.008907909838637955, "acc_stderr": 0.008863380835773165}, "exercise": {"acc": 0.5383333333333333, "acc_norm": 0.5193333333333333, "acc_norm_stderr": 0.009123403215694962, "acc_stderr": 0.009103358843448796}, "valid_binary": {"acc": 0.6183333333333333, "acc_norm": 0.6013333333333334, "acc_norm_stderr": 0.008940758594209433, "acc_stderr": 0.008870849530787626}, "yes_no_question": {"acc": 0.616, "acc_norm": 0.624, "acc_norm_stderr": 0.008845002997512752, "acc_stderr": 0.008881119942353993}}}, "cb": {"0": {"GPT-3 style": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.20571590265987547}, "MNLI crowdsource": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.19047619047619047}, "can we infer": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2407177363699103}, "guaranteed/possible/impossible": {"acc": 0.30357142857142855, "acc_stderr": 0.06199938655510753, "f1": 0.24545791620318877}, "justified in saying": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2558647026732133}}, "1": {"GPT-3 style": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "MNLI crowdsource": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "can we infer": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "guaranteed/possible/impossible": {"acc": 0.35714285714285715, "acc_stderr": 0.0646095738380922, "f1": 0.26425954997383566}, "justified in saying": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}}, "2": {"GPT-3 style": {"acc": 0.44642857142857145, "acc_stderr": 0.06703189227942398, "f1": 0.31122702434177846}, "MNLI crowdsource": {"acc": 0.44642857142857145, "acc_stderr": 0.06703189227942398, "f1": 0.3081967213114754}, "can we infer": {"acc": 0.44642857142857145, "acc_stderr": 0.06703189227942398, "f1": 0.32100667693888035}, "guaranteed/possible/impossible": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.32806324110671936}, "justified in saying": {"acc": 0.44642857142857145, "acc_stderr": 0.06703189227942398, "f1": 0.317639673571877}}, "3": {"GPT-3 style": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.3113026819923371}, "MNLI crowdsource": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.23488400048082703}, "can we infer": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.3040078201368524}, "guaranteed/possible/impossible": {"acc": 0.26785714285714285, "acc_stderr": 0.05971290310957636, "f1": 0.23582766439909297}, "justified in saying": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2716672677004448}}, "4": {"GPT-3 style": {"acc": 0.48214285714285715, "acc_stderr": 0.0673769750864465, "f1": 0.33963161021984556}, "MNLI crowdsource": {"acc": 0.35714285714285715, "acc_stderr": 0.0646095738380922, "f1": 0.250952380952381}, "can we infer": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.31340255400405775}, "guaranteed/possible/impossible": {"acc": 0.21428571428571427, "acc_stderr": 0.05532833351724884, "f1": 0.1915830546265329}, "justified in saying": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359541, "f1": 0.2880952380952381}}, "5": {"GPT-3 style": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.31340255400405775}, "MNLI crowdsource": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.26353276353276356}, "can we infer": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359542, "f1": 0.2886268730041759}, "guaranteed/possible/impossible": {"acc": 0.21428571428571427, "acc_stderr": 0.055328333517248834, "f1": 0.18070818070818073}, "justified in saying": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.31761006289308175}}}, "copa": {"0": {"best_option": {"acc": 0.6, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912, "acc_stderr": 0.04923659639173309}, "cause_effect": {"acc": 0.6, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912, "acc_stderr": 0.049236596391733084}, "choose": {"acc": 0.6, "acc_norm": 0.53, "acc_norm_stderr": 0.050161355804659205, "acc_stderr": 0.049236596391733084}, "i_am_hesitating": {"acc": 0.56, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605, "acc_stderr": 0.04988876515698589}, "plausible_alternatives": {"acc": 0.57, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605, "acc_stderr": 0.04975698519562428}}, "1": {"best_option": {"acc": 0.5, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.050251890762960605}, "cause_effect": {"acc": 0.46, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084, "acc_stderr": 0.05009082659620332}, "choose": {"acc": 0.48, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.050211673156867795}, "i_am_hesitating": {"acc": 0.47, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.05016135580465919}, "plausible_alternatives": {"acc": 0.46, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.05009082659620332}}, "2": {"best_option": {"acc": 0.48, "acc_norm": 0.45, "acc_norm_stderr": 0.049999999999999996, "acc_stderr": 0.050211673156867795}, "cause_effect": {"acc": 0.43, "acc_norm": 0.43, "acc_norm_stderr": 0.04975698519562428, "acc_stderr": 0.049756985195624284}, "choose": {"acc": 0.47, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589, "acc_stderr": 0.05016135580465919}, "i_am_hesitating": {"acc": 0.42, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.049604496374885836}, "plausible_alternatives": {"acc": 0.44, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.04988876515698589}}, "3": {"best_option": {"acc": 0.52, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956911, "acc_stderr": 0.050211673156867795}, "cause_effect": {"acc": 0.45, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795, "acc_stderr": 0.04999999999999999}, "choose": {"acc": 0.44, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.0498887651569859}, "i_am_hesitating": {"acc": 0.48, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.05021167315686779}, "plausible_alternatives": {"acc": 0.43, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.049756985195624284}}, "4": {"best_option": {"acc": 0.53, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605, "acc_stderr": 0.050161355804659205}, "cause_effect": {"acc": 0.47, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999998, "acc_stderr": 0.05016135580465919}, "choose": {"acc": 0.43, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974, "acc_stderr": 0.049756985195624284}, "i_am_hesitating": {"acc": 0.45, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.04999999999999999}, "plausible_alternatives": {"acc": 0.46, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.05009082659620332}}, "5": {"best_option": {"acc": 0.5, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605, "acc_stderr": 0.050251890762960605}, "cause_effect": {"acc": 0.47, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956913, "acc_stderr": 0.05016135580465919}, "choose": {"acc": 0.5, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605, "acc_stderr": 0.050251890762960605}, "i_am_hesitating": {"acc": 0.49, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.05024183937956912}, "plausible_alternatives": {"acc": 0.46, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795, "acc_stderr": 0.05009082659620332}}}, "e2e_nlg_cleaned": {"0": {"coherent_text": {"bleu": 3.3348308138228138, "bleu_stderr": 0.0542442268004444, "rouge1_fmeasure": 0.2057045204344029, "rouge1_fmeasure_stderr": 0.0015278039224567346, "rouge1_precision": 0.15695463233811197, "rouge1_precision_stderr": 0.0013172199856712327, "rouge1_recall": 0.31753062729975684, "rouge1_recall_stderr": 0.0021570780173147865, "rouge2_fmeasure": 0.07227415925772734, "rouge2_fmeasure_stderr": 0.0010191143299311432, "rouge2_precision": 0.054730014529234865, "rouge2_precision_stderr": 0.0007967615111152051, "rouge2_recall": 0.11321119898453884, "rouge2_recall_stderr": 0.0016182734674237542, "rougeL_fmeasure": 0.17810439394384858, "rougeL_fmeasure_stderr": 0.0013638022161662619, "rougeL_precision": 0.1355356394919694, "rougeL_precision_stderr": 0.001147779810670209, "rougeL_recall": 0.2762000750744859, "rougeL_recall_stderr": 0.0020067480630458333, "rougeLsum_fmeasure": 0.17817139272595273, "rougeLsum_fmeasure_stderr": 0.0014357182624015643, "rougeLsum_precision": 0.13583123172424444, "rougeLsum_precision_stderr": 0.0012122638205754406, "rougeLsum_recall": 0.27559480017616955, "rougeLsum_recall_stderr": 0.002087623902803619}, "create_text_for_me": {"bleu": 0.5007977834063807, "bleu_stderr": 0.03795381558072733, "rouge1_fmeasure": 0.14034402421598652, "rouge1_fmeasure_stderr": 0.0010836222389849536, "rouge1_precision": 0.10661165424565706, "rouge1_precision_stderr": 0.0009177614571421352, "rouge1_recall": 0.21742813713741768, "rouge1_recall_stderr": 0.0015639823409019026, "rouge2_fmeasure": 0.01749967959480122, "rouge2_fmeasure_stderr": 0.00048519612999649395, "rouge2_precision": 0.013032347739898609, "rouge2_precision_stderr": 0.00035990837418407645, "rouge2_recall": 0.028194149116957987, "rouge2_recall_stderr": 0.0008205075711069978, "rougeL_fmeasure": 0.12481249710425146, "rougeL_fmeasure_stderr": 0.000964737385169164, "rougeL_precision": 0.09463196140112043, "rougeL_precision_stderr": 0.0008085663634196383, "rougeL_recall": 0.19429758597333047, "rougeL_recall_stderr": 0.001444769495062812, "rougeLsum_fmeasure": 0.11908206350989242, "rougeLsum_fmeasure_stderr": 0.0009080994588157214, "rougeLsum_precision": 0.09025046396276212, "rougeLsum_precision_stderr": 0.00076036607720926, "rougeLsum_recall": 0.18536439953811315, "rougeLsum_recall_stderr": 0.0013433343943009591}, "generate_gramatically_correct_text": {"bleu": 1.675468702405354e-98, "bleu_stderr": 2.5576140191327723e-91, "rouge1_fmeasure": 0.0070392305995167406, "rouge1_fmeasure_stderr": 0.0004846274711079081, "rouge1_precision": 0.06825, "rouge1_precision_stderr": 0.004585546668182722, "rouge1_recall": 0.003749985635338865, "rouge1_recall_stderr": 0.00026048740343422104, "rouge2_fmeasure": 5.466015466015465e-05, "rouge2_fmeasure_stderr": 3.1579886935067115e-05, "rouge2_precision": 0.00046666666666666666, "rouge2_precision_stderr": 0.00034636309927530367, "rouge2_recall": 3.063572910771552e-05, "rouge2_recall_stderr": 1.7771220236267286e-05, "rougeL_fmeasure": 0.006953552388838532, "rougeL_fmeasure_stderr": 0.00047845164190825134, "rougeL_precision": 0.06770833333333333, "rougeL_precision_stderr": 0.004559937242192579, "rougeL_recall": 0.003701609566962797, "rougeL_recall_stderr": 0.0002566134309674225, "rougeLsum_fmeasure": 0.006877766213052355, "rougeLsum_fmeasure_stderr": 0.0004727965098252039, "rougeLsum_precision": 0.06752777777777778, "rougeLsum_precision_stderr": 0.0045570416656537435, "rougeLsum_recall": 0.0036533251906324437, "rougeLsum_recall_stderr": 0.0002523074490538275}, "generate_text_restaurant": {"bleu": 0.8175837539659996, "bleu_stderr": 0.06903661804515053, "rouge1_fmeasure": 0.07913875454413057, "rouge1_fmeasure_stderr": 0.0015242869094641899, "rouge1_precision": 0.067410137720777, "rouge1_precision_stderr": 0.0015267778764287776, "rouge1_recall": 0.11905569111444682, "rouge1_recall_stderr": 0.002272837057404917, "rouge2_fmeasure": 0.010022915068112901, "rouge2_fmeasure_stderr": 0.0005999619167956302, "rouge2_precision": 0.00855571479752271, "rouge2_precision_stderr": 0.000610070681102783, "rouge2_recall": 0.01452701665726237, "rouge2_recall_stderr": 0.0008619364610158222, "rougeL_fmeasure": 0.07774011765647057, "rougeL_fmeasure_stderr": 0.0014739968390031794, "rougeL_precision": 0.06588530481318243, "rougeL_precision_stderr": 0.001456682483397516, "rougeL_recall": 0.11730596366118605, "rougeL_recall_stderr": 0.0022143035280371703, "rougeLsum_fmeasure": 0.06060524038626222, "rougeLsum_fmeasure_stderr": 0.0012344605903209444, "rougeLsum_precision": 0.053281568752607505, "rougeLsum_precision_stderr": 0.0013584259494467741, "rougeLsum_recall": 0.0896663423829268, "rougeLsum_recall_stderr": 0.0017758029746831937}, "text": {"bleu": 2.135318771277489, "bleu_stderr": 0.05149736456182392, "rouge1_fmeasure": 0.17219385386945554, "rouge1_fmeasure_stderr": 0.0016039551103560214, "rouge1_precision": 0.1300442556023875, "rouge1_precision_stderr": 0.0012997860884800998, "rouge1_recall": 0.26964198994481503, "rouge1_recall_stderr": 0.002413258698941157, "rouge2_fmeasure": 0.053555609070310047, "rouge2_fmeasure_stderr": 0.0009542998217676876, "rouge2_precision": 0.039727029516438334, "rouge2_precision_stderr": 0.0007171629939039988, "rouge2_recall": 0.08739502207944705, "rouge2_recall_stderr": 0.0015891465126372196, "rougeL_fmeasure": 0.15509057564768253, "rougeL_fmeasure_stderr": 0.0013250287969709912, "rougeL_precision": 0.11658569686014478, "rougeL_precision_stderr": 0.0010550984665999475, "rougeL_recall": 0.24489523558752985, "rougeL_recall_stderr": 0.002084234926217945, "rougeLsum_fmeasure": 0.14407710887166456, "rougeLsum_fmeasure_stderr": 0.0014318610459664793, "rougeLsum_precision": 0.1085790664749436, "rougeLsum_precision_stderr": 0.0011429046649663122, "rougeLsum_recall": 0.22674868093509162, "rougeLsum_recall_stderr": 0.002209787876074805}}, "1": {"coherent_text": {"bleu": 6.124007942744359, "bleu_stderr": 0.07045741934241276, "rouge1_fmeasure": 0.4315579334995969, "rouge1_fmeasure_stderr": 0.002196108406977491, "rouge1_precision": 0.3538932352903374, "rouge1_precision_stderr": 0.0022379532621590835, "rouge1_recall": 0.5881592755809824, "rouge1_recall_stderr": 0.002734526700250523, "rouge2_fmeasure": 0.1874037992560188, "rouge2_fmeasure_stderr": 0.0015276804768402203, "rouge2_precision": 0.15240494036724936, "rouge2_precision_stderr": 0.0013595392500719097, "rouge2_recall": 0.2605145852434897, "rouge2_recall_stderr": 0.00217480883356232, "rougeL_fmeasure": 0.30315812846780715, "rougeL_fmeasure_stderr": 0.0015402915151127962, "rougeL_precision": 0.2466966884536201, "rougeL_precision_stderr": 0.0014576935302306722, "rougeL_recall": 0.41914793158989233, "rougeL_recall_stderr": 0.0023677125845129993, "rougeLsum_fmeasure": 0.354643328344842, "rougeLsum_fmeasure_stderr": 0.0019921434871566575, "rougeLsum_precision": 0.2906120521359478, "rougeLsum_precision_stderr": 0.001962484953478978, "rougeLsum_recall": 0.484233118629712, "rougeLsum_recall_stderr": 0.002583500073418052}, "create_text_for_me": {"bleu": 5.912367109877884, "bleu_stderr": 0.061836973143818245, "rouge1_fmeasure": 0.394742251409417, "rouge1_fmeasure_stderr": 0.0017032945676370627, "rouge1_precision": 0.3128102978451566, "rouge1_precision_stderr": 0.001691715951507062, "rouge1_recall": 0.5681459084996038, "rouge1_recall_stderr": 0.0024899844904499157, "rouge2_fmeasure": 0.16624694535550544, "rouge2_fmeasure_stderr": 0.0013782062542515164, "rouge2_precision": 0.13058114965570497, "rouge2_precision_stderr": 0.0011618657961389826, "rouge2_recall": 0.24495367272360657, "rouge2_recall_stderr": 0.0021854464652663118, "rougeL_fmeasure": 0.2666452952973803, "rougeL_fmeasure_stderr": 0.0013235129810010833, "rougeL_precision": 0.2101887975592876, "rougeL_precision_stderr": 0.0012001358599358798, "rougeL_recall": 0.38833816310299923, "rougeL_recall_stderr": 0.002272167927909678, "rougeLsum_fmeasure": 0.3277807853351648, "rougeLsum_fmeasure_stderr": 0.0016501024982922983, "rougeLsum_precision": 0.2597848984036747, "rougeLsum_precision_stderr": 0.001569416089582477, "rougeLsum_recall": 0.4719852089818159, "rougeLsum_recall_stderr": 0.002427421024485974}, "generate_gramatically_correct_text": {"bleu": 0.09194884482583202, "bleu_stderr": 0.018000796645724612, "rouge1_fmeasure": 0.07620341314076311, "rouge1_fmeasure_stderr": 0.00314933874193351, "rouge1_precision": 0.10846695430848712, "rouge1_precision_stderr": 0.004476787852682004, "rouge1_recall": 0.07403964823256537, "rouge1_recall_stderr": 0.003217977904697661, "rouge2_fmeasure": 0.034204603336784774, "rouge2_fmeasure_stderr": 0.0015853396117442324, "rouge2_precision": 0.04014965767231649, "rouge2_precision_stderr": 0.001936657761303654, "rouge2_recall": 0.03382193035815861, "rouge2_recall_stderr": 0.0016247551035281998, "rougeL_fmeasure": 0.054364245776715184, "rougeL_fmeasure_stderr": 0.002273440248490721, "rougeL_precision": 0.08344446002840113, "rougeL_precision_stderr": 0.003725636670151101, "rougeL_recall": 0.05251848706263357, "rougeL_recall_stderr": 0.002326892413736413, "rougeLsum_fmeasure": 0.06338498822890376, "rougeLsum_fmeasure_stderr": 0.0026438955448359253, "rougeLsum_precision": 0.09421039833488701, "rougeLsum_precision_stderr": 0.004060934406491245, "rougeLsum_recall": 0.06112231322923796, "rougeLsum_recall_stderr": 0.002680843364295191}, "generate_text_restaurant": {"bleu": 11.360023014480657, "bleu_stderr": 0.14523312599938573, "rouge1_fmeasure": 0.4457059970163705, "rouge1_fmeasure_stderr": 0.0023475038584115206, "rouge1_precision": 0.5368002507342822, "rouge1_precision_stderr": 0.003197714150333534, "rouge1_recall": 0.4204347436029918, "rouge1_recall_stderr": 0.0029932330301144583, "rouge2_fmeasure": 0.20489136085595536, "rouge2_fmeasure_stderr": 0.0019458972766888125, "rouge2_precision": 0.25010730577128015, "rouge2_precision_stderr": 0.0025416736419787912, "rouge2_recall": 0.19320140085672308, "rouge2_recall_stderr": 0.0021059841419488645, "rougeL_fmeasure": 0.321161092281206, "rougeL_fmeasure_stderr": 0.001995161634007105, "rougeL_precision": 0.390181985500488, "rougeL_precision_stderr": 0.002842772643373759, "rougeL_recall": 0.30202346410754144, "rougeL_recall_stderr": 0.002382966736707436, "rougeLsum_fmeasure": 0.3629740875330838, "rougeLsum_fmeasure_stderr": 0.00225666081685001, "rougeLsum_precision": 0.4390382418075218, "rougeLsum_precision_stderr": 0.003082038398447118, "rougeLsum_recall": 0.34172069570320107, "rougeLsum_recall_stderr": 0.0026874775375555484}, "text": {"bleu": 6.514185829033243, "bleu_stderr": 0.08644240018036378, "rouge1_fmeasure": 0.44848731359904886, "rouge1_fmeasure_stderr": 0.0020321502995675426, "rouge1_precision": 0.37271218983917304, "rouge1_precision_stderr": 0.0021586668011346864, "rouge1_recall": 0.5972780988107822, "rouge1_recall_stderr": 0.002538198650027097, "rouge2_fmeasure": 0.20027753375836946, "rouge2_fmeasure_stderr": 0.0015441978236127502, "rouge2_precision": 0.16529227064321578, "rouge2_precision_stderr": 0.0013977761177152697, "rouge2_recall": 0.27152870296975423, "rouge2_recall_stderr": 0.002179427881235387, "rougeL_fmeasure": 0.3132180262669656, "rougeL_fmeasure_stderr": 0.0015462995225589086, "rougeL_precision": 0.2586035514720966, "rougeL_precision_stderr": 0.0014969615123517218, "rougeL_recall": 0.422455743313377, "rougeL_recall_stderr": 0.0023558444341864183, "rougeLsum_fmeasure": 0.37097935121646175, "rougeLsum_fmeasure_stderr": 0.0019395659335613548, "rougeLsum_precision": 0.3082635717959347, "rougeLsum_precision_stderr": 0.0019666465629011544, "rougeLsum_recall": 0.49456210205094986, "rougeLsum_recall_stderr": 0.002502870066737165}}, "2": {"coherent_text": {"bleu": 6.873416618205943, "bleu_stderr": 0.06451768534270613, "rouge1_fmeasure": 0.4285046970746608, "rouge1_fmeasure_stderr": 0.0020742383144155244, "rouge1_precision": 0.34827807118661636, "rouge1_precision_stderr": 0.0021235627642469048, "rouge1_recall": 0.5915788110263536, "rouge1_recall_stderr": 0.0026271815009548663, "rouge2_fmeasure": 0.18976901148520062, "rouge2_fmeasure_stderr": 0.001566892088457795, "rouge2_precision": 0.15293244786496132, "rouge2_precision_stderr": 0.0013754728634272706, "rouge2_recall": 0.26790058462792615, "rouge2_recall_stderr": 0.002298680167945361, "rougeL_fmeasure": 0.30509672166724194, "rougeL_fmeasure_stderr": 0.001517855741385593, "rougeL_precision": 0.24630747239149695, "rougeL_precision_stderr": 0.001440271745251014, "rougeL_recall": 0.4268276282189634, "rougeL_recall_stderr": 0.00236641291461196, "rougeLsum_fmeasure": 0.3533423486463186, "rougeLsum_fmeasure_stderr": 0.001973114674110923, "rougeLsum_precision": 0.287048775958497, "rougeLsum_precision_stderr": 0.0019274716695619075, "rougeLsum_recall": 0.48857994815747946, "rougeLsum_recall_stderr": 0.002609384753544022}, "create_text_for_me": {"bleu": 6.6948355547841905, "bleu_stderr": 0.0511979514463551, "rouge1_fmeasure": 0.4017351734479823, "rouge1_fmeasure_stderr": 0.0016902369451423883, "rouge1_precision": 0.3163535969719062, "rouge1_precision_stderr": 0.0016603427152068276, "rouge1_recall": 0.5834971336648034, "rouge1_recall_stderr": 0.002519250385447222, "rouge2_fmeasure": 0.1773205809463223, "rouge2_fmeasure_stderr": 0.0013921382173492855, "rouge2_precision": 0.1381896709382772, "rouge2_precision_stderr": 0.0011500719976283826, "rouge2_recall": 0.26453724451723776, "rouge2_recall_stderr": 0.0022668939643765884, "rougeL_fmeasure": 0.2761947809578966, "rougeL_fmeasure_stderr": 0.0013335952149149215, "rougeL_precision": 0.2163360334247349, "rougeL_precision_stderr": 0.0011827261695832954, "rougeL_recall": 0.40582280287415906, "rougeL_recall_stderr": 0.0023312901734070086, "rougeLsum_fmeasure": 0.33603536027203956, "rougeLsum_fmeasure_stderr": 0.001646750924419848, "rougeLsum_precision": 0.2646418877094466, "rougeLsum_precision_stderr": 0.0015454852649096642, "rougeLsum_recall": 0.48824736439219957, "rougeLsum_recall_stderr": 0.002461897429922055}, "generate_gramatically_correct_text": {"bleu": 1.59707844485191, "bleu_stderr": 0.1425795749931428, "rouge1_fmeasure": 0.1494166605053438, "rouge1_fmeasure_stderr": 0.004165824722062478, "rouge1_precision": 0.19136701846801105, "rouge1_precision_stderr": 0.005379543595527197, "rouge1_recall": 0.14581299068539502, "rouge1_recall_stderr": 0.004242345755430707, "rouge2_fmeasure": 0.07093954631020417, "rouge2_fmeasure_stderr": 0.0021919661337184986, "rouge2_precision": 0.08162398770498766, "rouge2_precision_stderr": 0.0026060437165329924, "rouge2_recall": 0.07003897501562158, "rouge2_recall_stderr": 0.0022428114359523145, "rougeL_fmeasure": 0.10636913992897945, "rougeL_fmeasure_stderr": 0.0030096187848934145, "rougeL_precision": 0.14327202198692984, "rougeL_precision_stderr": 0.0043408686018408405, "rougeL_recall": 0.10348587771415337, "rougeL_recall_stderr": 0.0030734426965391064, "rougeLsum_fmeasure": 0.12280164375185657, "rougeLsum_fmeasure_stderr": 0.003473100919506952, "rougeLsum_precision": 0.16186299066230891, "rougeLsum_precision_stderr": 0.004753003923288998, "rougeLsum_recall": 0.11938311850161805, "rougeLsum_recall_stderr": 0.0035231116397566264}, "generate_text_restaurant": {"bleu": 13.284050272128148, "bleu_stderr": 0.1770460818661209, "rouge1_fmeasure": 0.47571760194079554, "rouge1_fmeasure_stderr": 0.002265550724177173, "rouge1_precision": 0.5714641277328295, "rouge1_precision_stderr": 0.0032799480051059426, "rouge1_recall": 0.4468451656698964, "rouge1_recall_stderr": 0.0029010255062063995, "rouge2_fmeasure": 0.2325284196471626, "rouge2_fmeasure_stderr": 0.0020182406639818696, "rouge2_precision": 0.28407643182119585, "rouge2_precision_stderr": 0.002743346128512875, "rouge2_recall": 0.21834852344926514, "rouge2_recall_stderr": 0.0021773726271045162, "rougeL_fmeasure": 0.3475057918065784, "rougeL_fmeasure_stderr": 0.0020325138450125527, "rougeL_precision": 0.4201198181536731, "rougeL_precision_stderr": 0.002974299270315645, "rougeL_recall": 0.32583054256145155, "rougeL_recall_stderr": 0.002409815129643757, "rougeLsum_fmeasure": 0.391612619922272, "rougeLsum_fmeasure_stderr": 0.002268675240750965, "rougeLsum_precision": 0.4715445246123803, "rougeLsum_precision_stderr": 0.0031935178217488633, "rougeLsum_recall": 0.36737757013379746, "rougeLsum_recall_stderr": 0.002675859339354407}, "text": {"bleu": 6.3652204803444965, "bleu_stderr": 0.084081338904228, "rouge1_fmeasure": 0.42275422267896534, "rouge1_fmeasure_stderr": 0.002039352116930094, "rouge1_precision": 0.34970551268520844, "rouge1_precision_stderr": 0.0020964374351578907, "rouge1_recall": 0.5664575487593494, "rouge1_recall_stderr": 0.002630711098096289, "rouge2_fmeasure": 0.1872354549740021, "rouge2_fmeasure_stderr": 0.0015796108546654892, "rouge2_precision": 0.15364365198250277, "rouge2_precision_stderr": 0.0014033191424566054, "rouge2_recall": 0.2562044764289271, "rouge2_recall_stderr": 0.002268323282291491, "rougeL_fmeasure": 0.30097802629669945, "rougeL_fmeasure_stderr": 0.0015401844960097636, "rougeL_precision": 0.2474806927681275, "rougeL_precision_stderr": 0.0014727021825347633, "rougeL_recall": 0.4081431372540134, "rougeL_recall_stderr": 0.0023551278568623465, "rougeLsum_fmeasure": 0.3518120413336342, "rougeLsum_fmeasure_stderr": 0.0019523611899019283, "rougeLsum_precision": 0.2908623165977362, "rougeLsum_precision_stderr": 0.0019170217364269356, "rougeLsum_recall": 0.47214399804412605, "rougeLsum_recall_stderr": 0.00260021880713798}}, "3": {"coherent_text": {"bleu": 7.24802349891997, "bleu_stderr": 0.06635722949244867, "rouge1_fmeasure": 0.4237555248539553, "rouge1_fmeasure_stderr": 0.001992167963719159, "rouge1_precision": 0.34180817756753284, "rouge1_precision_stderr": 0.0020285133445260843, "rouge1_recall": 0.5920459286923339, "rouge1_recall_stderr": 0.0026236762956107166, "rouge2_fmeasure": 0.19128265567370034, "rouge2_fmeasure_stderr": 0.001581453769153973, "rouge2_precision": 0.15289604438013946, "rouge2_precision_stderr": 0.001375777527347721, "rouge2_recall": 0.2737065876148546, "rouge2_recall_stderr": 0.0023716671424749572, "rougeL_fmeasure": 0.30417906792765287, "rougeL_fmeasure_stderr": 0.0014988806302313118, "rougeL_precision": 0.24394200099420235, "rougeL_precision_stderr": 0.0014272228885483314, "rougeL_recall": 0.43011826387213586, "rougeL_recall_stderr": 0.002376768370329661, "rougeLsum_fmeasure": 0.3518251181572721, "rougeLsum_fmeasure_stderr": 0.0019444219873398726, "rougeLsum_precision": 0.28376532763365797, "rougeLsum_precision_stderr": 0.0018837986492293225, "rougeLsum_recall": 0.49209992765341304, "rougeLsum_recall_stderr": 0.0026597190806332866}, "create_text_for_me": {"bleu": 7.079344515403131, "bleu_stderr": 0.06095387919902322, "rouge1_fmeasure": 0.4003285706279974, "rouge1_fmeasure_stderr": 0.0017150724139835165, "rouge1_precision": 0.31434504165137933, "rouge1_precision_stderr": 0.0016756850149852798, "rouge1_recall": 0.5838649601706869, "rouge1_recall_stderr": 0.0025270103149731774, "rouge2_fmeasure": 0.1814061261607152, "rouge2_fmeasure_stderr": 0.001458658344245726, "rouge2_precision": 0.1411501512049683, "rouge2_precision_stderr": 0.0012071435589295404, "rouge2_recall": 0.2710239140164862, "rouge2_recall_stderr": 0.0023428408193831283, "rougeL_fmeasure": 0.2773948472348103, "rougeL_fmeasure_stderr": 0.0013888021320340568, "rougeL_precision": 0.2167987032269629, "rougeL_precision_stderr": 0.0012303868538522155, "rougeL_recall": 0.4087432941457496, "rougeL_recall_stderr": 0.0023596629155231627, "rougeLsum_fmeasure": 0.3378608633527859, "rougeLsum_fmeasure_stderr": 0.001695654483915732, "rougeLsum_precision": 0.26527285008292856, "rougeLsum_precision_stderr": 0.00157683567546418, "rougeLsum_recall": 0.49306302016696496, "rougeLsum_recall_stderr": 0.0025313515796034575}, "generate_gramatically_correct_text": {"bleu": 3.5532345468202933, "bleu_stderr": 0.1633344563680152, "rouge1_fmeasure": 0.19980679600968698, "rouge1_fmeasure_stderr": 0.004522960796291835, "rouge1_precision": 0.24121792609367423, "rouge1_precision_stderr": 0.005557178035436053, "rouge1_recall": 0.1980561640610698, "rouge1_recall_stderr": 0.004678758581243681, "rouge2_fmeasure": 0.09632029448061452, "rouge2_fmeasure_stderr": 0.0024366646851858957, "rouge2_precision": 0.10899469492781635, "rouge2_precision_stderr": 0.0028561009527101845, "rouge2_recall": 0.09627564067307154, "rouge2_recall_stderr": 0.002532227746801326, "rougeL_fmeasure": 0.141384148982077, "rougeL_fmeasure_stderr": 0.003282216691402862, "rougeL_precision": 0.17666987032282006, "rougeL_precision_stderr": 0.00438764652241939, "rougeL_recall": 0.13985008744550848, "rougeL_recall_stderr": 0.0034011563714505364, "rougeLsum_fmeasure": 0.16308829854180357, "rougeLsum_fmeasure_stderr": 0.0037724131070999297, "rougeLsum_precision": 0.20064541312984716, "rougeLsum_precision_stderr": 0.004834286841759822, "rougeLsum_recall": 0.1615193895639558, "rougeLsum_recall_stderr": 0.00390851563052825}, "generate_text_restaurant": {"bleu": 14.26351682307982, "bleu_stderr": 0.16518466746497154, "rouge1_fmeasure": 0.4869136869814224, "rouge1_fmeasure_stderr": 0.002252377471622215, "rouge1_precision": 0.5781766953393467, "rouge1_precision_stderr": 0.003232493897043414, "rouge1_recall": 0.45846733751843627, "rouge1_recall_stderr": 0.002862839752183954, "rouge2_fmeasure": 0.24388713793667496, "rouge2_fmeasure_stderr": 0.002081168246828599, "rouge2_precision": 0.2936437583754312, "rouge2_precision_stderr": 0.002766357156065721, "rouge2_recall": 0.2296605325605603, "rouge2_recall_stderr": 0.002239048819483412, "rougeL_fmeasure": 0.35476450327695513, "rougeL_fmeasure_stderr": 0.002131136688683226, "rougeL_precision": 0.42323041568862063, "rougeL_precision_stderr": 0.00299893377825116, "rougeL_recall": 0.3336509571396467, "rougeL_recall_stderr": 0.002463831973937884, "rougeLsum_fmeasure": 0.40259898572566205, "rougeLsum_fmeasure_stderr": 0.0023428037926685654, "rougeLsum_precision": 0.47879240286661645, "rougeLsum_precision_stderr": 0.0032211942725843233, "rougeLsum_recall": 0.3788603945042402, "rougeLsum_recall_stderr": 0.00272506620717064}, "text": {"bleu": 6.4968698400953535, "bleu_stderr": 0.059669479772880306, "rouge1_fmeasure": 0.4147491979933095, "rouge1_fmeasure_stderr": 0.0019885367884029, "rouge1_precision": 0.34117325753136746, "rouge1_precision_stderr": 0.0020233228481327443, "rouge1_recall": 0.5604137274546103, "rouge1_recall_stderr": 0.0026266097618435577, "rouge2_fmeasure": 0.18544033119790024, "rouge2_fmeasure_stderr": 0.0016196353447881983, "rouge2_precision": 0.1512868929321076, "rouge2_precision_stderr": 0.0014148299890455783, "rouge2_recall": 0.25600210025217546, "rouge2_recall_stderr": 0.0023581582821936968, "rougeL_fmeasure": 0.29753225476479545, "rougeL_fmeasure_stderr": 0.0015402025665262522, "rougeL_precision": 0.24337336103065083, "rougeL_precision_stderr": 0.0014497505785590113, "rougeL_recall": 0.40654419517533735, "rougeL_recall_stderr": 0.0023761049290174237, "rougeLsum_fmeasure": 0.34597699574768703, "rougeLsum_fmeasure_stderr": 0.0019510235256301094, "rougeLsum_precision": 0.2845042264341882, "rougeLsum_precision_stderr": 0.001885327286169968, "rougeLsum_recall": 0.4679542291978592, "rougeLsum_recall_stderr": 0.002638793268605149}}, "4": {"coherent_text": {"bleu": 7.259603611889512, "bleu_stderr": 0.09137927162900815, "rouge1_fmeasure": 0.41457403908903073, "rouge1_fmeasure_stderr": 0.0019274435714256614, "rouge1_precision": 0.33191481035446385, "rouge1_precision_stderr": 0.0019317351081010318, "rouge1_recall": 0.5860564434153247, "rouge1_recall_stderr": 0.002586714587351925, "rouge2_fmeasure": 0.18797801160921448, "rouge2_fmeasure_stderr": 0.0015801875991192818, "rouge2_precision": 0.1491204275015992, "rouge2_precision_stderr": 0.0013513647776642052, "rouge2_recall": 0.27220476893326745, "rouge2_recall_stderr": 0.0023984250517094926, "rougeL_fmeasure": 0.29732435133761553, "rougeL_fmeasure_stderr": 0.001482323108912372, "rougeL_precision": 0.23674148673246934, "rougeL_precision_stderr": 0.0013752390609366295, "rougeL_recall": 0.42520126985132733, "rougeL_recall_stderr": 0.002377551920288881, "rougeLsum_fmeasure": 0.3477933389836102, "rougeLsum_fmeasure_stderr": 0.0018713097091873625, "rougeLsum_precision": 0.2783455423171209, "rougeLsum_precision_stderr": 0.0017880786684828078, "rougeLsum_recall": 0.4922469101029398, "rougeLsum_recall_stderr": 0.002602328441781233}, "create_text_for_me": {"bleu": 7.3036749892250326, "bleu_stderr": 0.07515670477596327, "rouge1_fmeasure": 0.4023616713620362, "rouge1_fmeasure_stderr": 0.0017219623211045963, "rouge1_precision": 0.31515384991950207, "rouge1_precision_stderr": 0.0016788769894757376, "rouge1_recall": 0.5890522804534821, "rouge1_recall_stderr": 0.0025220913561448603, "rouge2_fmeasure": 0.1839557774098788, "rouge2_fmeasure_stderr": 0.0014913102819955194, "rouge2_precision": 0.14276401815964648, "rouge2_precision_stderr": 0.0012323791161579336, "rouge2_recall": 0.2759462068171959, "rouge2_recall_stderr": 0.002391724415993567, "rougeL_fmeasure": 0.278135454552631, "rougeL_fmeasure_stderr": 0.0014055461648461573, "rougeL_precision": 0.21671602881600943, "rougeL_precision_stderr": 0.0012338874458127408, "rougeL_recall": 0.41184759065700366, "rougeL_recall_stderr": 0.002409432664953705, "rougeLsum_fmeasure": 0.3402770443245213, "rougeLsum_fmeasure_stderr": 0.0017117460741257117, "rougeLsum_precision": 0.26652069267704986, "rougeLsum_precision_stderr": 0.001586801161274008, "rougeLsum_recall": 0.49825625350122094, "rougeLsum_recall_stderr": 0.00253173094162978}, "generate_gramatically_correct_text": {"bleu": 5.217629954237116, "bleu_stderr": 0.23284930274397692, "rouge1_fmeasure": 0.22663036657248486, "rouge1_fmeasure_stderr": 0.0046087693412430126, "rouge1_precision": 0.26662745576267594, "rouge1_precision_stderr": 0.005575953278388338, "rouge1_recall": 0.22983869250345296, "rouge1_recall_stderr": 0.004858036351505812, "rouge2_fmeasure": 0.11048253025142175, "rouge2_fmeasure_stderr": 0.002540966652479063, "rouge2_precision": 0.12167903294101802, "rouge2_precision_stderr": 0.0029190844237015766, "rouge2_recall": 0.11277983027021372, "rouge2_recall_stderr": 0.0026762229693605183, "rougeL_fmeasure": 0.15912782439760098, "rougeL_fmeasure_stderr": 0.0033432276357100712, "rougeL_precision": 0.19277110900350442, "rougeL_precision_stderr": 0.004362200883339936, "rougeL_recall": 0.1616071987301573, "rougeL_recall_stderr": 0.0035572917110778075, "rougeLsum_fmeasure": 0.18489747608203236, "rougeLsum_fmeasure_stderr": 0.003859601516136119, "rougeLsum_precision": 0.22055060611521593, "rougeLsum_precision_stderr": 0.004823378796586691, "rougeLsum_recall": 0.187891178897973, "rougeLsum_recall_stderr": 0.004094656427625295}, "generate_text_restaurant": {"bleu": 14.568456206535862, "bleu_stderr": 0.22714934644025087, "rouge1_fmeasure": 0.49248119559722, "rouge1_fmeasure_stderr": 0.002261462995557444, "rouge1_precision": 0.5824109780846385, "rouge1_precision_stderr": 0.0032128656177573896, "rouge1_recall": 0.46303711017335764, "rouge1_recall_stderr": 0.0028550395782162385, "rouge2_fmeasure": 0.24852828649672148, "rouge2_fmeasure_stderr": 0.0020894492308823178, "rouge2_precision": 0.2975497604665679, "rouge2_precision_stderr": 0.002721358209608908, "rouge2_recall": 0.2337393664335435, "rouge2_recall_stderr": 0.0022535476965624083, "rougeL_fmeasure": 0.35908240651080353, "rougeL_fmeasure_stderr": 0.0021535460141111546, "rougeL_precision": 0.4258973373500384, "rougeL_precision_stderr": 0.0029570811260566745, "rougeL_recall": 0.33738654578439453, "rougeL_recall_stderr": 0.002480938975354226, "rougeLsum_fmeasure": 0.4073362598730636, "rougeLsum_fmeasure_stderr": 0.002350552518906019, "rougeLsum_precision": 0.48186572666282207, "rougeLsum_precision_stderr": 0.0031731113183941597, "rougeLsum_recall": 0.3829876385027774, "rougeLsum_recall_stderr": 0.0027332507970680904}, "text": {"bleu": 6.6317178163787345, "bleu_stderr": 0.0876389482196388, "rouge1_fmeasure": 0.41464793929679983, "rouge1_fmeasure_stderr": 0.001938651237761389, "rouge1_precision": 0.3400198376127372, "rouge1_precision_stderr": 0.001969604534522521, "rouge1_recall": 0.5624978911685358, "rouge1_recall_stderr": 0.0026055056008207375, "rouge2_fmeasure": 0.18624803563290596, "rouge2_fmeasure_stderr": 0.0016110635518348864, "rouge2_precision": 0.1514447932523371, "rouge2_precision_stderr": 0.0014010545789579145, "rouge2_recall": 0.2581999705469884, "rouge2_recall_stderr": 0.0023705600248679587, "rougeL_fmeasure": 0.2965625670926111, "rougeL_fmeasure_stderr": 0.0015189535046102405, "rougeL_precision": 0.2418798336527905, "rougeL_precision_stderr": 0.0014203318517984557, "rougeL_recall": 0.40664147075304874, "rougeL_recall_stderr": 0.0023808852507882106, "rougeLsum_fmeasure": 0.34627588323332553, "rougeLsum_fmeasure_stderr": 0.0019163771810995604, "rougeLsum_precision": 0.2839226907614591, "rougeLsum_precision_stderr": 0.0018528730766754502, "rougeLsum_recall": 0.47007701746039365, "rougeLsum_recall_stderr": 0.002613092569755996}}, "5": {"coherent_text": {"bleu": 7.083478384911307, "bleu_stderr": 0.10406723270307067, "rouge1_fmeasure": 0.40449455948498675, "rouge1_fmeasure_stderr": 0.0018006323163830978, "rouge1_precision": 0.3214279899425999, "rouge1_precision_stderr": 0.0017937724757931723, "rouge1_recall": 0.5794690131338354, "rouge1_recall_stderr": 0.00257830350864617, "rouge2_fmeasure": 0.18412349501175196, "rouge2_fmeasure_stderr": 0.001510781529338931, "rouge2_precision": 0.14499572043902326, "rouge2_precision_stderr": 0.0012779901757949512, "rouge2_recall": 0.27026431913729904, "rouge2_recall_stderr": 0.0023571465046724562, "rougeL_fmeasure": 0.28734129485544363, "rougeL_fmeasure_stderr": 0.001403833180540554, "rougeL_precision": 0.2271462950868112, "rougeL_precision_stderr": 0.0012925654415635542, "rougeL_recall": 0.41645498805365133, "rougeL_recall_stderr": 0.002379296311255394, "rougeLsum_fmeasure": 0.34568503871928113, "rougeLsum_fmeasure_stderr": 0.0017485361441317436, "rougeLsum_precision": 0.2746492057015669, "rougeLsum_precision_stderr": 0.0016737991466862709, "rougeLsum_recall": 0.49571786150582986, "rougeLsum_recall_stderr": 0.0025524072970345467}, "create_text_for_me": {"bleu": 7.252202833742519, "bleu_stderr": 0.09258081742959207, "rouge1_fmeasure": 0.40056764017959046, "rouge1_fmeasure_stderr": 0.001685883519704545, "rouge1_precision": 0.31332420313697207, "rouge1_precision_stderr": 0.001643155770755078, "rouge1_recall": 0.5876744067602289, "rouge1_recall_stderr": 0.002515657293017262, "rouge2_fmeasure": 0.1840109955212137, "rouge2_fmeasure_stderr": 0.001458917708562315, "rouge2_precision": 0.14257489924729352, "rouge2_precision_stderr": 0.0012041170840367955, "rouge2_recall": 0.2769111512983329, "rouge2_recall_stderr": 0.0023686465359635234, "rougeL_fmeasure": 0.27755157420468274, "rougeL_fmeasure_stderr": 0.0013836819536529941, "rougeL_precision": 0.21603612545582793, "rougeL_precision_stderr": 0.0012167765328144225, "rougeL_recall": 0.4116394939841814, "rougeL_recall_stderr": 0.002384213224350567, "rougeLsum_fmeasure": 0.33937537443965576, "rougeLsum_fmeasure_stderr": 0.0017048672562046127, "rougeLsum_precision": 0.26549263778003923, "rougeLsum_precision_stderr": 0.0015751904507686763, "rougeLsum_recall": 0.49799879824695253, "rougeLsum_recall_stderr": 0.0025632163763559136}, "generate_gramatically_correct_text": {"bleu": 6.376693171193141, "bleu_stderr": 0.2748726388858482, "rouge1_fmeasure": 0.2375382569438019, "rouge1_fmeasure_stderr": 0.0045304403027307755, "rouge1_precision": 0.26448067560699023, "rouge1_precision_stderr": 0.0053005540267231106, "rouge1_recall": 0.25142750063623637, "rouge1_recall_stderr": 0.004979472392281411, "rouge2_fmeasure": 0.11427201259332177, "rouge2_fmeasure_stderr": 0.0024437856983114473, "rouge2_precision": 0.12190247907289396, "rouge2_precision_stderr": 0.0027751227368017595, "rouge2_recall": 0.12160621956065581, "rouge2_recall_stderr": 0.0026839402002743567, "rougeL_fmeasure": 0.1682205810105168, "rougeL_fmeasure_stderr": 0.0033058315417317363, "rougeL_precision": 0.19188124181418145, "rougeL_precision_stderr": 0.004147195755453536, "rougeL_recall": 0.1779950318253713, "rougeL_recall_stderr": 0.0036462875523432373, "rougeLsum_fmeasure": 0.19617441412979775, "rougeLsum_fmeasure_stderr": 0.0038379731820601636, "rougeLsum_precision": 0.22052854784819315, "rougeLsum_precision_stderr": 0.004603014131646932, "rougeLsum_recall": 0.2079492427367012, "rougeLsum_recall_stderr": 0.004237911852406221}, "generate_text_restaurant": {"bleu": 14.41705650902542, "bleu_stderr": 0.25051935957566557, "rouge1_fmeasure": 0.4912358527300658, "rouge1_fmeasure_stderr": 0.002175083911938509, "rouge1_precision": 0.5768002584328835, "rouge1_precision_stderr": 0.003210936196026848, "rouge1_recall": 0.4663927464698937, "rouge1_recall_stderr": 0.002827189059265761, "rouge2_fmeasure": 0.24634621400768708, "rouge2_fmeasure_stderr": 0.002025833005255322, "rouge2_precision": 0.2929623409468226, "rouge2_precision_stderr": 0.0026747147184981385, "rouge2_recall": 0.2340255710785021, "rouge2_recall_stderr": 0.002212699322566578, "rougeL_fmeasure": 0.35597711356903294, "rougeL_fmeasure_stderr": 0.002065825901549598, "rougeL_precision": 0.41839619423807317, "rougeL_precision_stderr": 0.0028660522019280242, "rougeL_recall": 0.3383776885015685, "rougeL_recall_stderr": 0.0024683861897523925, "rougeLsum_fmeasure": 0.4066143622258549, "rougeLsum_fmeasure_stderr": 0.002273545252564866, "rougeLsum_precision": 0.47696170422006223, "rougeLsum_precision_stderr": 0.003098280726576402, "rougeLsum_recall": 0.3864090320075692, "rougeLsum_recall_stderr": 0.0027224907209281047}, "text": {"bleu": 6.491534980084141, "bleu_stderr": 0.10379177513611697, "rouge1_fmeasure": 0.4122007802452186, "rouge1_fmeasure_stderr": 0.0019059990682215938, "rouge1_precision": 0.33728382733850976, "rouge1_precision_stderr": 0.0019389929582716602, "rouge1_recall": 0.5613995582020262, "rouge1_recall_stderr": 0.002595310912421587, "rouge2_fmeasure": 0.18362949663211528, "rouge2_fmeasure_stderr": 0.0015655061377427496, "rouge2_precision": 0.14882470581081966, "rouge2_precision_stderr": 0.0013513137828580892, "rouge2_recall": 0.2562146590122059, "rouge2_recall_stderr": 0.002346866630933058, "rougeL_fmeasure": 0.29378677375300194, "rougeL_fmeasure_stderr": 0.001476945763349084, "rougeL_precision": 0.23902186517429475, "rougeL_precision_stderr": 0.0013782846724426727, "rougeL_recall": 0.40466863634778255, "rougeL_recall_stderr": 0.00236488455920865, "rougeLsum_fmeasure": 0.34505218091652107, "rougeLsum_fmeasure_stderr": 0.0018918651643402265, "rougeLsum_precision": 0.2822952417456622, "rougeLsum_precision_stderr": 0.0018260149187799492, "rougeLsum_recall": 0.470397132239544, "rougeLsum_recall_stderr": 0.0026203121351988648}}}, "gem_xsum": {"0": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.764704482900491, "bleu_stderr": 0.07729220338781524, "rouge1_fmeasure": 0.14273720741484003, "rouge1_fmeasure_stderr": 0.00215614367639498, "rouge1_precision": 0.10101705898942855, "rouge1_precision_stderr": 0.0016024170779631586, "rouge1_recall": 0.25441720721083605, "rouge1_recall_stderr": 0.0037811710248573083, "rouge2_fmeasure": 0.021087066079578522, "rouge2_fmeasure_stderr": 0.0009789482177719121, "rouge2_precision": 0.014796714574159396, "rouge2_precision_stderr": 0.0006925399675854941, "rouge2_recall": 0.03874020571430558, "rouge2_recall_stderr": 0.00185089897813793, "rougeL_fmeasure": 0.11527495670864521, "rougeL_fmeasure_stderr": 0.001581234925824072, "rougeL_precision": 0.08142707182033677, "rougeL_precision_stderr": 0.0011693922318445696, "rougeL_recall": 0.20644516180144928, "rougeL_recall_stderr": 0.0028652502483865186, "rougeLsum_fmeasure": 0.11505369846922088, "rougeLsum_fmeasure_stderr": 0.0017783784436516575, "rougeLsum_precision": 0.08118614796808597, "rougeLsum_precision_stderr": 0.001299485058038869, "rougeLsum_recall": 0.2067297549645945, "rougeLsum_recall_stderr": 0.0032511919359005962}, "DOC_tldr": {"bleu": 2.3356469238799042, "bleu_stderr": 0.08798372854463395, "rouge1_fmeasure": 0.22203482324927531, "rouge1_fmeasure_stderr": 0.002846869346449123, "rouge1_precision": 0.18114983702463375, "rouge1_precision_stderr": 0.0032933592986375145, "rouge1_recall": 0.3436733855851609, "rouge1_recall_stderr": 0.004317974276915552, "rouge2_fmeasure": 0.05712816335799515, "rouge2_fmeasure_stderr": 0.0018963073568218371, "rouge2_precision": 0.047446576026624596, "rouge2_precision_stderr": 0.0020034775467036724, "rouge2_recall": 0.08991408171222039, "rouge2_recall_stderr": 0.0028845441646191392, "rougeL_fmeasure": 0.17285149090538388, "rougeL_fmeasure_stderr": 0.002306787503569375, "rougeL_precision": 0.14097151038726158, "rougeL_precision_stderr": 0.0027300858693928703, "rougeL_recall": 0.2690913907957878, "rougeL_recall_stderr": 0.0035417684042767152, "rougeLsum_fmeasure": 0.17513809375886846, "rougeLsum_fmeasure_stderr": 0.0024578450883517545, "rougeLsum_precision": 0.14261278916768066, "rougeLsum_precision_stderr": 0.0027978086937486806, "rougeLsum_recall": 0.2733257622994336, "rougeLsum_recall_stderr": 0.003865650205750047}, "article_DOC_summary": {"bleu": 2.242352554457821, "bleu_stderr": 0.06303290180001715, "rouge1_fmeasure": 0.22142206258867178, "rouge1_fmeasure_stderr": 0.002657386115409225, "rouge1_precision": 0.17908873736938333, "rouge1_precision_stderr": 0.0025998582918028913, "rouge1_recall": 0.3401581574130041, "rouge1_recall_stderr": 0.00466124187806079, "rouge2_fmeasure": 0.0511787638415587, "rouge2_fmeasure_stderr": 0.0017012685833384458, "rouge2_precision": 0.04001978076164408, "rouge2_precision_stderr": 0.00143853172997629, "rouge2_recall": 0.08351989855551167, "rouge2_recall_stderr": 0.002908378017045409, "rougeL_fmeasure": 0.16457376114227693, "rougeL_fmeasure_stderr": 0.002030624196291875, "rougeL_precision": 0.13280826032727655, "rougeL_precision_stderr": 0.0019720134198053826, "rougeL_recall": 0.2547013277658435, "rougeL_recall_stderr": 0.0036980006484927316, "rougeLsum_fmeasure": 0.1721074292499855, "rougeLsum_fmeasure_stderr": 0.002183282479578269, "rougeLsum_precision": 0.13820058290248866, "rougeLsum_precision_stderr": 0.0020236107693674434, "rougeLsum_recall": 0.267820674383363, "rougeLsum_recall_stderr": 0.00406330233863675}, "summarize_DOC": {"bleu": 1.9361958605276772, "bleu_stderr": 0.1105276321242612, "rouge1_fmeasure": 0.20038284398674655, "rouge1_fmeasure_stderr": 0.0026502877333823838, "rouge1_precision": 0.14742845406065047, "rouge1_precision_stderr": 0.0022525058672854016, "rouge1_recall": 0.3423037012370881, "rouge1_recall_stderr": 0.004534073547427449, "rouge2_fmeasure": 0.04876836268401361, "rouge2_fmeasure_stderr": 0.0015994140460390957, "rouge2_precision": 0.035566259003931525, "rouge2_precision_stderr": 0.0012143665358549194, "rouge2_recall": 0.0856761804632457, "rouge2_recall_stderr": 0.0028762266574800066, "rougeL_fmeasure": 0.15236796425987564, "rougeL_fmeasure_stderr": 0.0019755777625985065, "rougeL_precision": 0.1119655682266218, "rougeL_precision_stderr": 0.0016851034973666573, "rougeL_recall": 0.26208954500477866, "rougeL_recall_stderr": 0.0035709505116761774, "rougeLsum_fmeasure": 0.15870323247009147, "rougeLsum_fmeasure_stderr": 0.0022161037551435207, "rougeLsum_precision": 0.11645877000552814, "rougeLsum_precision_stderr": 0.001831460190174745, "rougeLsum_recall": 0.2732569800295104, "rougeLsum_recall_stderr": 0.003986118877385603}, "summarize_this_DOC_summary": {"bleu": 2.4757752294894693, "bleu_stderr": 0.10759359573160089, "rouge1_fmeasure": 0.22431631903155347, "rouge1_fmeasure_stderr": 0.0026093322298271567, "rouge1_precision": 0.16879708539303456, "rouge1_precision_stderr": 0.002317898014536499, "rouge1_recall": 0.3723291290939063, "rouge1_recall_stderr": 0.004545165883948288, "rouge2_fmeasure": 0.058199865858872574, "rouge2_fmeasure_stderr": 0.0017490056690093988, "rouge2_precision": 0.042926431575461076, "rouge2_precision_stderr": 0.0013931045314890859, "rouge2_recall": 0.1010845101974381, "rouge2_recall_stderr": 0.0031251208246711737, "rougeL_fmeasure": 0.17056577152255134, "rougeL_fmeasure_stderr": 0.002052264214122958, "rougeL_precision": 0.12782464697639043, "rougeL_precision_stderr": 0.001786854774861224, "rougeL_recall": 0.28577217256570797, "rougeL_recall_stderr": 0.0037793340502424826, "rougeLsum_fmeasure": 0.17737257514409943, "rougeLsum_fmeasure_stderr": 0.0022595548538614863, "rougeLsum_precision": 0.13266264249564455, "rougeLsum_precision_stderr": 0.0018939748248399677, "rougeLsum_recall": 0.2973642224201739, "rougeLsum_recall_stderr": 0.0041420819681628265}}, "1": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.5933480280129028, "bleu_stderr": 0.05429786159159416, "rouge1_fmeasure": 0.1494286022448456, "rouge1_fmeasure_stderr": 0.0020905023117744753, "rouge1_precision": 0.10639113371900555, "rouge1_precision_stderr": 0.0015564566569882924, "rouge1_recall": 0.2612998397001183, "rouge1_recall_stderr": 0.0035416281966467018, "rouge2_fmeasure": 0.01583906203117334, "rouge2_fmeasure_stderr": 0.0008546628932044431, "rouge2_precision": 0.011201230051773512, "rouge2_precision_stderr": 0.0006022749049849383, "rouge2_recall": 0.0281936592385339, "rouge2_recall_stderr": 0.001591047010891972, "rougeL_fmeasure": 0.10535331505143768, "rougeL_fmeasure_stderr": 0.0013577546916502855, "rougeL_precision": 0.07481912872892685, "rougeL_precision_stderr": 0.0010039125628398355, "rougeL_recall": 0.18571610402684233, "rougeL_recall_stderr": 0.002406722992733317, "rougeLsum_fmeasure": 0.12024087462673269, "rougeLsum_fmeasure_stderr": 0.001679730935315816, "rougeLsum_precision": 0.08543958378845312, "rougeLsum_precision_stderr": 0.0012424221154866547, "rougeLsum_recall": 0.21159077593083697, "rougeLsum_recall_stderr": 0.002933760670380391}, "DOC_tldr": {"bleu": 2.1464792246976607, "bleu_stderr": 0.07488613082754325, "rouge1_fmeasure": 0.21086782423473222, "rouge1_fmeasure_stderr": 0.0025534571718774162, "rouge1_precision": 0.15000019598004125, "rouge1_precision_stderr": 0.0019021792570688687, "rouge1_recall": 0.36983583808020726, "rouge1_recall_stderr": 0.004482161415274735, "rouge2_fmeasure": 0.052391077282527426, "rouge2_fmeasure_stderr": 0.0016560324357599706, "rouge2_precision": 0.0368473261642897, "rouge2_precision_stderr": 0.001172295467736883, "rouge2_recall": 0.09474599631418981, "rouge2_recall_stderr": 0.0030950408198772202, "rougeL_fmeasure": 0.1607709135525665, "rougeL_fmeasure_stderr": 0.0019498240736311654, "rougeL_precision": 0.11414094299528126, "rougeL_precision_stderr": 0.0014353953204332421, "rougeL_recall": 0.2837425807321581, "rougeL_recall_stderr": 0.0035949604065181845, "rougeLsum_fmeasure": 0.16811226782770253, "rougeLsum_fmeasure_stderr": 0.002178990811580748, "rougeLsum_precision": 0.11932299339443851, "rougeLsum_precision_stderr": 0.0015966278483692021, "rougeLsum_recall": 0.2967588714120258, "rougeLsum_recall_stderr": 0.00398015313663571}, "article_DOC_summary": {"bleu": 1.8494036251126653, "bleu_stderr": 0.06618274112679547, "rouge1_fmeasure": 0.19996151191164502, "rouge1_fmeasure_stderr": 0.0024642136191649943, "rouge1_precision": 0.14242240386934715, "rouge1_precision_stderr": 0.0018474131654211898, "rouge1_recall": 0.34972417416826374, "rouge1_recall_stderr": 0.00425304183688617, "rouge2_fmeasure": 0.04515071736102295, "rouge2_fmeasure_stderr": 0.0014958437813087835, "rouge2_precision": 0.03180784324852304, "rouge2_precision_stderr": 0.0010605401760687887, "rouge2_recall": 0.08122973807701163, "rouge2_recall_stderr": 0.0027530425274732025, "rougeL_fmeasure": 0.15185456372717573, "rougeL_fmeasure_stderr": 0.0018177177985044526, "rougeL_precision": 0.10785508103994479, "rougeL_precision_stderr": 0.001342675196432051, "rougeL_recall": 0.26778299938242467, "rougeL_recall_stderr": 0.0033228224655886393, "rougeLsum_fmeasure": 0.16002746501050483, "rougeLsum_fmeasure_stderr": 0.002040753526176795, "rougeLsum_precision": 0.11366701902108305, "rougeLsum_precision_stderr": 0.0014987342312601044, "rougeLsum_recall": 0.28182954712893066, "rougeLsum_recall_stderr": 0.0036825745017303583}, "summarize_DOC": {"bleu": 1.8413841516657772, "bleu_stderr": 0.10867724653105522, "rouge1_fmeasure": 0.20137857386382021, "rouge1_fmeasure_stderr": 0.0024449071554284354, "rouge1_precision": 0.14311406793133874, "rouge1_precision_stderr": 0.0018301237072922785, "rouge1_recall": 0.3539643364314241, "rouge1_recall_stderr": 0.0041947517615325515, "rouge2_fmeasure": 0.044490758416455556, "rouge2_fmeasure_stderr": 0.0015403655337067337, "rouge2_precision": 0.03128359088712783, "rouge2_precision_stderr": 0.0010905195100737691, "rouge2_recall": 0.08031060486303311, "rouge2_recall_stderr": 0.002840064526422232, "rougeL_fmeasure": 0.1495663368544618, "rougeL_fmeasure_stderr": 0.0018563556646033327, "rougeL_precision": 0.10608525021287649, "rougeL_precision_stderr": 0.0013726454652157288, "rougeL_recall": 0.26468135425429246, "rougeL_recall_stderr": 0.0033595462893916675, "rougeLsum_fmeasure": 0.16108455797657517, "rougeLsum_fmeasure_stderr": 0.0020640892705846995, "rougeLsum_precision": 0.11421808592242337, "rougeLsum_precision_stderr": 0.001519079728610017, "rougeLsum_recall": 0.28497861112523026, "rougeLsum_recall_stderr": 0.0037025598565927275}, "summarize_this_DOC_summary": {"bleu": 1.7005679889378829, "bleu_stderr": 0.09964986448064858, "rouge1_fmeasure": 0.19107539113194497, "rouge1_fmeasure_stderr": 0.002518376366873991, "rouge1_precision": 0.1361829095273917, "rouge1_precision_stderr": 0.0018976552774942676, "rouge1_recall": 0.33393036619688554, "rouge1_recall_stderr": 0.004242354030815657, "rouge2_fmeasure": 0.04062921012242493, "rouge2_fmeasure_stderr": 0.0015228641462241134, "rouge2_precision": 0.028698540492772314, "rouge2_precision_stderr": 0.001082902961563565, "rouge2_recall": 0.07241022779028632, "rouge2_recall_stderr": 0.00276886558899143, "rougeL_fmeasure": 0.14491822198289625, "rougeL_fmeasure_stderr": 0.0018471762207363737, "rougeL_precision": 0.10299681457974053, "rougeL_precision_stderr": 0.001372759455933598, "rougeL_recall": 0.25531583728222634, "rougeL_recall_stderr": 0.0032867969214619296, "rougeLsum_fmeasure": 0.1529660415125503, "rougeLsum_fmeasure_stderr": 0.002085977228826524, "rougeLsum_precision": 0.10874915835982801, "rougeLsum_precision_stderr": 0.0015461930795802182, "rougeLsum_recall": 0.2691233096679109, "rougeLsum_recall_stderr": 0.0036642185726151262}}, "2": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.956802800637642, "bleu_stderr": 0.053844707729285034, "rouge1_fmeasure": 0.16179129667309153, "rouge1_fmeasure_stderr": 0.002404708780391458, "rouge1_precision": 0.11490099337353599, "rouge1_precision_stderr": 0.0017855066978262773, "rouge1_recall": 0.28416098353802643, "rouge1_recall_stderr": 0.00403393246491068, "rouge2_fmeasure": 0.025696677971347164, "rouge2_fmeasure_stderr": 0.0011719065575280197, "rouge2_precision": 0.01814304612682325, "rouge2_precision_stderr": 0.0008381266313809256, "rouge2_recall": 0.045832024333924644, "rouge2_recall_stderr": 0.0020855696544412475, "rougeL_fmeasure": 0.11916137004159028, "rougeL_fmeasure_stderr": 0.0016721548286313965, "rougeL_precision": 0.08448464202437457, "rougeL_precision_stderr": 0.0012405791092172244, "rougeL_recall": 0.21062724107258454, "rougeL_recall_stderr": 0.002888253274044044, "rougeLsum_fmeasure": 0.12976468768250315, "rougeLsum_fmeasure_stderr": 0.0019232886149564547, "rougeLsum_precision": 0.09202870499262696, "rougeLsum_precision_stderr": 0.0014247653846688566, "rougeLsum_recall": 0.22909088132492106, "rougeLsum_recall_stderr": 0.003296324605941834}, "DOC_tldr": {"bleu": 2.3300715197661055, "bleu_stderr": 0.076927398815874, "rouge1_fmeasure": 0.21985072281272192, "rouge1_fmeasure_stderr": 0.0024623644197026796, "rouge1_precision": 0.15639600620674704, "rouge1_precision_stderr": 0.0018335197504392494, "rouge1_recall": 0.384927926107643, "rouge1_recall_stderr": 0.004418750266865289, "rouge2_fmeasure": 0.05747020905233346, "rouge2_fmeasure_stderr": 0.0016740650674689041, "rouge2_precision": 0.040354659501515545, "rouge2_precision_stderr": 0.0011772215563549183, "rouge2_recall": 0.10426062090763166, "rouge2_recall_stderr": 0.0031736366813986676, "rougeL_fmeasure": 0.16596009416926394, "rougeL_fmeasure_stderr": 0.001923269555875606, "rougeL_precision": 0.11784071641372797, "rougeL_precision_stderr": 0.0014063568059494577, "rougeL_recall": 0.29208769322286265, "rougeL_recall_stderr": 0.0036018446554112815, "rougeLsum_fmeasure": 0.1734564395771864, "rougeLsum_fmeasure_stderr": 0.0021637914826953624, "rougeLsum_precision": 0.12311951167098538, "rougeLsum_precision_stderr": 0.0015736137729286763, "rougeLsum_recall": 0.3054920255583516, "rougeLsum_recall_stderr": 0.004042312320168184}, "article_DOC_summary": {"bleu": 1.9553979785993556, "bleu_stderr": 0.08114119801334493, "rouge1_fmeasure": 0.20066139374359412, "rouge1_fmeasure_stderr": 0.002583892537384957, "rouge1_precision": 0.14299555153351842, "rouge1_precision_stderr": 0.0019414837646538163, "rouge1_recall": 0.35054680023331225, "rouge1_recall_stderr": 0.004405959066068025, "rouge2_fmeasure": 0.047730927310845786, "rouge2_fmeasure_stderr": 0.0016012182287873963, "rouge2_precision": 0.03372540037348309, "rouge2_precision_stderr": 0.0011433968826317544, "rouge2_recall": 0.08536452458478411, "rouge2_recall_stderr": 0.002896468085770253, "rougeL_fmeasure": 0.15423466735433622, "rougeL_fmeasure_stderr": 0.0019141237080501433, "rougeL_precision": 0.1096711314096099, "rougeL_precision_stderr": 0.0014240979490601532, "rougeL_recall": 0.2713048964277041, "rougeL_recall_stderr": 0.003435168695225616, "rougeLsum_fmeasure": 0.16073844532804232, "rougeLsum_fmeasure_stderr": 0.0021770573542344426, "rougeLsum_precision": 0.11432354351222646, "rougeLsum_precision_stderr": 0.0016135722781519138, "rougeLsum_recall": 0.282399261929163, "rougeLsum_recall_stderr": 0.0038458320187164767}, "summarize_DOC": {"bleu": 1.7406142615504723, "bleu_stderr": 0.07696728998860818, "rouge1_fmeasure": 0.19920399454150123, "rouge1_fmeasure_stderr": 0.0025260772599993864, "rouge1_precision": 0.14180365518935828, "rouge1_precision_stderr": 0.0019007951978877876, "rouge1_recall": 0.3486026747587321, "rouge1_recall_stderr": 0.004261696866882266, "rouge2_fmeasure": 0.044164071082009024, "rouge2_fmeasure_stderr": 0.0014977618687604835, "rouge2_precision": 0.031146297678497057, "rouge2_precision_stderr": 0.0010625332283172775, "rouge2_recall": 0.07920383582526831, "rouge2_recall_stderr": 0.00274094299078919, "rougeL_fmeasure": 0.14926255759990167, "rougeL_fmeasure_stderr": 0.001862261876173521, "rougeL_precision": 0.10604824573625245, "rougeL_precision_stderr": 0.001386014229937149, "rougeL_recall": 0.26286006784713417, "rougeL_recall_stderr": 0.0033079692185273243, "rougeLsum_fmeasure": 0.15986703304551134, "rougeLsum_fmeasure_stderr": 0.0020835266037317474, "rougeLsum_precision": 0.11354760855679208, "rougeLsum_precision_stderr": 0.0015422259389720333, "rougeLsum_recall": 0.2814350135656724, "rougeLsum_recall_stderr": 0.003681070167391697}, "summarize_this_DOC_summary": {"bleu": 1.6591704296842291, "bleu_stderr": 0.058952940117381177, "rouge1_fmeasure": 0.18596744227922282, "rouge1_fmeasure_stderr": 0.002501821848962091, "rouge1_precision": 0.13249564318284043, "rouge1_precision_stderr": 0.0018709338649646882, "rouge1_recall": 0.3246622264280915, "rouge1_recall_stderr": 0.004249757242473774, "rouge2_fmeasure": 0.04062641993018198, "rouge2_fmeasure_stderr": 0.001489738006922335, "rouge2_precision": 0.028665504454117075, "rouge2_precision_stderr": 0.0010557844669082574, "rouge2_recall": 0.07274913333784847, "rouge2_recall_stderr": 0.0027319168569439082, "rougeL_fmeasure": 0.14454037675210504, "rougeL_fmeasure_stderr": 0.0018835020887859987, "rougeL_precision": 0.10277571882616827, "rougeL_precision_stderr": 0.0014001676277520247, "rougeL_recall": 0.25399401920963616, "rougeL_recall_stderr": 0.0033229190889309318, "rougeLsum_fmeasure": 0.1498182518022352, "rougeLsum_fmeasure_stderr": 0.002092342016369401, "rougeLsum_precision": 0.10648169866167753, "rougeLsum_precision_stderr": 0.0015420217510554847, "rougeLsum_recall": 0.26335591514014295, "rougeLsum_recall_stderr": 0.0036975443420346143}}, "3": {"DOC_boils_down_to_simple_idea_that": {"bleu": 1.2524115713328146, "bleu_stderr": 0.07566638507964296, "rouge1_fmeasure": 0.16516309621402628, "rouge1_fmeasure_stderr": 0.0026871657655052933, "rouge1_precision": 0.11944990921004701, "rouge1_precision_stderr": 0.002075349245199532, "rouge1_recall": 0.28605812303299377, "rouge1_recall_stderr": 0.004658807697952179, "rouge2_fmeasure": 0.03211728661220163, "rouge2_fmeasure_stderr": 0.0013308066074301423, "rouge2_precision": 0.02287253200603821, "rouge2_precision_stderr": 0.0009561691761087596, "rouge2_recall": 0.057624882236011116, "rouge2_recall_stderr": 0.0024533377849923813, "rougeL_fmeasure": 0.12491419978896767, "rougeL_fmeasure_stderr": 0.001992774375798432, "rougeL_precision": 0.09017935363635442, "rougeL_precision_stderr": 0.001527608100576742, "rougeL_recall": 0.2174725953720994, "rougeL_recall_stderr": 0.00355507141242734, "rougeLsum_fmeasure": 0.1325926800413563, "rougeLsum_fmeasure_stderr": 0.002179322108142391, "rougeLsum_precision": 0.09579245704657499, "rougeLsum_precision_stderr": 0.0016893680911935236, "rougeLsum_recall": 0.23081299236685363, "rougeLsum_recall_stderr": 0.003847882937168944}, "DOC_tldr": {"bleu": 2.219342171346122, "bleu_stderr": 0.10669483486054532, "rouge1_fmeasure": 0.21450007180041575, "rouge1_fmeasure_stderr": 0.0027403785317734034, "rouge1_precision": 0.1560692539475909, "rouge1_precision_stderr": 0.0021493143139258548, "rouge1_recall": 0.36907923753393884, "rouge1_recall_stderr": 0.004843054637600467, "rouge2_fmeasure": 0.05413486443607653, "rouge2_fmeasure_stderr": 0.0017007510337812338, "rouge2_precision": 0.03862402056888545, "rouge2_precision_stderr": 0.0012274248154787231, "rouge2_recall": 0.09630094014738012, "rouge2_recall_stderr": 0.00313494655980441, "rougeL_fmeasure": 0.16116371553163497, "rougeL_fmeasure_stderr": 0.0021510860622327255, "rougeL_precision": 0.11715734774896432, "rougeL_precision_stderr": 0.0016770553946149032, "rougeL_recall": 0.2788115939785124, "rougeL_recall_stderr": 0.003945546036537552, "rougeLsum_fmeasure": 0.17063343819296709, "rougeLsum_fmeasure_stderr": 0.002369199596245067, "rougeLsum_precision": 0.12407625422586686, "rougeLsum_precision_stderr": 0.0018426611135377202, "rougeLsum_recall": 0.2950692838761085, "rougeLsum_recall_stderr": 0.004308493244658286}, "article_DOC_summary": {"bleu": 1.9899134724216982, "bleu_stderr": 0.08189694158167818, "rouge1_fmeasure": 0.19424816162568462, "rouge1_fmeasure_stderr": 0.002758326610864678, "rouge1_precision": 0.14095312565957024, "rouge1_precision_stderr": 0.0021252189798371952, "rouge1_recall": 0.3355112558235932, "rouge1_recall_stderr": 0.00482038827448099, "rouge2_fmeasure": 0.04656621187743751, "rouge2_fmeasure_stderr": 0.001620760377451161, "rouge2_precision": 0.03309503575773881, "rouge2_precision_stderr": 0.0011583105646001772, "rouge2_recall": 0.08361606756802129, "rouge2_recall_stderr": 0.0030274265177690917, "rougeL_fmeasure": 0.14844786782549954, "rougeL_fmeasure_stderr": 0.002106590246901777, "rougeL_precision": 0.10750740692234483, "rougeL_precision_stderr": 0.001617470504570499, "rougeL_recall": 0.25822344513710505, "rougeL_recall_stderr": 0.0038342113677037708, "rougeLsum_fmeasure": 0.15564298400864224, "rougeLsum_fmeasure_stderr": 0.002325750760252049, "rougeLsum_precision": 0.11272654966890937, "rougeLsum_precision_stderr": 0.0017739663038892394, "rougeLsum_recall": 0.2704314687293562, "rougeLsum_recall_stderr": 0.0041899251242752955}, "summarize_DOC": {"bleu": 1.9355476748458424, "bleu_stderr": 0.08372337758759396, "rouge1_fmeasure": 0.1984246140855233, "rouge1_fmeasure_stderr": 0.0027300551089189254, "rouge1_precision": 0.14423414216147484, "rouge1_precision_stderr": 0.0021959372951004916, "rouge1_recall": 0.34261604167692117, "rouge1_recall_stderr": 0.004689675276520268, "rouge2_fmeasure": 0.045340440062370646, "rouge2_fmeasure_stderr": 0.001542686638928427, "rouge2_precision": 0.03256635443812794, "rouge2_precision_stderr": 0.0011365296591071454, "rouge2_recall": 0.0804693811178453, "rouge2_recall_stderr": 0.0027940237622737594, "rougeL_fmeasure": 0.14687329421715054, "rougeL_fmeasure_stderr": 0.0020304285556757055, "rougeL_precision": 0.10661260527903711, "rougeL_precision_stderr": 0.0016541928452697811, "rougeL_recall": 0.25522999799700063, "rougeL_recall_stderr": 0.00362478715746941, "rougeLsum_fmeasure": 0.15866027545322184, "rougeLsum_fmeasure_stderr": 0.002268964508976531, "rougeLsum_precision": 0.1152311222340281, "rougeLsum_precision_stderr": 0.0018256115213476074, "rougeLsum_recall": 0.27538991990706024, "rougeLsum_recall_stderr": 0.004032468272508086}, "summarize_this_DOC_summary": {"bleu": 1.7550272676426535, "bleu_stderr": 0.10693852053384469, "rouge1_fmeasure": 0.18225506964350696, "rouge1_fmeasure_stderr": 0.0028014365519188978, "rouge1_precision": 0.1321970313885188, "rouge1_precision_stderr": 0.0021585317256625847, "rouge1_recall": 0.31326781741517834, "rouge1_recall_stderr": 0.004809417225791706, "rouge2_fmeasure": 0.040076565671093634, "rouge2_fmeasure_stderr": 0.0015348459428170054, "rouge2_precision": 0.028598963986615815, "rouge2_precision_stderr": 0.0010992959325129637, "rouge2_recall": 0.07104911617756425, "rouge2_recall_stderr": 0.002803269832027392, "rougeL_fmeasure": 0.13893464976445405, "rougeL_fmeasure_stderr": 0.002136322118023521, "rougeL_precision": 0.10064574048197487, "rougeL_precision_stderr": 0.0016306171384224302, "rougeL_recall": 0.23989193498285913, "rougeL_recall_stderr": 0.0037628092205002047, "rougeLsum_fmeasure": 0.14439166349049962, "rougeLsum_fmeasure_stderr": 0.002291465287441587, "rougeLsum_precision": 0.1044393755169441, "rougeLsum_precision_stderr": 0.0017305664557089341, "rougeLsum_recall": 0.24983012145534195, "rougeLsum_recall_stderr": 0.004050169780460673}}, "4": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.6706678456407895, "bleu_stderr": 0.14282899844134608, "rouge1_fmeasure": 0.044505675987276345, "rouge1_fmeasure_stderr": 0.002633675660123755, "rouge1_precision": 0.036672752688489164, "rouge1_precision_stderr": 0.002280550360708834, "rouge1_recall": 0.07104357839010789, "rouge1_recall_stderr": 0.004348885899507683, "rouge2_fmeasure": 0.009527033234766386, "rouge2_fmeasure_stderr": 0.0008493752428441728, "rouge2_precision": 0.00723200667268081, "rouge2_precision_stderr": 0.0006697782682481475, "rouge2_recall": 0.01625946634947881, "rouge2_recall_stderr": 0.0014906194095147963, "rougeL_fmeasure": 0.03404085416130508, "rougeL_fmeasure_stderr": 0.002000857011564909, "rougeL_precision": 0.028537184733923405, "rougeL_precision_stderr": 0.0018341458712957833, "rougeL_recall": 0.05427164298658868, "rougeL_recall_stderr": 0.0032950923739180006, "rougeLsum_fmeasure": 0.03592767335941958, "rougeLsum_fmeasure_stderr": 0.002132797533713022, "rougeLsum_precision": 0.0300121458572667, "rougeLsum_precision_stderr": 0.0019272475921858912, "rougeLsum_recall": 0.057470387886084315, "rougeLsum_recall_stderr": 0.0035480273356727466}, "DOC_tldr": {"bleu": 1.0148782549474955, "bleu_stderr": 0.145410394761497, "rouge1_fmeasure": 0.05633682692560753, "rouge1_fmeasure_stderr": 0.0030812766489717136, "rouge1_precision": 0.0465851524353561, "rouge1_precision_stderr": 0.0027356501105498567, "rouge1_recall": 0.08936680319887794, "rouge1_recall_stderr": 0.005013916901074763, "rouge2_fmeasure": 0.013394347210809258, "rouge2_fmeasure_stderr": 0.0011084639218436085, "rouge2_precision": 0.010250954815284269, "rouge2_precision_stderr": 0.0008867140781829498, "rouge2_recall": 0.02226634854406678, "rouge2_recall_stderr": 0.001849713945167963, "rougeL_fmeasure": 0.04233274207639188, "rougeL_fmeasure_stderr": 0.002338602629818879, "rougeL_precision": 0.03558590757936969, "rougeL_precision_stderr": 0.0022148829116658956, "rougeL_recall": 0.0672125091911251, "rougeL_recall_stderr": 0.0038250623727023383, "rougeLsum_fmeasure": 0.04511518475148196, "rougeLsum_fmeasure_stderr": 0.002494953102030446, "rougeLsum_precision": 0.03778568077281841, "rougeLsum_precision_stderr": 0.0023309139307629827, "rougeLsum_recall": 0.07189209907306558, "rougeLsum_recall_stderr": 0.0041053780258082095}, "article_DOC_summary": {"bleu": 0.851414956048288, "bleu_stderr": 0.10142038717157466, "rouge1_fmeasure": 0.05061277872693584, "rouge1_fmeasure_stderr": 0.0028337392524855837, "rouge1_precision": 0.04182867272383347, "rouge1_precision_stderr": 0.002516912133041032, "rouge1_recall": 0.08068317129523372, "rouge1_recall_stderr": 0.004630110172525636, "rouge2_fmeasure": 0.010817994039374855, "rouge2_fmeasure_stderr": 0.0009898135477985153, "rouge2_precision": 0.00821242826897234, "rouge2_precision_stderr": 0.000780085704496626, "rouge2_recall": 0.018229654639556718, "rouge2_recall_stderr": 0.0016649615737807282, "rougeL_fmeasure": 0.038452182498697945, "rougeL_fmeasure_stderr": 0.002124389719631779, "rougeL_precision": 0.03237406484505732, "rougeL_precision_stderr": 0.0020316037823294927, "rougeL_recall": 0.06169464366711305, "rougeL_recall_stderr": 0.0035477374875163793, "rougeLsum_fmeasure": 0.040956310132902136, "rougeLsum_fmeasure_stderr": 0.0023102690323821627, "rougeLsum_precision": 0.03434532775935572, "rougeLsum_precision_stderr": 0.0021605467450954603, "rougeLsum_recall": 0.06559190633881043, "rougeLsum_recall_stderr": 0.0038270052788935026}, "summarize_DOC": {"bleu": 0.9480714926440517, "bleu_stderr": 0.11225116755335138, "rouge1_fmeasure": 0.053882370203528755, "rouge1_fmeasure_stderr": 0.0029733530298338136, "rouge1_precision": 0.044405906375544715, "rouge1_precision_stderr": 0.0025610959506775525, "rouge1_recall": 0.08540051490880955, "rouge1_recall_stderr": 0.004853700270964368, "rouge2_fmeasure": 0.012207544410113281, "rouge2_fmeasure_stderr": 0.00104718267565319, "rouge2_precision": 0.009550693224487655, "rouge2_precision_stderr": 0.0008726966214714558, "rouge2_recall": 0.02007700952508465, "rouge2_recall_stderr": 0.0017411005950916892, "rougeL_fmeasure": 0.040443470438169406, "rougeL_fmeasure_stderr": 0.0022120809378912516, "rougeL_precision": 0.03383880268801385, "rougeL_precision_stderr": 0.0020073366629025043, "rougeL_recall": 0.06436025797802344, "rougeL_recall_stderr": 0.0036713369177924643, "rougeLsum_fmeasure": 0.04367643240984686, "rougeLsum_fmeasure_stderr": 0.0024209022639727494, "rougeLsum_precision": 0.036232835041777094, "rougeLsum_precision_stderr": 0.002133037107479806, "rougeLsum_recall": 0.06982777383177885, "rougeLsum_recall_stderr": 0.004049740359330814}, "summarize_this_DOC_summary": {"bleu": 0.761689921419185, "bleu_stderr": 0.1328182623834428, "rouge1_fmeasure": 0.043595060973618746, "rouge1_fmeasure_stderr": 0.0026168296236483567, "rouge1_precision": 0.036052583256726584, "rouge1_precision_stderr": 0.00222465814162892, "rouge1_recall": 0.06806165902172936, "rouge1_recall_stderr": 0.004214887088569051, "rouge2_fmeasure": 0.00916357714654539, "rouge2_fmeasure_stderr": 0.0009047152239114784, "rouge2_precision": 0.006971532253971467, "rouge2_precision_stderr": 0.0007060259100817015, "rouge2_recall": 0.0151236735555749, "rouge2_recall_stderr": 0.0015108099841256314, "rougeL_fmeasure": 0.03393069801696403, "rougeL_fmeasure_stderr": 0.002029922326331422, "rougeL_precision": 0.02826611508296682, "rougeL_precision_stderr": 0.0017443681779659868, "rougeL_recall": 0.05303756340984109, "rougeL_recall_stderr": 0.0032948227775611407, "rougeLsum_fmeasure": 0.03536876258776992, "rougeLsum_fmeasure_stderr": 0.002130719555450794, "rougeLsum_precision": 0.02934032328626938, "rougeLsum_precision_stderr": 0.0018132080168363025, "rougeLsum_recall": 0.05552522032023831, "rougeLsum_recall_stderr": 0.0034822420283327185}}, "5": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.00031526656966758077, "rouge1_fmeasure_stderr": 0.00015897565882719525, "rouge1_precision": 0.003430531732418525, "rouge1_precision_stderr": 0.0017130559457731738, "rouge1_recall": 0.00016538869567667977, "rouge1_recall_stderr": 8.34686515261994e-05, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.00031526656966758077, "rougeL_fmeasure_stderr": 0.00015897565882719525, "rougeL_precision": 0.003430531732418525, "rougeL_precision_stderr": 0.0017130559457731738, "rougeL_recall": 0.00016538869567667977, "rougeL_recall_stderr": 8.34686515261994e-05, "rougeLsum_fmeasure": 0.00031526656966758077, "rougeLsum_fmeasure_stderr": 0.00015897565882719525, "rougeLsum_precision": 0.003430531732418525, "rougeLsum_precision_stderr": 0.0017130559457731738, "rougeLsum_recall": 0.00016538869567667977, "rougeLsum_recall_stderr": 8.34686515261994e-05}, "DOC_tldr": {"bleu": 2.122977737748436e-42, "bleu_stderr": 9.17065601470076e-37, "rouge1_fmeasure": 0.0027092066961271533, "rouge1_fmeasure_stderr": 0.0007219195635749328, "rouge1_precision": 0.0031814398143476946, "rouge1_precision_stderr": 0.0008461438789843281, "rouge1_recall": 0.0025059491300529427, "rouge1_recall_stderr": 0.0006912202190712276, "rouge2_fmeasure": 0.0002940707111925786, "rouge2_fmeasure_stderr": 0.00012089379086475229, "rouge2_precision": 0.0003440741648288817, "rouge2_precision_stderr": 0.00014037543336007386, "rouge2_recall": 0.0002614759454382096, "rouge2_recall_stderr": 0.00010902516863349196, "rougeL_fmeasure": 0.0019169580279472128, "rougeL_fmeasure_stderr": 0.0005161311601160227, "rougeL_precision": 0.0022868109507954127, "rougeL_precision_stderr": 0.0006237767036215238, "rougeL_recall": 0.0017473662230373889, "rougeL_recall_stderr": 0.0004792256951751278, "rougeLsum_fmeasure": 0.002035329984614452, "rougeLsum_fmeasure_stderr": 0.0005499679200350618, "rougeLsum_precision": 0.002390441596878889, "rougeLsum_precision_stderr": 0.0006439443967438869, "rougeLsum_recall": 0.0019002132238758686, "rougeLsum_recall_stderr": 0.0005456581356378058}, "article_DOC_summary": {"bleu": 2.3013943780107486e-40, "bleu_stderr": 6.513754776072693e-35, "rouge1_fmeasure": 0.0019051009413407058, "rouge1_fmeasure_stderr": 0.0005593343846105789, "rouge1_precision": 0.0021529907822185413, "rouge1_precision_stderr": 0.0006324831746531657, "rouge1_recall": 0.001764574432238271, "rouge1_recall_stderr": 0.0005172389485732619, "rouge2_fmeasure": 0.0001299594149643802, "rouge2_fmeasure_stderr": 7.503370260000825e-05, "rouge2_precision": 0.0001457415441877151, "rouge2_precision_stderr": 8.410282821934284e-05, "rouge2_recall": 0.00011802662746058974, "rouge2_recall_stderr": 6.837858900511585e-05, "rougeL_fmeasure": 0.0015504765957310557, "rougeL_fmeasure_stderr": 0.0004402385576724586, "rougeL_precision": 0.0017522974182971845, "rougeL_precision_stderr": 0.000503946684193751, "rougeL_recall": 0.0014399120468157657, "rougeL_recall_stderr": 0.0004056116387189239, "rougeLsum_fmeasure": 0.0015923123485654278, "rougeLsum_fmeasure_stderr": 0.00044995087651658854, "rougeLsum_precision": 0.0018058994766162238, "rougeLsum_precision_stderr": 0.0005178578773672206, "rougeLsum_recall": 0.001474217364139951, "rougeLsum_recall_stderr": 0.0004127036649374616}, "summarize_DOC": {"bleu": 6.583932428818449e-39, "bleu_stderr": 3.862037022971363e-33, "rouge1_fmeasure": 0.002683283542434088, "rouge1_fmeasure_stderr": 0.0007198699976945498, "rouge1_precision": 0.0029613486934300065, "rouge1_precision_stderr": 0.0008017954910988925, "rouge1_recall": 0.002604820968979451, "rouge1_recall_stderr": 0.0007205582057724681, "rouge2_fmeasure": 0.0005146148795680421, "rouge2_fmeasure_stderr": 0.0002458736058430443, "rouge2_precision": 0.0006160191869252509, "rouge2_precision_stderr": 0.00030225046555707244, "rouge2_recall": 0.0004629935054463356, "rouge2_recall_stderr": 0.0002245599913382063, "rougeL_fmeasure": 0.002013235956184291, "rougeL_fmeasure_stderr": 0.000545919825941679, "rougeL_precision": 0.002211450919027296, "rougeL_precision_stderr": 0.0006050217509925478, "rougeL_recall": 0.001984170387640971, "rougeL_recall_stderr": 0.0005699295324941954, "rougeLsum_fmeasure": 0.002196929788975592, "rougeLsum_fmeasure_stderr": 0.0006019250148189245, "rougeLsum_precision": 0.002437420380568345, "rougeLsum_precision_stderr": 0.0006809965618635652, "rougeLsum_recall": 0.002143562481916302, "rougeLsum_recall_stderr": 0.0006129726717806505}, "summarize_this_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0007072965816124758, "rouge1_fmeasure_stderr": 0.0002672293328922938, "rouge1_precision": 0.003430531732418525, "rouge1_precision_stderr": 0.0013223146686700555, "rouge1_recall": 0.0003997094985643063, "rouge1_recall_stderr": 0.00015232935691739325, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0006215332883020128, "rougeL_fmeasure_stderr": 0.00022231592837332874, "rougeL_precision": 0.003144654088050314, "rougeL_precision_stderr": 0.0012266788586519496, "rougeL_recall": 0.00034926050249932806, "rougeL_recall_stderr": 0.00012487917660613927, "rougeLsum_fmeasure": 0.0006215332883020128, "rougeLsum_fmeasure_stderr": 0.00022231592837332874, "rougeLsum_precision": 0.003144654088050314, "rougeLsum_precision_stderr": 0.0012266788586519496, "rougeLsum_recall": 0.00034926050249932806, "rougeLsum_recall_stderr": 0.00012487917660613927}}}, "piqa": {"0": {"Correct the solution": {"bleu": 9.584748891646283, "bleu_stderr": 0.4634311680433123, "rouge1_fmeasure": 0.31552138662440865, "rouge1_fmeasure_stderr": 0.006271351560477954, "rouge1_precision": 0.250581062033785, "rouge1_precision_stderr": 0.0058239075018178725, "rouge1_recall": 0.7062158738344604, "rouge1_recall_stderr": 0.006429129435489434, "rouge2_fmeasure": 0.24351260224849494, "rouge2_fmeasure_stderr": 0.0060038232867892245, "rouge2_precision": 0.19130230064405784, "rouge2_precision_stderr": 0.005388995993524528, "rouge2_recall": 0.5459773371219434, "rouge2_recall_stderr": 0.0080205859504265, "rougeL_fmeasure": 0.30520915879086846, "rougeL_fmeasure_stderr": 0.006214719219843871, "rougeL_precision": 0.24167318855021985, "rougeL_precision_stderr": 0.005722704187225168, "rougeL_recall": 0.6860409065042077, "rougeL_recall_stderr": 0.006645065801360722, "rougeLsum_fmeasure": 0.3082991401991823, "rougeLsum_fmeasure_stderr": 0.006258332068067462, "rougeLsum_precision": 0.24469422917506942, "rougeLsum_precision_stderr": 0.0057794266304149345, "rougeLsum_recall": 0.6893060160244424, "rougeLsum_recall_stderr": 0.00664328067734422}, "choose the most appropriate solution": {"acc": 0.49510337323177367, "acc_norm": 0.49510337323177367, "acc_norm_stderr": 0.01166526473007815, "acc_stderr": 0.01166526473007815}, "no prompt needed": {"bleu": 0.18526507984327809, "bleu_stderr": 0.014352687013672758, "rouge1_fmeasure": 0.036171445152019484, "rouge1_fmeasure_stderr": 0.0008434569441985617, "rouge1_precision": 0.0207974489505957, "rouge1_precision_stderr": 0.0005419055674694086, "rouge1_recall": 0.2318870955901451, "rouge1_recall_stderr": 0.004096738969172235, "rouge2_fmeasure": 0.005621669068984276, "rouge2_fmeasure_stderr": 0.0002680503297844867, "rouge2_precision": 0.0031982235045390026, "rouge2_precision_stderr": 0.00016202149012648599, "rouge2_recall": 0.041020991401829215, "rouge2_recall_stderr": 0.0021462417342201236, "rougeL_fmeasure": 0.0330631932561623, "rougeL_fmeasure_stderr": 0.000733927602740061, "rougeL_precision": 0.018948626201834524, "rougeL_precision_stderr": 0.0004633920176847882, "rougeL_recall": 0.21626027275928003, "rougeL_recall_stderr": 0.0038232690449784353, "rougeLsum_fmeasure": 0.029360548512863772, "rougeLsum_fmeasure_stderr": 0.0006872127389303035, "rougeLsum_precision": 0.016844577935812375, "rougeLsum_precision_stderr": 0.00044192663287105345, "rougeLsum_recall": 0.19686515382910386, "rougeLsum_recall_stderr": 0.00370184205433221}, "pick_correct_choice_index": {"acc": 0.49510337323177367, "acc_norm": 0.49510337323177367, "acc_norm_stderr": 0.01166526473007815, "acc_stderr": 0.01166526473007815}, "what_is_the_correct_ending": {"acc": 0.5669205658324266, "acc_norm": 0.5745375408052231, "acc_norm_stderr": 0.011535468840824526, "acc_stderr": 0.011560864423151377}}, "1": {"Correct the solution": {"bleu": 9.533178886935461, "bleu_stderr": 0.280143031253315, "rouge1_fmeasure": 0.36387361635607934, "rouge1_fmeasure_stderr": 0.0072313190667876864, "rouge1_precision": 0.36248265473655833, "rouge1_precision_stderr": 0.00786742415876683, "rouge1_recall": 0.6236177733252095, "rouge1_recall_stderr": 0.007294999942203647, "rouge2_fmeasure": 0.2728457121204912, "rouge2_fmeasure_stderr": 0.00696791537970479, "rouge2_precision": 0.262274764045787, "rouge2_precision_stderr": 0.007192741692019697, "rouge2_recall": 0.47184937877299354, "rouge2_recall_stderr": 0.008337489044044633, "rougeL_fmeasure": 0.34999255403663865, "rougeL_fmeasure_stderr": 0.007254981296861731, "rougeL_precision": 0.3439193525878385, "rougeL_precision_stderr": 0.007678026405379061, "rougeL_recall": 0.6036216040299907, "rougeL_recall_stderr": 0.007526769831516323, "rougeLsum_fmeasure": 0.3523209043106269, "rougeLsum_fmeasure_stderr": 0.007250830483715573, "rougeLsum_precision": 0.3478926854036015, "rougeLsum_precision_stderr": 0.007735989195619495, "rougeLsum_recall": 0.6064434019351521, "rougeLsum_recall_stderr": 0.007514234152920643}, "choose the most appropriate solution": {"acc": 0.5021762785636561, "acc_norm": 0.5021762785636561, "acc_norm_stderr": 0.011665713661738877, "acc_stderr": 0.011665713661738877}, "no prompt needed": {"bleu": 0.1702000893250766, "bleu_stderr": 0.022602277898476263, "rouge1_fmeasure": 0.033921577431504014, "rouge1_fmeasure_stderr": 0.000851851348681226, "rouge1_precision": 0.019738604164709916, "rouge1_precision_stderr": 0.0005943670794712055, "rouge1_recall": 0.21655770189995802, "rouge1_recall_stderr": 0.0040458352158600835, "rouge2_fmeasure": 0.005440119305280117, "rouge2_fmeasure_stderr": 0.00028436946832994327, "rouge2_precision": 0.003127993584163729, "rouge2_precision_stderr": 0.00017993685583142814, "rouge2_recall": 0.03773492345933376, "rouge2_recall_stderr": 0.0020242951172695168, "rougeL_fmeasure": 0.03161334313286401, "rougeL_fmeasure_stderr": 0.0007473887108116619, "rougeL_precision": 0.018344912777774225, "rougeL_precision_stderr": 0.0005160040189181007, "rougeL_recall": 0.20486712491657955, "rougeL_recall_stderr": 0.003806037253425624, "rougeLsum_fmeasure": 0.027751716986245508, "rougeLsum_fmeasure_stderr": 0.0006946521749936248, "rougeLsum_precision": 0.016140312647768006, "rougeLsum_precision_stderr": 0.0004983217012086031, "rougeLsum_recall": 0.1850395128939851, "rougeLsum_recall_stderr": 0.0036539476740178115}, "pick_correct_choice_index": {"acc": 0.500544069640914, "acc_norm": 0.500544069640914, "acc_norm_stderr": 0.011665817258899177, "acc_stderr": 0.011665817258899177}, "what_is_the_correct_ending": {"acc": 0.5495103373231773, "acc_norm": 0.5549510337323177, "acc_norm_stderr": 0.011595157509775765, "acc_stderr": 0.011608491028638191}}, "2": {"Correct the solution": {"bleu": 47.02235168348344, "bleu_stderr": 1.7717853003889126, "rouge1_fmeasure": 0.6186288635304461, "rouge1_fmeasure_stderr": 0.007000082166722139, "rouge1_precision": 0.6572202360495694, "rouge1_precision_stderr": 0.00684754645297467, "rouge1_recall": 0.6314346295705424, "rouge1_recall_stderr": 0.007231765242823908, "rouge2_fmeasure": 0.4813163134151628, "rouge2_fmeasure_stderr": 0.008040016805301602, "rouge2_precision": 0.5015222551527236, "rouge2_precision_stderr": 0.008055814149315598, "rouge2_recall": 0.4939358543250565, "rouge2_recall_stderr": 0.008237670269354857, "rougeL_fmeasure": 0.6025255165585022, "rougeL_fmeasure_stderr": 0.0072208910046901355, "rougeL_precision": 0.6352718835824829, "rougeL_precision_stderr": 0.007015561715569228, "rougeL_recall": 0.6161858038815646, "rougeL_recall_stderr": 0.007440102458339062, "rougeLsum_fmeasure": 0.6055217771045465, "rougeLsum_fmeasure_stderr": 0.007171276947403789, "rougeLsum_precision": 0.640484622899746, "rougeLsum_precision_stderr": 0.0069959465065261486, "rougeLsum_recall": 0.6188983018867796, "rougeLsum_recall_stderr": 0.0073956327995358855}, "choose the most appropriate solution": {"acc": 0.4929270946681175, "acc_norm": 0.4929270946681175, "acc_norm_stderr": 0.011664656918145945, "acc_stderr": 0.011664656918145945}, "no prompt needed": {"bleu": 0.1503842124457421, "bleu_stderr": 0.013366858191649686, "rouge1_fmeasure": 0.03135729952678734, "rouge1_fmeasure_stderr": 0.0007915594925290304, "rouge1_precision": 0.01879881841993818, "rouge1_precision_stderr": 0.0006724234666717016, "rouge1_recall": 0.20311375385070707, "rouge1_recall_stderr": 0.003999202804540118, "rouge2_fmeasure": 0.004831334688117916, "rouge2_fmeasure_stderr": 0.00025723946213493105, "rouge2_precision": 0.0028249484242329798, "rouge2_precision_stderr": 0.00018807827347922434, "rouge2_recall": 0.0352797748367778, "rouge2_recall_stderr": 0.0019648415800267776, "rougeL_fmeasure": 0.029545104979527895, "rougeL_fmeasure_stderr": 0.0007329506313779751, "rougeL_precision": 0.01766775823250998, "rougeL_precision_stderr": 0.0006294794523191221, "rougeL_recall": 0.19315703981184032, "rougeL_recall_stderr": 0.0037946160886462503, "rougeLsum_fmeasure": 0.025479267218651926, "rougeLsum_fmeasure_stderr": 0.0006476891534007975, "rougeLsum_precision": 0.015392857681802947, "rougeLsum_precision_stderr": 0.0006117781547052657, "rougeLsum_recall": 0.17259808203436278, "rougeLsum_recall_stderr": 0.0035891800128922888}, "pick_correct_choice_index": {"acc": 0.48748639825897716, "acc_norm": 0.48748639825897716, "acc_norm_stderr": 0.011662170084916892, "acc_stderr": 0.011662170084916892}, "what_is_the_correct_ending": {"acc": 0.529923830250272, "acc_norm": 0.528835690968444, "acc_norm_stderr": 0.011646407809944715, "acc_stderr": 0.011644913435420153}}, "3": {"Correct the solution": {"bleu": 52.9579720847619, "bleu_stderr": 1.3100172607657463, "rouge1_fmeasure": 0.6471778364388423, "rouge1_fmeasure_stderr": 0.006873790000568802, "rouge1_precision": 0.6766298604013554, "rouge1_precision_stderr": 0.0067056488771926465, "rouge1_recall": 0.6598734365934378, "rouge1_recall_stderr": 0.007039834715151128, "rouge2_fmeasure": 0.5130291514276495, "rouge2_fmeasure_stderr": 0.007995711764359356, "rouge2_precision": 0.5291084288377738, "rouge2_precision_stderr": 0.008003327478399361, "rouge2_recall": 0.5238663129877955, "rouge2_recall_stderr": 0.008115248816933896, "rougeL_fmeasure": 0.6318467907423595, "rougeL_fmeasure_stderr": 0.007102064438778346, "rougeL_precision": 0.6572059275393904, "rougeL_precision_stderr": 0.006902013951713947, "rougeL_recall": 0.6451122696509598, "rougeL_recall_stderr": 0.007258135995140546, "rougeLsum_fmeasure": 0.6342013853383822, "rougeLsum_fmeasure_stderr": 0.0070565440039982155, "rougeLsum_precision": 0.6610004390006043, "rougeLsum_precision_stderr": 0.006869316294428751, "rougeLsum_recall": 0.6472344224673601, "rougeLsum_recall_stderr": 0.007217696942084704}, "choose the most appropriate solution": {"acc": 0.5065288356909684, "acc_norm": 0.5065288356909684, "acc_norm_stderr": 0.01166482959521097, "acc_stderr": 0.01166482959521097}, "no prompt needed": {"bleu": 0.13272684019612407, "bleu_stderr": 0.010461864794627634, "rouge1_fmeasure": 0.03068502702468395, "rouge1_fmeasure_stderr": 0.0008064587848542262, "rouge1_precision": 0.01850944530673799, "rouge1_precision_stderr": 0.0006860099150056603, "rouge1_recall": 0.19748557905607972, "rouge1_recall_stderr": 0.0040233681628607295, "rouge2_fmeasure": 0.004497352970943405, "rouge2_fmeasure_stderr": 0.00024384382231740025, "rouge2_precision": 0.0026003584508943025, "rouge2_precision_stderr": 0.00017066530138983938, "rouge2_recall": 0.03498197751607171, "rouge2_recall_stderr": 0.0020301674433790403, "rougeL_fmeasure": 0.02864531316521238, "rougeL_fmeasure_stderr": 0.0007274072742712978, "rougeL_precision": 0.017220549482260407, "rougeL_precision_stderr": 0.000613096462513708, "rougeL_recall": 0.1861971663320917, "rougeL_recall_stderr": 0.0037489583224773423, "rougeLsum_fmeasure": 0.025128864550950536, "rougeLsum_fmeasure_stderr": 0.0006505763970428052, "rougeLsum_precision": 0.015199797557400578, "rougeLsum_precision_stderr": 0.0005931310311220563, "rougeLsum_recall": 0.16911250346695345, "rougeLsum_recall_stderr": 0.003583275698147175}, "pick_correct_choice_index": {"acc": 0.4776931447225245, "acc_norm": 0.4776931447225245, "acc_norm_stderr": 0.011654208652596473, "acc_stderr": 0.011654208652596473}, "what_is_the_correct_ending": {"acc": 0.529379760609358, "acc_norm": 0.5397170837867247, "acc_norm_stderr": 0.011628961491718635, "acc_stderr": 0.011645667565050859}}, "4": {"Correct the solution": {"bleu": 52.34178177222339, "bleu_stderr": 1.133611224110179, "rouge1_fmeasure": 0.6536521914658334, "rouge1_fmeasure_stderr": 0.006849076962719914, "rouge1_precision": 0.6802288907430989, "rouge1_precision_stderr": 0.006708946985682678, "rouge1_recall": 0.6684627731217727, "rouge1_recall_stderr": 0.006968332579326068, "rouge2_fmeasure": 0.5191957202312659, "rouge2_fmeasure_stderr": 0.00803537497169239, "rouge2_precision": 0.5329584588559811, "rouge2_precision_stderr": 0.008028711159823622, "rouge2_recall": 0.5312726398193803, "rouge2_recall_stderr": 0.008141583217459525, "rougeL_fmeasure": 0.6391386680743565, "rougeL_fmeasure_stderr": 0.007074861598280453, "rougeL_precision": 0.6616061710878812, "rougeL_precision_stderr": 0.006895248485381961, "rougeL_recall": 0.6544242595472203, "rougeL_recall_stderr": 0.007187512365355003, "rougeLsum_fmeasure": 0.6416329037978986, "rougeLsum_fmeasure_stderr": 0.007024709271452905, "rougeLsum_precision": 0.6659368424074638, "rougeLsum_precision_stderr": 0.00686752783122648, "rougeLsum_recall": 0.6566926468449152, "rougeLsum_recall_stderr": 0.00714104853293446}, "choose the most appropriate solution": {"acc": 0.5059847660500544, "acc_norm": 0.5059847660500544, "acc_norm_stderr": 0.011664988455853328, "acc_stderr": 0.011664988455853328}, "no prompt needed": {"bleu": 0.1116223813797144, "bleu_stderr": 0.009741200478014737, "rouge1_fmeasure": 0.028877688849418524, "rouge1_fmeasure_stderr": 0.000724243387986705, "rouge1_precision": 0.01686851468849681, "rouge1_precision_stderr": 0.000499194047802729, "rouge1_recall": 0.18775401200901778, "rouge1_recall_stderr": 0.0038484179354708087, "rouge2_fmeasure": 0.00396783046089762, "rouge2_fmeasure_stderr": 0.00021343834712482054, "rouge2_precision": 0.002223786720488836, "rouge2_precision_stderr": 0.00012230483001260983, "rouge2_recall": 0.030293584456631554, "rouge2_recall_stderr": 0.001817796896011939, "rougeL_fmeasure": 0.026778467249332305, "rougeL_fmeasure_stderr": 0.000647417434268784, "rougeL_precision": 0.015625504769488287, "rougeL_precision_stderr": 0.00044811071614759033, "rougeL_recall": 0.17632683978230812, "rougeL_recall_stderr": 0.003581217055366121, "rougeLsum_fmeasure": 0.023672570832917728, "rougeLsum_fmeasure_stderr": 0.0005880376849600522, "rougeLsum_precision": 0.013796234932842528, "rougeLsum_precision_stderr": 0.0004109358448293982, "rougeLsum_recall": 0.16103835770466796, "rougeLsum_recall_stderr": 0.003451182928178216}, "pick_correct_choice_index": {"acc": 0.5021762785636561, "acc_norm": 0.5021762785636561, "acc_norm_stderr": 0.011665713661738868, "acc_stderr": 0.011665713661738868}, "what_is_the_correct_ending": {"acc": 0.5277475516866159, "acc_norm": 0.5353645266594124, "acc_norm_stderr": 0.011636607860111557, "acc_stderr": 0.011647846656062251}}, "5": {"Correct the solution": {"bleu": 54.45931011034328, "bleu_stderr": 1.3566590232163973, "rouge1_fmeasure": 0.6680605340297567, "rouge1_fmeasure_stderr": 0.006698004882443747, "rouge1_precision": 0.6910659299797782, "rouge1_precision_stderr": 0.006587178887176958, "rouge1_recall": 0.6819893555935073, "rouge1_recall_stderr": 0.006829836530109502, "rouge2_fmeasure": 0.5349097989485111, "rouge2_fmeasure_stderr": 0.007953481702063668, "rouge2_precision": 0.5479142345751371, "rouge2_precision_stderr": 0.007979754345711795, "rouge2_recall": 0.5461321733007606, "rouge2_recall_stderr": 0.008056987802607279, "rougeL_fmeasure": 0.6547986538369945, "rougeL_fmeasure_stderr": 0.006916766018240786, "rougeL_precision": 0.6748350327218241, "rougeL_precision_stderr": 0.006785040185272315, "rougeL_recall": 0.668942921143986, "rougeL_recall_stderr": 0.00704009869784028, "rougeLsum_fmeasure": 0.6568291500377871, "rougeLsum_fmeasure_stderr": 0.006878708628824482, "rougeLsum_precision": 0.6778670501360622, "rougeLsum_precision_stderr": 0.006756650353609527, "rougeLsum_recall": 0.6709060733280635, "rougeLsum_recall_stderr": 0.00700381652397326}, "choose the most appropriate solution": {"acc": 0.5032644178454843, "acc_norm": 0.5032644178454843, "acc_norm_stderr": 0.01166557553076037, "acc_stderr": 0.01166557553076037}, "no prompt needed": {"bleu": 0.12258212222097367, "bleu_stderr": 0.010470680109155984, "rouge1_fmeasure": 0.030015026802667398, "rouge1_fmeasure_stderr": 0.0008029789068806333, "rouge1_precision": 0.018668170976776546, "rouge1_precision_stderr": 0.000825403324535186, "rouge1_recall": 0.19105972479056632, "rouge1_recall_stderr": 0.0039771031374790326, "rouge2_fmeasure": 0.0042412326403525056, "rouge2_fmeasure_stderr": 0.0002561460325558005, "rouge2_precision": 0.0027317811603090264, "rouge2_precision_stderr": 0.00028543390592784026, "rouge2_recall": 0.032512896092599414, "rouge2_recall_stderr": 0.0020795575853301057, "rougeL_fmeasure": 0.027601201325242745, "rougeL_fmeasure_stderr": 0.0006943269476354326, "rougeL_precision": 0.01696418124382772, "rougeL_precision_stderr": 0.0006649299596873597, "rougeL_recall": 0.17869693365245193, "rougeL_recall_stderr": 0.00369968586124689, "rougeLsum_fmeasure": 0.024736080266790614, "rougeLsum_fmeasure_stderr": 0.0006498006943115617, "rougeLsum_precision": 0.015314009901949352, "rougeLsum_precision_stderr": 0.000662958634304753, "rougeLsum_recall": 0.16508907780803328, "rougeLsum_recall_stderr": 0.003604116467893131}, "pick_correct_choice_index": {"acc": 0.4967355821545158, "acc_norm": 0.4967355821545158, "acc_norm_stderr": 0.011665575530760367, "acc_stderr": 0.011665575530760367}, "what_is_the_correct_ending": {"acc": 0.5348204570184983, "acc_norm": 0.5359085963003264, "acc_norm_stderr": 0.011635700809215629, "acc_stderr": 0.011637500993815848}}}, "sciq": {"0": {"Direct Question": {"acc": 0.866, "acc_norm": 0.791, "acc_norm_stderr": 0.012864077288499346, "acc_stderr": 0.010777762298369683}, "Direct Question (Closed Book)": {"acc": 0.617, "acc_norm": 0.549, "acc_norm_stderr": 0.015743152379585533, "acc_stderr": 0.01538010232565271}, "Multiple Choice": {"acc": 0.583, "acc_norm": 0.499, "acc_norm_stderr": 0.015819268290576814, "acc_stderr": 0.015599819048769618}, "Multiple Choice (Closed Book)": {"acc": 0.46, "acc_norm": 0.424, "acc_norm_stderr": 0.015635487471405186, "acc_stderr": 0.015768596914394382}, "Multiple Choice Question First": {"acc": 0.534, "acc_norm": 0.463, "acc_norm_stderr": 0.015775927227262423, "acc_stderr": 0.015782683329937625}}, "1": {"Direct Question": {"acc": 0.9, "acc_norm": 0.87, "acc_norm_stderr": 0.010640169792499356, "acc_stderr": 0.00949157995752504}, "Direct Question (Closed Book)": {"acc": 0.675, "acc_norm": 0.65, "acc_norm_stderr": 0.015090650341444235, "acc_stderr": 0.014818724459095527}, "Multiple Choice": {"acc": 0.507, "acc_norm": 0.487, "acc_norm_stderr": 0.015813952101896626, "acc_stderr": 0.015817749561843567}, "Multiple Choice (Closed Book)": {"acc": 0.457, "acc_norm": 0.455, "acc_norm_stderr": 0.015755101498347093, "acc_stderr": 0.015760691590136378}, "Multiple Choice Question First": {"acc": 0.387, "acc_norm": 0.375, "acc_norm_stderr": 0.015316971293620996, "acc_stderr": 0.015410011955493933}}, "2": {"Direct Question": {"acc": 0.901, "acc_norm": 0.881, "acc_norm_stderr": 0.010244215145336666, "acc_stderr": 0.00944924802766275}, "Direct Question (Closed Book)": {"acc": 0.689, "acc_norm": 0.684, "acc_norm_stderr": 0.014709193056057127, "acc_stderr": 0.014645596385722694}, "Multiple Choice": {"acc": 0.548, "acc_norm": 0.521, "acc_norm_stderr": 0.015805341148131296, "acc_stderr": 0.015746235865880677}, "Multiple Choice (Closed Book)": {"acc": 0.543, "acc_norm": 0.523, "acc_norm_stderr": 0.0158025542467261, "acc_stderr": 0.015760691590136388}, "Multiple Choice Question First": {"acc": 0.42, "acc_norm": 0.401, "acc_norm_stderr": 0.015506109745498329, "acc_stderr": 0.015615500115072957}}, "3": {"Direct Question": {"acc": 0.911, "acc_norm": 0.896, "acc_norm_stderr": 0.009658016218524294, "acc_stderr": 0.009008893392651533}, "Direct Question (Closed Book)": {"acc": 0.696, "acc_norm": 0.696, "acc_norm_stderr": 0.014553205687950438, "acc_stderr": 0.014553205687950436}, "Multiple Choice": {"acc": 0.575, "acc_norm": 0.564, "acc_norm_stderr": 0.015689173023144064, "acc_stderr": 0.015640320317040105}, "Multiple Choice (Closed Book)": {"acc": 0.57, "acc_norm": 0.553, "acc_norm_stderr": 0.015730176046009077, "acc_stderr": 0.015663503610155283}, "Multiple Choice Question First": {"acc": 0.42, "acc_norm": 0.42, "acc_norm_stderr": 0.015615500115072957, "acc_stderr": 0.015615500115072957}}, "4": {"Direct Question": {"acc": 0.904, "acc_norm": 0.896, "acc_norm_stderr": 0.009658016218524294, "acc_stderr": 0.009320454434783226}, "Direct Question (Closed Book)": {"acc": 0.709, "acc_norm": 0.7, "acc_norm_stderr": 0.014498627873361428, "acc_stderr": 0.014370995982377942}, "Multiple Choice": {"acc": 0.584, "acc_norm": 0.572, "acc_norm_stderr": 0.015654426245029288, "acc_stderr": 0.015594460144140598}, "Multiple Choice (Closed Book)": {"acc": 0.565, "acc_norm": 0.563, "acc_norm_stderr": 0.015693223928730377, "acc_stderr": 0.0156850572527172}, "Multiple Choice Question First": {"acc": 0.445, "acc_norm": 0.415, "acc_norm_stderr": 0.01558903518560463, "acc_stderr": 0.015723301886760938}}, "5": {"Direct Question": {"acc": 0.906, "acc_norm": 0.894, "acc_norm_stderr": 0.009739551265785127, "acc_stderr": 0.00923305200078773}, "Direct Question (Closed Book)": {"acc": 0.714, "acc_norm": 0.702, "acc_norm_stderr": 0.014470846741134713, "acc_stderr": 0.014297146862517908}, "Multiple Choice": {"acc": 0.581, "acc_norm": 0.583, "acc_norm_stderr": 0.015599819048769618, "acc_stderr": 0.015610338967577799}, "Multiple Choice (Closed Book)": {"acc": 0.579, "acc_norm": 0.56, "acc_norm_stderr": 0.01570498795436179, "acc_stderr": 0.01562059547530132}, "Multiple Choice Question First": {"acc": 0.462, "acc_norm": 0.442, "acc_norm_stderr": 0.01571250721186421, "acc_stderr": 0.015773547629015113}}}, "story_cloze_2016": {"0": {"Answer Given options": {"acc": 0.49706039551042225, "acc_norm": 0.5221806520577231, "acc_norm_stderr": 0.011551049647290309, "acc_stderr": 0.011562232421541944}, "Choose Story Ending": {"acc": 0.48957776590058794, "acc_norm": 0.5334045964724746, "acc_norm_stderr": 0.011536599118298177, "acc_stderr": 0.011559920087347776}, "Novel Correct Ending": {"acc": 0.4879743452699091, "acc_norm": 0.5125601282736505, "acc_norm_stderr": 0.011558783570737969, "acc_stderr": 0.011559087533800687}, "Story Continuation and Options": {"acc": 0.49438802779262425, "acc_norm": 0.5312667022982362, "acc_norm_stderr": 0.011539803085637724, "acc_stderr": 0.011561703928784332}}, "1": {"Answer Given options": {"acc": 0.4853019775521112, "acc_norm": 0.48957776590058794, "acc_norm_stderr": 0.011559920087347783, "acc_stderr": 0.011557435464292914}, "Choose Story Ending": {"acc": 0.4906467129877071, "acc_norm": 0.5125601282736505, "acc_norm_stderr": 0.011558783570737967, "acc_stderr": 0.011560409019420362}, "Novel Correct Ending": {"acc": 0.48102618920363444, "acc_norm": 0.49331908070550506, "acc_norm_stderr": 0.011561400034509398, "acc_stderr": 0.01155410417401969}, "Story Continuation and Options": {"acc": 0.4917156600748263, "acc_norm": 0.5056119722073757, "acc_norm_stderr": 0.01156170392878433, "acc_stderr": 0.011560845076525718}}, "2": {"Answer Given options": {"acc": 0.47888829502939606, "acc_norm": 0.4863709246392304, "acc_norm_stderr": 0.011558135970599896, "acc_stderr": 0.011552120807053819}, "Choose Story Ending": {"acc": 0.47728487439871725, "acc_norm": 0.49438802779262425, "acc_norm_stderr": 0.011561703928784335, "acc_stderr": 0.011550494192008945}, "Novel Correct Ending": {"acc": 0.4751469802244789, "acc_norm": 0.4927846071619455, "acc_norm_stderr": 0.01156122826464673, "acc_stderr": 0.01154813982307477}, "Story Continuation and Options": {"acc": 0.47995724211651525, "acc_norm": 0.48743987172634956, "acc_norm_stderr": 0.011558783570737972, "acc_stderr": 0.011553138977961008}}, "3": {"Answer Given options": {"acc": 0.4735435595938001, "acc_norm": 0.484233030464992, "acc_norm_stderr": 0.011556682042196382, "acc_stderr": 0.011546234813777406}, "Choose Story Ending": {"acc": 0.4820951362907536, "acc_norm": 0.4853019775521112, "acc_norm_stderr": 0.01155743546429292, "acc_stderr": 0.011555016408505476}, "Novel Correct Ending": {"acc": 0.4740780331373597, "acc_norm": 0.4879743452699091, "acc_norm_stderr": 0.011559087533800689, "acc_stderr": 0.011546883081384901}, "Story Continuation and Options": {"acc": 0.4901122394441475, "acc_norm": 0.4949225013361839, "acc_norm_stderr": 0.01156183605423878, "acc_stderr": 0.011560171163157397}}, "4": {"Answer Given options": {"acc": 0.46178514163548906, "acc_norm": 0.47835382148583644, "acc_norm_stderr": 0.011551591851683333, "acc_stderr": 0.011528611805439891}, "Choose Story Ending": {"acc": 0.4730090860502405, "acc_norm": 0.4906467129877071, "acc_norm_stderr": 0.011560409019420367, "acc_stderr": 0.011545573278697235}, "Novel Correct Ending": {"acc": 0.4681988241582042, "acc_norm": 0.4826296098343132, "acc_norm_stderr": 0.011555452669106634, "acc_stderr": 0.011539022035111228}, "Story Continuation and Options": {"acc": 0.4879743452699091, "acc_norm": 0.5018706574024586, "acc_norm_stderr": 0.011562351329083271, "acc_stderr": 0.011559087533800687}}, "5": {"Answer Given options": {"acc": 0.46178514163548906, "acc_norm": 0.47888829502939606, "acc_norm_stderr": 0.011552120807053819, "acc_stderr": 0.011528611805439893}, "Choose Story Ending": {"acc": 0.4826296098343132, "acc_norm": 0.484233030464992, "acc_norm_stderr": 0.011556682042196382, "acc_stderr": 0.011555452669106632}, "Novel Correct Ending": {"acc": 0.4719401389631213, "acc_norm": 0.47995724211651525, "acc_norm_stderr": 0.011553138977961008, "acc_stderr": 0.011544210396951669}, "Story Continuation and Options": {"acc": 0.49438802779262425, "acc_norm": 0.4938535542490647, "acc_norm_stderr": 0.01156155858904076, "acc_stderr": 0.01156170392878433}}}, "superglue_rte": {"0": {"GPT-3 style": {"acc": 0.5090252707581228, "acc_norm": 0.4620938628158845, "acc_norm_stderr": 0.030009848912529113, "acc_stderr": 0.030091559826331334}, "MNLI crowdsource": {"acc": 0.48014440433212996, "acc_norm": 0.516245487364621, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.030072723167317194}, "does it follow that": {"acc": 0.44404332129963897, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.029907396333795997}, "guaranteed true": {"acc": 0.5126353790613718, "acc_norm": 0.5415162454873647, "acc_norm_stderr": 0.029992535385373314, "acc_stderr": 0.030086851767188564}, "should assume": {"acc": 0.5415162454873647, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.029992535385373314}}, "1": {"GPT-3 style": {"acc": 0.5090252707581228, "acc_norm": 0.4981949458483754, "acc_norm_stderr": 0.030096267148976633, "acc_stderr": 0.030091559826331334}, "MNLI crowdsource": {"acc": 0.49097472924187724, "acc_norm": 0.48736462093862815, "acc_norm_stderr": 0.030086851767188564, "acc_stderr": 0.030091559826331334}, "does it follow that": {"acc": 0.49097472924187724, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030091559826331334}, "guaranteed true": {"acc": 0.49097472924187724, "acc_norm": 0.48736462093862815, "acc_norm_stderr": 0.030086851767188564, "acc_stderr": 0.030091559826331334}, "should assume": {"acc": 0.48375451263537905, "acc_norm": 0.48375451263537905, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.030080573208738064}}, "2": {"GPT-3 style": {"acc": 0.516245487364621, "acc_norm": 0.5342960288808665, "acc_norm_stderr": 0.030025579819366422, "acc_stderr": 0.030080573208738064}, "MNLI crowdsource": {"acc": 0.5054151624548736, "acc_norm": 0.516245487364621, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.030094698123239966}, "does it follow that": {"acc": 0.516245487364621, "acc_norm": 0.5306859205776173, "acc_norm_stderr": 0.03003973059219781, "acc_stderr": 0.030080573208738064}, "guaranteed true": {"acc": 0.5018050541516246, "acc_norm": 0.5054151624548736, "acc_norm_stderr": 0.030094698123239966, "acc_stderr": 0.030096267148976633}, "should assume": {"acc": 0.516245487364621, "acc_norm": 0.5306859205776173, "acc_norm_stderr": 0.030039730592197812, "acc_stderr": 0.030080573208738064}}, "3": {"GPT-3 style": {"acc": 0.5234657039711191, "acc_norm": 0.5451263537906137, "acc_norm_stderr": 0.029973636495415252, "acc_stderr": 0.03006330041190266}, "MNLI crowdsource": {"acc": 0.5270758122743683, "acc_norm": 0.5342960288808665, "acc_norm_stderr": 0.030025579819366426, "acc_stderr": 0.030052303463143706}, "does it follow that": {"acc": 0.5379061371841155, "acc_norm": 0.51985559566787, "acc_norm_stderr": 0.030072723167317177, "acc_stderr": 0.030009848912529117}, "guaranteed true": {"acc": 0.51985559566787, "acc_norm": 0.5379061371841155, "acc_norm_stderr": 0.030009848912529113, "acc_stderr": 0.030072723167317184}, "should assume": {"acc": 0.5306859205776173, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030039730592197812}}, "4": {"GPT-3 style": {"acc": 0.5126353790613718, "acc_norm": 0.5342960288808665, "acc_norm_stderr": 0.030025579819366422, "acc_stderr": 0.030086851767188564}, "MNLI crowdsource": {"acc": 0.5379061371841155, "acc_norm": 0.5415162454873647, "acc_norm_stderr": 0.029992535385373314, "acc_stderr": 0.030009848912529113}, "does it follow that": {"acc": 0.51985559566787, "acc_norm": 0.516245487364621, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.030072723167317184}, "guaranteed true": {"acc": 0.5342960288808665, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030025579819366426}, "should assume": {"acc": 0.5234657039711191, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.03006330041190266}}, "5": {"GPT-3 style": {"acc": 0.5306859205776173, "acc_norm": 0.5090252707581228, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.03003973059219781}, "MNLI crowdsource": {"acc": 0.5090252707581228, "acc_norm": 0.5306859205776173, "acc_norm_stderr": 0.03003973059219781, "acc_stderr": 0.030091559826331334}, "does it follow that": {"acc": 0.5234657039711191, "acc_norm": 0.5306859205776173, "acc_norm_stderr": 0.030039730592197812, "acc_stderr": 0.03006330041190266}, "guaranteed true": {"acc": 0.5270758122743683, "acc_norm": 0.5379061371841155, "acc_norm_stderr": 0.030009848912529113, "acc_stderr": 0.030052303463143706}, "should assume": {"acc": 0.5306859205776173, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030039730592197812}}}, "winogrande": {"0": {"Replace": {"acc": 0.49013417521704816, "acc_norm": 0.5098658247829518, "acc_norm_stderr": 0.014049749833367589, "acc_stderr": 0.014049749833367597}, "True or False": {"acc": 0.4956590370955012, "acc_norm": 0.4956590370955012, "acc_norm_stderr": 0.0140519560640769, "acc_stderr": 0.014051956064076896}, "does underscore refer to": {"acc": 0.4940805051302289, "acc_norm": 0.48855564325177586, "acc_norm_stderr": 0.014048804199859325, "acc_stderr": 0.014051500838485807}, "stand for": {"acc": 0.500394632991318, "acc_norm": 0.49329123914759276, "acc_norm_stderr": 0.014051220692330349, "acc_stderr": 0.014052481306049516}, "underscore refer to": {"acc": 0.4861878453038674, "acc_norm": 0.48697711128650356, "acc_norm_stderr": 0.014047718393997663, "acc_stderr": 0.014047122916440419}}, "1": {"Replace": {"acc": 0.5098658247829518, "acc_norm": 0.5090765588003157, "acc_norm_stderr": 0.014050170094497697, "acc_stderr": 0.014049749833367589}, "True or False": {"acc": 0.494869771112865, "acc_norm": 0.49013417521704816, "acc_norm_stderr": 0.014049749833367585, "acc_stderr": 0.014051745961790516}, "does underscore refer to": {"acc": 0.505130228887135, "acc_norm": 0.4972375690607735, "acc_norm_stderr": 0.014052271211616436, "acc_stderr": 0.014051745961790513}, "stand for": {"acc": 0.5209155485398579, "acc_norm": 0.5082872928176796, "acc_norm_stderr": 0.014050555322824194, "acc_stderr": 0.014040185494212945}, "underscore refer to": {"acc": 0.5122336227308603, "acc_norm": 0.510655090765588, "acc_norm_stderr": 0.014049294536290393, "acc_stderr": 0.014048278820405621}}, "2": {"Replace": {"acc": 0.5177584846093133, "acc_norm": 0.510655090765588, "acc_norm_stderr": 0.014049294536290396, "acc_stderr": 0.014043619596174959}, "True or False": {"acc": 0.4956590370955012, "acc_norm": 0.5082872928176796, "acc_norm_stderr": 0.014050555322824189, "acc_stderr": 0.014051956064076908}, "does underscore refer to": {"acc": 0.5303867403314917, "acc_norm": 0.5011838989739542, "acc_norm_stderr": 0.014052446290529012, "acc_stderr": 0.014026510839428743}, "stand for": {"acc": 0.5240726124704025, "acc_norm": 0.5027624309392266, "acc_norm_stderr": 0.014052271211616436, "acc_stderr": 0.014036189665395132}, "underscore refer to": {"acc": 0.5146014206787688, "acc_norm": 0.510655090765588, "acc_norm_stderr": 0.014049294536290396, "acc_stderr": 0.014046492383275842}}, "3": {"Replace": {"acc": 0.5240726124704025, "acc_norm": 0.5185477505919495, "acc_norm_stderr": 0.014042813708888378, "acc_stderr": 0.014036189665395129}, "True or False": {"acc": 0.4988161010260458, "acc_norm": 0.500394632991318, "acc_norm_stderr": 0.014052481306049516, "acc_stderr": 0.014052446290529022}, "does underscore refer to": {"acc": 0.5272296764009471, "acc_norm": 0.5130228887134964, "acc_norm_stderr": 0.014047718393997663, "acc_stderr": 0.014031631629827701}, "stand for": {"acc": 0.510655090765588, "acc_norm": 0.5027624309392266, "acc_norm_stderr": 0.014052271211616436, "acc_stderr": 0.014049294536290391}, "underscore refer to": {"acc": 0.5248618784530387, "acc_norm": 0.5114443567482242, "acc_norm_stderr": 0.01404880419985932, "acc_stderr": 0.01403510288362775}}, "4": {"Replace": {"acc": 0.5177584846093133, "acc_norm": 0.5090765588003157, "acc_norm_stderr": 0.014050170094497704, "acc_stderr": 0.014043619596174962}, "True or False": {"acc": 0.5027624309392266, "acc_norm": 0.5122336227308603, "acc_norm_stderr": 0.014048278820405621, "acc_stderr": 0.014052271211616441}, "does underscore refer to": {"acc": 0.5288082083662194, "acc_norm": 0.5153906866614049, "acc_norm_stderr": 0.014045826789783658, "acc_stderr": 0.014029141615909612}, "stand for": {"acc": 0.5067087608524072, "acc_norm": 0.49329123914759276, "acc_norm_stderr": 0.014051220692330349, "acc_stderr": 0.014051220692330349}, "underscore refer to": {"acc": 0.5232833464877664, "acc_norm": 0.505130228887135, "acc_norm_stderr": 0.01405174596179051, "acc_stderr": 0.01403724130957364}}, "5": {"Replace": {"acc": 0.5185477505919495, "acc_norm": 0.516179952644041, "acc_norm_stderr": 0.0140451261309786, "acc_stderr": 0.014042813708888378}, "True or False": {"acc": 0.4940805051302289, "acc_norm": 0.4980268350434096, "acc_norm_stderr": 0.014052376259225632, "acc_stderr": 0.014051500838485807}, "does underscore refer to": {"acc": 0.526440410418311, "acc_norm": 0.5138121546961326, "acc_norm_stderr": 0.014047122916440426, "acc_stderr": 0.014032823874407229}, "stand for": {"acc": 0.4972375690607735, "acc_norm": 0.4846093133385951, "acc_norm_stderr": 0.01404582678978367, "acc_stderr": 0.014052271211616441}, "underscore refer to": {"acc": 0.5224940805051302, "acc_norm": 0.5169692186266772, "acc_norm_stderr": 0.014044390401612976, "acc_stderr": 0.014038257824059886}}}} \ No newline at end of file