diff --git "a/4b284b17bc4/eval/merged.json" "b/4b284b17bc4/eval/merged.json" new file mode 100644--- /dev/null +++ "b/4b284b17bc4/eval/merged.json" @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4040857346605273, "bleu_stderr": 0.04358756352339084, "rouge1_fmeasure": 0.11153940555452811, "rouge1_fmeasure_stderr": 0.002178375350508395, "rouge1_precision": 0.0759904796250538, "rouge1_precision_stderr": 0.0019478615830651011, "rouge1_recall": 0.3009878218567671, "rouge1_recall_stderr": 0.0046586299284223885, "rouge2_fmeasure": 0.05308201459552208, "rouge2_fmeasure_stderr": 0.0013729761880117914, "rouge2_precision": 0.03649587266689502, "rouge2_precision_stderr": 0.001284173883599284, "rouge2_recall": 0.14683508450255534, "rouge2_recall_stderr": 0.0032437032345681857, "rougeL_fmeasure": 0.10594741371659425, "rougeL_fmeasure_stderr": 0.0019835178597520093, "rougeL_precision": 0.07180766426825755, "rougeL_precision_stderr": 0.001761388989113738, "rougeL_recall": 0.28987291523705844, "rougeL_recall_stderr": 0.0045326872175802165, "rougeLsum_fmeasure": 0.10637513260264994, "rougeLsum_fmeasure_stderr": 0.00204505600505615, "rougeLsum_precision": 0.07245769602542276, "rougeLsum_precision_stderr": 0.00184585307065297, "rougeLsum_recall": 0.2877191217238231, "rougeLsum_recall_stderr": 0.004439207690226351}, "explicit-graph-description2": {"bleu": 0.1350370966328972, "bleu_stderr": 0.027107668307824653, "rouge1_fmeasure": 0.0649820340050219, "rouge1_fmeasure_stderr": 0.0016774667522408912, "rouge1_precision": 0.06635037756207988, "rouge1_precision_stderr": 0.00222464644174656, "rouge1_recall": 0.14012624989204728, "rouge1_recall_stderr": 0.002036835675458236, "rouge2_fmeasure": 0.0031079038623638374, "rouge2_fmeasure_stderr": 0.00034130095697030574, "rouge2_precision": 0.003020102888519982, "rouge2_precision_stderr": 0.00032547132764282983, "rouge2_recall": 0.006611308173835243, "rouge2_recall_stderr": 0.0009951093670627143, "rougeL_fmeasure": 0.057959376823978205, "rougeL_fmeasure_stderr": 0.0013701772199315872, "rougeL_precision": 0.05754089923288528, "rougeL_precision_stderr": 0.0018114419198071445, "rougeL_recall": 0.1332053573384668, "rougeL_recall_stderr": 0.0018778409471417506, "rougeLsum_fmeasure": 0.04917830665991281, "rougeLsum_fmeasure_stderr": 0.0014301781368119162, "rougeLsum_precision": 0.05219309540753949, "rougeLsum_precision_stderr": 0.001901332766377738, "rougeLsum_recall": 0.09949694242629789, "rougeLsum_recall_stderr": 0.0017781434541041129}, "implicit-graph-description": {"bleu": 0.12648828841845464, "bleu_stderr": 0.02172247828129552, "rouge1_fmeasure": 0.05185506267199114, "rouge1_fmeasure_stderr": 0.0010693704857640345, "rouge1_precision": 0.04385963495568408, "rouge1_precision_stderr": 0.0016141799634558302, "rouge1_recall": 0.20910265806376718, "rouge1_recall_stderr": 0.00260407682272209, "rouge2_fmeasure": 0.004093823034187833, "rouge2_fmeasure_stderr": 0.0003237805033460212, "rouge2_precision": 0.003657728710184829, "rouge2_precision_stderr": 0.00040744794041285926, "rouge2_recall": 0.017381693068654103, "rouge2_recall_stderr": 0.0011150256177358017, "rougeL_fmeasure": 0.048929691951989984, "rougeL_fmeasure_stderr": 0.0008927940829166845, "rougeL_precision": 0.03973081953475384, "rougeL_precision_stderr": 0.0013217691489049088, "rougeL_recall": 0.20624804575610606, "rougeL_recall_stderr": 0.0026094299138881873, "rougeLsum_fmeasure": 0.03823939846444661, "rougeLsum_fmeasure_stderr": 0.0009439507711198298, "rougeLsum_precision": 0.034850412608671714, "rougeLsum_precision_stderr": 0.0014877807289763565, "rougeLsum_recall": 0.149019716230595, "rougeLsum_recall_stderr": 0.002039697594288268}, "non-explicit-description": {"bleu": 0.15649947819833196, "bleu_stderr": 0.0164476915271142, "rouge1_fmeasure": 0.046262195996473646, "rouge1_fmeasure_stderr": 0.0010671021290240828, "rouge1_precision": 0.02706027614634666, "rouge1_precision_stderr": 0.0007199959223496352, "rouge1_recall": 0.21871619349540652, "rouge1_recall_stderr": 0.003017122387430669, "rouge2_fmeasure": 0.0091292497595853, "rouge2_fmeasure_stderr": 0.0005586233596109211, "rouge2_precision": 0.005489404852576963, "rouge2_precision_stderr": 0.00037168133237378876, "rouge2_recall": 0.04039720237090506, "rouge2_recall_stderr": 0.0018373898311850336, "rougeL_fmeasure": 0.04506442218172945, "rougeL_fmeasure_stderr": 0.000951352027550179, "rougeL_precision": 0.02627865667490291, "rougeL_precision_stderr": 0.0006333002013249642, "rougeL_recall": 0.21539356134726143, "rougeL_recall_stderr": 0.0028718466577210507, "rougeLsum_fmeasure": 0.040980292597500946, "rougeLsum_fmeasure_stderr": 0.0009311721807975657, "rougeLsum_precision": 0.023947674613616042, "rougeLsum_precision_stderr": 0.0006311347984113742, "rougeLsum_recall": 0.19592189387116501, "rougeLsum_recall_stderr": 0.002541067542265314}, "very-explicit-description": {"bleu": 0.06125988659412747, "bleu_stderr": 0.006732229818369253, "rouge1_fmeasure": 0.03898895789773015, "rouge1_fmeasure_stderr": 0.0006319532341075637, "rouge1_precision": 0.022502146837926656, "rouge1_precision_stderr": 0.00039190309284914334, "rouge1_recall": 0.19344726390485775, "rouge1_recall_stderr": 0.00254983973899585, "rouge2_fmeasure": 0.004145988607839967, "rouge2_fmeasure_stderr": 0.0001880250546353047, "rouge2_precision": 0.0023435938414272757, "rouge2_precision_stderr": 0.00010979866573243933, "rouge2_recall": 0.02355849492139947, "rouge2_recall_stderr": 0.0010345098847200228, "rougeL_fmeasure": 0.037891534322081845, "rougeL_fmeasure_stderr": 0.0005877375532899554, "rougeL_precision": 0.021854072209224234, "rougeL_precision_stderr": 0.0003652930614699315, "rougeL_recall": 0.18931671006162032, "rougeL_recall_stderr": 0.0024402267670285537, "rougeLsum_fmeasure": 0.03294863446012075, "rougeLsum_fmeasure_stderr": 0.0005052256590344441, "rougeLsum_precision": 0.019010396571724965, "rougeLsum_precision_stderr": 0.00031345492195533365, "rougeLsum_recall": 0.16545014623836463, "rougeLsum_recall_stderr": 0.002138025338260489}}, "1": {"PALM_prompt": {"bleu": 0.5179012826475189, "bleu_stderr": 0.03546328546887922, "rouge1_fmeasure": 0.11715894355967386, "rouge1_fmeasure_stderr": 0.0019757967913343107, "rouge1_precision": 0.07590692259956473, "rouge1_precision_stderr": 0.0015260502670476222, "rouge1_recall": 0.3587176031003754, "rouge1_recall_stderr": 0.005304902318303979, "rouge2_fmeasure": 0.054620931903283015, "rouge2_fmeasure_stderr": 0.001228796960295478, "rouge2_precision": 0.03541306461486918, "rouge2_precision_stderr": 0.0009438407351314926, "rouge2_recall": 0.17490815925289047, "rouge2_recall_stderr": 0.003575953294927914, "rougeL_fmeasure": 0.10948455596662156, "rougeL_fmeasure_stderr": 0.0017556314356608658, "rougeL_precision": 0.07063353254188104, "rougeL_precision_stderr": 0.0013315866040551792, "rougeL_recall": 0.33926359580036936, "rougeL_recall_stderr": 0.004980489876121937, "rougeLsum_fmeasure": 0.11027827926137282, "rougeLsum_fmeasure_stderr": 0.0018206662711825689, "rougeLsum_precision": 0.07145722539365447, "rougeLsum_precision_stderr": 0.0014201666549177136, "rougeLsum_recall": 0.338101035904291, "rougeLsum_recall_stderr": 0.004861723234525834}, "explicit-graph-description2": {"bleu": 2.052485569424793, "bleu_stderr": 0.12102973129799857, "rouge1_fmeasure": 0.2770237126205559, "rouge1_fmeasure_stderr": 0.004038789167972144, "rouge1_precision": 0.24515040330368046, "rouge1_precision_stderr": 0.004054348224566517, "rouge1_recall": 0.42594170823778976, "rouge1_recall_stderr": 0.005787897761386721, "rouge2_fmeasure": 0.10582834218904164, "rouge2_fmeasure_stderr": 0.0026426543643199916, "rouge2_precision": 0.09175996157947801, "rouge2_precision_stderr": 0.002601492839334607, "rouge2_recall": 0.1684955936702656, "rouge2_recall_stderr": 0.004064838991261767, "rougeL_fmeasure": 0.20936546936617006, "rougeL_fmeasure_stderr": 0.00297046089780952, "rougeL_precision": 0.18555197093693734, "rougeL_precision_stderr": 0.003147342164119049, "rougeL_recall": 0.33280053513758845, "rougeL_recall_stderr": 0.0045573071529510685, "rougeLsum_fmeasure": 0.24390484665607676, "rougeLsum_fmeasure_stderr": 0.0036238123294501058, "rougeLsum_precision": 0.21610700955371467, "rougeLsum_precision_stderr": 0.0036569952801652067, "rougeLsum_recall": 0.3772047553428585, "rougeLsum_recall_stderr": 0.005272234747783686}, "implicit-graph-description": {"bleu": 0.9717259245306719, "bleu_stderr": 0.05184305667078613, "rouge1_fmeasure": 0.1251494842748045, "rouge1_fmeasure_stderr": 0.002228010113423068, "rouge1_precision": 0.07728525564656695, "rouge1_precision_stderr": 0.0016345680555964624, "rouge1_recall": 0.47912345654951694, "rouge1_recall_stderr": 0.004974585856018974, "rouge2_fmeasure": 0.048933388223398, "rouge2_fmeasure_stderr": 0.0012848999460203356, "rouge2_precision": 0.030217086264151687, "rouge2_precision_stderr": 0.0008968155132881358, "rouge2_recall": 0.19718798013757022, "rouge2_recall_stderr": 0.0041103934021723375, "rougeL_fmeasure": 0.10781056512383318, "rougeL_fmeasure_stderr": 0.0016643371248226597, "rougeL_precision": 0.06575830202193088, "rougeL_precision_stderr": 0.0012184254155840335, "rougeL_recall": 0.43631558492945605, "rougeL_recall_stderr": 0.004551703262770418, "rougeLsum_fmeasure": 0.10816019192402117, "rougeLsum_fmeasure_stderr": 0.0020352482624991708, "rougeLsum_precision": 0.06685866773660962, "rougeLsum_precision_stderr": 0.00149258308259344, "rougeLsum_recall": 0.41789578588003, "rougeLsum_recall_stderr": 0.004754874400535324}, "non-explicit-description": {"bleu": 1.6599294185467146, "bleu_stderr": 0.06211968556668088, "rouge1_fmeasure": 0.21539745536997254, "rouge1_fmeasure_stderr": 0.0023973358203774717, "rouge1_precision": 0.13566963010336247, "rouge1_precision_stderr": 0.001901900628554032, "rouge1_recall": 0.6825350408773228, "rouge1_recall_stderr": 0.003845047297379057, "rouge2_fmeasure": 0.09345632716516253, "rouge2_fmeasure_stderr": 0.001519666475238801, "rouge2_precision": 0.05837651126421021, "rouge2_precision_stderr": 0.0011430214532306153, "rouge2_recall": 0.32049306904101077, "rouge2_recall_stderr": 0.003907206521752479, "rougeL_fmeasure": 0.1711923400170551, "rougeL_fmeasure_stderr": 0.0017590691181499992, "rougeL_precision": 0.10694697019167072, "rougeL_precision_stderr": 0.0014098708843255005, "rougeL_recall": 0.5679246988503801, "rougeL_recall_stderr": 0.003975927481974845, "rougeLsum_fmeasure": 0.1836183182793589, "rougeLsum_fmeasure_stderr": 0.0021016203672280464, "rougeLsum_precision": 0.11553086397885941, "rougeLsum_precision_stderr": 0.001655229291094829, "rougeLsum_recall": 0.5900036748532257, "rougeLsum_recall_stderr": 0.0036908035987317585}, "very-explicit-description": {"bleu": 0.6395614972095213, "bleu_stderr": 0.03517585023209597, "rouge1_fmeasure": 0.09987693823507093, "rouge1_fmeasure_stderr": 0.0019433674289780272, "rouge1_precision": 0.05975795472897865, "rouge1_precision_stderr": 0.0013335748577968784, "rouge1_recall": 0.42130399574332783, "rouge1_recall_stderr": 0.006021456162891838, "rouge2_fmeasure": 0.03326346351515616, "rouge2_fmeasure_stderr": 0.0010503739288322067, "rouge2_precision": 0.01988607904073932, "rouge2_precision_stderr": 0.0007117135743174414, "rouge2_recall": 0.1562830460937769, "rouge2_recall_stderr": 0.004166853394878266, "rougeL_fmeasure": 0.08653005164164256, "rougeL_fmeasure_stderr": 0.0015090364266964335, "rougeL_precision": 0.05146250597168572, "rougeL_precision_stderr": 0.0010354352926742175, "rougeL_recall": 0.3777395630694112, "rougeL_recall_stderr": 0.005232167400098075, "rougeLsum_fmeasure": 0.08634306012268002, "rougeLsum_fmeasure_stderr": 0.0017368588952515506, "rougeLsum_precision": 0.05159147675400835, "rougeLsum_precision_stderr": 0.0011924678247232138, "rougeLsum_recall": 0.3715339371365095, "rougeLsum_recall_stderr": 0.005597099196018005}}, "2": {"PALM_prompt": {"bleu": 0.5353533242406296, "bleu_stderr": 0.03431413900192352, "rouge1_fmeasure": 0.11225032910377221, "rouge1_fmeasure_stderr": 0.001729127211467464, "rouge1_precision": 0.07083033588676813, "rouge1_precision_stderr": 0.0012393618247087826, "rouge1_recall": 0.3713979630102006, "rouge1_recall_stderr": 0.0052635629154557445, "rouge2_fmeasure": 0.04972618028817665, "rouge2_fmeasure_stderr": 0.00102253346010308, "rouge2_precision": 0.031078186918670876, "rouge2_precision_stderr": 0.0007042214847925669, "rouge2_recall": 0.17869319720275487, "rouge2_recall_stderr": 0.0037152494180892654, "rougeL_fmeasure": 0.10286097604129013, "rougeL_fmeasure_stderr": 0.001521136220014524, "rougeL_precision": 0.06478794864331923, "rougeL_precision_stderr": 0.0010868099908157105, "rougeL_recall": 0.342386434900962, "rougeL_recall_stderr": 0.004791381155468985, "rougeLsum_fmeasure": 0.10565101438548488, "rougeLsum_fmeasure_stderr": 0.0016072411783332626, "rougeLsum_precision": 0.0666711337015857, "rougeLsum_precision_stderr": 0.0011558366976764654, "rougeLsum_recall": 0.3495506191555596, "rougeLsum_recall_stderr": 0.004867024005105769}, "explicit-graph-description2": {"bleu": 3.5977911950601484, "bleu_stderr": 0.12923912317137534, "rouge1_fmeasure": 0.3738311083147904, "rouge1_fmeasure_stderr": 0.004303874024559728, "rouge1_precision": 0.3566584079552788, "rouge1_precision_stderr": 0.005545339920311794, "rouge1_recall": 0.5400091585387286, "rouge1_recall_stderr": 0.005358867844689096, "rouge2_fmeasure": 0.18715015079380715, "rouge2_fmeasure_stderr": 0.003209239165332672, "rouge2_precision": 0.1806299226100141, "rouge2_precision_stderr": 0.0038767177534562857, "rouge2_recall": 0.27810862336642045, "rouge2_recall_stderr": 0.004344946134160203, "rougeL_fmeasure": 0.291139847215283, "rougeL_fmeasure_stderr": 0.003491774771956361, "rougeL_precision": 0.2774912289584729, "rougeL_precision_stderr": 0.00456955894576886, "rougeL_recall": 0.43176983982503986, "rougeL_recall_stderr": 0.0046883692841029135, "rougeLsum_fmeasure": 0.3252458107512585, "rougeLsum_fmeasure_stderr": 0.0037896518417690843, "rougeLsum_precision": 0.3096504505654974, "rougeLsum_precision_stderr": 0.004863905721617155, "rougeLsum_recall": 0.4751007623048103, "rougeLsum_recall_stderr": 0.004978268403772034}, "implicit-graph-description": {"bleu": 1.5553302383124015, "bleu_stderr": 0.04732740194035888, "rouge1_fmeasure": 0.14300003134712513, "rouge1_fmeasure_stderr": 0.00221162536158628, "rouge1_precision": 0.08832852532934997, "rouge1_precision_stderr": 0.0017907725492439333, "rouge1_recall": 0.5699592863851318, "rouge1_recall_stderr": 0.004714536643229723, "rouge2_fmeasure": 0.0664535256463577, "rouge2_fmeasure_stderr": 0.001390545014519469, "rouge2_precision": 0.040843244255482615, "rouge2_precision_stderr": 0.0010916245640850226, "rouge2_recall": 0.29220518033840015, "rouge2_recall_stderr": 0.00429630894545052, "rougeL_fmeasure": 0.12321433839865718, "rougeL_fmeasure_stderr": 0.0016493743934460554, "rougeL_precision": 0.0752494824229621, "rougeL_precision_stderr": 0.0013331527745553307, "rougeL_recall": 0.5158723539376079, "rougeL_recall_stderr": 0.004556028773885754, "rougeLsum_fmeasure": 0.1245303464212252, "rougeLsum_fmeasure_stderr": 0.0020005666890666714, "rougeLsum_precision": 0.07693161532405232, "rougeLsum_precision_stderr": 0.001600850652295148, "rougeLsum_recall": 0.5035674346293266, "rougeLsum_recall_stderr": 0.004594034425007832}, "non-explicit-description": {"bleu": 2.016001208228411, "bleu_stderr": 0.06460792855397608, "rouge1_fmeasure": 0.2076581040011104, "rouge1_fmeasure_stderr": 0.0022583455874752344, "rouge1_precision": 0.1287133172816397, "rouge1_precision_stderr": 0.0016808581165882275, "rouge1_recall": 0.700127439476741, "rouge1_recall_stderr": 0.0037713353423981066, "rouge2_fmeasure": 0.09362913479276391, "rouge2_fmeasure_stderr": 0.0014080197389295243, "rouge2_precision": 0.057317013209503706, "rouge2_precision_stderr": 0.000967554778182446, "rouge2_recall": 0.3482915132800189, "rouge2_recall_stderr": 0.00417921721931668, "rougeL_fmeasure": 0.16305870413789514, "rougeL_fmeasure_stderr": 0.0016446312009551617, "rougeL_precision": 0.10016020267379452, "rougeL_precision_stderr": 0.0011971639013113444, "rougeL_recall": 0.5771110494449467, "rougeL_recall_stderr": 0.004193435707875633, "rougeLsum_fmeasure": 0.17891433463701623, "rougeLsum_fmeasure_stderr": 0.002014498326601367, "rougeLsum_precision": 0.11082374580358195, "rougeLsum_precision_stderr": 0.001491597958210396, "rougeLsum_recall": 0.6106710174543206, "rougeLsum_recall_stderr": 0.003704970101429629}, "very-explicit-description": {"bleu": 1.121353825084264, "bleu_stderr": 0.037725280779123294, "rouge1_fmeasure": 0.13518295850423986, "rouge1_fmeasure_stderr": 0.001797688945586576, "rouge1_precision": 0.0810153812005263, "rouge1_precision_stderr": 0.0012511509455719108, "rouge1_recall": 0.5539166378964278, "rouge1_recall_stderr": 0.0047292784942071205, "rouge2_fmeasure": 0.05216469490783778, "rouge2_fmeasure_stderr": 0.000960104338852003, "rouge2_precision": 0.030941146425207795, "rouge2_precision_stderr": 0.0006330818483411955, "rouge2_recall": 0.24102157803692914, "rouge2_recall_stderr": 0.0040746001964615045, "rougeL_fmeasure": 0.11606876587008064, "rougeL_fmeasure_stderr": 0.0013292695114441179, "rougeL_precision": 0.06909862720510897, "rougeL_precision_stderr": 0.0009250350731313912, "rougeL_recall": 0.4944830893295965, "rougeL_recall_stderr": 0.0043942354024226, "rougeLsum_fmeasure": 0.11717633383150924, "rougeLsum_fmeasure_stderr": 0.001626596491079524, "rougeLsum_precision": 0.07020253091326639, "rougeLsum_precision_stderr": 0.0011333955676522578, "rougeLsum_recall": 0.4865962462473607, "rougeLsum_recall_stderr": 0.0044821465897910496}}, "3": {"PALM_prompt": {"bleu": 0.4473435878442557, "bleu_stderr": 0.021920475877328035, "rouge1_fmeasure": 0.09638137782837077, "rouge1_fmeasure_stderr": 0.0014829064557002423, "rouge1_precision": 0.060184346315459554, "rouge1_precision_stderr": 0.001087884777571127, "rouge1_recall": 0.3414484174381052, "rouge1_recall_stderr": 0.005172668914728109, "rouge2_fmeasure": 0.038282775688304856, "rouge2_fmeasure_stderr": 0.0008484797038289617, "rouge2_precision": 0.02365049876833611, "rouge2_precision_stderr": 0.0005803565276567791, "rouge2_recall": 0.14942490983015533, "rouge2_recall_stderr": 0.003513250894148731, "rougeL_fmeasure": 0.08646072645548598, "rougeL_fmeasure_stderr": 0.0013059843771721818, "rougeL_precision": 0.05398887767175152, "rougeL_precision_stderr": 0.0009593908114878796, "rougeL_recall": 0.306677112968288, "rougeL_recall_stderr": 0.004574894720404393, "rougeLsum_fmeasure": 0.09025064736984334, "rougeLsum_fmeasure_stderr": 0.0013804266464441872, "rougeLsum_precision": 0.05643326067698096, "rougeLsum_precision_stderr": 0.0010263660613837862, "rougeLsum_recall": 0.31950364177604385, "rougeLsum_recall_stderr": 0.0047753784913865055}, "explicit-graph-description2": {"bleu": 3.364926224666278, "bleu_stderr": 0.15045294250595048, "rouge1_fmeasure": 0.362382378974461, "rouge1_fmeasure_stderr": 0.004512618110179221, "rouge1_precision": 0.3507235841349617, "rouge1_precision_stderr": 0.006063184051971503, "rouge1_recall": 0.5514350155602691, "rouge1_recall_stderr": 0.005182852642362103, "rouge2_fmeasure": 0.18921046750479484, "rouge2_fmeasure_stderr": 0.0033743653265960907, "rouge2_precision": 0.18695632263623327, "rouge2_precision_stderr": 0.004276682318235075, "rouge2_recall": 0.2924425597856253, "rouge2_recall_stderr": 0.004413282563913562, "rougeL_fmeasure": 0.2847450223443878, "rougeL_fmeasure_stderr": 0.0037300768332280904, "rougeL_precision": 0.2760401929409245, "rougeL_precision_stderr": 0.005081600899523718, "rougeL_recall": 0.44615489211207415, "rougeL_recall_stderr": 0.004731816271399687, "rougeLsum_fmeasure": 0.31660483791521354, "rougeLsum_fmeasure_stderr": 0.004003089154824215, "rougeLsum_precision": 0.3065083273455118, "rougeLsum_precision_stderr": 0.005390003916850601, "rougeLsum_recall": 0.4874589762506608, "rougeLsum_recall_stderr": 0.0048935064804825285}, "implicit-graph-description": {"bleu": 1.6393485578889786, "bleu_stderr": 0.05502650807288692, "rouge1_fmeasure": 0.14778204636533104, "rouge1_fmeasure_stderr": 0.0021535294935283475, "rouge1_precision": 0.09384501026064822, "rouge1_precision_stderr": 0.0020410705936949164, "rouge1_recall": 0.5594718362988982, "rouge1_recall_stderr": 0.004835901700325949, "rouge2_fmeasure": 0.07150806038871549, "rouge2_fmeasure_stderr": 0.0013846757650982076, "rouge2_precision": 0.04531974070359654, "rouge2_precision_stderr": 0.0012592530156795225, "rouge2_recall": 0.2943813172867371, "rouge2_recall_stderr": 0.004307507870651324, "rougeL_fmeasure": 0.12572390453967686, "rougeL_fmeasure_stderr": 0.0016646524639872145, "rougeL_precision": 0.07902797606364308, "rougeL_precision_stderr": 0.0015876424977483495, "rougeL_recall": 0.49738396026840304, "rougeL_recall_stderr": 0.004670214814101292, "rougeLsum_fmeasure": 0.12939899168462599, "rougeLsum_fmeasure_stderr": 0.0019528328894905256, "rougeLsum_precision": 0.08205420153622084, "rougeLsum_precision_stderr": 0.0017980621481149767, "rougeLsum_recall": 0.49588185251631867, "rougeLsum_recall_stderr": 0.004634355511167163}, "non-explicit-description": {"bleu": 1.905820614761778, "bleu_stderr": 0.05980040525962388, "rouge1_fmeasure": 0.18531216357578198, "rouge1_fmeasure_stderr": 0.0020715765879598448, "rouge1_precision": 0.11266596272845417, "rouge1_precision_stderr": 0.0014964276623119837, "rouge1_recall": 0.6895629319342107, "rouge1_recall_stderr": 0.0039035968599018256, "rouge2_fmeasure": 0.08163089196343765, "rouge2_fmeasure_stderr": 0.0012508375480738728, "rouge2_precision": 0.0490979915579544, "rouge2_precision_stderr": 0.0008487279552315122, "rouge2_recall": 0.33860846873242856, "rouge2_recall_stderr": 0.004229418238586871, "rougeL_fmeasure": 0.1452630444217927, "rougeL_fmeasure_stderr": 0.001454000064654399, "rougeL_precision": 0.08742234681738852, "rougeL_precision_stderr": 0.0010302549899947866, "rougeL_recall": 0.573502299945498, "rougeL_recall_stderr": 0.004398266028999487, "rougeLsum_fmeasure": 0.15908058820614102, "rougeLsum_fmeasure_stderr": 0.001825274022822222, "rougeLsum_precision": 0.09661481768181748, "rougeLsum_precision_stderr": 0.0013130442610157457, "rougeLsum_recall": 0.6010245565717771, "rougeLsum_recall_stderr": 0.003825525802603016}, "very-explicit-description": {"bleu": 1.4361174109813004, "bleu_stderr": 0.04805971189618587, "rouge1_fmeasure": 0.14844541358150928, "rouge1_fmeasure_stderr": 0.001876503606273822, "rouge1_precision": 0.08857774552753468, "rouge1_precision_stderr": 0.0013096992843302045, "rouge1_recall": 0.6136927754214567, "rouge1_recall_stderr": 0.004518995907883919, "rouge2_fmeasure": 0.06259164725081237, "rouge2_fmeasure_stderr": 0.0010634444975241323, "rouge2_precision": 0.036937522767553584, "rouge2_precision_stderr": 0.0007034293513284538, "rouge2_recall": 0.2927059910268213, "rouge2_recall_stderr": 0.004314941715310091, "rougeL_fmeasure": 0.12422704982040522, "rougeL_fmeasure_stderr": 0.0013236189729225278, "rougeL_precision": 0.07355997157054911, "rougeL_precision_stderr": 0.0009262242992983204, "rougeL_recall": 0.5376590137054139, "rougeL_recall_stderr": 0.004237557928355675, "rougeLsum_fmeasure": 0.12987847358588697, "rougeLsum_fmeasure_stderr": 0.0017109839553052014, "rougeLsum_precision": 0.07743844850036462, "rougeLsum_precision_stderr": 0.0011897972880537623, "rougeLsum_recall": 0.5449619727275773, "rougeLsum_recall_stderr": 0.004481006887033573}}, "4": {"PALM_prompt": {"bleu": 0.42391682641977435, "bleu_stderr": 0.025185202302157747, "rouge1_fmeasure": 0.09021138584698546, "rouge1_fmeasure_stderr": 0.0014520692052325743, "rouge1_precision": 0.056211864304467056, "rouge1_precision_stderr": 0.0010231526331645241, "rouge1_recall": 0.3243551913604374, "rouge1_recall_stderr": 0.005025965693402745, "rouge2_fmeasure": 0.035274756528572794, "rouge2_fmeasure_stderr": 0.0008164665662699456, "rouge2_precision": 0.021734837271928865, "rouge2_precision_stderr": 0.000537789346158442, "rouge2_recall": 0.1381516300501002, "rouge2_recall_stderr": 0.003431494708848335, "rougeL_fmeasure": 0.08124379060939273, "rougeL_fmeasure_stderr": 0.00126253468227921, "rougeL_precision": 0.050592287065444816, "rougeL_precision_stderr": 0.0008807998394512943, "rougeL_recall": 0.29101119493777294, "rougeL_recall_stderr": 0.004368680842712586, "rougeLsum_fmeasure": 0.08470874404935472, "rougeLsum_fmeasure_stderr": 0.001361092950551602, "rougeLsum_precision": 0.052801000471684074, "rougeLsum_precision_stderr": 0.0009509504143137485, "rougeLsum_recall": 0.30284581980574177, "rougeLsum_recall_stderr": 0.004598745476101752}, "explicit-graph-description2": {"bleu": 3.3107172893336636, "bleu_stderr": 0.10227637753147577, "rouge1_fmeasure": 0.347569335324534, "rouge1_fmeasure_stderr": 0.004544187677600413, "rouge1_precision": 0.33093996934970976, "rouge1_precision_stderr": 0.005991487675562594, "rouge1_recall": 0.5449792771957667, "rouge1_recall_stderr": 0.005139692154283905, "rouge2_fmeasure": 0.1826637134726461, "rouge2_fmeasure_stderr": 0.003379682428755349, "rouge2_precision": 0.1780670412943538, "rouge2_precision_stderr": 0.00422080166488958, "rouge2_recall": 0.2914835377835669, "rouge2_recall_stderr": 0.0044150558699339055, "rougeL_fmeasure": 0.2743549921011115, "rougeL_fmeasure_stderr": 0.0037902551916263946, "rougeL_precision": 0.2619538812269815, "rougeL_precision_stderr": 0.0050299249467279095, "rougeL_recall": 0.4421088823491754, "rougeL_recall_stderr": 0.004705889366270654, "rougeLsum_fmeasure": 0.30488899444485074, "rougeLsum_fmeasure_stderr": 0.00406398308359489, "rougeLsum_precision": 0.2899719423937745, "rougeLsum_precision_stderr": 0.00530779434079022, "rougeLsum_recall": 0.4827220286166995, "rougeLsum_recall_stderr": 0.004857508825913098}, "implicit-graph-description": {"bleu": 1.7027172131809374, "bleu_stderr": 0.05464254396711413, "rouge1_fmeasure": 0.15265161943963146, "rouge1_fmeasure_stderr": 0.0023054471134340703, "rouge1_precision": 0.10010463739366897, "rouge1_precision_stderr": 0.002451958960487023, "rouge1_recall": 0.5497072816752133, "rouge1_recall_stderr": 0.005022860494341607, "rouge2_fmeasure": 0.0756101747685517, "rouge2_fmeasure_stderr": 0.0015266803509634318, "rouge2_precision": 0.04962506210717931, "rouge2_precision_stderr": 0.001556539762450491, "rouge2_recall": 0.29783112596150063, "rouge2_recall_stderr": 0.0044507361953258365, "rougeL_fmeasure": 0.12964486472277573, "rougeL_fmeasure_stderr": 0.0018553204638499313, "rougeL_precision": 0.08430872709932842, "rougeL_precision_stderr": 0.0020229841753327373, "rougeL_recall": 0.4865738026311325, "rougeL_recall_stderr": 0.004864018248102004, "rougeLsum_fmeasure": 0.13454630567628098, "rougeLsum_fmeasure_stderr": 0.0021043161354972286, "rougeLsum_precision": 0.08814511726089463, "rougeLsum_precision_stderr": 0.0021838335215906522, "rougeLsum_recall": 0.4895059624812954, "rougeLsum_recall_stderr": 0.0047832687395796825}, "non-explicit-description": {"bleu": 1.9502927707808198, "bleu_stderr": 0.052020548700027196, "rouge1_fmeasure": 0.18934839167613163, "rouge1_fmeasure_stderr": 0.00214187386551334, "rouge1_precision": 0.11665256703134419, "rouge1_precision_stderr": 0.001621644318040768, "rouge1_recall": 0.6784675206475869, "rouge1_recall_stderr": 0.004050953554566147, "rouge2_fmeasure": 0.0843786630967621, "rouge2_fmeasure_stderr": 0.00127055713033487, "rouge2_precision": 0.05132787287975547, "rouge2_precision_stderr": 0.000890879748268458, "rouge2_recall": 0.3387961329017755, "rouge2_recall_stderr": 0.004335693857367359, "rougeL_fmeasure": 0.14739292124783474, "rougeL_fmeasure_stderr": 0.0015062281210773386, "rougeL_precision": 0.08975734228271094, "rougeL_precision_stderr": 0.0011007534157180517, "rougeL_recall": 0.5600296137109959, "rougeL_recall_stderr": 0.004483398306149866, "rougeLsum_fmeasure": 0.16288936969969903, "rougeLsum_fmeasure_stderr": 0.001883263572481922, "rougeLsum_precision": 0.10024251913081043, "rougeLsum_precision_stderr": 0.0014111520702234548, "rougeLsum_recall": 0.5919031167415147, "rougeLsum_recall_stderr": 0.0038677271600221276}, "very-explicit-description": {"bleu": 1.7466592730628139, "bleu_stderr": 0.06692844403557563, "rouge1_fmeasure": 0.17657657718723166, "rouge1_fmeasure_stderr": 0.0020678023491144965, "rouge1_precision": 0.1061802370663775, "rouge1_precision_stderr": 0.001463735554331979, "rouge1_recall": 0.6843250436064637, "rouge1_recall_stderr": 0.003920444970220284, "rouge2_fmeasure": 0.07984816777629551, "rouge2_fmeasure_stderr": 0.0011943148273671116, "rouge2_precision": 0.047497391617128334, "rouge2_precision_stderr": 0.000803679569171621, "rouge2_recall": 0.34713674890385776, "rouge2_recall_stderr": 0.004140778426080258, "rougeL_fmeasure": 0.13715386462196175, "rougeL_fmeasure_stderr": 0.0013567926683826159, "rougeL_precision": 0.08171760415769097, "rougeL_precision_stderr": 0.0009627992963593435, "rougeL_recall": 0.5631982903026473, "rougeL_recall_stderr": 0.003904405962830157, "rougeLsum_fmeasure": 0.15756804588459467, "rougeLsum_fmeasure_stderr": 0.0019178471738246978, "rougeLsum_precision": 0.09467046528011347, "rougeLsum_precision_stderr": 0.0013521830799601206, "rougeLsum_recall": 0.6186687061302965, "rougeLsum_recall_stderr": 0.004013369252904057}}, "5": {"PALM_prompt": {"bleu": 0.4676760272424504, "bleu_stderr": 0.02303026154350977, "rouge1_fmeasure": 0.10005721103740961, "rouge1_fmeasure_stderr": 0.0016082944292364137, "rouge1_precision": 0.06321141539943463, "rouge1_precision_stderr": 0.001223200335826357, "rouge1_recall": 0.34518893665726086, "rouge1_recall_stderr": 0.005046837181813745, "rouge2_fmeasure": 0.04337652772461485, "rouge2_fmeasure_stderr": 0.0009940986270453964, "rouge2_precision": 0.027408388290100973, "rouge2_precision_stderr": 0.0007495147001664794, "rouge2_recall": 0.15813941094305575, "rouge2_recall_stderr": 0.0034339025470937284, "rougeL_fmeasure": 0.09146925689558233, "rougeL_fmeasure_stderr": 0.0014605690131820356, "rougeL_precision": 0.057943890660243344, "rougeL_precision_stderr": 0.0011331942696761865, "rougeL_recall": 0.31252933987652376, "rougeL_recall_stderr": 0.004418336624345426, "rougeLsum_fmeasure": 0.09443075155430865, "rougeLsum_fmeasure_stderr": 0.001535767562485951, "rougeLsum_precision": 0.059802685455499036, "rougeLsum_precision_stderr": 0.0011813042691715043, "rougeLsum_recall": 0.3231337567814217, "rougeLsum_recall_stderr": 0.004620411779037274}, "explicit-graph-description2": {"bleu": 3.3115358177828296, "bleu_stderr": 0.08566639180105731, "rouge1_fmeasure": 0.33762053843234585, "rouge1_fmeasure_stderr": 0.004523525986261017, "rouge1_precision": 0.3193273080501676, "rouge1_precision_stderr": 0.005974116499113547, "rouge1_recall": 0.5422038885918751, "rouge1_recall_stderr": 0.005190435741567349, "rouge2_fmeasure": 0.1794175550575056, "rouge2_fmeasure_stderr": 0.003344405579036441, "rouge2_precision": 0.17378010156605053, "rouge2_precision_stderr": 0.004277219617652875, "rouge2_recall": 0.29377091837736397, "rouge2_recall_stderr": 0.004471597574308531, "rougeL_fmeasure": 0.2687968014233197, "rougeL_fmeasure_stderr": 0.0038430798579204405, "rougeL_precision": 0.2557225993181413, "rougeL_precision_stderr": 0.005171558754142887, "rougeL_recall": 0.4418326332600047, "rougeL_recall_stderr": 0.004753839209134025, "rougeLsum_fmeasure": 0.298462025989663, "rougeLsum_fmeasure_stderr": 0.004099301415037842, "rougeLsum_precision": 0.2829548142815575, "rougeLsum_precision_stderr": 0.005414729183997336, "rougeLsum_recall": 0.48224704322769923, "rougeLsum_recall_stderr": 0.004901420700466331}, "implicit-graph-description": {"bleu": 1.6824798233982887, "bleu_stderr": 0.04389086131276499, "rouge1_fmeasure": 0.15505433986152437, "rouge1_fmeasure_stderr": 0.002379813501347791, "rouge1_precision": 0.10174735745648503, "rouge1_precision_stderr": 0.0024884464079584444, "rouge1_recall": 0.5382688832041609, "rouge1_recall_stderr": 0.004925292282078669, "rouge2_fmeasure": 0.07734137021121697, "rouge2_fmeasure_stderr": 0.001591701564073588, "rouge2_precision": 0.05132100913201794, "rouge2_precision_stderr": 0.0016726572160033631, "rouge2_recall": 0.2917616762862299, "rouge2_recall_stderr": 0.004373173732071418, "rougeL_fmeasure": 0.1300997256412491, "rougeL_fmeasure_stderr": 0.001870157247818775, "rougeL_precision": 0.08461663803530442, "rougeL_precision_stderr": 0.0020159579534064975, "rougeL_recall": 0.4721845695274402, "rougeL_recall_stderr": 0.004823954126413361, "rougeLsum_fmeasure": 0.13698687887451672, "rougeLsum_fmeasure_stderr": 0.002131522469829751, "rougeLsum_precision": 0.08987573937521823, "rougeLsum_precision_stderr": 0.002214880051358518, "rougeLsum_recall": 0.4806380167892775, "rougeLsum_recall_stderr": 0.004741206948484179}, "non-explicit-description": {"bleu": 2.226426427718422, "bleu_stderr": 0.07967413364321198, "rouge1_fmeasure": 0.21838667940415663, "rouge1_fmeasure_stderr": 0.002647140626260128, "rouge1_precision": 0.13927878202099392, "rouge1_precision_stderr": 0.0020669175591838764, "rouge1_recall": 0.685945936711341, "rouge1_recall_stderr": 0.0039463810264872105, "rouge2_fmeasure": 0.10182499744322755, "rouge2_fmeasure_stderr": 0.001652327275662303, "rouge2_precision": 0.06409430762337304, "rouge2_precision_stderr": 0.0011913214183825302, "rouge2_recall": 0.35060843893599064, "rouge2_recall_stderr": 0.004265349732243715, "rougeL_fmeasure": 0.16835209567162987, "rougeL_fmeasure_stderr": 0.0019166110953087441, "rougeL_precision": 0.10609511200918728, "rougeL_precision_stderr": 0.0014594305881602467, "rougeL_recall": 0.5591004118426898, "rougeL_recall_stderr": 0.004296337590666952, "rougeLsum_fmeasure": 0.18842254040460846, "rougeLsum_fmeasure_stderr": 0.002322513232328268, "rougeLsum_precision": 0.11992638153693579, "rougeLsum_precision_stderr": 0.0018004459291189343, "rougeLsum_recall": 0.6004050096251264, "rougeLsum_recall_stderr": 0.0038569357942678053}, "very-explicit-description": {"bleu": 1.8970740397337433, "bleu_stderr": 0.06599212740506186, "rouge1_fmeasure": 0.18316360410296145, "rouge1_fmeasure_stderr": 0.0021020976278752416, "rouge1_precision": 0.1107893274792466, "rouge1_precision_stderr": 0.0015002868666159005, "rouge1_recall": 0.682687883229354, "rouge1_recall_stderr": 0.003949108407059522, "rouge2_fmeasure": 0.08392662509619025, "rouge2_fmeasure_stderr": 0.0012407470721852972, "rouge2_precision": 0.05017272336825218, "rouge2_precision_stderr": 0.0008414952830314042, "rouge2_recall": 0.3508191888121574, "rouge2_recall_stderr": 0.00424171066823648, "rougeL_fmeasure": 0.14161914296738579, "rougeL_fmeasure_stderr": 0.0013917592431469871, "rougeL_precision": 0.08485730190536447, "rougeL_precision_stderr": 0.0009914986606501136, "rougeL_recall": 0.5574721443850629, "rougeL_recall_stderr": 0.003989848256957862, "rougeLsum_fmeasure": 0.16308062545421093, "rougeLsum_fmeasure_stderr": 0.0019353462436310985, "rougeLsum_precision": 0.09855924666321038, "rougeLsum_precision_stderr": 0.0013739812469034743, "rougeLsum_recall": 0.6138525490419964, "rougeLsum_recall_stderr": 0.004009808040489546}}}, "GEM/wiki_lingua_en": {"0": {"article_summary_en": {"bleu": 2.055966635720875, "bleu_stderr": 0.08686542114749111, "rouge1_fmeasure": 0.21366320551854953, "rouge1_fmeasure_stderr": 0.0018302570738398985, "rouge1_precision": 0.17950975675556238, "rouge1_precision_stderr": 0.001914110077898584, "rouge1_recall": 0.3173563571239822, "rouge1_recall_stderr": 0.0026609188157218433, "rouge2_fmeasure": 0.046175339585206684, "rouge2_fmeasure_stderr": 0.0009130716302254005, "rouge2_precision": 0.03857602767598888, "rouge2_precision_stderr": 0.0008009614211947492, "rouge2_recall": 0.07079824622444279, "rouge2_recall_stderr": 0.0015441980491765989, "rougeL_fmeasure": 0.1487525060765686, "rougeL_fmeasure_stderr": 0.0011699273254385182, "rougeL_precision": 0.12327264643230568, "rougeL_precision_stderr": 0.0011861060865238978, "rougeL_recall": 0.2279772883295184, "rougeL_recall_stderr": 0.002052566227453621, "rougeLsum_fmeasure": 0.19795971817373118, "rougeLsum_fmeasure_stderr": 0.0016914247514331865, "rougeLsum_precision": 0.16610983563072396, "rougeLsum_precision_stderr": 0.0017654810205492105, "rougeLsum_recall": 0.29490158184592447, "rougeLsum_recall_stderr": 0.0024968605272683683}, "rephrase_en": {"bleu": 0.5075417133761574, "bleu_stderr": 0.03895909978746295, "rouge1_fmeasure": 0.08828349309152168, "rouge1_fmeasure_stderr": 0.0015115974554693625, "rouge1_precision": 0.07582764050130288, "rouge1_precision_stderr": 0.0014190110762892887, "rouge1_recall": 0.1277933977634297, "rouge1_recall_stderr": 0.0022684792145317057, "rouge2_fmeasure": 0.011691902088528949, "rouge2_fmeasure_stderr": 0.0004869755165002716, "rouge2_precision": 0.009815067973854086, "rouge2_precision_stderr": 0.0004155165586017958, "rouge2_recall": 0.01808910255093881, "rouge2_recall_stderr": 0.0008644066184478445, "rougeL_fmeasure": 0.07791270770462665, "rougeL_fmeasure_stderr": 0.0012475871998712364, "rougeL_precision": 0.06629364537173689, "rougeL_precision_stderr": 0.0011431141940482985, "rougeL_recall": 0.11462267518707464, "rougeL_recall_stderr": 0.0020002236200532, "rougeLsum_fmeasure": 0.08254932336997271, "rougeLsum_fmeasure_stderr": 0.00139813861005186, "rougeLsum_precision": 0.0708523421701339, "rougeLsum_precision_stderr": 0.0013109444182449496, "rougeLsum_recall": 0.11963990635370186, "rougeLsum_recall_stderr": 0.0021050935160865825}, "summarize_above_en": {"bleu": 0.4796152263707372, "bleu_stderr": 0.04020140828753968, "rouge1_fmeasure": 0.11036144862640505, "rouge1_fmeasure_stderr": 0.001530599636812561, "rouge1_precision": 0.15985049689784436, "rouge1_precision_stderr": 0.003621011460260559, "rouge1_recall": 0.13728275328074455, "rouge1_recall_stderr": 0.002052435724642858, "rouge2_fmeasure": 0.013469176886654546, "rouge2_fmeasure_stderr": 0.0005757882354899623, "rouge2_precision": 0.027302086171180434, "rouge2_precision_stderr": 0.0018954157009048764, "rouge2_recall": 0.016538074725194387, "rouge2_recall_stderr": 0.0008215165441681659, "rougeL_fmeasure": 0.09678143768792878, "rougeL_fmeasure_stderr": 0.0012407236257212446, "rougeL_precision": 0.14013802103333775, "rougeL_precision_stderr": 0.0032342892555596208, "rougeL_recall": 0.12221875356209057, "rougeL_recall_stderr": 0.0017527389452773581, "rougeLsum_fmeasure": 0.10348080739372295, "rougeLsum_fmeasure_stderr": 0.0014111115580161292, "rougeLsum_precision": 0.15058787526521, "rougeLsum_precision_stderr": 0.0034756128206833727, "rougeLsum_recall": 0.12914383726578268, "rougeLsum_recall_stderr": 0.0019154210955539695}, "tldr_en": {"bleu": 1.484894171357125, "bleu_stderr": 0.04279862989387049, "rouge1_fmeasure": 0.1751592790344999, "rouge1_fmeasure_stderr": 0.0017874794849447823, "rouge1_precision": 0.1498248405216555, "rouge1_precision_stderr": 0.0018423908624139891, "rouge1_recall": 0.25526647462429297, "rouge1_recall_stderr": 0.002588155452951415, "rouge2_fmeasure": 0.033925070200158246, "rouge2_fmeasure_stderr": 0.0008240877531204439, "rouge2_precision": 0.02866254588676867, "rouge2_precision_stderr": 0.0007271444111163778, "rouge2_recall": 0.05162759513352221, "rouge2_recall_stderr": 0.0014037908842865225, "rougeL_fmeasure": 0.13711284339871613, "rougeL_fmeasure_stderr": 0.001264815697203119, "rougeL_precision": 0.11555678082540916, "rougeL_precision_stderr": 0.0012586001477032337, "rougeL_recall": 0.20559491764149926, "rougeL_recall_stderr": 0.002138172468666332, "rougeLsum_fmeasure": 0.16124043802649227, "rougeLsum_fmeasure_stderr": 0.0016384576957041044, "rougeLsum_precision": 0.13771072102711007, "rougeLsum_precision_stderr": 0.0016874302363856556, "rougeLsum_recall": 0.23587046561796698, "rougeLsum_recall_stderr": 0.0024016834853047104}, "write_abstract_en": {"bleu": 0.07299605737537797, "bleu_stderr": 0.0185742385092221, "rouge1_fmeasure": 0.05960995824024046, "rouge1_fmeasure_stderr": 0.0009498209060065967, "rouge1_precision": 0.05237091241441722, "rouge1_precision_stderr": 0.0009605690159925948, "rouge1_recall": 0.08326091502282784, "rouge1_recall_stderr": 0.001307194264865998, "rouge2_fmeasure": 0.0012597496537706083, "rouge2_fmeasure_stderr": 0.00013199105164764973, "rouge2_precision": 0.0011869125521613468, "rouge2_precision_stderr": 0.00014357479408292278, "rouge2_recall": 0.0016283614649422807, "rouge2_recall_stderr": 0.00016925819879762434, "rougeL_fmeasure": 0.05747907925182631, "rougeL_fmeasure_stderr": 0.0008796408111632294, "rougeL_precision": 0.05023321535529295, "rougeL_precision_stderr": 0.0008739713332714877, "rougeL_recall": 0.08083866411125382, "rougeL_recall_stderr": 0.0012488146734351867, "rougeLsum_fmeasure": 0.05521751020238309, "rougeLsum_fmeasure_stderr": 0.0008638920844959852, "rougeLsum_precision": 0.04836041522625723, "rougeLsum_precision_stderr": 0.0008749326026700274, "rougeLsum_recall": 0.07770833418809467, "rougeLsum_recall_stderr": 0.0012165480160224367}}, "1": {"article_summary_en": {"bleu": 1.8472449992080195, "bleu_stderr": 0.05972184050829182, "rouge1_fmeasure": 0.1917572377223911, "rouge1_fmeasure_stderr": 0.001964109694244107, "rouge1_precision": 0.1645016598415173, "rouge1_precision_stderr": 0.0020439988941701897, "rouge1_recall": 0.2781090142745825, "rouge1_recall_stderr": 0.00283011336228724, "rouge2_fmeasure": 0.040168134051973815, "rouge2_fmeasure_stderr": 0.0008924396157130113, "rouge2_precision": 0.03451158809220734, "rouge2_precision_stderr": 0.000862899520574188, "rouge2_recall": 0.06057288990931606, "rouge2_recall_stderr": 0.0015023541556298495, "rougeL_fmeasure": 0.1418918052948833, "rougeL_fmeasure_stderr": 0.001334398810370599, "rougeL_precision": 0.12025147568699371, "rougeL_precision_stderr": 0.0013845922051473774, "rougeL_recall": 0.21167965788899928, "rougeL_recall_stderr": 0.0022235680991272095, "rougeLsum_fmeasure": 0.17718406703037073, "rougeLsum_fmeasure_stderr": 0.0018199571065040082, "rougeLsum_precision": 0.15192216506378955, "rougeLsum_precision_stderr": 0.0018955299167400376, "rougeLsum_recall": 0.2574177460768254, "rougeLsum_recall_stderr": 0.002636255512192128}, "rephrase_en": {"bleu": 1.2257876930057892, "bleu_stderr": 0.05098211417674737, "rouge1_fmeasure": 0.14743483455043024, "rouge1_fmeasure_stderr": 0.0018102937951181683, "rouge1_precision": 0.12719095564056437, "rouge1_precision_stderr": 0.00177458933509712, "rouge1_recall": 0.21244487177301377, "rouge1_recall_stderr": 0.002669065247115643, "rouge2_fmeasure": 0.022783938421878452, "rouge2_fmeasure_stderr": 0.0007214151081414727, "rouge2_precision": 0.019336079923150524, "rouge2_precision_stderr": 0.0006341515576063969, "rouge2_recall": 0.03414861395030515, "rouge2_recall_stderr": 0.0011732733204742757, "rougeL_fmeasure": 0.1046400973633858, "rougeL_fmeasure_stderr": 0.001187021079229482, "rougeL_precision": 0.08932246378264151, "rougeL_precision_stderr": 0.0011470443498572222, "rougeL_recall": 0.15500596932030242, "rougeL_recall_stderr": 0.001969158666967818, "rougeLsum_fmeasure": 0.13791245377427, "rougeLsum_fmeasure_stderr": 0.0016717888556597065, "rougeLsum_precision": 0.11886530728516936, "rougeLsum_precision_stderr": 0.0016395583632656235, "rougeLsum_recall": 0.19926746759843972, "rougeLsum_recall_stderr": 0.002492327836587634}, "summarize_above_en": {"bleu": 1.466848818979777, "bleu_stderr": 0.05122046143290242, "rouge1_fmeasure": 0.17228713075884045, "rouge1_fmeasure_stderr": 0.0017266755452744172, "rouge1_precision": 0.14983169230821397, "rouge1_precision_stderr": 0.001875244981058499, "rouge1_recall": 0.2495117966397059, "rouge1_recall_stderr": 0.0025670277663872557, "rouge2_fmeasure": 0.029442252351593513, "rouge2_fmeasure_stderr": 0.0007924688244414296, "rouge2_precision": 0.026135894449625118, "rouge2_precision_stderr": 0.000877367958825746, "rouge2_recall": 0.04414741214905309, "rouge2_recall_stderr": 0.0013248833452923223, "rougeL_fmeasure": 0.12211374752033236, "rougeL_fmeasure_stderr": 0.0011442930963585947, "rougeL_precision": 0.10567365961028638, "rougeL_precision_stderr": 0.0013254739675144156, "rougeL_recall": 0.18141239888145783, "rougeL_recall_stderr": 0.0019407502811784976, "rougeLsum_fmeasure": 0.16158045703046053, "rougeLsum_fmeasure_stderr": 0.001603451683480107, "rougeLsum_precision": 0.14040216161795577, "rougeLsum_precision_stderr": 0.0017456836256513886, "rougeLsum_recall": 0.23471174419778962, "rougeLsum_recall_stderr": 0.002419985972720382}, "tldr_en": {"bleu": 2.6577106650236018, "bleu_stderr": 0.07159113573701131, "rouge1_fmeasure": 0.2199694812178742, "rouge1_fmeasure_stderr": 0.002010819735875878, "rouge1_precision": 0.19820460200268925, "rouge1_precision_stderr": 0.002420333361877177, "rouge1_recall": 0.3126809375051542, "rouge1_recall_stderr": 0.002841672003398339, "rouge2_fmeasure": 0.053937307211284244, "rouge2_fmeasure_stderr": 0.001075193073237261, "rouge2_precision": 0.05016728772414271, "rouge2_precision_stderr": 0.001264223371544086, "rouge2_recall": 0.07815311881303431, "rouge2_recall_stderr": 0.0017034183791022172, "rougeL_fmeasure": 0.15551270440786846, "rougeL_fmeasure_stderr": 0.0013566776773220371, "rougeL_precision": 0.13997435500321276, "rougeL_precision_stderr": 0.0017712606376717578, "rougeL_recall": 0.22624387436643148, "rougeL_recall_stderr": 0.0022018166295221108, "rougeLsum_fmeasure": 0.20618836045677164, "rougeLsum_fmeasure_stderr": 0.0018846829640657573, "rougeLsum_precision": 0.18565508806059028, "rougeLsum_precision_stderr": 0.002281622879542436, "rougeLsum_recall": 0.29360401456830726, "rougeLsum_recall_stderr": 0.0026791632190064567}, "write_abstract_en": {"bleu": 1.1223009951089067, "bleu_stderr": 0.05836654112265766, "rouge1_fmeasure": 0.1528764985075287, "rouge1_fmeasure_stderr": 0.0016363848989828594, "rouge1_precision": 0.1343039642172445, "rouge1_precision_stderr": 0.0016732214768695325, "rouge1_recall": 0.21562840393454535, "rouge1_recall_stderr": 0.0023741026821102597, "rouge2_fmeasure": 0.02069013916427929, "rouge2_fmeasure_stderr": 0.0006612995194022327, "rouge2_precision": 0.018046982061484945, "rouge2_precision_stderr": 0.0005964805823451741, "rouge2_recall": 0.030411699104941473, "rouge2_recall_stderr": 0.0011304776755259463, "rougeL_fmeasure": 0.10660240871433446, "rougeL_fmeasure_stderr": 0.0010219647383725968, "rougeL_precision": 0.09295669984502772, "rougeL_precision_stderr": 0.0010567113830758765, "rougeL_recall": 0.15447310700799974, "rougeL_recall_stderr": 0.0017294150710180575, "rougeLsum_fmeasure": 0.1444397478005342, "rougeLsum_fmeasure_stderr": 0.0015191037848864151, "rougeLsum_precision": 0.12687184785116162, "rougeLsum_precision_stderr": 0.0015609172625269986, "rougeLsum_recall": 0.20412846443023683, "rougeLsum_recall_stderr": 0.0022261282748651857}}, "2": {"article_summary_en": {"bleu": 2.1250913371882443, "bleu_stderr": 0.08082994225847336, "rouge1_fmeasure": 0.20405321174077107, "rouge1_fmeasure_stderr": 0.0019056600496191807, "rouge1_precision": 0.17461621417538908, "rouge1_precision_stderr": 0.002031440517917525, "rouge1_recall": 0.2964362262793112, "rouge1_recall_stderr": 0.002726714099481616, "rouge2_fmeasure": 0.04567156337896173, "rouge2_fmeasure_stderr": 0.000932894552583276, "rouge2_precision": 0.03893762449906625, "rouge2_precision_stderr": 0.0008525122868332632, "rouge2_recall": 0.06838081813859713, "rouge2_recall_stderr": 0.0015611393255752124, "rougeL_fmeasure": 0.14991267136518538, "rougeL_fmeasure_stderr": 0.0012964821918338744, "rougeL_precision": 0.12669938791067256, "rougeL_precision_stderr": 0.0013586831428520094, "rougeL_recall": 0.2241091285172054, "rougeL_recall_stderr": 0.002190856032138392, "rougeLsum_fmeasure": 0.188778720271424, "rougeLsum_fmeasure_stderr": 0.0017684778684466625, "rougeLsum_precision": 0.16135042684996723, "rougeLsum_precision_stderr": 0.0018812047542648442, "rougeLsum_recall": 0.2751530306927472, "rougeLsum_recall_stderr": 0.002569597423808091}, "rephrase_en": {"bleu": 2.2113811064047963, "bleu_stderr": 0.08314019884439218, "rouge1_fmeasure": 0.18051522966442277, "rouge1_fmeasure_stderr": 0.002061705179353923, "rouge1_precision": 0.16106457000008795, "rouge1_precision_stderr": 0.002292064983729894, "rouge1_recall": 0.2557520463315039, "rouge1_recall_stderr": 0.002954728954094822, "rouge2_fmeasure": 0.040384981669924526, "rouge2_fmeasure_stderr": 0.0009583825110008918, "rouge2_precision": 0.03652753511175213, "rouge2_precision_stderr": 0.000971777928411179, "rouge2_recall": 0.05873911371100809, "rouge2_recall_stderr": 0.0015215967074205675, "rougeL_fmeasure": 0.13548549449383468, "rougeL_fmeasure_stderr": 0.0014439794973907486, "rougeL_precision": 0.12029197082375237, "rougeL_precision_stderr": 0.0016567061948962215, "rougeL_recall": 0.1961569067857406, "rougeL_recall_stderr": 0.00230187114523418, "rougeLsum_fmeasure": 0.16793091858839554, "rougeLsum_fmeasure_stderr": 0.0019139229473260492, "rougeLsum_precision": 0.14978318904580395, "rougeLsum_precision_stderr": 0.0021402940102766984, "rougeLsum_recall": 0.23874291009412155, "rougeLsum_recall_stderr": 0.002780798318742396}, "summarize_above_en": {"bleu": 2.1241997955779706, "bleu_stderr": 0.07242902905742744, "rouge1_fmeasure": 0.19387864929813403, "rouge1_fmeasure_stderr": 0.0019373404999633163, "rouge1_precision": 0.18601116127457196, "rouge1_precision_stderr": 0.002674455746095039, "rouge1_recall": 0.2697340694293458, "rouge1_recall_stderr": 0.0027730262294329102, "rouge2_fmeasure": 0.042503871652830684, "rouge2_fmeasure_stderr": 0.0009697312508853417, "rouge2_precision": 0.04394534305996648, "rouge2_precision_stderr": 0.0014119764995516235, "rouge2_recall": 0.05981581118058386, "rouge2_recall_stderr": 0.001480173209074023, "rougeL_fmeasure": 0.14644335466030722, "rougeL_fmeasure_stderr": 0.0013693635698585327, "rougeL_precision": 0.14114910909948458, "rougeL_precision_stderr": 0.00212420941260633, "rougeL_recall": 0.2082056428265491, "rougeL_recall_stderr": 0.0022135365734890456, "rougeLsum_fmeasure": 0.18052324198395442, "rougeLsum_fmeasure_stderr": 0.0018002340718791736, "rougeLsum_precision": 0.17326036096446687, "rougeLsum_precision_stderr": 0.0025172862394438143, "rougeLsum_recall": 0.2520550560532216, "rougeLsum_recall_stderr": 0.002612078427237868}, "tldr_en": {"bleu": 2.8678009037418817, "bleu_stderr": 0.037508767497823454, "rouge1_fmeasure": 0.22435918885551207, "rouge1_fmeasure_stderr": 0.0018967794998520616, "rouge1_precision": 0.2156784752655553, "rouge1_precision_stderr": 0.002732677109431224, "rouge1_recall": 0.31237685950514665, "rouge1_recall_stderr": 0.0027963244796416183, "rouge2_fmeasure": 0.05625290668830642, "rouge2_fmeasure_stderr": 0.0010633219356063186, "rouge2_precision": 0.05738474692529928, "rouge2_precision_stderr": 0.0015267876059967937, "rouge2_recall": 0.07970531997370003, "rouge2_recall_stderr": 0.0016700905255101184, "rougeL_fmeasure": 0.15917844337997755, "rougeL_fmeasure_stderr": 0.0013165721234956872, "rougeL_precision": 0.15451467159603913, "rougeL_precision_stderr": 0.002139183021303883, "rougeL_recall": 0.225376744790094, "rougeL_recall_stderr": 0.0021771714204925977, "rougeLsum_fmeasure": 0.2111183297537402, "rougeLsum_fmeasure_stderr": 0.0017924385498513864, "rougeLsum_precision": 0.20314308377906012, "rougeLsum_precision_stderr": 0.002612094861988029, "rougeLsum_recall": 0.29423268640214134, "rougeLsum_recall_stderr": 0.002642162937712342}, "write_abstract_en": {"bleu": 1.1180377312448153, "bleu_stderr": 0.06708589951037792, "rouge1_fmeasure": 0.14316491558968603, "rouge1_fmeasure_stderr": 0.0017639356738778811, "rouge1_precision": 0.12697080832224822, "rouge1_precision_stderr": 0.0018201089851012445, "rouge1_recall": 0.20109011349921127, "rouge1_recall_stderr": 0.0025569830771983836, "rouge2_fmeasure": 0.020465708060419606, "rouge2_fmeasure_stderr": 0.000671449947746698, "rouge2_precision": 0.018246718615969375, "rouge2_precision_stderr": 0.0006418388179326544, "rouge2_recall": 0.029736909566824603, "rouge2_recall_stderr": 0.0011083029361886995, "rougeL_fmeasure": 0.10367762364897098, "rougeL_fmeasure_stderr": 0.0011709222919166911, "rougeL_precision": 0.09139918661616583, "rougeL_precision_stderr": 0.0012236115566095836, "rougeL_recall": 0.14881163413712634, "rougeL_recall_stderr": 0.0018937511898562988, "rougeLsum_fmeasure": 0.1341766534995056, "rougeLsum_fmeasure_stderr": 0.0016336963572172586, "rougeLsum_precision": 0.11904750892035158, "rougeLsum_precision_stderr": 0.0017004264999703578, "rougeLsum_recall": 0.1886874000288314, "rougeLsum_recall_stderr": 0.0023745592779610444}}, "3": {"article_summary_en": {"bleu": 2.198529629762632, "bleu_stderr": 0.065300185227075, "rouge1_fmeasure": 0.17160204720456523, "rouge1_fmeasure_stderr": 0.0022197524833034343, "rouge1_precision": 0.15230744068697982, "rouge1_precision_stderr": 0.0022959707383516507, "rouge1_recall": 0.24820187689883588, "rouge1_recall_stderr": 0.0032867277020760042, "rouge2_fmeasure": 0.03899411935423918, "rouge2_fmeasure_stderr": 0.0009187136392638074, "rouge2_precision": 0.03434424273820165, "rouge2_precision_stderr": 0.0009243231784645324, "rouge2_recall": 0.058823449262171576, "rouge2_recall_stderr": 0.0015444564017563487, "rougeL_fmeasure": 0.12583346122758107, "rougeL_fmeasure_stderr": 0.001558801054784864, "rougeL_precision": 0.11092147540747452, "rougeL_precision_stderr": 0.0016240093919999271, "rougeL_recall": 0.18694301859386991, "rougeL_recall_stderr": 0.0025880219875020465, "rougeLsum_fmeasure": 0.1593908947935989, "rougeLsum_fmeasure_stderr": 0.002057263273890102, "rougeLsum_precision": 0.14135092515979492, "rougeLsum_precision_stderr": 0.0021286008140915364, "rougeLsum_recall": 0.23101489053303867, "rougeLsum_recall_stderr": 0.003064522846659888}, "rephrase_en": {"bleu": 2.4654367545222904, "bleu_stderr": 0.07024206603649888, "rouge1_fmeasure": 0.16139635448455195, "rouge1_fmeasure_stderr": 0.002249446575446669, "rouge1_precision": 0.1533759665863088, "rouge1_precision_stderr": 0.0025577920564815457, "rouge1_recall": 0.2259942077891069, "rouge1_recall_stderr": 0.0033053436745626315, "rouge2_fmeasure": 0.03839113878587267, "rouge2_fmeasure_stderr": 0.0009971143418407548, "rouge2_precision": 0.03725286699889529, "rouge2_precision_stderr": 0.0012332919944079443, "rouge2_recall": 0.05566813880650291, "rouge2_recall_stderr": 0.0016306556541296991, "rougeL_fmeasure": 0.12314296805478218, "rougeL_fmeasure_stderr": 0.0016547329508161137, "rougeL_precision": 0.11690535506105171, "rougeL_precision_stderr": 0.0019433797646747934, "rougeL_recall": 0.17591155413008638, "rougeL_recall_stderr": 0.0026478755663225566, "rougeLsum_fmeasure": 0.15022458367211336, "rougeLsum_fmeasure_stderr": 0.0020979925530018205, "rougeLsum_precision": 0.14302393847897105, "rougeLsum_precision_stderr": 0.002412541062432112, "rougeLsum_recall": 0.21074872905474742, "rougeLsum_recall_stderr": 0.003105762840068013}, "summarize_above_en": {"bleu": 2.452893619605998, "bleu_stderr": 0.07535908896046421, "rouge1_fmeasure": 0.16464113343789846, "rouge1_fmeasure_stderr": 0.002337449087567628, "rouge1_precision": 0.17904260819617532, "rouge1_precision_stderr": 0.0034025414878592264, "rouge1_recall": 0.22026524786669358, "rouge1_recall_stderr": 0.0032630989926899674, "rouge2_fmeasure": 0.03902103124926395, "rouge2_fmeasure_stderr": 0.0010593881502234943, "rouge2_precision": 0.04756210696143349, "rouge2_precision_stderr": 0.0019178497340389886, "rouge2_recall": 0.05249440732325283, "rouge2_recall_stderr": 0.001516559702111135, "rougeL_fmeasure": 0.1254589966504649, "rougeL_fmeasure_stderr": 0.0017313167474427463, "rougeL_precision": 0.13877691598479244, "rougeL_precision_stderr": 0.002822308452152884, "rougeL_recall": 0.17071662509195945, "rougeL_recall_stderr": 0.0025830517349758665, "rougeLsum_fmeasure": 0.15262792584886165, "rougeLsum_fmeasure_stderr": 0.002174966680287157, "rougeLsum_precision": 0.16705087756637185, "rougeLsum_precision_stderr": 0.003249700349640652, "rougeLsum_recall": 0.20445349405368257, "rougeLsum_recall_stderr": 0.003053057869293393}, "tldr_en": {"bleu": 2.9935371246792863, "bleu_stderr": 0.08389052572374653, "rouge1_fmeasure": 0.18429548803283216, "rouge1_fmeasure_stderr": 0.002297153333126352, "rouge1_precision": 0.18405392984121038, "rouge1_precision_stderr": 0.0029940021228191355, "rouge1_recall": 0.25514944671406053, "rouge1_recall_stderr": 0.0033459375347498267, "rouge2_fmeasure": 0.04688317854067561, "rouge2_fmeasure_stderr": 0.001094346737838606, "rouge2_precision": 0.04819840246533481, "rouge2_precision_stderr": 0.0015092856540081493, "rouge2_recall": 0.06631560044043883, "rouge2_recall_stderr": 0.0016628335172097389, "rougeL_fmeasure": 0.13173075629033756, "rougeL_fmeasure_stderr": 0.0016475788902520988, "rougeL_precision": 0.13375051095662321, "rougeL_precision_stderr": 0.002380892659084607, "rougeL_recall": 0.1853146721328658, "rougeL_recall_stderr": 0.0025854840001956502, "rougeLsum_fmeasure": 0.1733778342320142, "rougeLsum_fmeasure_stderr": 0.0021681294491408274, "rougeLsum_precision": 0.1733338698061136, "rougeLsum_precision_stderr": 0.0028479993519003807, "rougeLsum_recall": 0.2401891556702282, "rougeLsum_recall_stderr": 0.003169734302090452}, "write_abstract_en": {"bleu": 1.2836466923526264, "bleu_stderr": 0.07070516293414274, "rouge1_fmeasure": 0.10634798502269208, "rouge1_fmeasure_stderr": 0.0019791669918603184, "rouge1_precision": 0.09887369502675063, "rouge1_precision_stderr": 0.002043227921916099, "rouge1_recall": 0.14678993430035586, "rouge1_recall_stderr": 0.0028179673635622318, "rouge2_fmeasure": 0.01711652805678942, "rouge2_fmeasure_stderr": 0.0006882433105651278, "rouge2_precision": 0.01567989332792064, "rouge2_precision_stderr": 0.0006694052515306127, "rouge2_recall": 0.024427777089035076, "rouge2_recall_stderr": 0.0010678311796068322, "rougeL_fmeasure": 0.07999450335755226, "rougeL_fmeasure_stderr": 0.0013938218605260453, "rougeL_precision": 0.07436580031645514, "rougeL_precision_stderr": 0.001457445017450415, "rougeL_recall": 0.11271151612770153, "rougeL_recall_stderr": 0.0021453041661070715, "rougeLsum_fmeasure": 0.09936266385012239, "rougeLsum_fmeasure_stderr": 0.0018384025373419349, "rougeLsum_precision": 0.09222600214130718, "rougeLsum_precision_stderr": 0.0018861337709654303, "rougeLsum_recall": 0.13747754955861127, "rougeLsum_recall_stderr": 0.002634806979634438}}, "4": {"article_summary_en": {"bleu": 0.520053083116806, "bleu_stderr": 0.039623345914897065, "rouge1_fmeasure": 0.054987533485356926, "rouge1_fmeasure_stderr": 0.001890886367853537, "rouge1_precision": 0.05083357669029093, "rouge1_precision_stderr": 0.0019357918297942475, "rouge1_recall": 0.08244342719669248, "rouge1_recall_stderr": 0.0028813872714789964, "rouge2_fmeasure": 0.012617379158558721, "rouge2_fmeasure_stderr": 0.0006313655807193013, "rouge2_precision": 0.01117210874945532, "rouge2_precision_stderr": 0.0005995255967837975, "rouge2_recall": 0.0202786766574289, "rouge2_recall_stderr": 0.00114540984571034, "rougeL_fmeasure": 0.04157458484226149, "rougeL_fmeasure_stderr": 0.0014009278175922768, "rougeL_precision": 0.038366665167606055, "rougeL_precision_stderr": 0.0014542518656553282, "rougeL_recall": 0.06396990192776628, "rougeL_recall_stderr": 0.0022687929818678404, "rougeLsum_fmeasure": 0.0506618129825057, "rougeLsum_fmeasure_stderr": 0.0017421779361228911, "rougeLsum_precision": 0.0469155306552092, "rougeLsum_precision_stderr": 0.0017939164451876647, "rougeLsum_recall": 0.07620716657990878, "rougeLsum_recall_stderr": 0.002680492989639914}, "rephrase_en": {"bleu": 0.4508825944565298, "bleu_stderr": 0.03559978337994854, "rouge1_fmeasure": 0.05011197616686206, "rouge1_fmeasure_stderr": 0.0018346039701914467, "rouge1_precision": 0.05028568853458266, "rouge1_precision_stderr": 0.00208707409282097, "rouge1_recall": 0.07186995080563745, "rouge1_recall_stderr": 0.002683339801036182, "rouge2_fmeasure": 0.012281079448761325, "rouge2_fmeasure_stderr": 0.000653869447400363, "rouge2_precision": 0.01253202087368826, "rouge2_precision_stderr": 0.0007848832756426618, "rouge2_recall": 0.017990065250321798, "rouge2_recall_stderr": 0.001031089985634214, "rougeL_fmeasure": 0.03907293229046315, "rougeL_fmeasure_stderr": 0.0013997352977999286, "rougeL_precision": 0.039231023055013595, "rougeL_precision_stderr": 0.0016071013078014019, "rougeL_recall": 0.05718233871131803, "rougeL_recall_stderr": 0.0021546633328687506, "rougeLsum_fmeasure": 0.04654151233042153, "rougeLsum_fmeasure_stderr": 0.0017091787582805187, "rougeLsum_precision": 0.04676947454126935, "rougeLsum_precision_stderr": 0.0019385598696383512, "rougeLsum_recall": 0.0667844061216085, "rougeLsum_recall_stderr": 0.0025123745789632553}, "summarize_above_en": {"bleu": 0.2501992622083412, "bleu_stderr": 0.029972656605070418, "rouge1_fmeasure": 0.04778674517582264, "rouge1_fmeasure_stderr": 0.0018365614759514285, "rouge1_precision": 0.05505736228678805, "rouge1_precision_stderr": 0.0024837468725238195, "rouge1_recall": 0.06470336830124988, "rouge1_recall_stderr": 0.002537880582413385, "rouge2_fmeasure": 0.01084532274916111, "rouge2_fmeasure_stderr": 0.0006463021300909807, "rouge2_precision": 0.014779446264054693, "rouge2_precision_stderr": 0.0012182175752722738, "rouge2_recall": 0.014759184580045196, "rouge2_recall_stderr": 0.0009302529142526923, "rougeL_fmeasure": 0.03728476607430857, "rougeL_fmeasure_stderr": 0.001410339676510439, "rougeL_precision": 0.04401215074366241, "rougeL_precision_stderr": 0.0020765428405668932, "rougeL_recall": 0.05148347306396624, "rougeL_recall_stderr": 0.0020437326538755756, "rougeLsum_fmeasure": 0.044201424272258295, "rougeLsum_fmeasure_stderr": 0.0017014625957187778, "rougeLsum_precision": 0.051354143409556074, "rougeLsum_precision_stderr": 0.0023530881508590775, "rougeLsum_recall": 0.05990804305583379, "rougeLsum_recall_stderr": 0.0023627429149512}, "tldr_en": {"bleu": 0.5461443377554994, "bleu_stderr": 0.044799916123802616, "rouge1_fmeasure": 0.05659885761358278, "rouge1_fmeasure_stderr": 0.001930223829324733, "rouge1_precision": 0.056788240211910736, "rouge1_precision_stderr": 0.0022506997974891542, "rouge1_recall": 0.08339625174295712, "rouge1_recall_stderr": 0.0029290040087097207, "rouge2_fmeasure": 0.013810868807903593, "rouge2_fmeasure_stderr": 0.0006930003786904605, "rouge2_precision": 0.014380667558554284, "rouge2_precision_stderr": 0.000977357379524512, "rouge2_recall": 0.02101316087357165, "rouge2_recall_stderr": 0.0011619283931715888, "rougeL_fmeasure": 0.042007409754473535, "rougeL_fmeasure_stderr": 0.0014258890197756744, "rougeL_precision": 0.04257610439946902, "rougeL_precision_stderr": 0.0017464834049321358, "rougeL_recall": 0.06302882516092573, "rougeL_recall_stderr": 0.0022681193452949904, "rougeLsum_fmeasure": 0.05299986098950862, "rougeLsum_fmeasure_stderr": 0.0018068900698462816, "rougeLsum_precision": 0.05318358596123625, "rougeLsum_precision_stderr": 0.0021166691321526723, "rougeLsum_recall": 0.07838773551352705, "rougeLsum_recall_stderr": 0.002768443855799422}, "write_abstract_en": {"bleu": 0.0870839562978014, "bleu_stderr": 0.006785360071011486, "rouge1_fmeasure": 0.02713454046170108, "rouge1_fmeasure_stderr": 0.0012975310299864612, "rouge1_precision": 0.025095421938052895, "rouge1_precision_stderr": 0.0012461801615902318, "rouge1_recall": 0.039170985603147426, "rouge1_recall_stderr": 0.0019196761418163904, "rouge2_fmeasure": 0.0045440484711843825, "rouge2_fmeasure_stderr": 0.0003623167516261232, "rouge2_precision": 0.0040053841270205085, "rouge2_precision_stderr": 0.0003368144516166286, "rouge2_recall": 0.006847830102416033, "rouge2_recall_stderr": 0.0005830833698761628, "rougeL_fmeasure": 0.020373823655558135, "rougeL_fmeasure_stderr": 0.0009374343240357042, "rougeL_precision": 0.01906700487359207, "rougeL_precision_stderr": 0.0009287450742508868, "rougeL_recall": 0.029866238646993826, "rougeL_recall_stderr": 0.0014553209547808461, "rougeLsum_fmeasure": 0.02507998631189558, "rougeLsum_fmeasure_stderr": 0.0011959812138736497, "rougeLsum_precision": 0.023282946639617354, "rougeLsum_precision_stderr": 0.001159003398883688, "rougeLsum_recall": 0.036183810616452226, "rougeLsum_recall_stderr": 0.0017667839555253213}}, "5": {"article_summary_en": {"bleu": 5.517328369410883e-07, "bleu_stderr": 1.0386230327964954e-06, "rouge1_fmeasure": 0.008446934607939634, "rouge1_fmeasure_stderr": 0.000836413601735138, "rouge1_precision": 0.00861372538113473, "rouge1_precision_stderr": 0.001009916194724285, "rouge1_recall": 0.01235013958324705, "rouge1_recall_stderr": 0.001203646367559825, "rouge2_fmeasure": 0.0019124752375349172, "rouge2_fmeasure_stderr": 0.00025472485058213127, "rouge2_precision": 0.001811642633420617, "rouge2_precision_stderr": 0.00027281114466191735, "rouge2_recall": 0.0027137133448325683, "rouge2_recall_stderr": 0.0003698010920981975, "rougeL_fmeasure": 0.00624239280071284, "rougeL_fmeasure_stderr": 0.0006013075457035342, "rougeL_precision": 0.006548532654933444, "rougeL_precision_stderr": 0.0008249280853205021, "rougeL_recall": 0.009381116428004776, "rougeL_recall_stderr": 0.0009128471950554741, "rougeLsum_fmeasure": 0.007713721590588463, "rougeLsum_fmeasure_stderr": 0.0007579891955786283, "rougeLsum_precision": 0.00793806088766393, "rougeLsum_precision_stderr": 0.0009498773132576785, "rougeLsum_recall": 0.011369439233893231, "rougeLsum_recall_stderr": 0.001105032966952515}, "rephrase_en": {"bleu": 5.6409965833073724e-08, "bleu_stderr": 1.1870470936219164e-07, "rouge1_fmeasure": 0.0073426086769494935, "rouge1_fmeasure_stderr": 0.0007675421686666748, "rouge1_precision": 0.007347042331656005, "rouge1_precision_stderr": 0.000855877168700121, "rouge1_recall": 0.010778250545529063, "rouge1_recall_stderr": 0.0011521636177528214, "rouge2_fmeasure": 0.0017130160893721097, "rouge2_fmeasure_stderr": 0.00024830222441035233, "rouge2_precision": 0.001710980048638282, "rouge2_precision_stderr": 0.0002755932507822362, "rouge2_recall": 0.002533909675938538, "rouge2_recall_stderr": 0.0003864838960639161, "rougeL_fmeasure": 0.005567943512045762, "rougeL_fmeasure_stderr": 0.0005784419503900142, "rougeL_precision": 0.005509333596115347, "rougeL_precision_stderr": 0.0006321643878647678, "rougeL_recall": 0.00833978450728592, "rougeL_recall_stderr": 0.0009079115070139992, "rougeLsum_fmeasure": 0.006800973372764665, "rougeLsum_fmeasure_stderr": 0.0007105353563633134, "rougeLsum_precision": 0.006862990886278549, "rougeLsum_precision_stderr": 0.0008064303620221591, "rougeLsum_recall": 0.009971153288718336, "rougeLsum_recall_stderr": 0.0010652484370260787}, "summarize_above_en": {"bleu": 5.897323116804045e-10, "bleu_stderr": 1.7226783506844502e-09, "rouge1_fmeasure": 0.006582381210133649, "rouge1_fmeasure_stderr": 0.000736046358117549, "rouge1_precision": 0.007051967760789057, "rouge1_precision_stderr": 0.000875522796789484, "rouge1_recall": 0.008622048628992848, "rouge1_recall_stderr": 0.000979421104357186, "rouge2_fmeasure": 0.0013869187743711499, "rouge2_fmeasure_stderr": 0.00023793902364453525, "rouge2_precision": 0.0017517411283309776, "rouge2_precision_stderr": 0.00042676011787971466, "rouge2_recall": 0.0017169194523458937, "rouge2_recall_stderr": 0.0002723182367088392, "rougeL_fmeasure": 0.005255733664967675, "rougeL_fmeasure_stderr": 0.0005774560163471757, "rougeL_precision": 0.005694743781016978, "rougeL_precision_stderr": 0.0007231881422995928, "rougeL_recall": 0.007034117275652774, "rougeL_recall_stderr": 0.0008076270617686096, "rougeLsum_fmeasure": 0.0061682356318462596, "rougeLsum_fmeasure_stderr": 0.0006919242043320082, "rougeLsum_precision": 0.006656694900838437, "rougeLsum_precision_stderr": 0.0008364652583978511, "rougeLsum_recall": 0.008085611227571524, "rougeLsum_recall_stderr": 0.0009219645931116729}, "tldr_en": {"bleu": 1.0112557087204399e-06, "bleu_stderr": 2.4150938941127893e-06, "rouge1_fmeasure": 0.009231042835921754, "rouge1_fmeasure_stderr": 0.0008787834704319035, "rouge1_precision": 0.008911405879685974, "rouge1_precision_stderr": 0.0009422475593785767, "rouge1_recall": 0.014063917251247313, "rouge1_recall_stderr": 0.001396048114694196, "rouge2_fmeasure": 0.0024939038248004536, "rouge2_fmeasure_stderr": 0.00031409966961336354, "rouge2_precision": 0.0023383395560896076, "rouge2_precision_stderr": 0.0003211681514690954, "rouge2_recall": 0.004206929070053043, "rouge2_recall_stderr": 0.0006053899081506379, "rougeL_fmeasure": 0.007056657402327181, "rougeL_fmeasure_stderr": 0.0006761968053255695, "rougeL_precision": 0.006838517996278174, "rougeL_precision_stderr": 0.0007309899843193323, "rougeL_recall": 0.010882093390231358, "rougeL_recall_stderr": 0.0011052580830821516, "rougeLsum_fmeasure": 0.008623638497104687, "rougeLsum_fmeasure_stderr": 0.0008181729546128778, "rougeLsum_precision": 0.008357520892672053, "rougeLsum_precision_stderr": 0.0008835066902880334, "rougeLsum_recall": 0.013172143042073951, "rougeLsum_recall_stderr": 0.0013147147386398047}, "write_abstract_en": {"bleu": 8.093999898733972e-15, "bleu_stderr": 1.0779878610297242e-14, "rouge1_fmeasure": 0.00249534987383561, "rouge1_fmeasure_stderr": 0.0003873473209623069, "rouge1_precision": 0.0021405897631643946, "rouge1_precision_stderr": 0.0003541239789070288, "rouge1_recall": 0.0038253034844642347, "rouge1_recall_stderr": 0.0005823536138290148, "rouge2_fmeasure": 0.00036639163657650276, "rouge2_fmeasure_stderr": 0.00010049770707316975, "rouge2_precision": 0.0003169846199202593, "rouge2_precision_stderr": 8.619486969926879e-05, "rouge2_recall": 0.0005250548080533436, "rouge2_recall_stderr": 0.00015216407757586244, "rougeL_fmeasure": 0.0018743787593694622, "rougeL_fmeasure_stderr": 0.0002863935366559193, "rougeL_precision": 0.001603458706659755, "rougeL_precision_stderr": 0.0002602089443308959, "rougeL_recall": 0.002861894235467422, "rougeL_recall_stderr": 0.0004267622874298756, "rougeLsum_fmeasure": 0.0023091312121054073, "rougeLsum_fmeasure_stderr": 0.0003592857430542886, "rougeLsum_precision": 0.001980619561518416, "rougeLsum_precision_stderr": 0.0003290309068301346, "rougeLsum_recall": 0.0035607312643202195, "rougeLsum_recall_stderr": 0.0005431230446018559}}}, "anli_r1": {"0": {"GPT-3 style": {"acc": 0.334, "acc_norm": 0.334, "acc_norm_stderr": 0.014922019523732958, "acc_stderr": 0.014922019523732954, "subset": 1}, "MNLI crowdsource": {"acc": 0.337, "acc_norm": 0.331, "acc_norm_stderr": 0.01488827258820394, "acc_stderr": 0.014955087918653598, "subset": 1}, "can we infer": {"acc": 0.335, "acc_norm": 0.339, "acc_norm_stderr": 0.014976758771620347, "acc_stderr": 0.014933117490932573, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.349, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229859, "acc_stderr": 0.015080663991563098, "subset": 1}, "justified in saying": {"acc": 0.339, "acc_norm": 0.329, "acc_norm_stderr": 0.014865395385928359, "acc_stderr": 0.014976758771620347, "subset": 1}}, "1": {"GPT-3 style": {"acc": 0.324, "acc_norm": 0.329, "acc_norm_stderr": 0.014865395385928364, "acc_stderr": 0.01480686473373886, "subset": 1}, "MNLI crowdsource": {"acc": 0.332, "acc_norm": 0.334, "acc_norm_stderr": 0.014922019523732961, "acc_stderr": 0.014899597242811485, "subset": 1}, "can we infer": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014910846164229863, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014910846164229863, "subset": 1}, "justified in saying": {"acc": 0.332, "acc_norm": 0.332, "acc_norm_stderr": 0.014899597242811487, "acc_stderr": 0.014899597242811485, "subset": 1}}, "2": {"GPT-3 style": {"acc": 0.352, "acc_norm": 0.361, "acc_norm_stderr": 0.015195720118175117, "acc_stderr": 0.01511040450564866, "subset": 1}, "MNLI crowdsource": {"acc": 0.337, "acc_norm": 0.329, "acc_norm_stderr": 0.014865395385928362, "acc_stderr": 0.014955087918653595, "subset": 1}, "can we infer": {"acc": 0.345, "acc_norm": 0.342, "acc_norm_stderr": 0.015008706182121731, "acc_stderr": 0.015039986742055235, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.33, "acc_norm": 0.345, "acc_norm_stderr": 0.015039986742055235, "acc_stderr": 0.014876872027456732, "subset": 1}, "justified in saying": {"acc": 0.356, "acc_norm": 0.343, "acc_norm_stderr": 0.015019206922356953, "acc_stderr": 0.015149042659306625, "subset": 1}}, "3": {"GPT-3 style": {"acc": 0.343, "acc_norm": 0.361, "acc_norm_stderr": 0.015195720118175113, "acc_stderr": 0.015019206922356951, "subset": 1}, "MNLI crowdsource": {"acc": 0.352, "acc_norm": 0.35, "acc_norm_stderr": 0.015090650341444233, "acc_stderr": 0.015110404505648658, "subset": 1}, "can we infer": {"acc": 0.359, "acc_norm": 0.349, "acc_norm_stderr": 0.0150806639915631, "acc_stderr": 0.015177264224798597, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.325, "acc_norm": 0.329, "acc_norm_stderr": 0.014865395385928357, "acc_stderr": 0.014818724459095526, "subset": 1}, "justified in saying": {"acc": 0.35, "acc_norm": 0.345, "acc_norm_stderr": 0.015039986742055235, "acc_stderr": 0.015090650341444236, "subset": 1}}, "4": {"GPT-3 style": {"acc": 0.338, "acc_norm": 0.338, "acc_norm_stderr": 0.014965960710224475, "acc_stderr": 0.014965960710224472, "subset": 1}, "MNLI crowdsource": {"acc": 0.341, "acc_norm": 0.353, "acc_norm_stderr": 0.01512017260548369, "acc_stderr": 0.014998131348402707, "subset": 1}, "can we infer": {"acc": 0.337, "acc_norm": 0.341, "acc_norm_stderr": 0.014998131348402714, "acc_stderr": 0.01495508791865359, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.331, "acc_norm": 0.329, "acc_norm_stderr": 0.014865395385928359, "acc_stderr": 0.014888272588203943, "subset": 1}, "justified in saying": {"acc": 0.328, "acc_norm": 0.344, "acc_norm_stderr": 0.015029633724408948, "acc_stderr": 0.014853842487270334, "subset": 1}}, "5": {"GPT-3 style": {"acc": 0.348, "acc_norm": 0.33, "acc_norm_stderr": 0.014876872027456736, "acc_stderr": 0.01507060460376841, "subset": 1}, "MNLI crowdsource": {"acc": 0.356, "acc_norm": 0.351, "acc_norm_stderr": 0.015100563798316405, "acc_stderr": 0.01514904265930662, "subset": 1}, "can we infer": {"acc": 0.337, "acc_norm": 0.345, "acc_norm_stderr": 0.015039986742055237, "acc_stderr": 0.0149550879186536, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229857, "acc_stderr": 0.014910846164229857, "subset": 1}, "justified in saying": {"acc": 0.327, "acc_norm": 0.337, "acc_norm_stderr": 0.01495508791865359, "acc_stderr": 0.014842213153411237, "subset": 1}}}, "anli_r2": {"0": {"GPT-3 style": {"acc": 0.333, "acc_norm": 0.35, "acc_norm_stderr": 0.015090650341444233, "acc_stderr": 0.014910846164229871, "subset": 2}, "MNLI crowdsource": {"acc": 0.325, "acc_norm": 0.324, "acc_norm_stderr": 0.014806864733738854, "acc_stderr": 0.014818724459095526, "subset": 2}, "can we infer": {"acc": 0.332, "acc_norm": 0.334, "acc_norm_stderr": 0.014922019523732963, "acc_stderr": 0.01489959724281148, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.311, "acc_norm": 0.334, "acc_norm_stderr": 0.014922019523732972, "acc_stderr": 0.014645596385722692, "subset": 2}, "justified in saying": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.01491084616422987, "acc_stderr": 0.014910846164229859, "subset": 2}}, "1": {"GPT-3 style": {"acc": 0.314, "acc_norm": 0.321, "acc_norm_stderr": 0.014770821817934645, "acc_stderr": 0.014683991951087971, "subset": 2}, "MNLI crowdsource": {"acc": 0.319, "acc_norm": 0.316, "acc_norm_stderr": 0.01470919305605713, "acc_stderr": 0.014746404865473479, "subset": 2}, "can we infer": {"acc": 0.315, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014696631960792506, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.315, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014696631960792506, "subset": 2}, "justified in saying": {"acc": 0.32, "acc_norm": 0.317, "acc_norm_stderr": 0.014721675438880215, "acc_stderr": 0.014758652303574888, "subset": 2}}, "2": {"GPT-3 style": {"acc": 0.334, "acc_norm": 0.331, "acc_norm_stderr": 0.014888272588203922, "acc_stderr": 0.014922019523732954, "subset": 2}, "MNLI crowdsource": {"acc": 0.339, "acc_norm": 0.331, "acc_norm_stderr": 0.014888272588203926, "acc_stderr": 0.014976758771620335, "subset": 2}, "can we infer": {"acc": 0.323, "acc_norm": 0.322, "acc_norm_stderr": 0.014782913600996686, "acc_stderr": 0.014794927843348628, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.335, "acc_norm": 0.331, "acc_norm_stderr": 0.014888272588203933, "acc_stderr": 0.014933117490932575, "subset": 2}, "justified in saying": {"acc": 0.322, "acc_norm": 0.312, "acc_norm_stderr": 0.014658474370509015, "acc_stderr": 0.014782913600996686, "subset": 2}}, "3": {"GPT-3 style": {"acc": 0.325, "acc_norm": 0.337, "acc_norm_stderr": 0.0149550879186536, "acc_stderr": 0.014818724459095526, "subset": 2}, "MNLI crowdsource": {"acc": 0.314, "acc_norm": 0.313, "acc_norm_stderr": 0.014671272822977886, "acc_stderr": 0.014683991951087973, "subset": 2}, "can we infer": {"acc": 0.321, "acc_norm": 0.319, "acc_norm_stderr": 0.014746404865473479, "acc_stderr": 0.014770821817934654, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.335, "acc_norm": 0.332, "acc_norm_stderr": 0.014899597242811485, "acc_stderr": 0.014933117490932577, "subset": 2}, "justified in saying": {"acc": 0.322, "acc_norm": 0.312, "acc_norm_stderr": 0.014658474370509015, "acc_stderr": 0.014782913600996686, "subset": 2}}, "4": {"GPT-3 style": {"acc": 0.311, "acc_norm": 0.304, "acc_norm_stderr": 0.014553205687950453, "acc_stderr": 0.014645596385722692, "subset": 2}, "MNLI crowdsource": {"acc": 0.303, "acc_norm": 0.31, "acc_norm_stderr": 0.014632638658632903, "acc_stderr": 0.014539683710535265, "subset": 2}, "can we infer": {"acc": 0.332, "acc_norm": 0.31, "acc_norm_stderr": 0.014632638658632896, "acc_stderr": 0.014899597242811492, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.333, "acc_norm": 0.337, "acc_norm_stderr": 0.0149550879186536, "acc_stderr": 0.014910846164229868, "subset": 2}, "justified in saying": {"acc": 0.331, "acc_norm": 0.31, "acc_norm_stderr": 0.014632638658632896, "acc_stderr": 0.014888272588203928, "subset": 2}}, "5": {"GPT-3 style": {"acc": 0.313, "acc_norm": 0.319, "acc_norm_stderr": 0.014746404865473477, "acc_stderr": 0.014671272822977885, "subset": 2}, "MNLI crowdsource": {"acc": 0.305, "acc_norm": 0.315, "acc_norm_stderr": 0.0146966319607925, "acc_stderr": 0.014566646394664397, "subset": 2}, "can we infer": {"acc": 0.326, "acc_norm": 0.316, "acc_norm_stderr": 0.014709193056057127, "acc_stderr": 0.014830507204541035, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.333, "acc_norm": 0.336, "acc_norm_stderr": 0.014944140233795018, "acc_stderr": 0.01491084616422987, "subset": 2}, "justified in saying": {"acc": 0.319, "acc_norm": 0.322, "acc_norm_stderr": 0.01478291360099667, "acc_stderr": 0.014746404865473475, "subset": 2}}}, "anli_r3": {"0": {"GPT-3 style": {"acc": 0.33416666666666667, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.013613950010225598, "acc_stderr": 0.013622434813136774, "subset": 3}, "MNLI crowdsource": {"acc": 0.33416666666666667, "acc_norm": 0.32416666666666666, "acc_norm_stderr": 0.013517438120881645, "acc_stderr": 0.01362243481313677, "subset": 3}, "can we infer": {"acc": 0.32666666666666666, "acc_norm": 0.32666666666666666, "acc_norm_stderr": 0.013544340907003665, "acc_stderr": 0.013544340907003665, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.31583333333333335, "acc_norm": 0.32666666666666666, "acc_norm_stderr": 0.013544340907003663, "acc_stderr": 0.013424568830356452, "subset": 3}, "justified in saying": {"acc": 0.3358333333333333, "acc_norm": 0.3283333333333333, "acc_norm_stderr": 0.013562032919529014, "acc_stderr": 0.013639261190932882, "subset": 3}}, "1": {"GPT-3 style": {"acc": 0.3275, "acc_norm": 0.32666666666666666, "acc_norm_stderr": 0.013544340907003663, "acc_stderr": 0.013553211167251947, "subset": 3}, "MNLI crowdsource": {"acc": 0.3333333333333333, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.013647602942406393, "acc_stderr": 0.013613950010225606, "subset": 3}, "can we infer": {"acc": 0.33666666666666667, "acc_norm": 0.3383333333333333, "acc_norm_stderr": 0.013664144006618265, "acc_stderr": 0.013647602942406393, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.33666666666666667, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.013647602942406393, "acc_stderr": 0.013647602942406393, "subset": 3}, "justified in saying": {"acc": 0.33416666666666667, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.013647602942406393, "acc_stderr": 0.013622434813136774, "subset": 3}}, "2": {"GPT-3 style": {"acc": 0.33, "acc_norm": 0.33416666666666667, "acc_norm_stderr": 0.013622434813136772, "acc_stderr": 0.013579531277800917, "subset": 3}, "MNLI crowdsource": {"acc": 0.31583333333333335, "acc_norm": 0.305, "acc_norm_stderr": 0.013296358936471103, "acc_stderr": 0.01342456883035645, "subset": 3}, "can we infer": {"acc": 0.31166666666666665, "acc_norm": 0.3125, "acc_norm_stderr": 0.013386029277441229, "acc_stderr": 0.013376268790982096, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.33166666666666667, "acc_norm": 0.325, "acc_norm_stderr": 0.013526454480351023, "acc_stderr": 0.01359683672948517, "subset": 3}, "justified in saying": {"acc": 0.32166666666666666, "acc_norm": 0.31833333333333336, "acc_norm_stderr": 0.013452948996996303, "acc_stderr": 0.013490095282989521, "subset": 3}}, "3": {"GPT-3 style": {"acc": 0.335, "acc_norm": 0.3475, "acc_norm_stderr": 0.013751753243291854, "acc_stderr": 0.013630871843821474, "subset": 3}, "MNLI crowdsource": {"acc": 0.3333333333333333, "acc_norm": 0.32666666666666666, "acc_norm_stderr": 0.013544340907003665, "acc_stderr": 0.013613950010225605, "subset": 3}, "can we infer": {"acc": 0.3333333333333333, "acc_norm": 0.32166666666666666, "acc_norm_stderr": 0.013490095282989521, "acc_stderr": 0.013613950010225605, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.32916666666666666, "acc_norm": 0.3283333333333333, "acc_norm_stderr": 0.013562032919529015, "acc_stderr": 0.013570806258433633, "subset": 3}, "justified in saying": {"acc": 0.3383333333333333, "acc_norm": 0.3225, "acc_norm_stderr": 0.013499258621103244, "acc_stderr": 0.013664144006618268, "subset": 3}}, "4": {"GPT-3 style": {"acc": 0.31666666666666665, "acc_norm": 0.32, "acc_norm_stderr": 0.013471620929769139, "acc_stderr": 0.013434078660827388, "subset": 3}, "MNLI crowdsource": {"acc": 0.31666666666666665, "acc_norm": 0.3175, "acc_norm_stderr": 0.013443538681348056, "acc_stderr": 0.013434078660827384, "subset": 3}, "can we infer": {"acc": 0.31916666666666665, "acc_norm": 0.3358333333333333, "acc_norm_stderr": 0.013639261190932879, "acc_stderr": 0.013462309712005136, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3425, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.01364760294240639, "acc_stderr": 0.013704669762934727, "subset": 3}, "justified in saying": {"acc": 0.3275, "acc_norm": 0.3325, "acc_norm_stderr": 0.013605417345710528, "acc_stderr": 0.013553211167251947, "subset": 3}}, "5": {"GPT-3 style": {"acc": 0.3308333333333333, "acc_norm": 0.3358333333333333, "acc_norm_stderr": 0.013639261190932879, "acc_stderr": 0.013588208070708992, "subset": 3}, "MNLI crowdsource": {"acc": 0.315, "acc_norm": 0.32083333333333336, "acc_norm_stderr": 0.013480882752851548, "acc_stderr": 0.013415009084004868, "subset": 3}, "can we infer": {"acc": 0.3225, "acc_norm": 0.32, "acc_norm_stderr": 0.013471620929769135, "acc_stderr": 0.013499258621103245, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.33416666666666667, "acc_norm": 0.3375, "acc_norm_stderr": 0.013655897185463653, "acc_stderr": 0.013622434813136774, "subset": 3}, "justified in saying": {"acc": 0.31833333333333336, "acc_norm": 0.32916666666666666, "acc_norm_stderr": 0.01357080625843362, "acc_stderr": 0.013452948996996296, "subset": 3}}}, "arc_easy": {"0": {"heres_a_problem": {"acc": 0.23274410774410775, "acc_norm": 0.23274410774410775, "acc_norm_stderr": 0.0086711691205793, "acc_stderr": 0.0086711691205793}, "i_am_hesitating": {"acc": 0.26706484641638223, "acc_norm": 0.2901023890784983, "acc_norm_stderr": 0.01326157367752078, "acc_stderr": 0.012928933196496344}, "multiple_choice": {"acc": 0.2958754208754209, "acc_norm": 0.28114478114478114, "acc_norm_stderr": 0.009224735470287007, "acc_stderr": 0.009365854134140067}, "pick_the_most_correct_option": {"acc": 0.2295221843003413, "acc_norm": 0.2295221843003413, "acc_norm_stderr": 0.012288926760890797, "acc_stderr": 0.012288926760890797}, "qa_options": {"acc": 0.35269360269360267, "acc_norm": 0.31986531986531985, "acc_norm_stderr": 0.00957082182057359, "acc_stderr": 0.009804420599378657}}, "1": {"heres_a_problem": {"acc": 0.23208191126279865, "acc_norm": 0.23208191126279865, "acc_norm_stderr": 0.012336718284948856, "acc_stderr": 0.012336718284948856}, "i_am_hesitating": {"acc": 0.2790102389078498, "acc_norm": 0.2858361774744027, "acc_norm_stderr": 0.013203196088537367, "acc_stderr": 0.013106784883601346}, "multiple_choice": {"acc": 0.30303030303030304, "acc_norm": 0.31565656565656564, "acc_norm_stderr": 0.009537019245566084, "acc_stderr": 0.009430140669278959}, "pick_the_most_correct_option": {"acc": 0.22440273037542663, "acc_norm": 0.22440273037542663, "acc_norm_stderr": 0.01219140493860383, "acc_stderr": 0.01219140493860383}, "qa_options": {"acc": 0.26621160409556316, "acc_norm": 0.2960750853242321, "acc_norm_stderr": 0.013340916085246263, "acc_stderr": 0.01291577478152322}}, "2": {"heres_a_problem": {"acc": 0.22013651877133106, "acc_norm": 0.22013651877133106, "acc_norm_stderr": 0.012108124883460978, "acc_stderr": 0.012108124883460978}, "i_am_hesitating": {"acc": 0.33207070707070707, "acc_norm": 0.3085016835016835, "acc_norm_stderr": 0.00947747234297813, "acc_stderr": 0.009663817543072694}, "multiple_choice": {"acc": 0.2431740614334471, "acc_norm": 0.25341296928327645, "acc_norm_stderr": 0.012710896778378606, "acc_stderr": 0.012536554144587094}, "pick_the_most_correct_option": {"acc": 0.21928327645051193, "acc_norm": 0.21928327645051193, "acc_norm_stderr": 0.012091245787615728, "acc_stderr": 0.012091245787615728}, "qa_options": {"acc": 0.3409090909090909, "acc_norm": 0.3122895622895623, "acc_norm_stderr": 0.009509325983631458, "acc_stderr": 0.00972657959342402}}, "3": {"heres_a_problem": {"acc": 0.24368686868686867, "acc_norm": 0.24368686868686867, "acc_norm_stderr": 0.008809171744720559, "acc_stderr": 0.008809171744720559}, "i_am_hesitating": {"acc": 0.2508532423208191, "acc_norm": 0.27474402730375425, "acc_norm_stderr": 0.013044617212771227, "acc_stderr": 0.01266819862131543}, "multiple_choice": {"acc": 0.3202861952861953, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.009673016668133383, "acc_stderr": 0.00957415266873942}, "pick_the_most_correct_option": {"acc": 0.24494949494949494, "acc_norm": 0.24494949494949494, "acc_norm_stderr": 0.00882458861121908, "acc_stderr": 0.00882458861121908}, "qa_options": {"acc": 0.26023890784982934, "acc_norm": 0.27559726962457337, "acc_norm_stderr": 0.01305716965576184, "acc_stderr": 0.012821930225112552}}, "4": {"heres_a_problem": {"acc": 0.23863636363636365, "acc_norm": 0.23863636363636365, "acc_norm_stderr": 0.008746465140706126, "acc_stderr": 0.008746465140706126}, "i_am_hesitating": {"acc": 0.3400673400673401, "acc_norm": 0.32365319865319864, "acc_norm_stderr": 0.009600478182273787, "acc_stderr": 0.009720765494805281}, "multiple_choice": {"acc": 0.30134680134680136, "acc_norm": 0.31607744107744107, "acc_norm_stderr": 0.009540440071928289, "acc_stderr": 0.009415259879351623}, "pick_the_most_correct_option": {"acc": 0.2354948805460751, "acc_norm": 0.2354948805460751, "acc_norm_stderr": 0.012399451855004748, "acc_stderr": 0.012399451855004748}, "qa_options": {"acc": 0.3287037037037037, "acc_norm": 0.3042929292929293, "acc_norm_stderr": 0.009441202922359185, "acc_stderr": 0.00963890316702217}}, "5": {"heres_a_problem": {"acc": 0.23208191126279865, "acc_norm": 0.23208191126279865, "acc_norm_stderr": 0.012336718284948854, "acc_stderr": 0.012336718284948854}, "i_am_hesitating": {"acc": 0.335016835016835, "acc_norm": 0.3181818181818182, "acc_norm_stderr": 0.009557408782506374, "acc_stderr": 0.00968516076593236}, "multiple_choice": {"acc": 0.24146757679180889, "acc_norm": 0.257679180887372, "acc_norm_stderr": 0.012780770562768405, "acc_stderr": 0.012506564839739432}, "pick_the_most_correct_option": {"acc": 0.23378839590443687, "acc_norm": 0.23378839590443687, "acc_norm_stderr": 0.012368225378507148, "acc_stderr": 0.012368225378507148}, "qa_options": {"acc": 0.25170648464163825, "acc_norm": 0.26791808873720135, "acc_norm_stderr": 0.012942030195136414, "acc_stderr": 0.012682496334042967}}}, "boolq": {"0": {"GPT-3 Style": {"acc": 0.5496666666666666, "acc_norm": 0.625, "acc_norm_stderr": 0.008840308272346428, "acc_stderr": 0.009085074954912698}, "after_reading": {"acc": 0.6233333333333333, "acc_norm": 0.5423333333333333, "acc_norm_stderr": 0.009097447488896784, "acc_stderr": 0.00884811049411477}, "exercise": {"acc": 0.6236666666666667, "acc_norm": 0.603, "acc_norm_stderr": 0.00893440584870012, "acc_stderr": 0.008846558976258922}, "valid_binary": {"acc": 0.611, "acc_norm": 0.5006666666666667, "acc_norm_stderr": 0.009130223008005275, "acc_stderr": 0.008902401412932075}, "yes_no_question": {"acc": 0.606, "acc_norm": 0.6236666666666667, "acc_norm_stderr": 0.008846558976258922, "acc_stderr": 0.008922697920438163}}, "1": {"GPT-3 Style": {"acc": 0.56, "acc_norm": 0.6123333333333333, "acc_norm_stderr": 0.008896822947561608, "acc_stderr": 0.00906425508467605}, "after_reading": {"acc": 0.5856666666666667, "acc_norm": 0.5773333333333334, "acc_norm_stderr": 0.009020364414843638, "acc_stderr": 0.008995223478188034}, "exercise": {"acc": 0.5576666666666666, "acc_norm": 0.5476666666666666, "acc_norm_stderr": 0.009088646624339615, "acc_stderr": 0.009069303681923062}, "valid_binary": {"acc": 0.6203333333333333, "acc_norm": 0.6233333333333333, "acc_norm_stderr": 0.00884811049411477, "acc_stderr": 0.008861873799148995}, "yes_no_question": {"acc": 0.5746666666666667, "acc_norm": 0.6173333333333333, "acc_norm_stderr": 0.008875277637761275, "acc_stderr": 0.009027853030468712}}, "2": {"GPT-3 Style": {"acc": 0.58, "acc_norm": 0.623, "acc_norm_stderr": 0.008849657553427542, "acc_stderr": 0.009012606487132143}, "after_reading": {"acc": 0.6053333333333333, "acc_norm": 0.6023333333333334, "acc_norm_stderr": 0.008936959925716907, "acc_stderr": 0.008925330066832188}, "exercise": {"acc": 0.5663333333333334, "acc_norm": 0.554, "acc_norm_stderr": 0.009076827433934433, "acc_stderr": 0.009049526374650797}, "valid_binary": {"acc": 0.623, "acc_norm": 0.6213333333333333, "acc_norm_stderr": 0.008857326053368308, "acc_stderr": 0.00884965755342756}, "yes_no_question": {"acc": 0.5926666666666667, "acc_norm": 0.6173333333333333, "acc_norm_stderr": 0.008875277637761277, "acc_stderr": 0.008972056373066367}}, "3": {"GPT-3 Style": {"acc": 0.5823333333333334, "acc_norm": 0.621, "acc_norm_stderr": 0.0088588464102222, "acc_stderr": 0.009005596833757831}, "after_reading": {"acc": 0.6026666666666667, "acc_norm": 0.5963333333333334, "acc_norm_stderr": 0.00895916952266258, "acc_stderr": 0.008935685051576502}, "exercise": {"acc": 0.5706666666666667, "acc_norm": 0.546, "acc_norm_stderr": 0.00909150987738652, "acc_stderr": 0.009038582451449425}, "valid_binary": {"acc": 0.6233333333333333, "acc_norm": 0.6226666666666667, "acc_norm_stderr": 0.00885120015653439, "acc_stderr": 0.00884811049411477}, "yes_no_question": {"acc": 0.595, "acc_norm": 0.6123333333333333, "acc_norm_stderr": 0.008896822947561611, "acc_stderr": 0.008963915658236387}}, "4": {"GPT-3 Style": {"acc": 0.587, "acc_norm": 0.624, "acc_norm_stderr": 0.008845002997512763, "acc_stderr": 0.008990955404907169}, "after_reading": {"acc": 0.6043333333333333, "acc_norm": 0.5946666666666667, "acc_norm_stderr": 0.008965091467970754, "acc_stderr": 0.008929245712536294}, "exercise": {"acc": 0.5726666666666667, "acc_norm": 0.5613333333333334, "acc_norm_stderr": 0.009061278956794627, "acc_stderr": 0.009033293159951224}, "valid_binary": {"acc": 0.621, "acc_norm": 0.6196666666666667, "acc_norm_stderr": 0.008864883436857793, "acc_stderr": 0.008858846410222197}, "yes_no_question": {"acc": 0.577, "acc_norm": 0.6136666666666667, "acc_norm_stderr": 0.00889117431069549, "acc_stderr": 0.009021315205815771}}, "5": {"GPT-3 Style": {"acc": 0.5886666666666667, "acc_norm": 0.6206666666666667, "acc_norm_stderr": 0.00886036232472252, "acc_stderr": 0.008985524690229495}, "after_reading": {"acc": 0.604, "acc_norm": 0.596, "acc_norm_stderr": 0.008960362494453694, "acc_stderr": 0.008930542249025198}, "exercise": {"acc": 0.5673333333333334, "acc_norm": 0.553, "acc_norm_stderr": 0.009078792586293545, "acc_stderr": 0.00904706345689798}, "valid_binary": {"acc": 0.6223333333333333, "acc_norm": 0.6196666666666667, "acc_norm_stderr": 0.008864883436857793, "acc_stderr": 0.00885273830576469}, "yes_no_question": {"acc": 0.5723333333333334, "acc_norm": 0.6086666666666667, "acc_norm_stderr": 0.008911995272576807, "acc_stderr": 0.009034185176145654}}}, "cb": {"0": {"GPT-3 style": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359538, "f1": 0.1940928270042194}, "MNLI crowdsource": {"acc": 0.42857142857142855, "acc_stderr": 0.06672848092813057, "f1": 0.2850877192982456}, "can we infer": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.2580185317177477}, "guaranteed/possible/impossible": {"acc": 0.42857142857142855, "acc_stderr": 0.06672848092813058, "f1": 0.336846728151076}, "justified in saying": {"acc": 0.42857142857142855, "acc_stderr": 0.06672848092813058, "f1": 0.3076923076923077}}, "1": {"GPT-3 style": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.22169059011164274}, "MNLI crowdsource": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "can we infer": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "guaranteed/possible/impossible": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.27441920164292133}, "justified in saying": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.26652142338416845}}, "2": {"GPT-3 style": {"acc": 0.4107142857142857, "acc_stderr": 0.0663363415035954, "f1": 0.2593837535014005}, "MNLI crowdsource": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.25438596491228077}, "can we infer": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.27619047619047615}, "guaranteed/possible/impossible": {"acc": 0.16071428571428573, "acc_stderr": 0.04952230059306299, "f1": 0.15573630249667678}, "justified in saying": {"acc": 0.44642857142857145, "acc_stderr": 0.06703189227942398, "f1": 0.32269503546099293}}, "3": {"GPT-3 style": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359538, "f1": 0.24839948783610755}, "MNLI crowdsource": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930824, "f1": 0.20028724376550458}, "can we infer": {"acc": 0.4642857142857143, "acc_stderr": 0.0672477765493766, "f1": 0.33283950617283947}, "guaranteed/possible/impossible": {"acc": 0.16071428571428573, "acc_stderr": 0.049522300593062986, "f1": 0.14387464387464388}, "justified in saying": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359541, "f1": 0.2976100628930818}}, "4": {"GPT-3 style": {"acc": 0.35714285714285715, "acc_stderr": 0.06460957383809221, "f1": 0.2198067632850241}, "MNLI crowdsource": {"acc": 0.35714285714285715, "acc_stderr": 0.06460957383809221, "f1": 0.2183052617835226}, "can we infer": {"acc": 0.4642857142857143, "acc_stderr": 0.06724777654937658, "f1": 0.3328395061728395}, "guaranteed/possible/impossible": {"acc": 0.14285714285714285, "acc_stderr": 0.04718416136255829, "f1": 0.12557319223985888}, "justified in saying": {"acc": 0.48214285714285715, "acc_stderr": 0.0673769750864465, "f1": 0.3461728395061729}}, "5": {"GPT-3 style": {"acc": 0.35714285714285715, "acc_stderr": 0.06460957383809221, "f1": 0.2183052617835226}, "MNLI crowdsource": {"acc": 0.4107142857142857, "acc_stderr": 0.0663363415035954, "f1": 0.2641898864809082}, "can we infer": {"acc": 0.5, "acc_stderr": 0.06741998624632421, "f1": 0.35176007116533237}, "guaranteed/possible/impossible": {"acc": 0.16071428571428573, "acc_stderr": 0.04952230059306299, "f1": 0.14033189033189034}, "justified in saying": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359541, "f1": 0.2976100628930818}}}, "copa": {"0": {"best_option": {"acc": 0.55, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.049999999999999996}, "cause_effect": {"acc": 0.54, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284, "acc_stderr": 0.05009082659620332}, "choose": {"acc": 0.57, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.04975698519562427}, "i_am_hesitating": {"acc": 0.54, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795, "acc_stderr": 0.05009082659620333}, "plausible_alternatives": {"acc": 0.57, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912, "acc_stderr": 0.04975698519562427}}, "1": {"best_option": {"acc": 0.48, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.050211673156867795}, "cause_effect": {"acc": 0.44, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974, "acc_stderr": 0.04988876515698589}, "choose": {"acc": 0.46, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316, "acc_stderr": 0.05009082659620332}, "i_am_hesitating": {"acc": 0.46, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974, "acc_stderr": 0.05009082659620332}, "plausible_alternatives": {"acc": 0.41, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196, "acc_stderr": 0.04943110704237101}}, "2": {"best_option": {"acc": 0.42, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176, "acc_stderr": 0.049604496374885836}, "cause_effect": {"acc": 0.41, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084, "acc_stderr": 0.049431107042371025}, "choose": {"acc": 0.4, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218, "acc_stderr": 0.049236596391733084}, "i_am_hesitating": {"acc": 0.4, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.049236596391733084}, "plausible_alternatives": {"acc": 0.39, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974, "acc_stderr": 0.04902071300001974}}, "3": {"best_option": {"acc": 0.46, "acc_norm": 0.45, "acc_norm_stderr": 0.05, "acc_stderr": 0.05009082659620333}, "cause_effect": {"acc": 0.42, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.049604496374885836}, "choose": {"acc": 0.4, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974, "acc_stderr": 0.049236596391733084}, "i_am_hesitating": {"acc": 0.42, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284, "acc_stderr": 0.049604496374885836}, "plausible_alternatives": {"acc": 0.43, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589, "acc_stderr": 0.049756985195624284}}, "4": {"best_option": {"acc": 0.47, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605, "acc_stderr": 0.050161355804659205}, "cause_effect": {"acc": 0.39, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974, "acc_stderr": 0.04902071300001974}, "choose": {"acc": 0.46, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.05009082659620332}, "i_am_hesitating": {"acc": 0.41, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.049431107042371025}, "plausible_alternatives": {"acc": 0.43, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.049756985195624284}}, "5": {"best_option": {"acc": 0.46, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.05009082659620332}, "cause_effect": {"acc": 0.43, "acc_norm": 0.45, "acc_norm_stderr": 0.049999999999999996, "acc_stderr": 0.049756985195624284}, "choose": {"acc": 0.45, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284, "acc_stderr": 0.04999999999999999}, "i_am_hesitating": {"acc": 0.41, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284, "acc_stderr": 0.049431107042371025}, "plausible_alternatives": {"acc": 0.45, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.049999999999999996}}}, "e2e_nlg_cleaned": {"0": {"coherent_text": {"bleu": 3.872234562531536, "bleu_stderr": 0.06666687313033742, "rouge1_fmeasure": 0.24189630901934833, "rouge1_fmeasure_stderr": 0.0019324120985675024, "rouge1_precision": 0.19449892872154664, "rouge1_precision_stderr": 0.0016649751999105959, "rouge1_recall": 0.3385859719691564, "rouge1_recall_stderr": 0.0026943011480723986, "rouge2_fmeasure": 0.09892905722529392, "rouge2_fmeasure_stderr": 0.001163722813056667, "rouge2_precision": 0.07891999672740428, "rouge2_precision_stderr": 0.0009533683551406194, "rouge2_recall": 0.14092640846495855, "rouge2_recall_stderr": 0.0017170890317364676, "rougeL_fmeasure": 0.19713523690512094, "rougeL_fmeasure_stderr": 0.0015249596621099413, "rougeL_precision": 0.15770316249517122, "rougeL_precision_stderr": 0.0012736741855482734, "rougeL_recall": 0.27888219312603624, "rougeL_recall_stderr": 0.0022960014811976687, "rougeLsum_fmeasure": 0.21518082639820849, "rougeLsum_fmeasure_stderr": 0.0017666395173976037, "rougeLsum_precision": 0.17297504164263205, "rougeLsum_precision_stderr": 0.0015157865911726413, "rougeLsum_recall": 0.30157958831453335, "rougeLsum_recall_stderr": 0.0024874143749410256}, "create_text_for_me": {"bleu": 1.2087041243674348, "bleu_stderr": 0.06236227717440753, "rouge1_fmeasure": 0.1485373299435639, "rouge1_fmeasure_stderr": 0.0016885326851467698, "rouge1_precision": 0.11495488513218448, "rouge1_precision_stderr": 0.0014917009813178457, "rouge1_recall": 0.2228480616093966, "rouge1_recall_stderr": 0.0020952477514755564, "rouge2_fmeasure": 0.02500994962430241, "rouge2_fmeasure_stderr": 0.0008315526871574698, "rouge2_precision": 0.01961510433394365, "rouge2_precision_stderr": 0.0006727987776785344, "rouge2_recall": 0.03657357239352748, "rouge2_recall_stderr": 0.0011960431470597172, "rougeL_fmeasure": 0.12205181393918246, "rougeL_fmeasure_stderr": 0.0012795921662070245, "rougeL_precision": 0.09374319199704093, "rougeL_precision_stderr": 0.0010902607092754888, "rougeL_recall": 0.18550669125605201, "rougeL_recall_stderr": 0.0017172608070746052, "rougeLsum_fmeasure": 0.13207144996991402, "rougeLsum_fmeasure_stderr": 0.0014103766498503702, "rougeLsum_precision": 0.10182213242486479, "rougeLsum_precision_stderr": 0.0012349205591824877, "rougeLsum_recall": 0.19958826927621215, "rougeLsum_recall_stderr": 0.0017982055619464277}, "generate_gramatically_correct_text": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0003977278759887456, "rouge1_fmeasure_stderr": 0.00013354742540894128, "rouge1_precision": 0.0028333333333333335, "rouge1_precision_stderr": 0.0009561880076163621, "rouge1_recall": 0.00021398046398046395, "rouge1_recall_stderr": 7.182900764312899e-05, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0003977278759887456, "rougeL_fmeasure_stderr": 0.00013354742540894128, "rougeL_precision": 0.0028333333333333335, "rougeL_precision_stderr": 0.0009561880076163621, "rougeL_recall": 0.00021398046398046395, "rougeL_recall_stderr": 7.182900764312899e-05, "rougeLsum_fmeasure": 0.0003977278759887456, "rougeLsum_fmeasure_stderr": 0.00013354742540894128, "rougeLsum_precision": 0.0028333333333333335, "rougeLsum_precision_stderr": 0.0009561880076163621, "rougeLsum_recall": 0.00021398046398046395, "rougeLsum_recall_stderr": 7.182900764312899e-05}, "generate_text_restaurant": {"bleu": 0.3138256134465956, "bleu_stderr": 0.042478650177872466, "rouge1_fmeasure": 0.08626819478230045, "rouge1_fmeasure_stderr": 0.0014217358109946006, "rouge1_precision": 0.09054875368611116, "rouge1_precision_stderr": 0.001766981005220654, "rouge1_recall": 0.09899686345340121, "rouge1_recall_stderr": 0.0016339321555193618, "rouge2_fmeasure": 0.004707141554710639, "rouge2_fmeasure_stderr": 0.00037478881160494376, "rouge2_precision": 0.004635097118571576, "rouge2_precision_stderr": 0.0004584060579508488, "rouge2_recall": 0.006688039413188141, "rouge2_recall_stderr": 0.0004649040639428354, "rougeL_fmeasure": 0.08482739955206746, "rougeL_fmeasure_stderr": 0.001374786595816378, "rougeL_precision": 0.08868499326830791, "rougeL_precision_stderr": 0.0016895151305050993, "rougeL_recall": 0.09761537714635501, "rougeL_recall_stderr": 0.0015968584648120006, "rougeLsum_fmeasure": 0.06674662648443055, "rougeLsum_fmeasure_stderr": 0.0011005680176848465, "rougeLsum_precision": 0.0697631127415905, "rougeLsum_precision_stderr": 0.0014027633376290944, "rougeLsum_recall": 0.07822809105458635, "rougeLsum_recall_stderr": 0.001337355132253774}, "text": {"bleu": 4.369170070060735, "bleu_stderr": 0.07204262900742721, "rouge1_fmeasure": 0.2540920126266791, "rouge1_fmeasure_stderr": 0.001743002313846714, "rouge1_precision": 0.19059607371932916, "rouge1_precision_stderr": 0.0015976396764760467, "rouge1_recall": 0.41030667797285486, "rouge1_recall_stderr": 0.002620865522372885, "rouge2_fmeasure": 0.1073793884770636, "rouge2_fmeasure_stderr": 0.001057524513675077, "rouge2_precision": 0.07958464128539072, "rouge2_precision_stderr": 0.0008512115391560173, "rouge2_recall": 0.17899027774981593, "rouge2_recall_stderr": 0.0018510907312350095, "rougeL_fmeasure": 0.2161433172109618, "rougeL_fmeasure_stderr": 0.0013779255059734295, "rougeL_precision": 0.1612888816485327, "rougeL_precision_stderr": 0.0012295381787847557, "rougeL_recall": 0.3522789855005096, "rougeL_recall_stderr": 0.002286155396163342, "rougeLsum_fmeasure": 0.21871213194709838, "rougeLsum_fmeasure_stderr": 0.001635496810687772, "rougeLsum_precision": 0.16374077920845426, "rougeLsum_precision_stderr": 0.0014355224351735455, "rougeLsum_recall": 0.35431534893364613, "rougeLsum_recall_stderr": 0.002559304615115808}}, "1": {"coherent_text": {"bleu": 5.618664355824302, "bleu_stderr": 0.044804380544830956, "rouge1_fmeasure": 0.3790777063050893, "rouge1_fmeasure_stderr": 0.0019180894806166623, "rouge1_precision": 0.29980096617944, "rouge1_precision_stderr": 0.0018479476291498587, "rouge1_recall": 0.5510960644199242, "rouge1_recall_stderr": 0.002746177251645845, "rouge2_fmeasure": 0.1571930586851638, "rouge2_fmeasure_stderr": 0.0013851703119846389, "rouge2_precision": 0.12320524930026791, "rouge2_precision_stderr": 0.0011603794001722478, "rouge2_recall": 0.23411042101520732, "rouge2_recall_stderr": 0.0021839181570230963, "rougeL_fmeasure": 0.2692553660223294, "rougeL_fmeasure_stderr": 0.0013312984394904634, "rougeL_precision": 0.21150876369324337, "rougeL_precision_stderr": 0.0012091371590184875, "rougeL_recall": 0.3973772754817481, "rougeL_recall_stderr": 0.002348890772602287, "rougeLsum_fmeasure": 0.3132026852747841, "rougeLsum_fmeasure_stderr": 0.0017579190044477355, "rougeLsum_precision": 0.24763246470479972, "rougeLsum_precision_stderr": 0.0016397591465259363, "rougeLsum_recall": 0.45603718052824205, "rougeLsum_recall_stderr": 0.0025930969811509105}, "create_text_for_me": {"bleu": 5.719467204595487, "bleu_stderr": 0.05467102028072658, "rouge1_fmeasure": 0.3880684705597522, "rouge1_fmeasure_stderr": 0.0017293341079469464, "rouge1_precision": 0.30689525431041903, "rouge1_precision_stderr": 0.0016979120981340232, "rouge1_recall": 0.5607471544318147, "rouge1_recall_stderr": 0.002541857266880532, "rouge2_fmeasure": 0.16340881202208163, "rouge2_fmeasure_stderr": 0.0013545945899839753, "rouge2_precision": 0.12815271668435907, "rouge2_precision_stderr": 0.0011402205486632784, "rouge2_recall": 0.24170118147382424, "rouge2_recall_stderr": 0.0021562937828079123, "rougeL_fmeasure": 0.2620939439124173, "rougeL_fmeasure_stderr": 0.0013281943135113716, "rougeL_precision": 0.20611685586596215, "rougeL_precision_stderr": 0.001198478500170298, "rougeL_recall": 0.38367500882495165, "rougeL_recall_stderr": 0.0023144118800144126, "rougeLsum_fmeasure": 0.32115446566223654, "rougeLsum_fmeasure_stderr": 0.0016529379356976753, "rougeLsum_precision": 0.2540076031390351, "rougeLsum_precision_stderr": 0.0015610224671734501, "rougeLsum_recall": 0.46438460329961756, "rougeLsum_recall_stderr": 0.002459731462210942}, "generate_gramatically_correct_text": {"bleu": 0.7785930657042605, "bleu_stderr": 0.07371286925187409, "rouge1_fmeasure": 0.07707231946910663, "rouge1_fmeasure_stderr": 0.0029012639839078266, "rouge1_precision": 0.09307418766156685, "rouge1_precision_stderr": 0.003831133208744367, "rouge1_recall": 0.09684372086710462, "rouge1_recall_stderr": 0.0037674098990848615, "rouge2_fmeasure": 0.030860375813935463, "rouge2_fmeasure_stderr": 0.0013215709327802443, "rouge2_precision": 0.026646794320064136, "rouge2_precision_stderr": 0.0011779628965901102, "rouge2_recall": 0.040281858969099534, "rouge2_recall_stderr": 0.0017553221623290288, "rougeL_fmeasure": 0.05293870853728192, "rougeL_fmeasure_stderr": 0.0019913751107059636, "rougeL_precision": 0.07194869702103061, "rougeL_precision_stderr": 0.0033904984440571335, "rougeL_recall": 0.06646442818559543, "rougeL_recall_stderr": 0.002627445124677514, "rougeLsum_fmeasure": 0.06475443481713601, "rougeLsum_fmeasure_stderr": 0.0024499768948500517, "rougeLsum_precision": 0.08237065814087174, "rougeLsum_precision_stderr": 0.003597616882105883, "rougeLsum_recall": 0.08104209712761277, "rougeLsum_recall_stderr": 0.003180633123309235}, "generate_text_restaurant": {"bleu": 6.380638446426456, "bleu_stderr": 0.09557609009378211, "rouge1_fmeasure": 0.3400267632274026, "rouge1_fmeasure_stderr": 0.0020403696620665606, "rouge1_precision": 0.3328975043204133, "rouge1_precision_stderr": 0.003509121285453912, "rouge1_recall": 0.45270988550034613, "rouge1_recall_stderr": 0.003062231284181297, "rouge2_fmeasure": 0.1486663277769484, "rouge2_fmeasure_stderr": 0.0015136508138340347, "rouge2_precision": 0.1499329665878703, "rouge2_precision_stderr": 0.002263691832530101, "rouge2_recall": 0.1983614972096706, "rouge2_recall_stderr": 0.0020825028564397568, "rougeL_fmeasure": 0.2676344730573564, "rougeL_fmeasure_stderr": 0.001562435233832024, "rougeL_precision": 0.2592088269282877, "rougeL_precision_stderr": 0.002698465205645672, "rougeL_recall": 0.36228982914822633, "rougeL_recall_stderr": 0.0026355915533943, "rougeLsum_fmeasure": 0.2753891028069307, "rougeLsum_fmeasure_stderr": 0.0019025285983907253, "rougeLsum_precision": 0.27037933017473875, "rougeLsum_precision_stderr": 0.003062674460499446, "rougeLsum_recall": 0.3663561677071641, "rougeLsum_recall_stderr": 0.0027976045204321166}, "text": {"bleu": 6.211882508400509, "bleu_stderr": 0.06087984691073597, "rouge1_fmeasure": 0.4396792572537879, "rouge1_fmeasure_stderr": 0.0020547864942206586, "rouge1_precision": 0.36654808520126797, "rouge1_precision_stderr": 0.0021595231400698836, "rouge1_recall": 0.5825981312150091, "rouge1_recall_stderr": 0.002574992415760553, "rouge2_fmeasure": 0.19855266031915028, "rouge2_fmeasure_stderr": 0.0015519283833039233, "rouge2_precision": 0.1645678333861915, "rouge2_precision_stderr": 0.001432238933148995, "rouge2_recall": 0.2677310092553899, "rouge2_recall_stderr": 0.0021604481761519365, "rougeL_fmeasure": 0.3103214994230373, "rougeL_fmeasure_stderr": 0.0015536997171868116, "rougeL_precision": 0.2570869498638546, "rougeL_precision_stderr": 0.001512490055714042, "rougeL_recall": 0.41635046992448654, "rougeL_recall_stderr": 0.002345198987228168, "rougeLsum_fmeasure": 0.3637273198283416, "rougeLsum_fmeasure_stderr": 0.0019205698683061002, "rougeLsum_precision": 0.3030665303851319, "rougeLsum_precision_stderr": 0.0019368940114554841, "rougeLsum_recall": 0.48277562928021334, "rougeLsum_recall_stderr": 0.002499697912373815}}, "2": {"coherent_text": {"bleu": 6.655264751599302, "bleu_stderr": 0.10851166271873992, "rouge1_fmeasure": 0.37482466732781167, "rouge1_fmeasure_stderr": 0.001820505134270105, "rouge1_precision": 0.2956786617020848, "rouge1_precision_stderr": 0.0018018971024399673, "rouge1_recall": 0.5505653372006829, "rouge1_recall_stderr": 0.0026480715686419227, "rouge2_fmeasure": 0.16543669174208636, "rouge2_fmeasure_stderr": 0.001418641213100885, "rouge2_precision": 0.129469707240088, "rouge2_precision_stderr": 0.00121806826731611, "rouge2_recall": 0.24899455762459116, "rouge2_recall_stderr": 0.002246273347531876, "rougeL_fmeasure": 0.27750839928052384, "rougeL_fmeasure_stderr": 0.0013211747767623314, "rougeL_precision": 0.2175954785088815, "rougeL_precision_stderr": 0.0012481117214859327, "rougeL_recall": 0.41307647476574705, "rougeL_recall_stderr": 0.002319703375666559, "rougeLsum_fmeasure": 0.3133486537665277, "rougeLsum_fmeasure_stderr": 0.0017193781609667741, "rougeLsum_precision": 0.2472195816112961, "rougeLsum_precision_stderr": 0.0016439382219917379, "rougeLsum_recall": 0.46060025676338434, "rougeLsum_recall_stderr": 0.002548116733582456}, "create_text_for_me": {"bleu": 6.481462554213176, "bleu_stderr": 0.0637597415756679, "rouge1_fmeasure": 0.397978553188765, "rouge1_fmeasure_stderr": 0.0017070834320949354, "rouge1_precision": 0.31304294257033033, "rouge1_precision_stderr": 0.001675584026068997, "rouge1_recall": 0.5791447618132454, "rouge1_recall_stderr": 0.002476899866394818, "rouge2_fmeasure": 0.17467867245016275, "rouge2_fmeasure_stderr": 0.0013955055510223417, "rouge2_precision": 0.1361700393455817, "rouge2_precision_stderr": 0.0011628001288050507, "rouge2_recall": 0.2603882016032195, "rouge2_recall_stderr": 0.002228895122171891, "rougeL_fmeasure": 0.27248947489993885, "rougeL_fmeasure_stderr": 0.0013417737031681963, "rougeL_precision": 0.21315768275684321, "rougeL_precision_stderr": 0.0011907007297547669, "rougeL_recall": 0.4014777565448129, "rougeL_recall_stderr": 0.0023252703119155016, "rougeLsum_fmeasure": 0.3327776090849803, "rougeLsum_fmeasure_stderr": 0.0016458286188702437, "rougeLsum_precision": 0.2617186737971144, "rougeLsum_precision_stderr": 0.0015398800543719527, "rougeLsum_recall": 0.48463177955108755, "rougeLsum_recall_stderr": 0.002439535597275367}, "generate_gramatically_correct_text": {"bleu": 4.024301729421313, "bleu_stderr": 0.1532981962559809, "rouge1_fmeasure": 0.15080808821145256, "rouge1_fmeasure_stderr": 0.0036526531895815233, "rouge1_precision": 0.1602164847749438, "rouge1_precision_stderr": 0.004262488194816882, "rouge1_recall": 0.1893467230974042, "rouge1_recall_stderr": 0.0047311550193664745, "rouge2_fmeasure": 0.062337691922640125, "rouge2_fmeasure_stderr": 0.0017277056488068175, "rouge2_precision": 0.053157839459133875, "rouge2_precision_stderr": 0.001489446683458996, "rouge2_recall": 0.08110347423194934, "rouge2_recall_stderr": 0.0023151112057970876, "rougeL_fmeasure": 0.10356717573719904, "rougeL_fmeasure_stderr": 0.002527891994310027, "rougeL_precision": 0.11906005873487642, "rougeL_precision_stderr": 0.003676903873427953, "rougeL_recall": 0.13037070564313896, "rougeL_recall_stderr": 0.0033405273229848032, "rougeLsum_fmeasure": 0.12582494031130287, "rougeLsum_fmeasure_stderr": 0.0030784867770067177, "rougeLsum_precision": 0.1386315725589492, "rougeLsum_precision_stderr": 0.003950043680135484, "rougeLsum_recall": 0.15770240014439066, "rougeLsum_recall_stderr": 0.0039943713450391076}, "generate_text_restaurant": {"bleu": 6.3634920937473884, "bleu_stderr": 0.08616925114576995, "rouge1_fmeasure": 0.33740317942041553, "rouge1_fmeasure_stderr": 0.0019381935089587043, "rouge1_precision": 0.2954297857860177, "rouge1_precision_stderr": 0.0029411840632251563, "rouge1_recall": 0.48120558575759376, "rouge1_recall_stderr": 0.002844478586577598, "rouge2_fmeasure": 0.1507673483604289, "rouge2_fmeasure_stderr": 0.0014399337503822303, "rouge2_precision": 0.13374093712808155, "rouge2_precision_stderr": 0.0018836984171556573, "rouge2_recall": 0.2183048706264126, "rouge2_recall_stderr": 0.002121311703914965, "rougeL_fmeasure": 0.26968155815162015, "rougeL_fmeasure_stderr": 0.0014592699636009493, "rougeL_precision": 0.23342217539614568, "rougeL_precision_stderr": 0.002214559299952568, "rougeL_recall": 0.3907575640409568, "rougeL_recall_stderr": 0.002505134070265779, "rougeLsum_fmeasure": 0.272599814561145, "rougeLsum_fmeasure_stderr": 0.00182733683050086, "rougeLsum_precision": 0.23886641857979662, "rougeLsum_precision_stderr": 0.0025627121020893886, "rougeLsum_recall": 0.3892155963722792, "rougeLsum_recall_stderr": 0.002698506480779706}, "text": {"bleu": 6.618865785291484, "bleu_stderr": 0.09097368527602766, "rouge1_fmeasure": 0.4418673874038735, "rouge1_fmeasure_stderr": 0.0020138555153139262, "rouge1_precision": 0.3662955696861521, "rouge1_precision_stderr": 0.0020996329194513177, "rouge1_recall": 0.5898222196381344, "rouge1_recall_stderr": 0.002594156432810366, "rouge2_fmeasure": 0.20344292743727435, "rouge2_fmeasure_stderr": 0.0015925311127849093, "rouge2_precision": 0.16735791685138235, "rouge2_precision_stderr": 0.0014286796631250598, "rouge2_recall": 0.27701539052345003, "rouge2_recall_stderr": 0.0022736180715884347, "rougeL_fmeasure": 0.3135424508873694, "rougeL_fmeasure_stderr": 0.0015516029637538471, "rougeL_precision": 0.25828930577862486, "rougeL_precision_stderr": 0.001478479770624807, "rougeL_recall": 0.4238497421384899, "rougeL_recall_stderr": 0.002419962083852209, "rougeLsum_fmeasure": 0.36881863339193455, "rougeLsum_fmeasure_stderr": 0.001931324334873031, "rougeLsum_precision": 0.3055329010748124, "rougeLsum_precision_stderr": 0.001915478709605836, "rougeLsum_recall": 0.49320303301582513, "rougeLsum_recall_stderr": 0.0025853778948590807}}, "3": {"coherent_text": {"bleu": 6.948031260618786, "bleu_stderr": 0.07024191698575197, "rouge1_fmeasure": 0.37972552437025653, "rouge1_fmeasure_stderr": 0.0018092314859358045, "rouge1_precision": 0.3028962873149275, "rouge1_precision_stderr": 0.0018453669075132997, "rouge1_recall": 0.5503482491585948, "rouge1_recall_stderr": 0.0026466497124214964, "rouge2_fmeasure": 0.1703052547809578, "rouge2_fmeasure_stderr": 0.0014604429313616653, "rouge2_precision": 0.13494336292795747, "rouge2_precision_stderr": 0.0012826406801381255, "rouge2_recall": 0.2522143776432092, "rouge2_recall_stderr": 0.0022660646785606113, "rougeL_fmeasure": 0.2796335792236481, "rougeL_fmeasure_stderr": 0.001342957090112707, "rougeL_precision": 0.22176702009033705, "rougeL_precision_stderr": 0.0012999603001234818, "rougeL_recall": 0.41046709529612835, "rougeL_recall_stderr": 0.0023382620871218933, "rougeLsum_fmeasure": 0.3191175618704876, "rougeLsum_fmeasure_stderr": 0.0017351084798555157, "rougeLsum_precision": 0.2546972130694942, "rougeLsum_precision_stderr": 0.001706082034205406, "rougeLsum_recall": 0.46258844899711243, "rougeLsum_recall_stderr": 0.0025482851759882335}, "create_text_for_me": {"bleu": 6.612323545023084, "bleu_stderr": 0.05976238792927671, "rouge1_fmeasure": 0.3967724891293664, "rouge1_fmeasure_stderr": 0.0017005575062401018, "rouge1_precision": 0.3112721062235333, "rouge1_precision_stderr": 0.0016625710557320614, "rouge1_recall": 0.5797972107422691, "rouge1_recall_stderr": 0.0024927181964562407, "rouge2_fmeasure": 0.17648552604551038, "rouge2_fmeasure_stderr": 0.0014153455920465827, "rouge2_precision": 0.1371955035707705, "rouge2_precision_stderr": 0.0011750480699642973, "rouge2_recall": 0.2640373903872155, "rouge2_recall_stderr": 0.0022501962149717035, "rougeL_fmeasure": 0.27221279373131524, "rougeL_fmeasure_stderr": 0.0013520964327518468, "rougeL_precision": 0.21252206793017903, "rougeL_precision_stderr": 0.0011987552894719596, "rougeL_recall": 0.4019456170099069, "rougeL_recall_stderr": 0.0023015152807916543, "rougeLsum_fmeasure": 0.3309974761576321, "rougeLsum_fmeasure_stderr": 0.0016730505404799294, "rougeLsum_precision": 0.2596822226642286, "rougeLsum_precision_stderr": 0.0015551841205914923, "rougeLsum_recall": 0.48377583683476527, "rougeLsum_recall_stderr": 0.002472550681565558}, "generate_gramatically_correct_text": {"bleu": 6.2616432240038, "bleu_stderr": 0.17231457870322375, "rouge1_fmeasure": 0.20303028288911848, "rouge1_fmeasure_stderr": 0.003846560801683136, "rouge1_precision": 0.20703579454147295, "rouge1_precision_stderr": 0.004332835339427102, "rouge1_recall": 0.2544471431249528, "rouge1_recall_stderr": 0.004998201805906469, "rouge2_fmeasure": 0.08500284986690841, "rouge2_fmeasure_stderr": 0.0018852577618806415, "rouge2_precision": 0.0728023032757386, "rouge2_precision_stderr": 0.0016348952484523923, "rouge2_recall": 0.11015181488739925, "rouge2_recall_stderr": 0.002540705745285619, "rougeL_fmeasure": 0.13949830107514838, "rougeL_fmeasure_stderr": 0.002682661526674531, "rougeL_precision": 0.15147184243295325, "rougeL_precision_stderr": 0.003718090407697094, "rougeL_recall": 0.1755748249833486, "rougeL_recall_stderr": 0.0035736277120166807, "rougeLsum_fmeasure": 0.16864868735595545, "rougeLsum_fmeasure_stderr": 0.0032412006854200303, "rougeLsum_precision": 0.17716302997626837, "rougeLsum_precision_stderr": 0.0039987377595068554, "rougeLsum_recall": 0.21119592100785614, "rougeLsum_recall_stderr": 0.004224651124239091}, "generate_text_restaurant": {"bleu": 6.870066226903575, "bleu_stderr": 0.09429297050700895, "rouge1_fmeasure": 0.3444162792721549, "rouge1_fmeasure_stderr": 0.0019646659016549933, "rouge1_precision": 0.2995136417218204, "rouge1_precision_stderr": 0.002877943933866991, "rouge1_recall": 0.4870699752407154, "rouge1_recall_stderr": 0.0028159974008536274, "rouge2_fmeasure": 0.158839720125521, "rouge2_fmeasure_stderr": 0.0014985646156960436, "rouge2_precision": 0.13989458331193982, "rouge2_precision_stderr": 0.0019259857294848428, "rouge2_recall": 0.2283186786335389, "rouge2_recall_stderr": 0.002178927464111703, "rougeL_fmeasure": 0.2794048970004934, "rougeL_fmeasure_stderr": 0.0015380955393133962, "rougeL_precision": 0.24052403808712888, "rougeL_precision_stderr": 0.0022515901969111986, "rougeL_recall": 0.4008413679522952, "rougeL_recall_stderr": 0.0025096595704963047, "rougeLsum_fmeasure": 0.2812645652371936, "rougeLsum_fmeasure_stderr": 0.00189815704910076, "rougeLsum_precision": 0.24501488471812358, "rougeLsum_precision_stderr": 0.002592614810126833, "rougeLsum_recall": 0.39799000178268285, "rougeLsum_recall_stderr": 0.002715969584528833}, "text": {"bleu": 6.838654430562631, "bleu_stderr": 0.08770664641031416, "rouge1_fmeasure": 0.44203444996828867, "rouge1_fmeasure_stderr": 0.0020077674797794494, "rouge1_precision": 0.3653208109803443, "rouge1_precision_stderr": 0.002089278039587839, "rouge1_recall": 0.592412984015862, "rouge1_recall_stderr": 0.0025780992158462137, "rouge2_fmeasure": 0.20463135769763866, "rouge2_fmeasure_stderr": 0.001622776743332051, "rouge2_precision": 0.16785818992713525, "rouge2_precision_stderr": 0.001444636379498952, "rouge2_recall": 0.27957982192927644, "rouge2_recall_stderr": 0.002322764141427059, "rougeL_fmeasure": 0.31244127951805134, "rougeL_fmeasure_stderr": 0.0015421579875842658, "rougeL_precision": 0.256612100970035, "rougeL_precision_stderr": 0.0014662808098234782, "rougeL_recall": 0.42390466259959425, "rougeL_recall_stderr": 0.002389221135189317, "rougeLsum_fmeasure": 0.3710630525199719, "rougeLsum_fmeasure_stderr": 0.0019551761163624595, "rougeLsum_precision": 0.3065499840629947, "rougeLsum_precision_stderr": 0.001929720013741525, "rougeLsum_recall": 0.4978186288299029, "rougeLsum_recall_stderr": 0.002597275043020314}}, "4": {"coherent_text": {"bleu": 7.11987499239522, "bleu_stderr": 0.12056984540140626, "rouge1_fmeasure": 0.38334638913624297, "rouge1_fmeasure_stderr": 0.0018564450302232401, "rouge1_precision": 0.30702881034304425, "rouge1_precision_stderr": 0.0018940787804765146, "rouge1_recall": 0.5510843355772259, "rouge1_recall_stderr": 0.0026497395614324347, "rouge2_fmeasure": 0.1738253944796353, "rouge2_fmeasure_stderr": 0.0014854824402731792, "rouge2_precision": 0.1383407701093842, "rouge2_precision_stderr": 0.0013023724651524862, "rouge2_recall": 0.2550398171391987, "rouge2_recall_stderr": 0.0022789288990834686, "rougeL_fmeasure": 0.2793510383067617, "rougeL_fmeasure_stderr": 0.0013892803599562522, "rougeL_precision": 0.2225382181098862, "rougeL_precision_stderr": 0.001346023397016377, "rougeL_recall": 0.40621675306005106, "rougeL_recall_stderr": 0.0023173016257193608, "rougeLsum_fmeasure": 0.32528129826527175, "rougeLsum_fmeasure_stderr": 0.0017876332019525448, "rougeLsum_precision": 0.2605328118487642, "rougeLsum_precision_stderr": 0.0017504023543387641, "rougeLsum_recall": 0.4677978217704759, "rougeLsum_recall_stderr": 0.002582278022445715}, "create_text_for_me": {"bleu": 6.691991628747666, "bleu_stderr": 0.08350250197448322, "rouge1_fmeasure": 0.3918497625319794, "rouge1_fmeasure_stderr": 0.0017118070244260361, "rouge1_precision": 0.3071216799151067, "rouge1_precision_stderr": 0.0016645873768075543, "rouge1_recall": 0.5734199732733007, "rouge1_recall_stderr": 0.002497421870329201, "rouge2_fmeasure": 0.17525069265082474, "rouge2_fmeasure_stderr": 0.001423044315562278, "rouge2_precision": 0.13621235371774604, "rouge2_precision_stderr": 0.0011845432721289762, "rouge2_recall": 0.2624365602666334, "rouge2_recall_stderr": 0.0022614408095196375, "rougeL_fmeasure": 0.26936703256276495, "rougeL_fmeasure_stderr": 0.0013634469715740345, "rougeL_precision": 0.2100790609152195, "rougeL_precision_stderr": 0.0012057172066683016, "rougeL_recall": 0.3985723300660603, "rougeL_recall_stderr": 0.0023232420666500823, "rougeLsum_fmeasure": 0.32924292332328525, "rougeLsum_fmeasure_stderr": 0.0016762151102916476, "rougeLsum_precision": 0.2581428748840792, "rougeLsum_precision_stderr": 0.001562414889046062, "rougeLsum_recall": 0.4817784670028705, "rougeLsum_recall_stderr": 0.0024626011315168057}, "generate_gramatically_correct_text": {"bleu": 7.796909762932712, "bleu_stderr": 0.1669092241005445, "rouge1_fmeasure": 0.2339004008254913, "rouge1_fmeasure_stderr": 0.003879962429844055, "rouge1_precision": 0.23562398578259758, "rouge1_precision_stderr": 0.0043331973616364825, "rouge1_recall": 0.2918086256369194, "rouge1_recall_stderr": 0.005035638323372833, "rouge2_fmeasure": 0.1009343907225879, "rouge2_fmeasure_stderr": 0.0019833361884621687, "rouge2_precision": 0.0868161625025297, "rouge2_precision_stderr": 0.0017252832094021909, "rouge2_recall": 0.1301381299164276, "rouge2_recall_stderr": 0.0026742073604647753, "rougeL_fmeasure": 0.16128440664165175, "rougeL_fmeasure_stderr": 0.0027278295370788417, "rougeL_precision": 0.1717453473169215, "rougeL_precision_stderr": 0.0037266451879112016, "rougeL_recall": 0.20219125520831865, "rougeL_recall_stderr": 0.003636988165748045, "rougeLsum_fmeasure": 0.19430984547637892, "rougeLsum_fmeasure_stderr": 0.00328399177028088, "rougeLsum_precision": 0.20100861293322395, "rougeLsum_precision_stderr": 0.004005483075655645, "rougeLsum_recall": 0.24237757228826143, "rougeLsum_recall_stderr": 0.004281416671478345}, "generate_text_restaurant": {"bleu": 7.017381678234455, "bleu_stderr": 0.09166907850331675, "rouge1_fmeasure": 0.3465967840398284, "rouge1_fmeasure_stderr": 0.0019574393578079014, "rouge1_precision": 0.30143650107181685, "rouge1_precision_stderr": 0.0029314795240309452, "rouge1_recall": 0.4898031820273571, "rouge1_recall_stderr": 0.0027687333603014837, "rouge2_fmeasure": 0.16013598883167798, "rouge2_fmeasure_stderr": 0.0015096765844560063, "rouge2_precision": 0.14062404051839908, "rouge2_precision_stderr": 0.0019112056232477572, "rouge2_recall": 0.2300710505484132, "rouge2_recall_stderr": 0.0021943432548559823, "rougeL_fmeasure": 0.2820802687702902, "rougeL_fmeasure_stderr": 0.0015095048991938657, "rougeL_precision": 0.2422982418880056, "rougeL_precision_stderr": 0.0022053611865584053, "rougeL_recall": 0.4046980533581673, "rougeL_recall_stderr": 0.0024846126430562907, "rougeLsum_fmeasure": 0.28522597669249417, "rougeLsum_fmeasure_stderr": 0.0018960405914839836, "rougeLsum_precision": 0.2479193537232055, "rougeLsum_precision_stderr": 0.0025849971540480303, "rougeLsum_recall": 0.40376209522673123, "rougeLsum_recall_stderr": 0.002721581361466135}, "text": {"bleu": 6.664692810007541, "bleu_stderr": 0.09761641585182086, "rouge1_fmeasure": 0.4334753650888552, "rouge1_fmeasure_stderr": 0.0020383137826383165, "rouge1_precision": 0.3575562551113939, "rouge1_precision_stderr": 0.0021077592166897385, "rouge1_recall": 0.5827020964603545, "rouge1_recall_stderr": 0.0025902702211649093, "rouge2_fmeasure": 0.19812463968549573, "rouge2_fmeasure_stderr": 0.00163502903837983, "rouge2_precision": 0.1624053149694478, "rouge2_precision_stderr": 0.0014639579406940794, "rouge2_recall": 0.27102562833445276, "rouge2_recall_stderr": 0.0022990242634122787, "rougeL_fmeasure": 0.30580137424286685, "rougeL_fmeasure_stderr": 0.001562672024634994, "rougeL_precision": 0.2508887474422941, "rougeL_precision_stderr": 0.0014913740208745742, "rougeL_recall": 0.4154733525023283, "rougeL_recall_stderr": 0.0023523244972575128, "rougeLsum_fmeasure": 0.3638439110752512, "rougeLsum_fmeasure_stderr": 0.001949294337253243, "rougeLsum_precision": 0.29999696611259524, "rougeLsum_precision_stderr": 0.0019295094756439808, "rougeLsum_recall": 0.4897297628793384, "rougeLsum_recall_stderr": 0.002565917788303965}}, "5": {"coherent_text": {"bleu": 7.021064993034315, "bleu_stderr": 0.11474627928215739, "rouge1_fmeasure": 0.38381893127843775, "rouge1_fmeasure_stderr": 0.0018139236221588497, "rouge1_precision": 0.30826387603129907, "rouge1_precision_stderr": 0.0018679719801617597, "rouge1_recall": 0.5491523180108668, "rouge1_recall_stderr": 0.0026220387381628244, "rouge2_fmeasure": 0.17329021394802077, "rouge2_fmeasure_stderr": 0.0014896565428026973, "rouge2_precision": 0.1382607118725467, "rouge2_precision_stderr": 0.0013102373270393998, "rouge2_recall": 0.25338957755007935, "rouge2_recall_stderr": 0.0023051813874414608, "rougeL_fmeasure": 0.2791853032877538, "rougeL_fmeasure_stderr": 0.0013976915078093702, "rougeL_precision": 0.22310499125513847, "rougeL_precision_stderr": 0.0013603626862817944, "rougeL_recall": 0.4041141573216587, "rougeL_recall_stderr": 0.0023508428676916305, "rougeLsum_fmeasure": 0.3260232619607743, "rougeLsum_fmeasure_stderr": 0.0017795108045555923, "rougeLsum_precision": 0.26191481097823616, "rougeLsum_precision_stderr": 0.0017583830712396412, "rougeLsum_recall": 0.4668488083352266, "rougeLsum_recall_stderr": 0.0025930890163178466}, "create_text_for_me": {"bleu": 6.645972521435882, "bleu_stderr": 0.09437494975947229, "rouge1_fmeasure": 0.39134978775086127, "rouge1_fmeasure_stderr": 0.0016911013819947294, "rouge1_precision": 0.3064735649925744, "rouge1_precision_stderr": 0.0016464375750162158, "rouge1_recall": 0.5729264801132828, "rouge1_recall_stderr": 0.0024516925929830738, "rouge2_fmeasure": 0.17585670830781294, "rouge2_fmeasure_stderr": 0.001429411829744073, "rouge2_precision": 0.13656095219775433, "rouge2_precision_stderr": 0.0011845029911870509, "rouge2_recall": 0.2634948500997642, "rouge2_recall_stderr": 0.0022812481290132693, "rougeL_fmeasure": 0.2702295976370569, "rougeL_fmeasure_stderr": 0.0013593970744284988, "rougeL_precision": 0.21048241965429831, "rougeL_precision_stderr": 0.0011953684611401595, "rougeL_recall": 0.40033920081936836, "rougeL_recall_stderr": 0.0023247277061194146, "rougeLsum_fmeasure": 0.3301539965432325, "rougeLsum_fmeasure_stderr": 0.0016676064941407525, "rougeLsum_precision": 0.25867956586812857, "rougeLsum_precision_stderr": 0.0015561827508017755, "rougeLsum_recall": 0.4833218028427562, "rougeLsum_recall_stderr": 0.002451881437295078}, "generate_gramatically_correct_text": {"bleu": 8.504901044297757, "bleu_stderr": 0.1823982585890325, "rouge1_fmeasure": 0.2534732144080346, "rouge1_fmeasure_stderr": 0.0038630129775201146, "rouge1_precision": 0.24905192651947664, "rouge1_precision_stderr": 0.004201658035798914, "rouge1_recall": 0.31491991748172854, "rouge1_recall_stderr": 0.004981037624003543, "rouge2_fmeasure": 0.11077316594795349, "rouge2_fmeasure_stderr": 0.0019906677332412748, "rouge2_precision": 0.09568433998600458, "rouge2_precision_stderr": 0.0017454509782120025, "rouge2_recall": 0.14154088138404378, "rouge2_recall_stderr": 0.002652205184771609, "rougeL_fmeasure": 0.17650040533909794, "rougeL_fmeasure_stderr": 0.0027472261983493805, "rougeL_precision": 0.1812665699547391, "rougeL_precision_stderr": 0.003574420490219075, "rougeL_recall": 0.22010316835835358, "rougeL_recall_stderr": 0.003620300580738604, "rougeLsum_fmeasure": 0.211084813103085, "rougeLsum_fmeasure_stderr": 0.003295971549920327, "rougeLsum_precision": 0.21189846325175168, "rougeLsum_precision_stderr": 0.0038691558037997923, "rougeLsum_recall": 0.26224286165281196, "rougeLsum_recall_stderr": 0.00426905910632388}, "generate_text_restaurant": {"bleu": 7.260211722864896, "bleu_stderr": 0.12365349150703955, "rouge1_fmeasure": 0.3539789382700311, "rouge1_fmeasure_stderr": 0.002014807799247425, "rouge1_precision": 0.3111425574072635, "rouge1_precision_stderr": 0.002912308644972262, "rouge1_recall": 0.486978690036314, "rouge1_recall_stderr": 0.002680144522191287, "rouge2_fmeasure": 0.1648812739511937, "rouge2_fmeasure_stderr": 0.0015410229627803459, "rouge2_precision": 0.14647797107879432, "rouge2_precision_stderr": 0.001938088224804308, "rouge2_recall": 0.2296020100709576, "rouge2_recall_stderr": 0.0021177366913279795, "rougeL_fmeasure": 0.2855363745230388, "rougeL_fmeasure_stderr": 0.0015231421878303527, "rougeL_precision": 0.24849239666244133, "rougeL_precision_stderr": 0.002231342628042193, "rougeL_recall": 0.39930639747976754, "rougeL_recall_stderr": 0.002405155865675547, "rougeLsum_fmeasure": 0.29494158509832297, "rougeLsum_fmeasure_stderr": 0.0019724923487253014, "rougeLsum_precision": 0.25960917213172013, "rougeLsum_precision_stderr": 0.0026365144213955465, "rougeLsum_recall": 0.4057201628355578, "rougeLsum_recall_stderr": 0.002662489705069069}, "text": {"bleu": 6.621008002795477, "bleu_stderr": 0.10561546867270089, "rouge1_fmeasure": 0.42664757166768275, "rouge1_fmeasure_stderr": 0.001997906445835858, "rouge1_precision": 0.35130413207029, "rouge1_precision_stderr": 0.0020584837208460355, "rouge1_recall": 0.5752142038046015, "rouge1_recall_stderr": 0.0025882386282357122, "rouge2_fmeasure": 0.1945014895681582, "rouge2_fmeasure_stderr": 0.0016230097540462368, "rouge2_precision": 0.15915263498125684, "rouge2_precision_stderr": 0.0014446749684239855, "rouge2_recall": 0.26695747744909765, "rouge2_recall_stderr": 0.0022973676919300907, "rougeL_fmeasure": 0.30416823372321367, "rougeL_fmeasure_stderr": 0.0015597528882425847, "rougeL_precision": 0.24911377312443383, "rougeL_precision_stderr": 0.0014837091652911921, "rougeL_recall": 0.41458252977023213, "rougeL_recall_stderr": 0.002378711814640577, "rougeLsum_fmeasure": 0.35902864736200923, "rougeLsum_fmeasure_stderr": 0.0019327436427338909, "rougeLsum_precision": 0.2956310081550243, "rougeLsum_precision_stderr": 0.001909657845289513, "rougeLsum_recall": 0.4844948513886233, "rougeLsum_recall_stderr": 0.0025773079411811253}}}, "gem_xsum": {"0": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.7010026542087479, "bleu_stderr": 0.05557886575567746, "rouge1_fmeasure": 0.14045583841997622, "rouge1_fmeasure_stderr": 0.0021460672635315163, "rouge1_precision": 0.09973135507547863, "rouge1_precision_stderr": 0.0015973080444745867, "rouge1_recall": 0.2483551581802608, "rouge1_recall_stderr": 0.0036884551976866927, "rouge2_fmeasure": 0.0190566029197429, "rouge2_fmeasure_stderr": 0.0009165165117914078, "rouge2_precision": 0.013401782513852779, "rouge2_precision_stderr": 0.0006463045667296502, "rouge2_recall": 0.034653579743229065, "rouge2_recall_stderr": 0.001719993783733488, "rougeL_fmeasure": 0.11250509517032704, "rougeL_fmeasure_stderr": 0.0015602744886009001, "rougeL_precision": 0.07968673916612877, "rougeL_precision_stderr": 0.0011529481491568863, "rougeL_recall": 0.20035694919481709, "rougeL_recall_stderr": 0.0028106923696229165, "rougeLsum_fmeasure": 0.11288459242315821, "rougeLsum_fmeasure_stderr": 0.0017514781321482647, "rougeLsum_precision": 0.07993531686685008, "rougeLsum_precision_stderr": 0.0012877020733687047, "rougeLsum_recall": 0.20117384720414933, "rougeLsum_recall_stderr": 0.0031333624545092204}, "DOC_tldr": {"bleu": 2.0794867877910552, "bleu_stderr": 0.08783103175339281, "rouge1_fmeasure": 0.21057429412703157, "rouge1_fmeasure_stderr": 0.0029620022295376264, "rouge1_precision": 0.17644165930352093, "rouge1_precision_stderr": 0.0035159930377663164, "rouge1_recall": 0.31669581782847406, "rouge1_recall_stderr": 0.004204880715544353, "rouge2_fmeasure": 0.05349637631115593, "rouge2_fmeasure_stderr": 0.0018866213566015306, "rouge2_precision": 0.045892492699293866, "rouge2_precision_stderr": 0.0019747730351747164, "rouge2_recall": 0.08074272013017356, "rouge2_recall_stderr": 0.0027066202276012723, "rougeL_fmeasure": 0.16723801755038487, "rougeL_fmeasure_stderr": 0.0023715164713580613, "rougeL_precision": 0.13966279995339326, "rougeL_precision_stderr": 0.0027977834380213088, "rougeL_recall": 0.2533248436654788, "rougeL_recall_stderr": 0.0034708540063103682, "rougeLsum_fmeasure": 0.16467108129832378, "rougeLsum_fmeasure_stderr": 0.002492627666300806, "rougeLsum_precision": 0.13757208272049673, "rougeLsum_precision_stderr": 0.0028514349754435358, "rougeLsum_recall": 0.2496171915878273, "rougeLsum_recall_stderr": 0.003711257561558963}, "article_DOC_summary": {"bleu": 2.0719375915266327, "bleu_stderr": 0.09381739892136316, "rouge1_fmeasure": 0.21991779087563262, "rouge1_fmeasure_stderr": 0.0025921031610263, "rouge1_precision": 0.17021358510341336, "rouge1_precision_stderr": 0.0024373039382226077, "rouge1_recall": 0.3520563857681371, "rouge1_recall_stderr": 0.0043918973093069365, "rouge2_fmeasure": 0.050742967235947956, "rouge2_fmeasure_stderr": 0.0016968813344534614, "rouge2_precision": 0.03871189783801819, "rouge2_precision_stderr": 0.0013940343459980615, "rouge2_recall": 0.0841631095984103, "rouge2_recall_stderr": 0.002861696063072104, "rougeL_fmeasure": 0.1634042310463717, "rougeL_fmeasure_stderr": 0.002018854408540066, "rougeL_precision": 0.12632426569550884, "rougeL_precision_stderr": 0.001895537050144316, "rougeL_recall": 0.262981285619352, "rougeL_recall_stderr": 0.0034911247032713426, "rougeLsum_fmeasure": 0.17070284614092165, "rougeLsum_fmeasure_stderr": 0.0021883919787213176, "rougeLsum_precision": 0.13144702322249674, "rougeLsum_precision_stderr": 0.001966787320258499, "rougeLsum_recall": 0.2761271441560725, "rougeLsum_recall_stderr": 0.0039020054143034}, "summarize_DOC": {"bleu": 1.6729531930201382, "bleu_stderr": 0.10373342822948405, "rouge1_fmeasure": 0.20603426954244244, "rouge1_fmeasure_stderr": 0.002503133782526908, "rouge1_precision": 0.15669364098257, "rouge1_precision_stderr": 0.0023593186592923633, "rouge1_recall": 0.342374371356989, "rouge1_recall_stderr": 0.00437907614669953, "rouge2_fmeasure": 0.040126835769534804, "rouge2_fmeasure_stderr": 0.0014339486984846503, "rouge2_precision": 0.030783905399410026, "rouge2_precision_stderr": 0.0014282737935382643, "rouge2_recall": 0.06881031698129042, "rouge2_recall_stderr": 0.00251481083212002, "rougeL_fmeasure": 0.146668869960305, "rougeL_fmeasure_stderr": 0.0018236466465731617, "rougeL_precision": 0.1118036127326331, "rougeL_precision_stderr": 0.0018294125382753313, "rougeL_recall": 0.24498174699159114, "rougeL_recall_stderr": 0.0033060394110144436, "rougeLsum_fmeasure": 0.16072295732233483, "rougeLsum_fmeasure_stderr": 0.0020479535637844687, "rougeLsum_precision": 0.1218098460935552, "rougeLsum_precision_stderr": 0.001924491999404007, "rougeLsum_recall": 0.2695259798153811, "rougeLsum_recall_stderr": 0.0037470358825042837}, "summarize_this_DOC_summary": {"bleu": 2.139796123448687, "bleu_stderr": 0.0864912451995421, "rouge1_fmeasure": 0.2156313325024613, "rouge1_fmeasure_stderr": 0.002659844715910497, "rouge1_precision": 0.161357867973778, "rouge1_precision_stderr": 0.0023312739071830115, "rouge1_recall": 0.3579687540791487, "rouge1_recall_stderr": 0.0045453513416916095, "rouge2_fmeasure": 0.05124089074244038, "rouge2_fmeasure_stderr": 0.0017434147699145796, "rouge2_precision": 0.03816216737627533, "rouge2_precision_stderr": 0.0014424649587105316, "rouge2_recall": 0.08755656958993782, "rouge2_recall_stderr": 0.0030000299859141506, "rougeL_fmeasure": 0.16164669459673872, "rougeL_fmeasure_stderr": 0.0020770948236222796, "rougeL_precision": 0.12090520179487235, "rougeL_precision_stderr": 0.0018371253252162907, "rougeL_recall": 0.2697345298612283, "rougeL_recall_stderr": 0.003627327469111352, "rougeLsum_fmeasure": 0.16919936670173066, "rougeLsum_fmeasure_stderr": 0.0022655905836503737, "rougeLsum_precision": 0.1263110602681917, "rougeLsum_precision_stderr": 0.0019426636581592435, "rougeLsum_recall": 0.2828419243164692, "rougeLsum_recall_stderr": 0.003990709561595801}}, "1": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.6799429561855435, "bleu_stderr": 0.04879251393094118, "rouge1_fmeasure": 0.15840047240909824, "rouge1_fmeasure_stderr": 0.00214692641322964, "rouge1_precision": 0.11279192477107143, "rouge1_precision_stderr": 0.0016032349108474262, "rouge1_recall": 0.2766207479271877, "rouge1_recall_stderr": 0.0036066236500548418, "rouge2_fmeasure": 0.01908469081931983, "rouge2_fmeasure_stderr": 0.000926119882035849, "rouge2_precision": 0.013486589799660116, "rouge2_precision_stderr": 0.0006573025385863716, "rouge2_recall": 0.033834009894690674, "rouge2_recall_stderr": 0.0016640769892439025, "rougeL_fmeasure": 0.11129001725840247, "rougeL_fmeasure_stderr": 0.0014098818626850234, "rougeL_precision": 0.07903088461894958, "rougeL_precision_stderr": 0.001042052923479771, "rougeL_recall": 0.195968848936739, "rougeL_recall_stderr": 0.002491056986604568, "rougeLsum_fmeasure": 0.1276356932508086, "rougeLsum_fmeasure_stderr": 0.0016950763004042622, "rougeLsum_precision": 0.09067322769906408, "rougeLsum_precision_stderr": 0.001252995528243287, "rougeLsum_recall": 0.22440113001075607, "rougeLsum_recall_stderr": 0.0029610037715897537}, "DOC_tldr": {"bleu": 2.063408955221386, "bleu_stderr": 0.10793829466409724, "rouge1_fmeasure": 0.20366433099160464, "rouge1_fmeasure_stderr": 0.002506589804140991, "rouge1_precision": 0.1452348716004801, "rouge1_precision_stderr": 0.0018745609056768366, "rouge1_recall": 0.354637619733734, "rouge1_recall_stderr": 0.004310412338891105, "rouge2_fmeasure": 0.051542487477497304, "rouge2_fmeasure_stderr": 0.001622782922318877, "rouge2_precision": 0.03631887785067335, "rouge2_precision_stderr": 0.0011535908212142747, "rouge2_recall": 0.09256525737403981, "rouge2_recall_stderr": 0.0029692107599791377, "rougeL_fmeasure": 0.16066302558679957, "rougeL_fmeasure_stderr": 0.0020034483691429915, "rougeL_precision": 0.11435270633499948, "rougeL_precision_stderr": 0.0014769836849210006, "rougeL_recall": 0.281332040329211, "rougeL_recall_stderr": 0.0035948252196224143, "rougeLsum_fmeasure": 0.161499696273688, "rougeLsum_fmeasure_stderr": 0.002187584389664099, "rougeLsum_precision": 0.1148601995557349, "rougeLsum_precision_stderr": 0.0016090320139797805, "rougeLsum_recall": 0.2832784836907842, "rougeLsum_recall_stderr": 0.0038965527110954635}, "article_DOC_summary": {"bleu": 1.5179351179741758, "bleu_stderr": 0.05443901746520919, "rouge1_fmeasure": 0.18467573869385082, "rouge1_fmeasure_stderr": 0.0024560883653668973, "rouge1_precision": 0.13144344568204896, "rouge1_precision_stderr": 0.0018269600623641938, "rouge1_recall": 0.32351107047900046, "rouge1_recall_stderr": 0.0042227591238073415, "rouge2_fmeasure": 0.03956915695649403, "rouge2_fmeasure_stderr": 0.0013969113381600835, "rouge2_precision": 0.027812878457341133, "rouge2_precision_stderr": 0.0009822890456735115, "rouge2_recall": 0.07174244190002714, "rouge2_recall_stderr": 0.002623646025136939, "rougeL_fmeasure": 0.1433445973017209, "rougeL_fmeasure_stderr": 0.0018589035220238875, "rougeL_precision": 0.10177712280555466, "rougeL_precision_stderr": 0.0013642070339291124, "rougeL_recall": 0.2530115289147634, "rougeL_recall_stderr": 0.003361296390437375, "rougeLsum_fmeasure": 0.1463591812261117, "rougeLsum_fmeasure_stderr": 0.0020474760089749054, "rougeLsum_precision": 0.10392255150260828, "rougeLsum_precision_stderr": 0.0014996096643151138, "rougeLsum_recall": 0.25806957261490177, "rougeLsum_recall_stderr": 0.003642123743587609}, "summarize_DOC": {"bleu": 1.9783686260260467, "bleu_stderr": 0.1607197494307949, "rouge1_fmeasure": 0.20772684063267086, "rouge1_fmeasure_stderr": 0.0024574107856270774, "rouge1_precision": 0.14788910472235636, "rouge1_precision_stderr": 0.0018389112071502275, "rouge1_recall": 0.3632558549763597, "rouge1_recall_stderr": 0.004249174048494396, "rouge2_fmeasure": 0.04851112854401421, "rouge2_fmeasure_stderr": 0.0015396516708931478, "rouge2_precision": 0.03422805026149446, "rouge2_precision_stderr": 0.0010929716541644215, "rouge2_recall": 0.08684664107270823, "rouge2_recall_stderr": 0.002805600167681186, "rougeL_fmeasure": 0.1542221581796846, "rougeL_fmeasure_stderr": 0.0018545206679636554, "rougeL_precision": 0.10960862089079661, "rougeL_precision_stderr": 0.0013730570572271972, "rougeL_recall": 0.27132452858310807, "rougeL_recall_stderr": 0.003336994048105982, "rougeLsum_fmeasure": 0.1666793386233674, "rougeLsum_fmeasure_stderr": 0.0020689435117509725, "rougeLsum_precision": 0.11840342830898004, "rougeLsum_precision_stderr": 0.0015232398298886394, "rougeLsum_recall": 0.2933510485102284, "rougeLsum_recall_stderr": 0.0037203141797391708}, "summarize_this_DOC_summary": {"bleu": 1.4741686816314712, "bleu_stderr": 0.09394903896491678, "rouge1_fmeasure": 0.1823978250185138, "rouge1_fmeasure_stderr": 0.002494889694305653, "rouge1_precision": 0.13025847232053914, "rouge1_precision_stderr": 0.0018599814789393584, "rouge1_recall": 0.31675206292581976, "rouge1_recall_stderr": 0.004227418702250003, "rouge2_fmeasure": 0.03871722499957788, "rouge2_fmeasure_stderr": 0.0014304008993493518, "rouge2_precision": 0.027325104152554167, "rouge2_precision_stderr": 0.001013865586628979, "rouge2_recall": 0.0693976720523522, "rouge2_recall_stderr": 0.0026208374515650262, "rougeL_fmeasure": 0.14404400088909086, "rougeL_fmeasure_stderr": 0.0018950818451793567, "rougeL_precision": 0.10265566343076016, "rougeL_precision_stderr": 0.0013991604181805165, "rougeL_recall": 0.251846472183418, "rougeL_recall_stderr": 0.0033529856967831223, "rougeLsum_fmeasure": 0.1425722908096705, "rougeLsum_fmeasure_stderr": 0.0020191945274259963, "rougeLsum_precision": 0.10154113890914356, "rougeLsum_precision_stderr": 0.0014807942774655064, "rougeLsum_recall": 0.2494785849680061, "rougeLsum_recall_stderr": 0.0035726690764623942}}, "2": {"DOC_boils_down_to_simple_idea_that": {"bleu": 1.0018438511511791, "bleu_stderr": 0.04226334665257509, "rouge1_fmeasure": 0.1651965564409125, "rouge1_fmeasure_stderr": 0.0023832852201559826, "rouge1_precision": 0.11759195430339829, "rouge1_precision_stderr": 0.0017689078804778012, "rouge1_recall": 0.2889675810942405, "rouge1_recall_stderr": 0.004062189732858682, "rouge2_fmeasure": 0.026322118720045605, "rouge2_fmeasure_stderr": 0.0011845918036721787, "rouge2_precision": 0.018563434665803503, "rouge2_precision_stderr": 0.0008384469201564846, "rouge2_recall": 0.047169939804170904, "rouge2_recall_stderr": 0.002175973099128802, "rougeL_fmeasure": 0.12076659101965195, "rougeL_fmeasure_stderr": 0.0016553933643729203, "rougeL_precision": 0.08578950235723362, "rougeL_precision_stderr": 0.0012213960229954062, "rougeL_recall": 0.21259477004430985, "rougeL_recall_stderr": 0.002927850647938266, "rougeLsum_fmeasure": 0.13222073865796682, "rougeLsum_fmeasure_stderr": 0.0019276011969453877, "rougeLsum_precision": 0.09390304577868243, "rougeLsum_precision_stderr": 0.0014156962162275915, "rougeLsum_recall": 0.23292557695865002, "rougeLsum_recall_stderr": 0.0034258792471330533}, "DOC_tldr": {"bleu": 2.0662695038715833, "bleu_stderr": 0.11604502835868223, "rouge1_fmeasure": 0.20750338741773555, "rouge1_fmeasure_stderr": 0.0025080223518937884, "rouge1_precision": 0.14803725413512445, "rouge1_precision_stderr": 0.0018733512973581588, "rouge1_recall": 0.3610354652240794, "rouge1_recall_stderr": 0.004389296174890975, "rouge2_fmeasure": 0.05337831779753894, "rouge2_fmeasure_stderr": 0.001659922934200303, "rouge2_precision": 0.03764444720923887, "rouge2_precision_stderr": 0.0011831632104640343, "rouge2_recall": 0.09572292100591462, "rouge2_recall_stderr": 0.003053817939922839, "rougeL_fmeasure": 0.16422047885591626, "rougeL_fmeasure_stderr": 0.0020030333826574695, "rougeL_precision": 0.117013811273496, "rougeL_precision_stderr": 0.001487754889983635, "rougeL_recall": 0.28688812754471305, "rougeL_recall_stderr": 0.003583873941321553, "rougeLsum_fmeasure": 0.16481921665835544, "rougeLsum_fmeasure_stderr": 0.0021846730064098525, "rougeLsum_precision": 0.11725652432083132, "rougeLsum_precision_stderr": 0.0016010042579447314, "rougeLsum_recall": 0.2890122736547096, "rougeLsum_recall_stderr": 0.003981206683676861}, "article_DOC_summary": {"bleu": 1.5610996318449655, "bleu_stderr": 0.061361742735683046, "rouge1_fmeasure": 0.18395936513372504, "rouge1_fmeasure_stderr": 0.0023841684164883858, "rouge1_precision": 0.1310122719121047, "rouge1_precision_stderr": 0.001774463078853173, "rouge1_recall": 0.32197642898756434, "rouge1_recall_stderr": 0.004128514222989525, "rouge2_fmeasure": 0.040730679478674064, "rouge2_fmeasure_stderr": 0.0014445249170945054, "rouge2_precision": 0.028592074721851848, "rouge2_precision_stderr": 0.0010146723977319822, "rouge2_recall": 0.07420088979991933, "rouge2_recall_stderr": 0.0027242327105131063, "rougeL_fmeasure": 0.14791071749656537, "rougeL_fmeasure_stderr": 0.0018605423267814124, "rougeL_precision": 0.10506688444375611, "rougeL_precision_stderr": 0.0013615120559406814, "rougeL_recall": 0.2609222630071341, "rougeL_recall_stderr": 0.0034251450207401918, "rougeLsum_fmeasure": 0.1463682832676705, "rougeLsum_fmeasure_stderr": 0.0020126152822738364, "rougeLsum_precision": 0.1039952495955331, "rougeLsum_precision_stderr": 0.00147131239785375, "rougeLsum_recall": 0.2579710996210495, "rougeLsum_recall_stderr": 0.0036391736637925577}, "summarize_DOC": {"bleu": 1.802744798920398, "bleu_stderr": 0.05920522944838898, "rouge1_fmeasure": 0.20419784491514162, "rouge1_fmeasure_stderr": 0.0025189481605346693, "rouge1_precision": 0.14591017846113652, "rouge1_precision_stderr": 0.0018829813606970626, "rouge1_recall": 0.35417878162606353, "rouge1_recall_stderr": 0.004336112520719766, "rouge2_fmeasure": 0.04736923476037229, "rouge2_fmeasure_stderr": 0.0015840684340098893, "rouge2_precision": 0.033450171038277396, "rouge2_precision_stderr": 0.0011255630755179697, "rouge2_recall": 0.08454708459773036, "rouge2_recall_stderr": 0.0028776060737512313, "rougeL_fmeasure": 0.15747382683561614, "rougeL_fmeasure_stderr": 0.0019009865513009152, "rougeL_precision": 0.112357445163031, "rougeL_precision_stderr": 0.0014098869920409992, "rougeL_recall": 0.27457572511057443, "rougeL_recall_stderr": 0.003414209315676662, "rougeLsum_fmeasure": 0.16076832155716242, "rougeLsum_fmeasure_stderr": 0.0021544746480995827, "rougeLsum_precision": 0.11456495811701795, "rougeLsum_precision_stderr": 0.0015800062086523089, "rougeLsum_recall": 0.28097827542002, "rougeLsum_recall_stderr": 0.0038590856032283075}, "summarize_this_DOC_summary": {"bleu": 1.4891373477876866, "bleu_stderr": 0.0811523557434027, "rouge1_fmeasure": 0.17515450965695337, "rouge1_fmeasure_stderr": 0.002375995326570658, "rouge1_precision": 0.12565737450340297, "rouge1_precision_stderr": 0.0017756118340194749, "rouge1_recall": 0.301251337982613, "rouge1_recall_stderr": 0.004081353108976665, "rouge2_fmeasure": 0.038463882894735665, "rouge2_fmeasure_stderr": 0.0014439165288524466, "rouge2_precision": 0.02725985494322299, "rouge2_precision_stderr": 0.0010257708768464891, "rouge2_recall": 0.06822620908929833, "rouge2_recall_stderr": 0.002624357062351836, "rougeL_fmeasure": 0.14750967049873018, "rougeL_fmeasure_stderr": 0.0019352926679694342, "rougeL_precision": 0.10562763001104193, "rougeL_precision_stderr": 0.0014305250731057942, "rougeL_recall": 0.2550603689921568, "rougeL_recall_stderr": 0.0034548246638929424, "rougeLsum_fmeasure": 0.1340651218539847, "rougeLsum_fmeasure_stderr": 0.0019565808957794375, "rougeLsum_precision": 0.09586724631609989, "rougeLsum_precision_stderr": 0.001432813745557231, "rougeLsum_recall": 0.23246625054729966, "rougeLsum_recall_stderr": 0.0034962734013648925}}, "3": {"DOC_boils_down_to_simple_idea_that": {"bleu": 1.3657013486015295, "bleu_stderr": 0.08729701107016043, "rouge1_fmeasure": 0.16959674585893536, "rouge1_fmeasure_stderr": 0.002631406491027894, "rouge1_precision": 0.12268247063515418, "rouge1_precision_stderr": 0.002004429738071348, "rouge1_recall": 0.2920661828398265, "rouge1_recall_stderr": 0.004556228488458248, "rouge2_fmeasure": 0.03227615271942288, "rouge2_fmeasure_stderr": 0.0013255534293243935, "rouge2_precision": 0.022908311173769465, "rouge2_precision_stderr": 0.0009443759203422804, "rouge2_recall": 0.057536503485596614, "rouge2_recall_stderr": 0.0024061542462623067, "rougeL_fmeasure": 0.12716380123453083, "rougeL_fmeasure_stderr": 0.001928363853274409, "rougeL_precision": 0.09180476596307438, "rougeL_precision_stderr": 0.001465280362062083, "rougeL_recall": 0.22042631379574948, "rougeL_recall_stderr": 0.0034317489548799755, "rougeLsum_fmeasure": 0.13571148483109557, "rougeLsum_fmeasure_stderr": 0.0021714373357817397, "rougeLsum_precision": 0.09794481566650097, "rougeLsum_precision_stderr": 0.0016340921798746898, "rougeLsum_recall": 0.23499521663243025, "rougeLsum_recall_stderr": 0.0038347753688523496}, "DOC_tldr": {"bleu": 2.187340339866472, "bleu_stderr": 0.09337748219810847, "rouge1_fmeasure": 0.20491006822687677, "rouge1_fmeasure_stderr": 0.002761147760944239, "rouge1_precision": 0.15005408001279993, "rouge1_precision_stderr": 0.0022275262688393857, "rouge1_recall": 0.34931766468701486, "rouge1_recall_stderr": 0.004799937617922755, "rouge2_fmeasure": 0.052933674983345634, "rouge2_fmeasure_stderr": 0.0017273248399238827, "rouge2_precision": 0.03817756156834947, "rouge2_precision_stderr": 0.0012790098539207952, "rouge2_recall": 0.09333832339595143, "rouge2_recall_stderr": 0.0031111903067984276, "rougeL_fmeasure": 0.15924458772595346, "rougeL_fmeasure_stderr": 0.002153213639538366, "rougeL_precision": 0.11653214452758791, "rougeL_precision_stderr": 0.0017452636081294359, "rougeL_recall": 0.27290106515166107, "rougeL_recall_stderr": 0.0038631652216362072, "rougeLsum_fmeasure": 0.16175793225879814, "rougeLsum_fmeasure_stderr": 0.0023311720661425635, "rougeLsum_precision": 0.11841489997398834, "rougeLsum_precision_stderr": 0.0018745424755303739, "rougeLsum_recall": 0.2773235770549664, "rougeLsum_recall_stderr": 0.00417220414799562}, "article_DOC_summary": {"bleu": 1.5762937299614814, "bleu_stderr": 0.07362827411239845, "rouge1_fmeasure": 0.17637720631581139, "rouge1_fmeasure_stderr": 0.00264797834039018, "rouge1_precision": 0.1284212760059797, "rouge1_precision_stderr": 0.002041598803417733, "rouge1_recall": 0.3028097339054088, "rouge1_recall_stderr": 0.004615535937102858, "rouge2_fmeasure": 0.03767895922224648, "rouge2_fmeasure_stderr": 0.001448043764579111, "rouge2_precision": 0.026902495307978534, "rouge2_precision_stderr": 0.0010371639486970465, "rouge2_recall": 0.06717388422150615, "rouge2_recall_stderr": 0.0027080652362817086, "rougeL_fmeasure": 0.14193633867939442, "rougeL_fmeasure_stderr": 0.002090431283179229, "rougeL_precision": 0.1032652428870357, "rougeL_precision_stderr": 0.001609764469770287, "rougeL_recall": 0.24490235352225267, "rougeL_recall_stderr": 0.003766948264946798, "rougeLsum_fmeasure": 0.13894288832668733, "rougeLsum_fmeasure_stderr": 0.002188039489816412, "rougeLsum_precision": 0.10103180270559059, "rougeLsum_precision_stderr": 0.0016751299955245328, "rougeLsum_recall": 0.2403006525007405, "rougeLsum_recall_stderr": 0.003943762632099877}, "summarize_DOC": {"bleu": 1.7884006934583645, "bleu_stderr": 0.07907574142987076, "rouge1_fmeasure": 0.19422961589893378, "rouge1_fmeasure_stderr": 0.0026855658611884946, "rouge1_precision": 0.14153741165892486, "rouge1_precision_stderr": 0.0020927973441702535, "rouge1_recall": 0.33105961459502653, "rouge1_recall_stderr": 0.004650854800318841, "rouge2_fmeasure": 0.04353291741965738, "rouge2_fmeasure_stderr": 0.0015306953449422075, "rouge2_precision": 0.031230937266789643, "rouge2_precision_stderr": 0.0011103593069502417, "rouge2_recall": 0.07676489034975985, "rouge2_recall_stderr": 0.0027744764471766274, "rougeL_fmeasure": 0.1484909183587473, "rougeL_fmeasure_stderr": 0.0020672313290265535, "rougeL_precision": 0.10809159570024703, "rougeL_precision_stderr": 0.001596172312046679, "rougeL_recall": 0.25411156370365817, "rougeL_recall_stderr": 0.0036584523222178214, "rougeLsum_fmeasure": 0.15261940522006723, "rougeLsum_fmeasure_stderr": 0.002267767106091909, "rougeLsum_precision": 0.11098479481711339, "rougeLsum_precision_stderr": 0.0017470432310918238, "rougeLsum_recall": 0.26198935148766644, "rougeLsum_recall_stderr": 0.004040675001072013}, "summarize_this_DOC_summary": {"bleu": 1.548542398410883, "bleu_stderr": 0.07300875765901975, "rouge1_fmeasure": 0.1652692842917284, "rouge1_fmeasure_stderr": 0.002687544705180198, "rouge1_precision": 0.12133492152104836, "rouge1_precision_stderr": 0.002108052911078574, "rouge1_recall": 0.2771118310610424, "rouge1_recall_stderr": 0.004513875830290629, "rouge2_fmeasure": 0.0361588906854937, "rouge2_fmeasure_stderr": 0.0015014165247081573, "rouge2_precision": 0.02620916447329947, "rouge2_precision_stderr": 0.0011062000625487128, "rouge2_recall": 0.062338878703100085, "rouge2_recall_stderr": 0.002685465546018337, "rougeL_fmeasure": 0.14006875772839858, "rougeL_fmeasure_stderr": 0.0021980071144352764, "rougeL_precision": 0.10257894420601281, "rougeL_precision_stderr": 0.0017009747933540017, "rougeL_recall": 0.23624832791965056, "rougeL_recall_stderr": 0.0038268204334818137, "rougeLsum_fmeasure": 0.1270213480349254, "rougeLsum_fmeasure_stderr": 0.0022098783786563886, "rougeLsum_precision": 0.09313729488739347, "rougeLsum_precision_stderr": 0.001717942721208578, "rougeLsum_recall": 0.2143290252431487, "rougeLsum_recall_stderr": 0.0038151394078170886}}, "4": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.6251466968093098, "bleu_stderr": 0.11921280575698076, "rouge1_fmeasure": 0.0434980036557313, "rouge1_fmeasure_stderr": 0.002547351830150303, "rouge1_precision": 0.0353729182277329, "rouge1_precision_stderr": 0.002120724556038934, "rouge1_recall": 0.06963538400891243, "rouge1_recall_stderr": 0.004245614128488828, "rouge2_fmeasure": 0.00873416530365632, "rouge2_fmeasure_stderr": 0.0008131278526907818, "rouge2_precision": 0.006434256872019694, "rouge2_precision_stderr": 0.0006047069516458604, "rouge2_recall": 0.015043103963332192, "rouge2_recall_stderr": 0.0014409503521463944, "rougeL_fmeasure": 0.03275624297934461, "rougeL_fmeasure_stderr": 0.0019019226248188242, "rougeL_precision": 0.02693683498375084, "rougeL_precision_stderr": 0.0016356740431801658, "rougeL_recall": 0.0526495618772081, "rougeL_recall_stderr": 0.003212351040820403, "rougeLsum_fmeasure": 0.03526865532078303, "rougeLsum_fmeasure_stderr": 0.0020893045270848074, "rougeLsum_precision": 0.02896015737664471, "rougeLsum_precision_stderr": 0.0017797342286746572, "rougeLsum_recall": 0.056639676651518825, "rougeLsum_recall_stderr": 0.003532055034648066}, "DOC_tldr": {"bleu": 1.1685643584135952, "bleu_stderr": 0.1951566077821766, "rouge1_fmeasure": 0.05673653310470254, "rouge1_fmeasure_stderr": 0.0030997047693230627, "rouge1_precision": 0.051357937177083465, "rouge1_precision_stderr": 0.0034115765564453455, "rouge1_recall": 0.08957854109700515, "rouge1_recall_stderr": 0.0050933482072906084, "rouge2_fmeasure": 0.01418426797251855, "rouge2_fmeasure_stderr": 0.0012124277033053392, "rouge2_precision": 0.011627742228248835, "rouge2_precision_stderr": 0.0012945900748551743, "rouge2_recall": 0.023933291444758153, "rouge2_recall_stderr": 0.002099710700842424, "rougeL_fmeasure": 0.04423726916636083, "rougeL_fmeasure_stderr": 0.002428377343138416, "rougeL_precision": 0.04126617524992539, "rougeL_precision_stderr": 0.0029705705175859863, "rougeL_recall": 0.06969693031985601, "rougeL_recall_stderr": 0.003989487538391626, "rougeLsum_fmeasure": 0.045736152774752646, "rougeLsum_fmeasure_stderr": 0.0025402110740203494, "rougeLsum_precision": 0.04252467493794107, "rougeLsum_precision_stderr": 0.0030391760211394485, "rougeLsum_recall": 0.07216556113564128, "rougeLsum_recall_stderr": 0.004193160744760016}, "article_DOC_summary": {"bleu": 0.7513821615574038, "bleu_stderr": 0.07905589981346285, "rouge1_fmeasure": 0.048911634240246346, "rouge1_fmeasure_stderr": 0.002765697903406571, "rouge1_precision": 0.041468733586705914, "rouge1_precision_stderr": 0.0025971833623889785, "rouge1_recall": 0.07627594081632273, "rouge1_recall_stderr": 0.004409224115860876, "rouge2_fmeasure": 0.01035393012550112, "rouge2_fmeasure_stderr": 0.0009785561170831303, "rouge2_precision": 0.009048514108124975, "rouge2_precision_stderr": 0.0012181947613622947, "rouge2_recall": 0.01699435587690579, "rouge2_recall_stderr": 0.001625353914816647, "rougeL_fmeasure": 0.03907075670579812, "rougeL_fmeasure_stderr": 0.002182547417010533, "rougeL_precision": 0.03361424158025485, "rougeL_precision_stderr": 0.002192257614857122, "rougeL_recall": 0.06101985571016838, "rougeL_recall_stderr": 0.0035062484157730514, "rougeLsum_fmeasure": 0.039302040446406394, "rougeLsum_fmeasure_stderr": 0.0022230245548452298, "rougeLsum_precision": 0.03401363654888806, "rougeLsum_precision_stderr": 0.0022524026754152373, "rougeLsum_recall": 0.0612418372651088, "rougeLsum_recall_stderr": 0.0035630181436990096}, "summarize_DOC": {"bleu": 0.858896407235953, "bleu_stderr": 0.17813375842071116, "rouge1_fmeasure": 0.05196928184468956, "rouge1_fmeasure_stderr": 0.0028568557569465304, "rouge1_precision": 0.04322202252738077, "rouge1_precision_stderr": 0.0026080541844490843, "rouge1_recall": 0.08145947706027855, "rouge1_recall_stderr": 0.004597925269874288, "rouge2_fmeasure": 0.011212198666180598, "rouge2_fmeasure_stderr": 0.0009995094999208916, "rouge2_precision": 0.009217798283394814, "rouge2_precision_stderr": 0.0011515928095724962, "rouge2_recall": 0.01852184368380728, "rouge2_recall_stderr": 0.001676813297073238, "rougeL_fmeasure": 0.04056089023030101, "rougeL_fmeasure_stderr": 0.002247569677623111, "rougeL_precision": 0.034066440428070936, "rougeL_precision_stderr": 0.00214328249850724, "rougeL_recall": 0.06359700022888996, "rougeL_recall_stderr": 0.0036377298500940803, "rougeLsum_fmeasure": 0.04238835402403162, "rougeLsum_fmeasure_stderr": 0.0023646209795898624, "rougeLsum_precision": 0.03534210706251266, "rougeLsum_precision_stderr": 0.0022064701869351773, "rougeLsum_recall": 0.06678926323593559, "rougeLsum_recall_stderr": 0.0038482831513349976}, "summarize_this_DOC_summary": {"bleu": 0.6256689661157525, "bleu_stderr": 0.07517644959617945, "rouge1_fmeasure": 0.042930043440632525, "rouge1_fmeasure_stderr": 0.002579054260477061, "rouge1_precision": 0.03643719370186493, "rouge1_precision_stderr": 0.002327364890907121, "rouge1_recall": 0.06520608390203138, "rouge1_recall_stderr": 0.003996161845682704, "rouge2_fmeasure": 0.009582916301059853, "rouge2_fmeasure_stderr": 0.0009704986418888822, "rouge2_precision": 0.007917882560037967, "rouge2_precision_stderr": 0.0009203895438615683, "rouge2_recall": 0.01504105351920977, "rouge2_recall_stderr": 0.0015317801562663841, "rougeL_fmeasure": 0.0355906132515583, "rougeL_fmeasure_stderr": 0.0021268636919251588, "rougeL_precision": 0.030403765142791965, "rougeL_precision_stderr": 0.0019637434849462013, "rougeL_recall": 0.05414283521586396, "rougeL_recall_stderr": 0.0033076203364920415, "rougeLsum_fmeasure": 0.033437432695987035, "rougeLsum_fmeasure_stderr": 0.0020528905348075315, "rougeLsum_precision": 0.028885184139424556, "rougeLsum_precision_stderr": 0.0019316925983647367, "rougeLsum_recall": 0.050662384755815255, "rougeLsum_recall_stderr": 0.0031895292441193332}}, "5": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0003020119153708095, "rouge1_fmeasure_stderr": 0.0001543461059925123, "rouge1_precision": 0.003430531732418525, "rouge1_precision_stderr": 0.0017130559457731787, "rouge1_recall": 0.0001583205885417987, "rouge1_recall_stderr": 8.10965586465461e-05, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0003020119153708095, "rougeL_fmeasure_stderr": 0.0001543461059925123, "rougeL_precision": 0.003430531732418525, "rougeL_precision_stderr": 0.0017130559457731787, "rougeL_recall": 0.0001583205885417987, "rougeL_recall_stderr": 8.10965586465461e-05, "rougeLsum_fmeasure": 0.0003020119153708095, "rougeLsum_fmeasure_stderr": 0.0001543461059925123, "rougeLsum_precision": 0.003430531732418525, "rougeLsum_precision_stderr": 0.0017130559457731787, "rougeLsum_recall": 0.0001583205885417987, "rougeLsum_recall_stderr": 8.10965586465461e-05}, "DOC_tldr": {"bleu": 9.155484870335624e-42, "bleu_stderr": 1.4620320117295877e-34, "rouge1_fmeasure": 0.002332190655682947, "rouge1_fmeasure_stderr": 0.0006168325099045509, "rouge1_precision": 0.002815097659210261, "rouge1_precision_stderr": 0.0007550078312773055, "rouge1_recall": 0.002058439850730106, "rouge1_recall_stderr": 0.0005489718025062792, "rouge2_fmeasure": 0.000325473526945072, "rouge2_fmeasure_stderr": 0.00019692751752173486, "rouge2_precision": 0.000355815568079719, "rouge2_precision_stderr": 0.00020880203605271082, "rouge2_recall": 0.0003037449971412236, "rouge2_recall_stderr": 0.00018804668706895537, "rougeL_fmeasure": 0.0016104525385042783, "rougeL_fmeasure_stderr": 0.00044591521297227443, "rougeL_precision": 0.0019160135596418518, "rougeL_precision_stderr": 0.0005319338017808152, "rougeL_recall": 0.0014383859186266749, "rougeL_recall_stderr": 0.0004043038910360008, "rougeLsum_fmeasure": 0.001800332071457902, "rougeLsum_fmeasure_stderr": 0.0004998974285323676, "rougeLsum_precision": 0.0021304217929180094, "rougeLsum_precision_stderr": 0.0005877695890086486, "rougeLsum_recall": 0.0016128812346314246, "rougeLsum_recall_stderr": 0.0004569974976967699}, "article_DOC_summary": {"bleu": 5.9664964945316196e-36, "bleu_stderr": 6.37663460549604e-30, "rouge1_fmeasure": 0.002904097352119159, "rouge1_fmeasure_stderr": 0.0008025657506213006, "rouge1_precision": 0.003214034101498322, "rouge1_precision_stderr": 0.0009221332972255172, "rouge1_recall": 0.002755372289086832, "rouge1_recall_stderr": 0.0007486991780011711, "rouge2_fmeasure": 0.0003555930988203656, "rouge2_fmeasure_stderr": 0.00020825306662602857, "rouge2_precision": 0.0004553870087049595, "rouge2_precision_stderr": 0.0002660277152615564, "rouge2_recall": 0.0002936371804296332, "rouge2_recall_stderr": 0.00017278177414040624, "rougeL_fmeasure": 0.0021617867610418057, "rougeL_fmeasure_stderr": 0.000600765218788033, "rougeL_precision": 0.0023980769070289483, "rougeL_precision_stderr": 0.0006896763285113808, "rougeL_recall": 0.0020343184807463843, "rougeL_recall_stderr": 0.0005525851742486296, "rougeLsum_fmeasure": 0.0024822527344372726, "rougeLsum_fmeasure_stderr": 0.0006926475370230472, "rougeLsum_precision": 0.0027490625210994384, "rougeLsum_precision_stderr": 0.0008051771911473355, "rougeLsum_recall": 0.0023634542664537216, "rougeLsum_recall_stderr": 0.0006464968490936459}, "summarize_DOC": {"bleu": 6.730767350313837e-38, "bleu_stderr": 5.433189914171102e-32, "rouge1_fmeasure": 0.0024619745886219103, "rouge1_fmeasure_stderr": 0.0007068894891377003, "rouge1_precision": 0.002820827522912925, "rouge1_precision_stderr": 0.0008241014146566761, "rouge1_recall": 0.0023517396771233516, "rouge1_recall_stderr": 0.0006955198818749466, "rouge2_fmeasure": 0.0005008107704990395, "rouge2_fmeasure_stderr": 0.00029259545868901905, "rouge2_precision": 0.0006013749644563891, "rouge2_precision_stderr": 0.00034827715329002203, "rouge2_recall": 0.0004582288780401988, "rouge2_recall_stderr": 0.000276736360710651, "rougeL_fmeasure": 0.0021415097697000925, "rougeL_fmeasure_stderr": 0.0006302603088867335, "rougeL_precision": 0.00242534330718512, "rougeL_precision_stderr": 0.0007134131592000302, "rougeL_recall": 0.002077455090168221, "rougeL_recall_stderr": 0.0006420787798082092, "rougeLsum_fmeasure": 0.002363959396267095, "rougeLsum_fmeasure_stderr": 0.0006794393822266892, "rougeLsum_precision": 0.0026888839947429815, "rougeLsum_precision_stderr": 0.0007810704185094188, "rougeLsum_recall": 0.0022737730468411124, "rougeLsum_recall_stderr": 0.0006780275889829637}, "summarize_this_DOC_summary": {"bleu": 5.331681904215219e-233, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0003382885458357156, "rouge1_fmeasure_stderr": 0.0002086842561151499, "rouge1_precision": 0.0011435105774728416, "rouge1_precision_stderr": 0.0006698929504092044, "rouge1_recall": 0.00020164583843829126, "rouge1_recall_stderr": 0.00012654106887528574, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0003382885458357156, "rougeL_fmeasure_stderr": 0.0002086842561151499, "rougeL_precision": 0.0011435105774728416, "rougeL_precision_stderr": 0.0006698929504092044, "rougeL_recall": 0.00020164583843829126, "rougeL_recall_stderr": 0.00012654106887528574, "rougeLsum_fmeasure": 0.0003382885458357156, "rougeLsum_fmeasure_stderr": 0.0002086842561151499, "rougeLsum_precision": 0.0011435105774728416, "rougeLsum_precision_stderr": 0.0006698929504092044, "rougeLsum_recall": 0.00020164583843829126, "rougeLsum_recall_stderr": 0.00012654106887528574}}}, "piqa": {"0": {"Correct the solution": {"bleu": 8.221264640592608, "bleu_stderr": 0.3149873650760696, "rouge1_fmeasure": 0.21211270156193437, "rouge1_fmeasure_stderr": 0.005814456021753165, "rouge1_precision": 0.16666080011920595, "rouge1_precision_stderr": 0.005180719422543184, "rouge1_recall": 0.501863559860732, "rouge1_recall_stderr": 0.009052996765184864, "rouge2_fmeasure": 0.15599026594193496, "rouge2_fmeasure_stderr": 0.005128636671935841, "rouge2_precision": 0.12083386201007094, "rouge2_precision_stderr": 0.004471674153000206, "rouge2_recall": 0.3806965470945097, "rouge2_recall_stderr": 0.008597586629652064, "rougeL_fmeasure": 0.20516494642415065, "rougeL_fmeasure_stderr": 0.0057181465206832, "rougeL_precision": 0.16072996710527454, "rougeL_precision_stderr": 0.005070936585051591, "rougeL_recall": 0.48913954648097346, "rougeL_recall_stderr": 0.009013481487297286, "rougeLsum_fmeasure": 0.20678919293239362, "rougeLsum_fmeasure_stderr": 0.005766515768579625, "rougeLsum_precision": 0.16227516864724326, "rougeLsum_precision_stderr": 0.005116365519276017, "rougeLsum_recall": 0.4895892873231678, "rougeLsum_recall_stderr": 0.009023257251500322}, "choose the most appropriate solution": {"acc": 0.49455930359085964, "acc_norm": 0.49455930359085964, "acc_norm_stderr": 0.011665133500637059, "acc_stderr": 0.011665133500637059}, "no prompt needed": {"bleu": 0.1653247598984174, "bleu_stderr": 0.013821408415080715, "rouge1_fmeasure": 0.036532603399439284, "rouge1_fmeasure_stderr": 0.0008337337093122241, "rouge1_precision": 0.020998722136358804, "rouge1_precision_stderr": 0.0005269403955655725, "rouge1_recall": 0.23228388738077216, "rouge1_recall_stderr": 0.0041360813271565395, "rouge2_fmeasure": 0.005465981531797976, "rouge2_fmeasure_stderr": 0.000254177372520646, "rouge2_precision": 0.0030927874561869893, "rouge2_precision_stderr": 0.00014944861884636616, "rouge2_recall": 0.0386031419214451, "rouge2_recall_stderr": 0.0018914429245019343, "rougeL_fmeasure": 0.033434992892598596, "rougeL_fmeasure_stderr": 0.000727186829587635, "rougeL_precision": 0.01917199471590868, "rougeL_precision_stderr": 0.00045663889214525273, "rougeL_recall": 0.2161115069458247, "rougeL_recall_stderr": 0.003797851830611, "rougeLsum_fmeasure": 0.029988024300834148, "rougeLsum_fmeasure_stderr": 0.0006784161057869979, "rougeLsum_precision": 0.01720510228861565, "rougeLsum_precision_stderr": 0.00042829503630886036, "rougeLsum_recall": 0.1977154227864982, "rougeLsum_recall_stderr": 0.0036870557526130065}, "pick_correct_choice_index": {"acc": 0.49510337323177367, "acc_norm": 0.49510337323177367, "acc_norm_stderr": 0.01166526473007815, "acc_stderr": 0.01166526473007815}, "what_is_the_correct_ending": {"acc": 0.559847660500544, "acc_norm": 0.5522306855277476, "acc_norm_stderr": 0.01160199979686681, "acc_stderr": 0.011581954727227395}}, "1": {"Correct the solution": {"bleu": 7.3767510595275665, "bleu_stderr": 0.31999310082457355, "rouge1_fmeasure": 0.26918713196522137, "rouge1_fmeasure_stderr": 0.005971404751580413, "rouge1_precision": 0.2215000417152911, "rouge1_precision_stderr": 0.006137200329200616, "rouge1_recall": 0.6794277393295427, "rouge1_recall_stderr": 0.0066774062595464905, "rouge2_fmeasure": 0.20124233569826713, "rouge2_fmeasure_stderr": 0.005612862896818122, "rouge2_precision": 0.1640201975855851, "rouge2_precision_stderr": 0.005503885160452421, "rouge2_recall": 0.5160107151183727, "rouge2_recall_stderr": 0.008200440904778427, "rougeL_fmeasure": 0.2608873390569801, "rougeL_fmeasure_stderr": 0.0059334583892896664, "rougeL_precision": 0.2140530349109963, "rougeL_precision_stderr": 0.006041148360807656, "rougeL_recall": 0.6614267819116432, "rougeL_recall_stderr": 0.006904677170599779, "rougeLsum_fmeasure": 0.26275507037270174, "rougeLsum_fmeasure_stderr": 0.005950371049088624, "rougeLsum_precision": 0.21594311252863366, "rougeLsum_precision_stderr": 0.006068594142965329, "rougeLsum_recall": 0.6630530525028233, "rougeLsum_recall_stderr": 0.006882384898937598}, "choose the most appropriate solution": {"acc": 0.5021762785636561, "acc_norm": 0.5021762785636561, "acc_norm_stderr": 0.011665713661738877, "acc_stderr": 0.011665713661738877}, "no prompt needed": {"bleu": 0.17208205991093362, "bleu_stderr": 0.010301725465366625, "rouge1_fmeasure": 0.03341660597066338, "rouge1_fmeasure_stderr": 0.0008614668722471364, "rouge1_precision": 0.0197808533467512, "rouge1_precision_stderr": 0.0006637399084174438, "rouge1_recall": 0.2083765310213374, "rouge1_recall_stderr": 0.004054556695081749, "rouge2_fmeasure": 0.005423476681292554, "rouge2_fmeasure_stderr": 0.00027465476938201355, "rouge2_precision": 0.0030943760026781223, "rouge2_precision_stderr": 0.0001632745006225758, "rouge2_recall": 0.036249687836481304, "rouge2_recall_stderr": 0.0018967653687895932, "rougeL_fmeasure": 0.0310540742053096, "rougeL_fmeasure_stderr": 0.0007532823292737799, "rougeL_precision": 0.018317454340834892, "rougeL_precision_stderr": 0.0005782023821136248, "rougeL_recall": 0.19659411245081346, "rougeL_recall_stderr": 0.003796357362538508, "rougeLsum_fmeasure": 0.027295011757363707, "rougeLsum_fmeasure_stderr": 0.0007014745436916857, "rougeLsum_precision": 0.016131297257791454, "rougeLsum_precision_stderr": 0.0005516670663513384, "rougeLsum_recall": 0.1776065844884305, "rougeLsum_recall_stderr": 0.0036553543805523184}, "pick_correct_choice_index": {"acc": 0.49782372143634385, "acc_norm": 0.49782372143634385, "acc_norm_stderr": 0.011665713661738873, "acc_stderr": 0.011665713661738873}, "what_is_the_correct_ending": {"acc": 0.5418933623503809, "acc_norm": 0.5424374319912949, "acc_norm_stderr": 0.011623729421518132, "acc_stderr": 0.011624803747232126}}, "2": {"Correct the solution": {"bleu": 10.989723799423952, "bleu_stderr": 0.477322210631878, "rouge1_fmeasure": 0.4219856738666251, "rouge1_fmeasure_stderr": 0.007684410627868399, "rouge1_precision": 0.40990335902092456, "rouge1_precision_stderr": 0.008259416445477983, "rouge1_recall": 0.6715421937524536, "rouge1_recall_stderr": 0.006833449843147891, "rouge2_fmeasure": 0.3238830207743833, "rouge2_fmeasure_stderr": 0.00755763096612864, "rouge2_precision": 0.31002907544350694, "rouge2_precision_stderr": 0.007809148641476673, "rouge2_recall": 0.5179977182718715, "rouge2_recall_stderr": 0.008225556639305274, "rougeL_fmeasure": 0.41063474482635426, "rougeL_fmeasure_stderr": 0.007718338436949308, "rougeL_precision": 0.39649832180640104, "rougeL_precision_stderr": 0.008182538521597163, "rougeL_recall": 0.6546910349804671, "rougeL_recall_stderr": 0.007082687223632161, "rougeLsum_fmeasure": 0.41268141482914944, "rougeLsum_fmeasure_stderr": 0.007720124425223064, "rougeLsum_precision": 0.39933775034518904, "rougeLsum_precision_stderr": 0.008215822611471774, "rougeLsum_recall": 0.6566914458481494, "rougeLsum_recall_stderr": 0.007055405166544278}, "choose the most appropriate solution": {"acc": 0.514689880304679, "acc_norm": 0.514689880304679, "acc_norm_stderr": 0.011660788281735496, "acc_stderr": 0.011660788281735496}, "no prompt needed": {"bleu": 0.13535549963348786, "bleu_stderr": 0.011883201814221686, "rouge1_fmeasure": 0.03167331762534985, "rouge1_fmeasure_stderr": 0.0008149162454634007, "rouge1_precision": 0.019974450931119343, "rouge1_precision_stderr": 0.0009096575382126053, "rouge1_recall": 0.20327780750289512, "rouge1_recall_stderr": 0.004138109405329968, "rouge2_fmeasure": 0.004712505847751591, "rouge2_fmeasure_stderr": 0.0002441580237771217, "rouge2_precision": 0.002913124216946272, "rouge2_precision_stderr": 0.00025124507871194965, "rouge2_recall": 0.03375022580735237, "rouge2_recall_stderr": 0.0018971946690473117, "rougeL_fmeasure": 0.029502879197560596, "rougeL_fmeasure_stderr": 0.0007158939543230575, "rougeL_precision": 0.018532639910942308, "rougeL_precision_stderr": 0.0008013412795519539, "rougeL_recall": 0.19189422414566185, "rougeL_recall_stderr": 0.003826189608547334, "rougeLsum_fmeasure": 0.02594274780299628, "rougeLsum_fmeasure_stderr": 0.0006803333166678133, "rougeLsum_precision": 0.016551043276073225, "rougeLsum_precision_stderr": 0.0008244428247702725, "rougeLsum_recall": 0.17348668108248785, "rougeLsum_recall_stderr": 0.0037039820894878926}, "pick_correct_choice_index": {"acc": 0.49347116430903154, "acc_norm": 0.49347116430903154, "acc_norm_stderr": 0.011664829595210969, "acc_stderr": 0.011664829595210969}, "what_is_the_correct_ending": {"acc": 0.5321001088139282, "acc_norm": 0.5342763873775843, "acc_norm_stderr": 0.011638380213532437, "acc_stderr": 0.011641758014820126}}, "3": {"Correct the solution": {"bleu": 15.589010417058658, "bleu_stderr": 0.8454556706430587, "rouge1_fmeasure": 0.5126931501254426, "rouge1_fmeasure_stderr": 0.007956454706339308, "rouge1_precision": 0.5135186861705211, "rouge1_precision_stderr": 0.008431281414525968, "rouge1_recall": 0.6766130266106622, "rouge1_recall_stderr": 0.006905636373535094, "rouge2_fmeasure": 0.4057090348109076, "rouge2_fmeasure_stderr": 0.008083151105761782, "rouge2_precision": 0.4022833285618224, "rouge2_precision_stderr": 0.008346085939704194, "rouge2_recall": 0.5340662726903487, "rouge2_recall_stderr": 0.008140102426002193, "rougeL_fmeasure": 0.5011072943138228, "rougeL_fmeasure_stderr": 0.008026772259796028, "rougeL_precision": 0.49968417841017443, "rougeL_precision_stderr": 0.00841093839820918, "rougeL_recall": 0.6615311276531707, "rougeL_recall_stderr": 0.007119822079266015, "rougeLsum_fmeasure": 0.5031731006894711, "rougeLsum_fmeasure_stderr": 0.008007126558781245, "rougeLsum_precision": 0.5022494859348401, "rougeLsum_precision_stderr": 0.008415389296058761, "rougeLsum_recall": 0.6647270305912438, "rougeLsum_recall_stderr": 0.007078089811566776}, "choose the most appropriate solution": {"acc": 0.5130576713819369, "acc_norm": 0.5130576713819369, "acc_norm_stderr": 0.011661845375886342, "acc_stderr": 0.011661845375886342}, "no prompt needed": {"bleu": 0.13494230997950132, "bleu_stderr": 0.009681091491781743, "rouge1_fmeasure": 0.03186472547854922, "rouge1_fmeasure_stderr": 0.0008686349358294461, "rouge1_precision": 0.01911132845089878, "rouge1_precision_stderr": 0.0006891049637900843, "rouge1_recall": 0.1986412875937398, "rouge1_recall_stderr": 0.003917458930279225, "rouge2_fmeasure": 0.00470586424330017, "rouge2_fmeasure_stderr": 0.00027245099685908755, "rouge2_precision": 0.0028206675747611733, "rouge2_precision_stderr": 0.00019060442122968908, "rouge2_recall": 0.03252120345407891, "rouge2_recall_stderr": 0.0019655949748724573, "rougeL_fmeasure": 0.029595321454027084, "rougeL_fmeasure_stderr": 0.0007712454590830175, "rougeL_precision": 0.017666871030393803, "rougeL_precision_stderr": 0.0005944257003565911, "rougeL_recall": 0.18735984838330555, "rougeL_recall_stderr": 0.0036829206769154214, "rougeLsum_fmeasure": 0.0260842453327782, "rougeLsum_fmeasure_stderr": 0.0007064564831714856, "rougeLsum_precision": 0.0156124167337213, "rougeLsum_precision_stderr": 0.0005611219330915058, "rougeLsum_recall": 0.17009443127881965, "rougeLsum_recall_stderr": 0.0035528405308160285}, "pick_correct_choice_index": {"acc": 0.4880304678998912, "acc_norm": 0.4880304678998912, "acc_norm_stderr": 0.011662480968070049, "acc_stderr": 0.011662480968070049}, "what_is_the_correct_ending": {"acc": 0.5310119695321001, "acc_norm": 0.5348204570184983, "acc_norm_stderr": 0.011637500993815848, "acc_stderr": 0.011643363511107457}}, "4": {"Correct the solution": {"bleu": 20.004791024670315, "bleu_stderr": 1.073121182179684, "rouge1_fmeasure": 0.5564854580202747, "rouge1_fmeasure_stderr": 0.007723304287288785, "rouge1_precision": 0.5635809903885909, "rouge1_precision_stderr": 0.008037675550817377, "rouge1_recall": 0.675003625813131, "rouge1_recall_stderr": 0.006934430967374778, "rouge2_fmeasure": 0.44072198270594876, "rouge2_fmeasure_stderr": 0.008143626907731532, "rouge2_precision": 0.44094340540720767, "rouge2_precision_stderr": 0.008336049944543478, "rouge2_recall": 0.5353191759273204, "rouge2_recall_stderr": 0.00816164151577736, "rougeL_fmeasure": 0.5439527567369827, "rougeL_fmeasure_stderr": 0.007832639006320164, "rougeL_precision": 0.5481052255869446, "rougeL_precision_stderr": 0.0080708932692548, "rougeL_recall": 0.660534735042919, "rougeL_recall_stderr": 0.007153754571776756, "rougeLsum_fmeasure": 0.5461795795622169, "rougeLsum_fmeasure_stderr": 0.0078049936381923745, "rougeLsum_precision": 0.5513165852733105, "rougeLsum_precision_stderr": 0.008062156731256905, "rougeLsum_recall": 0.6634891599008953, "rougeLsum_recall_stderr": 0.007102976763556776}, "choose the most appropriate solution": {"acc": 0.5076169749727966, "acc_norm": 0.5076169749727966, "acc_norm_stderr": 0.011664470424044972, "acc_stderr": 0.011664470424044972}, "no prompt needed": {"bleu": 0.11573444053320703, "bleu_stderr": 0.008126260546620632, "rouge1_fmeasure": 0.030447754959100907, "rouge1_fmeasure_stderr": 0.0007971323310690517, "rouge1_precision": 0.018273055931518078, "rouge1_precision_stderr": 0.0006213051033422689, "rouge1_recall": 0.1929448451365874, "rouge1_recall_stderr": 0.004110007510165885, "rouge2_fmeasure": 0.004310884060921524, "rouge2_fmeasure_stderr": 0.00023951170742409153, "rouge2_precision": 0.002439514135649695, "rouge2_precision_stderr": 0.00013971466675209783, "rouge2_recall": 0.033060693641110854, "rouge2_recall_stderr": 0.0021387293538605323, "rougeL_fmeasure": 0.02807329381234764, "rougeL_fmeasure_stderr": 0.0007021802242750015, "rougeL_precision": 0.016854212838326506, "rougeL_precision_stderr": 0.0005620151575706898, "rougeL_recall": 0.17976699456632925, "rougeL_recall_stderr": 0.0038004506713869165, "rougeLsum_fmeasure": 0.02508537471034259, "rougeLsum_fmeasure_stderr": 0.0006359302429150094, "rougeLsum_precision": 0.015034023539825695, "rougeLsum_precision_stderr": 0.0005141904845083435, "rougeLsum_recall": 0.16691215660017894, "rougeLsum_recall_stderr": 0.003747360877378185}, "pick_correct_choice_index": {"acc": 0.5195865070729053, "acc_norm": 0.5195865070729053, "acc_norm_stderr": 0.011656869979288456, "acc_stderr": 0.011656869979288456}, "what_is_the_correct_ending": {"acc": 0.5413492927094669, "acc_norm": 0.5424374319912949, "acc_norm_stderr": 0.011623729421518134, "acc_stderr": 0.01162586411331582}}, "5": {"Correct the solution": {"bleu": 20.541530615378996, "bleu_stderr": 1.0236650658463775, "rouge1_fmeasure": 0.5698885703517087, "rouge1_fmeasure_stderr": 0.007714826040679644, "rouge1_precision": 0.5751633897784844, "rouge1_precision_stderr": 0.00803435176216989, "rouge1_recall": 0.6898127295322619, "rouge1_recall_stderr": 0.006762762659826006, "rouge2_fmeasure": 0.4560672630321141, "rouge2_fmeasure_stderr": 0.008171926975401585, "rouge2_precision": 0.4546948153669257, "rouge2_precision_stderr": 0.008366514761157184, "rouge2_recall": 0.551114824772799, "rouge2_recall_stderr": 0.00808762271709775, "rougeL_fmeasure": 0.5584117042533167, "rougeL_fmeasure_stderr": 0.007831607552652558, "rougeL_precision": 0.5612455638728635, "rougeL_precision_stderr": 0.008091161331947056, "rougeL_recall": 0.6761124712337756, "rougeL_recall_stderr": 0.006983397662954877, "rougeLsum_fmeasure": 0.5603248083479772, "rougeLsum_fmeasure_stderr": 0.0078052598358510986, "rougeLsum_precision": 0.563673777021132, "rougeLsum_precision_stderr": 0.008077158591177138, "rougeLsum_recall": 0.6788119556165896, "rougeLsum_recall_stderr": 0.006943167870722146}, "choose the most appropriate solution": {"acc": 0.5087051142546246, "acc_norm": 0.5087051142546246, "acc_norm_stderr": 0.011664055982032838, "acc_stderr": 0.011664055982032838}, "no prompt needed": {"bleu": 0.12248518122547715, "bleu_stderr": 0.007751047228685078, "rouge1_fmeasure": 0.03135317378479231, "rouge1_fmeasure_stderr": 0.0008162298173479573, "rouge1_precision": 0.018924493431835766, "rouge1_precision_stderr": 0.0006692135261600222, "rouge1_recall": 0.19719638092839015, "rouge1_recall_stderr": 0.004120526129777433, "rouge2_fmeasure": 0.0046971364054093695, "rouge2_fmeasure_stderr": 0.00025307514350424073, "rouge2_precision": 0.002778538269095852, "rouge2_precision_stderr": 0.00018494204805708875, "rouge2_recall": 0.035803100604613836, "rouge2_recall_stderr": 0.0021320541894636206, "rougeL_fmeasure": 0.028760590162420697, "rougeL_fmeasure_stderr": 0.0007251430881917033, "rougeL_precision": 0.01729661551542472, "rougeL_precision_stderr": 0.000579508348294998, "rougeL_recall": 0.18308780779648284, "rougeL_recall_stderr": 0.003805938347899692, "rougeLsum_fmeasure": 0.0260467389410542, "rougeLsum_fmeasure_stderr": 0.0006803550833664029, "rougeLsum_precision": 0.015684253519749444, "rougeLsum_precision_stderr": 0.0005565161139387542, "rougeLsum_recall": 0.1712948423769965, "rougeLsum_recall_stderr": 0.003765879887375436}, "pick_correct_choice_index": {"acc": 0.5076169749727966, "acc_norm": 0.5076169749727966, "acc_norm_stderr": 0.011664470424044981, "acc_stderr": 0.011664470424044981}, "what_is_the_correct_ending": {"acc": 0.5386289445048966, "acc_norm": 0.5478781284004353, "acc_norm_stderr": 0.011612217507379627, "acc_stderr": 0.011630956681145912}}}, "sciq": {"0": {"Direct Question": {"acc": 0.876, "acc_norm": 0.804, "acc_norm_stderr": 0.012559527926707352, "acc_stderr": 0.01042749887234397}, "Direct Question (Closed Book)": {"acc": 0.623, "acc_norm": 0.548, "acc_norm_stderr": 0.015746235865880677, "acc_stderr": 0.01533317012577985}, "Multiple Choice": {"acc": 0.6, "acc_norm": 0.519, "acc_norm_stderr": 0.015807874268505853, "acc_stderr": 0.01549968516584259}, "Multiple Choice (Closed Book)": {"acc": 0.486, "acc_norm": 0.419, "acc_norm_stderr": 0.015610338967577794, "acc_stderr": 0.015813097547730984}, "Multiple Choice Question First": {"acc": 0.627, "acc_norm": 0.534, "acc_norm_stderr": 0.015782683329937625, "acc_stderr": 0.01530049362292281}}, "1": {"Direct Question": {"acc": 0.913, "acc_norm": 0.876, "acc_norm_stderr": 0.010427498872343972, "acc_stderr": 0.008916866630745906}, "Direct Question (Closed Book)": {"acc": 0.698, "acc_norm": 0.686, "acc_norm_stderr": 0.01468399195108797, "acc_stderr": 0.014526080235459544}, "Multiple Choice": {"acc": 0.585, "acc_norm": 0.544, "acc_norm_stderr": 0.015757928553979172, "acc_stderr": 0.015589035185604632}, "Multiple Choice (Closed Book)": {"acc": 0.517, "acc_norm": 0.477, "acc_norm_stderr": 0.015802554246726098, "acc_stderr": 0.01581015372983343}, "Multiple Choice Question First": {"acc": 0.51, "acc_norm": 0.472, "acc_norm_stderr": 0.015794475789511476, "acc_stderr": 0.015816135752773203}}, "2": {"Direct Question": {"acc": 0.914, "acc_norm": 0.893, "acc_norm_stderr": 0.009779910359847165, "acc_stderr": 0.008870325962594766}, "Direct Question (Closed Book)": {"acc": 0.715, "acc_norm": 0.698, "acc_norm_stderr": 0.014526080235459546, "acc_stderr": 0.014282120955200482}, "Multiple Choice": {"acc": 0.608, "acc_norm": 0.59, "acc_norm_stderr": 0.015560917136921672, "acc_stderr": 0.015445859463771297}, "Multiple Choice (Closed Book)": {"acc": 0.51, "acc_norm": 0.48, "acc_norm_stderr": 0.015806639423035167, "acc_stderr": 0.015816135752773207}, "Multiple Choice Question First": {"acc": 0.583, "acc_norm": 0.537, "acc_norm_stderr": 0.015775927227262423, "acc_stderr": 0.015599819048769618}}, "3": {"Direct Question": {"acc": 0.92, "acc_norm": 0.914, "acc_norm_stderr": 0.008870325962594766, "acc_stderr": 0.00858333697775365}, "Direct Question (Closed Book)": {"acc": 0.71, "acc_norm": 0.709, "acc_norm_stderr": 0.01437099598237794, "acc_stderr": 0.014356395999905689}, "Multiple Choice": {"acc": 0.637, "acc_norm": 0.599, "acc_norm_stderr": 0.015506109745498318, "acc_stderr": 0.015213890444671276}, "Multiple Choice (Closed Book)": {"acc": 0.529, "acc_norm": 0.486, "acc_norm_stderr": 0.015813097547730984, "acc_stderr": 0.0157926694516289}, "Multiple Choice Question First": {"acc": 0.595, "acc_norm": 0.555, "acc_norm_stderr": 0.015723301886760944, "acc_stderr": 0.015531136990453047}}, "4": {"Direct Question": {"acc": 0.922, "acc_norm": 0.914, "acc_norm_stderr": 0.008870325962594766, "acc_stderr": 0.008484573530118588}, "Direct Question (Closed Book)": {"acc": 0.717, "acc_norm": 0.724, "acc_norm_stderr": 0.01414298497574067, "acc_stderr": 0.014251810906481754}, "Multiple Choice": {"acc": 0.62, "acc_norm": 0.624, "acc_norm_stderr": 0.015325105508898134, "acc_stderr": 0.015356947477797575}, "Multiple Choice (Closed Book)": {"acc": 0.545, "acc_norm": 0.492, "acc_norm_stderr": 0.015817274929209008, "acc_stderr": 0.01575510149834709}, "Multiple Choice Question First": {"acc": 0.599, "acc_norm": 0.58, "acc_norm_stderr": 0.015615500115072957, "acc_stderr": 0.015506109745498318}}, "5": {"Direct Question": {"acc": 0.924, "acc_norm": 0.919, "acc_norm_stderr": 0.008632121032139967, "acc_stderr": 0.008384169266796386}, "Direct Question (Closed Book)": {"acc": 0.727, "acc_norm": 0.726, "acc_norm_stderr": 0.014111099288259587, "acc_stderr": 0.014095022868717595}, "Multiple Choice": {"acc": 0.625, "acc_norm": 0.609, "acc_norm_stderr": 0.01543882629468179, "acc_stderr": 0.015316971293620996}, "Multiple Choice (Closed Book)": {"acc": 0.547, "acc_norm": 0.506, "acc_norm_stderr": 0.015818160898606715, "acc_stderr": 0.015749255189977582}, "Multiple Choice Question First": {"acc": 0.585, "acc_norm": 0.563, "acc_norm_stderr": 0.015693223928730377, "acc_stderr": 0.015589035185604623}}}, "story_cloze_2016": {"0": {"Answer Given options": {"acc": 0.4778193479422769, "acc_norm": 0.4975948690539818, "acc_norm_stderr": 0.011562298481438053, "acc_stderr": 0.011551049647290307}, "Choose Story Ending": {"acc": 0.4890432923570283, "acc_norm": 0.5264564404061999, "acc_norm_stderr": 0.011546234813777409, "acc_stderr": 0.011559655791130727}, "Novel Correct Ending": {"acc": 0.4751469802244789, "acc_norm": 0.521111704970604, "acc_norm_stderr": 0.011552120807053812, "acc_stderr": 0.01154813982307477}, "Story Continuation and Options": {"acc": 0.5114911811865313, "acc_norm": 0.5259219668626403, "acc_norm_stderr": 0.011546883081384903, "acc_stderr": 0.011559378273599123}}, "1": {"Answer Given options": {"acc": 0.4585783003741315, "acc_norm": 0.4767504008551577, "acc_norm_stderr": 0.011549925483927456, "acc_stderr": 0.011522687288692527}, "Choose Story Ending": {"acc": 0.46980224478888294, "acc_norm": 0.4965259219668626, "acc_norm_stderr": 0.01156215314916829, "acc_stderr": 0.011541325320336616}, "Novel Correct Ending": {"acc": 0.47995724211651525, "acc_norm": 0.4938535542490647, "acc_norm_stderr": 0.01156155858904076, "acc_stderr": 0.011553138977961007}, "Story Continuation and Options": {"acc": 0.48957776590058794, "acc_norm": 0.5136290753607696, "acc_norm_stderr": 0.011558135970599896, "acc_stderr": 0.011559920087347776}}, "2": {"Answer Given options": {"acc": 0.46980224478888294, "acc_norm": 0.4740780331373597, "acc_norm_stderr": 0.011546883081384896, "acc_stderr": 0.011541325320336618}, "Choose Story Ending": {"acc": 0.4660609299839658, "acc_norm": 0.4820951362907536, "acc_norm_stderr": 0.011555016408505476, "acc_stderr": 0.011535764881641411}, "Novel Correct Ending": {"acc": 0.4730090860502405, "acc_norm": 0.48583645109567075, "acc_norm_stderr": 0.011557792331301667, "acc_stderr": 0.011545573278697235}, "Story Continuation and Options": {"acc": 0.4949225013361839, "acc_norm": 0.5114911811865313, "acc_norm_stderr": 0.011559378273599126, "acc_stderr": 0.011561836054238777}}, "3": {"Answer Given options": {"acc": 0.46125066809192944, "acc_norm": 0.4649919828968466, "acc_norm_stderr": 0.011534056494505864, "acc_stderr": 0.011527657726586461}, "Choose Story Ending": {"acc": 0.47247461250668094, "acc_norm": 0.4836985569214324, "acc_norm_stderr": 0.011556285484521572, "acc_stderr": 0.01154489847386459}, "Novel Correct Ending": {"acc": 0.4655264564404062, "acc_norm": 0.47995724211651525, "acc_norm_stderr": 0.011553138977961008, "acc_stderr": 0.011534917341355132}, "Story Continuation and Options": {"acc": 0.4938535542490647, "acc_norm": 0.4997327632282202, "acc_norm_stderr": 0.011562430600098489, "acc_stderr": 0.011561558589040757}}, "4": {"Answer Given options": {"acc": 0.4436130411544629, "acc_norm": 0.4585783003741315, "acc_norm_stderr": 0.011522687288692525, "acc_stderr": 0.01148867172507346}, "Choose Story Ending": {"acc": 0.46392303580972744, "acc_norm": 0.4756814537680385, "acc_norm_stderr": 0.011548748301487319, "acc_stderr": 0.01153229486915312}, "Novel Correct Ending": {"acc": 0.46125066809192944, "acc_norm": 0.46766435061464456, "acc_norm_stderr": 0.011538227692217273, "acc_stderr": 0.011527657726586461}, "Story Continuation and Options": {"acc": 0.5077498663816141, "acc_norm": 0.5179048637092464, "acc_norm_stderr": 0.011555016408505476, "acc_stderr": 0.01156104327886355}}, "5": {"Answer Given options": {"acc": 0.4462854088722608, "acc_norm": 0.4569748797434527, "acc_norm_stderr": 0.011519544865928056, "acc_stderr": 0.011495517440721683}, "Choose Story Ending": {"acc": 0.4708711918760021, "acc_norm": 0.4735435595938001, "acc_norm_stderr": 0.011546234813777393, "acc_stderr": 0.011542794417345717}, "Novel Correct Ending": {"acc": 0.4462854088722608, "acc_norm": 0.46178514163548906, "acc_norm_stderr": 0.011528611805439893, "acc_stderr": 0.011495517440721682}, "Story Continuation and Options": {"acc": 0.4938535542490647, "acc_norm": 0.5152324959914484, "acc_norm_stderr": 0.011557065368348293, "acc_stderr": 0.01156155858904076}}}, "superglue_rte": {"0": {"GPT-3 style": {"acc": 0.5234657039711191, "acc_norm": 0.49458483754512633, "acc_norm_stderr": 0.030094698123239966, "acc_stderr": 0.03006330041190266}, "MNLI crowdsource": {"acc": 0.48014440433212996, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.0300727231673172}, "does it follow that": {"acc": 0.48014440433212996, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.0300727231673172}, "guaranteed true": {"acc": 0.49458483754512633, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030094698123239966}, "should assume": {"acc": 0.4981949458483754, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030096267148976626}}, "1": {"GPT-3 style": {"acc": 0.516245487364621, "acc_norm": 0.5126353790613718, "acc_norm_stderr": 0.030086851767188564, "acc_stderr": 0.030080573208738064}, "MNLI crowdsource": {"acc": 0.49097472924187724, "acc_norm": 0.49458483754512633, "acc_norm_stderr": 0.030094698123239966, "acc_stderr": 0.030091559826331334}, "does it follow that": {"acc": 0.49097472924187724, "acc_norm": 0.5090252707581228, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030091559826331334}, "guaranteed true": {"acc": 0.49458483754512633, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030094698123239966}, "should assume": {"acc": 0.49097472924187724, "acc_norm": 0.49458483754512633, "acc_norm_stderr": 0.030094698123239966, "acc_stderr": 0.030091559826331334}}, "2": {"GPT-3 style": {"acc": 0.5270758122743683, "acc_norm": 0.4981949458483754, "acc_norm_stderr": 0.030096267148976626, "acc_stderr": 0.030052303463143706}, "MNLI crowdsource": {"acc": 0.5054151624548736, "acc_norm": 0.5090252707581228, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030094698123239966}, "does it follow that": {"acc": 0.5126353790613718, "acc_norm": 0.5306859205776173, "acc_norm_stderr": 0.030039730592197812, "acc_stderr": 0.030086851767188564}, "guaranteed true": {"acc": 0.5090252707581228, "acc_norm": 0.5126353790613718, "acc_norm_stderr": 0.030086851767188564, "acc_stderr": 0.030091559826331334}, "should assume": {"acc": 0.5054151624548736, "acc_norm": 0.5054151624548736, "acc_norm_stderr": 0.030094698123239966, "acc_stderr": 0.030094698123239966}}, "3": {"GPT-3 style": {"acc": 0.5342960288808665, "acc_norm": 0.5379061371841155, "acc_norm_stderr": 0.030009848912529113, "acc_stderr": 0.030025579819366422}, "MNLI crowdsource": {"acc": 0.516245487364621, "acc_norm": 0.5306859205776173, "acc_norm_stderr": 0.030039730592197812, "acc_stderr": 0.030080573208738064}, "does it follow that": {"acc": 0.51985559566787, "acc_norm": 0.51985559566787, "acc_norm_stderr": 0.030072723167317184, "acc_stderr": 0.030072723167317177}, "guaranteed true": {"acc": 0.5054151624548736, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030094698123239966}, "should assume": {"acc": 0.5306859205776173, "acc_norm": 0.5234657039711191, "acc_norm_stderr": 0.03006330041190266, "acc_stderr": 0.03003973059219781}}, "4": {"GPT-3 style": {"acc": 0.555956678700361, "acc_norm": 0.5776173285198556, "acc_norm_stderr": 0.029731622646495887, "acc_stderr": 0.029907396333795997}, "MNLI crowdsource": {"acc": 0.5342960288808665, "acc_norm": 0.5342960288808665, "acc_norm_stderr": 0.030025579819366426, "acc_stderr": 0.030025579819366426}, "does it follow that": {"acc": 0.51985559566787, "acc_norm": 0.5342960288808665, "acc_norm_stderr": 0.030025579819366426, "acc_stderr": 0.030072723167317184}, "guaranteed true": {"acc": 0.5379061371841155, "acc_norm": 0.555956678700361, "acc_norm_stderr": 0.029907396333795997, "acc_stderr": 0.030009848912529113}, "should assume": {"acc": 0.5523465703971119, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.02993107036293953}}, "5": {"GPT-3 style": {"acc": 0.5667870036101083, "acc_norm": 0.5595667870036101, "acc_norm_stderr": 0.029882123363118726, "acc_stderr": 0.029826764082138267}, "MNLI crowdsource": {"acc": 0.5415162454873647, "acc_norm": 0.5342960288808665, "acc_norm_stderr": 0.030025579819366426, "acc_stderr": 0.029992535385373314}, "does it follow that": {"acc": 0.5379061371841155, "acc_norm": 0.5451263537906137, "acc_norm_stderr": 0.02997363649541526, "acc_stderr": 0.03000984891252912}, "guaranteed true": {"acc": 0.5270758122743683, "acc_norm": 0.555956678700361, "acc_norm_stderr": 0.029907396333795997, "acc_stderr": 0.030052303463143706}, "should assume": {"acc": 0.51985559566787, "acc_norm": 0.5379061371841155, "acc_norm_stderr": 0.030009848912529117, "acc_stderr": 0.030072723167317177}}}, "winogrande": {"0": {"Replace": {"acc": 0.5019731649565904, "acc_norm": 0.500394632991318, "acc_norm_stderr": 0.014052481306049516, "acc_stderr": 0.01405237625922564}, "True or False": {"acc": 0.4956590370955012, "acc_norm": 0.4956590370955012, "acc_norm_stderr": 0.014051956064076896, "acc_stderr": 0.014051956064076896}, "does underscore refer to": {"acc": 0.4996053670086819, "acc_norm": 0.49171270718232046, "acc_norm_stderr": 0.014050555322824192, "acc_stderr": 0.014052481306049512}, "stand for": {"acc": 0.510655090765588, "acc_norm": 0.4964483030781373, "acc_norm_stderr": 0.014052131146915864, "acc_stderr": 0.0140492945362904}, "underscore refer to": {"acc": 0.5138121546961326, "acc_norm": 0.4956590370955012, "acc_norm_stderr": 0.014051956064076892, "acc_stderr": 0.014047122916440419}}, "1": {"Replace": {"acc": 0.5074980268350434, "acc_norm": 0.5074980268350434, "acc_norm_stderr": 0.014050905521228584, "acc_stderr": 0.014050905521228584}, "True or False": {"acc": 0.48855564325177586, "acc_norm": 0.48697711128650356, "acc_norm_stderr": 0.014047718393997667, "acc_stderr": 0.014048804199859325}, "does underscore refer to": {"acc": 0.4956590370955012, "acc_norm": 0.48145224940805054, "acc_norm_stderr": 0.014042813708888378, "acc_stderr": 0.014051956064076906}, "stand for": {"acc": 0.5035516969218626, "acc_norm": 0.49013417521704816, "acc_norm_stderr": 0.014049749833367596, "acc_stderr": 0.014052131146915848}, "underscore refer to": {"acc": 0.4972375690607735, "acc_norm": 0.48303078137332284, "acc_norm_stderr": 0.014044390401612976, "acc_stderr": 0.014052271211616441}}, "2": {"Replace": {"acc": 0.5090765588003157, "acc_norm": 0.516179952644041, "acc_norm_stderr": 0.01404512613097859, "acc_stderr": 0.014050170094497704}, "True or False": {"acc": 0.4940805051302289, "acc_norm": 0.5074980268350434, "acc_norm_stderr": 0.014050905521228573, "acc_stderr": 0.014051500838485807}, "does underscore refer to": {"acc": 0.5011838989739542, "acc_norm": 0.5067087608524072, "acc_norm_stderr": 0.014051220692330346, "acc_stderr": 0.014052446290529007}, "stand for": {"acc": 0.4980268350434096, "acc_norm": 0.48855564325177586, "acc_norm_stderr": 0.014048804199859325, "acc_stderr": 0.01405237625922564}, "underscore refer to": {"acc": 0.5082872928176796, "acc_norm": 0.5067087608524072, "acc_norm_stderr": 0.014051220692330349, "acc_stderr": 0.014050555322824189}}, "3": {"Replace": {"acc": 0.5217048145224941, "acc_norm": 0.526440410418311, "acc_norm_stderr": 0.014032823874407229, "acc_stderr": 0.014039239216484627}, "True or False": {"acc": 0.4996053670086819, "acc_norm": 0.5074980268350434, "acc_norm_stderr": 0.014050905521228571, "acc_stderr": 0.014052481306049512}, "does underscore refer to": {"acc": 0.5153906866614049, "acc_norm": 0.5027624309392266, "acc_norm_stderr": 0.014052271211616441, "acc_stderr": 0.014045826789783666}, "stand for": {"acc": 0.5035516969218626, "acc_norm": 0.5011838989739542, "acc_norm_stderr": 0.014052446290529019, "acc_stderr": 0.014052131146915848}, "underscore refer to": {"acc": 0.505130228887135, "acc_norm": 0.5035516969218626, "acc_norm_stderr": 0.014052131146915852, "acc_stderr": 0.014051745961790513}}, "4": {"Replace": {"acc": 0.5224940805051302, "acc_norm": 0.5201262825572218, "acc_norm_stderr": 0.014041096664344329, "acc_stderr": 0.014038257824059892}, "True or False": {"acc": 0.5027624309392266, "acc_norm": 0.5082872928176796, "acc_norm_stderr": 0.014050555322824189, "acc_stderr": 0.014052271211616445}, "does underscore refer to": {"acc": 0.5098658247829518, "acc_norm": 0.5082872928176796, "acc_norm_stderr": 0.014050555322824194, "acc_stderr": 0.014049749833367592}, "stand for": {"acc": 0.5082872928176796, "acc_norm": 0.500394632991318, "acc_norm_stderr": 0.01405248130604952, "acc_stderr": 0.014050555322824192}, "underscore refer to": {"acc": 0.5043409629044988, "acc_norm": 0.5019731649565904, "acc_norm_stderr": 0.014052376259225632, "acc_stderr": 0.014051956064076892}}, "5": {"Replace": {"acc": 0.5122336227308603, "acc_norm": 0.5185477505919495, "acc_norm_stderr": 0.014042813708888378, "acc_stderr": 0.01404827882040562}, "True or False": {"acc": 0.5035516969218626, "acc_norm": 0.5114443567482242, "acc_norm_stderr": 0.014048804199859322, "acc_stderr": 0.014052131146915852}, "does underscore refer to": {"acc": 0.5074980268350434, "acc_norm": 0.4972375690607735, "acc_norm_stderr": 0.014052271211616441, "acc_stderr": 0.014050905521228577}, "stand for": {"acc": 0.48382004735595896, "acc_norm": 0.4980268350434096, "acc_norm_stderr": 0.014052376259225629, "acc_stderr": 0.014045126130978601}, "underscore refer to": {"acc": 0.5098658247829518, "acc_norm": 0.5043409629044988, "acc_norm_stderr": 0.014051956064076892, "acc_stderr": 0.014049749833367585}}}} \ No newline at end of file