Muennighoff commited on
Commit
763a8f1
1 Parent(s): 5328b51
8b7178b58b/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.04207195309354536
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.04207195309354536
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.22765626131798156
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.22765626131798156
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2594509643808628
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2594509643808628
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2655834480484239
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2655834480484239
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2697484320297564
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2697484320297564
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.26998871183013234
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.26998871183013234
14
+ e2e_nlg_cleaned,5,average,multiple,0.22241662845011706
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.056521580391250686
16
+ gem_xsum,0,median,rouge2_fmeasure,0.056521580391250686
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04964766400970655
18
+ gem_xsum,1,median,rouge2_fmeasure,0.04964766400970655
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05421648195620343
20
+ gem_xsum,2,median,rouge2_fmeasure,0.05421648195620343
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.058170472646030105
22
+ gem_xsum,3,median,rouge2_fmeasure,0.058170472646030105
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.014126745787126624
24
+ gem_xsum,4,median,rouge2_fmeasure,0.014126745787126624
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0006840490507628032
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0006840490507628032
27
+ gem_xsum,5,average,multiple,0.03889449897351337
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05055369077754123
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05055369077754123
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08433674778206289
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.08433674778206289
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.10803467148430693
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.10803467148430693
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.11272429502619766
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.11272429502619766
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.11683410537192468
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.11683410537192468
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.12701615271509778
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.12701615271509778
40
+ web_nlg_en,5,average,multiple,0.09991661052618853
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03346604327851081
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03346604327851081
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04746965955266556
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.04746965955266556
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.07776422929510499
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.07776422929510499
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.06469489115245816
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.06469489115245816
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.020767091534965167
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.020767091534965167
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0034609320209794962
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0034609320209794962
53
+ wiki_lingua_en,5,average,multiple,0.04127047447244736
8b7178b58b/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4214863735405607, "bleu_stderr": 0.029154609162010323, "rouge1_fmeasure": 0.10879688033992795, "rouge1_fmeasure_stderr": 0.0021627779616647516, "rouge1_precision": 0.08388349369737222, "rouge1_precision_stderr": 0.0028741362760589364, "rouge1_recall": 0.2839223617284565, "rouge1_recall_stderr": 0.005264418568471245, "rouge2_fmeasure": 0.05055369077754123, "rouge2_fmeasure_stderr": 0.0012930809657985872, "rouge2_precision": 0.04057930751764507, "rouge2_precision_stderr": 0.002047816625134428, "rouge2_recall": 0.13506279227711254, "rouge2_recall_stderr": 0.0032130647312794175, "rougeL_fmeasure": 0.10393401018258999, "rougeL_fmeasure_stderr": 0.002007764403266908, "rougeL_precision": 0.08041455507307416, "rougeL_precision_stderr": 0.002796739709192279, "rougeL_recall": 0.2738666536323073, "rougeL_recall_stderr": 0.005085678508176572, "rougeLsum_fmeasure": 0.10270283885145984, "rougeLsum_fmeasure_stderr": 0.002020111493088052, "rougeLsum_precision": 0.07977634567972974, "rougeLsum_precision_stderr": 0.002805961856125458, "rougeLsum_recall": 0.26749167398935675, "rougeLsum_recall_stderr": 0.004882015301399527}}, "1": {"PALM_prompt": {"bleu": 0.5958530874585899, "bleu_stderr": 0.03649262475380295, "rouge1_fmeasure": 0.16359576779434043, "rouge1_fmeasure_stderr": 0.003809967310568941, "rouge1_precision": 0.13749267610186958, "rouge1_precision_stderr": 0.004325878829249963, "rouge1_recall": 0.3241716808952326, "rouge1_recall_stderr": 0.005125119098986173, "rouge2_fmeasure": 0.08433674778206289, "rouge2_fmeasure_stderr": 0.002591174609210514, "rouge2_precision": 0.07246358161159391, "rouge2_precision_stderr": 0.0029673895627135506, "rouge2_recall": 0.16876397664351184, "rouge2_recall_stderr": 0.00367859006299185, "rougeL_fmeasure": 0.14881701677232517, "rougeL_fmeasure_stderr": 0.003277859581563782, "rougeL_precision": 0.12393180075905823, "rougeL_precision_stderr": 0.0037992843733980434, "rougeL_recall": 0.30308740481864604, "rougeL_recall_stderr": 0.004697897076217226, "rougeLsum_fmeasure": 0.15170700107447863, "rougeLsum_fmeasure_stderr": 0.0033575398797432874, "rougeLsum_precision": 0.12662324522811608, "rougeLsum_precision_stderr": 0.00387820514144903, "rougeLsum_recall": 0.3067988164213401, "rougeLsum_recall_stderr": 0.004754578958904846}}, "2": {"PALM_prompt": {"bleu": 0.7632123296691333, "bleu_stderr": 0.038667647524407975, "rouge1_fmeasure": 0.2022385311781214, "rouge1_fmeasure_stderr": 0.004271916206549367, "rouge1_precision": 0.17619531326285107, "rouge1_precision_stderr": 0.005080542288823818, "rouge1_recall": 0.3742085058586801, "rouge1_recall_stderr": 0.004920653945655996, "rouge2_fmeasure": 0.10803467148430693, "rouge2_fmeasure_stderr": 0.0030612514779265803, "rouge2_precision": 0.09658792599721289, "rouge2_precision_stderr": 0.003611262853993622, "rouge2_recall": 0.20162933402440936, "rouge2_recall_stderr": 0.0038801577073926965, "rougeL_fmeasure": 0.18252899170611903, "rougeL_fmeasure_stderr": 0.0037263532749030616, "rougeL_precision": 0.15692981536908263, "rougeL_precision_stderr": 0.004446297762122498, "rougeL_recall": 0.3484780480719596, "rougeL_recall_stderr": 0.0045553445266088635, "rougeLsum_fmeasure": 0.18711611785244323, "rougeLsum_fmeasure_stderr": 0.003848891104616288, "rougeLsum_precision": 0.16217269474751653, "rougeLsum_precision_stderr": 0.004631557820535045, "rougeLsum_recall": 0.35330840347834674, "rougeLsum_recall_stderr": 0.004610857396095154}}, "3": {"PALM_prompt": {"bleu": 0.9207133879279265, "bleu_stderr": 0.03255931768193071, "rouge1_fmeasure": 0.20867536510345913, "rouge1_fmeasure_stderr": 0.004354382473584997, "rouge1_precision": 0.18377058302813723, "rouge1_precision_stderr": 0.005253580997340217, "rouge1_recall": 0.381271206422023, "rouge1_recall_stderr": 0.004890766475780684, "rouge2_fmeasure": 0.11272429502619766, "rouge2_fmeasure_stderr": 0.0031312620749979446, "rouge2_precision": 0.10294612127680682, "rouge2_precision_stderr": 0.0037309770662019557, "rouge2_recall": 0.2060814488662053, "rouge2_recall_stderr": 0.0038997213229875847, "rougeL_fmeasure": 0.18616032806880206, "rougeL_fmeasure_stderr": 0.003714239240740623, "rougeL_precision": 0.16232375389423376, "rougeL_precision_stderr": 0.0045761676741338285, "rougeL_recall": 0.35147344800261143, "rougeL_recall_stderr": 0.004418957418369871, "rougeLsum_fmeasure": 0.19151809123693445, "rougeLsum_fmeasure_stderr": 0.003873742236641197, "rougeLsum_precision": 0.16801891503402527, "rougeLsum_precision_stderr": 0.004781137280246318, "rougeLsum_recall": 0.3573456612646077, "rougeLsum_recall_stderr": 0.004500705918880425}}, "4": {"PALM_prompt": {"bleu": 0.9902518578876233, "bleu_stderr": 0.040737732761129415, "rouge1_fmeasure": 0.21290146130579965, "rouge1_fmeasure_stderr": 0.0043700834724458276, "rouge1_precision": 0.18789368573901274, "rouge1_precision_stderr": 0.005285851879717852, "rouge1_recall": 0.3899093270922249, "rouge1_recall_stderr": 0.004962887842702003, "rouge2_fmeasure": 0.11683410537192468, "rouge2_fmeasure_stderr": 0.0031833984768610346, "rouge2_precision": 0.10560975558011194, "rouge2_precision_stderr": 0.003731700714381224, "rouge2_recall": 0.2145796483953565, "rouge2_recall_stderr": 0.004035868026365014, "rougeL_fmeasure": 0.1890303353482411, "rougeL_fmeasure_stderr": 0.003700023042470156, "rougeL_precision": 0.16382625310518403, "rougeL_precision_stderr": 0.004486884267673038, "rougeL_recall": 0.3597071518828505, "rougeL_recall_stderr": 0.004521922524975513, "rougeLsum_fmeasure": 0.19508857877285685, "rougeLsum_fmeasure_stderr": 0.003865606407547543, "rougeLsum_precision": 0.1707530708743364, "rougeLsum_precision_stderr": 0.004732141735621611, "rougeLsum_recall": 0.36629613801895733, "rougeLsum_recall_stderr": 0.004591525609735619}}, "5": {"PALM_prompt": {"bleu": 1.0873761263949675, "bleu_stderr": 0.03934330697010386, "rouge1_fmeasure": 0.23018635133450477, "rouge1_fmeasure_stderr": 0.004612080716063762, "rouge1_precision": 0.2117037418048869, "rouge1_precision_stderr": 0.005797595530762211, "rouge1_recall": 0.39911584774379655, "rouge1_recall_stderr": 0.00498148260658174, "rouge2_fmeasure": 0.12701615271509778, "rouge2_fmeasure_stderr": 0.003293666322506656, "rouge2_precision": 0.12132204155512089, "rouge2_precision_stderr": 0.004137193433006742, "rouge2_recall": 0.2209500964153402, "rouge2_recall_stderr": 0.004025388576844993, "rougeL_fmeasure": 0.20292345789497096, "rougeL_fmeasure_stderr": 0.003854003762475144, "rougeL_precision": 0.18406790377006843, "rougeL_precision_stderr": 0.004937322660278917, "rougeL_recall": 0.36561941279365967, "rougeL_recall_stderr": 0.004496892763588926, "rougeLsum_fmeasure": 0.20912697515193987, "rougeLsum_fmeasure_stderr": 0.00401671896906783, "rougeLsum_precision": 0.19092590715178714, "rougeLsum_precision_stderr": 0.005162190150819128, "rougeLsum_recall": 0.3726247925460039, "rougeLsum_recall_stderr": 0.004574209186658482}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.318732566456423, "bleu_stderr": 0.13881817798189033, "rouge1_fmeasure": 0.1485707360665655, "rouge1_fmeasure_stderr": 0.0023637963366588163, "rouge1_precision": 0.14513591635070752, "rouge1_precision_stderr": 0.002933831212103272, "rouge1_recall": 0.19891787194668262, "rouge1_recall_stderr": 0.003187558635297779, "rouge2_fmeasure": 0.03346604327851081, "rouge2_fmeasure_stderr": 0.0009887781706946746, "rouge2_precision": 0.029907874227091444, "rouge2_precision_stderr": 0.000921657729333574, "rouge2_recall": 0.04649017309040178, "rouge2_recall_stderr": 0.0014882369637529544, "rougeL_fmeasure": 0.11823159104307632, "rougeL_fmeasure_stderr": 0.0017829954318414592, "rougeL_precision": 0.11668487543384909, "rougeL_precision_stderr": 0.002498896390227391, "rougeL_recall": 0.16129272420830681, "rougeL_recall_stderr": 0.002571624720811961, "rougeLsum_fmeasure": 0.13755330088318402, "rougeLsum_fmeasure_stderr": 0.00219534310275043, "rougeLsum_precision": 0.13518764121791138, "rougeLsum_precision_stderr": 0.0028124634604811717, "rougeLsum_recall": 0.18445870962398936, "rougeLsum_recall_stderr": 0.0029688108031568614}}, "1": {"tldr_en": {"bleu": 2.926117660825783, "bleu_stderr": 0.0707920058779335, "rouge1_fmeasure": 0.18955083016621427, "rouge1_fmeasure_stderr": 0.002379156871875284, "rouge1_precision": 0.248970781629388, "rouge1_precision_stderr": 0.003874261949118977, "rouge1_recall": 0.20254661208456856, "rouge1_recall_stderr": 0.0028554953044439015, "rouge2_fmeasure": 0.04746965955266556, "rouge2_fmeasure_stderr": 0.0013357687513664974, "rouge2_precision": 0.0689782299271333, "rouge2_precision_stderr": 0.002313495667945766, "rouge2_recall": 0.049284004115414136, "rouge2_recall_stderr": 0.0014678263918321242, "rougeL_fmeasure": 0.14594622749474678, "rougeL_fmeasure_stderr": 0.0018424648214888, "rougeL_precision": 0.19495136652936196, "rougeL_precision_stderr": 0.0032222442707770895, "rougeL_recall": 0.1564134625395888, "rougeL_recall_stderr": 0.002249411692052382, "rougeLsum_fmeasure": 0.17879643931529124, "rougeLsum_fmeasure_stderr": 0.0022381896082302615, "rougeLsum_precision": 0.23507901615131913, "rougeLsum_precision_stderr": 0.0036892088805215145, "rougeLsum_recall": 0.19138092032456272, "rougeLsum_recall_stderr": 0.002706865345022287}}, "2": {"tldr_en": {"bleu": 4.482664241932101, "bleu_stderr": 0.08212180996103731, "rouge1_fmeasure": 0.25402066807919726, "rouge1_fmeasure_stderr": 0.0022729602698409765, "rouge1_precision": 0.35679011396950494, "rouge1_precision_stderr": 0.0039535541787012966, "rouge1_recall": 0.2525122163812022, "rouge1_recall_stderr": 0.0028104769002719686, "rouge2_fmeasure": 0.07776422929510499, "rouge2_fmeasure_stderr": 0.0015470867437394145, "rouge2_precision": 0.11566892110217673, "rouge2_precision_stderr": 0.0025968459819964438, "rouge2_recall": 0.0758485270748662, "rouge2_recall_stderr": 0.0016494845495132536, "rougeL_fmeasure": 0.19606027302517406, "rougeL_fmeasure_stderr": 0.0018502837344183064, "rougeL_precision": 0.2787485396392972, "rougeL_precision_stderr": 0.00333060867857478, "rougeL_recall": 0.1946844152208185, "rougeL_recall_stderr": 0.002269590023349728, "rougeLsum_fmeasure": 0.24005412545885543, "rougeLsum_fmeasure_stderr": 0.0021905454232920693, "rougeLsum_precision": 0.33780149114906943, "rougeLsum_precision_stderr": 0.003819320386381285, "rougeLsum_recall": 0.23865291484527454, "rougeLsum_recall_stderr": 0.0027000637562272383}}, "3": {"tldr_en": {"bleu": 3.0072655046058805, "bleu_stderr": 0.09596043858220096, "rouge1_fmeasure": 0.2113115990683054, "rouge1_fmeasure_stderr": 0.0027547750855983046, "rouge1_precision": 0.3061484539454385, "rouge1_precision_stderr": 0.004423098151670539, "rouge1_recall": 0.20167165147009586, "rouge1_recall_stderr": 0.003037208327664155, "rouge2_fmeasure": 0.06469489115245816, "rouge2_fmeasure_stderr": 0.0015394791419511076, "rouge2_precision": 0.10017174467185903, "rouge2_precision_stderr": 0.002669280506981759, "rouge2_recall": 0.06128388820701079, "rouge2_recall_stderr": 0.0016275016915406385, "rougeL_fmeasure": 0.1649989495768865, "rougeL_fmeasure_stderr": 0.002208842551468564, "rougeL_precision": 0.2427018640988824, "rougeL_precision_stderr": 0.0037099343074804333, "rougeL_recall": 0.1573050958528797, "rougeL_recall_stderr": 0.002452551923134493, "rougeLsum_fmeasure": 0.19989739105994972, "rougeLsum_fmeasure_stderr": 0.0026343775714049306, "rougeLsum_precision": 0.29051399119634647, "rougeLsum_precision_stderr": 0.004266689720705603, "rougeLsum_recall": 0.19088557384190263, "rougeLsum_recall_stderr": 0.0029163289376507914}}, "4": {"tldr_en": {"bleu": 0.02961687910857089, "bleu_stderr": 0.00667797537003518, "rouge1_fmeasure": 0.06646559577850898, "rouge1_fmeasure_stderr": 0.002389468792903583, "rouge1_precision": 0.09734210523837013, "rouge1_precision_stderr": 0.003585657330962349, "rouge1_recall": 0.06338401184872311, "rouge1_recall_stderr": 0.002461970340537986, "rouge2_fmeasure": 0.020767091534965167, "rouge2_fmeasure_stderr": 0.001088077081953597, "rouge2_precision": 0.0321171525149136, "rouge2_precision_stderr": 0.0017883302892353346, "rouge2_recall": 0.019779405420936473, "rouge2_recall_stderr": 0.0011693482689711905, "rougeL_fmeasure": 0.05241812699049824, "rougeL_fmeasure_stderr": 0.0019162468765830335, "rougeL_precision": 0.07721998831670598, "rougeL_precision_stderr": 0.002911924342353414, "rougeL_recall": 0.05016221089779053, "rougeL_recall_stderr": 0.001994888739662481, "rougeLsum_fmeasure": 0.062355467850168954, "rougeLsum_fmeasure_stderr": 0.0022529742622293867, "rougeLsum_precision": 0.09174287392107108, "rougeLsum_precision_stderr": 0.0034181860926286, "rougeLsum_recall": 0.05945776168044872, "rougeLsum_recall_stderr": 0.0023246003955623533}}, "5": {"tldr_en": {"bleu": 5.0425595242130537e-17, "bleu_stderr": 5.828914972997529e-15, "rouge1_fmeasure": 0.010515163702672926, "rouge1_fmeasure_stderr": 0.001060191799945764, "rouge1_precision": 0.01662223836437527, "rouge1_precision_stderr": 0.0017200192815433816, "rouge1_recall": 0.010369987242786843, "rouge1_recall_stderr": 0.0011350821871749513, "rouge2_fmeasure": 0.0034609320209794962, "rouge2_fmeasure_stderr": 0.000506080127090741, "rouge2_precision": 0.0056669762870846345, "rouge2_precision_stderr": 0.0008937346232787428, "rouge2_recall": 0.0035137406087487875, "rouge2_recall_stderr": 0.0005888971524329703, "rougeL_fmeasure": 0.00858027237742148, "rougeL_fmeasure_stderr": 0.0008896522724178397, "rougeL_precision": 0.013605527079975888, "rougeL_precision_stderr": 0.0014624549576253666, "rougeL_recall": 0.008580131540181942, "rougeL_recall_stderr": 0.0009742333834018987, "rougeLsum_fmeasure": 0.010008632917285501, "rougeLsum_fmeasure_stderr": 0.0010183013884774283, "rougeLsum_precision": 0.015819080798652796, "rougeLsum_precision_stderr": 0.001649211944475785, "rougeLsum_recall": 0.009938412449545603, "rougeLsum_recall_stderr": 0.0011052383189021134}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.727703428672484, "bleu_stderr": 0.16225275145292814, "rouge1_fmeasure": 0.11677483396146617, "rouge1_fmeasure_stderr": 0.0026043522717618377, "rouge1_precision": 0.15580458811715164, "rouge1_precision_stderr": 0.004522055277902883, "rouge1_recall": 0.16212105310140335, "rouge1_recall_stderr": 0.0038770863196531777, "rouge2_fmeasure": 0.04207195309354536, "rouge2_fmeasure_stderr": 0.0011458322627199238, "rouge2_precision": 0.08759078800738393, "rouge2_precision_stderr": 0.004060781916683637, "rouge2_recall": 0.05918434764233474, "rouge2_recall_stderr": 0.001745078033222418, "rougeL_fmeasure": 0.0970712318345091, "rougeL_fmeasure_stderr": 0.002118544001686245, "rougeL_precision": 0.13984894212789747, "rougeL_precision_stderr": 0.0044166136430229454, "rougeL_recall": 0.1340573709320961, "rougeL_recall_stderr": 0.0031951681845401422, "rougeLsum_fmeasure": 0.10190557193119215, "rougeLsum_fmeasure_stderr": 0.0022803567703003585, "rougeLsum_precision": 0.14388054529296093, "rougeLsum_precision_stderr": 0.004453879377805569, "rougeLsum_recall": 0.14043655820750423, "rougeLsum_recall_stderr": 0.003394973082402341}}, "1": {"generate_text_restaurant": {"bleu": 12.253787105095654, "bleu_stderr": 0.1531956101772429, "rouge1_fmeasure": 0.48227052690365063, "rouge1_fmeasure_stderr": 0.002317881335104088, "rouge1_precision": 0.5916925132644157, "rouge1_precision_stderr": 0.00322106862270845, "rouge1_recall": 0.4463736554053124, "rouge1_recall_stderr": 0.0030343957764200962, "rouge2_fmeasure": 0.22765626131798156, "rouge2_fmeasure_stderr": 0.0020703831562182183, "rouge2_precision": 0.2838064949819702, "rouge2_precision_stderr": 0.0027789665070579485, "rouge2_recall": 0.2105351617568858, "rouge2_recall_stderr": 0.0022171957363924296, "rougeL_fmeasure": 0.34946822459925764, "rougeL_fmeasure_stderr": 0.0020719433030147168, "rougeL_precision": 0.43217159380951586, "rougeL_precision_stderr": 0.0029973445963260422, "rougeL_recall": 0.32230580960354577, "rougeL_recall_stderr": 0.0024575923055788883, "rougeLsum_fmeasure": 0.3932633862316688, "rougeLsum_fmeasure_stderr": 0.002340814619008484, "rougeLsum_precision": 0.48366766620362545, "rougeLsum_precision_stderr": 0.0032029340181387936, "rougeLsum_recall": 0.36354140064435775, "rougeLsum_recall_stderr": 0.0027910193529178087}}, "2": {"generate_text_restaurant": {"bleu": 14.934222887897532, "bleu_stderr": 0.21278791413856055, "rouge1_fmeasure": 0.5160444555536067, "rouge1_fmeasure_stderr": 0.00228951307038736, "rouge1_precision": 0.6137196300402722, "rouge1_precision_stderr": 0.0031475076717186906, "rouge1_recall": 0.48231229181347424, "rouge1_recall_stderr": 0.002994935365094384, "rouge2_fmeasure": 0.2594509643808628, "rouge2_fmeasure_stderr": 0.0021848217582054836, "rouge2_precision": 0.3128748478554642, "rouge2_precision_stderr": 0.0028571435877434844, "rouge2_recall": 0.24224424588278937, "rouge2_recall_stderr": 0.002338319832959116, "rougeL_fmeasure": 0.37626113256931526, "rougeL_fmeasure_stderr": 0.0021363134322449147, "rougeL_precision": 0.4496161361042401, "rougeL_precision_stderr": 0.002965960715542685, "rougeL_recall": 0.35106979956416473, "rougeL_recall_stderr": 0.0025186871703425898, "rougeLsum_fmeasure": 0.42631762287613684, "rougeLsum_fmeasure_stderr": 0.00239431216095114, "rougeLsum_precision": 0.5075541777391559, "rougeLsum_precision_stderr": 0.003180746032746745, "rougeLsum_recall": 0.3982152959674102, "rougeLsum_recall_stderr": 0.002838831357019365}}, "3": {"generate_text_restaurant": {"bleu": 15.550166229098965, "bleu_stderr": 0.21100324131614723, "rouge1_fmeasure": 0.5240247422123916, "rouge1_fmeasure_stderr": 0.0022364704364336795, "rouge1_precision": 0.6164424274284126, "rouge1_precision_stderr": 0.0031180799828950793, "rouge1_recall": 0.49077882213900875, "rouge1_recall_stderr": 0.002902655623188235, "rouge2_fmeasure": 0.2655834480484239, "rouge2_fmeasure_stderr": 0.0021599553939183514, "rouge2_precision": 0.3165046826237152, "rouge2_precision_stderr": 0.002782884317807398, "rouge2_recall": 0.24838679979596562, "rouge2_recall_stderr": 0.0023021548981493603, "rougeL_fmeasure": 0.38460762326164105, "rougeL_fmeasure_stderr": 0.0021692227869033478, "rougeL_precision": 0.45453733777109134, "rougeL_precision_stderr": 0.0029846320407150165, "rougeL_recall": 0.35943496219625287, "rougeL_recall_stderr": 0.002503259775759038, "rougeLsum_fmeasure": 0.4372484038981659, "rougeLsum_fmeasure_stderr": 0.002394606287079693, "rougeLsum_precision": 0.5148376263253541, "rougeLsum_precision_stderr": 0.0031730624992959006, "rougeLsum_recall": 0.4092664706750279, "rougeLsum_recall_stderr": 0.002806743229599867}}, "4": {"generate_text_restaurant": {"bleu": 15.882734366339575, "bleu_stderr": 0.20014792318307217, "rouge1_fmeasure": 0.5268811401300224, "rouge1_fmeasure_stderr": 0.002296537855184882, "rouge1_precision": 0.617829970748385, "rouge1_precision_stderr": 0.0031627975477214964, "rouge1_recall": 0.4934813921824228, "rouge1_recall_stderr": 0.0028974554963002866, "rouge2_fmeasure": 0.2697484320297564, "rouge2_fmeasure_stderr": 0.0022376325121211964, "rouge2_precision": 0.31982595333569097, "rouge2_precision_stderr": 0.002834806640598152, "rouge2_recall": 0.2524396113719004, "rouge2_recall_stderr": 0.002374239958755143, "rougeL_fmeasure": 0.38708461902411606, "rougeL_fmeasure_stderr": 0.002242571110085911, "rougeL_precision": 0.4547884196194121, "rougeL_precision_stderr": 0.0029857683967765883, "rougeL_recall": 0.36242825370807497, "rougeL_recall_stderr": 0.00256326606590863, "rougeLsum_fmeasure": 0.4406025001412695, "rougeLsum_fmeasure_stderr": 0.0024516808057243426, "rougeLsum_precision": 0.5162937268213095, "rougeLsum_precision_stderr": 0.003187124832415968, "rougeLsum_recall": 0.4128904038699314, "rougeLsum_recall_stderr": 0.0028402630566630306}}, "5": {"generate_text_restaurant": {"bleu": 15.843755546529314, "bleu_stderr": 0.23380050838762192, "rouge1_fmeasure": 0.5268007980171965, "rouge1_fmeasure_stderr": 0.0022830659821340285, "rouge1_precision": 0.6189660173531598, "rouge1_precision_stderr": 0.003202216597059111, "rouge1_recall": 0.49387787166933184, "rouge1_recall_stderr": 0.00292152626004202, "rouge2_fmeasure": 0.26998871183013234, "rouge2_fmeasure_stderr": 0.0021968682565042135, "rouge2_precision": 0.32113506253536717, "rouge2_precision_stderr": 0.002823936183018856, "rouge2_recall": 0.25289072399317875, "rouge2_recall_stderr": 0.0023526916821315917, "rougeL_fmeasure": 0.38651546157123484, "rougeL_fmeasure_stderr": 0.002221892359998622, "rougeL_precision": 0.45492486270003574, "rougeL_precision_stderr": 0.002993407598047982, "rougeL_recall": 0.3622504836542661, "rougeL_recall_stderr": 0.002569318863258077, "rougeLsum_fmeasure": 0.44101339790167177, "rougeLsum_fmeasure_stderr": 0.0024405926593214712, "rougeLsum_precision": 0.5179172358492237, "rougeLsum_precision_stderr": 0.0032140290174150392, "rougeLsum_recall": 0.4135121055179984, "rougeLsum_recall_stderr": 0.0028519351140784115}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.4021403698389303, "bleu_stderr": 0.067244017856741, "rouge1_fmeasure": 0.22900899484830875, "rouge1_fmeasure_stderr": 0.002592170416843644, "rouge1_precision": 0.16834475552405148, "rouge1_precision_stderr": 0.002143734509300351, "rouge1_recall": 0.3812302941527284, "rouge1_recall_stderr": 0.004466228338306761, "rouge2_fmeasure": 0.056521580391250686, "rouge2_fmeasure_stderr": 0.0017743435854162372, "rouge2_precision": 0.04090800254887081, "rouge2_precision_stderr": 0.001331305757616898, "rouge2_recall": 0.09717669760813316, "rouge2_recall_stderr": 0.0031305903631064793, "rougeL_fmeasure": 0.16720327598962526, "rougeL_fmeasure_stderr": 0.0020460047183396978, "rougeL_precision": 0.12317643828521285, "rougeL_precision_stderr": 0.0017773839864967875, "rougeL_recall": 0.2797265709939456, "rougeL_recall_stderr": 0.0036566609405842706, "rougeLsum_fmeasure": 0.18208585408505043, "rougeLsum_fmeasure_stderr": 0.00227028405987386, "rougeLsum_precision": 0.1339383429556578, "rougeLsum_precision_stderr": 0.0019089490816378235, "rougeLsum_recall": 0.3045528042834054, "rougeLsum_recall_stderr": 0.004033519848039268}}, "1": {"article_DOC_summary": {"bleu": 2.587973656923586, "bleu_stderr": 0.18271829592693106, "rouge1_fmeasure": 0.22010181845678548, "rouge1_fmeasure_stderr": 0.0035289902424468713, "rouge1_precision": 0.2269075171314588, "rouge1_precision_stderr": 0.004333302214651766, "rouge1_recall": 0.24532990314212508, "rouge1_recall_stderr": 0.003988402379243326, "rouge2_fmeasure": 0.04964766400970655, "rouge2_fmeasure_stderr": 0.002214536505649702, "rouge2_precision": 0.052629985612383176, "rouge2_precision_stderr": 0.0026272197223151062, "rouge2_recall": 0.05514694411681367, "rouge2_recall_stderr": 0.002420925395374345, "rougeL_fmeasure": 0.16668249804551818, "rougeL_fmeasure_stderr": 0.002858700548419538, "rougeL_precision": 0.1721080105359506, "rougeL_precision_stderr": 0.003568699794613313, "rougeL_recall": 0.18654158462763012, "rougeL_recall_stderr": 0.0032259213006477503, "rougeLsum_fmeasure": 0.168054574737479, "rougeLsum_fmeasure_stderr": 0.0028788648929427077, "rougeLsum_precision": 0.17318677280555747, "rougeLsum_precision_stderr": 0.0035702087618089537, "rougeLsum_recall": 0.18887685302453283, "rougeLsum_recall_stderr": 0.003346146632601525}}, "2": {"article_DOC_summary": {"bleu": 2.894349555094649, "bleu_stderr": 0.20938561045016857, "rouge1_fmeasure": 0.22681874696670282, "rouge1_fmeasure_stderr": 0.0035634918232377074, "rouge1_precision": 0.23962839287439405, "rouge1_precision_stderr": 0.004379685148326459, "rouge1_recall": 0.24084621475655701, "rouge1_recall_stderr": 0.003791189958412602, "rouge2_fmeasure": 0.05421648195620343, "rouge2_fmeasure_stderr": 0.0023479794051376422, "rouge2_precision": 0.05825281986003038, "rouge2_precision_stderr": 0.0026810505602905777, "rouge2_recall": 0.056775559678677465, "rouge2_recall_stderr": 0.002426834325647628, "rougeL_fmeasure": 0.17492387767242143, "rougeL_fmeasure_stderr": 0.0029179836282209, "rougeL_precision": 0.1845331825452337, "rougeL_precision_stderr": 0.003580958301916115, "rougeL_recall": 0.18702023658313446, "rougeL_recall_stderr": 0.003174519635609054, "rougeLsum_fmeasure": 0.17541718007715307, "rougeLsum_fmeasure_stderr": 0.0029228112018755566, "rougeLsum_precision": 0.1851109498506353, "rougeLsum_precision_stderr": 0.0035857038136326946, "rougeLsum_recall": 0.1874180466959014, "rougeLsum_recall_stderr": 0.0031802199539878143}}, "3": {"article_DOC_summary": {"bleu": 3.3807185036949794, "bleu_stderr": 0.21747997288721058, "rouge1_fmeasure": 0.22875361771029976, "rouge1_fmeasure_stderr": 0.004084338493816929, "rouge1_precision": 0.2493216186518576, "rouge1_precision_stderr": 0.0049084458961309585, "rouge1_recall": 0.23437481439478378, "rouge1_recall_stderr": 0.00414167551937926, "rouge2_fmeasure": 0.058170472646030105, "rouge2_fmeasure_stderr": 0.002575465227066567, "rouge2_precision": 0.06426035314217526, "rouge2_precision_stderr": 0.0029430708789924724, "rouge2_recall": 0.05825170712730457, "rouge2_recall_stderr": 0.0025593535615914685, "rougeL_fmeasure": 0.17477557833568425, "rougeL_fmeasure_stderr": 0.003389234359734604, "rougeL_precision": 0.19044444305113, "rougeL_precision_stderr": 0.004057392892502295, "rougeL_recall": 0.1796956624879631, "rougeL_recall_stderr": 0.0034464435487860908, "rougeLsum_fmeasure": 0.17624162420032213, "rougeLsum_fmeasure_stderr": 0.0033874175464656286, "rougeLsum_precision": 0.19176785292110343, "rougeLsum_precision_stderr": 0.004051683679981941, "rougeLsum_recall": 0.1817729140455351, "rougeLsum_recall_stderr": 0.0034769204180304563}}, "4": {"article_DOC_summary": {"bleu": 0.11188091321184003, "bleu_stderr": 0.02931669237987178, "rouge1_fmeasure": 0.055495095278546853, "rouge1_fmeasure_stderr": 0.0035332699011158906, "rouge1_precision": 0.06368749975876878, "rouge1_precision_stderr": 0.004163291649927898, "rouge1_recall": 0.05633355721143847, "rouge1_recall_stderr": 0.003674327931986382, "rouge2_fmeasure": 0.014126745787126624, "rouge2_fmeasure_stderr": 0.0014244420667161892, "rouge2_precision": 0.01625809935844295, "rouge2_precision_stderr": 0.001744008450570692, "rouge2_recall": 0.014127871826558468, "rouge2_recall_stderr": 0.0014279592951878697, "rougeL_fmeasure": 0.042006581084650725, "rougeL_fmeasure_stderr": 0.002736359920016333, "rougeL_precision": 0.048922678698080466, "rougeL_precision_stderr": 0.003321571568066582, "rougeL_recall": 0.04260833742058202, "rougeL_recall_stderr": 0.0028433365851945613, "rougeLsum_fmeasure": 0.04223553974124363, "rougeLsum_fmeasure_stderr": 0.0027526558135678226, "rougeLsum_precision": 0.049121011091270275, "rougeLsum_precision_stderr": 0.0033301554897141204, "rougeLsum_recall": 0.042965789423027506, "rougeLsum_recall_stderr": 0.0028912525537101723}}, "5": {"article_DOC_summary": {"bleu": 2.2560041633308834e-45, "bleu_stderr": 1.8421463064085026e-30, "rouge1_fmeasure": 0.002533146063446972, "rouge1_fmeasure_stderr": 0.0008926774274323296, "rouge1_precision": 0.002666427538603693, "rouge1_precision_stderr": 0.001013100557726815, "rouge1_recall": 0.0025861091150659685, "rouge1_recall_stderr": 0.0008630243879275325, "rouge2_fmeasure": 0.0006840490507628032, "rouge2_fmeasure_stderr": 0.00035126113759812106, "rouge2_precision": 0.0007974546793721661, "rouge2_precision_stderr": 0.0004389545775956508, "rouge2_recall": 0.0006295661715078737, "rouge2_recall_stderr": 0.00030074990914758437, "rougeL_fmeasure": 0.0018694866909844472, "rougeL_fmeasure_stderr": 0.0007140136174659965, "rougeL_precision": 0.002022260888604123, "rougeL_precision_stderr": 0.0008452973095491474, "rougeL_recall": 0.0018561449826780235, "rougeL_recall_stderr": 0.0006574742336350735, "rougeLsum_fmeasure": 0.0019113224438188196, "rougeLsum_fmeasure_stderr": 0.000720026623055153, "rougeLsum_precision": 0.00205235327222183, "rougeLsum_precision_stderr": 0.0008479115148274438, "rougeLsum_recall": 0.0019247556173263936, "rougeLsum_recall_stderr": 0.0006749866649238696}}}}
8b7178b58b/evaluation/rankeval/8b7178b58b_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.331,0.014888272588203936,0
3
+ anli_r2,acc,0.336,0.014944140233795027,0
4
+ anli_r3,acc,0.3425,0.013704669762934727,0
5
+ arc_challenge,acc,0.2781569965870307,0.013094469919538816,0
6
+ arc_challenge,acc_norm,0.29436860068259385,0.013318528460539426,0
7
+ arc_easy,acc,0.609006734006734,0.010012992232540633,0
8
+ arc_easy,acc_norm,0.5593434343434344,0.010187264635711991,0
9
+ boolq,acc,0.5892966360856269,0.008604460608471413,1
10
+ cb,acc,0.42857142857142855,0.06672848092813058,1
11
+ cb,f1,0.21956970232832299,,1
12
+ copa,acc,0.74,0.044084400227680794,0
13
+ hellaswag,acc,0.4480183230432185,0.004962742426849887,0
14
+ hellaswag,acc_norm,0.5839474208325035,0.0049189510191838875,0
15
+ piqa,acc,0.7442872687704026,0.010178690109459857,0
16
+ piqa,acc_norm,0.7546245919477693,0.010039831320422386,0
17
+ rte,acc,0.5631768953068592,0.029855247390314945,0
18
+ sciq,acc,0.865,0.010811655372416053,0
19
+ sciq,acc_norm,0.793,0.012818553557843983,0
20
+ storycloze_2016,acc,0.6916087653661144,0.010679734445487797,0
21
+ winogrande,acc,0.5730071033938438,0.01390187807257506,0
8b7178b58b/evaluation/rankeval/8b7178b58b_0_lm-eval_global_step84877_2023-02-04-19-09-29_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.331,
5
- "acc_stderr": 0.014888272588203936
6
- },
7
- "anli_r2": {
8
- "acc": 0.336,
9
- "acc_stderr": 0.014944140233795027
10
- },
11
- "anli_r3": {
12
- "acc": 0.3425,
13
- "acc_stderr": 0.013704669762934727
14
- },
15
- "cb": {
16
- "acc": 0.42857142857142855,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.21956970232832299
19
- },
20
- "copa": {
21
- "acc": 0.74,
22
- "acc_stderr": 0.044084400227680794
23
- },
24
- "hellaswag": {
25
- "acc": 0.4480183230432185,
26
- "acc_stderr": 0.004962742426849887,
27
- "acc_norm": 0.5839474208325035,
28
- "acc_norm_stderr": 0.0049189510191838875
29
- },
30
- "rte": {
31
- "acc": 0.5631768953068592,
32
- "acc_stderr": 0.029855247390314945
33
- },
34
- "winogrande": {
35
- "acc": 0.5730071033938438,
36
- "acc_stderr": 0.01390187807257506
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6916087653661144,
40
- "acc_stderr": 0.010679734445487797
41
- },
42
- "boolq": {
43
- "acc": 0.5892966360856269,
44
- "acc_stderr": 0.008604460608471413
45
- },
46
- "arc_easy": {
47
- "acc": 0.609006734006734,
48
- "acc_stderr": 0.010012992232540633,
49
- "acc_norm": 0.5593434343434344,
50
- "acc_norm_stderr": 0.010187264635711991
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2781569965870307,
54
- "acc_stderr": 0.013094469919538816,
55
- "acc_norm": 0.29436860068259385,
56
- "acc_norm_stderr": 0.013318528460539426
57
- },
58
- "sciq": {
59
- "acc": 0.865,
60
- "acc_stderr": 0.010811655372416053,
61
- "acc_norm": 0.793,
62
- "acc_norm_stderr": 0.012818553557843983
63
- },
64
- "piqa": {
65
- "acc": 0.7442872687704026,
66
- "acc_stderr": 0.010178690109459857,
67
- "acc_norm": 0.7546245919477693,
68
- "acc_norm_stderr": 0.010039831320422386
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b7178b58b/evaluation/rankeval/8b7178b58b_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.337,0.014955087918653605,0
3
+ anli_r2,acc,0.339,0.014976758771620345,0
4
+ anli_r3,acc,0.33,0.013579531277800923,0
5
+ arc_challenge,acc,0.2901023890784983,0.01326157367752076,0
6
+ arc_challenge,acc_norm,0.3174061433447099,0.01360223908803817,0
7
+ arc_easy,acc,0.63510101010101,0.009878157021155649,0
8
+ arc_easy,acc_norm,0.625,0.009933992677987828,0
9
+ boolq,acc,0.618348623853211,0.008496550741178263,1
10
+ cb,acc,0.4642857142857143,0.06724777654937658,1
11
+ cb,f1,0.32751039809863336,,1
12
+ copa,acc,0.78,0.04163331998932261,0
13
+ hellaswag,acc,0.4508066122286397,0.004965572246803864,0
14
+ hellaswag,acc_norm,0.5974905397331209,0.004894012555642632,0
15
+ piqa,acc,0.7470076169749728,0.01014288869886246,0
16
+ piqa,acc_norm,0.7486398258977149,0.01012115601681924,0
17
+ rte,acc,0.5018050541516246,0.030096267148976626,0
18
+ sciq,acc,0.905,0.009276910103103317,0
19
+ sciq,acc_norm,0.906,0.009233052000787735,0
20
+ storycloze_2016,acc,0.7001603420630679,0.010595525174558598,0
21
+ winogrande,acc,0.5611681136543015,0.013946933444507032,0
8b7178b58b/evaluation/rankeval/8b7178b58b_1_lm-eval_global_step84877_2023-02-04-19-09-29_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.337,
5
- "acc_stderr": 0.014955087918653605
6
- },
7
- "anli_r2": {
8
- "acc": 0.339,
9
- "acc_stderr": 0.014976758771620345
10
- },
11
- "anli_r3": {
12
- "acc": 0.33,
13
- "acc_stderr": 0.013579531277800923
14
- },
15
- "cb": {
16
- "acc": 0.4642857142857143,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.32751039809863336
19
- },
20
- "copa": {
21
- "acc": 0.78,
22
- "acc_stderr": 0.04163331998932261
23
- },
24
- "hellaswag": {
25
- "acc": 0.4508066122286397,
26
- "acc_stderr": 0.004965572246803864,
27
- "acc_norm": 0.5974905397331209,
28
- "acc_norm_stderr": 0.004894012555642632
29
- },
30
- "rte": {
31
- "acc": 0.5018050541516246,
32
- "acc_stderr": 0.030096267148976626
33
- },
34
- "winogrande": {
35
- "acc": 0.5611681136543015,
36
- "acc_stderr": 0.013946933444507032
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7001603420630679,
40
- "acc_stderr": 0.010595525174558598
41
- },
42
- "boolq": {
43
- "acc": 0.618348623853211,
44
- "acc_stderr": 0.008496550741178263
45
- },
46
- "arc_easy": {
47
- "acc": 0.63510101010101,
48
- "acc_stderr": 0.009878157021155649,
49
- "acc_norm": 0.625,
50
- "acc_norm_stderr": 0.009933992677987828
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2901023890784983,
54
- "acc_stderr": 0.01326157367752076,
55
- "acc_norm": 0.3174061433447099,
56
- "acc_norm_stderr": 0.01360223908803817
57
- },
58
- "sciq": {
59
- "acc": 0.905,
60
- "acc_stderr": 0.009276910103103317,
61
- "acc_norm": 0.906,
62
- "acc_norm_stderr": 0.009233052000787735
63
- },
64
- "piqa": {
65
- "acc": 0.7470076169749728,
66
- "acc_stderr": 0.01014288869886246,
67
- "acc_norm": 0.7486398258977149,
68
- "acc_norm_stderr": 0.01012115601681924
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b7178b58b/evaluation/rankeval/8b7178b58b_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.325,0.014818724459095526,0
3
+ anli_r2,acc,0.326,0.014830507204541033,0
4
+ anli_r3,acc,0.3475,0.013751753243291854,0
5
+ arc_challenge,acc,0.31569965870307165,0.013582571095815293,0
6
+ arc_challenge,acc_norm,0.3250853242320819,0.013688147309729124,0
7
+ arc_easy,acc,0.6367845117845118,0.009868397136118794,0
8
+ arc_easy,acc_norm,0.63510101010101,0.009878157021155649,0
9
+ boolq,acc,0.6039755351681957,0.008553881336813412,1
10
+ cb,acc,0.48214285714285715,0.06737697508644648,1
11
+ cb,f1,0.3356643356643356,,1
12
+ copa,acc,0.72,0.04512608598542129,0
13
+ hellaswag,acc,0.4523003385779725,0.004967023435680015,0
14
+ hellaswag,acc_norm,0.5990838478390759,0.004890824718530304,0
15
+ piqa,acc,0.750816104461371,0.01009188277012022,0
16
+ piqa,acc_norm,0.7546245919477693,0.010039831320422386,0
17
+ rte,acc,0.49458483754512633,0.030094698123239966,0
18
+ sciq,acc,0.927,0.00823035471524406,0
19
+ sciq,acc_norm,0.924,0.008384169266796384,0
20
+ storycloze_2016,acc,0.694815606627472,0.010648664383985665,0
21
+ winogrande,acc,0.5911602209944752,0.013816954295135686,0
8b7178b58b/evaluation/rankeval/8b7178b58b_2_lm-eval_global_step84877_2023-02-04-19-09-29_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.325,
5
- "acc_stderr": 0.014818724459095526
6
- },
7
- "anli_r2": {
8
- "acc": 0.326,
9
- "acc_stderr": 0.014830507204541033
10
- },
11
- "anli_r3": {
12
- "acc": 0.3475,
13
- "acc_stderr": 0.013751753243291854
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.3356643356643356
19
- },
20
- "copa": {
21
- "acc": 0.72,
22
- "acc_stderr": 0.04512608598542129
23
- },
24
- "hellaswag": {
25
- "acc": 0.4523003385779725,
26
- "acc_stderr": 0.004967023435680015,
27
- "acc_norm": 0.5990838478390759,
28
- "acc_norm_stderr": 0.004890824718530304
29
- },
30
- "rte": {
31
- "acc": 0.49458483754512633,
32
- "acc_stderr": 0.030094698123239966
33
- },
34
- "winogrande": {
35
- "acc": 0.5911602209944752,
36
- "acc_stderr": 0.013816954295135686
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.694815606627472,
40
- "acc_stderr": 0.010648664383985665
41
- },
42
- "boolq": {
43
- "acc": 0.6039755351681957,
44
- "acc_stderr": 0.008553881336813412
45
- },
46
- "arc_easy": {
47
- "acc": 0.6367845117845118,
48
- "acc_stderr": 0.009868397136118794,
49
- "acc_norm": 0.63510101010101,
50
- "acc_norm_stderr": 0.009878157021155649
51
- },
52
- "arc_challenge": {
53
- "acc": 0.31569965870307165,
54
- "acc_stderr": 0.013582571095815293,
55
- "acc_norm": 0.3250853242320819,
56
- "acc_norm_stderr": 0.013688147309729124
57
- },
58
- "sciq": {
59
- "acc": 0.927,
60
- "acc_stderr": 0.00823035471524406,
61
- "acc_norm": 0.924,
62
- "acc_norm_stderr": 0.008384169266796384
63
- },
64
- "piqa": {
65
- "acc": 0.750816104461371,
66
- "acc_stderr": 0.01009188277012022,
67
- "acc_norm": 0.7546245919477693,
68
- "acc_norm_stderr": 0.010039831320422386
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b7178b58b/evaluation/rankeval/8b7178b58b_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.321,0.014770821817934656,0
3
+ anli_r2,acc,0.333,0.014910846164229859,0
4
+ anli_r3,acc,0.3475,0.013751753243291852,0
5
+ arc_challenge,acc,0.3122866894197952,0.013542598541688064,0
6
+ arc_challenge,acc_norm,0.33532423208191126,0.013796182947785566,0
7
+ arc_easy,acc,0.6359427609427609,0.009873293392779118,0
8
+ arc_easy,acc_norm,0.6325757575757576,0.00989255261621155,0
9
+ boolq,acc,0.600611620795107,0.008566178448007833,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.269763077644851,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.4523999203345947,0.004967118575905285,0
14
+ hellaswag,acc_norm,0.5977892850029874,0.004893418929918259,0
15
+ piqa,acc,0.750272034820457,0.010099232969867488,0
16
+ piqa,acc_norm,0.7573449401523396,0.010002002569708688,0
17
+ rte,acc,0.5054151624548736,0.030094698123239966,0
18
+ sciq,acc,0.926,0.008282064512704159,0
19
+ sciq,acc_norm,0.928,0.008178195576218681,0
20
+ storycloze_2016,acc,0.711918760021379,0.010472537019822575,0
21
+ winogrande,acc,0.5864246250986582,0.013840971763195308,0
8b7178b58b/evaluation/rankeval/8b7178b58b_3_lm-eval_global_step84877_2023-02-04-19-09-29_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.321,
5
- "acc_stderr": 0.014770821817934656
6
- },
7
- "anli_r2": {
8
- "acc": 0.333,
9
- "acc_stderr": 0.014910846164229859
10
- },
11
- "anli_r3": {
12
- "acc": 0.3475,
13
- "acc_stderr": 0.013751753243291852
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.269763077644851
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4523999203345947,
26
- "acc_stderr": 0.004967118575905285,
27
- "acc_norm": 0.5977892850029874,
28
- "acc_norm_stderr": 0.004893418929918259
29
- },
30
- "rte": {
31
- "acc": 0.5054151624548736,
32
- "acc_stderr": 0.030094698123239966
33
- },
34
- "winogrande": {
35
- "acc": 0.5864246250986582,
36
- "acc_stderr": 0.013840971763195308
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.711918760021379,
40
- "acc_stderr": 0.010472537019822575
41
- },
42
- "boolq": {
43
- "acc": 0.600611620795107,
44
- "acc_stderr": 0.008566178448007833
45
- },
46
- "arc_easy": {
47
- "acc": 0.6359427609427609,
48
- "acc_stderr": 0.009873293392779118,
49
- "acc_norm": 0.6325757575757576,
50
- "acc_norm_stderr": 0.00989255261621155
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3122866894197952,
54
- "acc_stderr": 0.013542598541688064,
55
- "acc_norm": 0.33532423208191126,
56
- "acc_norm_stderr": 0.013796182947785566
57
- },
58
- "sciq": {
59
- "acc": 0.926,
60
- "acc_stderr": 0.008282064512704159,
61
- "acc_norm": 0.928,
62
- "acc_norm_stderr": 0.008178195576218681
63
- },
64
- "piqa": {
65
- "acc": 0.750272034820457,
66
- "acc_stderr": 0.010099232969867488,
67
- "acc_norm": 0.7573449401523396,
68
- "acc_norm_stderr": 0.010002002569708688
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b7178b58b/evaluation/rankeval/8b7178b58b_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.324,0.014806864733738856,0
3
+ anli_r2,acc,0.333,0.014910846164229873,0
4
+ anli_r3,acc,0.33666666666666667,0.013647602942406401,0
5
+ arc_challenge,acc,0.29948805460750855,0.013385021637313565,0
6
+ arc_challenge,acc_norm,0.3387372013651877,0.01383056892797433,0
7
+ arc_easy,acc,0.6439393939393939,0.00982545460841631,0
8
+ arc_easy,acc_norm,0.640993265993266,0.009843424713072174,0
9
+ boolq,acc,0.5883792048929664,0.008607357686607963,1
10
+ cb,acc,0.35714285714285715,0.0646095738380922,1
11
+ cb,f1,0.2275946275946276,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.4509061939852619,0.004965670398127354,0
14
+ hellaswag,acc_norm,0.5998805018920533,0.004889210628907973,0
15
+ piqa,acc,0.750816104461371,0.010091882770120216,0
16
+ piqa,acc_norm,0.750272034820457,0.010099232969867472,0
17
+ rte,acc,0.4584837545126354,0.029992535385373314,0
18
+ sciq,acc,0.922,0.008484573530118583,0
19
+ sciq,acc_norm,0.93,0.008072494358323499,0
20
+ storycloze_2016,acc,0.7081774452164618,0.010512588616199622,0
21
+ winogrande,acc,0.5824782951854776,0.013859978264440248,0
8b7178b58b/evaluation/rankeval/8b7178b58b_4_lm-eval_global_step84877_2023-02-04-19-09-29_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.324,
5
- "acc_stderr": 0.014806864733738856
6
- },
7
- "anli_r2": {
8
- "acc": 0.333,
9
- "acc_stderr": 0.014910846164229873
10
- },
11
- "anli_r3": {
12
- "acc": 0.33666666666666667,
13
- "acc_stderr": 0.013647602942406401
14
- },
15
- "cb": {
16
- "acc": 0.35714285714285715,
17
- "acc_stderr": 0.0646095738380922,
18
- "f1": 0.2275946275946276
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4509061939852619,
26
- "acc_stderr": 0.004965670398127354,
27
- "acc_norm": 0.5998805018920533,
28
- "acc_norm_stderr": 0.004889210628907973
29
- },
30
- "rte": {
31
- "acc": 0.4584837545126354,
32
- "acc_stderr": 0.029992535385373314
33
- },
34
- "winogrande": {
35
- "acc": 0.5824782951854776,
36
- "acc_stderr": 0.013859978264440248
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7081774452164618,
40
- "acc_stderr": 0.010512588616199622
41
- },
42
- "boolq": {
43
- "acc": 0.5883792048929664,
44
- "acc_stderr": 0.008607357686607963
45
- },
46
- "arc_easy": {
47
- "acc": 0.6439393939393939,
48
- "acc_stderr": 0.00982545460841631,
49
- "acc_norm": 0.640993265993266,
50
- "acc_norm_stderr": 0.009843424713072174
51
- },
52
- "arc_challenge": {
53
- "acc": 0.29948805460750855,
54
- "acc_stderr": 0.013385021637313565,
55
- "acc_norm": 0.3387372013651877,
56
- "acc_norm_stderr": 0.01383056892797433
57
- },
58
- "sciq": {
59
- "acc": 0.922,
60
- "acc_stderr": 0.008484573530118583,
61
- "acc_norm": 0.93,
62
- "acc_norm_stderr": 0.008072494358323499
63
- },
64
- "piqa": {
65
- "acc": 0.750816104461371,
66
- "acc_stderr": 0.010091882770120216,
67
- "acc_norm": 0.750272034820457,
68
- "acc_norm_stderr": 0.010099232969867472
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b7178b58b/evaluation/rankeval/8b7178b58b_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.34,0.014987482264363937,0
3
+ anli_r2,acc,0.326,0.014830507204541033,0
4
+ anli_r3,acc,0.3458333333333333,0.013736245342311014,0
5
+ arc_challenge,acc,0.3165529010238908,0.01359243151906808,0
6
+ arc_challenge,acc_norm,0.3370307167235495,0.013813476652902274,0
7
+ arc_easy,acc,0.6426767676767676,0.009833205612463114,0
8
+ arc_easy,acc_norm,0.6426767676767676,0.009833205612463106,0
9
+ boolq,acc,0.5801223241590214,0.008632045504781744,1
10
+ cb,acc,0.5178571428571429,0.06737697508644648,1
11
+ cb,f1,0.33534439416792355,,1
12
+ copa,acc,0.73,0.044619604333847394,0
13
+ hellaswag,acc,0.45030870344552876,0.0049650784774355715,0
14
+ hellaswag,acc_norm,0.60017924716192,0.004888601874547486,0
15
+ piqa,acc,0.7584330794341676,0.009986718001804461,0
16
+ piqa,acc_norm,0.7600652883569097,0.009963625892809545,0
17
+ rte,acc,0.48375451263537905,0.030080573208738064,0
18
+ sciq,acc,0.932,0.007964887911291603,0
19
+ sciq,acc_norm,0.929,0.008125578442487914,0
20
+ storycloze_2016,acc,0.7012292891501871,0.010584692134739974,0
21
+ winogrande,acc,0.5674822415153907,0.013923911578623827,0
8b7178b58b/evaluation/rankeval/8b7178b58b_5_lm-eval_global_step84877_2023-02-04-19-09-29_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.34,
5
- "acc_stderr": 0.014987482264363937
6
- },
7
- "anli_r2": {
8
- "acc": 0.326,
9
- "acc_stderr": 0.014830507204541033
10
- },
11
- "anli_r3": {
12
- "acc": 0.3458333333333333,
13
- "acc_stderr": 0.013736245342311014
14
- },
15
- "cb": {
16
- "acc": 0.5178571428571429,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.33534439416792355
19
- },
20
- "copa": {
21
- "acc": 0.73,
22
- "acc_stderr": 0.044619604333847394
23
- },
24
- "hellaswag": {
25
- "acc": 0.45030870344552876,
26
- "acc_stderr": 0.0049650784774355715,
27
- "acc_norm": 0.60017924716192,
28
- "acc_norm_stderr": 0.004888601874547486
29
- },
30
- "rte": {
31
- "acc": 0.48375451263537905,
32
- "acc_stderr": 0.030080573208738064
33
- },
34
- "winogrande": {
35
- "acc": 0.5674822415153907,
36
- "acc_stderr": 0.013923911578623827
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7012292891501871,
40
- "acc_stderr": 0.010584692134739974
41
- },
42
- "boolq": {
43
- "acc": 0.5801223241590214,
44
- "acc_stderr": 0.008632045504781744
45
- },
46
- "arc_easy": {
47
- "acc": 0.6426767676767676,
48
- "acc_stderr": 0.009833205612463114,
49
- "acc_norm": 0.6426767676767676,
50
- "acc_norm_stderr": 0.009833205612463106
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3165529010238908,
54
- "acc_stderr": 0.01359243151906808,
55
- "acc_norm": 0.3370307167235495,
56
- "acc_norm_stderr": 0.013813476652902274
57
- },
58
- "sciq": {
59
- "acc": 0.932,
60
- "acc_stderr": 0.007964887911291603,
61
- "acc_norm": 0.929,
62
- "acc_norm_stderr": 0.008125578442487914
63
- },
64
- "piqa": {
65
- "acc": 0.7584330794341676,
66
- "acc_stderr": 0.009986718001804461,
67
- "acc_norm": 0.7600652883569097,
68
- "acc_norm_stderr": 0.009963625892809545
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }